getpatter 0.6.2 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/{carrier-config-4ZKVYAWV.mjs → carrier-config-7YGNRBPO.mjs} +60 -12
- package/dist/{chunk-R2T4JABZ.mjs → chunk-3VVATR6A.mjs} +8 -6
- package/dist/{chunk-LE63CSOB.mjs → chunk-7IIV3BY4.mjs} +1679 -228
- package/dist/{chunk-CL2U3YET.mjs → chunk-BO227NTF.mjs} +271 -54
- package/dist/cli.js +63 -20
- package/dist/dashboard/ui.html +10 -10
- package/dist/index.d.mts +4217 -3319
- package/dist/index.d.ts +4217 -3319
- package/dist/index.js +2815 -705
- package/dist/index.mjs +760 -392
- package/dist/{openai-realtime-2-CNFARP25.mjs → openai-realtime-2-L5EKAAUH.mjs} +1 -1
- package/dist/{silero-vad-LNDFGIY7.mjs → silero-vad-RGF5HCIR.mjs} +1 -1
- package/dist/{test-mode-RS57BDM6.mjs → test-mode-4QLLWYVV.mjs} +2 -2
- package/package.json +2 -1
- package/src/dashboard/ui.html +10 -10
|
@@ -47,6 +47,45 @@ var OpenAIRealtimeVADType = {
|
|
|
47
47
|
SERVER_VAD: "server_vad",
|
|
48
48
|
SEMANTIC_VAD: "semantic_vad"
|
|
49
49
|
};
|
|
50
|
+
/**
 * Validate a caller-supplied Realtime turn-detection options object.
 *
 * An absent config (`undefined` or `null`) is accepted and means "use the
 * defaults". `null` is tolerated here because `buildTurnDetection` reads the
 * same object through optional chaining (`td?.type`), so both functions agree
 * on what "absent" means instead of this one crashing with a TypeError.
 *
 * @param {object|null|undefined} td - Turn-detection options; only `type`
 *   and `eagerness` are inspected.
 * @returns {void}
 * @throws {Error} When `type` is set to anything other than `'server_vad'` or
 *   `'semantic_vad'`, when `eagerness` is set to anything other than
 *   `low|medium|high|auto`, or when `eagerness` is set while `type` is not
 *   `'semantic_vad'` (including when `type` is omitted).
 */
function validateRealtimeTurnDetection(td) {
  // `== null` matches both `undefined` and `null`.
  if (td == null) return;
  if (td.type !== void 0 && td.type !== "server_vad" && td.type !== "semantic_vad") {
    throw new Error(
      `RealtimeTurnDetection.type must be 'server_vad' or 'semantic_vad', got ${JSON.stringify(td.type)}`
    );
  }
  if (td.eagerness !== void 0 && td.eagerness !== "low" && td.eagerness !== "medium" && td.eagerness !== "high" && td.eagerness !== "auto") {
    throw new Error(
      `RealtimeTurnDetection.eagerness must be one of low|medium|high|auto, got ${JSON.stringify(td.eagerness)}`
    );
  }
  // Eagerness is rejected unless the caller explicitly chose semantic VAD.
  if (td.eagerness !== void 0 && td.type !== "semantic_vad") {
    throw new Error(
      "RealtimeTurnDetection.eagerness is only valid when type='semantic_vad'"
    );
  }
}
|
|
68
|
+
/**
 * Build the wire-format `turn_detection` object for a Realtime session.
 *
 * The effective VAD type is resolved first (`td.type`, falling back to
 * `opts.defaultType`) and the payload shape is chosen from that resolved
 * value. Selecting semantic VAD through the default therefore yields the same
 * semantic-VAD payload as selecting it explicitly, rather than a
 * `semantic_vad` type carrying the server-VAD tuning fields.
 *
 * @param {object|null|undefined} td - Caller turn-detection options
 *   (`type`, `eagerness`, `threshold`, `prefixPaddingMs`, `silenceDurationMs`).
 *   Checked by `validateRealtimeTurnDetection` before use.
 * @param {object} opts - Builder options.
 * @param {string} opts.defaultType - VAD type used when `td.type` is unset.
 * @param {number} opts.defaultSilenceMs - `silence_duration_ms` used when
 *   `td.silenceDurationMs` is unset (server-VAD payloads only).
 * @param {boolean} opts.includeResponseGating - When truthy, add
 *   `create_response` / `interrupt_response` to the payload.
 * @param {boolean} [opts.gateResponseOnTranscript] - When truthy, both gating
 *   keys are emitted as `false`; otherwise `true`.
 * @returns {object} The `turn_detection` payload.
 * @throws {Error} Propagated from `validateRealtimeTurnDetection`.
 */
function buildTurnDetection(td, opts) {
  validateRealtimeTurnDetection(td);
  const resolvedType = td?.type ?? opts.defaultType;
  let detection;
  if (resolvedType === "semantic_vad") {
    detection = { type: "semantic_vad" };
    if (td?.eagerness !== void 0) detection.eagerness = td.eagerness;
  } else {
    detection = {
      type: resolvedType,
      threshold: td?.threshold ?? 0.5,
      prefix_padding_ms: td?.prefixPaddingMs ?? 300,
      silence_duration_ms: td?.silenceDurationMs ?? opts.defaultSilenceMs
    };
  }
  if (opts.includeResponseGating) {
    // Both keys hang off the same switch so they can never disagree.
    const serverManaged = !(opts.gateResponseOnTranscript ?? false);
    detection.create_response = serverManaged;
    detection.interrupt_response = serverManaged;
  }
  return detection;
}
|
|
50
89
|
var OpenAIRealtimeAdapter = class {
|
|
51
90
|
constructor(apiKey, model = OpenAIRealtimeModel.GPT_REALTIME_MINI, voice = OpenAIVoice.ALLOY, instructions = "", tools, audioFormat = OpenAIRealtimeAudioFormat.G711_ULAW, options = {}) {
|
|
52
91
|
this.apiKey = apiKey;
|
|
@@ -56,6 +95,7 @@ var OpenAIRealtimeAdapter = class {
|
|
|
56
95
|
this.tools = tools;
|
|
57
96
|
this.audioFormat = audioFormat;
|
|
58
97
|
this.options = options;
|
|
98
|
+
this.gateResponseOnTranscript = options.gateResponseOnTranscript ?? false;
|
|
59
99
|
}
|
|
60
100
|
apiKey;
|
|
61
101
|
model;
|
|
@@ -85,6 +125,23 @@ var OpenAIRealtimeAdapter = class {
|
|
|
85
125
|
// could have produced, which is what the user actually heard.
|
|
86
126
|
currentResponseFirstAudioAt = null;
|
|
87
127
|
options;
|
|
128
|
+
// When true, the stream handler waits for the Whisper ``transcript_input``
|
|
129
|
+
// event before requesting the model response (legacy behavior). When false
|
|
130
|
+
// (default) the response is requested on ``speech_stopped`` and the
|
|
131
|
+
// transcript is display-only. Read by the stream handler via
|
|
132
|
+
// ``getGateResponseOnTranscript()``.
|
|
133
|
+
gateResponseOnTranscript;
|
|
134
|
+
/**
|
|
135
|
+
* Whether the stream handler should gate the model response on the Whisper
|
|
136
|
+
* transcript (legacy) or fire it on `speech_stopped` (default, decoupled).
|
|
137
|
+
*
|
|
138
|
+
* `false` (default) — the response is requested on `speech_stopped`,
|
|
139
|
+
* independently of Whisper. `true` — the response is requested only after
|
|
140
|
+
* `transcript_input` passes the hallucination filter.
|
|
141
|
+
*/
|
|
142
|
+
getGateResponseOnTranscript() {
|
|
143
|
+
return this.gateResponseOnTranscript;
|
|
144
|
+
}
|
|
88
145
|
/**
|
|
89
146
|
* Build the production session.update body. Mirrors the body sent
|
|
90
147
|
* inside `connect()` so warmup can apply identical configuration to
|
|
@@ -96,16 +153,26 @@ var OpenAIRealtimeAdapter = class {
|
|
|
96
153
|
output_audio_format: this.audioFormat,
|
|
97
154
|
voice: this.voice,
|
|
98
155
|
instructions: this.instructions || "You are a helpful voice assistant. Be concise.",
|
|
99
|
-
turn_detection
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
156
|
+
// v1 turn_detection carries NO create_response / interrupt_response
|
|
157
|
+
// keys. The v1 server defaults (`create_response: true`,
|
|
158
|
+
// `interrupt_response: true`) ARE the server-managed behaviour we want by
|
|
159
|
+
// default, so omitting them is equivalent to sending `true` — gating
|
|
160
|
+
// disabled here. `gateResponseOnTranscript` is still threaded through for
|
|
161
|
+
// symmetry with the GA builder, but has no wire effect while
|
|
162
|
+
// includeResponseGating is false.
|
|
163
|
+
turn_detection: buildTurnDetection(this.options.turnDetection, {
|
|
164
|
+
defaultType: this.options.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
|
|
165
|
+
defaultSilenceMs: this.options.silenceDurationMs ?? 300,
|
|
166
|
+
includeResponseGating: false,
|
|
167
|
+
gateResponseOnTranscript: this.gateResponseOnTranscript
|
|
168
|
+
}),
|
|
105
169
|
input_audio_transcription: {
|
|
106
170
|
model: this.options.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
|
|
107
171
|
}
|
|
108
172
|
};
|
|
173
|
+
if (this.options.noiseReduction !== void 0) {
|
|
174
|
+
config.input_audio_noise_reduction = { type: this.options.noiseReduction };
|
|
175
|
+
}
|
|
109
176
|
if (this.options.temperature !== void 0) config.temperature = this.options.temperature;
|
|
110
177
|
if (this.options.maxResponseOutputTokens !== void 0) {
|
|
111
178
|
config.max_response_output_tokens = this.options.maxResponseOutputTokens;
|
|
@@ -369,6 +436,10 @@ var OpenAIRealtimeAdapter = class {
|
|
|
369
436
|
};
|
|
370
437
|
const timer = setTimeout(() => {
|
|
371
438
|
cleanup();
|
|
439
|
+
try {
|
|
440
|
+
ws.close();
|
|
441
|
+
} catch {
|
|
442
|
+
}
|
|
372
443
|
reject(new Error("OpenAI Realtime park connect timeout"));
|
|
373
444
|
}, 8e3);
|
|
374
445
|
ws.on("message", onMessage);
|
|
@@ -463,20 +534,33 @@ var OpenAIRealtimeAdapter = class {
|
|
|
463
534
|
dispatch("error", { type: "socket_error", message: err?.message ?? String(err) });
|
|
464
535
|
});
|
|
465
536
|
}
|
|
466
|
-
/** Truncate the in-flight assistant turn
|
|
537
|
+
/** Truncate the in-flight assistant turn's playback offset on the server.
|
|
538
|
+
*
|
|
539
|
+
* Sends ONLY ``conversation.item.truncate`` — no ``response.cancel``. This
|
|
540
|
+
* is the half of barge-in handling that a WebSocket transport MUST always
|
|
541
|
+
* perform: per OpenAI's docs, the GA server auto-truncates on barge-in only
|
|
542
|
+
* over WebRTC / SIP; on the WebSocket transport the client is responsible
|
|
543
|
+
* for telling the server how much of the assistant turn was actually heard.
|
|
544
|
+
* In server-managed mode (``interrupt_response: true``) the server already
|
|
545
|
+
* cancels the response itself, so issuing ``response.cancel`` here would be
|
|
546
|
+
* redundant / rejected — call this method, not {@link cancelResponse}.
|
|
467
547
|
*
|
|
468
548
|
* ``audio_end_ms`` MUST reflect what the caller actually heard, not what
|
|
469
549
|
* the server generated. OpenAI streams audio at 5-10x real-time, so the
|
|
470
550
|
* byte-derived counter overstates playback whenever the consumer cleared
|
|
471
|
-
* its playout buffer (e.g. ``
|
|
551
|
+
* its playout buffer (e.g. ``sendClear``) before the audio reached the
|
|
472
552
|
* speaker. We bound the truncate point by wall-clock time since the first
|
|
473
553
|
* chunk of this response — that's the physical maximum a 1x real-time
|
|
474
554
|
* playback could have produced. Without this cap, OpenAI keeps the full
|
|
475
555
|
* generated assistant text on the transcript, and the model replays /
|
|
476
556
|
* resumes from it on the next turn — manifesting as re-greetings and
|
|
477
557
|
* mid-sentence fragments after a barge-in storm.
|
|
558
|
+
*
|
|
559
|
+
* No-op when no response is in flight, keeping it idempotent across stale
|
|
560
|
+
* callers. Resets per-response tracking so post-truncate late frames and
|
|
561
|
+
* the next response start clean.
|
|
478
562
|
*/
|
|
479
|
-
|
|
563
|
+
truncate() {
|
|
480
564
|
if (!this.ws) return;
|
|
481
565
|
if (!this.currentResponseItemId) {
|
|
482
566
|
return;
|
|
@@ -496,11 +580,31 @@ var OpenAIRealtimeAdapter = class {
|
|
|
496
580
|
} catch (err) {
|
|
497
581
|
getLogger().debug?.(`conversation.item.truncate failed: ${String(err)}`);
|
|
498
582
|
}
|
|
499
|
-
this.ws.send(JSON.stringify({ type: "response.cancel" }));
|
|
500
583
|
this.currentResponseItemId = null;
|
|
501
584
|
this.currentResponseAudioMs = 0;
|
|
502
585
|
this.currentResponseFirstAudioAt = null;
|
|
503
586
|
}
|
|
587
|
+
/** Truncate the in-flight assistant turn AND cancel the active response.
|
|
588
|
+
*
|
|
589
|
+
* Sends BOTH ``conversation.item.truncate`` (the played-offset bookkeeping)
|
|
590
|
+
* AND ``response.cancel``. Use this on the LEGACY client-managed barge-in
|
|
591
|
+
* path (``gateResponseOnTranscript`` true → ``interrupt_response: false``,
|
|
592
|
+
* so the server does NOT cancel for us) and for explicit cancels driven by
|
|
593
|
+
* Patter (e.g. on transfer / hangup). In server-managed mode call
|
|
594
|
+
* {@link truncate} instead — the server already cancels the response, and an
|
|
595
|
+
* extra ``response.cancel`` would be redundant / rejected.
|
|
596
|
+
*
|
|
597
|
+
* Truncation bounding semantics are identical to {@link truncate}; see its
|
|
598
|
+
* doc comment for the ``audio_end_ms`` wall-clock cap rationale.
|
|
599
|
+
*/
|
|
600
|
+
cancelResponse() {
|
|
601
|
+
if (!this.ws) return;
|
|
602
|
+
if (!this.currentResponseItemId) {
|
|
603
|
+
return;
|
|
604
|
+
}
|
|
605
|
+
this.truncate();
|
|
606
|
+
this.ws.send(JSON.stringify({ type: "response.cancel" }));
|
|
607
|
+
}
|
|
504
608
|
/** Inject a user text turn and request a new response. */
|
|
505
609
|
async sendText(text) {
|
|
506
610
|
this.ws?.send(JSON.stringify({
|
|
@@ -545,6 +649,32 @@ var OpenAIRealtimeAdapter = class {
|
|
|
545
649
|
}
|
|
546
650
|
}));
|
|
547
651
|
}
|
|
652
|
+
/**
|
|
653
|
+
* Speak a short reassurance filler WITHOUT injecting a `role:user` turn.
|
|
654
|
+
*
|
|
655
|
+
* Same no-fake-turn shape as {@link sendFirstMessage}: a bare
|
|
656
|
+
* `response.create` carrying explicit `instructions`, so the filler is the
|
|
657
|
+
* assistant's own in-band audio. The reassurance scheduler in the
|
|
658
|
+
* stream-handler routes here instead of {@link sendText} — which would emit
|
|
659
|
+
* a `conversation.item.create` with `role:'user'` and falsely show the
|
|
660
|
+
* caller saying "One moment." in the transcript. Fillers must not imply
|
|
661
|
+
* success or failure.
|
|
662
|
+
*
|
|
663
|
+
* Uses `modalities: ['audio', 'text']` (v1-beta shape). The GA subclass
|
|
664
|
+
* {@link OpenAIRealtime2Adapter} overrides this with `output_modalities`
|
|
665
|
+
* and re-injects `audio.output.voice` so the GA endpoint does not reject
|
|
666
|
+
* the request. Mirrors Python `OpenAIRealtimeAdapter.send_reassurance` in
|
|
667
|
+
* `providers/openai_realtime.py`.
|
|
668
|
+
*/
|
|
669
|
+
async sendReassurance(text) {
|
|
670
|
+
this.ws?.send(JSON.stringify({
|
|
671
|
+
type: "response.create",
|
|
672
|
+
response: {
|
|
673
|
+
modalities: ["audio", "text"],
|
|
674
|
+
instructions: `Say exactly this and nothing else: "${text}"`
|
|
675
|
+
}
|
|
676
|
+
}));
|
|
677
|
+
}
|
|
548
678
|
/** Submit a tool/function-call result and request the next response. */
|
|
549
679
|
async sendFunctionResult(callId, result) {
|
|
550
680
|
this.ws?.send(JSON.stringify({
|
|
@@ -727,7 +857,12 @@ var StatefulResampler = class {
|
|
|
727
857
|
* Resets all state after flushing.
|
|
728
858
|
*/
|
|
729
859
|
flush() {
|
|
730
|
-
this.carry.flush();
|
|
860
|
+
const carryTail = this.carry.flush();
|
|
861
|
+
if (carryTail.length > 0) {
|
|
862
|
+
getLogger().warn(
|
|
863
|
+
"[patter] StatefulResampler.flush: trailing odd byte discarded \u2014 upstream produced odd-length PCM stream"
|
|
864
|
+
);
|
|
865
|
+
}
|
|
731
866
|
if (this.srcRate === 16e3 && this.dstRate === 8e3 && this.firPendingSample !== null) {
|
|
732
867
|
const s = this.firPendingSample;
|
|
733
868
|
const tmp = Buffer.alloc(4);
|
|
@@ -1012,44 +1147,46 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
|
|
|
1012
1147
|
buildGASessionConfig() {
|
|
1013
1148
|
const opts = this.options;
|
|
1014
1149
|
const fmt = { type: "audio/pcm", rate: 24e3 };
|
|
1150
|
+
const audioInput = {
|
|
1151
|
+
format: fmt,
|
|
1152
|
+
transcription: {
|
|
1153
|
+
model: opts.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
|
|
1154
|
+
},
|
|
1155
|
+
// Response creation + barge-in cancellation (issue #154 — hand
|
|
1156
|
+
// turn-taking to the server by default):
|
|
1157
|
+
// - DEFAULT (`gateResponseOnTranscript` false → SERVER-MANAGED):
|
|
1158
|
+
// `create_response: true` lets the SERVER auto-create the response
|
|
1159
|
+
// when it commits the user's audio buffer
|
|
1160
|
+
// (`input_audio_buffer.committed`). `interrupt_response: true` lets the
|
|
1161
|
+
// SERVER cancel the in-flight response on its own VAD `speech_started`.
|
|
1162
|
+
// The e2e model replies immediately, in parallel with the Whisper
|
|
1163
|
+
// transcript — no transcript wait (~500 ms reclaimed), no client-side
|
|
1164
|
+
// race. On a WebSocket transport the client STILL must clear the
|
|
1165
|
+
// carrier buffer (`sendClear`) and `conversation.item.truncate` the
|
|
1166
|
+
// played offset on barge-in (the server only auto-truncates on
|
|
1167
|
+
// WebRTC/SIP), but it does NOT send `response.cancel`. Whisper is
|
|
1168
|
+
// display-only — it can never trigger / gate / cancel the response.
|
|
1169
|
+
// - LEGACY (`gateResponseOnTranscript` true → CLIENT-MANAGED opt-out):
|
|
1170
|
+
// `create_response: false` + `interrupt_response: false` so the stream
|
|
1171
|
+
// handler drives `response.create` (after the hallucination filter)
|
|
1172
|
+
// and `response.cancel` (on barge-in) itself. Escape hatch for no-AEC
|
|
1173
|
+
// PSTN self-interruption. Both keys are tied to the same switch inside
|
|
1174
|
+
// `buildTurnDetection`.
|
|
1175
|
+
turn_detection: buildTurnDetection(opts.turnDetection, {
|
|
1176
|
+
defaultType: opts.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
|
|
1177
|
+
defaultSilenceMs: opts.silenceDurationMs ?? 300,
|
|
1178
|
+
includeResponseGating: true,
|
|
1179
|
+
gateResponseOnTranscript: this.getGateResponseOnTranscript()
|
|
1180
|
+
})
|
|
1181
|
+
};
|
|
1182
|
+
if (opts.noiseReduction !== void 0) {
|
|
1183
|
+
audioInput.noise_reduction = { type: opts.noiseReduction };
|
|
1184
|
+
}
|
|
1015
1185
|
const config = {
|
|
1016
1186
|
type: "realtime",
|
|
1017
1187
|
output_modalities: opts.modalities ?? ["audio"],
|
|
1018
1188
|
audio: {
|
|
1019
|
-
input:
|
|
1020
|
-
format: fmt,
|
|
1021
|
-
transcription: {
|
|
1022
|
-
model: opts.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
|
|
1023
|
-
},
|
|
1024
|
-
// VAD threshold raised back to the OpenAI default (0.5) on
|
|
1025
|
-
// 2026-05-22. The earlier 0.1 tuning (motivated by the
|
|
1026
|
-
// upsampled telephony-band loss in high frequencies) made the
|
|
1027
|
-
// server VAD trigger on the carrier-loopback echo of the
|
|
1028
|
-
// agent's OWN outbound audio in PSTN no-AEC scenarios.
|
|
1029
|
-
// Combined with the default ``turn_detection.create_response:
|
|
1030
|
-
// true``, every phantom ``speech_started`` ended a turn early
|
|
1031
|
-
// and auto-created a new response that the agent immediately
|
|
1032
|
-
// spoke over, leading to a runaway loop where the first
|
|
1033
|
-
// message was repeatedly cut and re-generated.
|
|
1034
|
-
turn_detection: {
|
|
1035
|
-
type: opts.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
|
|
1036
|
-
threshold: 0.5,
|
|
1037
|
-
prefix_padding_ms: 300,
|
|
1038
|
-
silence_duration_ms: opts.silenceDurationMs ?? 500,
|
|
1039
|
-
// Defer ``response.create`` to the application: when OpenAI's
|
|
1040
|
-
// server VAD commits an ``input_audio_buffer.committed`` segment
|
|
1041
|
-
// that turns out to be a Whisper hallucination on silence/echo,
|
|
1042
|
-
// auto-creating a response would generate a phantom turn (the
|
|
1043
|
-
// model reads the hallucinated text as user input). Patter
|
|
1044
|
-
// triggers ``response.create`` explicitly in the Realtime
|
|
1045
|
-
// stream-handler AFTER validating ``transcript_input`` against
|
|
1046
|
-
// the hallucination filter. Pair with ``interrupt_response:
|
|
1047
|
-
// false`` so server VAD also leaves in-flight responses alone —
|
|
1048
|
-
// barge-in is gated client-side.
|
|
1049
|
-
create_response: false,
|
|
1050
|
-
interrupt_response: false
|
|
1051
|
-
}
|
|
1052
|
-
},
|
|
1189
|
+
input: audioInput,
|
|
1053
1190
|
output: {
|
|
1054
1191
|
format: fmt,
|
|
1055
1192
|
voice: this.voice
|
|
@@ -1102,14 +1239,7 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
|
|
|
1102
1239
|
if (t && t in GA_TO_V1_EVENT_NAMES) {
|
|
1103
1240
|
const newType = GA_TO_V1_EVENT_NAMES[t];
|
|
1104
1241
|
if (t === "response.output_audio.delta" && typeof parsed.delta === "string") {
|
|
1105
|
-
|
|
1106
|
-
const FRAME_BYTES = 160;
|
|
1107
|
-
if (mulaw.length === 0) return;
|
|
1108
|
-
for (let off = 0; off < mulaw.length; off += FRAME_BYTES) {
|
|
1109
|
-
const slice = mulaw.subarray(off, Math.min(off + FRAME_BYTES, mulaw.length));
|
|
1110
|
-
const frame = { ...parsed, type: newType, delta: slice.toString("base64") };
|
|
1111
|
-
handler(Buffer.from(JSON.stringify(frame)), ...rest);
|
|
1112
|
-
}
|
|
1242
|
+
this.translateGaAudioDelta(parsed, handler, rest);
|
|
1113
1243
|
return;
|
|
1114
1244
|
}
|
|
1115
1245
|
parsed.type = newType;
|
|
@@ -1138,6 +1268,7 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
|
|
|
1138
1268
|
sessionCreated = true;
|
|
1139
1269
|
ws.send(JSON.stringify({ type: "session.update", session: this.buildGASessionConfig() }));
|
|
1140
1270
|
} else if (msg.type === "session.updated") {
|
|
1271
|
+
this.warnIfOutputFormatUnexpected(msg);
|
|
1141
1272
|
cleanup();
|
|
1142
1273
|
resolve();
|
|
1143
1274
|
} else if (msg.type === "error") {
|
|
@@ -1243,6 +1374,10 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
|
|
|
1243
1374
|
};
|
|
1244
1375
|
const timer = setTimeout(() => {
|
|
1245
1376
|
cleanup();
|
|
1377
|
+
try {
|
|
1378
|
+
ws.close();
|
|
1379
|
+
} catch {
|
|
1380
|
+
}
|
|
1246
1381
|
reject(new Error("OpenAI Realtime 2 park connect timeout"));
|
|
1247
1382
|
}, 8e3);
|
|
1248
1383
|
ws.on("message", onMessage);
|
|
@@ -1290,8 +1425,12 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
|
|
|
1290
1425
|
const parsed = JSON.parse(text);
|
|
1291
1426
|
const t = parsed.type;
|
|
1292
1427
|
if (t && Object.prototype.hasOwnProperty.call(GA_TO_V1_EVENT_NAMES, t)) {
|
|
1428
|
+
if (t === "response.output_audio.delta" && typeof parsed.delta === "string") {
|
|
1429
|
+
this.translateGaAudioDelta(parsed, handler, rest);
|
|
1430
|
+
return;
|
|
1431
|
+
}
|
|
1293
1432
|
parsed.type = GA_TO_V1_EVENT_NAMES[t];
|
|
1294
|
-
handler(JSON.stringify(parsed), ...rest);
|
|
1433
|
+
handler(Buffer.from(JSON.stringify(parsed)), ...rest);
|
|
1295
1434
|
return;
|
|
1296
1435
|
}
|
|
1297
1436
|
} catch {
|
|
@@ -1376,6 +1515,55 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
|
|
|
1376
1515
|
}
|
|
1377
1516
|
return out;
|
|
1378
1517
|
}
|
|
1518
|
+
/**
|
|
1519
|
+
* Log-only safety net for issue #154. The GA server echoes the *effective*
|
|
1520
|
+
* session config in `session.updated`; we request `audio/pcm` @ 24 kHz and
|
|
1521
|
+
* transcode PCM24→mulaw8 ourselves (see
|
|
1522
|
+
* `transcodeOutboundPcm24ToMulaw8Buffer`). If a future GA schema change ever
|
|
1523
|
+
* made the server return a different output format, that transcode — which
|
|
1524
|
+
* assumes PCM16-LE @ 24 kHz — would silently corrupt audio, exactly the
|
|
1525
|
+
* v1-beta failure mode #154 fixed. Warn so the drift surfaces in logs instead
|
|
1526
|
+
* of as static. Never gates audio.
|
|
1527
|
+
*/
|
|
1528
|
+
warnIfOutputFormatUnexpected(msg) {
|
|
1529
|
+
const fmt = msg?.session?.audio?.output?.format;
|
|
1530
|
+
if (!fmt || typeof fmt !== "object") return;
|
|
1531
|
+
if (fmt.type !== "audio/pcm" || fmt.rate != null && fmt.rate !== 24e3) {
|
|
1532
|
+
getLogger().warn(
|
|
1533
|
+
`OpenAI Realtime 2: server-echoed output format ${JSON.stringify(fmt)} differs from the requested audio/pcm@24000 \u2014 the outbound PCM24\u2192mulaw8 transcode assumes PCM16-LE 24 kHz, so carrier audio may be garbled (issue #154). Informational only; audio is not gated on this.`
|
|
1534
|
+
);
|
|
1535
|
+
}
|
|
1536
|
+
}
|
|
1537
|
+
/**
|
|
1538
|
+
* Shared audio-delta translation helper. Transcodes a GA
|
|
1539
|
+
* `response.output_audio.delta` payload (base64 PCM-16-LE 24 kHz)
|
|
1540
|
+
* into mulaw 8 kHz and splits the result into 160-byte (20 ms) frames,
|
|
1541
|
+
* dispatching one synthetic `response.audio.delta` event per frame.
|
|
1542
|
+
*
|
|
1543
|
+
* Called from BOTH the `connect()` shim and the `adoptWebSocket()` shim
|
|
1544
|
+
* so that warm-path (prewarm/adopted) calls receive identical transcoding
|
|
1545
|
+
* to cold-path calls. Without this, adopted sockets forwarded raw PCM-24
|
|
1546
|
+
* to Twilio/Telnyx, producing garbled or silent audio on every warm call.
|
|
1547
|
+
*
|
|
1548
|
+
* @param parsed - The parsed GA event object (type already checked to be
|
|
1549
|
+
* `response.output_audio.delta` with a string `delta`).
|
|
1550
|
+
* @param handler - The downstream message listener to dispatch each frame to.
|
|
1551
|
+
* @param rest - Extra arguments forwarded from the original `message` event.
|
|
1552
|
+
* @returns `true` if frames were dispatched (caller should return early),
|
|
1553
|
+
* `false` if the resampler is still warming up (zero output bytes).
|
|
1554
|
+
*/
|
|
1555
|
+
translateGaAudioDelta(parsed, handler, rest) {
|
|
1556
|
+
const newType = GA_TO_V1_EVENT_NAMES["response.output_audio.delta"];
|
|
1557
|
+
const mulaw = this.transcodeOutboundPcm24ToMulaw8Buffer(parsed.delta);
|
|
1558
|
+
const FRAME_BYTES = 160;
|
|
1559
|
+
if (mulaw.length === 0) return false;
|
|
1560
|
+
for (let off = 0; off < mulaw.length; off += FRAME_BYTES) {
|
|
1561
|
+
const slice = mulaw.subarray(off, Math.min(off + FRAME_BYTES, mulaw.length));
|
|
1562
|
+
const frame = { ...parsed, type: newType, delta: slice.toString("base64") };
|
|
1563
|
+
handler(Buffer.from(JSON.stringify(frame)), ...rest);
|
|
1564
|
+
}
|
|
1565
|
+
return true;
|
|
1566
|
+
}
|
|
1379
1567
|
/**
|
|
1380
1568
|
* Base64 PCM-16-LE 24 kHz → Base64 mulaw 8 kHz. Used by the WS
|
|
1381
1569
|
* translation shim on each `response.output_audio.delta`. The stateful
|
|
@@ -1405,6 +1593,34 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
|
|
|
1405
1593
|
}
|
|
1406
1594
|
this.ws?.send(JSON.stringify({ type: "response.create", response: responseBody }));
|
|
1407
1595
|
}
|
|
1596
|
+
/**
|
|
1597
|
+
* Speak a short reassurance filler WITHOUT injecting a `role:user` turn.
|
|
1598
|
+
*
|
|
1599
|
+
* GA-shape sibling of {@link sendFirstMessage} (and override of the base v1
|
|
1600
|
+
* {@link OpenAIRealtimeAdapter.sendReassurance}): a bare `response.create`
|
|
1601
|
+
* carrying explicit `instructions` so the filler is the assistant's own
|
|
1602
|
+
* in-band audio. No `conversation.item.create` with `role:"user"` is
|
|
1603
|
+
* emitted, so the transcript shows no phantom caller line. The GA endpoint
|
|
1604
|
+
* rejects `response.modalities` and does not inherit `audio.output.voice`
|
|
1605
|
+
* for an explicit `response.create`, so — exactly as in
|
|
1606
|
+
* {@link sendFirstMessage} — we send `output_modalities` and re-inject the
|
|
1607
|
+
* voice. Fillers must not imply success or failure.
|
|
1608
|
+
*
|
|
1609
|
+
* Mirrors Python `OpenAIRealtime2Adapter.send_reassurance` in
|
|
1610
|
+
* `providers/openai_realtime_2.py`.
|
|
1611
|
+
*/
|
|
1612
|
+
async sendReassurance(text) {
|
|
1613
|
+
if (!this.ws) return;
|
|
1614
|
+
const responseBody = {
|
|
1615
|
+
output_modalities: ["audio"],
|
|
1616
|
+
audio: { output: { voice: this.voice } },
|
|
1617
|
+
instructions: `Say exactly this and nothing else: "${text}"`
|
|
1618
|
+
};
|
|
1619
|
+
if (this.options.reasoningEffort !== void 0) {
|
|
1620
|
+
responseBody.reasoning = { effort: this.options.reasoningEffort };
|
|
1621
|
+
}
|
|
1622
|
+
this.ws.send(JSON.stringify({ type: "response.create", response: responseBody }));
|
|
1623
|
+
}
|
|
1408
1624
|
};
|
|
1409
1625
|
|
|
1410
1626
|
export {
|
|
@@ -1413,6 +1629,7 @@ export {
|
|
|
1413
1629
|
OpenAIVoice,
|
|
1414
1630
|
OpenAITranscriptionModel,
|
|
1415
1631
|
OpenAIRealtimeVADType,
|
|
1632
|
+
validateRealtimeTurnDetection,
|
|
1416
1633
|
OpenAIRealtimeAdapter,
|
|
1417
1634
|
mulawToPcm16,
|
|
1418
1635
|
pcm16ToMulaw,
|
package/dist/cli.js
CHANGED
|
@@ -185,14 +185,49 @@ var MetricsStore = class extends import_events.EventEmitter {
|
|
|
185
185
|
} else {
|
|
186
186
|
for (let i = this.calls.length - 1; i >= 0; i--) {
|
|
187
187
|
if (this.calls[i].call_id === callId) {
|
|
188
|
-
this.calls[i].status
|
|
189
|
-
Object.assign(this.calls[i], extra);
|
|
188
|
+
this.calls[i] = { ...this.calls[i], status, ...extra };
|
|
190
189
|
break;
|
|
191
190
|
}
|
|
192
191
|
}
|
|
193
192
|
}
|
|
194
193
|
this.publish("call_status", { call_id: callId, status, ...extra });
|
|
195
194
|
}
|
|
195
|
+
/**
|
|
196
|
+
* Record a single transcript line (user/assistant) as it becomes known.
|
|
197
|
+
*
|
|
198
|
+
* FIX-5 (issue #154): the live forward path for the dashboard transcript.
|
|
199
|
+
* The Realtime stream handler calls this the moment each line is known — the
|
|
200
|
+
* user line right after the hallucination filter accepts it, the assistant
|
|
201
|
+
* line when its turn flushes — keyed by the monotonic ``turnIndex`` reserved
|
|
202
|
+
* at turn-open (``reserveTurnIndex``). Each line is appended to the active
|
|
203
|
+
* call's ``transcript`` array and broadcast over SSE as a ``transcript_line``
|
|
204
|
+
* event so the dashboard can render lines as they arrive and re-sort by
|
|
205
|
+
* ``(turnIndex, user<assistant)`` — making a late-arriving user line land
|
|
206
|
+
* ABOVE its agent line. ``recordTurn`` de-dups against the lines pushed here
|
|
207
|
+
* by ``(turnIndex, role)`` so the metrics path never double-pushes the same
|
|
208
|
+
* text. Parity with Python ``record_transcript_line``.
|
|
209
|
+
*/
|
|
210
|
+
recordTranscriptLine(data) {
|
|
211
|
+
const callId = data.call_id || "";
|
|
212
|
+
const { role, text, turnIndex } = data;
|
|
213
|
+
if (!callId || role !== "user" && role !== "assistant" || !text) return;
|
|
214
|
+
const active = this.activeCalls.get(callId);
|
|
215
|
+
if (active) {
|
|
216
|
+
if (!active.transcript) active.transcript = [];
|
|
217
|
+
active.transcript.push({
|
|
218
|
+
role,
|
|
219
|
+
text,
|
|
220
|
+
timestamp: Date.now() / 1e3,
|
|
221
|
+
turnIndex
|
|
222
|
+
});
|
|
223
|
+
}
|
|
224
|
+
this.publish("transcript_line", {
|
|
225
|
+
call_id: callId,
|
|
226
|
+
turnIndex,
|
|
227
|
+
role,
|
|
228
|
+
text
|
|
229
|
+
});
|
|
230
|
+
}
|
|
196
231
|
/** Append a single conversation turn to an active call and broadcast it via SSE. */
|
|
197
232
|
recordTurn(data) {
|
|
198
233
|
const callId = data.call_id || "";
|
|
@@ -207,14 +242,19 @@ var MetricsStore = class extends import_events.EventEmitter {
|
|
|
207
242
|
const userText = typeof turnRecord.user_text === "string" ? turnRecord.user_text : "";
|
|
208
243
|
const agentText = typeof turnRecord.agent_text === "string" ? turnRecord.agent_text : "";
|
|
209
244
|
const ts = typeof turnRecord.timestamp === "number" ? turnRecord.timestamp : Date.now() / 1e3;
|
|
210
|
-
|
|
211
|
-
|
|
245
|
+
const turnIndex = typeof turnRecord.turn_index === "number" ? turnRecord.turn_index : void 0;
|
|
246
|
+
const alreadyLive = (role) => turnIndex !== void 0 && (active.transcript ?? []).some(
|
|
247
|
+
(e) => e.turnIndex === turnIndex && e.role === role
|
|
248
|
+
);
|
|
249
|
+
if (userText.length > 0 && !alreadyLive("user")) {
|
|
250
|
+
active.transcript.push({ role: "user", text: userText, timestamp: ts, turnIndex });
|
|
212
251
|
}
|
|
213
|
-
if (agentText.length > 0 && agentText !== "[interrupted]") {
|
|
252
|
+
if (agentText.length > 0 && agentText !== "[interrupted]" && !alreadyLive("assistant")) {
|
|
214
253
|
active.transcript.push({
|
|
215
254
|
role: "assistant",
|
|
216
255
|
text: agentText,
|
|
217
|
-
timestamp: ts
|
|
256
|
+
timestamp: ts,
|
|
257
|
+
turnIndex
|
|
218
258
|
});
|
|
219
259
|
}
|
|
220
260
|
}
|
|
@@ -287,7 +327,7 @@ var MetricsStore = class extends import_events.EventEmitter {
|
|
|
287
327
|
getCall(callId) {
|
|
288
328
|
if (this.deletedCallIds.has(callId)) return null;
|
|
289
329
|
for (let i = this.calls.length - 1; i >= 0; i--) {
|
|
290
|
-
if (this.calls[i].call_id === callId) return this.calls[i];
|
|
330
|
+
if (this.calls[i].call_id === callId) return { ...this.calls[i] };
|
|
291
331
|
}
|
|
292
332
|
return null;
|
|
293
333
|
}
|
|
@@ -329,7 +369,9 @@ var MetricsStore = class extends import_events.EventEmitter {
|
|
|
329
369
|
}
|
|
330
370
|
if (accepted.length === 0) return [];
|
|
331
371
|
accepted.sort();
|
|
332
|
-
this.persistDeletedIds()
|
|
372
|
+
this.persistDeletedIds().catch(
|
|
373
|
+
(err) => getLogger().debug(`MetricsStore.deleteCalls: persistDeletedIds failed: ${String(err)}`)
|
|
374
|
+
);
|
|
333
375
|
this.publish("calls_deleted", { call_ids: accepted });
|
|
334
376
|
return accepted;
|
|
335
377
|
}
|
|
@@ -341,19 +383,19 @@ var MetricsStore = class extends import_events.EventEmitter {
|
|
|
341
383
|
getDeletedCallIds() {
|
|
342
384
|
return Array.from(this.deletedCallIds).sort();
|
|
343
385
|
}
|
|
344
|
-
/** Atomically persist the deleted-ids set to disk. Best-effort. */
|
|
345
|
-
persistDeletedIds() {
|
|
386
|
+
/** Atomically persist the deleted-ids set to disk. Best-effort async. */
|
|
387
|
+
async persistDeletedIds() {
|
|
346
388
|
if (this.deletedIdsPath === null) return;
|
|
347
389
|
try {
|
|
348
390
|
const dir = path2.dirname(this.deletedIdsPath);
|
|
349
|
-
fs2.
|
|
391
|
+
await fs2.promises.mkdir(dir, { recursive: true });
|
|
350
392
|
const tmp = this.deletedIdsPath + ".tmp";
|
|
351
393
|
const payload = {
|
|
352
394
|
version: 1,
|
|
353
395
|
deleted_call_ids: Array.from(this.deletedCallIds).sort()
|
|
354
396
|
};
|
|
355
|
-
fs2.
|
|
356
|
-
fs2.
|
|
397
|
+
await fs2.promises.writeFile(tmp, JSON.stringify(payload, null, 2), "utf8");
|
|
398
|
+
await fs2.promises.rename(tmp, this.deletedIdsPath);
|
|
357
399
|
} catch (err) {
|
|
358
400
|
getLogger().debug(
|
|
359
401
|
`MetricsStore.persistDeletedIds: ${String(err)}`
|
|
@@ -362,7 +404,8 @@ var MetricsStore = class extends import_events.EventEmitter {
|
|
|
362
404
|
}
|
|
363
405
|
/** Look up an active call by id (returns undefined if not active or unknown). */
|
|
364
406
|
getActive(callId) {
|
|
365
|
-
|
|
407
|
+
const rec = this.activeCalls.get(callId);
|
|
408
|
+
return rec !== void 0 ? { ...rec } : void 0;
|
|
366
409
|
}
|
|
367
410
|
/** Return all currently active (not yet ended) calls. */
|
|
368
411
|
getActiveCalls() {
|
|
@@ -607,8 +650,8 @@ function loadTranscriptJsonl(filePath) {
|
|
|
607
650
|
} catch {
|
|
608
651
|
continue;
|
|
609
652
|
}
|
|
610
|
-
const tsIso = typeof row.ts === "string" ? Date.parse(row.ts) : NaN;
|
|
611
|
-
const tsNumeric = typeof row.timestamp === "number" ? row.timestamp
|
|
653
|
+
const tsIso = typeof row.ts === "string" ? Date.parse(row.ts) / 1e3 : NaN;
|
|
654
|
+
const tsNumeric = typeof row.timestamp === "number" ? row.timestamp : NaN;
|
|
612
655
|
const timestamp = Number.isFinite(tsIso) ? tsIso : Number.isFinite(tsNumeric) ? tsNumeric : 0;
|
|
613
656
|
const userText = typeof row.user_text === "string" ? row.user_text : "";
|
|
614
657
|
const agentText = typeof row.agent_text === "string" ? row.agent_text : "";
|
|
@@ -759,8 +802,8 @@ function mountDashboard(app, store, token = "") {
|
|
|
759
802
|
res.type("text/html").send(DASHBOARD_HTML);
|
|
760
803
|
});
|
|
761
804
|
app.get("/api/dashboard/calls", auth, (req, res) => {
|
|
762
|
-
const limit = Math.min(parseInt(req.query.limit || "50", 10) || 50, 1e3);
|
|
763
|
-
const offset = parseInt(req.query.offset || "0", 10) || 0;
|
|
805
|
+
const limit = Math.min(Math.max(0, parseInt(req.query.limit || "50", 10) || 50), 1e3);
|
|
806
|
+
const offset = Math.max(0, parseInt(req.query.offset || "0", 10) || 0);
|
|
764
807
|
res.json(store.getCalls(limit, offset));
|
|
765
808
|
});
|
|
766
809
|
app.get("/api/dashboard/calls/:callId", auth, (req, res) => {
|
|
@@ -850,8 +893,8 @@ data: ${data}
|
|
|
850
893
|
function mountApi(app, store, token = "") {
|
|
851
894
|
const auth = makeAuthMiddleware(token);
|
|
852
895
|
app.get("/api/v1/calls", auth, (req, res) => {
|
|
853
|
-
const limit = Math.min(parseInt(req.query.limit || "50", 10) || 50, 1e3);
|
|
854
|
-
const offset = parseInt(req.query.offset || "0", 10) || 0;
|
|
896
|
+
const limit = Math.min(Math.max(0, parseInt(req.query.limit || "50", 10) || 50), 1e3);
|
|
897
|
+
const offset = Math.max(0, parseInt(req.query.offset || "0", 10) || 0);
|
|
855
898
|
const calls = store.getCalls(limit, offset);
|
|
856
899
|
res.json({
|
|
857
900
|
data: calls,
|