@livekit/agents-plugin-phonic 1.0.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts ADDED
@@ -0,0 +1,18 @@
+ // SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+ //
+ // SPDX-License-Identifier: Apache-2.0
+ import { Plugin } from '@livekit/agents';
+
+ export * as realtime from './realtime/index.js';
+
+ class PhonicPlugin extends Plugin {
+   constructor() {
+     super({
+       title: 'phonic',
+       version: '0.1.0',
+       package: '@livekit/agents-plugin-phonic',
+     });
+   }
+ }
+
+ Plugin.registerPlugin(new PhonicPlugin());
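
Registering the plugin is a side effect of loading the package: the top-level `Plugin.registerPlugin(new PhonicPlugin())` call runs on first import, and the re-exported `realtime` namespace is the public surface. A minimal consumption sketch:

    // Importing the package registers PhonicPlugin with @livekit/agents.
    import { realtime } from '@livekit/agents-plugin-phonic';

    // RealtimeModel and the Voice type come from ./realtime/index.js below.
    // Requires PHONIC_API_KEY in the environment when apiKey is omitted.
    const model = new realtime.RealtimeModel();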
package/src/realtime/api_proto.ts ADDED
@@ -0,0 +1,32 @@
+ // SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+ //
+ // SPDX-License-Identifier: Apache-2.0
+ import type { Phonic } from 'phonic';
+
+ export type ServerEvent =
+   | Phonic.ReadyToStartConversationPayload
+   | Phonic.ConversationCreatedPayload
+   | Phonic.InputTextPayload
+   | Phonic.InputCancelledPayload
+   | Phonic.AudioChunkResponsePayload
+   | Phonic.UserStartedSpeakingPayload
+   | Phonic.UserFinishedSpeakingPayload
+   | Phonic.DtmfPayload
+   | Phonic.ToolCallPayload
+   | Phonic.ToolCallOutputProcessedPayload
+   | Phonic.ToolCallInterruptedPayload
+   | Phonic.AssistantChoseNotToRespondPayload
+   | Phonic.AssistantEndedConversationPayload
+   | Phonic.AssistantStartedSpeakingPayload
+   | Phonic.AssistantFinishedSpeakingPayload
+   | Phonic.ErrorPayload;
+
+ export type Voice =
+   | 'sabrina'
+   | 'grant'
+   | 'virginia'
+   | 'landon'
+   | 'eleanor'
+   | 'shelby'
+   | 'nolan'
+   | string;
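
Every member of `ServerEvent` carries a string `type` discriminant, which is how `handleServerMessage` in realtime_model.ts (later in this diff) dispatches on it. A sketch of the same narrowing pattern, using only payload fields that appear later in this diff:

    import type { ServerEvent } from './api_proto.js';

    function describe(event: ServerEvent): string {
      switch (event.type) {
        case 'audio_chunk':
          // Narrowed to Phonic.AudioChunkResponsePayload; text and audio are optional.
          return event.text ? `audio chunk with text: ${event.text}` : 'audio chunk';
        case 'error':
          return `error: ${event.error.message}`;
        case 'conversation_created':
          return `conversation ${event.conversation_id}`;
        default:
          return event.type;
      }
    }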
package/src/realtime/index.ts ADDED
@@ -0,0 +1,5 @@
+ // SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+ //
+ // SPDX-License-Identifier: Apache-2.0
+ export { RealtimeModel, type RealtimeModelOptions } from './realtime_model.js';
+ export type { Voice } from './api_proto.js';
package/src/realtime/realtime_model.ts ADDED
@@ -0,0 +1,575 @@
+ // SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+ //
+ // SPDX-License-Identifier: Apache-2.0
+ import type { APIConnectOptions } from '@livekit/agents';
+ import {
+   AudioByteStream,
+   DEFAULT_API_CONNECT_OPTIONS,
+   llm,
+   log,
+   shortuuid,
+   stream,
+ } from '@livekit/agents';
+ import { AudioFrame, AudioResampler } from '@livekit/rtc-node';
+ import type { Phonic } from 'phonic';
+ import { PhonicClient } from 'phonic';
+ import type { ServerEvent, Voice } from './api_proto.js';
+
+ const PHONIC_INPUT_SAMPLE_RATE = 44100;
+ const PHONIC_OUTPUT_SAMPLE_RATE = 44100;
+ const PHONIC_NUM_CHANNELS = 1;
+ const PHONIC_INPUT_FRAME_MS = 20;
+ const DEFAULT_MODEL = 'merritt';
+ const WS_CLOSE_NORMAL = 1000;
+
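With 16-bit mono PCM at 44.1 kHz, the 20 ms input frame defined by these constants works out to 882 samples, i.e. 1764 bytes per chunk before base64 encoding. A quick sanity check using the same constants (the 2-byte sample width is implied by the Int16Array handling later in this file):

    const samplesPerFrame = (PHONIC_INPUT_SAMPLE_RATE * PHONIC_INPUT_FRAME_MS) / 1000; // 882
    const bytesPerFrame = samplesPerFrame * Int16Array.BYTES_PER_ELEMENT;              // 1764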
+ export interface RealtimeModelOptions {
+   apiKey: string;
+   model: string;
+   phonicAgent?: string;
+   voice?: Voice | string;
+   welcomeMessage?: string;
+   generateWelcomeMessage?: boolean;
+   project?: string;
+   connOptions: APIConnectOptions;
+   baseUrl?: string;
+   languages?: string[];
+   audioSpeed?: number;
+   phonicTools?: string[];
+   boostedKeywords?: string[];
+   generateNoInputPokeText?: boolean;
+   noInputPokeSec?: number;
+   noInputPokeText?: string;
+   noInputEndConversationSec?: number;
+   /** Set by `updateInstructions` via `voice.Agent` rather than the RealtimeModel constructor */
+   instructions?: string;
+ }
+
+ export class RealtimeModel extends llm.RealtimeModel {
+   /** @internal */
+   _options: RealtimeModelOptions;
+
+   get model(): string {
+     return this._options.model;
+   }
+
+   constructor(
+     options: {
+       /**
+        * Phonic API key. If not provided, will attempt to read from PHONIC_API_KEY environment variable
+        */
+       apiKey?: string;
+       /**
+        * The name of the model to use. Defaults to 'merritt'
+        */
+       model?: Phonic.ConfigPayload['model'] | string;
+       /**
+        * Phonic agent to use for the conversation. Options explicitly set here will override the agent settings.
+        */
+       phonicAgent?: string;
+       /**
+        * Voice ID for agent outputs
+        */
+       voice?: Voice;
+       /**
+        * Welcome message for the agent to say when the conversation starts. Ignored when generateWelcomeMessage is true
+        */
+       welcomeMessage?: string;
+       /**
+        * When true, the welcome message will be automatically generated and welcomeMessage will be ignored
+        */
+       generateWelcomeMessage?: boolean;
+       /**
+        * Project name to use for the conversation. Defaults to `main`
+        */
+       project?: string;
+       /**
+        * ISO 639-1 language codes the agent should recognize and speak
+        */
+       languages?: string[];
+       /**
+        * Audio playback speed
+        */
+       audioSpeed?: number;
+       /**
+        * Phonic tool names available to the assistant
+        */
+       phonicTools?: string[];
+       /**
+        * Keywords to boost in speech recognition
+        */
+       boostedKeywords?: string[];
+       /**
+        * Auto-generate poke text when user is silent
+        */
+       generateNoInputPokeText?: boolean;
+       /**
+        * Seconds of silence before sending poke message
+        */
+       noInputPokeSec?: number;
+       /**
+        * Poke message text (ignored when generateNoInputPokeText is true)
+        */
+       noInputPokeText?: string;
+       /**
+        * Seconds of silence before ending conversation
+        */
+       noInputEndConversationSec?: number;
+       /**
+        * Connection options for the API connection
+        */
+       connOptions?: APIConnectOptions;
+       baseUrl?: string;
+     } = {},
+   ) {
+     super({
+       messageTruncation: false,
+       turnDetection: true,
+       userTranscription: true,
+       // TODO @Phonic-Co: Implement tool support
+       // Phonic has automatic tool reply generation, but tools are not supported with LiveKit Agents yet.
+       autoToolReplyGeneration: true,
+       audioOutput: true,
+     });
+
+     const apiKey = options.apiKey || process.env.PHONIC_API_KEY;
+     if (!apiKey) {
+       throw new Error('Phonic API key is required. Provide apiKey or set PHONIC_API_KEY.');
+     }
+
+     this._options = {
+       apiKey,
+       voice: options.voice,
+       phonicAgent: options.phonicAgent,
+       project: options.project,
+       welcomeMessage: options.welcomeMessage,
+       generateWelcomeMessage: options.generateWelcomeMessage,
+       languages: options.languages,
+       audioSpeed: options.audioSpeed,
+       phonicTools: options.phonicTools,
+       boostedKeywords: options.boostedKeywords,
+       generateNoInputPokeText: options.generateNoInputPokeText,
+       noInputPokeSec: options.noInputPokeSec,
+       noInputPokeText: options.noInputPokeText,
+       noInputEndConversationSec: options.noInputEndConversationSec,
+       connOptions: options.connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
+       model: options.model ?? DEFAULT_MODEL,
+       baseUrl: options.baseUrl,
+     };
+   }
+
+   /**
+    * Create a new realtime session
+    */
+   session(): RealtimeSession {
+     return new RealtimeSession(this);
+   }
+
+   async close(): Promise<void> {}
+ }
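
A minimal construction sketch for the class above; the option values are illustrative, and `apiKey` is omitted to exercise the PHONIC_API_KEY fallback:

    import { realtime } from '@livekit/agents-plugin-phonic';

    const model = new realtime.RealtimeModel({
      voice: 'sabrina',            // any Voice literal, or a raw voice ID string
      welcomeMessage: 'Hi there!', // ignored if generateWelcomeMessage is true
      audioSpeed: 1.0,
    });
    const session = model.session(); // typically created for you by the agent runtime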
+
+ interface GenerationState {
+   responseId: string;
+   messageChannel: stream.StreamChannel<llm.MessageGeneration>;
+   functionChannel: stream.StreamChannel<llm.FunctionCall>;
+   textChannel: stream.StreamChannel<string>;
+   audioChannel: stream.StreamChannel<AudioFrame>;
+   outputText: string;
+ }
+
+ /**
+  * Realtime session for Phonic (https://docs.phonic.co/)
+  */
+ export class RealtimeSession extends llm.RealtimeSession {
+   private _tools: llm.ToolContext = {};
+   private _chatCtx = llm.ChatContext.empty();
+
+   private options: RealtimeModelOptions;
+   private bstream: AudioByteStream;
+   private inputResampler?: AudioResampler;
+   private inputResamplerInputRate?: number;
+
+   private currentGeneration?: GenerationState;
+   private conversationId?: string;
+
+   private client: PhonicClient;
+   private socket?: Awaited<ReturnType<PhonicClient['conversations']['connect']>>;
+   private logger = log();
+   private closed = false;
+   private configSent = false;
+   private instructionsReady: Promise<void>;
+   private resolveInstructionsReady: () => void;
+   private connectTask: Promise<void>;
+
+   constructor(realtimeModel: RealtimeModel) {
+     super(realtimeModel);
+     this.options = realtimeModel._options;
+
+     this.resolveInstructionsReady = () => {};
+     this.instructionsReady = new Promise<void>((resolve) => {
+       this.resolveInstructionsReady = resolve;
+     });
+
+     this.client = new PhonicClient({
+       apiKey: this.options.apiKey,
+       baseUrl: this.options.baseUrl,
+     });
+     this.bstream = new AudioByteStream(
+       PHONIC_INPUT_SAMPLE_RATE,
+       PHONIC_NUM_CHANNELS,
+       (PHONIC_INPUT_SAMPLE_RATE * PHONIC_INPUT_FRAME_MS) / 1000,
+     );
+     this.connectTask = this.connect().catch((error: unknown) => {
+       const normalizedError = error instanceof Error ? error : new Error(String(error));
+       this.emitError(normalizedError, false);
+     });
+   }
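
The constructor wires up a small deferred promise: `connect()` (below) awaits `instructionsReady` before sending the one-shot config, so a system prompt supplied via `updateInstructions` still lands in `system_prompt`. The pattern in isolation:

    // A deferred promise: hand out `ready`, keep the resolver for later.
    let resolveReady: () => void = () => {};
    const ready = new Promise<void>((resolve) => {
      resolveReady = resolve;
    });

    // Elsewhere (updateInstructions / close in this class), settle it:
    resolveReady();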
+
+   get chatCtx(): llm.ChatContext {
+     return this._chatCtx.copy();
+   }
+
+   get tools(): llm.ToolContext {
+     return { ...this._tools };
+   }
+
+   async updateInstructions(instructions: string): Promise<void> {
+     if (this.configSent) {
+       this.logger.warn(
+         'updateInstructions called after config was already sent. Phonic does not support updating instructions mid-session.',
+       );
+       return;
+     }
+     this.options.instructions = instructions;
+     this.resolveInstructionsReady();
+   }
+
+   async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
+     this.logger.warn('updateChatCtx is not supported by the Phonic realtime model.');
+   }
+
+   async updateTools(tools: llm.ToolContext): Promise<void> {
+     if (Object.keys(tools).length > 0) {
+       this.logger.warn('Tool use is not supported by the Phonic realtime model.');
+     }
+   }
+
+   updateOptions(_options: { toolChoice?: llm.ToolChoice | null }): void {
+     this.logger.warn('updateOptions is not supported by the Phonic realtime model.');
+   }
+
+   pushAudio(frame: AudioFrame): void {
+     if (this.closed) {
+       return;
+     }
+
+     for (const resampledFrame of this.resampleAudio(frame)) {
+       for (const chunk of this.bstream.write(resampledFrame.data.buffer as ArrayBuffer)) {
+         const bytes = Buffer.from(chunk.data.buffer, chunk.data.byteOffset, chunk.data.byteLength);
+         const payload: Phonic.AudioChunkPayload = {
+           type: 'audio_chunk',
+           audio: bytes.toString('base64'),
+         };
+
+         if (!this.socket) {
+           continue;
+         }
+         this.socket.sendAudioChunk(payload);
+       }
+     }
+   }
+
+   // TODO @Phonic-Co: Implement generateReply
+   async generateReply(_instructions?: string): Promise<llm.GenerationCreatedEvent> {
+     throw new Error(
+       'generateReply is not yet supported by the Phonic realtime model. Consider using `welcomeMessage` instead.',
+     );
+   }
+
+   async commitAudio(): Promise<void> {
+     this.logger.warn('commitAudio is not supported by the Phonic realtime model.');
+   }
+   async clearAudio(): Promise<void> {
+     this.logger.warn('clearAudio is not supported by the Phonic realtime model.');
+   }
+
+   async interrupt(): Promise<void> {
+     this.logger.warn('interrupt is not supported by the Phonic realtime model.');
+   }
+
+   async truncate(_options: { messageId: string; audioEndMs: number; audioTranscript?: string }) {
+     this.logger.warn('truncate is not supported by the Phonic realtime model.');
+   }
+
+   async close(): Promise<void> {
+     this.closed = true;
+     this.resolveInstructionsReady();
+     this.closeCurrentGeneration({ interrupted: false });
+     this.socket?.close();
+     await this.connectTask;
+     await super.close();
+   }
+
+   private async connect(): Promise<void> {
+     this.socket = await this.client.conversations.connect({
+       reconnectAttempts: this.options.connOptions.maxRetry,
+     });
+
+     if (this.closed) {
+       this.socket.close();
+       return;
+     }
+
+     this.socket.on('message', (message: unknown) =>
+       this.handleServerMessage(message as ServerEvent),
+     );
+     this.socket.on('error', (error: Error) => this.emitError(error, false));
+     this.socket.on('close', (event: { code?: number }) => {
+       this.closeCurrentGeneration({ interrupted: false });
+       if (!this.closed && event.code !== WS_CLOSE_NORMAL) {
+         this.emitError(new Error(`Phonic STS socket closed with code ${event.code ?? -1}`), false);
+       }
+     });
+
+     await this.socket.waitForOpen();
+     await this.instructionsReady;
+     if (this.closed) return;
+     this.configSent = true;
+     this.socket.sendConfig({
+       type: 'config',
+       model: this.options.model as Phonic.ConfigPayload['model'],
+       agent: this.options.phonicAgent,
+       project: this.options.project,
+       welcome_message: this.options.welcomeMessage,
+       generate_welcome_message: this.options.generateWelcomeMessage,
+       system_prompt: this.options.instructions,
+       voice_id: this.options.voice,
+       input_format: 'pcm_44100',
+       output_format: 'pcm_44100',
+       recognized_languages: this.options.languages,
+       audio_speed: this.options.audioSpeed,
+       tools: this.options.phonicTools,
+       boosted_keywords: this.options.boostedKeywords,
+       generate_no_input_poke_text: this.options.generateNoInputPokeText,
+       no_input_poke_sec: this.options.noInputPokeSec,
+       no_input_poke_text: this.options.noInputPokeText,
+       no_input_end_conversation_sec: this.options.noInputEndConversationSec,
+     });
+   }
+
+   private handleServerMessage(message: ServerEvent): void {
+     if (this.closed) {
+       return;
+     }
+
+     switch (message.type) {
+       case 'assistant_started_speaking':
+         this.startNewAssistantTurn();
+         break;
+       case 'assistant_finished_speaking':
+         this.finishAssistantTurn();
+         break;
+       case 'audio_chunk':
+         this.handleAudioChunk(message);
+         break;
+       case 'input_text':
+         this.handleInputText(message);
+         break;
+       case 'user_started_speaking':
+         this.handleInputSpeechStarted();
+         break;
+       case 'user_finished_speaking':
+         this.handleInputSpeechStopped();
+         break;
+       case 'error':
+         this.emitError(new Error(message.error.message), false);
+         break;
+       case 'tool_call':
+         this.emitError(
+           new Error(
+             `WebSocket tool calls are not yet supported by the Phonic realtime model with LiveKit Agents.`,
+           ),
+           false,
+         );
+         break;
+       case 'assistant_ended_conversation':
+         this.emitError(
+           new Error(
+             'assistant_ended_conversation is not supported by the Phonic realtime model with LiveKit Agents.',
+           ),
+           false,
+         );
+         break;
+       case 'conversation_created':
+         this.conversationId = message.conversation_id;
+         this.logger.info(`Phonic Conversation began with ID: ${this.conversationId}`);
+         break;
+       case 'assistant_chose_not_to_respond':
+       case 'ready_to_start_conversation':
+       case 'input_cancelled':
+       case 'tool_call_output_processed':
+       case 'tool_call_interrupted':
+       case 'dtmf':
+       default:
+         break;
+     }
+   }
+
+   private handleAudioChunk(message: Phonic.AudioChunkResponsePayload): void {
+     /**
+      * Although Phonic sends audio chunks when the assistant is not speaking (i.e. containing silence or background noise),
+      * we only process the chunks when the assistant is speaking to align with the generations model, whereby new streams are created for each turn.
+      */
+     const gen = this.currentGeneration;
+     if (!gen) return;
+
+     if (message.text) {
+       gen.outputText += message.text;
+       gen.textChannel.write(message.text);
+     }
+
+     if (message.audio) {
+       const bytes = Buffer.from(message.audio, 'base64');
+       const sampleCount = Math.floor(bytes.byteLength / Int16Array.BYTES_PER_ELEMENT);
+       if (sampleCount > 0) {
+         const pcm = new Int16Array(
+           bytes.buffer.slice(
+             bytes.byteOffset,
+             bytes.byteOffset + sampleCount * Int16Array.BYTES_PER_ELEMENT,
+           ),
+         );
+         const frame = new AudioFrame(
+           pcm,
+           PHONIC_OUTPUT_SAMPLE_RATE,
+           PHONIC_NUM_CHANNELS,
+           sampleCount / PHONIC_NUM_CHANNELS,
+         );
+         gen.audioChannel.write(frame);
+       }
+     }
+   }
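
The decode path is the mirror of `pushAudio`: base64 to bytes, two bytes per little-endian 16-bit sample, 44.1 kHz mono. A standalone duration check under those same assumptions (`base64Audio` is an illustrative input):

    const bytes = Buffer.from(base64Audio, 'base64');
    const sampleCount = Math.floor(bytes.byteLength / Int16Array.BYTES_PER_ELEMENT);
    const durationMs = (sampleCount / PHONIC_OUTPUT_SAMPLE_RATE) * 1000; // 882 samples -> 20 ms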
+
+   private handleInputText(message: Phonic.InputTextPayload): void {
+     const itemId = shortuuid('PI_');
+     this.emit('input_audio_transcription_completed', {
+       itemId,
+       transcript: message.text,
+       isFinal: true,
+     });
+
+     this._chatCtx.addMessage({
+       role: 'user',
+       content: message.text,
+       id: itemId,
+     });
+   }
+
+   private handleInputSpeechStarted(): void {
+     this.emit('input_speech_started', {});
+     this.closeCurrentGeneration({ interrupted: true });
+   }
+
+   private handleInputSpeechStopped(): void {
+     this.emit('input_speech_stopped', {
+       userTranscriptionEnabled: true,
+     });
+   }
+
+   private startNewAssistantTurn(): void {
+     if (this.currentGeneration) {
+       this.closeCurrentGeneration({ interrupted: true });
+     }
+
+     const responseId = shortuuid('PS_');
+
+     const textChannel = stream.createStreamChannel<string>();
+     const audioChannel = stream.createStreamChannel<AudioFrame>();
+     const functionChannel = stream.createStreamChannel<llm.FunctionCall>();
+     const messageChannel = stream.createStreamChannel<llm.MessageGeneration>();
+
+     messageChannel.write({
+       messageId: responseId,
+       textStream: textChannel.stream(),
+       audioStream: audioChannel.stream(),
+       modalities: Promise.resolve(['audio', 'text']),
+     });
+
+     this.currentGeneration = {
+       responseId,
+       messageChannel,
+       functionChannel,
+       textChannel,
+       audioChannel,
+       outputText: '',
+     };
+
+     this.emit('generation_created', {
+       messageStream: messageChannel.stream(),
+       functionStream: functionChannel.stream(),
+       userInitiated: false,
+       responseId,
+     });
+   }
+
+   private finishAssistantTurn(): void {
+     this.closeCurrentGeneration({ interrupted: false });
+   }
+
+   private closeCurrentGeneration({ interrupted }: { interrupted: boolean }): void {
+     const gen = this.currentGeneration;
+     if (!gen) return;
+
+     if (gen.outputText) {
+       this._chatCtx.addMessage({
+         role: 'assistant',
+         content: gen.outputText,
+         id: gen.responseId,
+         interrupted,
+       });
+     }
+
+     gen.textChannel.close();
+     gen.audioChannel.close();
+     gen.functionChannel.close();
+     gen.messageChannel.close();
+     this.currentGeneration = undefined;
+   }
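
Each assistant turn therefore surfaces as one `generation_created` event whose `messageStream` carries a single `MessageGeneration` with live text and audio streams. A hedged consumer sketch, assuming the channel streams are async-iterable and the session exposes the usual `on()` listener (neither is shown in this diff):

    session.on('generation_created', async (ev) => {
      for await (const message of ev.messageStream) {
        for await (const text of message.textStream) {
          process.stdout.write(text); // incremental assistant transcript
        }
      }
    });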
+
+   private emitError(error: Error, recoverable: boolean): void {
+     this.emit('error', {
+       timestamp: Date.now(),
+       label: 'phonic_realtime',
+       type: 'realtime_model_error',
+       error,
+       recoverable,
+     } satisfies llm.RealtimeModelError);
+   }
+
+   private *resampleAudio(frame: AudioFrame): Generator<AudioFrame> {
+     if (this.inputResampler) {
+       if (frame.sampleRate !== this.inputResamplerInputRate) {
+         this.inputResampler = undefined;
+         this.inputResamplerInputRate = undefined;
+       }
+     }
+
+     if (
+       this.inputResampler === undefined &&
+       (frame.sampleRate !== PHONIC_INPUT_SAMPLE_RATE || frame.channels !== PHONIC_NUM_CHANNELS)
+     ) {
+       this.inputResampler = new AudioResampler(
+         frame.sampleRate,
+         PHONIC_INPUT_SAMPLE_RATE,
+         PHONIC_NUM_CHANNELS,
+       );
+       this.inputResamplerInputRate = frame.sampleRate;
+     }
+
+     if (this.inputResampler) {
+       for (const resampledFrame of this.inputResampler.push(frame)) {
+         yield resampledFrame;
+       }
+     } else {
+       yield frame;
+     }
+   }
+ }
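
End to end, the model is meant to be driven by a LiveKit voice agent rather than by hand. A hypothetical wiring sketch: `voice.Agent` is referenced in this file's option docs, but the exact `AgentSession` option names and `start()` signature are assumptions, not confirmed by this diff:

    import { voice } from '@livekit/agents';
    import { realtime } from '@livekit/agents-plugin-phonic';

    const agent = new voice.Agent({
      instructions: 'You are a helpful voice assistant.', // forwarded via updateInstructions -> system_prompt
    });
    const session = new voice.AgentSession({
      llm: new realtime.RealtimeModel({ generateWelcomeMessage: true }),
    });
    // await session.start({ agent, room }); // inside an agent entrypoint (assumed API)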