@livekit/agents-plugin-openai 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +17 -0
  3. package/dist/index.d.ts +1 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +1 -1
  6. package/dist/index.js.map +1 -1
  7. package/dist/realtime/api_proto.d.ts +400 -0
  8. package/dist/realtime/api_proto.d.ts.map +1 -0
  9. package/dist/realtime/api_proto.js +9 -0
  10. package/dist/realtime/api_proto.js.map +1 -0
  11. package/dist/realtime/index.d.ts +3 -0
  12. package/dist/realtime/index.d.ts.map +1 -0
  13. package/dist/realtime/index.js +6 -0
  14. package/dist/realtime/index.js.map +1 -0
  15. package/dist/realtime/realtime_model.d.ts +148 -0
  16. package/dist/realtime/realtime_model.d.ts.map +1 -0
  17. package/dist/realtime/realtime_model.js +555 -0
  18. package/dist/realtime/realtime_model.js.map +1 -0
  19. package/package.json +5 -3
  20. package/src/index.ts +1 -2
  21. package/src/realtime/api_proto.ts +568 -0
  22. package/src/realtime/index.ts +5 -0
  23. package/src/realtime/realtime_model.ts +842 -0
  24. package/dist/omni_assistant/agent_playout.d.ts +0 -27
  25. package/dist/omni_assistant/agent_playout.d.ts.map +0 -1
  26. package/dist/omni_assistant/agent_playout.js +0 -111
  27. package/dist/omni_assistant/agent_playout.js.map +0 -1
  28. package/dist/omni_assistant/index.d.ts +0 -61
  29. package/dist/omni_assistant/index.d.ts.map +0 -1
  30. package/dist/omni_assistant/index.js +0 -453
  31. package/dist/omni_assistant/index.js.map +0 -1
  32. package/dist/omni_assistant/proto.d.ts +0 -218
  33. package/dist/omni_assistant/proto.d.ts.map +0 -1
  34. package/dist/omni_assistant/proto.js +0 -68
  35. package/dist/omni_assistant/proto.js.map +0 -1
  36. package/dist/omni_assistant/transcription_forwarder.d.ts +0 -28
  37. package/dist/omni_assistant/transcription_forwarder.d.ts.map +0 -1
  38. package/dist/omni_assistant/transcription_forwarder.js +0 -117
  39. package/dist/omni_assistant/transcription_forwarder.js.map +0 -1
  40. package/src/omni_assistant/agent_playout.ts +0 -127
  41. package/src/omni_assistant/index.ts +0 -547
  42. package/src/omni_assistant/proto.ts +0 -280
  43. package/src/omni_assistant/transcription_forwarder.ts +0 -128
package/src/realtime/api_proto.ts +568 -0
@@ -0,0 +1,568 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ export const SAMPLE_RATE = 24000;
6
+ export const NUM_CHANNELS = 1;
7
+ export const IN_FRAME_SIZE = 2400; // 100ms
8
+ export const OUT_FRAME_SIZE = 1200; // 50ms
9
+
10
+ export const API_URL = 'wss://api.openai.com/v1/realtime';
11
+
12
+ export type Model = 'gpt-4o-realtime-preview-2024-10-01' | string; // Open-ended, for future models
13
+ export type Voice = 'alloy' | 'shimmer' | 'echo' | string;
14
+ export type AudioFormat = 'pcm16'; // TODO: 'g711-ulaw' | 'g711-alaw'
15
+ export type Role = 'system' | 'assistant' | 'user' | 'tool';
16
+ export type GenerationFinishedReason = 'stop' | 'max_tokens' | 'content_filter' | 'interrupt';
17
+ export type InputTranscriptionModel = 'whisper-1' | string; // Open-ended, for future models
18
+ export type Modality = 'text' | 'audio';
19
+ export type ToolChoice = 'auto' | 'none' | 'required' | string;
20
+ export type State = 'initializing' | 'listening' | 'thinking' | 'speaking' | string;
21
+ export type ResponseStatus =
22
+ | 'in_progress'
23
+ | 'completed'
24
+ | 'incomplete'
25
+ | 'cancelled'
26
+ | 'failed'
27
+ | string;
28
+ export type ClientEventType =
29
+ | 'session.update'
30
+ | 'input_audio_buffer.append'
31
+ | 'input_audio_buffer.commit'
32
+ | 'input_audio_buffer.clear'
33
+ | 'conversation.item.create'
34
+ | 'conversation.item.truncate'
35
+ | 'conversation.item.delete'
36
+ | 'response.create'
37
+ | 'response.cancel';
38
+ export type ServerEventType =
39
+ | 'error'
40
+ | 'session.created'
41
+ | 'session.updated'
42
+ | 'conversation.created'
43
+ | 'input_audio_buffer.committed'
44
+ | 'input_audio_buffer.cleared'
45
+ | 'input_audio_buffer.speech_started'
46
+ | 'input_audio_buffer.speech_stopped'
47
+ | 'conversation.item.created'
48
+ | 'conversation.item.input_audio_transcription.completed'
49
+ | 'conversation.item.input_audio_transcription.failed'
50
+ | 'conversation.item.truncated'
51
+ | 'conversation.item.deleted'
52
+ | 'response.created'
53
+ | 'response.done'
54
+ | 'response.output_item.added'
55
+ | 'response.output_item.done'
56
+ | 'response.content_part.added'
57
+ | 'response.content_part.done'
58
+ | 'response.text.delta'
59
+ | 'response.text.done'
60
+ | 'response.audio_transcript.delta'
61
+ | 'response.audio_transcript.done'
62
+ | 'response.audio.delta'
63
+ | 'response.audio.done'
64
+ | 'response.function_call_arguments.delta'
65
+ | 'response.function_call_arguments.done'
66
+ | 'rate_limits.updated';
67
+
68
+ export type AudioBase64Bytes = string;
69
+
70
+ export interface Tool {
71
+ type: 'function';
72
+ name: string;
73
+ description?: string;
74
+ parameters: {
75
+ type: 'object';
76
+ properties: {
77
+ [prop: string]: {
78
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
79
+ [prop: string]: any;
80
+ };
81
+ };
82
+ required_properties: string[];
83
+ };
84
+ }
85
+
86
+ export type TurnDetectionType =
87
+ | {
88
+ type: 'server_vad';
89
+ threshold?: number; // 0.0 to 1.0, default: 0.5
90
+ prefix_padding_ms?: number; // default: 300
91
+ silence_duration_ms?: number; // default: 200
92
+ }
93
+ | {
94
+ type: 'none';
95
+ };
96
+
97
+ export type InputAudioTranscription = {
98
+ model: InputTranscriptionModel;
99
+ };
100
+
101
+ export interface InputTextContent {
102
+ type: 'input_text';
103
+ text: string;
104
+ }
105
+
106
+ export interface InputAudioContent {
107
+ type: 'input_audio';
108
+ audio: AudioBase64Bytes;
109
+ }
110
+
111
+ export interface TextContent {
112
+ type: 'text';
113
+ text: string;
114
+ }
115
+
116
+ export interface AudioContent {
117
+ type: 'audio';
118
+ audio: AudioBase64Bytes;
119
+ transcript: string;
120
+ }
121
+
122
+ export type Content = InputTextContent | InputAudioContent | TextContent | AudioContent;
123
+ export type ContentPart = {
124
+ type: 'text' | 'audio';
125
+ audio?: AudioBase64Bytes;
126
+ transcript?: string;
127
+ };
128
+
129
+ export interface BaseItem {
130
+ id: string;
131
+ object: 'realtime.item';
132
+ type: string;
133
+ }
134
+
135
+ export interface SystemItem extends BaseItem {
136
+ type: 'message';
137
+ role: 'system';
138
+ content: InputTextContent;
139
+ }
140
+
141
+ export interface UserItem extends BaseItem {
142
+ type: 'message';
143
+ role: 'user';
144
+ content: (InputTextContent | InputAudioContent)[];
145
+ }
146
+
147
+ export interface AssistantItem extends BaseItem {
148
+ type: 'message';
149
+ role: 'assistant';
150
+ content: (TextContent | AudioContent)[];
151
+ }
152
+
153
+ export interface FunctionCallItem extends BaseItem {
154
+ type: 'function_call';
155
+ call_id: string;
156
+ name: string;
157
+ arguments: string;
158
+ }
159
+
160
+ export interface FunctionCallOutputItem extends BaseItem {
161
+ type: 'function_call_output';
162
+ call_id: string;
163
+ output: string;
164
+ }
165
+
166
+ export type ItemResource =
167
+ | SystemItem
168
+ | UserItem
169
+ | AssistantItem
170
+ | FunctionCallItem
171
+ | FunctionCallOutputItem;
172
+
173
+ // Session Resource
174
+ export interface SessionResource {
175
+ id: string;
176
+ object: 'realtime.session';
177
+ model: string;
178
+ modalities: ['text', 'audio'] | ['text']; // default: ["text", "audio"]
179
+ instructions?: string; // default: null
180
+ voice: Voice; // default: "alloy"
181
+ input_audio_format: AudioFormat; // default: "pcm16"
182
+ output_audio_format: AudioFormat; // default: "pcm16"
183
+ input_audio_transcription?: InputAudioTranscription; // default: null
184
+ turn_detection: TurnDetectionType;
185
+ tools: Tool[];
186
+ tool_choice: ToolChoice; // default: "auto"
187
+ temperature: number; // default: 0.8
188
+ max_response_output_tokens: number | null;
189
+ }
190
+
191
+ // Conversation Resource
192
+ export interface ConversationResource {
193
+ id: string;
194
+ object: 'realtime.conversation';
195
+ }
196
+
197
+ export type ResponseStatusDetails =
198
+ | {
199
+ type: 'incomplete';
200
+ reason: 'max_output_tokens' | 'content_filter' | string;
201
+ }
202
+ | {
203
+ type: 'failed';
204
+ error?: {
205
+ code: 'server_error' | 'rate_limit_exceeded' | string;
206
+ message: string;
207
+ };
208
+ }
209
+ | {
210
+ type: 'cancelled';
211
+ reason: 'turn_detected' | 'client_cancelled' | string;
212
+ };
213
+
214
+ export interface ResponseResource {
215
+ id: string;
216
+ object: 'realtime.response';
217
+ status: ResponseStatus;
218
+ status_details: ResponseStatusDetails;
219
+ output: ItemResource[];
220
+ usage?: {
221
+ total_tokens: number;
222
+ input_tokens: number;
223
+ output_tokens: number;
224
+ };
225
+ }
226
+
227
+ // Client Events
228
+ interface BaseClientEvent {
229
+ event_id?: string;
230
+ type: ClientEventType;
231
+ }
232
+
233
+ export interface SessionUpdateEvent extends BaseClientEvent {
234
+ type: 'session.update';
235
+ session: Partial<{
236
+ modalities: ['text', 'audio'] | ['text'];
237
+ instructions: string;
238
+ voice: Voice;
239
+ input_audio_format: AudioFormat;
240
+ output_audio_format: AudioFormat;
241
+ input_audio_transcription?: InputAudioTranscription;
242
+ turn_detection: TurnDetectionType;
243
+ tools: Tool[];
244
+ tool_choice: ToolChoice;
245
+ temperature: number;
246
+ max_response_output_tokens: number;
247
+ }>;
248
+ }
249
+
250
+ export interface InputAudioBufferAppendEvent extends BaseClientEvent {
251
+ type: 'input_audio_buffer.append';
252
+ audio: AudioBase64Bytes;
253
+ }
254
+
255
+ export interface InputAudioBufferCommitEvent extends BaseClientEvent {
256
+ type: 'input_audio_buffer.commit';
257
+ }
258
+
259
+ export interface InputAudioBufferClearEvent extends BaseClientEvent {
260
+ type: 'input_audio_buffer.clear';
261
+ }
262
+
263
+ export interface UserItemCreate {
264
+ type: 'message';
265
+ role: 'user';
266
+ content: (InputTextContent | InputAudioContent)[];
267
+ }
268
+
269
+ export interface AssistantItemCreate {
270
+ type: 'message';
271
+ role: 'assistant';
272
+ content: TextContent[];
273
+ }
274
+
275
+ export interface SystemItemCreate {
276
+ type: 'message';
277
+ role: 'system';
278
+ content: InputTextContent[];
279
+ }
280
+
281
+ export interface FunctionCallOutputItemCreate {
282
+ type: 'function_call_output';
283
+ call_id: string;
284
+ output: string;
285
+ }
286
+
287
+ export type ConversationItemCreateContent =
288
+ | UserItemCreate
289
+ | AssistantItemCreate
290
+ | SystemItemCreate
291
+ | FunctionCallOutputItemCreate;
292
+
293
+ export interface ConversationItemCreateEvent extends BaseClientEvent {
294
+ type: 'conversation.item.create';
295
+ previous_item_id?: string;
296
+ item: ConversationItemCreateContent;
297
+ }
298
+
299
+ export interface ConversationItemTruncateEvent extends BaseClientEvent {
300
+ type: 'conversation.item.truncate';
301
+ item_id: string;
302
+ content_index: number;
303
+ audio_end_ms: number;
304
+ }
305
+
306
+ export interface ConversationItemDeleteEvent extends BaseClientEvent {
307
+ type: 'conversation.item.delete';
308
+ item_id: string;
309
+ }
310
+
311
+ export interface ResponseCreateEvent extends BaseClientEvent {
312
+ type: 'response.create';
313
+ response?: Partial<{
314
+ modalities: ['text', 'audio'] | ['text'];
315
+ instructions: string;
316
+ voice: Voice;
317
+ output_audio_format: AudioFormat;
318
+ tools?: Tool[];
319
+ tool_choice: ToolChoice;
320
+ temperature: number;
321
+ max_response_output_tokens: number;
322
+ }>;
323
+ }
324
+
325
+ export interface ResponseCancelEvent extends BaseClientEvent {
326
+ type: 'response.cancel';
327
+ }
328
+
329
+ export type ClientEvent =
330
+ | SessionUpdateEvent
331
+ | InputAudioBufferAppendEvent
332
+ | InputAudioBufferCommitEvent
333
+ | InputAudioBufferClearEvent
334
+ | ConversationItemCreateEvent
335
+ | ConversationItemTruncateEvent
336
+ | ConversationItemDeleteEvent
337
+ | ResponseCreateEvent
338
+ | ResponseCancelEvent;
339
+
340
+ interface BaseServerEvent {
341
+ event_id: string;
342
+ type: ServerEventType;
343
+ }
344
+
345
+ export interface ErrorEvent extends BaseServerEvent {
346
+ type: 'error';
347
+ error: {
348
+ type: 'invalid_request_error' | 'server_error' | string;
349
+ code?: string;
350
+ message: string;
351
+ param: string;
352
+ event_id: string;
353
+ };
354
+ }
355
+
356
+ export interface SessionCreatedEvent extends BaseServerEvent {
357
+ type: 'session.created';
358
+ session: SessionResource;
359
+ }
360
+
361
+ export interface SessionUpdatedEvent extends BaseServerEvent {
362
+ type: 'session.updated';
363
+ session: SessionResource;
364
+ }
365
+
366
+ export interface ConversationCreatedEvent extends BaseServerEvent {
367
+ type: 'conversation.created';
368
+ conversation: ConversationResource;
369
+ }
370
+
371
+ export interface InputAudioBufferCommittedEvent extends BaseServerEvent {
372
+ type: 'input_audio_buffer.committed';
373
+ item_id: string;
374
+ }
375
+
376
+ export interface InputAudioBufferClearedEvent extends BaseServerEvent {
377
+ type: 'input_audio_buffer.cleared';
378
+ }
379
+
380
+ export interface InputAudioBufferSpeechStartedEvent extends BaseServerEvent {
381
+ type: 'input_audio_buffer.speech_started';
382
+ audio_start_ms: number;
383
+ item_id: string;
384
+ }
385
+
386
+ export interface InputAudioBufferSpeechStoppedEvent extends BaseServerEvent {
387
+ type: 'input_audio_buffer.speech_stopped';
388
+ audio_end_ms: number;
389
+ item_id: string;
390
+ }
391
+
392
+ export interface ConversationItemCreatedEvent extends BaseServerEvent {
393
+ type: 'conversation.item.created';
394
+ item: ItemResource;
395
+ }
396
+
397
+ export interface ConversationItemInputAudioTranscriptionCompletedEvent extends BaseServerEvent {
398
+ type: 'conversation.item.input_audio_transcription.completed';
399
+ item_id: string;
400
+ content_index: number;
401
+ transcript: string;
402
+ }
403
+
404
+ export interface ConversationItemInputAudioTranscriptionFailedEvent extends BaseServerEvent {
405
+ type: 'conversation.item.input_audio_transcription.failed';
406
+ item_id: string;
407
+ content_index: number;
408
+ error: {
409
+ type: string;
410
+ code?: string;
411
+ message: string;
412
+ param: null;
413
+ };
414
+ }
415
+
416
+ export interface ConversationItemTruncatedEvent extends BaseServerEvent {
417
+ type: 'conversation.item.truncated';
418
+ item_id: string;
419
+ content_index: number;
420
+ audio_end_ms: number;
421
+ }
422
+
423
+ export interface ConversationItemDeletedEvent extends BaseServerEvent {
424
+ type: 'conversation.item.deleted';
425
+ item_id: string;
426
+ }
427
+
428
+ export interface ResponseCreatedEvent extends BaseServerEvent {
429
+ type: 'response.created';
430
+ response: ResponseResource;
431
+ }
432
+
433
+ export interface ResponseDoneEvent extends BaseServerEvent {
434
+ type: 'response.done';
435
+ response: ResponseResource;
436
+ }
437
+
438
+ export interface ResponseOutputItemAddedEvent extends BaseServerEvent {
439
+ type: 'response.output_item.added';
440
+ response_id: string;
441
+ output_index: number;
442
+ item: ItemResource;
443
+ }
444
+
445
+ export interface ResponseOutputItemDoneEvent extends BaseServerEvent {
446
+ type: 'response.output_item.done';
447
+ response_id: string;
448
+ output_index: number;
449
+ item: ItemResource;
450
+ }
451
+
452
+ export interface ResponseContentPartAddedEvent extends BaseServerEvent {
453
+ type: 'response.content_part.added';
454
+ response_id: string;
455
+ item_id: string;
456
+ output_index: number;
457
+ content_index: number;
458
+ part: ContentPart;
459
+ }
460
+
461
+ export interface ResponseContentPartDoneEvent extends BaseServerEvent {
462
+ type: 'response.content_part.done';
463
+ response_id: string;
464
+ output_index: number;
465
+ content_index: number;
466
+ part: ContentPart;
467
+ }
468
+
469
+ export interface ResponseTextDeltaEvent extends BaseServerEvent {
470
+ type: 'response.text.delta';
471
+ response_id: string;
472
+ output_index: number;
473
+ content_index: number;
474
+ delta: string;
475
+ }
476
+
477
+ export interface ResponseTextDoneEvent extends BaseServerEvent {
478
+ type: 'response.text.done';
479
+ response_id: string;
480
+ output_index: number;
481
+ content_index: number;
482
+ text: string;
483
+ }
484
+
485
+ export interface ResponseAudioTranscriptDeltaEvent extends BaseServerEvent {
486
+ type: 'response.audio_transcript.delta';
487
+ response_id: string;
488
+ output_index: number;
489
+ content_index: number;
490
+ delta: string;
491
+ }
492
+
493
+ export interface ResponseAudioTranscriptDoneEvent extends BaseServerEvent {
494
+ type: 'response.audio_transcript.done';
495
+ response_id: string;
496
+ output_index: number;
497
+ content_index: number;
498
+ transcript: string;
499
+ }
500
+
501
+ export interface ResponseAudioDeltaEvent extends BaseServerEvent {
502
+ type: 'response.audio.delta';
503
+ response_id: string;
504
+ output_index: number;
505
+ content_index: number;
506
+ delta: AudioBase64Bytes;
507
+ }
508
+
509
+ export interface ResponseAudioDoneEvent extends BaseServerEvent {
510
+ type: 'response.audio.done';
511
+ response_id: string;
512
+ output_index: number;
513
+ content_index: number;
514
+ }
515
+
516
+ export interface ResponseFunctionCallArgumentsDeltaEvent extends BaseServerEvent {
517
+ type: 'response.function_call_arguments.delta';
518
+ response_id: string;
519
+ output_index: number;
520
+ delta: string;
521
+ }
522
+
523
+ export interface ResponseFunctionCallArgumentsDoneEvent extends BaseServerEvent {
524
+ type: 'response.function_call_arguments.done';
525
+ response_id: string;
526
+ output_index: number;
527
+ arguments: string;
528
+ }
529
+
530
+ export interface RateLimitsUpdatedEvent extends BaseServerEvent {
531
+ type: 'rate_limits.updated';
532
+ rate_limits: {
533
+ name: 'requests' | 'tokens' | 'input_tokens' | 'output_tokens' | string;
534
+ limit: number;
535
+ remaining: number;
536
+ reset_seconds: number;
537
+ }[];
538
+ }
539
+
540
+ export type ServerEvent =
541
+ | ErrorEvent
542
+ | SessionCreatedEvent
543
+ | SessionUpdatedEvent
544
+ | ConversationCreatedEvent
545
+ | InputAudioBufferCommittedEvent
546
+ | InputAudioBufferClearedEvent
547
+ | InputAudioBufferSpeechStartedEvent
548
+ | InputAudioBufferSpeechStoppedEvent
549
+ | ConversationItemCreatedEvent
550
+ | ConversationItemInputAudioTranscriptionCompletedEvent
551
+ | ConversationItemInputAudioTranscriptionFailedEvent
552
+ | ConversationItemTruncatedEvent
553
+ | ConversationItemDeletedEvent
554
+ | ResponseCreatedEvent
555
+ | ResponseDoneEvent
556
+ | ResponseOutputItemAddedEvent
557
+ | ResponseOutputItemDoneEvent
558
+ | ResponseContentPartAddedEvent
559
+ | ResponseContentPartDoneEvent
560
+ | ResponseTextDeltaEvent
561
+ | ResponseTextDoneEvent
562
+ | ResponseAudioTranscriptDeltaEvent
563
+ | ResponseAudioTranscriptDoneEvent
564
+ | ResponseAudioDeltaEvent
565
+ | ResponseAudioDoneEvent
566
+ | ResponseFunctionCallArgumentsDeltaEvent
567
+ | ResponseFunctionCallArgumentsDoneEvent
568
+ | RateLimitsUpdatedEvent;
package/src/realtime/index.ts +5 -0
@@ -0,0 +1,5 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ export * from './api_proto.js';
5
+ export * from './realtime_model.js';