@livekit/agents-plugin-openai 0.2.0 → 0.3.1

This diff shows the changes between two publicly released versions of this package, exactly as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (43)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +28 -0
  3. package/dist/index.d.ts +1 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +1 -1
  6. package/dist/index.js.map +1 -1
  7. package/dist/realtime/api_proto.d.ts +399 -0
  8. package/dist/realtime/api_proto.d.ts.map +1 -0
  9. package/dist/realtime/api_proto.js +9 -0
  10. package/dist/realtime/api_proto.js.map +1 -0
  11. package/dist/realtime/index.d.ts +3 -0
  12. package/dist/realtime/index.d.ts.map +1 -0
  13. package/dist/realtime/index.js +6 -0
  14. package/dist/realtime/index.js.map +1 -0
  15. package/dist/realtime/realtime_model.d.ts +149 -0
  16. package/dist/realtime/realtime_model.d.ts.map +1 -0
  17. package/dist/realtime/realtime_model.js +571 -0
  18. package/dist/realtime/realtime_model.js.map +1 -0
  19. package/package.json +5 -3
  20. package/src/index.ts +1 -2
  21. package/src/realtime/api_proto.ts +565 -0
  22. package/src/realtime/index.ts +5 -0
  23. package/src/realtime/realtime_model.ts +859 -0
  24. package/dist/omni_assistant/agent_playout.d.ts +0 -27
  25. package/dist/omni_assistant/agent_playout.d.ts.map +0 -1
  26. package/dist/omni_assistant/agent_playout.js +0 -111
  27. package/dist/omni_assistant/agent_playout.js.map +0 -1
  28. package/dist/omni_assistant/index.d.ts +0 -61
  29. package/dist/omni_assistant/index.d.ts.map +0 -1
  30. package/dist/omni_assistant/index.js +0 -453
  31. package/dist/omni_assistant/index.js.map +0 -1
  32. package/dist/omni_assistant/proto.d.ts +0 -218
  33. package/dist/omni_assistant/proto.d.ts.map +0 -1
  34. package/dist/omni_assistant/proto.js +0 -68
  35. package/dist/omni_assistant/proto.js.map +0 -1
  36. package/dist/omni_assistant/transcription_forwarder.d.ts +0 -28
  37. package/dist/omni_assistant/transcription_forwarder.d.ts.map +0 -1
  38. package/dist/omni_assistant/transcription_forwarder.js +0 -117
  39. package/dist/omni_assistant/transcription_forwarder.js.map +0 -1
  40. package/src/omni_assistant/agent_playout.ts +0 -127
  41. package/src/omni_assistant/index.ts +0 -547
  42. package/src/omni_assistant/proto.ts +0 -280
  43. package/src/omni_assistant/transcription_forwarder.ts +0 -128
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@livekit/agents-plugin-openai",
3
- "version": "0.2.0",
3
+ "version": "0.3.1",
4
4
  "description": "OpenAI plugin for LiveKit Node Agents",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -12,12 +12,14 @@
12
12
  "typescript": "^5.0.0"
13
13
  },
14
14
  "dependencies": {
15
- "@livekit/rtc-node": "^0.8.1",
15
+ "@livekit/rtc-node": "^0.9.0",
16
16
  "ws": "^8.16.0",
17
- "@livekit/agents": "0.2.0"
17
+ "@livekit/agents": "0.3.1"
18
18
  },
19
19
  "scripts": {
20
20
  "build": "tsc",
21
+ "clean": "rm -rf dist",
22
+ "clean:build": "pnpm clean && pnpm build",
21
23
  "lint": "eslint -f unix \"src/**/*.{ts,js}\"",
22
24
  "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript",
23
25
  "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose"
package/src/index.ts CHANGED
@@ -1,5 +1,4 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
-
5
- export * from './omni_assistant/index.js';
4
+ export * as realtime from './realtime/index.js';
package/src/realtime/api_proto.ts ADDED
@@ -0,0 +1,565 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ export const SAMPLE_RATE = 24000;
6
+ export const NUM_CHANNELS = 1;
7
+ export const IN_FRAME_SIZE = 2400; // 100ms
8
+ export const OUT_FRAME_SIZE = 1200; // 50ms
9
+
10
+ export const API_URL = 'wss://api.openai.com/v1/realtime';
11
+
12
+ export type Model = 'gpt-4o-realtime-preview-2024-10-01' | string; // Open-ended, for future models
13
+ export type Voice = 'alloy' | 'shimmer' | 'echo' | string;
14
+ export type AudioFormat = 'pcm16'; // TODO: 'g711-ulaw' | 'g711-alaw'
15
+ export type Role = 'system' | 'assistant' | 'user' | 'tool';
16
+ export type GenerationFinishedReason = 'stop' | 'max_tokens' | 'content_filter' | 'interrupt';
17
+ export type InputTranscriptionModel = 'whisper-1' | string; // Open-ended, for future models
18
+ export type Modality = 'text' | 'audio';
19
+ export type ToolChoice = 'auto' | 'none' | 'required' | string;
20
+ export type State = 'initializing' | 'listening' | 'thinking' | 'speaking' | string;
21
+ export type ResponseStatus =
22
+ | 'in_progress'
23
+ | 'completed'
24
+ | 'incomplete'
25
+ | 'cancelled'
26
+ | 'failed'
27
+ | string;
28
+ export type ClientEventType =
29
+ | 'session.update'
30
+ | 'input_audio_buffer.append'
31
+ | 'input_audio_buffer.commit'
32
+ | 'input_audio_buffer.clear'
33
+ | 'conversation.item.create'
34
+ | 'conversation.item.truncate'
35
+ | 'conversation.item.delete'
36
+ | 'response.create'
37
+ | 'response.cancel';
38
+ export type ServerEventType =
39
+ | 'error'
40
+ | 'session.created'
41
+ | 'session.updated'
42
+ | 'conversation.created'
43
+ | 'input_audio_buffer.committed'
44
+ | 'input_audio_buffer.cleared'
45
+ | 'input_audio_buffer.speech_started'
46
+ | 'input_audio_buffer.speech_stopped'
47
+ | 'conversation.item.created'
48
+ | 'conversation.item.input_audio_transcription.completed'
49
+ | 'conversation.item.input_audio_transcription.failed'
50
+ | 'conversation.item.truncated'
51
+ | 'conversation.item.deleted'
52
+ | 'response.created'
53
+ | 'response.done'
54
+ | 'response.output_item.added'
55
+ | 'response.output_item.done'
56
+ | 'response.content_part.added'
57
+ | 'response.content_part.done'
58
+ | 'response.text.delta'
59
+ | 'response.text.done'
60
+ | 'response.audio_transcript.delta'
61
+ | 'response.audio_transcript.done'
62
+ | 'response.audio.delta'
63
+ | 'response.audio.done'
64
+ | 'response.function_call_arguments.delta'
65
+ | 'response.function_call_arguments.done'
66
+ | 'rate_limits.updated';
67
+
68
+ export type AudioBase64Bytes = string;
69
+
70
+ export interface Tool {
71
+ type: 'function';
72
+ name: string;
73
+ description?: string;
74
+ parameters: {
75
+ type: 'object';
76
+ properties: {
77
+ [prop: string]: {
78
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
79
+ [prop: string]: any;
80
+ };
81
+ };
82
+ required_properties: string[];
83
+ };
84
+ }
85
+
86
+ export type TurnDetectionType = {
87
+ type: 'server_vad';
88
+ threshold?: number; // 0.0 to 1.0, default: 0.5
89
+ prefix_padding_ms?: number; // default: 300
90
+ silence_duration_ms?: number; // default: 200
91
+ };
92
+
93
+ export type InputAudioTranscription = {
94
+ model: InputTranscriptionModel;
95
+ };
96
+
97
+ export interface InputTextContent {
98
+ type: 'input_text';
99
+ text: string;
100
+ }
101
+
102
+ export interface InputAudioContent {
103
+ type: 'input_audio';
104
+ audio: AudioBase64Bytes;
105
+ }
106
+
107
+ export interface TextContent {
108
+ type: 'text';
109
+ text: string;
110
+ }
111
+
112
+ export interface AudioContent {
113
+ type: 'audio';
114
+ audio: AudioBase64Bytes;
115
+ transcript: string;
116
+ }
117
+
118
+ export type Content = InputTextContent | InputAudioContent | TextContent | AudioContent;
119
+ export type ContentPart = {
120
+ type: 'text' | 'audio';
121
+ audio?: AudioBase64Bytes;
122
+ transcript?: string;
123
+ };
124
+
125
+ export interface BaseItem {
126
+ id: string;
127
+ object: 'realtime.item';
128
+ type: string;
129
+ }
130
+
131
+ export interface SystemItem extends BaseItem {
132
+ type: 'message';
133
+ role: 'system';
134
+ content: InputTextContent;
135
+ }
136
+
137
+ export interface UserItem extends BaseItem {
138
+ type: 'message';
139
+ role: 'user';
140
+ content: (InputTextContent | InputAudioContent)[];
141
+ }
142
+
143
+ export interface AssistantItem extends BaseItem {
144
+ type: 'message';
145
+ role: 'assistant';
146
+ content: (TextContent | AudioContent)[];
147
+ }
148
+
149
+ export interface FunctionCallItem extends BaseItem {
150
+ type: 'function_call';
151
+ call_id: string;
152
+ name: string;
153
+ arguments: string;
154
+ }
155
+
156
+ export interface FunctionCallOutputItem extends BaseItem {
157
+ type: 'function_call_output';
158
+ call_id: string;
159
+ output: string;
160
+ }
161
+
162
+ export type ItemResource =
163
+ | SystemItem
164
+ | UserItem
165
+ | AssistantItem
166
+ | FunctionCallItem
167
+ | FunctionCallOutputItem;
168
+
169
+ // Session Resource
170
+ export interface SessionResource {
171
+ id: string;
172
+ object: 'realtime.session';
173
+ model: string;
174
+ modalities: ['text', 'audio'] | ['text']; // default: ["text", "audio"]
175
+ instructions: string;
176
+ voice: Voice; // default: "alloy"
177
+ input_audio_format: AudioFormat; // default: "pcm16"
178
+ output_audio_format: AudioFormat; // default: "pcm16"
179
+ input_audio_transcription: InputAudioTranscription | null;
180
+ turn_detection: TurnDetectionType | null;
181
+ tools: Tool[];
182
+ tool_choice: ToolChoice; // default: "auto"
183
+ temperature: number; // default: 0.8
184
+ max_response_output_tokens: number | 'inf';
185
+ expires_at: number;
186
+ }
187
+
188
+ // Conversation Resource
189
+ export interface ConversationResource {
190
+ id: string;
191
+ object: 'realtime.conversation';
192
+ }
193
+
194
+ export type ResponseStatusDetails =
195
+ | {
196
+ type: 'incomplete';
197
+ reason: 'max_output_tokens' | 'content_filter' | string;
198
+ }
199
+ | {
200
+ type: 'failed';
201
+ error?: {
202
+ code: 'server_error' | 'rate_limit_exceeded' | string;
203
+ message: string;
204
+ };
205
+ }
206
+ | {
207
+ type: 'cancelled';
208
+ reason: 'turn_detected' | 'client_cancelled' | string;
209
+ };
210
+
211
+ export interface ResponseResource {
212
+ id: string;
213
+ object: 'realtime.response';
214
+ status: ResponseStatus;
215
+ status_details: ResponseStatusDetails;
216
+ output: ItemResource[];
217
+ usage?: {
218
+ total_tokens: number;
219
+ input_tokens: number;
220
+ output_tokens: number;
221
+ };
222
+ }
223
+
224
+ // Client Events
225
+ interface BaseClientEvent {
226
+ event_id?: string;
227
+ type: ClientEventType;
228
+ }
229
+
230
+ export interface SessionUpdateEvent extends BaseClientEvent {
231
+ type: 'session.update';
232
+ session: Partial<{
233
+ modalities: ['text', 'audio'] | ['text'];
234
+ instructions: string;
235
+ voice: Voice;
236
+ input_audio_format: AudioFormat;
237
+ output_audio_format: AudioFormat;
238
+ input_audio_transcription: InputAudioTranscription | null;
239
+ turn_detection: TurnDetectionType | null;
240
+ tools: Tool[];
241
+ tool_choice: ToolChoice;
242
+ temperature: number;
243
+ max_response_output_tokens: number | 'inf';
244
+ }>;
245
+ }
246
+
247
+ export interface InputAudioBufferAppendEvent extends BaseClientEvent {
248
+ type: 'input_audio_buffer.append';
249
+ audio: AudioBase64Bytes;
250
+ }
251
+
252
+ export interface InputAudioBufferCommitEvent extends BaseClientEvent {
253
+ type: 'input_audio_buffer.commit';
254
+ }
255
+
256
+ export interface InputAudioBufferClearEvent extends BaseClientEvent {
257
+ type: 'input_audio_buffer.clear';
258
+ }
259
+
260
+ export interface UserItemCreate {
261
+ type: 'message';
262
+ role: 'user';
263
+ content: (InputTextContent | InputAudioContent)[];
264
+ }
265
+
266
+ export interface AssistantItemCreate {
267
+ type: 'message';
268
+ role: 'assistant';
269
+ content: TextContent[];
270
+ }
271
+
272
+ export interface SystemItemCreate {
273
+ type: 'message';
274
+ role: 'system';
275
+ content: InputTextContent[];
276
+ }
277
+
278
+ export interface FunctionCallOutputItemCreate {
279
+ type: 'function_call_output';
280
+ call_id: string;
281
+ output: string;
282
+ }
283
+
284
+ export type ConversationItemCreateContent =
285
+ | UserItemCreate
286
+ | AssistantItemCreate
287
+ | SystemItemCreate
288
+ | FunctionCallOutputItemCreate;
289
+
290
+ export interface ConversationItemCreateEvent extends BaseClientEvent {
291
+ type: 'conversation.item.create';
292
+ previous_item_id?: string;
293
+ item: ConversationItemCreateContent;
294
+ }
295
+
296
+ export interface ConversationItemTruncateEvent extends BaseClientEvent {
297
+ type: 'conversation.item.truncate';
298
+ item_id: string;
299
+ content_index: number;
300
+ audio_end_ms: number;
301
+ }
302
+
303
+ export interface ConversationItemDeleteEvent extends BaseClientEvent {
304
+ type: 'conversation.item.delete';
305
+ item_id: string;
306
+ }
307
+
308
+ export interface ResponseCreateEvent extends BaseClientEvent {
309
+ type: 'response.create';
310
+ response?: Partial<{
311
+ modalities: ['text', 'audio'] | ['text'];
312
+ instructions: string;
313
+ voice: Voice;
314
+ output_audio_format: AudioFormat;
315
+ tools?: Tool[];
316
+ tool_choice: ToolChoice;
317
+ temperature: number;
318
+ max_output_tokens: number | 'inf';
319
+ }>;
320
+ }
321
+
322
+ export interface ResponseCancelEvent extends BaseClientEvent {
323
+ type: 'response.cancel';
324
+ }
325
+
326
+ export type ClientEvent =
327
+ | SessionUpdateEvent
328
+ | InputAudioBufferAppendEvent
329
+ | InputAudioBufferCommitEvent
330
+ | InputAudioBufferClearEvent
331
+ | ConversationItemCreateEvent
332
+ | ConversationItemTruncateEvent
333
+ | ConversationItemDeleteEvent
334
+ | ResponseCreateEvent
335
+ | ResponseCancelEvent;
336
+
337
+ interface BaseServerEvent {
338
+ event_id: string;
339
+ type: ServerEventType;
340
+ }
341
+
342
+ export interface ErrorEvent extends BaseServerEvent {
343
+ type: 'error';
344
+ error: {
345
+ type: 'invalid_request_error' | 'server_error' | string;
346
+ code?: string;
347
+ message: string;
348
+ param: string;
349
+ event_id: string;
350
+ };
351
+ }
352
+
353
+ export interface SessionCreatedEvent extends BaseServerEvent {
354
+ type: 'session.created';
355
+ session: SessionResource;
356
+ }
357
+
358
+ export interface SessionUpdatedEvent extends BaseServerEvent {
359
+ type: 'session.updated';
360
+ session: SessionResource;
361
+ }
362
+
363
+ export interface ConversationCreatedEvent extends BaseServerEvent {
364
+ type: 'conversation.created';
365
+ conversation: ConversationResource;
366
+ }
367
+
368
+ export interface InputAudioBufferCommittedEvent extends BaseServerEvent {
369
+ type: 'input_audio_buffer.committed';
370
+ item_id: string;
371
+ }
372
+
373
+ export interface InputAudioBufferClearedEvent extends BaseServerEvent {
374
+ type: 'input_audio_buffer.cleared';
375
+ }
376
+
377
+ export interface InputAudioBufferSpeechStartedEvent extends BaseServerEvent {
378
+ type: 'input_audio_buffer.speech_started';
379
+ audio_start_ms: number;
380
+ item_id: string;
381
+ }
382
+
383
+ export interface InputAudioBufferSpeechStoppedEvent extends BaseServerEvent {
384
+ type: 'input_audio_buffer.speech_stopped';
385
+ audio_end_ms: number;
386
+ item_id: string;
387
+ }
388
+
389
+ export interface ConversationItemCreatedEvent extends BaseServerEvent {
390
+ type: 'conversation.item.created';
391
+ item: ItemResource;
392
+ }
393
+
394
+ export interface ConversationItemInputAudioTranscriptionCompletedEvent extends BaseServerEvent {
395
+ type: 'conversation.item.input_audio_transcription.completed';
396
+ item_id: string;
397
+ content_index: number;
398
+ transcript: string;
399
+ }
400
+
401
+ export interface ConversationItemInputAudioTranscriptionFailedEvent extends BaseServerEvent {
402
+ type: 'conversation.item.input_audio_transcription.failed';
403
+ item_id: string;
404
+ content_index: number;
405
+ error: {
406
+ type: string;
407
+ code?: string;
408
+ message: string;
409
+ param: null;
410
+ };
411
+ }
412
+
413
+ export interface ConversationItemTruncatedEvent extends BaseServerEvent {
414
+ type: 'conversation.item.truncated';
415
+ item_id: string;
416
+ content_index: number;
417
+ audio_end_ms: number;
418
+ }
419
+
420
+ export interface ConversationItemDeletedEvent extends BaseServerEvent {
421
+ type: 'conversation.item.deleted';
422
+ item_id: string;
423
+ }
424
+
425
+ export interface ResponseCreatedEvent extends BaseServerEvent {
426
+ type: 'response.created';
427
+ response: ResponseResource;
428
+ }
429
+
430
+ export interface ResponseDoneEvent extends BaseServerEvent {
431
+ type: 'response.done';
432
+ response: ResponseResource;
433
+ }
434
+
435
+ export interface ResponseOutputItemAddedEvent extends BaseServerEvent {
436
+ type: 'response.output_item.added';
437
+ response_id: string;
438
+ output_index: number;
439
+ item: ItemResource;
440
+ }
441
+
442
+ export interface ResponseOutputItemDoneEvent extends BaseServerEvent {
443
+ type: 'response.output_item.done';
444
+ response_id: string;
445
+ output_index: number;
446
+ item: ItemResource;
447
+ }
448
+
449
+ export interface ResponseContentPartAddedEvent extends BaseServerEvent {
450
+ type: 'response.content_part.added';
451
+ response_id: string;
452
+ item_id: string;
453
+ output_index: number;
454
+ content_index: number;
455
+ part: ContentPart;
456
+ }
457
+
458
+ export interface ResponseContentPartDoneEvent extends BaseServerEvent {
459
+ type: 'response.content_part.done';
460
+ response_id: string;
461
+ output_index: number;
462
+ content_index: number;
463
+ part: ContentPart;
464
+ }
465
+
466
+ export interface ResponseTextDeltaEvent extends BaseServerEvent {
467
+ type: 'response.text.delta';
468
+ response_id: string;
469
+ output_index: number;
470
+ content_index: number;
471
+ delta: string;
472
+ }
473
+
474
+ export interface ResponseTextDoneEvent extends BaseServerEvent {
475
+ type: 'response.text.done';
476
+ response_id: string;
477
+ output_index: number;
478
+ content_index: number;
479
+ text: string;
480
+ }
481
+
482
+ export interface ResponseAudioTranscriptDeltaEvent extends BaseServerEvent {
483
+ type: 'response.audio_transcript.delta';
484
+ response_id: string;
485
+ output_index: number;
486
+ content_index: number;
487
+ delta: string;
488
+ }
489
+
490
+ export interface ResponseAudioTranscriptDoneEvent extends BaseServerEvent {
491
+ type: 'response.audio_transcript.done';
492
+ response_id: string;
493
+ output_index: number;
494
+ content_index: number;
495
+ transcript: string;
496
+ }
497
+
498
+ export interface ResponseAudioDeltaEvent extends BaseServerEvent {
499
+ type: 'response.audio.delta';
500
+ response_id: string;
501
+ output_index: number;
502
+ content_index: number;
503
+ delta: AudioBase64Bytes;
504
+ }
505
+
506
+ export interface ResponseAudioDoneEvent extends BaseServerEvent {
507
+ type: 'response.audio.done';
508
+ response_id: string;
509
+ output_index: number;
510
+ content_index: number;
511
+ }
512
+
513
+ export interface ResponseFunctionCallArgumentsDeltaEvent extends BaseServerEvent {
514
+ type: 'response.function_call_arguments.delta';
515
+ response_id: string;
516
+ output_index: number;
517
+ delta: string;
518
+ }
519
+
520
+ export interface ResponseFunctionCallArgumentsDoneEvent extends BaseServerEvent {
521
+ type: 'response.function_call_arguments.done';
522
+ response_id: string;
523
+ output_index: number;
524
+ arguments: string;
525
+ }
526
+
527
+ export interface RateLimitsUpdatedEvent extends BaseServerEvent {
528
+ type: 'rate_limits.updated';
529
+ rate_limits: {
530
+ name: 'requests' | 'tokens' | 'input_tokens' | 'output_tokens' | string;
531
+ limit: number;
532
+ remaining: number;
533
+ reset_seconds: number;
534
+ }[];
535
+ }
536
+
537
+ export type ServerEvent =
538
+ | ErrorEvent
539
+ | SessionCreatedEvent
540
+ | SessionUpdatedEvent
541
+ | ConversationCreatedEvent
542
+ | InputAudioBufferCommittedEvent
543
+ | InputAudioBufferClearedEvent
544
+ | InputAudioBufferSpeechStartedEvent
545
+ | InputAudioBufferSpeechStoppedEvent
546
+ | ConversationItemCreatedEvent
547
+ | ConversationItemInputAudioTranscriptionCompletedEvent
548
+ | ConversationItemInputAudioTranscriptionFailedEvent
549
+ | ConversationItemTruncatedEvent
550
+ | ConversationItemDeletedEvent
551
+ | ResponseCreatedEvent
552
+ | ResponseDoneEvent
553
+ | ResponseOutputItemAddedEvent
554
+ | ResponseOutputItemDoneEvent
555
+ | ResponseContentPartAddedEvent
556
+ | ResponseContentPartDoneEvent
557
+ | ResponseTextDeltaEvent
558
+ | ResponseTextDoneEvent
559
+ | ResponseAudioTranscriptDeltaEvent
560
+ | ResponseAudioTranscriptDoneEvent
561
+ | ResponseAudioDeltaEvent
562
+ | ResponseAudioDoneEvent
563
+ | ResponseFunctionCallArgumentsDeltaEvent
564
+ | ResponseFunctionCallArgumentsDoneEvent
565
+ | RateLimitsUpdatedEvent;
package/src/realtime/index.ts ADDED
@@ -0,0 +1,5 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ export * from './api_proto.js';
5
+ export * from './realtime_model.js';