@livekit/agents 1.0.17 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/llm.cjs +35 -13
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +10 -5
- package/dist/inference/llm.d.ts +10 -5
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +35 -13
- package/dist/inference/llm.js.map +1 -1
- package/dist/llm/chat_context.d.cts +1 -1
- package/dist/llm/chat_context.d.ts +1 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +1 -1
- package/dist/llm/llm.d.ts +1 -1
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/google.cjs.map +1 -1
- package/dist/llm/provider_format/google.d.cts +1 -1
- package/dist/llm/provider_format/google.d.ts +1 -1
- package/dist/llm/provider_format/google.d.ts.map +1 -1
- package/dist/llm/provider_format/google.js.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +4 -0
- package/dist/llm/realtime.d.ts +4 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/utils.cjs +2 -2
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +1 -1
- package/dist/llm/utils.d.ts +1 -1
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +2 -2
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/zod-utils.cjs +6 -3
- package/dist/llm/zod-utils.cjs.map +1 -1
- package/dist/llm/zod-utils.d.cts +1 -1
- package/dist/llm/zod-utils.d.ts +1 -1
- package/dist/llm/zod-utils.d.ts.map +1 -1
- package/dist/llm/zod-utils.js +6 -3
- package/dist/llm/zod-utils.js.map +1 -1
- package/dist/llm/zod-utils.test.cjs +83 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -1
- package/dist/llm/zod-utils.test.js +83 -0
- package/dist/llm/zod-utils.test.js.map +1 -1
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent_activity.cjs +69 -20
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +69 -20
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +40 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -0
- package/dist/voice/agent_session.d.ts +5 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +40 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/interruption_detection.test.cjs +114 -0
- package/dist/voice/interruption_detection.test.cjs.map +1 -0
- package/dist/voice/interruption_detection.test.js +113 -0
- package/dist/voice/interruption_detection.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +3 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +1 -0
- package/dist/voice/room_io/room_io.d.ts +1 -0
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +3 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/package.json +3 -3
- package/src/inference/llm.ts +53 -21
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +218 -0
- package/src/llm/llm.ts +1 -1
- package/src/llm/provider_format/google.ts +4 -4
- package/src/llm/realtime.ts +8 -1
- package/src/llm/utils.ts +7 -2
- package/src/llm/zod-utils.test.ts +101 -0
- package/src/llm/zod-utils.ts +12 -3
- package/src/utils.ts +17 -0
- package/src/voice/agent_activity.ts +96 -24
- package/src/voice/agent_session.ts +54 -0
- package/src/voice/interruption_detection.test.ts +151 -0
- package/src/voice/room_io/room_io.ts +4 -0
package/src/llm/__snapshots__/zod-utils.test.ts.snap
CHANGED
|
@@ -339,3 +339,221 @@ exports[`Zod Utils > zodSchemaToJsonSchema > Zod v4 schemas > should handle v4 s
|
|
|
339
339
|
"type": "object",
|
|
340
340
|
}
|
|
341
341
|
`;
|
|
342
|
+
|
|
343
|
+
exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle arrays in strict mode 1`] = `
|
|
344
|
+
{
|
|
345
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
346
|
+
"additionalProperties": false,
|
|
347
|
+
"properties": {
|
|
348
|
+
"numbers": {
|
|
349
|
+
"items": {
|
|
350
|
+
"type": "number",
|
|
351
|
+
},
|
|
352
|
+
"type": "array",
|
|
353
|
+
},
|
|
354
|
+
"tags": {
|
|
355
|
+
"items": {
|
|
356
|
+
"type": "string",
|
|
357
|
+
},
|
|
358
|
+
"type": "array",
|
|
359
|
+
},
|
|
360
|
+
},
|
|
361
|
+
"required": [
|
|
362
|
+
"tags",
|
|
363
|
+
"numbers",
|
|
364
|
+
],
|
|
365
|
+
"type": "object",
|
|
366
|
+
}
|
|
367
|
+
`;
|
|
368
|
+
|
|
369
|
+
exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle default values in strict mode 1`] = `
|
|
370
|
+
{
|
|
371
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
372
|
+
"additionalProperties": false,
|
|
373
|
+
"properties": {
|
|
374
|
+
"active": {
|
|
375
|
+
"default": true,
|
|
376
|
+
"type": "boolean",
|
|
377
|
+
},
|
|
378
|
+
"name": {
|
|
379
|
+
"type": "string",
|
|
380
|
+
},
|
|
381
|
+
"role": {
|
|
382
|
+
"default": "user",
|
|
383
|
+
"type": "string",
|
|
384
|
+
},
|
|
385
|
+
},
|
|
386
|
+
"required": [
|
|
387
|
+
"name",
|
|
388
|
+
"role",
|
|
389
|
+
"active",
|
|
390
|
+
],
|
|
391
|
+
"type": "object",
|
|
392
|
+
}
|
|
393
|
+
`;
|
|
394
|
+
|
|
395
|
+
exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle nested objects in strict mode 1`] = `
|
|
396
|
+
{
|
|
397
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
398
|
+
"additionalProperties": false,
|
|
399
|
+
"properties": {
|
|
400
|
+
"metadata": {
|
|
401
|
+
"additionalProperties": false,
|
|
402
|
+
"properties": {
|
|
403
|
+
"created": {
|
|
404
|
+
"type": "string",
|
|
405
|
+
},
|
|
406
|
+
},
|
|
407
|
+
"required": [
|
|
408
|
+
"created",
|
|
409
|
+
],
|
|
410
|
+
"type": "object",
|
|
411
|
+
},
|
|
412
|
+
"user": {
|
|
413
|
+
"additionalProperties": false,
|
|
414
|
+
"properties": {
|
|
415
|
+
"email": {
|
|
416
|
+
"anyOf": [
|
|
417
|
+
{
|
|
418
|
+
"type": "string",
|
|
419
|
+
},
|
|
420
|
+
{
|
|
421
|
+
"type": "null",
|
|
422
|
+
},
|
|
423
|
+
],
|
|
424
|
+
},
|
|
425
|
+
"name": {
|
|
426
|
+
"type": "string",
|
|
427
|
+
},
|
|
428
|
+
},
|
|
429
|
+
"required": [
|
|
430
|
+
"name",
|
|
431
|
+
"email",
|
|
432
|
+
],
|
|
433
|
+
"type": "object",
|
|
434
|
+
},
|
|
435
|
+
},
|
|
436
|
+
"required": [
|
|
437
|
+
"user",
|
|
438
|
+
"metadata",
|
|
439
|
+
],
|
|
440
|
+
"type": "object",
|
|
441
|
+
}
|
|
442
|
+
`;
|
|
443
|
+
|
|
444
|
+
exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle nullable fields in strict mode 1`] = `
|
|
445
|
+
{
|
|
446
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
447
|
+
"additionalProperties": false,
|
|
448
|
+
"properties": {
|
|
449
|
+
"optional": {
|
|
450
|
+
"anyOf": [
|
|
451
|
+
{
|
|
452
|
+
"type": "string",
|
|
453
|
+
},
|
|
454
|
+
{
|
|
455
|
+
"type": "null",
|
|
456
|
+
},
|
|
457
|
+
],
|
|
458
|
+
},
|
|
459
|
+
"required": {
|
|
460
|
+
"type": "string",
|
|
461
|
+
},
|
|
462
|
+
},
|
|
463
|
+
"required": [
|
|
464
|
+
"required",
|
|
465
|
+
"optional",
|
|
466
|
+
],
|
|
467
|
+
"type": "object",
|
|
468
|
+
}
|
|
469
|
+
`;
|
|
470
|
+
|
|
471
|
+
exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle optional fields in strict mode 1`] = `
|
|
472
|
+
{
|
|
473
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
474
|
+
"additionalProperties": false,
|
|
475
|
+
"properties": {
|
|
476
|
+
"optional": {
|
|
477
|
+
"anyOf": [
|
|
478
|
+
{
|
|
479
|
+
"type": "string",
|
|
480
|
+
},
|
|
481
|
+
{
|
|
482
|
+
"type": "null",
|
|
483
|
+
},
|
|
484
|
+
],
|
|
485
|
+
},
|
|
486
|
+
"required": {
|
|
487
|
+
"type": "string",
|
|
488
|
+
},
|
|
489
|
+
},
|
|
490
|
+
"required": [
|
|
491
|
+
"required",
|
|
492
|
+
"optional",
|
|
493
|
+
],
|
|
494
|
+
"type": "object",
|
|
495
|
+
}
|
|
496
|
+
`;
|
|
497
|
+
|
|
498
|
+
exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle v3 schemas in strict mode 1`] = `
|
|
499
|
+
{
|
|
500
|
+
"$schema": "https://json-schema.org/draft/2019-09/schema#",
|
|
501
|
+
"additionalProperties": false,
|
|
502
|
+
"properties": {
|
|
503
|
+
"age": {
|
|
504
|
+
"type": [
|
|
505
|
+
"number",
|
|
506
|
+
"null",
|
|
507
|
+
],
|
|
508
|
+
},
|
|
509
|
+
"name": {
|
|
510
|
+
"type": "string",
|
|
511
|
+
},
|
|
512
|
+
},
|
|
513
|
+
"required": [
|
|
514
|
+
"name",
|
|
515
|
+
"age",
|
|
516
|
+
],
|
|
517
|
+
"type": "object",
|
|
518
|
+
}
|
|
519
|
+
`;
|
|
520
|
+
|
|
521
|
+
exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should produce standard JSON schema with strict: false 1`] = `
|
|
522
|
+
{
|
|
523
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
524
|
+
"additionalProperties": false,
|
|
525
|
+
"properties": {
|
|
526
|
+
"age": {
|
|
527
|
+
"type": "number",
|
|
528
|
+
},
|
|
529
|
+
"name": {
|
|
530
|
+
"type": "string",
|
|
531
|
+
},
|
|
532
|
+
},
|
|
533
|
+
"required": [
|
|
534
|
+
"name",
|
|
535
|
+
"age",
|
|
536
|
+
],
|
|
537
|
+
"type": "object",
|
|
538
|
+
}
|
|
539
|
+
`;
|
|
540
|
+
|
|
541
|
+
exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should produce strict JSON schema with strict: true 1`] = `
|
|
542
|
+
{
|
|
543
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
544
|
+
"additionalProperties": false,
|
|
545
|
+
"properties": {
|
|
546
|
+
"age": {
|
|
547
|
+
"type": "number",
|
|
548
|
+
},
|
|
549
|
+
"name": {
|
|
550
|
+
"type": "string",
|
|
551
|
+
},
|
|
552
|
+
},
|
|
553
|
+
"required": [
|
|
554
|
+
"name",
|
|
555
|
+
"age",
|
|
556
|
+
],
|
|
557
|
+
"type": "object",
|
|
558
|
+
}
|
|
559
|
+
`;
|
package/src/llm/llm.ts
CHANGED
|
@@ -78,7 +78,7 @@ export abstract class LLM extends (EventEmitter as new () => TypedEmitter<LLMCal
|
|
|
78
78
|
connOptions?: APIConnectOptions;
|
|
79
79
|
parallelToolCalls?: boolean;
|
|
80
80
|
toolChoice?: ToolChoice;
|
|
81
|
-
extraKwargs?: Record<string,
|
|
81
|
+
extraKwargs?: Record<string, unknown>;
|
|
82
82
|
}): LLMStream;
|
|
83
83
|
|
|
84
84
|
/**
|
package/src/llm/provider_format/google.ts
CHANGED
|
@@ -12,11 +12,11 @@ export interface GoogleFormatData {
|
|
|
12
12
|
export async function toChatCtx(
|
|
13
13
|
chatCtx: ChatContext,
|
|
14
14
|
injectDummyUserMessage: boolean = true,
|
|
15
|
-
): Promise<[Record<string,
|
|
16
|
-
const turns: Record<string,
|
|
15
|
+
): Promise<[Record<string, unknown>[], GoogleFormatData]> {
|
|
16
|
+
const turns: Record<string, unknown>[] = [];
|
|
17
17
|
const systemMessages: string[] = [];
|
|
18
18
|
let currentRole: string | null = null;
|
|
19
|
-
let parts: Record<string,
|
|
19
|
+
let parts: Record<string, unknown>[] = [];
|
|
20
20
|
|
|
21
21
|
// Flatten all grouped tool calls to get individual messages
|
|
22
22
|
const itemGroups = groupToolCalls(chatCtx);
|
|
@@ -104,7 +104,7 @@ export async function toChatCtx(
|
|
|
104
104
|
];
|
|
105
105
|
}
|
|
106
106
|
|
|
107
|
-
async function toImagePart(image: ImageContent): Promise<Record<string,
|
|
107
|
+
async function toImagePart(image: ImageContent): Promise<Record<string, unknown>> {
|
|
108
108
|
const cacheKey = 'serialized_image';
|
|
109
109
|
if (!image._cache[cacheKey]) {
|
|
110
110
|
image._cache[cacheKey] = await serializeImage(image);
|
package/src/llm/realtime.ts
CHANGED
|
@@ -19,6 +19,7 @@ export interface MessageGeneration {
|
|
|
19
19
|
messageId: string;
|
|
20
20
|
textStream: ReadableStream<string>;
|
|
21
21
|
audioStream: ReadableStream<AudioFrame>;
|
|
22
|
+
modalities?: Promise<('text' | 'audio')[]>;
|
|
22
23
|
}
|
|
23
24
|
|
|
24
25
|
export interface GenerationCreatedEvent {
|
|
@@ -40,6 +41,7 @@ export interface RealtimeCapabilities {
|
|
|
40
41
|
turnDetection: boolean;
|
|
41
42
|
userTranscription: boolean;
|
|
42
43
|
autoToolReplyGeneration: boolean;
|
|
44
|
+
audioOutput: boolean;
|
|
43
45
|
}
|
|
44
46
|
|
|
45
47
|
export interface InputTranscriptionCompleted {
|
|
@@ -121,7 +123,12 @@ export abstract class RealtimeSession extends EventEmitter {
|
|
|
121
123
|
/**
|
|
122
124
|
* Truncate the message at the given audio end time
|
|
123
125
|
*/
|
|
124
|
-
abstract truncate(options: { messageId: string; audioEndMs: number }): Promise<void>;
|
|
126
|
+
abstract truncate(options: {
|
|
127
|
+
messageId: string;
|
|
128
|
+
audioEndMs: number;
|
|
129
|
+
modalities?: ('text' | 'audio')[];
|
|
130
|
+
audioTranscript?: string;
|
|
131
|
+
}): Promise<void>;
|
|
125
132
|
|
|
126
133
|
async close(): Promise<void> {
|
|
127
134
|
this._mainTask.cancel();
|
package/src/llm/utils.ts
CHANGED
|
@@ -323,9 +323,14 @@ export function computeChatCtxDiff(oldCtx: ChatContext, newCtx: ChatContext): Di
|
|
|
323
323
|
};
|
|
324
324
|
}
|
|
325
325
|
|
|
326
|
-
export function toJsonSchema(schema: ToolInputSchema<any>, isOpenai: boolean = true): JSONSchema7 {
|
|
326
|
+
export function toJsonSchema(
|
|
327
|
+
schema: ToolInputSchema<any>,
|
|
328
|
+
isOpenai: boolean = true,
|
|
329
|
+
strict: boolean = false,
|
|
330
|
+
): JSONSchema7 {
|
|
327
331
|
if (isZodSchema(schema)) {
|
|
328
|
-
return zodSchemaToJsonSchema(schema, isOpenai);
|
|
332
|
+
return zodSchemaToJsonSchema(schema, isOpenai, strict);
|
|
329
333
|
}
|
|
334
|
+
|
|
330
335
|
return schema as JSONSchema7;
|
|
331
336
|
}
|
package/src/llm/zod-utils.test.ts
CHANGED
|
@@ -260,6 +260,107 @@ describe('Zod Utils', () => {
|
|
|
260
260
|
expect(jsonSchema7).toHaveProperty('properties');
|
|
261
261
|
});
|
|
262
262
|
});
|
|
263
|
+
|
|
264
|
+
describe('strict parameter', () => {
|
|
265
|
+
it('should produce strict JSON schema with strict: true', () => {
|
|
266
|
+
const schema = z4.object({
|
|
267
|
+
name: z4.string(),
|
|
268
|
+
age: z4.number(),
|
|
269
|
+
});
|
|
270
|
+
|
|
271
|
+
const strictSchema = zodSchemaToJsonSchema(schema, true, true);
|
|
272
|
+
expect(strictSchema).toMatchSnapshot();
|
|
273
|
+
});
|
|
274
|
+
|
|
275
|
+
it('should handle nullable fields in strict mode', () => {
|
|
276
|
+
const schema = z4.object({
|
|
277
|
+
required: z4.string(),
|
|
278
|
+
optional: z4.string().nullable(),
|
|
279
|
+
});
|
|
280
|
+
|
|
281
|
+
const strictSchema = zodSchemaToJsonSchema(schema, true, true);
|
|
282
|
+
expect(strictSchema).toMatchSnapshot();
|
|
283
|
+
});
|
|
284
|
+
|
|
285
|
+
it('should handle default values in strict mode', () => {
|
|
286
|
+
const schema = z4.object({
|
|
287
|
+
name: z4.string(),
|
|
288
|
+
role: z4.string().default('user'),
|
|
289
|
+
active: z4.boolean().default(true),
|
|
290
|
+
});
|
|
291
|
+
|
|
292
|
+
const strictSchema = zodSchemaToJsonSchema(schema, true, true);
|
|
293
|
+
expect(strictSchema).toMatchSnapshot();
|
|
294
|
+
});
|
|
295
|
+
|
|
296
|
+
it('should handle nested objects in strict mode', () => {
|
|
297
|
+
const schema = z4.object({
|
|
298
|
+
user: z4.object({
|
|
299
|
+
name: z4.string(),
|
|
300
|
+
email: z4.string().nullable(),
|
|
301
|
+
}),
|
|
302
|
+
metadata: z4.object({
|
|
303
|
+
created: z4.string(),
|
|
304
|
+
}),
|
|
305
|
+
});
|
|
306
|
+
|
|
307
|
+
const strictSchema = zodSchemaToJsonSchema(schema, true, true);
|
|
308
|
+
expect(strictSchema).toMatchSnapshot();
|
|
309
|
+
});
|
|
310
|
+
|
|
311
|
+
it('should handle arrays in strict mode', () => {
|
|
312
|
+
const schema = z4.object({
|
|
313
|
+
tags: z4.array(z4.string()),
|
|
314
|
+
numbers: z4.array(z4.number()),
|
|
315
|
+
});
|
|
316
|
+
|
|
317
|
+
const strictSchema = zodSchemaToJsonSchema(schema, true, true);
|
|
318
|
+
expect(strictSchema).toMatchSnapshot();
|
|
319
|
+
});
|
|
320
|
+
|
|
321
|
+
it('should handle v3 schemas in strict mode', () => {
|
|
322
|
+
const schema = z3.object({
|
|
323
|
+
name: z3.string(),
|
|
324
|
+
age: z3.number().optional(),
|
|
325
|
+
});
|
|
326
|
+
|
|
327
|
+
const strictSchema = zodSchemaToJsonSchema(schema, true, true);
|
|
328
|
+
expect(strictSchema).toMatchSnapshot();
|
|
329
|
+
});
|
|
330
|
+
|
|
331
|
+
it('should throw error when using .optional() without .nullable() in strict mode', () => {
|
|
332
|
+
const schema = z4.object({
|
|
333
|
+
required: z4.string(),
|
|
334
|
+
optional: z4.string().optional(),
|
|
335
|
+
});
|
|
336
|
+
|
|
337
|
+
expect(() => zodSchemaToJsonSchema(schema, true, true)).toThrow(
|
|
338
|
+
/uses `.optional\(\)` without `.nullable\(\)` which is not supported by the API/,
|
|
339
|
+
);
|
|
340
|
+
});
|
|
341
|
+
|
|
342
|
+
it('should throw error for nested .optional() fields in strict mode', () => {
|
|
343
|
+
const schema = z4.object({
|
|
344
|
+
user: z4.object({
|
|
345
|
+
name: z4.string(),
|
|
346
|
+
email: z4.string().optional(),
|
|
347
|
+
}),
|
|
348
|
+
});
|
|
349
|
+
|
|
350
|
+
expect(() => zodSchemaToJsonSchema(schema, true, true)).toThrow(
|
|
351
|
+
/uses `.optional\(\)` without `.nullable\(\)` which is not supported by the API/,
|
|
352
|
+
);
|
|
353
|
+
});
|
|
354
|
+
|
|
355
|
+
it('should NOT throw error when using .optional() in non-strict mode', () => {
|
|
356
|
+
const schema = z4.object({
|
|
357
|
+
required: z4.string(),
|
|
358
|
+
optional: z4.string().optional(),
|
|
359
|
+
});
|
|
360
|
+
|
|
361
|
+
expect(() => zodSchemaToJsonSchema(schema, true, false)).not.toThrow();
|
|
362
|
+
});
|
|
363
|
+
});
|
|
263
364
|
});
|
|
264
365
|
|
|
265
366
|
describe('parseZodSchema', () => {
|
package/src/llm/zod-utils.ts
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { JSONSchema7 } from 'json-schema';
|
|
5
|
+
import { toStrictJsonSchema } from 'openai/lib/transform';
|
|
5
6
|
import { zodToJsonSchema as zodToJsonSchemaV3 } from 'zod-to-json-schema';
|
|
6
7
|
import type * as z3 from 'zod/v3';
|
|
7
8
|
import * as z4 from 'zod/v4';
|
|
@@ -101,12 +102,18 @@ export function isZodObjectSchema(schema: ZodSchema): boolean {
|
|
|
101
102
|
* @param isOpenai - Whether to use OpenAI-specific formatting (default: true)
|
|
102
103
|
* @returns A JSON Schema representation of the Zod schema
|
|
103
104
|
*/
|
|
104
|
-
export function zodSchemaToJsonSchema(schema: ZodSchema, isOpenai: boolean = true): JSONSchema7 {
|
|
105
|
+
export function zodSchemaToJsonSchema(
|
|
106
|
+
schema: ZodSchema,
|
|
107
|
+
isOpenai: boolean = true,
|
|
108
|
+
strict: boolean = false,
|
|
109
|
+
): JSONSchema7 {
|
|
110
|
+
let result: JSONSchema7;
|
|
111
|
+
|
|
105
112
|
if (isZod4Schema(schema)) {
|
|
106
113
|
// Zod v4 has native toJSONSchema support
|
|
107
114
|
// Configuration adapted from Vercel AI SDK to support OpenAPI conversion for Google
|
|
108
115
|
// Source: https://github.com/vercel/ai/blob/main/packages/provider-utils/src/schema.ts#L255-L258
|
|
109
|
-
return z4.toJSONSchema(schema, {
|
116
|
+
result = z4.toJSONSchema(schema, {
|
|
110
117
|
target: 'draft-7',
|
|
111
118
|
io: 'output',
|
|
112
119
|
reused: 'inline', // Don't use references by default (to support openapi conversion for google)
|
|
@@ -115,11 +122,13 @@ export function zodSchemaToJsonSchema(schema: ZodSchema, isOpenai: boolean = tru
|
|
|
115
122
|
// Zod v3 requires the zod-to-json-schema library
|
|
116
123
|
// Configuration adapted from Vercel AI SDK
|
|
117
124
|
// $refStrategy: 'none' is equivalent to v4's reused: 'inline'
|
|
118
|
-
return zodToJsonSchemaV3(schema, {
|
125
|
+
result = zodToJsonSchemaV3(schema, {
|
|
119
126
|
target: isOpenai ? 'openAi' : 'jsonSchema7',
|
|
120
127
|
$refStrategy: 'none', // Don't use references by default (to support openapi conversion for google)
|
|
121
128
|
}) as JSONSchema7;
|
|
122
129
|
}
|
|
130
|
+
|
|
131
|
+
return strict ? (toStrictJsonSchema(result) as JSONSchema7) : result;
|
|
123
132
|
}
|
|
124
133
|
|
|
125
134
|
/**
|
package/src/utils.ts
CHANGED
|
@@ -15,6 +15,23 @@ import { TransformStream, type TransformStreamDefaultController } from 'node:str
|
|
|
15
15
|
import { v4 as uuidv4 } from 'uuid';
|
|
16
16
|
import { log } from './log.js';
|
|
17
17
|
|
|
18
|
+
/**
|
|
19
|
+
* Recursively expands all nested properties of a type,
|
|
20
|
+
* resolving aliases so as to inspect the real shape in IDE.
|
|
21
|
+
*/
|
|
22
|
+
// eslint-disable-next-line @typescript-eslint/ban-types
|
|
23
|
+
export type Expand<T> = T extends Function
|
|
24
|
+
? T
|
|
25
|
+
: T extends object
|
|
26
|
+
? T extends Array<infer U>
|
|
27
|
+
? Array<Expand<U>>
|
|
28
|
+
: T extends Map<infer K, infer V>
|
|
29
|
+
? Map<Expand<K>, Expand<V>>
|
|
30
|
+
: T extends Set<infer M>
|
|
31
|
+
? Set<Expand<M>>
|
|
32
|
+
: { [K in keyof T]: Expand<T[K]> }
|
|
33
|
+
: T;
|
|
34
|
+
|
|
18
35
|
/** Union of a single and a list of {@link AudioFrame}s */
|
|
19
36
|
export type AudioBuffer = AudioFrame[] | AudioFrame;
|
|
20
37
|
|
package/src/voice/agent_activity.ts
CHANGED
|
@@ -235,6 +235,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
235
235
|
} catch (error) {
|
|
236
236
|
this.logger.error(error, 'failed to update the tools');
|
|
237
237
|
}
|
|
238
|
+
|
|
239
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
240
|
+
this.logger.error(
|
|
241
|
+
'audio output is enabled but RealtimeModel has no audio modality ' +
|
|
242
|
+
'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
|
|
243
|
+
'or set a TTS model.',
|
|
244
|
+
);
|
|
245
|
+
}
|
|
238
246
|
} else if (this.llm instanceof LLM) {
|
|
239
247
|
try {
|
|
240
248
|
updateInstructions({
|
|
@@ -625,11 +633,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
625
633
|
return;
|
|
626
634
|
}
|
|
627
635
|
|
|
636
|
+
// Refactored interruption word count check:
|
|
637
|
+
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
|
|
638
|
+
// - Apply check to all STT results: empty string, undefined, or any length
|
|
639
|
+
// - This ensures consistent behavior across all interruption scenarios
|
|
628
640
|
if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
|
|
629
641
|
const text = this.audioRecognition.currentTranscript;
|
|
630
|
-
|
|
631
642
|
// TODO(shubhra): better word splitting for multi-language
|
|
632
|
-
|
|
643
|
+
|
|
644
|
+
// Normalize text: convert undefined/null to empty string for consistent word counting
|
|
645
|
+
const normalizedText = text ?? '';
|
|
646
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
647
|
+
|
|
648
|
+
// Only allow interruption if word count meets or exceeds minInterruptionWords
|
|
649
|
+
// This applies to all cases: empty strings, partial speech, and full speech
|
|
650
|
+
if (wordCount < this.agentSession.options.minInterruptionWords) {
|
|
633
651
|
return;
|
|
634
652
|
}
|
|
635
653
|
}
|
|
@@ -767,19 +785,30 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
767
785
|
return true;
|
|
768
786
|
}
|
|
769
787
|
|
|
788
|
+
// Refactored interruption word count check for consistency with onVADInferenceDone:
|
|
789
|
+
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
|
|
790
|
+
// - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
|
|
770
791
|
if (
|
|
771
792
|
this.stt &&
|
|
772
793
|
this.turnDetection !== 'manual' &&
|
|
773
794
|
this._currentSpeech &&
|
|
774
795
|
this._currentSpeech.allowInterruptions &&
|
|
775
796
|
!this._currentSpeech.interrupted &&
|
|
776
|
-
this.agentSession.options.minInterruptionWords > 0 &&
|
|
777
|
-
info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
|
|
797
|
+
this.agentSession.options.minInterruptionWords > 0
|
|
778
798
|
) {
|
|
779
|
-
|
|
780
|
-
this.
|
|
781
|
-
|
|
782
|
-
|
|
799
|
+
const wordCount = splitWords(info.newTranscript, true).length;
|
|
800
|
+
if (wordCount < this.agentSession.options.minInterruptionWords) {
|
|
801
|
+
// avoid interruption if the new_transcript contains fewer words than minInterruptionWords
|
|
802
|
+
this.cancelPreemptiveGeneration();
|
|
803
|
+
this.logger.info(
|
|
804
|
+
{
|
|
805
|
+
wordCount,
|
|
806
|
+
minInterruptionWords: this.agentSession.options.minInterruptionWords,
|
|
807
|
+
},
|
|
808
|
+
'skipping user input, word count below minimum interruption threshold',
|
|
809
|
+
);
|
|
810
|
+
return false;
|
|
811
|
+
}
|
|
783
812
|
}
|
|
784
813
|
|
|
785
814
|
const oldTask = this._userTurnCompletedTask;
|
|
@@ -1612,7 +1641,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1612
1641
|
|
|
1613
1642
|
const readMessages = async (
|
|
1614
1643
|
abortController: AbortController,
|
|
1615
|
-
outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
|
|
1644
|
+
outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
|
|
1616
1645
|
) => {
|
|
1617
1646
|
replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
|
|
1618
1647
|
once: true,
|
|
@@ -1627,7 +1656,25 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1627
1656
|
);
|
|
1628
1657
|
break;
|
|
1629
1658
|
}
|
|
1630
|
-
|
|
1659
|
+
|
|
1660
|
+
const msgModalities = msg.modalities ? await msg.modalities : undefined;
|
|
1661
|
+
let ttsTextInput: ReadableStream<string> | null = null;
|
|
1662
|
+
let trTextInput: ReadableStream<string>;
|
|
1663
|
+
|
|
1664
|
+
if (msgModalities && !msgModalities.includes('audio') && this.tts) {
|
|
1665
|
+
if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
1666
|
+
this.logger.warn(
|
|
1667
|
+
'text response received from realtime API, falling back to use a TTS model.',
|
|
1668
|
+
);
|
|
1669
|
+
}
|
|
1670
|
+
const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
|
|
1671
|
+
ttsTextInput = _ttsTextInput;
|
|
1672
|
+
trTextInput = _trTextInput;
|
|
1673
|
+
} else {
|
|
1674
|
+
trTextInput = msg.textStream;
|
|
1675
|
+
}
|
|
1676
|
+
|
|
1677
|
+
const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
|
|
1631
1678
|
let textOut: _TextOut | null = null;
|
|
1632
1679
|
if (trNodeResult) {
|
|
1633
1680
|
const [textForwardTask, _textOut] = performTextForwarding(
|
|
@@ -1638,30 +1685,51 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1638
1685
|
forwardTasks.push(textForwardTask);
|
|
1639
1686
|
textOut = _textOut;
|
|
1640
1687
|
}
|
|
1688
|
+
|
|
1641
1689
|
let audioOut: _AudioOut | null = null;
|
|
1642
1690
|
if (audioOutput) {
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1691
|
+
let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
|
|
1692
|
+
|
|
1693
|
+
if (ttsTextInput) {
|
|
1694
|
+
const [ttsTask, ttsStream] = performTTSInference(
|
|
1695
|
+
(...args) => this.agent.ttsNode(...args),
|
|
1696
|
+
ttsTextInput,
|
|
1697
|
+
modelSettings,
|
|
1698
|
+
abortController,
|
|
1699
|
+
);
|
|
1700
|
+
tasks.push(ttsTask);
|
|
1701
|
+
realtimeAudioResult = ttsStream;
|
|
1702
|
+
} else if (msgModalities && msgModalities.includes('audio')) {
|
|
1703
|
+
realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
|
|
1704
|
+
msg.audioStream,
|
|
1705
|
+
modelSettings,
|
|
1706
|
+
);
|
|
1707
|
+
} else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
1708
|
+
this.logger.error(
|
|
1709
|
+
'Text message received from Realtime API with audio modality. ' +
|
|
1710
|
+
'This usually happens when text chat context is synced to the API. ' +
|
|
1711
|
+
'Try to add a TTS model as fallback or use text modality with TTS instead.',
|
|
1712
|
+
);
|
|
1713
|
+
} else {
|
|
1714
|
+
this.logger.warn(
|
|
1715
|
+
'audio output is enabled but neither tts nor realtime audio is available',
|
|
1716
|
+
);
|
|
1717
|
+
}
|
|
1718
|
+
|
|
1719
|
+
if (realtimeAudioResult) {
|
|
1648
1720
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
1649
|
-
|
|
1721
|
+
realtimeAudioResult,
|
|
1650
1722
|
audioOutput,
|
|
1651
1723
|
abortController,
|
|
1652
1724
|
);
|
|
1653
1725
|
forwardTasks.push(forwardTask);
|
|
1654
1726
|
audioOut = _audioOut;
|
|
1655
1727
|
audioOut.firstFrameFut.await.finally(onFirstFrame);
|
|
1656
|
-
} else {
|
|
1657
|
-
this.logger.warn(
|
|
1658
|
-
'audio output is enabled but neither tts nor realtime audio is available',
|
|
1659
|
-
);
|
|
1660
1728
|
}
|
|
1661
1729
|
} else if (textOut) {
|
|
1662
1730
|
textOut.firstTextFut.await.finally(onFirstFrame);
|
|
1663
1731
|
}
|
|
1664
|
-
outputs.push([msg.messageId, textOut, audioOut]);
|
|
1732
|
+
outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
|
|
1665
1733
|
}
|
|
1666
1734
|
await waitFor(forwardTasks);
|
|
1667
1735
|
} catch (error) {
|
|
@@ -1671,7 +1739,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1671
1739
|
}
|
|
1672
1740
|
};
|
|
1673
1741
|
|
|
1674
|
-
const messageOutputs: Array<[string, _TextOut | null, _AudioOut | null]> = [];
|
|
1742
|
+
const messageOutputs: Array<
|
|
1743
|
+
[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
|
|
1744
|
+
> = [];
|
|
1675
1745
|
const tasks = [
|
|
1676
1746
|
Task.from(
|
|
1677
1747
|
(controller) => readMessages(controller, messageOutputs),
|
|
@@ -1750,7 +1820,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1750
1820
|
|
|
1751
1821
|
if (messageOutputs.length > 0) {
|
|
1752
1822
|
// there should be only one message
|
|
1753
|
-
const [msgId, textOut, audioOut] = messageOutputs[0]!;
|
|
1823
|
+
const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
|
|
1754
1824
|
let forwardedText = textOut?.text || '';
|
|
1755
1825
|
|
|
1756
1826
|
if (audioOutput) {
|
|
@@ -1775,6 +1845,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1775
1845
|
this.realtimeSession.truncate({
|
|
1776
1846
|
messageId: msgId,
|
|
1777
1847
|
audioEndMs: Math.floor(playbackPosition),
|
|
1848
|
+
modalities: msgModalities,
|
|
1849
|
+
audioTranscript: forwardedText,
|
|
1778
1850
|
});
|
|
1779
1851
|
}
|
|
1780
1852
|
|
|
@@ -1805,7 +1877,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1805
1877
|
|
|
1806
1878
|
if (messageOutputs.length > 0) {
|
|
1807
1879
|
// there should be only one message
|
|
1808
|
-
const [msgId, textOut, _] = messageOutputs[0]!;
|
|
1880
|
+
const [msgId, textOut, _, __] = messageOutputs[0]!;
|
|
1809
1881
|
const message = ChatMessage.create({
|
|
1810
1882
|
role: 'assistant',
|
|
1811
1883
|
content: textOut?.text || '',
|