@clawdbot/voice-call 2026.1.21 → 2026.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/README.md +21 -0
- package/clawdbot.plugin.json +207 -11
- package/index.ts +21 -3
- package/package.json +2 -2
- package/src/config.ts +73 -22
- package/src/core-bridge.ts +6 -0
- package/src/manager/context.ts +0 -1
- package/src/manager/events.ts +0 -1
- package/src/manager/lookup.ts +0 -1
- package/src/manager/outbound.ts +4 -3
- package/src/manager/state.ts +0 -1
- package/src/manager/store.ts +0 -1
- package/src/manager/timers.ts +0 -1
- package/src/manager/twiml.ts +0 -1
- package/src/manager.ts +4 -2
- package/src/media-stream.test.ts +97 -0
- package/src/media-stream.ts +114 -0
- package/src/providers/plivo.test.ts +0 -1
- package/src/providers/stt-openai-realtime.ts +8 -0
- package/src/providers/twilio/webhook.ts +0 -1
- package/src/providers/twilio.test.ts +64 -0
- package/src/providers/twilio.ts +51 -24
- package/src/runtime.ts +12 -13
- package/src/telephony-audio.ts +88 -0
- package/src/telephony-tts.ts +95 -0
- package/src/webhook.ts +10 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,22 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 2026.1.24
|
|
4
|
+
|
|
5
|
+
### Changes
|
|
6
|
+
- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep‑merges with core).
|
|
7
|
+
- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
|
|
8
|
+
- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
|
|
9
|
+
|
|
10
|
+
## 2026.1.23
|
|
11
|
+
|
|
12
|
+
### Changes
|
|
13
|
+
- Version alignment with core Clawdbot release numbers.
|
|
14
|
+
|
|
15
|
+
## 2026.1.22
|
|
16
|
+
|
|
17
|
+
### Changes
|
|
18
|
+
- Version alignment with core Clawdbot release numbers.
|
|
19
|
+
|
|
3
20
|
## 2026.1.21
|
|
4
21
|
|
|
5
22
|
### Changes
|
package/README.md
CHANGED
|
@@ -75,6 +75,27 @@ Notes:
|
|
|
75
75
|
- Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
|
|
76
76
|
- `mock` is a local dev provider (no network calls).
|
|
77
77
|
|
|
78
|
+
## TTS for calls
|
|
79
|
+
|
|
80
|
+
Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
|
|
81
|
+
streaming speech on calls. You can override it under the plugin config with the
|
|
82
|
+
same shape — overrides deep-merge with `messages.tts`.
|
|
83
|
+
|
|
84
|
+
```json5
|
|
85
|
+
{
|
|
86
|
+
tts: {
|
|
87
|
+
provider: "openai",
|
|
88
|
+
openai: {
|
|
89
|
+
voice: "alloy"
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Notes:
|
|
96
|
+
- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
|
|
97
|
+
- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to provider native voices.
|
|
98
|
+
|
|
78
99
|
## CLI
|
|
79
100
|
|
|
80
101
|
```bash
|
package/clawdbot.plugin.json
CHANGED
|
@@ -99,16 +99,39 @@
|
|
|
99
99
|
"label": "Media Stream Path",
|
|
100
100
|
"advanced": true
|
|
101
101
|
},
|
|
102
|
-
"tts.
|
|
103
|
-
"label": "TTS
|
|
102
|
+
"tts.provider": {
|
|
103
|
+
"label": "TTS Provider Override",
|
|
104
|
+
"help": "Deep-merges with messages.tts (Edge is ignored for calls).",
|
|
104
105
|
"advanced": true
|
|
105
106
|
},
|
|
106
|
-
"tts.
|
|
107
|
-
"label": "TTS
|
|
107
|
+
"tts.openai.model": {
|
|
108
|
+
"label": "OpenAI TTS Model",
|
|
108
109
|
"advanced": true
|
|
109
110
|
},
|
|
110
|
-
"tts.
|
|
111
|
-
"label": "TTS
|
|
111
|
+
"tts.openai.voice": {
|
|
112
|
+
"label": "OpenAI TTS Voice",
|
|
113
|
+
"advanced": true
|
|
114
|
+
},
|
|
115
|
+
"tts.openai.apiKey": {
|
|
116
|
+
"label": "OpenAI API Key",
|
|
117
|
+
"sensitive": true,
|
|
118
|
+
"advanced": true
|
|
119
|
+
},
|
|
120
|
+
"tts.elevenlabs.modelId": {
|
|
121
|
+
"label": "ElevenLabs Model ID",
|
|
122
|
+
"advanced": true
|
|
123
|
+
},
|
|
124
|
+
"tts.elevenlabs.voiceId": {
|
|
125
|
+
"label": "ElevenLabs Voice ID",
|
|
126
|
+
"advanced": true
|
|
127
|
+
},
|
|
128
|
+
"tts.elevenlabs.apiKey": {
|
|
129
|
+
"label": "ElevenLabs API Key",
|
|
130
|
+
"sensitive": true,
|
|
131
|
+
"advanced": true
|
|
132
|
+
},
|
|
133
|
+
"tts.elevenlabs.baseUrl": {
|
|
134
|
+
"label": "ElevenLabs Base URL",
|
|
112
135
|
"advanced": true
|
|
113
136
|
},
|
|
114
137
|
"publicUrl": {
|
|
@@ -370,20 +393,193 @@
|
|
|
370
393
|
"type": "object",
|
|
371
394
|
"additionalProperties": false,
|
|
372
395
|
"properties": {
|
|
396
|
+
"auto": {
|
|
397
|
+
"type": "string",
|
|
398
|
+
"enum": [
|
|
399
|
+
"off",
|
|
400
|
+
"always",
|
|
401
|
+
"inbound",
|
|
402
|
+
"tagged"
|
|
403
|
+
]
|
|
404
|
+
},
|
|
405
|
+
"enabled": {
|
|
406
|
+
"type": "boolean"
|
|
407
|
+
},
|
|
408
|
+
"mode": {
|
|
409
|
+
"type": "string",
|
|
410
|
+
"enum": [
|
|
411
|
+
"final",
|
|
412
|
+
"all"
|
|
413
|
+
]
|
|
414
|
+
},
|
|
373
415
|
"provider": {
|
|
374
416
|
"type": "string",
|
|
375
417
|
"enum": [
|
|
376
|
-
"openai"
|
|
418
|
+
"openai",
|
|
419
|
+
"elevenlabs",
|
|
420
|
+
"edge"
|
|
377
421
|
]
|
|
378
422
|
},
|
|
379
|
-
"
|
|
423
|
+
"summaryModel": {
|
|
380
424
|
"type": "string"
|
|
381
425
|
},
|
|
382
|
-
"
|
|
383
|
-
"type": "
|
|
426
|
+
"modelOverrides": {
|
|
427
|
+
"type": "object",
|
|
428
|
+
"additionalProperties": false,
|
|
429
|
+
"properties": {
|
|
430
|
+
"enabled": {
|
|
431
|
+
"type": "boolean"
|
|
432
|
+
},
|
|
433
|
+
"allowText": {
|
|
434
|
+
"type": "boolean"
|
|
435
|
+
},
|
|
436
|
+
"allowProvider": {
|
|
437
|
+
"type": "boolean"
|
|
438
|
+
},
|
|
439
|
+
"allowVoice": {
|
|
440
|
+
"type": "boolean"
|
|
441
|
+
},
|
|
442
|
+
"allowModelId": {
|
|
443
|
+
"type": "boolean"
|
|
444
|
+
},
|
|
445
|
+
"allowVoiceSettings": {
|
|
446
|
+
"type": "boolean"
|
|
447
|
+
},
|
|
448
|
+
"allowNormalization": {
|
|
449
|
+
"type": "boolean"
|
|
450
|
+
},
|
|
451
|
+
"allowSeed": {
|
|
452
|
+
"type": "boolean"
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
},
|
|
456
|
+
"elevenlabs": {
|
|
457
|
+
"type": "object",
|
|
458
|
+
"additionalProperties": false,
|
|
459
|
+
"properties": {
|
|
460
|
+
"apiKey": {
|
|
461
|
+
"type": "string"
|
|
462
|
+
},
|
|
463
|
+
"baseUrl": {
|
|
464
|
+
"type": "string"
|
|
465
|
+
},
|
|
466
|
+
"voiceId": {
|
|
467
|
+
"type": "string"
|
|
468
|
+
},
|
|
469
|
+
"modelId": {
|
|
470
|
+
"type": "string"
|
|
471
|
+
},
|
|
472
|
+
"seed": {
|
|
473
|
+
"type": "integer",
|
|
474
|
+
"minimum": 0,
|
|
475
|
+
"maximum": 4294967295
|
|
476
|
+
},
|
|
477
|
+
"applyTextNormalization": {
|
|
478
|
+
"type": "string",
|
|
479
|
+
"enum": [
|
|
480
|
+
"auto",
|
|
481
|
+
"on",
|
|
482
|
+
"off"
|
|
483
|
+
]
|
|
484
|
+
},
|
|
485
|
+
"languageCode": {
|
|
486
|
+
"type": "string"
|
|
487
|
+
},
|
|
488
|
+
"voiceSettings": {
|
|
489
|
+
"type": "object",
|
|
490
|
+
"additionalProperties": false,
|
|
491
|
+
"properties": {
|
|
492
|
+
"stability": {
|
|
493
|
+
"type": "number",
|
|
494
|
+
"minimum": 0,
|
|
495
|
+
"maximum": 1
|
|
496
|
+
},
|
|
497
|
+
"similarityBoost": {
|
|
498
|
+
"type": "number",
|
|
499
|
+
"minimum": 0,
|
|
500
|
+
"maximum": 1
|
|
501
|
+
},
|
|
502
|
+
"style": {
|
|
503
|
+
"type": "number",
|
|
504
|
+
"minimum": 0,
|
|
505
|
+
"maximum": 1
|
|
506
|
+
},
|
|
507
|
+
"useSpeakerBoost": {
|
|
508
|
+
"type": "boolean"
|
|
509
|
+
},
|
|
510
|
+
"speed": {
|
|
511
|
+
"type": "number",
|
|
512
|
+
"minimum": 0.5,
|
|
513
|
+
"maximum": 2
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
},
|
|
519
|
+
"openai": {
|
|
520
|
+
"type": "object",
|
|
521
|
+
"additionalProperties": false,
|
|
522
|
+
"properties": {
|
|
523
|
+
"apiKey": {
|
|
524
|
+
"type": "string"
|
|
525
|
+
},
|
|
526
|
+
"model": {
|
|
527
|
+
"type": "string"
|
|
528
|
+
},
|
|
529
|
+
"voice": {
|
|
530
|
+
"type": "string"
|
|
531
|
+
}
|
|
532
|
+
}
|
|
384
533
|
},
|
|
385
|
-
"
|
|
534
|
+
"edge": {
|
|
535
|
+
"type": "object",
|
|
536
|
+
"additionalProperties": false,
|
|
537
|
+
"properties": {
|
|
538
|
+
"enabled": {
|
|
539
|
+
"type": "boolean"
|
|
540
|
+
},
|
|
541
|
+
"voice": {
|
|
542
|
+
"type": "string"
|
|
543
|
+
},
|
|
544
|
+
"lang": {
|
|
545
|
+
"type": "string"
|
|
546
|
+
},
|
|
547
|
+
"outputFormat": {
|
|
548
|
+
"type": "string"
|
|
549
|
+
},
|
|
550
|
+
"pitch": {
|
|
551
|
+
"type": "string"
|
|
552
|
+
},
|
|
553
|
+
"rate": {
|
|
554
|
+
"type": "string"
|
|
555
|
+
},
|
|
556
|
+
"volume": {
|
|
557
|
+
"type": "string"
|
|
558
|
+
},
|
|
559
|
+
"saveSubtitles": {
|
|
560
|
+
"type": "boolean"
|
|
561
|
+
},
|
|
562
|
+
"proxy": {
|
|
563
|
+
"type": "string"
|
|
564
|
+
},
|
|
565
|
+
"timeoutMs": {
|
|
566
|
+
"type": "integer",
|
|
567
|
+
"minimum": 1000,
|
|
568
|
+
"maximum": 120000
|
|
569
|
+
}
|
|
570
|
+
}
|
|
571
|
+
},
|
|
572
|
+
"prefsPath": {
|
|
386
573
|
"type": "string"
|
|
574
|
+
},
|
|
575
|
+
"maxTextLength": {
|
|
576
|
+
"type": "integer",
|
|
577
|
+
"minimum": 1
|
|
578
|
+
},
|
|
579
|
+
"timeoutMs": {
|
|
580
|
+
"type": "integer",
|
|
581
|
+
"minimum": 1000,
|
|
582
|
+
"maximum": 120000
|
|
387
583
|
}
|
|
388
584
|
}
|
|
389
585
|
},
|
package/index.ts
CHANGED
|
@@ -74,9 +74,26 @@ const voiceCallConfigSchema = {
|
|
|
74
74
|
},
|
|
75
75
|
"streaming.sttModel": { label: "Realtime STT Model", advanced: true },
|
|
76
76
|
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
|
|
77
|
-
"tts.
|
|
78
|
-
|
|
79
|
-
|
|
77
|
+
"tts.provider": {
|
|
78
|
+
label: "TTS Provider Override",
|
|
79
|
+
help: "Deep-merges with messages.tts (Edge is ignored for calls).",
|
|
80
|
+
advanced: true,
|
|
81
|
+
},
|
|
82
|
+
"tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
|
|
83
|
+
"tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
|
|
84
|
+
"tts.openai.apiKey": {
|
|
85
|
+
label: "OpenAI API Key",
|
|
86
|
+
sensitive: true,
|
|
87
|
+
advanced: true,
|
|
88
|
+
},
|
|
89
|
+
"tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
|
|
90
|
+
"tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
|
|
91
|
+
"tts.elevenlabs.apiKey": {
|
|
92
|
+
label: "ElevenLabs API Key",
|
|
93
|
+
sensitive: true,
|
|
94
|
+
advanced: true,
|
|
95
|
+
},
|
|
96
|
+
"tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
|
|
80
97
|
publicUrl: { label: "Public Webhook URL", advanced: true },
|
|
81
98
|
skipSignatureVerification: {
|
|
82
99
|
label: "Skip Signature Verification",
|
|
@@ -161,6 +178,7 @@ const voiceCallPlugin = {
|
|
|
161
178
|
runtimePromise = createVoiceCallRuntime({
|
|
162
179
|
config: cfg,
|
|
163
180
|
coreConfig: api.config as CoreConfig,
|
|
181
|
+
ttsRuntime: api.runtime.tts,
|
|
164
182
|
logger: api.logger,
|
|
165
183
|
});
|
|
166
184
|
}
|
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@clawdbot/voice-call",
|
|
3
|
-
"version": "2026.1.21",
|
|
3
|
+
"version": "2026.1.24",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Clawdbot voice-call plugin",
|
|
6
6
|
"dependencies": {
|
|
7
7
|
"@sinclair/typebox": "0.34.47",
|
|
8
8
|
"ws": "^8.19.0",
|
|
9
|
-
"zod": "^4.3.
|
|
9
|
+
"zod": "^4.3.6"
|
|
10
10
|
},
|
|
11
11
|
"clawdbot": {
|
|
12
12
|
"extensions": [
|
package/src/config.ts
CHANGED
|
@@ -82,31 +82,82 @@ export const SttConfigSchema = z
|
|
|
82
82
|
.default({ provider: "openai", model: "whisper-1" });
|
|
83
83
|
export type SttConfig = z.infer<typeof SttConfigSchema>;
|
|
84
84
|
|
|
85
|
+
export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
|
|
86
|
+
export const TtsModeSchema = z.enum(["final", "all"]);
|
|
87
|
+
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
|
|
88
|
+
|
|
85
89
|
export const TtsConfigSchema = z
|
|
86
90
|
.object({
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
91
|
+
auto: TtsAutoSchema.optional(),
|
|
92
|
+
enabled: z.boolean().optional(),
|
|
93
|
+
mode: TtsModeSchema.optional(),
|
|
94
|
+
provider: TtsProviderSchema.optional(),
|
|
95
|
+
summaryModel: z.string().optional(),
|
|
96
|
+
modelOverrides: z
|
|
97
|
+
.object({
|
|
98
|
+
enabled: z.boolean().optional(),
|
|
99
|
+
allowText: z.boolean().optional(),
|
|
100
|
+
allowProvider: z.boolean().optional(),
|
|
101
|
+
allowVoice: z.boolean().optional(),
|
|
102
|
+
allowModelId: z.boolean().optional(),
|
|
103
|
+
allowVoiceSettings: z.boolean().optional(),
|
|
104
|
+
allowNormalization: z.boolean().optional(),
|
|
105
|
+
allowSeed: z.boolean().optional(),
|
|
106
|
+
})
|
|
107
|
+
.strict()
|
|
108
|
+
.optional(),
|
|
109
|
+
elevenlabs: z
|
|
110
|
+
.object({
|
|
111
|
+
apiKey: z.string().optional(),
|
|
112
|
+
baseUrl: z.string().optional(),
|
|
113
|
+
voiceId: z.string().optional(),
|
|
114
|
+
modelId: z.string().optional(),
|
|
115
|
+
seed: z.number().int().min(0).max(4294967295).optional(),
|
|
116
|
+
applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
|
|
117
|
+
languageCode: z.string().optional(),
|
|
118
|
+
voiceSettings: z
|
|
119
|
+
.object({
|
|
120
|
+
stability: z.number().min(0).max(1).optional(),
|
|
121
|
+
similarityBoost: z.number().min(0).max(1).optional(),
|
|
122
|
+
style: z.number().min(0).max(1).optional(),
|
|
123
|
+
useSpeakerBoost: z.boolean().optional(),
|
|
124
|
+
speed: z.number().min(0.5).max(2).optional(),
|
|
125
|
+
})
|
|
126
|
+
.strict()
|
|
127
|
+
.optional(),
|
|
128
|
+
})
|
|
129
|
+
.strict()
|
|
130
|
+
.optional(),
|
|
131
|
+
openai: z
|
|
132
|
+
.object({
|
|
133
|
+
apiKey: z.string().optional(),
|
|
134
|
+
model: z.string().optional(),
|
|
135
|
+
voice: z.string().optional(),
|
|
136
|
+
})
|
|
137
|
+
.strict()
|
|
138
|
+
.optional(),
|
|
139
|
+
edge: z
|
|
140
|
+
.object({
|
|
141
|
+
enabled: z.boolean().optional(),
|
|
142
|
+
voice: z.string().optional(),
|
|
143
|
+
lang: z.string().optional(),
|
|
144
|
+
outputFormat: z.string().optional(),
|
|
145
|
+
pitch: z.string().optional(),
|
|
146
|
+
rate: z.string().optional(),
|
|
147
|
+
volume: z.string().optional(),
|
|
148
|
+
saveSubtitles: z.boolean().optional(),
|
|
149
|
+
proxy: z.string().optional(),
|
|
150
|
+
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
|
151
|
+
})
|
|
152
|
+
.strict()
|
|
153
|
+
.optional(),
|
|
154
|
+
prefsPath: z.string().optional(),
|
|
155
|
+
maxTextLength: z.number().int().min(1).optional(),
|
|
156
|
+
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
|
106
157
|
})
|
|
107
158
|
.strict()
|
|
108
|
-
.
|
|
109
|
-
export type
|
|
159
|
+
.optional();
|
|
160
|
+
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
|
|
110
161
|
|
|
111
162
|
// -----------------------------------------------------------------------------
|
|
112
163
|
// Webhook Server Configuration
|
|
@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
|
|
|
307
358
|
/** STT configuration */
|
|
308
359
|
stt: SttConfigSchema,
|
|
309
360
|
|
|
310
|
-
/** TTS
|
|
361
|
+
/** TTS override (deep-merges with core messages.tts) */
|
|
311
362
|
tts: TtsConfigSchema,
|
|
312
363
|
|
|
313
364
|
/** Store path for call logs */
|
package/src/core-bridge.ts
CHANGED
|
@@ -2,10 +2,16 @@ import fs from "node:fs";
|
|
|
2
2
|
import path from "node:path";
|
|
3
3
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
4
4
|
|
|
5
|
+
import type { VoiceCallTtsConfig } from "./config.js";
|
|
6
|
+
|
|
5
7
|
export type CoreConfig = {
|
|
6
8
|
session?: {
|
|
7
9
|
store?: string;
|
|
8
10
|
};
|
|
11
|
+
messages?: {
|
|
12
|
+
tts?: VoiceCallTtsConfig;
|
|
13
|
+
};
|
|
14
|
+
[key: string]: unknown;
|
|
9
15
|
};
|
|
10
16
|
|
|
11
17
|
type CoreAgentDeps = {
|
package/src/manager/context.ts
CHANGED
package/src/manager/events.ts
CHANGED
package/src/manager/lookup.ts
CHANGED
package/src/manager/outbound.ts
CHANGED
|
@@ -68,7 +68,7 @@ export async function initiateCall(
|
|
|
68
68
|
// For notify mode with a message, use inline TwiML with <Say>.
|
|
69
69
|
let inlineTwiml: string | undefined;
|
|
70
70
|
if (mode === "notify" && initialMessage) {
|
|
71
|
-
const pollyVoice = mapVoiceToPolly(ctx.config.tts
|
|
71
|
+
const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
|
|
72
72
|
inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
|
|
73
73
|
console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
|
|
74
74
|
}
|
|
@@ -120,11 +120,13 @@ export async function speak(
|
|
|
120
120
|
|
|
121
121
|
addTranscriptEntry(call, "bot", text);
|
|
122
122
|
|
|
123
|
+
const voice =
|
|
124
|
+
ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
|
|
123
125
|
await ctx.provider.playTts({
|
|
124
126
|
callId,
|
|
125
127
|
providerCallId: call.providerCallId,
|
|
126
128
|
text,
|
|
127
|
-
voice
|
|
129
|
+
voice,
|
|
128
130
|
});
|
|
129
131
|
|
|
130
132
|
return { success: true };
|
|
@@ -244,4 +246,3 @@ export async function endCall(
|
|
|
244
246
|
return { success: false, error: err instanceof Error ? err.message : String(err) };
|
|
245
247
|
}
|
|
246
248
|
}
|
|
247
|
-
|
package/src/manager/state.ts
CHANGED
package/src/manager/store.ts
CHANGED
package/src/manager/timers.ts
CHANGED
package/src/manager/twiml.ts
CHANGED
package/src/manager.ts
CHANGED
|
@@ -143,7 +143,7 @@ export class CallManager {
|
|
|
143
143
|
// For notify mode with a message, use inline TwiML with <Say>
|
|
144
144
|
let inlineTwiml: string | undefined;
|
|
145
145
|
if (mode === "notify" && initialMessage) {
|
|
146
|
-
const pollyVoice = mapVoiceToPolly(this.config.tts
|
|
146
|
+
const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
|
|
147
147
|
inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
|
|
148
148
|
console.log(
|
|
149
149
|
`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
|
|
@@ -210,11 +210,13 @@ export class CallManager {
|
|
|
210
210
|
this.addTranscriptEntry(call, "bot", text);
|
|
211
211
|
|
|
212
212
|
// Play TTS
|
|
213
|
+
const voice =
|
|
214
|
+
this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
|
|
213
215
|
await this.provider.playTts({
|
|
214
216
|
callId,
|
|
215
217
|
providerCallId: call.providerCallId,
|
|
216
218
|
text,
|
|
217
|
-
voice
|
|
219
|
+
voice,
|
|
218
220
|
});
|
|
219
221
|
|
|
220
222
|
return { success: true };
|
|
package/src/media-stream.test.ts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
|
|
3
|
+
import type {
|
|
4
|
+
OpenAIRealtimeSTTProvider,
|
|
5
|
+
RealtimeSTTSession,
|
|
6
|
+
} from "./providers/stt-openai-realtime.js";
|
|
7
|
+
import { MediaStreamHandler } from "./media-stream.js";
|
|
8
|
+
|
|
9
|
+
const createStubSession = (): RealtimeSTTSession => ({
|
|
10
|
+
connect: async () => {},
|
|
11
|
+
sendAudio: () => {},
|
|
12
|
+
waitForTranscript: async () => "",
|
|
13
|
+
onPartial: () => {},
|
|
14
|
+
onTranscript: () => {},
|
|
15
|
+
onSpeechStart: () => {},
|
|
16
|
+
close: () => {},
|
|
17
|
+
isConnected: () => true,
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
|
|
21
|
+
({
|
|
22
|
+
createSession: () => createStubSession(),
|
|
23
|
+
}) as unknown as OpenAIRealtimeSTTProvider;
|
|
24
|
+
|
|
25
|
+
const flush = async (): Promise<void> => {
|
|
26
|
+
await new Promise((resolve) => setTimeout(resolve, 0));
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
const waitForAbort = (signal: AbortSignal): Promise<void> =>
|
|
30
|
+
new Promise((resolve) => {
|
|
31
|
+
if (signal.aborted) {
|
|
32
|
+
resolve();
|
|
33
|
+
return;
|
|
34
|
+
}
|
|
35
|
+
signal.addEventListener("abort", () => resolve(), { once: true });
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
describe("MediaStreamHandler TTS queue", () => {
|
|
39
|
+
it("serializes TTS playback and resolves in order", async () => {
|
|
40
|
+
const handler = new MediaStreamHandler({
|
|
41
|
+
sttProvider: createStubSttProvider(),
|
|
42
|
+
});
|
|
43
|
+
const started: number[] = [];
|
|
44
|
+
const finished: number[] = [];
|
|
45
|
+
|
|
46
|
+
let resolveFirst!: () => void;
|
|
47
|
+
const firstGate = new Promise<void>((resolve) => {
|
|
48
|
+
resolveFirst = resolve;
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
const first = handler.queueTts("stream-1", async () => {
|
|
52
|
+
started.push(1);
|
|
53
|
+
await firstGate;
|
|
54
|
+
finished.push(1);
|
|
55
|
+
});
|
|
56
|
+
const second = handler.queueTts("stream-1", async () => {
|
|
57
|
+
started.push(2);
|
|
58
|
+
finished.push(2);
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
await flush();
|
|
62
|
+
expect(started).toEqual([1]);
|
|
63
|
+
|
|
64
|
+
resolveFirst();
|
|
65
|
+
await first;
|
|
66
|
+
await second;
|
|
67
|
+
|
|
68
|
+
expect(started).toEqual([1, 2]);
|
|
69
|
+
expect(finished).toEqual([1, 2]);
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
it("cancels active playback and clears queued items", async () => {
|
|
73
|
+
const handler = new MediaStreamHandler({
|
|
74
|
+
sttProvider: createStubSttProvider(),
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
let queuedRan = false;
|
|
78
|
+
const started: string[] = [];
|
|
79
|
+
|
|
80
|
+
const active = handler.queueTts("stream-1", async (signal) => {
|
|
81
|
+
started.push("active");
|
|
82
|
+
await waitForAbort(signal);
|
|
83
|
+
});
|
|
84
|
+
void handler.queueTts("stream-1", async () => {
|
|
85
|
+
queuedRan = true;
|
|
86
|
+
});
|
|
87
|
+
|
|
88
|
+
await flush();
|
|
89
|
+
expect(started).toEqual(["active"]);
|
|
90
|
+
|
|
91
|
+
handler.clearTtsQueue("stream-1");
|
|
92
|
+
await active;
|
|
93
|
+
await flush();
|
|
94
|
+
|
|
95
|
+
expect(queuedRan).toBe(false);
|
|
96
|
+
});
|
|
97
|
+
});
|