@ozaiya/openclaw-channel 0.7.6 → 0.7.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/api.d.ts +34 -0
- package/dist/src/api.js +71 -0
- package/dist/src/api.js.map +1 -1
- package/dist/src/channel.js +524 -43
- package/dist/src/channel.js.map +1 -1
- package/dist/src/gateway.js +1 -0
- package/dist/src/gateway.js.map +1 -1
- package/dist/src/phoneCall.d.ts +72 -0
- package/dist/src/phoneCall.js +360 -0
- package/dist/src/phoneCall.js.map +1 -0
- package/dist/src/ttsEngine.d.ts +32 -0
- package/dist/src/ttsEngine.js +202 -0
- package/dist/src/ttsEngine.js.map +1 -0
- package/dist/src/types.d.ts +33 -0
- package/dist/src/voiceCall.d.ts +19 -2
- package/dist/src/voiceCall.js +112 -21
- package/dist/src/voiceCall.js.map +1 -1
- package/package.json +1 -1
- package/types/openclaw-plugin-sdk.d.ts +16 -0
package/dist/src/channel.js
CHANGED
|
@@ -9,7 +9,7 @@ import fs from "node:fs/promises";
|
|
|
9
9
|
import path from "node:path";
|
|
10
10
|
import { registerPluginHttpRoute } from "openclaw/plugin-sdk/webhook-ingress";
|
|
11
11
|
import { unwrapGroupKey, decryptMessage, encryptMessage, wrapGroupKey } from "./crypto.js";
|
|
12
|
-
import { sendMessage, probeApi, fetchGroups, addMember, getUserPublicKeys, toggleReaction, editMessage, deleteMessage, pinMessage, unpinMessage, uploadFile, searchUsers, fetchLinkPreview, joinCall, leaveCall, } from "./api.js";
|
|
12
|
+
import { sendMessage, probeApi, fetchGroups, addMember, getUserPublicKeys, toggleReaction, editMessage, deleteMessage, pinMessage, unpinMessage, uploadFile, searchUsers, fetchLinkPreview, joinCall, leaveCall, startPhoneCall, endPhoneCall, updatePhoneCallStatus, } from "./api.js";
|
|
13
13
|
import { botCreateDirect, botCreateGroup } from "./botActions.js";
|
|
14
14
|
import { buildInlineKeyboardSummary, buildLinkPreviewSummary, normalizeMessageText, normalizeToolInlineKeyboardRows, } from "./richContent.js";
|
|
15
15
|
import { normalizeCallbackQueryPayload } from "./callbackQuery.js";
|
|
@@ -19,6 +19,7 @@ import { getOzaiyaRuntime } from "./runtime.js";
|
|
|
19
19
|
import { maybeTranscribeInboundAudio, prependVoiceTranscriptToAgentInput, resolveOzaiyaSttConfig, } from "./transcribeAudio.js";
|
|
20
20
|
import { startGatewayMode } from "./gateway.js";
|
|
21
21
|
import { VoiceCallSession } from "./voiceCall.js";
|
|
22
|
+
import { PhoneCallSession } from "./phoneCall.js";
|
|
22
23
|
const DEFAULT_API_BASE_URL = "https://api.ozai.dev";
|
|
23
24
|
const DEFAULT_WEBHOOK_PATH = "/ozaiya/webhook";
|
|
24
25
|
const DEFAULT_ACCOUNT_ID = "default";
|
|
@@ -34,6 +35,10 @@ const unwrappedKeys = new Map();
|
|
|
34
35
|
const groupToBotAccountId = new Map();
|
|
35
36
|
// Active voice call sessions keyed by callId
|
|
36
37
|
const activeVoiceCalls = new Map();
|
|
38
|
+
// Active phone call sessions keyed by phoneCallId
|
|
39
|
+
const activePhoneCalls = new Map();
|
|
40
|
+
// Map groupId → phoneCallId for active phone calls (for manual mode message routing)
|
|
41
|
+
const groupToActivePhoneCall = new Map();
|
|
37
42
|
// Runtime state tracking
|
|
38
43
|
const runtimeState = new Map();
|
|
39
44
|
function recordState(accountId, patch) {
|
|
@@ -321,13 +326,12 @@ function resolveAttachmentMime(fileName, mime) {
|
|
|
321
326
|
return "application/octet-stream";
|
|
322
327
|
}
|
|
323
328
|
/**
|
|
324
|
-
* Synthesize text to speech via Deepgram Aura and return
|
|
329
|
+
* Synthesize text to speech via Deepgram Aura and return an MP3 buffer.
|
|
325
330
|
* Returns null if TTS is unavailable or fails.
|
|
326
331
|
*/
|
|
327
|
-
async function
|
|
332
|
+
async function synthesizeSpeechToMp3(text, deepgramApiKey, ttsModel, log) {
|
|
328
333
|
try {
|
|
329
|
-
const
|
|
330
|
-
const url = `https://api.deepgram.com/v1/speak?model=${encodeURIComponent(ttsModel)}&encoding=linear16&sample_rate=${sampleRate}`;
|
|
334
|
+
const url = `https://api.deepgram.com/v1/speak?model=${encodeURIComponent(ttsModel)}&encoding=mp3`;
|
|
331
335
|
const res = await fetch(url, {
|
|
332
336
|
method: "POST",
|
|
333
337
|
headers: {
|
|
@@ -341,33 +345,264 @@ async function synthesizeSpeechToWav(text, deepgramApiKey, ttsModel, log) {
|
|
|
341
345
|
log?.warn?.(`ozaiya: TTS error: ${res.status} ${res.statusText}`);
|
|
342
346
|
return null;
|
|
343
347
|
}
|
|
344
|
-
|
|
345
|
-
// Wrap raw PCM in a WAV container
|
|
346
|
-
const numChannels = 1;
|
|
347
|
-
const bitsPerSample = 16;
|
|
348
|
-
const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
|
|
349
|
-
const blockAlign = numChannels * (bitsPerSample / 8);
|
|
350
|
-
const header = Buffer.alloc(44);
|
|
351
|
-
header.write("RIFF", 0);
|
|
352
|
-
header.writeUInt32LE(36 + pcmBuffer.length, 4);
|
|
353
|
-
header.write("WAVE", 8);
|
|
354
|
-
header.write("fmt ", 12);
|
|
355
|
-
header.writeUInt32LE(16, 16);
|
|
356
|
-
header.writeUInt16LE(1, 20);
|
|
357
|
-
header.writeUInt16LE(numChannels, 22);
|
|
358
|
-
header.writeUInt32LE(sampleRate, 24);
|
|
359
|
-
header.writeUInt32LE(byteRate, 28);
|
|
360
|
-
header.writeUInt16LE(blockAlign, 32);
|
|
361
|
-
header.writeUInt16LE(bitsPerSample, 34);
|
|
362
|
-
header.write("data", 36);
|
|
363
|
-
header.writeUInt32LE(pcmBuffer.length, 40);
|
|
364
|
-
return Buffer.concat([header, pcmBuffer]);
|
|
348
|
+
return Buffer.from(await res.arrayBuffer());
|
|
365
349
|
}
|
|
366
350
|
catch (err) {
|
|
367
351
|
log?.warn?.(`ozaiya: TTS synthesis failed: ${String(err)}`);
|
|
368
352
|
return null;
|
|
369
353
|
}
|
|
370
354
|
}
|
|
355
|
+
/**
 * Request speech audio from the Volcengine (火山引擎/豆包) TTS v3 unidirectional
 * streaming endpoint, authenticating with the `X-Api-Key` header.
 *
 * The response body is read in full and treated as newline-delimited JSON.
 * Every line that carries a base64 `data` field contributes one audio chunk;
 * lines that fail to parse are ignored.
 *
 * Errors thrown by `fetch` (network failure, 30 s timeout) are not caught here.
 *
 * @param text - Text to speak.
 * @param apiKey - Sent as the `X-Api-Key` header.
 * @param voice - Sent as `req_params.speaker`.
 * @param encoding - Sent as `audio_params.format`.
 * @param speedRatio - Sent as `req_params.speed_ratio`.
 * @param resourceId - Sent as the `X-Api-Resource-Id` header.
 * @param log - Optional logger; `info` / `warn` are called when present.
 * @returns The concatenated audio bytes, or null on an HTTP error or when no
 *          audio chunk was received.
 */
async function synthesizeSpeechVolcengineV3(text, apiKey, voice, encoding, speedRatio, resourceId, log) {
    const requestHeaders = {
        "Content-Type": "application/json",
        "X-Api-Key": apiKey,
        "X-Api-Resource-Id": resourceId,
        // Per-request connection id: timestamp plus a short random suffix
        "X-Api-Connect-Id": `tts-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
    };
    const requestPayload = {
        user: { uid: "ozaiya-bot" },
        req_params: { text, speaker: voice, speed_ratio: speedRatio },
        audio_params: { format: encoding, sample_rate: 24000 },
    };
    log?.info?.(`ozaiya: Volcengine TTS v3 request: speaker=${voice} encoding=${encoding} resourceId=${resourceId}`);
    const response = await fetch("https://openspeech.bytedance.com/api/v3/tts/unidirectional", {
        method: "POST",
        headers: requestHeaders,
        body: JSON.stringify(requestPayload),
        signal: AbortSignal.timeout(30_000),
    });
    if (!response.ok) {
        const failureBody = await response.text().catch(() => "");
        log?.warn?.(`ozaiya: Volcengine TTS v3 HTTP error: ${response.status} ${failureBody}`);
        return null;
    }
    // The whole stream is buffered as text, then split into JSON lines
    const rawBody = await response.text();
    const pieces = [];
    const candidateLines = rawBody
        .split("\n")
        .map((row) => row.trim())
        .filter(Boolean);
    for (const entry of candidateLines) {
        try {
            const parsed = JSON.parse(entry);
            if (parsed.data) {
                pieces.push(Buffer.from(parsed.data, "base64"));
            }
            // Any code other than 0 / 20000000 is reported, but does not abort the loop
            const reportsError = Boolean(parsed.code) && parsed.code !== 20000000 && parsed.code !== 0;
            if (reportsError) {
                log?.warn?.(`ozaiya: Volcengine TTS v3 chunk error: code=${parsed.code} message=${parsed.message}`);
            }
        }
        catch {
            // Not a usable JSON line — ignore it and keep going
        }
    }
    if (pieces.length === 0) {
        log?.warn?.(`ozaiya: Volcengine TTS v3: no audio data in response (${rawBody.length} bytes raw)`);
        return null;
    }
    log?.info?.(`ozaiya: Volcengine TTS v3: collected ${pieces.length} chunks`);
    return Buffer.concat(pieces);
}
|
|
412
|
+
/**
 * Synthesize text to speech via the Volcengine (火山引擎/豆包) TTS v1 HTTP API (legacy).
 *
 * Authenticates with an `Authorization: Bearer;<token>` header (the semicolon is
 * intentional) and also repeats the token inside the JSON payload.
 *
 * Errors thrown by `fetch` (network failure, 30 s timeout) or by JSON decoding
 * are not caught here.
 *
 * @param text - Text to speak.
 * @param config - Volcengine settings; reads `cluster`, `appId` and `accessToken`.
 * @param voice - Sent as `audio.voice_type`.
 * @param encoding - Sent as `audio.encoding`.
 * @param speedRatio - Sent as `audio.speed_ratio`.
 * @param log - Optional logger; `warn` is called when present.
 * @returns The decoded audio bytes, or null on an HTTP error or when the
 *          response does not carry code 3000 with a `data` field.
 */
async function synthesizeSpeechVolcengineV1(text, config, voice, encoding, speedRatio, log) {
    const cluster = config.cluster ?? "volcano_tts";
    // Per-request id: timestamp plus a short random suffix
    const reqId = `tts-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    const payload = {
        app: {
            appid: config.appId ?? "",
            token: config.accessToken ?? "",
            cluster,
        },
        user: { uid: "ozaiya-bot" },
        audio: {
            voice_type: voice,
            encoding,
            speed_ratio: speedRatio,
        },
        request: {
            reqid: reqId,
            text,
            operation: "query",
        },
    };
    const headers = {
        "Content-Type": "application/json",
        Authorization: `Bearer;${config.accessToken}`,
    };
    const res = await fetch("https://openspeech.bytedance.com/api/v1/tts", {
        method: "POST",
        headers,
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(30_000),
    });
    if (!res.ok) {
        // Include the response body, as the v3 and AI Gateway paths do, so the
        // provider's error detail is not lost.
        const errText = await res.text().catch(() => "");
        log?.warn?.(`ozaiya: Volcengine TTS v1 HTTP error: ${res.status} ${res.statusText} ${errText}`.trimEnd());
        return null;
    }
    const result = await res.json();
    // Only code 3000 together with a data payload is treated as success
    if (result.code !== 3000 || !result.data) {
        log?.warn?.(`ozaiya: Volcengine TTS v1 error: code=${result.code} message=${result.message}`);
        return null;
    }
    return Buffer.from(result.data, "base64");
}
|
|
459
|
+
/**
 * Request speech audio from the Volcengine AI Gateway, which exposes an
 * OpenAI-compatible `/v1/audio/speech` endpoint. Authenticates with the Ark
 * API key as a standard Bearer token.
 *
 * Errors thrown by `fetch` (network failure, 30 s timeout) are not caught here.
 *
 * @param text - Text to speak (sent as `input`).
 * @param arkApiKey - Ark API key used for the Authorization header.
 * @param voice - Sent as `voice`.
 * @param encoding - Sent as `response_format`.
 * @param speedRatio - Sent as `speed`.
 * @param log - Optional logger; `info` / `warn` are called when present.
 * @returns The raw audio bytes, or null on an HTTP error or an empty body.
 */
async function synthesizeSpeechVolcengineGateway(text, arkApiKey, voice, encoding, speedRatio, log) {
    log?.info?.(`ozaiya: Volcengine AI Gateway TTS request: voice=${voice}`);
    const requestInit = {
        method: "POST",
        headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer ${arkApiKey}`,
        },
        body: JSON.stringify({
            model: "doubao-tts",
            input: text,
            voice,
            response_format: encoding,
            speed: speedRatio,
        }),
        signal: AbortSignal.timeout(30_000),
    };
    const response = await fetch("https://ai-gateway.vei.volces.com/v1/audio/speech", requestInit);
    if (!response.ok) {
        const failureBody = await response.text().catch(() => "");
        log?.warn?.(`ozaiya: Volcengine AI Gateway TTS HTTP error: ${response.status} ${failureBody}`);
        return null;
    }
    // On success the body is the audio itself, not JSON
    const audioBytes = await response.arrayBuffer();
    const byteCount = audioBytes.byteLength;
    if (byteCount === 0) {
        log?.warn?.(`ozaiya: Volcengine AI Gateway TTS: empty response`);
        return null;
    }
    log?.info?.(`ozaiya: Volcengine AI Gateway TTS: received ${byteCount} bytes`);
    return Buffer.from(audioBytes);
}
|
|
495
|
+
/**
 * Entry point for Volcengine (火山引擎/豆包) speech synthesis. Picks a backend
 * from the credentials present in `config`, in this order:
 *   1. `arkApiKey`   → AI Gateway (OpenAI-compatible endpoint)
 *   2. `apiKey`      → v3 streaming openspeech endpoint
 *   3. `accessToken` → v1 openspeech endpoint (legacy)
 *
 * Defaults applied here: voice `zh_female_wanwanxiaohe_moon_bigtts`, encoding
 * `mp3`, speed ratio 1.0, and (v3 only) resource id `volc.service_type.10029`.
 *
 * @param text - Text to speak.
 * @param config - Volcengine TTS settings.
 * @param log - Optional logger; `warn` is called when present.
 * @returns Audio bytes from the selected backend, or null when no credential is
 *          configured or the backend throws.
 */
async function synthesizeSpeechVolcengine(text, config, log) {
    try {
        const speaker = config.voice ?? "zh_female_wanwanxiaohe_moon_bigtts";
        const format = config.encoding ?? "mp3";
        const speed = config.speedRatio ?? 1.0;
        if (config.arkApiKey) {
            return await synthesizeSpeechVolcengineGateway(text, config.arkApiKey, speaker, format, speed, log);
        }
        if (config.apiKey) {
            const resource = config.resourceId ?? "volc.service_type.10029";
            return await synthesizeSpeechVolcengineV3(text, config.apiKey, speaker, format, speed, resource, log);
        }
        if (config.accessToken) {
            return await synthesizeSpeechVolcengineV1(text, config, speaker, format, speed, log);
        }
        // Nothing usable was configured
        log?.warn?.(`ozaiya: Volcengine TTS: no arkApiKey, apiKey, or accessToken configured`);
        return null;
    }
    catch (failure) {
        // Backend errors are reported as "no audio" so callers can fall back
        log?.warn?.(`ozaiya: Volcengine TTS failed: ${String(failure)}`);
        return null;
    }
}
|
|
528
|
+
/**
 * Map a Volcengine encoding name to the file extension used for the voice file.
 *
 * `ogg_opus` is mapped to `.opus` so the extension is one a MIME lookup can
 * recognise; every other encoding is used verbatim. A missing encoding means
 * the synthesizer's default, `mp3`.
 */
function volcengineEncodingToExt(encoding) {
    const name = encoding ?? "mp3";
    return name === "ogg_opus" ? ".opus" : `.${name}`;
}
/**
 * Synthesize a voice reply, trying providers in this order:
 *   0. Per-bot voice config on `ctx.account.voiceConfig` (Volcengine or Deepgram)
 *   1. Channel-level Volcengine TTS (`channels.ozaiya.volcengineTts`)
 *   2. OpenClaw runtime TTS (`runtime.tts.textToSpeech`), if available
 *   3. Deepgram direct API, keyed by `voiceCall.deepgramApiKey` or the
 *      `DEEPGRAM_API_KEY` environment variable
 * Each failed step logs a warning and falls through to the next one.
 *
 * @param text - Text to speak.
 * @param ctx - Handler context; reads `cfg`, `account` and `log`.
 * @param voiceOverride - Optional voice id applied to the Volcengine steps.
 * @returns `{ data, ext }` with the audio bytes and a dot-prefixed file
 *          extension, or null when no provider produced audio.
 */
async function synthesizeVoiceReply(text, ctx, voiceOverride) {
    const ozaiyaChannelCfg = (ctx.cfg?.channels?.ozaiya ?? {});
    const account = ctx.account;
    const perBotVoice = account?.voiceConfig;
    // 0. Per-bot voice config from gateway (highest priority)
    if (perBotVoice?.provider === 'volcengine' && perBotVoice.appId && perBotVoice.accessToken) {
        ctx.log?.info?.(`ozaiya: using per-bot Volcengine TTS`);
        const volcCfg = {
            appId: perBotVoice.appId,
            accessToken: perBotVoice.accessToken,
            ...(voiceOverride ? { voice: voiceOverride } : {}),
        };
        const buf = await synthesizeSpeechVolcengine(text, volcCfg, ctx.log);
        if (buf) {
            return { data: buf, ext: volcengineEncodingToExt(volcCfg.encoding) };
        }
        ctx.log?.warn?.(`ozaiya: per-bot Volcengine TTS failed, trying fallbacks`);
    }
    if (perBotVoice?.provider === 'deepgram' && perBotVoice.apiKey) {
        ctx.log?.info?.(`ozaiya: using per-bot Deepgram TTS`);
        const ttsModel = ozaiyaChannelCfg?.voiceCall?.tts?.model ?? "aura-asteria-en";
        const mp3Buffer = await synthesizeSpeechToMp3(text, perBotVoice.apiKey, ttsModel, ctx.log);
        if (mp3Buffer)
            return { data: mp3Buffer, ext: ".mp3" };
        ctx.log?.warn?.(`ozaiya: per-bot Deepgram TTS failed, trying fallbacks`);
    }
    // 1. Volcengine TTS (preferred for Chinese)
    if (ozaiyaChannelCfg?.volcengineTts?.arkApiKey || ozaiyaChannelCfg?.volcengineTts?.apiKey || ozaiyaChannelCfg?.volcengineTts?.appId) {
        const effectiveVolcCfg = voiceOverride
            ? { ...ozaiyaChannelCfg.volcengineTts, voice: voiceOverride }
            : ozaiyaChannelCfg.volcengineTts;
        ctx.log?.info?.(`ozaiya: using Volcengine TTS (voice=${effectiveVolcCfg.voice ?? 'default'})`);
        const buf = await synthesizeSpeechVolcengine(text, effectiveVolcCfg, ctx.log);
        if (buf) {
            // Derive the extension from the encoding that was actually requested
            const ext = volcengineEncodingToExt(effectiveVolcCfg.encoding);
            return { data: buf, ext };
        }
        ctx.log?.warn?.(`ozaiya: Volcengine TTS failed, trying fallbacks`);
    }
    // 2. OpenClaw runtime TTS (edge, openai, elevenlabs etc.)
    try {
        const runtime = getOzaiyaRuntime();
        if (runtime.tts?.textToSpeech) {
            ctx.log?.info?.(`ozaiya: using runtime TTS`);
            const result = await runtime.tts.textToSpeech({
                text,
                cfg: ctx.cfg,
                channel: "ozaiya",
            });
            if (result.success && result.audioPath) {
                // The runtime hands back a file path; load it and keep its extension
                const audioData = await fs.readFile(result.audioPath);
                const ext = path.extname(result.audioPath) || ".mp3";
                ctx.log?.info?.(`ozaiya: runtime TTS succeeded (provider=${result.provider}, ${result.latencyMs}ms, ${ext})`);
                return { data: Buffer.from(audioData), ext };
            }
            ctx.log?.warn?.(`ozaiya: runtime TTS failed: ${result.error}`);
        }
    }
    catch (err) {
        ctx.log?.warn?.(`ozaiya: runtime TTS unavailable: ${String(err)}`);
    }
    // 3. Deepgram direct API (English fallback)
    const ttsApiKey = ozaiyaChannelCfg?.voiceCall?.deepgramApiKey || process.env.DEEPGRAM_API_KEY || "";
    const ttsModel = ozaiyaChannelCfg?.voiceCall?.tts?.model ?? "aura-asteria-en";
    if (!ttsApiKey)
        return null;
    ctx.log?.info?.(`ozaiya: falling back to Deepgram TTS`);
    const mp3Buffer = await synthesizeSpeechToMp3(text, ttsApiKey, ttsModel, ctx.log);
    return mp3Buffer ? { data: mp3Buffer, ext: ".mp3" } : null;
}
|
|
371
606
|
async function getGroupKeyOrThrow(account, groupId) {
|
|
372
607
|
let groupKey = unwrappedKeys.get(groupId);
|
|
373
608
|
if (!groupKey) {
|
|
@@ -700,6 +935,8 @@ export const ozaiyaPlugin = {
|
|
|
700
935
|
createPinMessageTool(account),
|
|
701
936
|
createSearchUsersTool(account),
|
|
702
937
|
createListGroupsTool(account),
|
|
938
|
+
createMakePhoneCallTool(account, cfg),
|
|
939
|
+
createHangUpCallTool(account),
|
|
703
940
|
];
|
|
704
941
|
}),
|
|
705
942
|
gateway: {
|
|
@@ -712,7 +949,7 @@ export const ozaiyaPlugin = {
|
|
|
712
949
|
ctx.log?.info(`[gateway] starting gateway mode`);
|
|
713
950
|
recordState(account.accountId, { running: true, lastStartAt: Date.now() });
|
|
714
951
|
const ozaiya = resolveConfig(ctx.cfg);
|
|
715
|
-
const stateDir = ctx.runtime?.state?.resolveStateDir?.() ?? process.env.HOME ?? ".";
|
|
952
|
+
const stateDir = process.env.OPENCLAW_STATE_DIR ?? ctx.runtime?.state?.resolveStateDir?.() ?? process.env.HOME ?? ".";
|
|
716
953
|
// Track per-bot unregister functions for hot-reload
|
|
717
954
|
const botUnregisters = new Map();
|
|
718
955
|
const startBotHandler = (botAccount) => {
|
|
@@ -840,6 +1077,12 @@ export const ozaiyaPlugin = {
|
|
|
840
1077
|
session.disconnect().catch(() => { });
|
|
841
1078
|
activeVoiceCalls.delete(callId);
|
|
842
1079
|
}
|
|
1080
|
+
// Disconnect all active phone call sessions
|
|
1081
|
+
for (const [phoneCallId, session] of activePhoneCalls) {
|
|
1082
|
+
session.disconnect().catch(() => { });
|
|
1083
|
+
activePhoneCalls.delete(phoneCallId);
|
|
1084
|
+
}
|
|
1085
|
+
groupToActivePhoneCall.clear();
|
|
843
1086
|
for (const id of gatewayBotAccounts.keys()) {
|
|
844
1087
|
recordState(id, { running: false, lastStopAt: Date.now() });
|
|
845
1088
|
}
|
|
@@ -1523,6 +1766,227 @@ function createListGroupsTool(account) {
|
|
|
1523
1766
|
},
|
|
1524
1767
|
};
|
|
1525
1768
|
}
|
|
1769
|
+
/**
 * Build the `make_phone_call` agent tool for one bot account.
 *
 * The tool starts an outbound call through the API, creates a
 * `PhoneCallSession`, registers it in `activePhoneCalls` and
 * `groupToActivePhoneCall`, and connects it. If connecting fails, the
 * registration is rolled back and the call is ended, so no stale session is
 * left behind.
 *
 * @param account - Bot account; reads `apiBaseUrl`, `botToken` and `voiceConfig`.
 * @param cfg - Full configuration; reads `channels.ozaiya`.
 * @returns A tool descriptor whose `execute` always resolves to a text content
 *          result (errors are reported as text, not thrown).
 */
function createMakePhoneCallTool(account, cfg) {
    return {
        label: "Make Phone Call",
        name: "make_phone_call",
        ownerOnly: false,
        description: "Initiate an outbound phone call to a PSTN number via SIP. " +
            "The call is recorded and the recording + transcript are saved to the chat. " +
            "In 'auto' mode, you speak directly to the callee via STT/TTS. " +
            "In 'manual' mode, the callee's speech is transcribed to chat for the user to respond.",
        parameters: {
            type: "object",
            properties: {
                groupId: {
                    type: "string",
                    description: "The group/DM where the call record will be posted.",
                },
                phoneNumber: {
                    type: "string",
                    description: "E.164 phone number to call (e.g. +8613800138000).",
                },
                mode: {
                    type: "string",
                    enum: ["auto", "manual"],
                    description: "auto = AI handles conversation, manual = transcribe to chat for user to reply. Default: auto.",
                },
                purpose: {
                    type: "string",
                    description: "Brief note about why you're making this call (logged for reference).",
                },
            },
            required: ["groupId", "phoneNumber"],
        },
        execute: async (_toolCallId, rawArgs) => {
            const args = rawArgs;
            try {
                const mode = args.mode ?? "auto";
                const result = await startPhoneCall(account.apiBaseUrl, account.botToken, args.groupId, args.phoneNumber, mode, args.purpose);
                // Report connected status (best-effort; a failure here must not abort the call)
                updatePhoneCallStatus(account.apiBaseUrl, account.botToken, result.phoneCallId, "connected").catch(() => { });
                const ozaiyaCfg = (cfg?.channels?.ozaiya ?? {});
                const voiceCallCfg = ozaiyaCfg.voiceCall ?? {};
                // Override per-bot voice config and pass volcengineTts from channel config
                const effectiveVoiceCallCfg = { ...voiceCallCfg };
                if (account.voiceConfig?.provider === "deepgram" && account.voiceConfig.apiKey) {
                    effectiveVoiceCallCfg.deepgramApiKey = account.voiceConfig.apiKey;
                }
                if (!effectiveVoiceCallCfg.volcengineTts && ozaiyaCfg.volcengineTts) {
                    effectiveVoiceCallCfg.volcengineTts = ozaiyaCfg.volcengineTts;
                }
                // Create PhoneCallSession
                const session = new PhoneCallSession({
                    phoneCallId: result.phoneCallId,
                    groupId: args.groupId,
                    livekitToken: result.livekitToken,
                    livekitUrl: result.livekitUrl,
                    mode,
                    voiceCallConfig: effectiveVoiceCallCfg,
                    onTranscript: (text) => {
                        if (mode === "auto") {
                            // Auto mode: dispatch to agent, speak reply
                            void handlePhoneAutoTranscript(text, session, account, cfg, args.groupId);
                        }
                        else {
                            // Manual mode: post to chat as a message
                            void postPhoneTranscriptToChat(text, account, args.groupId);
                        }
                    },
                    onPhoneHangUp: () => {
                        // Phone person hung up — end the call
                        void cleanupPhoneCall(result.phoneCallId, session, account);
                    },
                });
                activePhoneCalls.set(result.phoneCallId, session);
                groupToActivePhoneCall.set(args.groupId, result.phoneCallId);
                // Connect to LiveKit
                try {
                    await session.connect();
                }
                catch (connectErr) {
                    // Roll back: without this the dead session stays registered (and keeps
                    // capturing manual-mode chat messages) and the started call is never ended.
                    await cleanupPhoneCall(result.phoneCallId, session, account).catch(() => { });
                    throw connectErr;
                }
                return {
                    content: [{
                            type: "text",
                            text: `Phone call initiated (ID: ${result.phoneCallId}). ` +
                                `Calling ${args.phoneNumber} in ${mode} mode. ` +
                                `The call is being recorded. Use hang_up_call to end it.`,
                        }],
                };
            }
            catch (err) {
                const msg = err instanceof Error ? err.message : String(err);
                return { content: [{ type: "text", text: `Error starting phone call: ${msg}` }] };
            }
        },
    };
}
|
|
1861
|
+
/**
 * Build the `hang_up_call` agent tool for one bot account.
 *
 * When the call id belongs to a session held in `activePhoneCalls`, the session
 * is torn down through `cleanupPhoneCall`; otherwise the call is ended through
 * the API only.
 *
 * @param account - Bot account; reads `apiBaseUrl` and `botToken`.
 * @returns A tool descriptor whose `execute` always resolves to a text content
 *          result (errors are reported as text, not thrown).
 */
function createHangUpCallTool(account) {
    const parameters = {
        type: "object",
        properties: {
            phoneCallId: {
                type: "string",
                description: "The phone call ID returned by make_phone_call.",
            },
        },
        required: ["phoneCallId"],
    };
    const asText = (text) => ({ content: [{ type: "text", text }] });
    const execute = async (_toolCallId, rawArgs) => {
        try {
            const callId = rawArgs.phoneCallId;
            const liveSession = activePhoneCalls.get(callId);
            // No local session → still ask the API to end the call
            await (liveSession
                ? cleanupPhoneCall(callId, liveSession, account)
                : endPhoneCall(account.apiBaseUrl, account.botToken, callId));
            return asText(`Phone call ${callId} ended.`);
        }
        catch (failure) {
            const reason = failure instanceof Error ? failure.message : String(failure);
            return asText(`Error ending phone call: ${reason}`);
        }
    };
    return {
        label: "Hang Up Call",
        name: "hang_up_call",
        ownerOnly: false,
        description: "End an active outbound phone call.",
        parameters,
        execute,
    };
}
|
|
1899
|
+
/**
 * Handle one transcribed utterance from the callee in auto mode: build an
 * inbound message context, dispatch it to the agent, and speak each non-empty
 * reply on the call through `session.speakReply`.
 *
 * Callers invoke this fire-and-forget (`void handlePhoneAutoTranscript(...)`),
 * so every failure is caught and logged here instead of rejecting; an escaped
 * rejection would surface as an unhandled promise rejection.
 *
 * @param text - Transcribed speech from the phone side.
 * @param session - Active phone call session; reads `isDisposed`, calls `speakReply`.
 * @param account - Bot account; reads `accountId`.
 * @param cfg - Full configuration; reads `channels.ozaiya.voiceCall.agentPrompt`.
 * @param groupId - Group the call belongs to; used for routing and addressing.
 * @returns Resolves once dispatch completes; never rejects.
 */
async function handlePhoneAutoTranscript(text, session, account, cfg, groupId) {
    try {
        if (session.isDisposed)
            return;
        const runtime = getOzaiyaRuntime();
        const ch = runtime.channel;
        const route = ch.routing.resolveAgentRoute({
            cfg,
            channel: "ozaiya",
            accountId: account.accountId,
            peer: { kind: "group", id: groupId },
        });
        const ozaiyaCfg = (cfg?.channels?.ozaiya ?? {});
        // A configured agentPrompt replaces the built-in spoken-language instructions
        const voicePrompt = ozaiyaCfg.voiceCall?.agentPrompt ??
            "[Phone Call] You are in a live phone call. Your response will be spoken aloud via TTS. " +
                "Rules: respond concisely (1-3 sentences), use natural spoken language, " +
                "never use markdown/code blocks/bullet lists/URLs/emojis. " +
                "Do not say \"sure\" or \"of course\" — just answer directly.";
        const bodyForAgent = `${voicePrompt}\n\n${text}`;
        const body = `[ozaiya] from: phone caller | at: ${new Date().toISOString()}\n---\n${text}`;
        const msgCtx = ch.reply.finalizeInboundContext({
            Body: body,
            BodyForAgent: bodyForAgent,
            RawBody: text,
            CommandBody: text,
            From: `ozaiya:group:${groupId}`,
            To: `ozaiya:group:${groupId}`,
            SessionKey: route.sessionKey,
            AccountId: route.accountId,
            ChatType: "group",
            ConversationLabel: `group:${groupId}`,
            GroupSubject: groupId,
            SenderId: "phone-caller",
            SenderName: "Phone Caller",
            Provider: "ozaiya",
            Surface: "ozaiya-phone",
            MessageSid: `phone-${Date.now()}`,
            Timestamp: Date.now(),
            NumFiles: 0,
            NumMedia: 0,
            HasFiles: false,
            CommandAuthorized: true,
            OriginatingChannel: "ozaiya",
            OriginatingTo: `ozaiya:group:${groupId}`,
        });
        await ch.reply.dispatchReplyWithBufferedBlockDispatcher({
            ctx: msgCtx,
            cfg,
            dispatcherOptions: {
                deliver: async (replyPayload, _info) => {
                    const replyText = replyPayload.text;
                    // Skip blank replies, and replies that arrive after the call ended
                    if (!replyText?.trim() || session.isDisposed)
                        return;
                    await session.speakReply(replyText);
                },
                onError: (err) => {
                    // eslint-disable-next-line no-console
                    console.warn(`ozaiya: phone call auto-reply error: ${String(err)}`);
                },
            },
        });
    }
    catch (err) {
        // eslint-disable-next-line no-console
        console.warn(`ozaiya: phone call auto-reply failed: ${String(err)}`);
    }
}
|
|
1963
|
+
/**
 * Post the callee's transcribed speech to the chat group (manual mode).
 *
 * The message is encrypted with the group key cached in `unwrappedKeys`. This
 * is best-effort: it never rejects, but a missing key or a send failure is
 * logged rather than dropped silently, so lost transcripts can be diagnosed.
 *
 * @param text - Transcribed speech from the phone side.
 * @param account - Bot account; reads `apiBaseUrl` and `botToken`.
 * @param groupId - Group to post into.
 * @returns Resolves when the message was sent or the attempt was abandoned.
 */
async function postPhoneTranscriptToChat(text, account, groupId) {
    try {
        const groupKey = unwrappedKeys.get(groupId);
        if (!groupKey) {
            // eslint-disable-next-line no-console
            console.warn(`ozaiya: cannot post phone transcript to group ${groupId}: group key not loaded`);
            return;
        }
        const content = {
            // User-facing prefix; the Chinese text means "The other party said:"
            text: `📞 对方说: ${text}`,
        };
        const encrypted = encryptMessage(content, groupKey);
        await sendMessage(account.apiBaseUrl, account.botToken, groupId, encrypted);
    }
    catch (err) {
        // Fire-and-forget: report the failure but never propagate it
        // eslint-disable-next-line no-console
        console.warn(`ozaiya: failed to post phone transcript to group ${groupId}: ${String(err)}`);
    }
}
|
|
1981
|
+
/**
 * Clean up a phone call session: remove it from the tracking maps, disconnect
 * the session, and end the call via the API (passing the session transcript).
 *
 * The API call is always attempted, even when disconnecting fails, so the call
 * is not left open on the server. The group mapping is removed only while it
 * still points at this call, so a newer call in the same group is left intact.
 *
 * @param phoneCallId - Id of the call being cleaned up.
 * @param session - The call's session; reads `groupId` and `transcript`, calls `disconnect`.
 * @param account - Bot account; reads `apiBaseUrl` and `botToken`.
 * @returns Resolves when cleanup finished; disconnect and API failures do not reject.
 */
async function cleanupPhoneCall(phoneCallId, session, account) {
    activePhoneCalls.delete(phoneCallId);
    if (groupToActivePhoneCall.get(session.groupId) === phoneCallId) {
        groupToActivePhoneCall.delete(session.groupId);
    }
    try {
        await session.disconnect();
    }
    catch (err) {
        // Keep going: the call still has to be ended on the API side
        // eslint-disable-next-line no-console
        console.warn(`ozaiya: phone call ${phoneCallId} disconnect failed: ${String(err)}`);
    }
    await endPhoneCall(account.apiBaseUrl, account.botToken, phoneCallId, session.transcript).catch(() => { });
}
|
|
1526
1990
|
/**
|
|
1527
1991
|
* Handle an inbound webhook message:
|
|
1528
1992
|
* 1. Decrypt message content
|
|
@@ -1534,7 +1998,7 @@ function createListGroupsTool(account) {
|
|
|
1534
1998
|
async function handleInboundMessage(payload,
|
|
1535
1999
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
1536
2000
|
ctx) {
|
|
1537
|
-
const { groupId, groupType = "group", message, replyAllowed = true, voiceReply, voiceReplyPrompt, context } = payload;
|
|
2001
|
+
const { groupId, groupType = "group", message, replyAllowed = true, voiceReply, voiceReplyPrompt, voiceReplyVoice, context } = payload;
|
|
1538
2002
|
const account = ctx.account;
|
|
1539
2003
|
// Record inbound activity
|
|
1540
2004
|
recordState(account.accountId, { lastInboundAt: Date.now() });
|
|
@@ -1561,6 +2025,20 @@ ctx) {
|
|
|
1561
2025
|
return;
|
|
1562
2026
|
}
|
|
1563
2027
|
const messageText = normalizeMessageText(content.text);
|
|
2028
|
+
// Manual mode phone call: if this group has an active phone call in manual mode,
|
|
2029
|
+
// speak the user's text message to the phone via TTS instead of dispatching to agent.
|
|
2030
|
+
const activePhoneCallId = groupToActivePhoneCall.get(groupId);
|
|
2031
|
+
if (activePhoneCallId && messageText) {
|
|
2032
|
+
const phoneSession = activePhoneCalls.get(activePhoneCallId);
|
|
2033
|
+
if (phoneSession && phoneSession.mode === "manual" && !phoneSession.isDisposed) {
|
|
2034
|
+
// Don't intercept bot's own messages (transcriptions posted by the bot)
|
|
2035
|
+
if (message.senderId !== account.accountId) {
|
|
2036
|
+
ctx.log?.info?.(`ozaiya: routing user message to active phone call ${activePhoneCallId} (manual mode)`);
|
|
2037
|
+
phoneSession.speakReply(messageText).catch(() => { });
|
|
2038
|
+
return; // Don't dispatch to agent
|
|
2039
|
+
}
|
|
2040
|
+
}
|
|
2041
|
+
}
|
|
1564
2042
|
const inboundAttachments = normalizeAttachments(content.files);
|
|
1565
2043
|
const attachmentSummary = buildAttachmentSummary(inboundAttachments);
|
|
1566
2044
|
const linkPreviewSummary = buildLinkPreviewSummary(content.linkPreviews);
|
|
@@ -1731,12 +2209,6 @@ ctx) {
|
|
|
1731
2209
|
}).catch((err) => {
|
|
1732
2210
|
ctx.log?.warn?.(`ozaiya: failed recording session: ${String(err)}`);
|
|
1733
2211
|
});
|
|
1734
|
-
// Resolve TTS config when voiceReply is active
|
|
1735
|
-
const ozaiyaChannelCfg = voiceReply
|
|
1736
|
-
? (ctx.cfg?.channels?.ozaiya ?? {})
|
|
1737
|
-
: null;
|
|
1738
|
-
const ttsApiKey = ozaiyaChannelCfg?.voiceCall?.deepgramApiKey || process.env.DEEPGRAM_API_KEY || "";
|
|
1739
|
-
const ttsModel = ozaiyaChannelCfg?.voiceCall?.tts?.model ?? "aura-asteria-en";
|
|
1740
2212
|
// Dispatch to agent with buffered block dispatcher
|
|
1741
2213
|
await ch.reply.dispatchReplyWithBufferedBlockDispatcher({
|
|
1742
2214
|
ctx: msgCtx,
|
|
@@ -1744,26 +2216,27 @@ ctx) {
|
|
|
1744
2216
|
dispatcherOptions: {
|
|
1745
2217
|
deliver: async (replyPayload, _info) => {
|
|
1746
2218
|
const replyText = replyPayload.text;
|
|
1747
|
-
ctx.log?.info?.(`ozaiya: deliver called, text length=${replyText?.length ?? 0}, empty=${!replyText?.trim()}, voiceReply=${voiceReply},
|
|
2219
|
+
ctx.log?.info?.(`ozaiya: deliver called, text length=${replyText?.length ?? 0}, empty=${!replyText?.trim()}, voiceReply=${voiceReply}, voiceReplyVoice=${voiceReplyVoice ?? 'none'}`);
|
|
1748
2220
|
if (!replyText?.trim())
|
|
1749
2221
|
return;
|
|
1750
2222
|
// Voice reply: synthesize TTS audio and send as voice message
|
|
1751
|
-
if (voiceReply
|
|
2223
|
+
if (voiceReply) {
|
|
1752
2224
|
ctx.log?.info?.(`ozaiya: voice reply — synthesizing TTS for group ${groupId}`);
|
|
1753
|
-
const
|
|
1754
|
-
if (
|
|
1755
|
-
const
|
|
2225
|
+
const audioBuffer = await synthesizeVoiceReply(replyText, ctx, voiceReplyVoice ?? undefined);
|
|
2226
|
+
if (audioBuffer) {
|
|
2227
|
+
const ext = audioBuffer.ext;
|
|
2228
|
+
const mime = ext === ".mp3" ? "audio/mpeg" : ext === ".opus" ? "audio/ogg" : ext === ".wav" ? "audio/wav" : "audio/mpeg";
|
|
2229
|
+
const fileInfo = await uploadFile(account.apiBaseUrl, account.botToken, groupId, `voice${ext}`, mime, audioBuffer.data);
|
|
1756
2230
|
await sendEncryptedChatContent({
|
|
1757
2231
|
account,
|
|
1758
2232
|
groupId,
|
|
1759
|
-
content: { files: [fileInfo] },
|
|
2233
|
+
content: { text: replyText, files: [fileInfo] },
|
|
1760
2234
|
log: ctx.log,
|
|
1761
2235
|
});
|
|
1762
|
-
ctx.log?.info?.(`ozaiya: voice reply sent successfully`);
|
|
2236
|
+
ctx.log?.info?.(`ozaiya: voice reply sent successfully (${ext}, ${audioBuffer.data.length} bytes)`);
|
|
1763
2237
|
return;
|
|
1764
2238
|
}
|
|
1765
2239
|
ctx.log?.warn?.(`ozaiya: TTS failed, falling back to text reply`);
|
|
1766
|
-
// Notify the user that voice synthesis failed before sending text fallback
|
|
1767
2240
|
await sendEncryptedChatContent({
|
|
1768
2241
|
account,
|
|
1769
2242
|
groupId,
|
|
@@ -1992,12 +2465,20 @@ ctx) {
|
|
|
1992
2465
|
id: payload.groupId,
|
|
1993
2466
|
},
|
|
1994
2467
|
});
|
|
2468
|
+
// Override per-bot voice config and pass volcengineTts from channel config
|
|
2469
|
+
const effectiveVoiceCallCfg = { ...voiceCallCfg };
|
|
2470
|
+
if (account.voiceConfig?.provider === 'deepgram' && account.voiceConfig.apiKey) {
|
|
2471
|
+
effectiveVoiceCallCfg.deepgramApiKey = account.voiceConfig.apiKey;
|
|
2472
|
+
}
|
|
2473
|
+
if (!effectiveVoiceCallCfg.volcengineTts && ozaiyaCfg.volcengineTts) {
|
|
2474
|
+
effectiveVoiceCallCfg.volcengineTts = ozaiyaCfg.volcengineTts;
|
|
2475
|
+
}
|
|
1995
2476
|
const session = new VoiceCallSession({
|
|
1996
2477
|
callId: payload.callId,
|
|
1997
2478
|
groupId: payload.groupId,
|
|
1998
2479
|
livekitToken: joinResult.token,
|
|
1999
2480
|
livekitUrl: joinResult.url,
|
|
2000
|
-
voiceCallConfig:
|
|
2481
|
+
voiceCallConfig: effectiveVoiceCallCfg,
|
|
2001
2482
|
onTranscript: (text) => {
|
|
2002
2483
|
// Dispatch transcript to agent and speak the reply
|
|
2003
2484
|
void handleVoiceTranscript(text, session, route, account, ctx);
|