@ozaiya/openclaw-channel 0.8.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,7 @@ import fs from "node:fs/promises";
9
9
  import path from "node:path";
10
10
  import { registerPluginHttpRoute } from "openclaw/plugin-sdk/webhook-ingress";
11
11
  import { unwrapGroupKey, decryptMessage, encryptMessage, wrapGroupKey } from "./crypto.js";
12
- import { sendMessage, probeApi, fetchGroups, addMember, getUserPublicKeys, toggleReaction, editMessage, deleteMessage, pinMessage, unpinMessage, uploadFile, searchUsers, fetchLinkPreview, joinCall, leaveCall, } from "./api.js";
12
+ import { sendMessage, probeApi, fetchGroups, addMember, getUserPublicKeys, toggleReaction, editMessage, deleteMessage, pinMessage, unpinMessage, uploadFile, searchUsers, fetchLinkPreview, joinCall, leaveCall, startCall, startPhoneCall, endPhoneCall, updatePhoneCallStatus, } from "./api.js";
13
13
  import { botCreateDirect, botCreateGroup } from "./botActions.js";
14
14
  import { buildInlineKeyboardSummary, buildLinkPreviewSummary, normalizeMessageText, normalizeToolInlineKeyboardRows, } from "./richContent.js";
15
15
  import { normalizeCallbackQueryPayload } from "./callbackQuery.js";
@@ -19,6 +19,7 @@ import { getOzaiyaRuntime } from "./runtime.js";
19
19
  import { maybeTranscribeInboundAudio, prependVoiceTranscriptToAgentInput, resolveOzaiyaSttConfig, } from "./transcribeAudio.js";
20
20
  import { startGatewayMode } from "./gateway.js";
21
21
  import { VoiceCallSession } from "./voiceCall.js";
22
+ import { PhoneCallSession } from "./phoneCall.js";
22
23
  const DEFAULT_API_BASE_URL = "https://api.ozai.dev";
23
24
  const DEFAULT_WEBHOOK_PATH = "/ozaiya/webhook";
24
25
  const DEFAULT_ACCOUNT_ID = "default";
@@ -34,6 +35,10 @@ const unwrappedKeys = new Map();
34
35
  const groupToBotAccountId = new Map();
35
36
  // Active voice call sessions keyed by callId
36
37
  const activeVoiceCalls = new Map();
38
+ // Active phone call sessions keyed by phoneCallId
39
+ const activePhoneCalls = new Map();
40
+ // Map groupId → phoneCallId for active phone calls (for manual mode message routing)
41
+ const groupToActivePhoneCall = new Map();
37
42
  // Runtime state tracking
38
43
  const runtimeState = new Map();
39
44
  function recordState(accountId, patch) {
@@ -348,13 +353,229 @@ async function synthesizeSpeechToMp3(text, deepgramApiKey, ttsModel, log) {
348
353
  }
349
354
  }
350
355
  /**
351
- * Synthesize voice reply using OpenClaw runtime TTS (supports Volcengine/Edge/OpenAI etc.),
352
- * falling back to Deepgram if runtime TTS is unavailable.
356
+ * Synthesize text to speech via Volcengine (火山引擎/豆包) TTS v3 streaming API.
357
+ * Uses X-Api-Key auth with the unidirectional streaming endpoint.
358
+ * Returns audio buffer or null on failure.
359
+ */
360
async function synthesizeSpeechVolcengineV3(text, apiKey, voice, encoding, speedRatio, resourceId, log) {
    // Arguments: `text` is the string to speak; `apiKey` is sent as X-Api-Key and `resourceId`
    // as X-Api-Resource-Id; `voice`, `encoding` and `speedRatio` are forwarded as the speaker,
    // audio format and speed_ratio; `log` is an optional logger exposing info/warn.
    // Resolves to a Buffer holding the concatenated audio, or null on any failure. Transport
    // problems (network error, the 30 s timeout, unreadable body) are logged and reported as
    // null rather than rejecting, so the "null on failure" contract holds for every caller.
    const headers = {
        "Content-Type": "application/json",
        "X-Api-Key": apiKey,
        "X-Api-Resource-Id": resourceId,
        // Per-request connection id; only needs to be unique enough for tracing.
        "X-Api-Connect-Id": `tts-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
    };
    const body = {
        user: { uid: "ozaiya-bot" },
        req_params: { text, speaker: voice, speed_ratio: speedRatio },
        audio_params: { format: encoding, sample_rate: 24000 },
    };
    log?.info?.(`ozaiya: Volcengine TTS v3 request: speaker=${voice} encoding=${encoding} resourceId=${resourceId}`);
    let responseText;
    try {
        const res = await fetch("https://openspeech.bytedance.com/api/v3/tts/unidirectional", {
            method: "POST",
            headers,
            body: JSON.stringify(body),
            signal: AbortSignal.timeout(30_000),
        });
        if (!res.ok) {
            const errText = await res.text().catch(() => "");
            log?.warn?.(`ozaiya: Volcengine TTS v3 HTTP error: ${res.status} ${errText}`);
            return null;
        }
        // The whole streamed body is buffered, then parsed line by line below.
        responseText = await res.text();
    }
    catch (err) {
        log?.warn?.(`ozaiya: Volcengine TTS v3 request failed: ${String(err)}`);
        return null;
    }
    // Line-delimited JSON: each line may carry { data: "<base64>" } and/or a status code.
    const audioChunks = [];
    for (const line of responseText.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) {
            continue;
        }
        try {
            const chunk = JSON.parse(trimmed);
            if (chunk.data) {
                audioChunks.push(Buffer.from(chunk.data, "base64"));
            }
            // Codes other than 20000000 and 0 are treated as errors and logged;
            // any audio already collected is still returned.
            if (chunk.code && chunk.code !== 20000000 && chunk.code !== 0) {
                log?.warn?.(`ozaiya: Volcengine TTS v3 chunk error: code=${chunk.code} message=${chunk.message}`);
            }
        }
        catch {
            // skip non-JSON lines
        }
    }
    if (audioChunks.length === 0) {
        log?.warn?.(`ozaiya: Volcengine TTS v3: no audio data in response (${responseText.length} bytes raw)`);
        return null;
    }
    log?.info?.(`ozaiya: Volcengine TTS v3: collected ${audioChunks.length} chunks`);
    return Buffer.concat(audioChunks);
}
412
/**
 * Synthesize text to speech via the Volcengine TTS v1 (non-streaming) API.
 * Authenticates with the legacy `Bearer;<token>` Authorization header.
 *
 * @param text Text to speak.
 * @param config Volcengine settings; reads `appId`, `accessToken` and `cluster`
 *   (cluster defaults to "volcano_tts").
 * @param voice Sent as `audio.voice_type`.
 * @param encoding Sent as `audio.encoding`.
 * @param speedRatio Sent as `audio.speed_ratio`.
 * @param log Optional logger exposing `warn`.
 * @returns Buffer with the decoded audio, or null on any failure. Transport errors,
 *   timeouts and non-JSON bodies are logged and reported as null instead of rejecting.
 */
async function synthesizeSpeechVolcengineV1(text, config, voice, encoding, speedRatio, log) {
    const cluster = config.cluster ?? "volcano_tts";
    // Resolve the token once so the header and the payload can never disagree
    // (previously the header rendered "Bearer;undefined" when no token was set).
    const accessToken = config.accessToken ?? "";
    const reqId = `tts-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    const payload = {
        app: {
            appid: config.appId ?? "",
            token: accessToken,
            cluster,
        },
        user: { uid: "ozaiya-bot" },
        audio: {
            voice_type: voice,
            encoding,
            speed_ratio: speedRatio,
        },
        request: {
            reqid: reqId,
            text,
            operation: "query",
        },
    };
    const headers = {
        "Content-Type": "application/json",
        Authorization: `Bearer;${accessToken}`,
    };
    let result;
    try {
        const res = await fetch("https://openspeech.bytedance.com/api/v1/tts", {
            method: "POST",
            headers,
            body: JSON.stringify(payload),
            signal: AbortSignal.timeout(30_000),
        });
        if (!res.ok) {
            // Include the response body, as the v3 and gateway variants do.
            const errText = await res.text().catch(() => "");
            log?.warn?.(`ozaiya: Volcengine TTS v1 HTTP error: ${res.status} ${res.statusText} ${errText}`.trimEnd());
            return null;
        }
        result = await res.json();
    }
    catch (err) {
        log?.warn?.(`ozaiya: Volcengine TTS v1 request failed: ${String(err)}`);
        return null;
    }
    // This code treats 3000 with a base64 `data` field as the only success shape.
    if (result?.code !== 3000 || !result.data) {
        log?.warn?.(`ozaiya: Volcengine TTS v1 error: code=${result?.code} message=${result?.message}`);
        return null;
    }
    return Buffer.from(result.data, "base64");
}
459
/**
 * Synthesize text to speech via the Volcengine AI Gateway (OpenAI-compatible
 * `/v1/audio/speech` endpoint), authenticating with the Ark API key as a Bearer token.
 *
 * @param text Sent as `input`.
 * @param arkApiKey Ark API key used in the Authorization header.
 * @param voice Sent as `voice`.
 * @param encoding Sent as `response_format`.
 * @param speedRatio Sent as `speed`.
 * @param log Optional logger exposing `info`/`warn`.
 * @returns Buffer with the raw audio bytes, or null on any failure. Transport errors and
 *   timeouts are logged and reported as null instead of rejecting, so callers that chain
 *   fallbacks on a null result keep working when the gateway is unreachable.
 */
async function synthesizeSpeechVolcengineGateway(text, arkApiKey, voice, encoding, speedRatio, log) {
    log?.info?.(`ozaiya: Volcengine AI Gateway TTS request: voice=${voice}`);
    let arrayBuffer;
    try {
        const res = await fetch("https://ai-gateway.vei.volces.com/v1/audio/speech", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${arkApiKey}`,
            },
            body: JSON.stringify({
                model: "doubao-tts",
                input: text,
                voice,
                response_format: encoding,
                speed: speedRatio,
            }),
            signal: AbortSignal.timeout(30_000),
        });
        if (!res.ok) {
            const errText = await res.text().catch(() => "");
            log?.warn?.(`ozaiya: Volcengine AI Gateway TTS HTTP error: ${res.status} ${errText}`);
            return null;
        }
        // Response is raw audio binary
        arrayBuffer = await res.arrayBuffer();
    }
    catch (err) {
        log?.warn?.(`ozaiya: Volcengine AI Gateway TTS request failed: ${String(err)}`);
        return null;
    }
    if (arrayBuffer.byteLength === 0) {
        log?.warn?.(`ozaiya: Volcengine AI Gateway TTS: empty response`);
        return null;
    }
    log?.info?.(`ozaiya: Volcengine AI Gateway TTS: received ${arrayBuffer.byteLength} bytes`);
    return Buffer.from(arrayBuffer);
}
495
/**
 * Synthesize text to speech via Volcengine, choosing the transport from the credentials
 * present in `config`. Priority:
 *   1. `arkApiKey`   → AI Gateway OpenAI-compatible endpoint
 *   2. `apiKey`      → v3 streaming openspeech endpoint
 *   3. `accessToken` → v1 non-streaming openspeech endpoint (legacy)
 * A gateway failure — whether it returns null or throws — falls through to options 2 and 3.
 *
 * @param text Text to speak.
 * @param config Volcengine settings; reads `voice`, `encoding`, `speedRatio`, `arkApiKey`,
 *   `apiKey`, `resourceId` and `accessToken` (v1 additionally reads `appId`/`cluster`).
 * @param log Optional logger exposing `warn`.
 * @returns Audio buffer, or null on failure or when no credential is configured. Never rejects.
 */
async function synthesizeSpeechVolcengine(text, config, log) {
    try {
        const voice = config.voice ?? "zh_female_wanwanxiaohe_moon_bigtts";
        const encoding = config.encoding ?? "mp3";
        const speedRatio = config.speedRatio ?? 1.0;
        if (config.arkApiKey) {
            let gatewayAudio = null;
            try {
                gatewayAudio = await synthesizeSpeechVolcengineGateway(text, config.arkApiKey, voice, encoding, speedRatio, log);
            }
            catch (err) {
                // A thrown gateway error must not abort the whole chain: without this guard
                // the outer catch would return null before v3/v1 were ever attempted.
                log?.warn?.(`ozaiya: Ark Gateway TTS threw: ${String(err)}`);
            }
            if (gatewayAudio) {
                return gatewayAudio;
            }
            log?.warn?.(`ozaiya: Ark Gateway TTS failed, trying fallback`);
        }
        if (config.apiKey) {
            const resourceId = config.resourceId ?? "volc.service_type.10029";
            return await synthesizeSpeechVolcengineV3(text, config.apiKey, voice, encoding, speedRatio, resourceId, log);
        }
        if (config.accessToken) {
            return await synthesizeSpeechVolcengineV1(text, config, voice, encoding, speedRatio, log);
        }
        // When arkApiKey was set, the gateway failure above has already been logged.
        if (!config.arkApiKey) {
            log?.warn?.(`ozaiya: Volcengine TTS: no arkApiKey, apiKey, or accessToken configured`);
        }
        return null;
    }
    catch (err) {
        log?.warn?.(`ozaiya: Volcengine TTS failed: ${String(err)}`);
        return null;
    }
}
531
+ /**
532
+ * Synthesize voice reply. Priority:
533
+ * 1. Volcengine TTS (if configured — supports Chinese)
534
+ * 2. OpenClaw runtime TTS (if available — uses configured provider)
535
+ * 3. Deepgram TTS (fallback — English only)
353
536
  */
354
537
  async function synthesizeVoiceReply(text,
355
538
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
356
- ctx) {
357
- // Try OpenClaw runtime TTS first (uses configured provider: volcano, edge, openai, etc.)
539
+ ctx, voiceOverride) {
540
+ const ozaiyaChannelCfg = (ctx.cfg?.channels?.ozaiya ?? {});
541
+ const account = ctx.account;
542
+ const perBotVoice = account?.voiceConfig;
543
+ // 0. Per-bot voice config from gateway (highest priority)
544
+ if (perBotVoice?.provider === 'volcengine' && perBotVoice.appId && perBotVoice.accessToken) {
545
+ ctx.log?.info?.(`ozaiya: using per-bot Volcengine TTS`);
546
+ const volcCfg = {
547
+ appId: perBotVoice.appId,
548
+ accessToken: perBotVoice.accessToken,
549
+ ...(voiceOverride ? { voice: voiceOverride } : {}),
550
+ };
551
+ const buf = await synthesizeSpeechVolcengine(text, volcCfg, ctx.log);
552
+ if (buf) {
553
+ return { data: buf, ext: `.${volcCfg.encoding ?? "mp3"}` };
554
+ }
555
+ ctx.log?.warn?.(`ozaiya: per-bot Volcengine TTS failed, trying fallbacks`);
556
+ }
557
+ if (perBotVoice?.provider === 'deepgram' && perBotVoice.apiKey) {
558
+ ctx.log?.info?.(`ozaiya: using per-bot Deepgram TTS`);
559
+ const ttsModel = ozaiyaChannelCfg?.voiceCall?.tts?.model ?? "aura-asteria-en";
560
+ const mp3Buffer = await synthesizeSpeechToMp3(text, perBotVoice.apiKey, ttsModel, ctx.log);
561
+ if (mp3Buffer)
562
+ return { data: mp3Buffer, ext: ".mp3" };
563
+ ctx.log?.warn?.(`ozaiya: per-bot Deepgram TTS failed, trying fallbacks`);
564
+ }
565
+ // 1. Volcengine TTS (preferred for Chinese)
566
+ if (ozaiyaChannelCfg?.volcengineTts?.arkApiKey || ozaiyaChannelCfg?.volcengineTts?.apiKey || ozaiyaChannelCfg?.volcengineTts?.appId) {
567
+ const effectiveVolcCfg = voiceOverride
568
+ ? { ...ozaiyaChannelCfg.volcengineTts, voice: voiceOverride }
569
+ : ozaiyaChannelCfg.volcengineTts;
570
+ ctx.log?.info?.(`ozaiya: using Volcengine TTS (voice=${effectiveVolcCfg.voice ?? 'default'})`);
571
+ const buf = await synthesizeSpeechVolcengine(text, effectiveVolcCfg, ctx.log);
572
+ if (buf) {
573
+ const ext = `.${ozaiyaChannelCfg.volcengineTts.encoding ?? "mp3"}`;
574
+ return { data: buf, ext };
575
+ }
576
+ ctx.log?.warn?.(`ozaiya: Volcengine TTS failed, trying fallbacks`);
577
+ }
578
+ // 2. OpenClaw runtime TTS (edge, openai, elevenlabs etc.)
358
579
  try {
359
580
  const runtime = getOzaiyaRuntime();
360
581
  if (runtime.tts?.textToSpeech) {
@@ -376,8 +597,7 @@ ctx) {
376
597
  catch (err) {
377
598
  ctx.log?.warn?.(`ozaiya: runtime TTS unavailable: ${String(err)}`);
378
599
  }
379
- // Fallback: Deepgram direct API
380
- const ozaiyaChannelCfg = (ctx.cfg?.channels?.ozaiya ?? {});
600
+ // 3. Deepgram direct API (English fallback)
381
601
  const ttsApiKey = ozaiyaChannelCfg?.voiceCall?.deepgramApiKey || process.env.DEEPGRAM_API_KEY || "";
382
602
  const ttsModel = ozaiyaChannelCfg?.voiceCall?.tts?.model ?? "aura-asteria-en";
383
603
  if (!ttsApiKey)
@@ -718,6 +938,9 @@ export const ozaiyaPlugin = {
718
938
  createPinMessageTool(account),
719
939
  createSearchUsersTool(account),
720
940
  createListGroupsTool(account),
941
+ createMakePhoneCallTool(account, cfg),
942
+ createHangUpCallTool(account),
943
+ createStartInAppCallTool(account, cfg),
721
944
  ];
722
945
  }),
723
946
  gateway: {
@@ -730,7 +953,7 @@ export const ozaiyaPlugin = {
730
953
  ctx.log?.info(`[gateway] starting gateway mode`);
731
954
  recordState(account.accountId, { running: true, lastStartAt: Date.now() });
732
955
  const ozaiya = resolveConfig(ctx.cfg);
733
- const stateDir = ctx.runtime?.state?.resolveStateDir?.() ?? process.env.HOME ?? ".";
956
+ const stateDir = process.env.OPENCLAW_STATE_DIR ?? ctx.runtime?.state?.resolveStateDir?.() ?? process.env.HOME ?? ".";
734
957
  // Track per-bot unregister functions for hot-reload
735
958
  const botUnregisters = new Map();
736
959
  const startBotHandler = (botAccount) => {
@@ -858,6 +1081,12 @@ export const ozaiyaPlugin = {
858
1081
  session.disconnect().catch(() => { });
859
1082
  activeVoiceCalls.delete(callId);
860
1083
  }
1084
+ // Disconnect all active phone call sessions
1085
+ for (const [phoneCallId, session] of activePhoneCalls) {
1086
+ session.disconnect().catch(() => { });
1087
+ activePhoneCalls.delete(phoneCallId);
1088
+ }
1089
+ groupToActivePhoneCall.clear();
861
1090
  for (const id of gatewayBotAccounts.keys()) {
862
1091
  recordState(id, { running: false, lastStopAt: Date.now() });
863
1092
  }
@@ -1541,6 +1770,323 @@ function createListGroupsTool(account) {
1541
1770
  },
1542
1771
  };
1543
1772
  }
1773
/**
 * Build the `start_in_app_call` agent tool for one bot account.
 *
 * The tool asks the API to start a call in a group/DM. When the API reports the
 * "standard" voice engine and voice calls are not disabled in the channel config, it
 * also creates a VoiceCallSession, registers it in `activeVoiceCalls` and connects it.
 *
 * @param account Bot account; reads `apiBaseUrl`, `botToken`, `accountId`, `voiceConfig`.
 * @param cfg Full configuration; reads `channels.ozaiya` (`voiceCall`, `volcengineTts`).
 * @returns Tool descriptor whose `execute` always resolves to a text content result
 *   (errors are reported as text, never thrown).
 */
function createStartInAppCallTool(account, cfg) {
    return {
        label: "Start In-App Call",
        name: "start_in_app_call",
        ownerOnly: false,
        description: "Start a voice or video call in an Ozaiya group/DM. " +
            "All group members will receive a call notification. " +
            "The bot joins the call automatically and can speak via STT/TTS (standard engine).",
        parameters: {
            type: "object",
            properties: {
                groupId: {
                    type: "string",
                    description: "The group/DM to start the call in.",
                },
                type: {
                    type: "string",
                    enum: ["voice", "video"],
                    description: "Call type. Default: voice.",
                },
            },
            required: ["groupId"],
        },
        execute: async (_toolCallId, rawArgs) => {
            const args = rawArgs;
            try {
                const result = await startCall(account.apiBaseUrl, account.botToken, args.groupId, args.type ?? "voice");
                if (!result) {
                    return { content: [{ type: "text", text: "Failed to start call. The bot may not have permission or is not a member of the group." }] };
                }
                if (result.joined) {
                    return {
                        content: [{
                                type: "text",
                                text: `Joined existing active call (ID: ${result.callId}).`,
                            }],
                    };
                }
                // For standard engine: create VoiceCallSession so bot can speak via STT/TTS
                if (result.voiceEngine === "standard") {
                    const ozaiyaCfg = (cfg?.channels?.ozaiya ?? {});
                    const voiceCallCfg = ozaiyaCfg.voiceCall ?? {};
                    if (voiceCallCfg.enabled !== false) {
                        const runtime = getOzaiyaRuntime();
                        const ch = runtime.channel;
                        const route = ch.routing.resolveAgentRoute({
                            cfg,
                            channel: "ozaiya",
                            accountId: account.accountId,
                            peer: { kind: "group", id: args.groupId },
                        });
                        // Copy before overriding so the shared channel config is not mutated.
                        const effectiveVoiceCallCfg = { ...voiceCallCfg };
                        if (account.voiceConfig?.provider === "deepgram" && account.voiceConfig.apiKey) {
                            effectiveVoiceCallCfg.deepgramApiKey = account.voiceConfig.apiKey;
                        }
                        if (!effectiveVoiceCallCfg.volcengineTts && ozaiyaCfg.volcengineTts) {
                            effectiveVoiceCallCfg.volcengineTts = ozaiyaCfg.volcengineTts;
                        }
                        const session = new VoiceCallSession({
                            callId: result.callId,
                            groupId: args.groupId,
                            livekitToken: result.token,
                            livekitUrl: result.url,
                            voiceCallConfig: effectiveVoiceCallCfg,
                            onTranscript: (text) => {
                                void handleVoiceTranscript(text, session, route, account, { cfg, log: undefined });
                            },
                        });
                        activeVoiceCalls.set(result.callId, session);
                        try {
                            await session.connect();
                        }
                        catch (err) {
                            activeVoiceCalls.delete(result.callId);
                            // Teardown is best-effort: a failing disconnect must neither hide the
                            // original connect error nor prevent the bot from leaving the call.
                            await session.disconnect().catch(() => { });
                            await leaveCall(account.apiBaseUrl, account.botToken, result.callId).catch(() => { });
                            throw err;
                        }
                    }
                }
                // For doubao engine: Volcengine AI agent handles voice — no LiveKit connection needed from bot
                return {
                    content: [{
                            type: "text",
                            text: `Call started (ID: ${result.callId}, engine: ${result.voiceEngine}). ` +
                                `All group members have been notified.`,
                        }],
                };
            }
            catch (err) {
                const msg = err instanceof Error ? err.message : String(err);
                return { content: [{ type: "text", text: `Error starting call: ${msg}` }] };
            }
        },
    };
}
1869
/**
 * Build the `make_phone_call` agent tool for one bot account.
 *
 * The tool asks the API to start an outbound phone call, creates a PhoneCallSession for
 * it, registers the session in `activePhoneCalls` / `groupToActivePhoneCall` and connects
 * it. If connecting fails, the registration is rolled back, the session is disconnected
 * and the call is ended via the API, so no stale entry keeps intercepting chat messages.
 *
 * @param account Bot account; reads `apiBaseUrl`, `botToken`, `voiceConfig`.
 * @param cfg Full configuration; reads `channels.ozaiya` (`voiceCall`, `volcengineTts`).
 * @returns Tool descriptor whose `execute` always resolves to a text content result
 *   (errors are reported as text, never thrown).
 */
function createMakePhoneCallTool(account, cfg) {
    // Background work started from session callbacks has no awaiting caller; log failures
    // instead of letting them surface as unhandled promise rejections.
    const reportBackgroundError = (what) => (err) => {
        // eslint-disable-next-line no-console
        console.warn(`ozaiya: phone call ${what} failed: ${String(err)}`);
    };
    return {
        label: "Make Phone Call",
        name: "make_phone_call",
        ownerOnly: false,
        description: "Initiate an outbound phone call to a PSTN number via SIP. " +
            "The call is recorded and the recording + transcript are saved to the chat. " +
            "In 'auto' mode, you speak directly to the callee via STT/TTS. " +
            "In 'manual' mode, the callee's speech is transcribed to chat for the user to respond.",
        parameters: {
            type: "object",
            properties: {
                groupId: {
                    type: "string",
                    description: "The group/DM where the call record will be posted.",
                },
                phoneNumber: {
                    type: "string",
                    description: "E.164 phone number to call (e.g. +8613800138000).",
                },
                mode: {
                    type: "string",
                    enum: ["auto", "manual"],
                    description: "auto = AI handles conversation, manual = transcribe to chat for user to reply. Default: auto.",
                },
                purpose: {
                    type: "string",
                    description: "Brief note about why you're making this call (logged for reference).",
                },
            },
            required: ["groupId", "phoneNumber"],
        },
        execute: async (_toolCallId, rawArgs) => {
            const args = rawArgs;
            try {
                const mode = args.mode ?? "auto";
                const result = await startPhoneCall(account.apiBaseUrl, account.botToken, args.groupId, args.phoneNumber, mode, args.purpose);
                if (!result?.phoneCallId) {
                    return { content: [{ type: "text", text: "Failed to start phone call: the API returned no call details." }] };
                }
                const { phoneCallId } = result;
                // Report connected status (fire-and-forget).
                // NOTE(review): this is sent before the LiveKit connection below is established —
                // confirm that is the intended meaning of "connected".
                updatePhoneCallStatus(account.apiBaseUrl, account.botToken, phoneCallId, "connected").catch(() => { });
                const ozaiyaCfg = (cfg?.channels?.ozaiya ?? {});
                const voiceCallCfg = ozaiyaCfg.voiceCall ?? {};
                // Override per-bot voice config and pass volcengineTts from channel config
                const effectiveVoiceCallCfg = { ...voiceCallCfg };
                if (account.voiceConfig?.provider === "deepgram" && account.voiceConfig.apiKey) {
                    effectiveVoiceCallCfg.deepgramApiKey = account.voiceConfig.apiKey;
                }
                if (!effectiveVoiceCallCfg.volcengineTts && ozaiyaCfg.volcengineTts) {
                    effectiveVoiceCallCfg.volcengineTts = ozaiyaCfg.volcengineTts;
                }
                // Create PhoneCallSession
                const session = new PhoneCallSession({
                    phoneCallId,
                    groupId: args.groupId,
                    livekitToken: result.livekitToken,
                    livekitUrl: result.livekitUrl,
                    mode,
                    voiceCallConfig: effectiveVoiceCallCfg,
                    onTranscript: (text) => {
                        if (mode === "auto") {
                            // Auto mode: dispatch to agent, speak reply
                            handlePhoneAutoTranscript(text, session, account, cfg, args.groupId)
                                .catch(reportBackgroundError("auto-reply"));
                        }
                        else {
                            // Manual mode: post to chat as a message
                            postPhoneTranscriptToChat(text, account, args.groupId)
                                .catch(reportBackgroundError("transcript post"));
                        }
                    },
                    onPhoneHangUp: () => {
                        // Phone person hung up — end the call
                        cleanupPhoneCall(phoneCallId, session, account)
                            .catch(reportBackgroundError("cleanup"));
                    },
                });
                activePhoneCalls.set(phoneCallId, session);
                groupToActivePhoneCall.set(args.groupId, phoneCallId);
                // Connect to LiveKit
                try {
                    await session.connect();
                }
                catch (err) {
                    // Roll back everything registered above; teardown itself is best-effort.
                    activePhoneCalls.delete(phoneCallId);
                    if (groupToActivePhoneCall.get(args.groupId) === phoneCallId) {
                        groupToActivePhoneCall.delete(args.groupId);
                    }
                    await session.disconnect().catch(() => { });
                    await endPhoneCall(account.apiBaseUrl, account.botToken, phoneCallId).catch(() => { });
                    throw err;
                }
                return {
                    content: [{
                            type: "text",
                            text: `Phone call initiated (ID: ${phoneCallId}). ` +
                                `Calling ${args.phoneNumber} in ${mode} mode. ` +
                                `The call is being recorded. Use hang_up_call to end it.`,
                        }],
                };
            }
            catch (err) {
                const msg = err instanceof Error ? err.message : String(err);
                return { content: [{ type: "text", text: `Error starting phone call: ${msg}` }] };
            }
        },
    };
}
1961
/**
 * Build the `hang_up_call` agent tool for one bot account.
 *
 * When a session for the given id is registered in `activePhoneCalls`, the tool runs the
 * local cleanup routine; otherwise it only asks the API to end the call.
 *
 * @param account Bot account; reads `apiBaseUrl` and `botToken`.
 * @returns Tool descriptor whose `execute` resolves to a text content result.
 */
function createHangUpCallTool(account) {
    // Wrap a message in the tool result envelope.
    const asText = (text) => ({ content: [{ type: "text", text }] });
    const parameters = {
        type: "object",
        properties: {
            phoneCallId: {
                type: "string",
                description: "The phone call ID returned by make_phone_call.",
            },
        },
        required: ["phoneCallId"],
    };
    const execute = async (_toolCallId, rawArgs) => {
        const { phoneCallId } = rawArgs;
        try {
            const liveSession = activePhoneCalls.get(phoneCallId);
            // No local session, still try API
            await (liveSession
                ? cleanupPhoneCall(phoneCallId, liveSession, account)
                : endPhoneCall(account.apiBaseUrl, account.botToken, phoneCallId));
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            return asText(`Error ending phone call: ${reason}`);
        }
        return asText(`Phone call ${phoneCallId} ended.`);
    };
    return {
        label: "Hang Up Call",
        name: "hang_up_call",
        ownerOnly: false,
        description: "End an active outbound phone call.",
        parameters,
        execute,
    };
}
1999
/**
 * Handle one phone transcription in auto mode: dispatch it to the agent and speak
 * each non-empty reply back into the call.
 *
 * Callers start this without awaiting it, so it never rejects: any failure while
 * resolving the route, building the context or dispatching is logged and swallowed.
 *
 * @param text Transcribed speech from the phone side.
 * @param session Phone call session; reads `isDisposed`, calls `speakReply(text)`.
 * @param account Bot account; reads `accountId`.
 * @param cfg Full configuration; reads `channels.ozaiya.voiceCall.agentPrompt`.
 * @param groupId Group/DM the call belongs to; used for routing and addressing.
 * @returns Promise that resolves once dispatch has finished or failed.
 */
async function handlePhoneAutoTranscript(text, session, account, cfg, groupId) {
    if (session.isDisposed)
        return;
    try {
        const runtime = getOzaiyaRuntime();
        const ch = runtime.channel;
        const route = ch.routing.resolveAgentRoute({
            cfg,
            channel: "ozaiya",
            accountId: account.accountId,
            peer: { kind: "group", id: groupId },
        });
        const ozaiyaCfg = (cfg?.channels?.ozaiya ?? {});
        const voicePrompt = ozaiyaCfg.voiceCall?.agentPrompt ??
            "[Phone Call] You are in a live phone call. Your response will be spoken aloud via TTS. " +
                "Rules: respond concisely (1-3 sentences), use natural spoken language, " +
                "never use markdown/code blocks/bullet lists/URLs/emojis. " +
                "Do not say \"sure\" or \"of course\" — just answer directly.";
        // The agent sees the speaking-style prompt plus the transcript; `Body` carries a
        // header line with the sender and timestamp followed by the raw transcript.
        const bodyForAgent = `${voicePrompt}\n\n${text}`;
        const body = `[ozaiya] from: phone caller | at: ${new Date().toISOString()}\n---\n${text}`;
        const msgCtx = ch.reply.finalizeInboundContext({
            Body: body,
            BodyForAgent: bodyForAgent,
            RawBody: text,
            CommandBody: text,
            From: `ozaiya:group:${groupId}`,
            To: `ozaiya:group:${groupId}`,
            SessionKey: route.sessionKey,
            AccountId: route.accountId,
            ChatType: "group",
            ConversationLabel: `group:${groupId}`,
            GroupSubject: groupId,
            SenderId: "phone-caller",
            SenderName: "Phone Caller",
            Provider: "ozaiya",
            Surface: "ozaiya-phone",
            MessageSid: `phone-${Date.now()}`,
            Timestamp: Date.now(),
            NumFiles: 0,
            NumMedia: 0,
            HasFiles: false,
            CommandAuthorized: true,
            OriginatingChannel: "ozaiya",
            OriginatingTo: `ozaiya:group:${groupId}`,
        });
        await ch.reply.dispatchReplyWithBufferedBlockDispatcher({
            ctx: msgCtx,
            cfg,
            dispatcherOptions: {
                deliver: async (replyPayload, _info) => {
                    const replyText = replyPayload.text;
                    // The call may have ended while the agent was still generating.
                    if (!replyText?.trim() || session.isDisposed)
                        return;
                    await session.speakReply(replyText);
                },
                onError: (err) => {
                    // eslint-disable-next-line no-console
                    console.warn(`ozaiya: phone call auto-reply error: ${String(err)}`);
                },
            },
        });
    }
    catch (err) {
        // eslint-disable-next-line no-console
        console.warn(`ozaiya: phone call auto-reply failed: ${String(err)}`);
    }
}
2063
/**
 * Post the phone side's transcribed speech to the chat group (manual mode).
 *
 * Best-effort: never rejects. The message is skipped when no unwrapped group key is
 * cached for `groupId`; that case and any encryption/send failure are logged so that
 * dropped transcripts are diagnosable.
 *
 * @param text Transcribed speech to post.
 * @param account Bot account; reads `apiBaseUrl` and `botToken`.
 * @param groupId Target group/DM.
 */
async function postPhoneTranscriptToChat(text, account, groupId) {
    try {
        const groupKey = unwrappedKeys.get(groupId);
        if (!groupKey) {
            // eslint-disable-next-line no-console
            console.warn(`ozaiya: phone transcript not posted: no group key for group ${groupId}`);
            return;
        }
        const content = {
            text: `📞 对方说: ${text}`,
        };
        const encrypted = encryptMessage(content, groupKey);
        await sendMessage(account.apiBaseUrl, account.botToken, groupId, encrypted);
    }
    catch (err) {
        // Fire-and-forget: report, but never propagate.
        // eslint-disable-next-line no-console
        console.warn(`ozaiya: failed to post phone transcript to group ${groupId}: ${String(err)}`);
    }
}
2081
/**
 * Clean up a phone call session: remove it from the tracking maps, disconnect it and
 * end the call via the API (passing the session transcript).
 *
 * Each step is best-effort and independent: a failing disconnect no longer prevents the
 * call from being ended remotely, and the function never rejects (some callers start it
 * without awaiting).
 *
 * @param phoneCallId Id of the call to tear down.
 * @param session Session object; reads `groupId` and `transcript`, calls `disconnect()`.
 * @param account Bot account; reads `apiBaseUrl` and `botToken`.
 */
async function cleanupPhoneCall(phoneCallId, session, account) {
    activePhoneCalls.delete(phoneCallId);
    // Only drop the group routing entry if it still points at this call; a newer call
    // in the same group may already have replaced it.
    if (groupToActivePhoneCall.get(session.groupId) === phoneCallId) {
        groupToActivePhoneCall.delete(session.groupId);
    }
    try {
        await session.disconnect();
    }
    catch (err) {
        // eslint-disable-next-line no-console
        console.warn(`ozaiya: phone call ${phoneCallId} disconnect failed: ${String(err)}`);
    }
    await endPhoneCall(account.apiBaseUrl, account.botToken, phoneCallId, session.transcript).catch(() => { });
}
1544
2090
  /**
1545
2091
  * Handle an inbound webhook message:
1546
2092
  * 1. Decrypt message content
@@ -1552,7 +2098,7 @@ function createListGroupsTool(account) {
1552
2098
  async function handleInboundMessage(payload,
1553
2099
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
1554
2100
  ctx) {
1555
- const { groupId, groupType = "group", message, replyAllowed = true, voiceReply, voiceReplyPrompt, context } = payload;
2101
+ const { groupId, groupType = "group", message, replyAllowed = true, voiceReply, voiceReplyPrompt, voiceReplyVoice, context } = payload;
1556
2102
  const account = ctx.account;
1557
2103
  // Record inbound activity
1558
2104
  recordState(account.accountId, { lastInboundAt: Date.now() });
@@ -1579,6 +2125,20 @@ ctx) {
1579
2125
  return;
1580
2126
  }
1581
2127
  const messageText = normalizeMessageText(content.text);
2128
+ // Manual mode phone call: if this group has an active phone call in manual mode,
2129
+ // speak the user's text message to the phone via TTS instead of dispatching to agent.
2130
+ const activePhoneCallId = groupToActivePhoneCall.get(groupId);
2131
+ if (activePhoneCallId && messageText) {
2132
+ const phoneSession = activePhoneCalls.get(activePhoneCallId);
2133
+ if (phoneSession && phoneSession.mode === "manual" && !phoneSession.isDisposed) {
2134
+ // Don't intercept bot's own messages (transcriptions posted by the bot)
2135
+ if (message.senderId !== account.accountId) {
2136
+ ctx.log?.info?.(`ozaiya: routing user message to active phone call ${activePhoneCallId} (manual mode)`);
2137
+ phoneSession.speakReply(messageText).catch(() => { });
2138
+ return; // Don't dispatch to agent
2139
+ }
2140
+ }
2141
+ }
1582
2142
  const inboundAttachments = normalizeAttachments(content.files);
1583
2143
  const attachmentSummary = buildAttachmentSummary(inboundAttachments);
1584
2144
  const linkPreviewSummary = buildLinkPreviewSummary(content.linkPreviews);
@@ -1756,13 +2316,13 @@ ctx) {
1756
2316
  dispatcherOptions: {
1757
2317
  deliver: async (replyPayload, _info) => {
1758
2318
  const replyText = replyPayload.text;
1759
- ctx.log?.info?.(`ozaiya: deliver called, text length=${replyText?.length ?? 0}, empty=${!replyText?.trim()}, voiceReply=${voiceReply}`);
2319
+ ctx.log?.info?.(`ozaiya: deliver called, text length=${replyText?.length ?? 0}, empty=${!replyText?.trim()}, voiceReply=${voiceReply}, voiceReplyVoice=${voiceReplyVoice ?? 'none'}`);
1760
2320
  if (!replyText?.trim())
1761
2321
  return;
1762
2322
  // Voice reply: synthesize TTS audio and send as voice message
1763
2323
  if (voiceReply) {
1764
2324
  ctx.log?.info?.(`ozaiya: voice reply — synthesizing TTS for group ${groupId}`);
1765
- const audioBuffer = await synthesizeVoiceReply(replyText, ctx);
2325
+ const audioBuffer = await synthesizeVoiceReply(replyText, ctx, voiceReplyVoice ?? undefined);
1766
2326
  if (audioBuffer) {
1767
2327
  const ext = audioBuffer.ext;
1768
2328
  const mime = ext === ".mp3" ? "audio/mpeg" : ext === ".opus" ? "audio/ogg" : ext === ".wav" ? "audio/wav" : "audio/mpeg";
@@ -1770,7 +2330,7 @@ ctx) {
1770
2330
  await sendEncryptedChatContent({
1771
2331
  account,
1772
2332
  groupId,
1773
- content: { files: [fileInfo] },
2333
+ content: { text: replyText, files: [fileInfo] },
1774
2334
  log: ctx.log,
1775
2335
  });
1776
2336
  ctx.log?.info?.(`ozaiya: voice reply sent successfully (${ext}, ${audioBuffer.data.length} bytes)`);
@@ -2005,12 +2565,20 @@ ctx) {
2005
2565
  id: payload.groupId,
2006
2566
  },
2007
2567
  });
2568
+ // Override per-bot voice config and pass volcengineTts from channel config
2569
+ const effectiveVoiceCallCfg = { ...voiceCallCfg };
2570
+ if (account.voiceConfig?.provider === 'deepgram' && account.voiceConfig.apiKey) {
2571
+ effectiveVoiceCallCfg.deepgramApiKey = account.voiceConfig.apiKey;
2572
+ }
2573
+ if (!effectiveVoiceCallCfg.volcengineTts && ozaiyaCfg.volcengineTts) {
2574
+ effectiveVoiceCallCfg.volcengineTts = ozaiyaCfg.volcengineTts;
2575
+ }
2008
2576
  const session = new VoiceCallSession({
2009
2577
  callId: payload.callId,
2010
2578
  groupId: payload.groupId,
2011
2579
  livekitToken: joinResult.token,
2012
2580
  livekitUrl: joinResult.url,
2013
- voiceCallConfig: voiceCallCfg,
2581
+ voiceCallConfig: effectiveVoiceCallCfg,
2014
2582
  onTranscript: (text) => {
2015
2583
  // Dispatch transcript to agent and speak the reply
2016
2584
  void handleVoiceTranscript(text, session, route, account, ctx);