@ozaiya/openclaw-channel 0.10.9 → 0.10.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@ import { registerPluginHttpRoute } from "openclaw/plugin-sdk/webhook-ingress";
11
11
  import { unwrapGroupKey, decryptMessage, encryptMessage, wrapGroupKey } from "./crypto.js";
12
12
  import { resolveImageGenerationConfig, generateImage } from "./imageGeneration.js";
13
13
  import { createDesktopTool } from "./desktopTool.js";
14
+ import { startRecording, stopAndExtractRecording } from "./desktopRecorder.js";
14
15
  import { sendMessage, probeApi, fetchGroups, addMember, getUserPublicKeys, toggleReaction, editMessage, deleteMessage, pinMessage, unpinMessage, uploadFile, searchUsers, fetchLinkPreview, joinCall, leaveCall, startCall, startPhoneCall, endPhoneCall, updatePhoneCallStatus, } from "./api.js";
15
16
  import { botCreateDirect, botCreateGroup } from "./botActions.js";
16
17
  import { buildInlineKeyboardSummary, buildLinkPreviewSummary, normalizeMessageText, normalizeToolInlineKeyboardRows, } from "./richContent.js";
@@ -26,6 +27,7 @@ import { summarizeWithYuanbao } from "./yuanbao.js";
26
27
  import { summarizeWithDoubao } from "./doubao.js";
27
28
  import { fetchXueqiuPost, searchXueqiuPosts } from "./xueqiu.js";
28
29
  import { fetchSocialMediaPost, searchSocialMedia, extractSocialMediaContent } from "./socialMedia.js";
30
+ import { requestConfirmation, parseActionCallback, resolveConfirmation } from "./actionConfirmation.js";
29
31
  const DEFAULT_API_BASE_URL = "https://api.ozai.dev";
30
32
  const DEFAULT_WEBHOOK_PATH = "/ozaiya/webhook";
31
33
  const DEFAULT_ACCOUNT_ID = "default";
@@ -39,8 +41,136 @@ const RICH_MESSAGE_GUIDANCE = "Prefer plain text for normal prose, code, markdow
39
41
  const unwrappedKeys = new Map();
40
42
  // Track which bot account handles each group (groupId → botAccountId) — used for gateway mode
41
43
  const groupToBotAccountId = new Map();
44
+ // Track the originating groupId for each bot account's current dispatch
45
+ // (accountId → groupId). Used by tools that don't receive groupId
46
+ // (message_user, create_group) to know which group to send confirmation cards to.
47
+ const accountToOriginGroupId = new Map();
42
48
  // Active voice call sessions keyed by callId
43
49
  const activeVoiceCalls = new Map();
50
+ // --- Task Progress Tracking ---
51
+ // Tools that are read-only and should not produce progress steps
52
+ const READ_ONLY_TOOLS = new Set([
53
+ "search_users",
54
+ "list_groups",
55
+ "fetch_xueqiu_post",
56
+ "search_xueqiu",
57
+ "scrape_social_media_post",
58
+ "summarize_url",
59
+ ]);
60
// Explicit tool → category overrides. Anything not listed falls through to the
// keyword heuristic in categorize() below.
const TOOL_CATEGORY = {
    desktop: "browser",
    search_xueqiu: "api",
    fetch_xueqiu_post: "api",
    scrape_social_media_post: "api",
    summarize_url: "api",
    search_users: "api",
    list_groups: "api",
    generate_image: "api",
    message_user: "other",
    create_group: "other",
};
/**
 * Classify a tool into a coarse category for the per-task usage card and the
 * "currently using X" banner icon.
 *
 * Explicit entries in TOOL_CATEGORY win. Otherwise the lower-cased tool name is
 * matched against keyword groups in a fixed order (browser, command, file,
 * api), so the agent's own tools (bash/browser/file ops) categorize sensibly.
 *
 * @param {string} toolName - Tool name as registered with the agent.
 * @returns {"browser"|"command"|"file"|"api"|"other"} Category; "other" when
 *   nothing matches or the name is missing / not a string.
 */
function categorize(toolName) {
    // Own-property check only: a bare TOOL_CATEGORY[toolName] would also hit
    // inherited keys such as "constructor" or "toString" and hand back a
    // function instead of a category string.
    if (Object.hasOwn(TOOL_CATEGORY, toolName))
        return TOOL_CATEGORY[toolName];
    // Tolerate a missing or non-string name rather than throwing inside the
    // progress-tracking path.
    const n = String(toolName ?? "").toLowerCase();
    if (/(browser|navigate|page|click|screenshot|web|url|surf)/.test(n))
        return "browser";
    if (/(bash|shell|exec|command|terminal|run|python|node|script)/.test(n))
        return "command";
    if (/(file|write|read|edit|create|save|download|upload|fs)/.test(n))
        return "file";
    if (/(fetch|api|http|request|search|query|generate|summar)/.test(n))
        return "api";
    return "other";
}
94
+ // Active dispatches keyed by accountId (one dispatch at a time per account)
95
+ const activeDispatches = new Map();
96
/**
 * Build the `taskProgress` message content for a dispatch.
 *
 * Each step is copied into a fresh object holding only the six serialized
 * fields, so later mutation of the dispatch does not alter a payload that was
 * already built.
 *
 * @param {object} dispatch - Dispatch state (taskId, steps, current*, screenId, startedAt).
 * @param {boolean} completed - True for the final card; the "current action"
 *   fields are then omitted and `endedAt` is stamped with the current time.
 * @returns {{ taskProgress: object }} Content object ready for encryption.
 */
function buildTaskProgressContent(dispatch, completed) {
    const stillRunning = !completed;
    const copyStep = ({ toolName, label, status, category, startedAt, completedAt }) => ({
        toolName,
        label,
        status,
        category,
        startedAt,
        completedAt,
    });
    const taskProgress = {
        taskId: dispatch.taskId,
        steps: dispatch.steps.map(copyStep),
        completed,
        live: stillRunning,
        currentTool: stillRunning ? dispatch.currentTool : undefined,
        currentLabel: stillRunning ? dispatch.currentLabel : undefined,
        currentCategory: stillRunning ? dispatch.currentCategory : undefined,
        screenId: dispatch.screenId,
        startedAt: dispatch.startedAt,
        endedAt: stillRunning ? undefined : Date.now(),
    };
    return { taskProgress };
}
119
/**
 * Publish the task progress card for a dispatch: the first delivery sends a new
 * message and records its id in `dispatch.progressMessageId`; every later
 * delivery edits that message.
 *
 * Deliveries are serialized per dispatch through `dispatch.progressQueue`.
 * Callers invoke this fire-and-forget on tool start/complete, so without
 * serialization two overlapping calls could both observe an unset
 * `progressMessageId` and post duplicate cards, or edits could land out of
 * order.
 *
 * @param {object} dispatch - Dispatch state; reads `account`, `groupId`,
 *   `progressMessageId`, and reads/writes `progressQueue`.
 * @param {boolean} completed - Passed through to buildTaskProgressContent.
 * @returns {Promise<void>} Settles when THIS update has been delivered; rejects
 *   if its key lookup, encryption, or API call fails.
 */
async function sendOrEditProgressMessage(dispatch, completed) {
    // Snapshot the card synchronously so each queued update reflects the state
    // at the moment it was requested, in request order.
    const content = buildTaskProgressContent(dispatch, completed);
    const deliver = async () => {
        const groupKey = await getGroupKeyOrThrow(dispatch.account, dispatch.groupId);
        const encrypted = encryptMessage(content, groupKey);
        if (!dispatch.progressMessageId) {
            const result = await sendMessage(dispatch.account.apiBaseUrl, dispatch.account.botToken, dispatch.groupId, encrypted);
            dispatch.progressMessageId = result.message.id;
            return;
        }
        await editMessage(dispatch.account.apiBaseUrl, dispatch.account.botToken, dispatch.progressMessageId, encrypted);
    };
    const previous = dispatch.progressQueue ?? Promise.resolve();
    const current = previous.then(deliver);
    // The stored tail never rejects, so one failed update cannot block the
    // updates queued after it; the caller still sees the failure via `current`.
    dispatch.progressQueue = current.catch(() => { });
    return current;
}
131
/**
 * Record that a tool has started running within a dispatch.
 *
 * Appends an "in_progress" step, marks the tool as the dispatch's current
 * action, and pushes a live progress update. The update is best-effort: a
 * delivery failure is deliberately ignored so it never affects the tool call.
 *
 * @param {object} dispatch - Dispatch state to mutate.
 * @param {string} toolName - Name of the tool being executed.
 * @param {string} toolLabel - Label stored on the step and the banner.
 * @returns {void}
 */
function onToolCallStart(dispatch, toolName, toolLabel) {
    const category = categorize(toolName);
    const step = {
        toolName,
        label: toolLabel,
        status: "in_progress",
        category,
        startedAt: Date.now(),
    };
    dispatch.steps.push(step);
    Object.assign(dispatch, {
        currentTool: toolName,
        currentLabel: toolLabel,
        currentCategory: category,
    });
    const pending = sendOrEditProgressMessage(dispatch, false);
    pending.catch(() => { });
}
145
/**
 * Record that a tool call has finished within a dispatch.
 *
 * The first step that is still "in_progress" for `toolName` is closed as
 * "completed" or "failed" and time-stamped. If that tool is the dispatch's
 * current action, the current-action fields are cleared (the dispatch stays
 * live, merely idle between tools). A best-effort live update is then sent;
 * delivery failures are deliberately ignored.
 *
 * @param {object} dispatch - Dispatch state to mutate.
 * @param {string} toolName - Name of the tool that finished.
 * @param {boolean} success - Whether the tool call succeeded.
 * @returns {void}
 */
function onToolCallComplete(dispatch, toolName, success) {
    for (const candidate of dispatch.steps) {
        if (candidate.toolName !== toolName || candidate.status !== "in_progress") {
            continue;
        }
        candidate.status = success ? "completed" : "failed";
        candidate.completedAt = Date.now();
        break; // only the first matching open step is closed
    }
    const wasCurrent = dispatch.currentTool === toolName;
    if (wasCurrent) {
        Object.assign(dispatch, {
            currentTool: undefined,
            currentLabel: undefined,
            currentCategory: undefined,
        });
    }
    const pending = sendOrEditProgressMessage(dispatch, false);
    pending.catch(() => { });
}
159
/**
 * Close out a dispatch's progress card.
 *
 * Every step still "in_progress" is marked "completed" with one shared
 * timestamp, the current-action fields are cleared, and, when at least one
 * step was recorded, the final (completed) card is published. A delivery
 * failure is deliberately ignored, so this never rejects because of it.
 *
 * @param {object} dispatch - Dispatch state to mutate.
 * @returns {Promise<void>}
 */
async function finalizeTaskProgress(dispatch) {
    const finishedAt = Date.now();
    dispatch.steps
        .filter((step) => step.status === "in_progress")
        .forEach((step) => {
            step.status = "completed";
            step.completedAt = finishedAt;
        });
    Object.assign(dispatch, {
        currentTool: undefined,
        currentLabel: undefined,
        currentCategory: undefined,
    });
    if (!(dispatch.steps.length > 0)) {
        return; // nothing was tracked, so no card exists to finalize
    }
    await sendOrEditProgressMessage(dispatch, true).catch(() => { });
}
44
174
  /**
45
175
  * Report TTS/STT usage to the server for billing (fire-and-forget).
46
176
  * Bot owner is charged based on usage type and amount.
@@ -656,6 +786,17 @@ async function sendEncryptedChatContent(params) {
656
786
  recordState(account.accountId, { lastOutboundAt: Date.now() });
657
787
  return result;
658
788
  }
789
/**
 * Build the dependency object handed to the action confirmation module,
 * bound to one bot account.
 *
 * @param {object} account - Bot account; `apiBaseUrl` and `botToken` are used for edits.
 * @returns {{ sendEncrypted: Function, editEncrypted: Function }}
 *   `sendEncrypted(groupId, content)` forwards to sendEncryptedChatContent and
 *   returns its result; `editEncrypted(groupId, messageId, content)` encrypts
 *   `content` with the group's key and edits the given message.
 */
function makeConfirmationDeps(account) {
    const sendEncrypted = (groupId, content) => sendEncryptedChatContent({ account, groupId, content });
    const editEncrypted = async (groupId, messageId, content) => {
        const key = await getGroupKeyOrThrow(account, groupId);
        const ciphertext = encryptMessage(content, key);
        await editMessage(account.apiBaseUrl, account.botToken, messageId, ciphertext);
    };
    return { sendEncrypted, editEncrypted };
}
659
800
  async function prepareOutboundAttachment(account, groupId, attachment) {
660
801
  if (attachment.url) {
661
802
  const mediaRes = await fetch(attachment.url, { signal: AbortSignal.timeout(60_000) });
@@ -714,10 +855,176 @@ async function stageInboundAttachmentsForAgent(params) {
714
855
  }
715
856
  return staged;
716
857
  }
858
/**
 * Auto-discover an OpenAI-compatible provider and a vision-capable model from
 * the OpenClaw model config, for native document reading. No extra channel
 * config is required: it reuses whatever chat provider is already configured.
 *
 * A provider qualifies when it has a non-blank string `baseUrl` and `apiKey`,
 * its `api` string contains "openai" (case-insensitive), and one of its models
 * has a non-blank `id` and lists "image" in its `input` array. The first
 * qualifying provider (in object order) and its first such model win.
 *
 * Malformed entries (non-string baseUrl/apiKey/api/id, non-array `models`) are
 * skipped instead of throwing, so one bad provider entry cannot break the
 * caller's message handling.
 *
 * @param {object} [cfg] - OpenClaw config; reads `cfg.models.providers`.
 * @returns {{ baseUrl: string, apiKey: string, model: string } | null}
 *   Trimmed connection details (baseUrl without trailing slashes), or null
 *   when no provider qualifies.
 */
function resolveNativeDocReader(cfg) {
    const providers = cfg?.models?.providers;
    if (!providers || typeof providers !== "object")
        return null;
    // Config values are not guaranteed to be strings; treat anything else as blank.
    const asText = (value) => (typeof value === "string" ? value.trim() : "");
    for (const provider of Object.values(providers)) {
        const baseUrl = asText(provider?.baseUrl);
        const apiKey = asText(provider?.apiKey);
        if (!baseUrl || !apiKey)
            continue;
        if (!asText(provider.api).toLowerCase().includes("openai"))
            continue;
        const models = Array.isArray(provider.models) ? provider.models : [];
        const vision = models.find((m) => asText(m?.id) && Array.isArray(m.input) && m.input.includes("image"));
        if (vision) {
            return {
                baseUrl: baseUrl.replace(/\/+$/, ""),
                apiKey,
                model: asText(vision.id),
            };
        }
    }
    return null;
}
883
+ const NATIVE_DOC_MAX_BYTES = 15 * 1024 * 1024;
884
+ const NATIVE_DOC_PROMPT = "请完整、逐字地提取这个文档的全部文字内容,保留标题、段落、列表和表格结构,用 Markdown 输出。" +
885
+ "这可能是扫描件,请用视觉识别(OCR)。直接输出文档内容本身,不要添加任何前言或说明。";
886
/**
 * Read a document by sending it to an OpenAI-compatible `/chat/completions`
 * endpoint as a base64 data URL inside a `file` content part, together with
 * the extraction prompt, and return the model's text reply.
 *
 * @param {object} params
 * @param {{ baseUrl: string, apiKey: string, model: string }} params.reader - Endpoint, bearer key and model id.
 * @param {Buffer} params.buffer - Raw document bytes.
 * @param {string} params.filename - File name reported to the model.
 * @param {string} params.mimeType - MIME type placed in the data URL.
 * @param {number} [params.timeoutMs] - Request timeout; defaults to 120 seconds.
 * @returns {Promise<string>} Trimmed reply text.
 * @throws {Error} On a non-2xx response (status plus up to 200 characters of
 *   the body) or when the reply contains no non-blank text.
 */
async function readDocumentNatively(params) {
    const { reader, buffer, filename, mimeType } = params;
    const dataUrl = `data:${mimeType};base64,${buffer.toString("base64")}`;
    const payload = {
        model: reader.model,
        max_tokens: 16000,
        messages: [
            {
                role: "user",
                content: [
                    { type: "text", text: NATIVE_DOC_PROMPT },
                    { type: "file", file: { filename, file_data: dataUrl } },
                ],
            },
        ],
    };
    const res = await fetch(`${reader.baseUrl}/chat/completions`, {
        method: "POST",
        headers: { "Content-Type": "application/json", Authorization: `Bearer ${reader.apiKey}` },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(params.timeoutMs ?? 120_000),
    });
    if (!res.ok) {
        const body = await res.text().catch(() => "");
        const detail = body ? `: ${body.slice(0, 200)}` : "";
        throw new Error(`document read failed (${res.status})${detail}`);
    }
    // An unparsable body is treated the same as an empty reply.
    const json = await res.json().catch(() => null);
    const content = json?.choices?.[0]?.message?.content;
    // The reply is either a plain string or an array of parts; only parts
    // carrying a `text` property contribute.
    const partText = (part) => {
        if (part && typeof part === "object" && "text" in part) {
            return String(part.text ?? "");
        }
        return "";
    };
    let text = "";
    if (typeof content === "string") {
        text = content;
    }
    else if (Array.isArray(content)) {
        text = content.map(partText).join("");
    }
    const trimmed = text.trim();
    if (!trimmed)
        throw new Error("document read returned empty content");
    return trimmed;
}
927
// Rich document types (PDF + Office) worth feeding to the model's native file
// parser, keyed by lower-case extension with their canonical MIME type. Plain
// text / csv / json / md are left to OpenClaw's own text decoding; images /
// audio / video are handled by their own paths.
const DOC_MIME_BY_EXT = {
    ".pdf": "application/pdf",
    ".doc": "application/msword",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".ppt": "application/vnd.ms-powerpoint",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".odt": "application/vnd.oasis.opendocument.text",
    ".ods": "application/vnd.oasis.opendocument.spreadsheet",
    ".odp": "application/vnd.oasis.opendocument.presentation",
    ".rtf": "application/rtf",
};
// Derived from the MIME table so the two lists cannot drift apart.
const RICH_DOC_EXTS = new Set(Object.keys(DOC_MIME_BY_EXT));
/**
 * Decide whether a staged attachment is a rich document (PDF / Office /
 * OpenDocument / RTF) that should be read natively.
 *
 * MIME comparison is case-insensitive and ignores surrounding whitespace,
 * because MIME types are case-insensitive and the value is supplied by the
 * sender. Non-string `mime` / `name` values are treated as absent.
 *
 * @param {{ kind?: string, mime?: string, name?: string }} att - Staged attachment.
 * @returns {boolean} True when the kind, MIME type, or file extension marks a rich document.
 */
function isRichDocumentAttachment(att) {
    const mime = typeof att.mime === "string" ? att.mime.trim().toLowerCase() : "";
    if (att.kind === "pdf" || mime === "application/pdf")
        return true;
    const ext = path.extname(typeof att.name === "string" ? att.name : "").toLowerCase();
    if (RICH_DOC_EXTS.has(ext))
        return true;
    return /(msword|ms-excel|ms-powerpoint|officedocument|opendocument|application\/rtf)/.test(mime);
}
/**
 * Pick the MIME type to declare when sending a document to the model.
 *
 * The attachment's own MIME type is used (trimmed) unless it is blank or the
 * generic "application/octet-stream"; otherwise the type is derived from the
 * file extension, falling back to "application/pdf" for unknown extensions.
 *
 * @param {{ mime?: string, name?: string }} att - Staged attachment.
 * @returns {string} MIME type without surrounding whitespace.
 */
function resolveDocMime(att) {
    // Trimmed because the result is embedded verbatim in a `data:` URL.
    const declared = typeof att.mime === "string" ? att.mime.trim() : "";
    if (declared && declared.toLowerCase() !== "application/octet-stream")
        return declared;
    const ext = path.extname(typeof att.name === "string" ? att.name : "").toLowerCase();
    return DOC_MIME_BY_EXT[ext] ?? "application/pdf";
}
968
/**
 * For each staged rich document (PDF / Office), read it natively via the
 * model's file parser and build a text block to append to the agent's turn, so
 * it can answer inline without a tool round-trip or file output.
 *
 * Documents are processed one at a time, in staged order. A document whose
 * size exceeds NATIVE_DOC_MAX_BYTES is rejected from its file metadata, before
 * its contents are loaded into memory.
 *
 * @param {object} params
 * @param {object} params.cfg - OpenClaw config, used to discover the reader.
 * @param {Array<object>} params.staged - Staged attachments (`name`, `stagedPath`, `mime`, `kind`).
 * @param {object} [params.log] - Optional logger with `info` / `warn`.
 * @returns {Promise<string>} "" when there are no rich documents or no usable
 *   provider. Otherwise a string starting with a blank line that contains the
 *   extracted documents plus usage guidance (when at least one read succeeded)
 *   and a note listing the attachments that could not be read (when any
 *   failed, including the case where every read failed).
 */
async function buildNativeDocumentContext(params) {
    const docs = params.staged.filter(isRichDocumentAttachment);
    if (docs.length === 0)
        return "";
    const reader = resolveNativeDocReader(params.cfg);
    if (!reader)
        return "";
    const blocks = [];
    const failed = [];
    for (const att of docs) {
        const name = att.name || "document";
        try {
            // Check the size via stat so an oversized upload is never buffered.
            const { size } = await fs.stat(att.stagedPath);
            if (size > NATIVE_DOC_MAX_BYTES) {
                params.log?.warn?.(`ozaiya: skipping native read of ${name} (too large: ${size} bytes)`);
                failed.push(`「${name}」(过大)`);
                continue;
            }
            const buffer = await fs.readFile(att.stagedPath);
            const content = await readDocumentNatively({
                reader,
                buffer,
                filename: name,
                mimeType: resolveDocMime(att),
            });
            params.log?.info?.(`ozaiya: natively read document "${name}" via ${reader.model} (${content.length} chars)`);
            blocks.push(`【系统已自动读取附件「${name}」的完整内容(已用模型解析,扫描件也已 OCR)】:\n\n${content}`);
        }
        catch (err) {
            // Best-effort per document: record the failure and keep going.
            params.log?.warn?.(`ozaiya: native read of ${name} failed: ${err instanceof Error ? err.message : String(err)}`);
            failed.push(`「${name}」`);
        }
    }
    // Every document ends up in `blocks` or `failed`, so at least one part follows.
    const parts = [];
    if (blocks.length > 0) {
        parts.push(blocks.join("\n\n---\n\n"));
        parts.push("[以上为附件的完整原文,已替你读取完毕。请据此直接在聊天里用文字回答用户的请求;" +
            "不要再调用 pdf/文件读取工具,也不要用 exec/python 生成或发送任何文件" +
            "(.doc/.docx/.xlsx/.rtf 等)——把内容或结果直接发在消息里即可。]");
    }
    if (failed.length > 0) {
        parts.push(`[注意:附件 ${failed.join("、")} 无法自动读取,请用一句话告诉用户把它转换成 PDF 后重新发送。]`);
    }
    return "\n\n" + parts.join("\n\n");
}
717
1021
  /**
718
1022
  * Build the full set of channel agent tools for a given bot account.
719
1023
  * Used by both the plugin's agentTools factory (for OpenClaw tool registration)
720
1024
  * and the text-based tool call fallback in deliver().
1025
+ *
1026
+ * Non-read-only tools are wrapped to report progress steps to the active
1027
+ * dispatch's task progress card (when one exists for this account).
721
1028
  */
722
1029
  function buildChannelTools(account, cfg) {
723
1030
  const resolveForGroup = (groupId) => resolveAccountForGroup(cfg, groupId) ?? account;
@@ -753,6 +1060,36 @@ function buildChannelTools(account, cfg) {
753
1060
  const summarizeTool = createSummarizeUrlTool(cfg);
754
1061
  if (summarizeTool)
755
1062
  tools.push(summarizeTool);
1063
+ // Wrap non-read-only tools with progress tracking.
1064
+ // When the tool executes, it looks up the current active dispatch for this account.
1065
+ const accountId = account.accountId;
1066
+ for (let i = 0; i < tools.length; i++) {
1067
+ const tool = tools[i];
1068
+ if (READ_ONLY_TOOLS.has(tool.name))
1069
+ continue;
1070
+ const originalExecute = tool.execute;
1071
+ tools[i] = {
1072
+ ...tool,
1073
+ execute: async (toolCallId, rawArgs) => {
1074
+ const dispatch = activeDispatches.get(accountId);
1075
+ if (dispatch)
1076
+ onToolCallStart(dispatch, tool.name, tool.label);
1077
+ try {
1078
+ const result = await originalExecute(toolCallId, rawArgs);
1079
+ const dispatchAfter = activeDispatches.get(accountId);
1080
+ if (dispatchAfter)
1081
+ onToolCallComplete(dispatchAfter, tool.name, true);
1082
+ return result;
1083
+ }
1084
+ catch (err) {
1085
+ const dispatchAfter = activeDispatches.get(accountId);
1086
+ if (dispatchAfter)
1087
+ onToolCallComplete(dispatchAfter, tool.name, false);
1088
+ throw err;
1089
+ }
1090
+ },
1091
+ };
1092
+ }
756
1093
  return tools;
757
1094
  }
758
1095
  export const ozaiyaPlugin = {
@@ -1351,6 +1688,18 @@ function createSendDirectMessageTool(account) {
1351
1688
  try {
1352
1689
  const args = rawArgs;
1353
1690
  const { userId, message } = args;
1691
+ // Request user confirmation before sending a DM
1692
+ const originGroupId = accountToOriginGroupId.get(account.accountId);
1693
+ if (originGroupId) {
1694
+ const deps = makeConfirmationDeps(account);
1695
+ const approved = await requestConfirmation(deps, originGroupId, "message_user", {
1696
+ userId,
1697
+ message: message.length > 100 ? message.slice(0, 100) + "…" : message,
1698
+ });
1699
+ if (!approved) {
1700
+ return { content: [{ type: "text", text: "Direct message was not approved by the user." }] };
1701
+ }
1702
+ }
1354
1703
  // Try to resolve as ozaiyaId first, fall back to treating as internal accountId
1355
1704
  let accountId = userId;
1356
1705
  const users = await searchUsers(account.apiBaseUrl, account.botToken, userId).catch(() => []);
@@ -1566,6 +1915,18 @@ function createCreateGroupTool(account) {
1566
1915
  if (!groupName) {
1567
1916
  return { content: [{ type: "text", text: "Error: group name is required." }] };
1568
1917
  }
1918
+ // Request user confirmation before creating a group
1919
+ const originGroupId = accountToOriginGroupId.get(account.accountId);
1920
+ if (originGroupId) {
1921
+ const deps = makeConfirmationDeps(account);
1922
+ const approved = await requestConfirmation(deps, originGroupId, "create_group", {
1923
+ name: groupName,
1924
+ ...(args.memberIds?.length ? { members: args.memberIds.join(", ") } : {}),
1925
+ });
1926
+ if (!approved) {
1927
+ return { content: [{ type: "text", text: "Group creation was not approved by the user." }] };
1928
+ }
1929
+ }
1569
1930
  // Resolve ozaiyaIds to account IDs
1570
1931
  let accountIds;
1571
1932
  if (args.memberIds && args.memberIds.length > 0) {
@@ -1934,6 +2295,15 @@ function createStartInAppCallTool(account, cfg) {
1934
2295
  execute: async (_toolCallId, rawArgs) => {
1935
2296
  const args = rawArgs;
1936
2297
  try {
2298
+ // Request user confirmation before starting the call
2299
+ const deps = makeConfirmationDeps(account);
2300
+ const approved = await requestConfirmation(deps, args.groupId, "start_in_app_call", {
2301
+ groupId: args.groupId,
2302
+ type: args.type ?? "voice",
2303
+ });
2304
+ if (!approved) {
2305
+ return { content: [{ type: "text", text: "Call was not approved by the user." }] };
2306
+ }
1937
2307
  const result = await startCall(account.apiBaseUrl, account.botToken, args.groupId, args.type ?? "voice");
1938
2308
  if (!result) {
1939
2309
  return { content: [{ type: "text", text: "Failed to start call. The bot may not have permission or is not a member of the group." }] };
@@ -2039,6 +2409,16 @@ function createMakePhoneCallTool(account, cfg) {
2039
2409
  execute: async (_toolCallId, rawArgs) => {
2040
2410
  const args = rawArgs;
2041
2411
  try {
2412
+ // Request user confirmation before making the phone call
2413
+ const deps = makeConfirmationDeps(account);
2414
+ const approved = await requestConfirmation(deps, args.groupId, "make_phone_call", {
2415
+ phoneNumber: args.phoneNumber,
2416
+ mode: args.mode ?? "auto",
2417
+ ...(args.purpose ? { purpose: args.purpose } : {}),
2418
+ });
2419
+ if (!approved) {
2420
+ return { content: [{ type: "text", text: "Phone call was not approved by the user." }] };
2421
+ }
2042
2422
  const mode = args.mode ?? "auto";
2043
2423
  const result = await startPhoneCall(account.apiBaseUrl, account.botToken, args.groupId, args.phoneNumber, mode, args.purpose);
2044
2424
  // Report connected status
@@ -2637,6 +3017,14 @@ ctx) {
2637
3017
  if (inboundAttachments.length > 0) {
2638
3018
  ctx.log?.info?.(`ozaiya: staged ${stagedInboundAttachments.length}/${inboundAttachments.length} inbound attachments for OpenClaw media context`);
2639
3019
  }
3020
+ // Read inbound PDFs natively (feed straight to a vision model) and inject their
3021
+ // content into the agent's turn, so it answers inline without a pdf-tool round
3022
+ // trip or file generation. Empty string when there are no PDFs / no provider.
3023
+ const nativeDocContext = await buildNativeDocumentContext({
3024
+ cfg: ctx.cfg,
3025
+ staged: stagedInboundAttachments,
3026
+ log: ctx.log,
3027
+ });
2640
3028
  const transcriptionResult = await maybeTranscribeInboundAudio({
2641
3029
  stt: account.stt,
2642
3030
  messageText,
@@ -2685,6 +3073,9 @@ ctx) {
2685
3073
  id: groupId,
2686
3074
  },
2687
3075
  });
3076
+ // Track originating group for this account so tools without groupId
3077
+ // (message_user, create_group) can send confirmation cards to the right group.
3078
+ accountToOriginGroupId.set(account.accountId, groupId);
2688
3079
  const fromAddress = `ozaiya:group:${groupId}`;
2689
3080
  const conversationLabel = `group:${groupId}`;
2690
3081
  // Build inbound session envelope context
@@ -2702,9 +3093,10 @@ ctx) {
2702
3093
  "Rules: respond concisely (1-3 sentences), use natural spoken language, " +
2703
3094
  "never use markdown/code blocks/bullet lists/URLs/emojis. " +
2704
3095
  'Do not say "sure" or "of course" — just answer directly.';
3096
+ const agentInputWithDocs = nativeDocContext ? `${agentInput}${nativeDocContext}` : agentInput;
2705
3097
  const effectiveAgentInput = voiceReply
2706
- ? `${voiceReplyPrompt || DEFAULT_VOICE_REPLY_PROMPT}\n\n${agentInput}`
2707
- : agentInput;
3098
+ ? `${voiceReplyPrompt || DEFAULT_VOICE_REPLY_PROMPT}\n\n${agentInputWithDocs}`
3099
+ : agentInputWithDocs;
2708
3100
  const body = ch.reply.formatAgentEnvelope({
2709
3101
  channel: "Ozaiya",
2710
3102
  from: `${message.senderName} (${conversationLabel})`,
@@ -2788,6 +3180,25 @@ ctx) {
2788
3180
  }).catch((err) => {
2789
3181
  ctx.log?.warn?.(`ozaiya: failed recording session: ${String(err)}`);
2790
3182
  });
3183
+ // Create dispatch tracking for task progress card
3184
+ const dispatch = {
3185
+ taskId: `${account.accountId}:${groupId}:${Date.now()}`,
3186
+ groupId,
3187
+ account,
3188
+ steps: [],
3189
+ progressMessageId: null,
3190
+ startedAt: Date.now(),
3191
+ screenId: "sandbox",
3192
+ recording: null,
3193
+ };
3194
+ activeDispatches.set(account.accountId, dispatch);
3195
+ // Start screen recording for replay (best-effort; null if nothing recordable).
3196
+ try {
3197
+ dispatch.recording = startRecording(dispatch.taskId);
3198
+ }
3199
+ catch (err) {
3200
+ ctx.log?.warn?.(`ozaiya: startRecording failed: ${String(err)}`);
3201
+ }
2791
3202
  // Build channel tools map for text-based tool call fallback.
2792
3203
  // When a model outputs tool calls as plain text instead of structured API tool_calls,
2793
3204
  // we match against registered tool names and execute via their .execute() method.
@@ -2796,136 +3207,180 @@ ctx) {
2796
3207
  const channelToolsByName = new Map(channelTools.map((t) => [t.name, t]));
2797
3208
  ctx.log?.info?.(`ozaiya: text fallback tools loaded: ${channelToolsByName.size} tools [${[...channelToolsByName.keys()].join(", ")}]`);
2798
3209
  // Dispatch to agent with buffered block dispatcher
2799
- await ch.reply.dispatchReplyWithBufferedBlockDispatcher({
2800
- ctx: msgCtx,
2801
- cfg: ctx.cfg,
2802
- dispatcherOptions: {
2803
- deliver: async (replyPayload, _info) => {
2804
- let replyText = replyPayload.text;
2805
- ctx.log?.info?.(`ozaiya: deliver called, text length=${replyText?.length ?? 0}, empty=${!replyText?.trim()}, voiceReply=${voiceReply}, voiceReplyVoice=${voiceReplyVoice ?? 'none'}`);
2806
- if (!replyText?.trim())
2807
- return;
2808
- // Generic fallback: intercept tool calls that models output as text
2809
- // instead of structured API tool_calls. Supports two formats:
2810
- // 1. JSON function syntax: tool_name({"arg":"value"}) or tool_name({arg: "value"})
2811
- // 2. XML: <function_calls><invoke name="tool_name"><parameter name="arg">value</parameter></invoke></function_calls>
2812
- let textToolsExecuted = false;
2813
- // --- Format 1: JSON function syntax tool_name({"key":"val"}) ---
2814
- if (channelToolsByName.size > 0) {
2815
- const toolNames = [...channelToolsByName.keys()].map((n) => n.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
2816
- const jsonFnRegex = new RegExp(`\\b(${toolNames.join("|")})\\s*\\(\\s*(\\{[\\s\\S]*?\\})\\s*\\)`, "g");
2817
- let jsonMatch;
2818
- while ((jsonMatch = jsonFnRegex.exec(replyText)) !== null) {
2819
- const toolName = jsonMatch[1];
2820
- const tool = channelToolsByName.get(toolName);
2821
- if (!tool)
2822
- continue;
2823
- // Always strip the tool call text from the message — users should never
2824
- // see raw tool_name({...}) syntax even if execution fails.
2825
- textToolsExecuted = true;
2826
- try {
2827
- // Try strict JSON first, then lenient (unquoted keys, trailing commas)
2828
- let args;
3210
+ try {
3211
+ await ch.reply.dispatchReplyWithBufferedBlockDispatcher({
3212
+ ctx: msgCtx,
3213
+ cfg: ctx.cfg,
3214
+ dispatcherOptions: {
3215
+ deliver: async (replyPayload, _info) => {
3216
+ let replyText = replyPayload.text;
3217
+ ctx.log?.info?.(`ozaiya: deliver called, text length=${replyText?.length ?? 0}, empty=${!replyText?.trim()}, voiceReply=${voiceReply}, voiceReplyVoice=${voiceReplyVoice ?? 'none'}`);
3218
+ if (!replyText?.trim())
3219
+ return;
3220
+ // Generic fallback: intercept tool calls that models output as text
3221
+ // instead of structured API tool_calls. Supports two formats:
3222
+ // 1. JSON function syntax: tool_name({"arg":"value"}) or tool_name({arg: "value"})
3223
+ // 2. XML: <function_calls><invoke name="tool_name"><parameter name="arg">value</parameter></invoke></function_calls>
3224
+ let textToolsExecuted = false;
3225
+ // --- Format 1: JSON function syntax tool_name({"key":"val"}) ---
3226
+ if (channelToolsByName.size > 0) {
3227
+ const toolNames = [...channelToolsByName.keys()].map((n) => n.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
3228
+ const jsonFnRegex = new RegExp(`\\b(${toolNames.join("|")})\\s*\\(\\s*(\\{[\\s\\S]*?\\})\\s*\\)`, "g");
3229
+ let jsonMatch;
3230
+ while ((jsonMatch = jsonFnRegex.exec(replyText)) !== null) {
3231
+ const toolName = jsonMatch[1];
3232
+ const tool = channelToolsByName.get(toolName);
3233
+ if (!tool)
3234
+ continue;
3235
+ // Always strip the tool call text from the message — users should never
3236
+ // see raw tool_name({...}) syntax even if execution fails.
3237
+ textToolsExecuted = true;
2829
3238
  try {
2830
- args = JSON.parse(jsonMatch[2]);
3239
+ // Try strict JSON first, then lenient (unquoted keys, trailing commas)
3240
+ let args;
3241
+ try {
3242
+ args = JSON.parse(jsonMatch[2]);
3243
+ }
3244
+ catch {
3245
+ // Handle JS-style object literals: unquoted keys, trailing commas
3246
+ const lenient = jsonMatch[2]
3247
+ .replace(/([{,]\s*)([a-zA-Z_]\w*)\s*:/g, '$1"$2":')
3248
+ .replace(/,\s*}/g, "}");
3249
+ args = JSON.parse(lenient);
3250
+ }
3251
+ ctx.log?.info?.(`ozaiya: text fallback — executing ${toolName}(${JSON.stringify(args)})`);
3252
+ await tool.execute(`text-fallback-${Date.now()}`, args);
2831
3253
  }
2832
- catch {
2833
- // Handle JS-style object literals: unquoted keys, trailing commas
2834
- const lenient = jsonMatch[2]
2835
- .replace(/([{,]\s*)([a-zA-Z_]\w*)\s*:/g, '$1"$2":')
2836
- .replace(/,\s*}/g, "}");
2837
- args = JSON.parse(lenient);
3254
+ catch (err) {
3255
+ ctx.log?.warn?.(`ozaiya: text fallback ${toolName} failed: ${String(err)}`);
2838
3256
  }
2839
- ctx.log?.info?.(`ozaiya: text fallback — executing ${toolName}(${JSON.stringify(args)})`);
2840
- await tool.execute(`text-fallback-${Date.now()}`, args);
2841
3257
  }
2842
- catch (err) {
2843
- ctx.log?.warn?.(`ozaiya: text fallback — ${toolName} failed: ${String(err)}`);
3258
+ if (textToolsExecuted) {
3259
+ replyText = replyText.replace(jsonFnRegex, "").trim();
3260
+ if (!replyText)
3261
+ return;
2844
3262
  }
2845
3263
  }
2846
- if (textToolsExecuted) {
2847
- replyText = replyText.replace(jsonFnRegex, "").trim();
3264
+ // --- Format 2: XML <function_calls> ---
3265
+ if (replyText.includes("<function_calls>") && replyText.includes("<invoke")) {
3266
+ const invokeRegex = /<invoke\s+name="([^"]+)">([\s\S]*?)<\/invoke>/g;
3267
+ let match;
3268
+ while ((match = invokeRegex.exec(replyText)) !== null) {
3269
+ const toolName = match[1];
3270
+ const tool = channelToolsByName.get(toolName);
3271
+ if (!tool)
3272
+ continue;
3273
+ const paramsXml = match[2];
3274
+ const paramRegex = /<parameter\s+name="([^"]+)">([^<]*)<\/parameter>/g;
3275
+ const args = {};
3276
+ let pm;
3277
+ while ((pm = paramRegex.exec(paramsXml)) !== null) {
3278
+ args[pm[1]] = pm[2];
3279
+ }
3280
+ ctx.log?.info?.(`ozaiya: text fallback (XML) — executing ${toolName}(${JSON.stringify(args)})`);
3281
+ try {
3282
+ await tool.execute(`text-fallback-xml-${Date.now()}`, args);
3283
+ textToolsExecuted = true;
3284
+ }
3285
+ catch (err) {
3286
+ ctx.log?.warn?.(`ozaiya: text fallback (XML) — ${toolName} failed: ${String(err)}`);
3287
+ }
3288
+ }
3289
+ replyText = replyText
3290
+ .replace(/<function_calls>[\s\S]*?<\/function_calls>/g, "")
3291
+ .replace(/<function_results>[\s\S]*?<\/function_results>/g, "")
3292
+ .replace(/\[\[reply_to_current\]\]/g, "")
3293
+ .replace(/NO_REPLY/g, "")
3294
+ .trim();
3295
+ if (!replyText && textToolsExecuted)
3296
+ return;
2848
3297
  if (!replyText)
2849
3298
  return;
2850
3299
  }
2851
- }
2852
- // --- Format 2: XML <function_calls> ---
2853
- if (replyText.includes("<function_calls>") && replyText.includes("<invoke")) {
2854
- const invokeRegex = /<invoke\s+name="([^"]+)">([\s\S]*?)<\/invoke>/g;
2855
- let match;
2856
- while ((match = invokeRegex.exec(replyText)) !== null) {
2857
- const toolName = match[1];
2858
- const tool = channelToolsByName.get(toolName);
2859
- if (!tool)
2860
- continue;
2861
- const paramsXml = match[2];
2862
- const paramRegex = /<parameter\s+name="([^"]+)">([^<]*)<\/parameter>/g;
2863
- const args = {};
2864
- let pm;
2865
- while ((pm = paramRegex.exec(paramsXml)) !== null) {
2866
- args[pm[1]] = pm[2];
2867
- }
2868
- ctx.log?.info?.(`ozaiya: text fallback (XML) — executing ${toolName}(${JSON.stringify(args)})`);
2869
- try {
2870
- await tool.execute(`text-fallback-xml-${Date.now()}`, args);
2871
- textToolsExecuted = true;
2872
- }
2873
- catch (err) {
2874
- ctx.log?.warn?.(`ozaiya: text fallback (XML) — ${toolName} failed: ${String(err)}`);
3300
+ // Voice reply: synthesize TTS audio and send as voice message
3301
+ if (voiceReply) {
3302
+ ctx.log?.info?.(`ozaiya: voice reply — synthesizing TTS for group ${groupId}`);
3303
+ const audioBuffer = await synthesizeVoiceReply(replyText, ctx, voiceReplyVoice ?? undefined);
3304
+ if (audioBuffer) {
3305
+ const ext = audioBuffer.ext;
3306
+ const mime = ext === ".mp3" ? "audio/mpeg" : ext === ".opus" ? "audio/ogg" : ext === ".wav" ? "audio/wav" : "audio/mpeg";
3307
+ const fileInfo = await uploadFile(account.apiBaseUrl, account.botToken, groupId, `voice${ext}`, mime, audioBuffer.data);
3308
+ await sendEncryptedChatContent({
3309
+ account,
3310
+ groupId,
3311
+ content: { text: replyText, files: [fileInfo] },
3312
+ log: ctx.log,
3313
+ });
3314
+ ctx.log?.info?.(`ozaiya: voice reply sent successfully (${ext}, ${audioBuffer.data.length} bytes)`);
3315
+ reportUsage(account, "tts", { provider: "voice-reply" });
3316
+ return;
2875
3317
  }
2876
- }
2877
- replyText = replyText
2878
- .replace(/<function_calls>[\s\S]*?<\/function_calls>/g, "")
2879
- .replace(/<function_results>[\s\S]*?<\/function_results>/g, "")
2880
- .replace(/\[\[reply_to_current\]\]/g, "")
2881
- .replace(/NO_REPLY/g, "")
2882
- .trim();
2883
- if (!replyText && textToolsExecuted)
2884
- return;
2885
- if (!replyText)
2886
- return;
2887
- }
2888
- // Voice reply: synthesize TTS audio and send as voice message
2889
- if (voiceReply) {
2890
- ctx.log?.info?.(`ozaiya: voice reply — synthesizing TTS for group ${groupId}`);
2891
- const audioBuffer = await synthesizeVoiceReply(replyText, ctx, voiceReplyVoice ?? undefined);
2892
- if (audioBuffer) {
2893
- const ext = audioBuffer.ext;
2894
- const mime = ext === ".mp3" ? "audio/mpeg" : ext === ".opus" ? "audio/ogg" : ext === ".wav" ? "audio/wav" : "audio/mpeg";
2895
- const fileInfo = await uploadFile(account.apiBaseUrl, account.botToken, groupId, `voice${ext}`, mime, audioBuffer.data);
3318
+ ctx.log?.warn?.(`ozaiya: TTS failed, falling back to text reply`);
2896
3319
  await sendEncryptedChatContent({
2897
3320
  account,
2898
3321
  groupId,
2899
- content: { text: replyText, files: [fileInfo] },
3322
+ content: { text: "⚠️ Voice synthesis failed, falling back to text." },
2900
3323
  log: ctx.log,
2901
3324
  });
2902
- ctx.log?.info?.(`ozaiya: voice reply sent successfully (${ext}, ${audioBuffer.data.length} bytes)`);
2903
- reportUsage(account, "tts", { provider: "voice-reply" });
2904
- return;
2905
3325
  }
2906
- ctx.log?.warn?.(`ozaiya: TTS failed, falling back to text reply`);
3326
+ ctx.log?.info?.(`ozaiya: sending reply to group ${groupId}`);
2907
3327
  await sendEncryptedChatContent({
2908
3328
  account,
2909
3329
  groupId,
2910
- content: { text: "⚠️ Voice synthesis failed, falling back to text." },
3330
+ content: { text: replyText },
2911
3331
  log: ctx.log,
2912
3332
  });
2913
- }
2914
- ctx.log?.info?.(`ozaiya: sending reply to group ${groupId}`);
2915
- await sendEncryptedChatContent({
2916
- account,
2917
- groupId,
2918
- content: { text: replyText },
2919
- log: ctx.log,
2920
- });
2921
- ctx.log?.info?.(`ozaiya: reply sent successfully`);
2922
- },
2923
- onError: (err) => {
2924
- ctx.log?.warn?.(`ozaiya: reply dispatch error: ${String(err)}`);
3333
+ ctx.log?.info?.(`ozaiya: reply sent successfully`);
3334
+ },
3335
+ onError: (err) => {
3336
+ ctx.log?.warn?.(`ozaiya: reply dispatch error: ${String(err)}`);
3337
+ },
2925
3338
  },
2926
- },
2927
- replyOptions: account.model ? { isHeartbeat: true, heartbeatModelOverride: account.model } : undefined,
2928
- });
3339
+ replyOptions: account.model ? { isHeartbeat: true, heartbeatModelOverride: account.model } : undefined,
3340
+ });
3341
+ }
3342
+ finally {
3343
+ await finalizeTaskProgress(dispatch);
3344
+ // Stop + upload the screen recording, then register it for replay (all best-effort).
3345
+ const recording = dispatch.recording;
3346
+ if (recording) {
3347
+ try {
3348
+ const rec = await stopAndExtractRecording(recording);
3349
+ if (rec && rec.mp4.length > 0) {
3350
+ const filename = `replay-${dispatch.taskId}.mp4`.replace(/[^a-zA-Z0-9_.-]/g, "_");
3351
+ const uploaded = await uploadFile(account.apiBaseUrl, account.botToken, groupId, filename, "video/mp4", rec.mp4);
3352
+ const steps = dispatch.steps.map((s) => ({
3353
+ toolName: s.toolName,
3354
+ label: s.label,
3355
+ status: s.status,
3356
+ category: s.category,
3357
+ startMs: s.startedAt != null ? Math.max(0, s.startedAt - recording.startedAt) : null,
3358
+ endMs: s.completedAt != null ? Math.max(0, s.completedAt - recording.startedAt) : null,
3359
+ }));
3360
+ await fetch(`${account.apiBaseUrl}/v1/bot/task-recordings`, {
3361
+ method: "POST",
3362
+ headers: {
3363
+ Authorization: `Bearer ${account.botToken}`,
3364
+ "Content-Type": "application/json",
3365
+ },
3366
+ body: JSON.stringify({
3367
+ taskId: dispatch.taskId,
3368
+ groupId,
3369
+ videoUrl: uploaded.url,
3370
+ videoPath: uploaded.path ?? null,
3371
+ durationMs: rec.durationMs,
3372
+ recordingStartedAt: recording.startedAt,
3373
+ steps,
3374
+ }),
3375
+ }).catch(() => { });
3376
+ }
3377
+ }
3378
+ catch (err) {
3379
+ ctx.log?.warn?.(`ozaiya: recording upload failed: ${String(err)}`);
3380
+ }
3381
+ }
3382
+ activeDispatches.delete(account.accountId);
3383
+ }
2929
3384
  }
2930
3385
  async function resetRouteSession(route, ctx) {
2931
3386
  const runtime = getOzaiyaRuntime();
@@ -2983,6 +3438,15 @@ async function handleCallbackQuery(payload,
2983
3438
  ctx) {
2984
3439
  const { groupId, messageId, callbackData, buttonText, buttonRowIndex, buttonIndex, from } = payload;
2985
3440
  const account = ctx.account;
3441
+ // Intercept action confirmation callbacks (oz_ac: prefix) — resolve the
3442
+ // pending confirmation promise and don't dispatch to the agent.
3443
+ const actionCallback = parseActionCallback(callbackData);
3444
+ if (actionCallback) {
3445
+ const deps = makeConfirmationDeps(account);
3446
+ const resolved = await resolveConfirmation(deps, actionCallback.actionId, actionCallback.approved);
3447
+ if (resolved)
3448
+ return; // intercepted — don't dispatch to agent
3449
+ }
2986
3450
  const runtime = getOzaiyaRuntime();
2987
3451
  const ch = runtime.channel;
2988
3452
  const { callbackBody, callbackCommand, callbackLabel, callbackOptionIndex } = normalizeCallbackQueryPayload({