openclacky 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/lib/clacky/agent/fake_tool_call_detector.rb +52 -0
  4. data/lib/clacky/agent/session_serializer.rb +3 -2
  5. data/lib/clacky/agent/tool_executor.rb +0 -12
  6. data/lib/clacky/agent.rb +74 -9
  7. data/lib/clacky/api_extension.rb +81 -0
  8. data/lib/clacky/api_extension_loader.rb +13 -1
  9. data/lib/clacky/client.rb +14 -17
  10. data/lib/clacky/default_agents/_panels/time_machine/panel.js +22 -0
  11. data/lib/clacky/default_agents/base_prompt.md +1 -0
  12. data/lib/clacky/default_extensions/meeting/handler.rb +331 -0
  13. data/lib/clacky/default_extensions/meeting/meeting.js +790 -0
  14. data/lib/clacky/default_extensions/meeting/meta.yml +3 -0
  15. data/lib/clacky/default_extensions/meeting/skills/meeting-summarizer/SKILL.md +44 -0
  16. data/lib/clacky/default_skills/media-gen/SKILL.md +63 -0
  17. data/lib/clacky/default_skills/media-gen/scripts/video_seq.sh +114 -0
  18. data/lib/clacky/json_ui_controller.rb +1 -1
  19. data/lib/clacky/media/base.rb +60 -0
  20. data/lib/clacky/media/dashscope.rb +385 -21
  21. data/lib/clacky/media/gemini.rb +9 -0
  22. data/lib/clacky/media/generator.rb +52 -0
  23. data/lib/clacky/media/openai_compat.rb +166 -0
  24. data/lib/clacky/null_ui_controller.rb +13 -0
  25. data/lib/clacky/plain_ui_controller.rb +1 -1
  26. data/lib/clacky/providers.rb +50 -2
  27. data/lib/clacky/rich_ui/rich_ui_controller.rb +1 -1
  28. data/lib/clacky/server/channel/channel_ui_controller.rb +1 -1
  29. data/lib/clacky/server/http_server.rb +144 -9
  30. data/lib/clacky/server/session_registry.rb +4 -2
  31. data/lib/clacky/server/web_ui_controller.rb +3 -2
  32. data/lib/clacky/skill_loader.rb +14 -2
  33. data/lib/clacky/tools/terminal/output_cleaner.rb +1 -3
  34. data/lib/clacky/tools/terminal.rb +0 -43
  35. data/lib/clacky/ui2/components/modal_component.rb +1 -1
  36. data/lib/clacky/ui2/ui_controller.rb +140 -31
  37. data/lib/clacky/ui_interface.rb +10 -1
  38. data/lib/clacky/utils/encoding.rb +25 -0
  39. data/lib/clacky/version.rb +1 -1
  40. data/lib/clacky/web/app.css +145 -22
  41. data/lib/clacky/web/components/onboard.js +1 -14
  42. data/lib/clacky/web/features/brand/view.js +8 -5
  43. data/lib/clacky/web/features/channels/store.js +1 -20
  44. data/lib/clacky/web/features/mcp/store.js +1 -20
  45. data/lib/clacky/web/features/profile/store.js +1 -13
  46. data/lib/clacky/web/features/profile/view.js +16 -4
  47. data/lib/clacky/web/features/skills/store.js +6 -21
  48. data/lib/clacky/web/features/version/store.js +2 -0
  49. data/lib/clacky/web/i18n.js +24 -1
  50. data/lib/clacky/web/index.html +15 -0
  51. data/lib/clacky/web/sessions.js +141 -51
  52. data/lib/clacky/web/settings.js +34 -2
  53. data/lib/clacky/web/ws-dispatcher.js +11 -3
  54. data/lib/clacky.rb +12 -5
  55. metadata +8 -1
@@ -0,0 +1,790 @@
1
+ // Meeting Mode — WebUI Extension
2
+ // Records audio, transcribes via STT, displays live captions,
3
+ // detects wake words to trigger agent, and runs background annotations.
4
+
5
+ (function () {
6
+ const ANNOTATE_INTERVAL_MS = 120000;
7
+ const WAKE_PATTERNS = [/@clacky/i, /小[克客可刻课氪]/, /clacky/i, /克拉奇/];
8
+
9
+ // Self-contained i18n: extensions can't register keys into the host I18n
10
+ // dictionary, so we keep our own table and pick the language via I18n.lang().
11
+ const MEETING_I18N = {
12
+ en: {
13
+ "tab.label": "Meeting",
14
+ "btn.start": "Start Meeting",
15
+ "btn.stop": "End Meeting",
16
+ "btn.resume": "Resume Recording",
17
+ "hint.wake": 'Say "@clacky" or "小克" to ask a question during the meeting.',
18
+ "hint.resume": "A meeting is still in progress. Resume recording to continue (microphone access required again).",
19
+ "status.recording": "Recording",
20
+ "status.transcribing": "Transcribing…",
21
+ "status.listening": "Listening… ({{n}}s)",
22
+ "status.thinking": "Thinking…",
23
+ "status.speaking": "Playing…",
24
+ "vocab.label": "Meeting vocabulary (proper nouns)",
25
+ "vocab.placeholder": "Type a term, press Enter",
26
+ "vocab.save": "Save vocabulary",
27
+ "vocab.saved": "Saved",
28
+ "vocab.saveFailed": "Save failed",
29
+ "annotations.title": "Annotations",
30
+ "captions.empty": "Waiting for speech…",
31
+ "annotations.empty": "No annotations yet",
32
+ "stt.failed": "⚠ Transcription failed: {{msg}}",
33
+ "alert.noSession": "No active session",
34
+ "alert.startFailed": "Failed to start meeting: {{msg}}",
35
+ },
36
+ zh: {
37
+ "tab.label": "会议",
38
+ "btn.start": "开始会议",
39
+ "btn.stop": "结束会议",
40
+ "btn.resume": "继续录音",
41
+ "hint.wake": "会议中说「@clacky」或「小克」即可向我提问。",
42
+ "hint.resume": "有一场会议仍在进行中。点击「继续录音」继续(需要重新授权麦克风)。",
43
+ "status.recording": "录音中",
44
+ "status.transcribing": "识别中…",
45
+ "status.listening": "正在听你说…({{n}}s)",
46
+ "status.thinking": "思考中…",
47
+ "status.speaking": "播放中…",
48
+ "vocab.label": "会议词汇(专有名词)",
49
+ "vocab.placeholder": "输入词汇后回车添加",
50
+ "vocab.save": "保存词汇",
51
+ "vocab.saved": "已保存",
52
+ "vocab.saveFailed": "保存失败",
53
+ "annotations.title": "标注",
54
+ "captions.empty": "正在等待发言…",
55
+ "annotations.empty": "暂无标注",
56
+ "stt.failed": "⚠ 识别失败:{{msg}}",
57
+ "alert.noSession": "没有进行中的会话",
58
+ "alert.startFailed": "开启会议失败:{{msg}}",
59
+ },
60
+ };
61
+
/**
 * Translate a meeting-extension string.
 *
 * The language comes from the host `I18n.lang()` when that global exists,
 * otherwise "en". Lookup order: the table for that language, then the English
 * table, then the key itself.
 *
 * @param {string} key - Entry name in MEETING_I18N, e.g. "btn.start".
 * @param {Object} [vars] - Values substituted for `{{name}}` placeholders.
 * @returns {string} The translated string with placeholders filled in.
 */
function t(key, vars) {
  let lang = "en";
  if (typeof I18n !== "undefined" && I18n.lang) {
    lang = I18n.lang() || "en";
  }
  const table = MEETING_I18N[lang] || MEETING_I18N.en;

  let text;
  if (table[key] != null) {
    text = table[key];
  } else if (MEETING_I18N.en[key] != null) {
    text = MEETING_I18N.en[key];
  } else {
    text = key;
  }

  if (vars) {
    // split/join replaces every occurrence without building a regex.
    for (const name of Object.keys(vars)) {
      text = text.split("{{" + name + "}}").join(vars[name]);
    }
  }
  return text;
}
69
+
70
+ // VAD (voice activity detection) — slice on natural speech pauses instead
71
+ // of a fixed timer, so a sentence is never cut mid-word.
72
+ const VAD_SILENCE_THRESHOLD = 0.012; // RMS below this counts as silence
73
+ const VAD_SILENCE_HOLD_MS = 500; // pause this long => end of utterance
74
+ const VAD_MIN_SPEECH_MS = 400; // ignore utterances shorter than this
75
+ const VAD_MAX_SEGMENT_MS = 12000; // force-cut a very long monologue
76
+
77
+ const HALLUCINATION_PHRASES = new Set([
78
+ "no", "no.", "yes", "yes.", "ok", "okay", "thank you", "thank you.",
79
+ "thanks", "thanks for watching", "thanks for watching!", "you", "bye",
80
+ "uh", "um", "hmm", "mm", "mm-hmm", ".", "..", "...",
81
+ "yeah", "yeah.", "yep", "but", "and", "so", "oh", "ah", "ahh", "huh",
82
+ "an", "a", "i", "the", "more", "well", "right", "hi", "hey", "wow",
83
+ "嗯", "啊", "哦", "呃", "谢谢", "谢谢观看", "谢谢大家", "好", "好的", "对",
84
+ ]);
85
+
// STT models hallucinate single isolated words during silence/noise. Real
// speech segments (gated by VAD_MIN_SPEECH_MS) almost never decode to a lone
// 1-2 word fragment, so treat those as noise.
/**
 * Decide from the shape of a transcript alone whether it is noise.
 *
 * @param {string} text - Transcript text (the caller passes it trimmed).
 * @returns {boolean} true when the text is empty after removing bracketed
 *   annotations, is only punctuation/digits, is a single CJK character, or is
 *   a single word of at most three letters.
 */
function isStructuralNoise(text) {
  // Bracketed annotations such as "(music)" or "[noise]" carry no speech.
  const stripped = text.replace(/[\(\[][^\)\]]*[\)\]]/g, "").trim();
  if (stripped === "") return true;
  // Pure punctuation / digits / timestamp-like fragments ("00:00", "1.", ":").
  if (/^[\s\d.:,;!?。,、!?\-—]+$/.test(stripped)) return true;

  const hasCJK = /[\u4e00-\u9fff\u3040-\u30ff]/.test(stripped);
  if (hasCJK) {
    // A single isolated CJK character is almost always a filler/hallucination.
    const core = stripped.replace(/[\s。,、!?!?.,;:]+/g, "");
    return core.length <= 1;
  }

  const words = stripped.split(/\s+/).filter(Boolean);
  if (words.length !== 1) return false;
  // Count letters of ANY script. Counting only A-Z made every single-word
  // Cyrillic/Hangul/Greek/accented utterance look like a 0-3 letter fragment
  // and silently dropped it.
  const letters = words[0].replace(/[^\p{L}]/gu, "");
  return letters.length <= 3;
}
106
+
/**
 * Decide whether an STT result should be discarded as a hallucination.
 *
 * A result is discarded when isStructuralNoise() flags it, or when its
 * lower-cased, punctuation-normalised form is empty or listed in
 * HALLUCINATION_PHRASES.
 *
 * @param {string} text - Raw transcript text returned by the STT backend.
 * @returns {boolean} true when the text should be dropped.
 */
function isHallucination(text) {
  // Tolerate a missing/non-string value instead of throwing on .trim().
  const trimmed = String(text == null ? "" : text).trim();
  if (isStructuralNoise(trimmed)) return true;
  // ASCII "." is normalised together with the other punctuation so that
  // "Okay." / "Bye." / "Thanks." match their un-punctuated phrase entries.
  const normalized = trimmed.toLowerCase().replace(/[\s.。,,!!??]+/g, " ").trim();
  return normalized === "" || HALLUCINATION_PHRASES.has(normalized);
}
113
+
114
+ let state = {
115
+ active: false,
116
+ sessionId: null,
117
+ meetingId: null,
118
+ mediaRecorder: null,
119
+ annotateTimer: null,
120
+ transcripts: [],
121
+ annotations: [],
122
+ sttError: null,
123
+ audioCtx: null,
124
+ vadRaf: null,
125
+ stream: null,
126
+ vocabulary: "",
127
+ conversationUntil: 0,
128
+ expectingSpeech: false,
129
+ asking: false,
130
+ phase: "idle", // idle | listening | transcribing | conversation | thinking | speaking
131
+ phaseTimer: null,
132
+ transcribing: 0, // count of in-flight STT requests
133
+ container: null,
134
+ resumable: false,
135
+ };
136
+
/**
 * Build the URL of a meeting-extension endpoint.
 *
 * @param {string} path - Endpoint path beginning with "/", e.g. "/start".
 * @returns {string} The path prefixed with "/api/ext/meeting".
 */
function apiUrl(path) {
  const base = "/api/ext/meeting";
  return base + path;
}
140
+
/**
 * POST a JSON body to a meeting-extension endpoint and return the parsed reply.
 *
 * @param {string} path - Endpoint path passed to apiUrl().
 * @param {*} body - Value serialised with JSON.stringify as the request body.
 * @returns {Promise<*>} The parsed JSON response body.
 * @throws {Error} When the response status is not OK (message is the body's
 *   `error` field if present, else "Request failed (<status>)"), or when a
 *   successful response does not contain valid JSON.
 */
async function postJson(path, body) {
  const res = await fetch(apiUrl(path), {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  });

  // An error page (HTML from a proxy, an empty body, ...) is not JSON. Parse
  // defensively so the caller sees the HTTP status rather than a SyntaxError.
  let data = null;
  let parseFailed = false;
  try {
    data = await res.json();
  } catch (_parseErr) {
    parseFailed = true;
  }

  if (!res.ok) {
    throw new Error((data && data.error) || `Request failed (${res.status})`);
  }
  if (parseFailed) {
    throw new Error(`Invalid JSON response (${res.status})`);
  }
  return data;
}
151
+
/**
 * Resolve the session the meeting belongs to.
 *
 * Prefers the session id already stored in `state`; otherwise reads the host's
 * `Clacky.ext.context.sessionId` when the `Clacky.ext` global is present.
 *
 * @returns {?string} The session id, or null when none is available.
 */
function currentSessionId() {
  if (state.sessionId) return state.sessionId;
  const hostSessionId = window.Clacky && Clacky.ext && Clacky.ext.context.sessionId;
  return hostSessionId || null;
}
155
+
// After a page refresh the browser forgets everything but the backend still
// has the meeting. Probe for it and, if found, restore captions and offer a
// "Resume Recording" button (mic access can't survive a refresh).
/**
 * Ask the backend whether the current session has a meeting in progress and,
 * if so, put the panel into the "resumable" state.
 *
 * Probing is best-effort: network errors, non-OK responses and malformed
 * replies leave `state` and the rendered panel untouched.
 *
 * @param {HTMLElement} container - Panel container handed to renderUI().
 * @returns {Promise<void>}
 */
async function probeActiveMeeting(container) {
  const sessionId = currentSessionId();
  if (!sessionId || state.active) return;

  let data;
  try {
    const res = await fetch(apiUrl("/active/" + encodeURIComponent(sessionId)));
    if (!res.ok) return;
    data = await res.json();
  } catch (_e) {
    // probing is best-effort; nothing changed, so there is nothing to redraw
    return;
  }
  if (!data || !data.active) return;
  // The user may have started a meeting while the probe was in flight; do not
  // overwrite the live meeting's id or captions.
  if (state.active) return;

  const entries = Array.isArray(data.transcript) ? data.transcript : [];
  state.sessionId = sessionId;
  state.meetingId = data.meeting_id;
  state.resumable = true;
  state.transcripts = entries
    .map((e) => ({
      // Fall back to "now" when the timestamp is missing or unparseable.
      ts: e && e.ts ? Date.parse(e.ts) || Date.now() : Date.now(),
      text: String((e && e.text) || "").trim(),
    }))
    .filter((e) => e.text);

  renderUI(container);
}
178
+
/**
 * Start (or resume) a meeting: acquire the microphone, register the meeting
 * with the backend unless one is being resumed, start VAD recording and the
 * periodic annotation timer, then redraw the panel.
 *
 * On any failure the microphone, audio context and timer acquired so far are
 * released again and the user is alerted.
 *
 * @param {HTMLElement} container - Panel container handed to renderUI().
 * @returns {Promise<void>}
 */
async function startMeeting(container) {
  const sessionId = currentSessionId();
  if (!sessionId) {
    alert(t("alert.noSession"));
    return;
  }
  // Ignore a second click while the first start is still awaiting the
  // permission prompt or the backend; it would open a second recording.
  if (state.active || state.starting) return;
  state.starting = true;

  let stream = null;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Resuming an in-progress meeting (after a page refresh): keep the
    // existing meeting_id so new captions append to the same transcript.
    if (!(state.resumable && state.meetingId)) {
      const data = await postJson("/start", { session_id: sessionId });
      state.meetingId = data.meeting_id;
    }
    state.active = true;
    state.sessionId = sessionId;
    state.resumable = false;
    state.stream = stream;

    startVadRecording(stream);

    state.annotateTimer = setInterval(() => runAnnotate(), ANNOTATE_INTERVAL_MS);

    renderUI(container);
  } catch (err) {
    // Roll back: without this the microphone stays open (browser recording
    // indicator on) and state.active stays true after a failed start.
    state.active = false;
    if (state.annotateTimer) {
      clearInterval(state.annotateTimer);
      state.annotateTimer = null;
    }
    if (stream) stream.getTracks().forEach((track) => track.stop());
    state.stream = null;
    if (state.audioCtx) {
      const ctx = state.audioCtx;
      state.audioCtx = null;
      try {
        const closing = ctx.close();
        if (closing && closing.catch) closing.catch(() => {});
      } catch (_closeErr) {
        // the context may already be closed; nothing more to release
      }
    }
    console.error("[meeting] start failed:", err);
    alert(t("alert.startFailed", { msg: err.message }));
  } finally {
    state.starting = false;
  }
}
209
+
// Records continuously and cuts a segment only when speech is followed by a
// sustained pause (or the segment grows too long). Each segment is a fresh,
// self-contained webm so the STT backend can always decode it.
/**
 * Start the voice-activity-driven recording loop on `stream`.
 *
 * An AnalyserNode measures the input RMS once per animation frame. A
 * MediaRecorder is stopped (and immediately replaced while the meeting is
 * active) when speech is followed by VAD_SILENCE_HOLD_MS of silence or when
 * the segment reaches VAD_MAX_SEGMENT_MS. A stopped segment is passed to
 * sendAudioChunk() only if it contained at least VAD_MIN_SPEECH_MS of speech.
 *
 * Stores the AudioContext, current recorder and animation-frame handle in
 * `state` (audioCtx, mediaRecorder, vadRaf).
 *
 * @param {MediaStream} stream - Microphone stream to record.
 */
function startVadRecording(stream) {
  const mime = getSupportedMime();
  const AudioCtx = window.AudioContext || window.webkitAudioContext;
  const audioCtx = new AudioCtx();
  state.audioCtx = audioCtx;
  // A context created after awaited work may start "suspended"; the analyser
  // would then report silence forever and no segment would ever be cut.
  if (audioCtx.state === "suspended" && typeof audioCtx.resume === "function") {
    const resuming = audioCtx.resume();
    if (resuming && resuming.catch) {
      resuming.catch((err) => console.warn("[meeting] audio context resume failed:", err));
    }
  }
  const source = audioCtx.createMediaStreamSource(stream);
  const analyser = audioCtx.createAnalyser();
  analyser.fftSize = 1024;
  source.connect(analyser);
  const buf = new Float32Array(analyser.fftSize);

  let recorder = null;
  let chunks = [];
  let hadSpeech = false;
  let segmentStart = 0;
  let silenceStart = 0;
  // First / last frame of the current segment whose level was above the
  // threshold; their distance is the voiced duration of the segment.
  let firstVoiceAt = 0;
  let lastVoiceAt = 0;

  function newRecorder() {
    const r = new MediaRecorder(stream, mime ? { mimeType: mime } : {});
    chunks = [];
    hadSpeech = false;
    segmentStart = performance.now();
    silenceStart = 0;
    firstVoiceAt = 0;
    lastVoiceAt = 0;
    r.ondataavailable = (e) => { if (e.data && e.data.size > 0) chunks.push(e.data); };
    r.onstop = () => {
      // Snapshot the segment before newRecorder() resets the shared counters.
      const captured = chunks;
      const speech = hadSpeech;
      // Gate on how long speech actually lasted. Measuring from segment start
      // would include leading silence, letting a single click that follows a
      // quiet period through to the STT backend.
      const voicedMs = lastVoiceAt - firstVoiceAt;
      if (state.active) newRecorder();
      if (speech && voicedMs >= VAD_MIN_SPEECH_MS && captured.length) {
        sendAudioChunk(new Blob(captured, mime ? { type: mime } : {}));
      }
    };
    r.start(200); // emit chunks every 200ms so a cut loses nothing
    recorder = r;
    state.mediaRecorder = r;
  }

  // Root-mean-square level of the analyser's current time-domain frame.
  function rms() {
    analyser.getFloatTimeDomainData(buf);
    let sum = 0;
    for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
    return Math.sqrt(sum / buf.length);
  }

  function tick() {
    if (!state.active) return;
    const now = performance.now();
    const level = rms();

    if (level >= VAD_SILENCE_THRESHOLD) {
      if (!hadSpeech) firstVoiceAt = now;
      hadSpeech = true;
      lastVoiceAt = now;
      silenceStart = 0;
    } else if (hadSpeech) {
      if (silenceStart === 0) silenceStart = now;
      else if (now - silenceStart >= VAD_SILENCE_HOLD_MS) {
        // onstop fires asynchronously; the state check prevents a second
        // stop() on later frames before the replacement recorder exists.
        if (recorder.state === "recording") recorder.stop();
      }
    }

    if (now - segmentStart >= VAD_MAX_SEGMENT_MS && recorder.state === "recording") {
      recorder.stop();
    }

    state.vadRaf = requestAnimationFrame(tick);
  }

  newRecorder();
  state.vadRaf = requestAnimationFrame(tick);
}
283
+
/**
 * End the meeting: stop the VAD loop, recorder, microphone tracks, audio
 * context and annotation timer, tell the backend via "/end", then reset the
 * meeting state and redraw the panel.
 *
 * A failing "/end" request is logged and does not prevent the local reset.
 *
 * @param {HTMLElement} container - Panel container handed to renderUI().
 * @returns {Promise<void>}
 */
async function stopMeeting(container) {
  // Capture the ids first: they are cleared at the end of this function.
  const sessionId = state.sessionId;
  const meetingId = state.meetingId;

  state.active = false;
  if (state.vadRaf) cancelAnimationFrame(state.vadRaf);
  if (state.mediaRecorder && state.mediaRecorder.state !== "inactive") {
    state.mediaRecorder.stop();
  }
  if (state.stream) {
    state.stream.getTracks().forEach((track) => track.stop());
  }
  if (state.audioCtx) {
    try {
      await state.audioCtx.close();
    } catch (_) {
      // closing is best-effort; the context is dropped below either way
    }
  }
  clearInterval(state.annotateTimer);

  try {
    const result = await postJson("/end", {
      session_id: sessionId,
      meeting_id: meetingId,
    });
    if (result && result.ok === false) {
      console.error("[meeting] end summarization failed:", result.error);
    } else if (result && result.skipped) {
      console.warn("[meeting] end: transcript was empty, no summary generated");
    }
  } catch (err) {
    console.error("[meeting] end failed:", err);
  }

  state.mediaRecorder = null;
  state.annotateTimer = null;
  state.transcripts = [];
  state.annotations = [];
  state.sttError = null;
  state.audioCtx = null;
  state.vadRaf = null;
  state.stream = null;
  state.conversationUntil = 0;
  state.expectingSpeech = false;
  state.asking = false;
  state.transcribing = 0;
  state.resumable = false;
  // Forget the finished meeting. currentSessionId() prefers state.sessionId,
  // so a stale value would pin the next meeting (and the next probe) to this
  // session even after the user switched to another one.
  state.sessionId = null;
  state.meetingId = null;
  state.phase = "idle";
  stopStatusTicker();
  renderUI(container);
}
328
+
const MIN_AUDIO_BYTES = 2000; // drop near-empty blobs before hitting STT

/**
 * Transcribe one recorded segment and append the result to the captions.
 *
 * The blob is base64-encoded and posted to "/transcribe". A non-empty result
 * that is not a hallucination is pushed onto `state.transcripts`, checked for
 * a wake word and rendered. Failures are stored in `state.sttError` and shown
 * through updateSttError(); this function never rejects.
 *
 * @param {Blob} blob - Audio segment produced by the recorder.
 * @returns {Promise<void>}
 */
async function sendAudioChunk(blob) {
  if (!blob || blob.size < MIN_AUDIO_BYTES) return; // too little audio to be speech

  // Remember which meeting this segment belongs to; the reply may arrive
  // after the meeting was stopped.
  const sessionId = state.sessionId;
  const meetingId = state.meetingId;

  state.transcribing++;
  updateStatus();
  try {
    // Reading the blob is inside the try so a failure is reported like any
    // other STT error instead of becoming an unhandled rejection.
    const bytes = new Uint8Array(await blob.arrayBuffer());
    // Convert in slices: one String.fromCharCode call per 32 KiB instead of
    // one string concatenation per byte.
    const SLICE = 0x8000;
    const parts = [];
    for (let i = 0; i < bytes.length; i += SLICE) {
      parts.push(String.fromCharCode.apply(null, bytes.subarray(i, i + SLICE)));
    }
    const base64 = btoa(parts.join(""));

    const result = await postJson("/transcribe", {
      session_id: sessionId,
      meeting_id: meetingId,
      audio_base64: base64,
      mime_type: blob.type,
      vocabulary: state.vocabulary,
    });

    // The meeting ended (or another one started) while STT was running: the
    // caption must not leak into the new state or trigger a wake word.
    if (!state.active || state.meetingId !== meetingId) return;

    const text = result && typeof result.text === "string" ? result.text.trim() : "";
    if (text && !isHallucination(text)) {
      const entry = { ts: Date.now(), text: text };
      state.transcripts.push(entry);
      checkWakeWord(entry.text);
      updateCaptions();
    }
    if (state.sttError) {
      // A successful round-trip clears the previously shown error.
      state.sttError = null;
      updateSttError();
    }
  } catch (e) {
    console.error("STT failed:", e.message);
    state.sttError = e.message;
    updateSttError();
  } finally {
    state.transcribing = Math.max(0, state.transcribing - 1);
    updateStatus();
  }
}
369
+
const CONVERSATION_WINDOW_MS = 30000; // after a wake word, keep listening this long without re-triggering

/**
 * Forward a caption to the agent as a question when it contains a wake word
 * or arrives inside the conversation window opened by a previous question.
 *
 * Wake words are stripped from the text first. Nothing is sent while an
 * earlier question is still being answered, or when no actual words remain.
 * Sending a question (re)opens the conversation window and marks the next
 * assistant reply as one to be spoken.
 *
 * @param {string} text - A transcribed caption.
 */
function checkWakeWord(text) {
  const triggered = WAKE_PATTERNS.some((p) => p.test(text));
  const inConversation = state.conversationUntil && Date.now() < state.conversationUntil;

  if (!triggered && !inConversation) return;
  if (state.asking) return; // a question is still being answered; don't pile on

  const question = text
    .replace(/@clacky/gi, "")
    .replace(/clacky/gi, "")
    .replace(/小[克客可刻课氪]/g, "")
    .replace(/克拉奇/g, "")
    // "小克,帮我…" leaves the separator behind; drop leading punctuation.
    .replace(/^[\s,,。.、!!??::;;\-—]+/, "")
    .trim();
  // A bare wake word ("Clacky?") leaves nothing, or only punctuation, to ask.
  if (!question || !/[\p{L}\p{N}]/u.test(question)) return;

  state.conversationUntil = Date.now() + CONVERSATION_WINDOW_MS;
  state.expectingSpeech = true;
  state.asking = true;
  postJson("/ask", {
    session_id: state.sessionId,
    meeting_id: state.meetingId,
    question: question,
  }).catch((e) => {
    // No reply will come for a failed request, so release the lock here.
    state.asking = false;
    console.error("[meeting] ask failed:", e.message);
  });
}
394
+
/**
 * Fetch new background annotations for the active meeting and render them.
 *
 * Does nothing when no meeting is active. Failures are logged and otherwise
 * ignored, because annotation is a periodic best-effort job.
 *
 * @returns {Promise<void>}
 */
async function runAnnotate() {
  if (!state.active) return;
  const meetingId = state.meetingId;
  try {
    const result = await postJson("/annotate", {
      session_id: state.sessionId,
      meeting_id: meetingId,
    });
    // The meeting may have ended while the request was in flight.
    if (!state.active || state.meetingId !== meetingId) return;
    const items = result && Array.isArray(result.annotations) ? result.annotations : [];
    if (items.length > 0) {
      state.annotations.push(...items);
      updateAnnotations();
    }
  } catch (err) {
    // Keep the meeting running, but leave a trace for debugging.
    console.warn("[meeting] annotate failed:", err && err.message);
  }
}
408
+
// Single source of truth for the header status line. Priority high→low:
// speaking > thinking > conversation(countdown) > transcribing > listening.
/**
 * Write the current status text and CSS class into the "#meeting-status"
 * element. Does nothing when that element is not in the document.
 */
function updateStatus() {
  const label = document.getElementById("meeting-status");
  if (!label) return;

  const now = Date.now();
  // Returns [css suffix, text] for the highest-priority condition that holds.
  const pick = () => {
    if (state.phase === "speaking") return ["speaking", t("status.speaking")];
    if (state.asking) return ["thinking", t("status.thinking")];
    if (state.conversationUntil && now < state.conversationUntil) {
      const secondsLeft = Math.ceil((state.conversationUntil - now) / 1000);
      return ["listening", t("status.listening", { n: secondsLeft })];
    }
    if (state.transcribing > 0) return ["transcribing", t("status.transcribing")];
    return ["recording", t("status.recording")];
  };

  const [cls, text] = pick();
  label.textContent = " " + text;
  label.className = "meeting-status meeting-status-" + cls;
}
437
+
/**
 * (Re)start the timer that refreshes the status line every 500 ms.
 * Any ticker already running is stopped first, and the status is drawn once
 * immediately. The interval handle is kept in `state.phaseTimer`.
 */
function startStatusTicker() {
  const REFRESH_MS = 500;
  stopStatusTicker(); // never leave two tickers running
  updateStatus();
  state.phaseTimer = setInterval(updateStatus, REFRESH_MS);
}
443
+
/**
 * Stop the status-line refresh timer, if one is running, and clear
 * `state.phaseTimer`.
 */
function stopStatusTicker() {
  if (!state.phaseTimer) return;
  clearInterval(state.phaseTimer);
  state.phaseTimer = null;
}
450
+
/**
 * Show or hide the transcription-error banner ("#meeting-stt-error")
 * according to `state.sttError`. Does nothing when the banner is not in the
 * document.
 */
function updateSttError() {
  const banner = document.getElementById("meeting-stt-error");
  if (!banner) return;

  const message = state.sttError;
  if (!message) {
    banner.textContent = "";
    banner.style.display = "none";
    return;
  }
  banner.textContent = t("stt.failed", { msg: message });
  banner.style.display = "block";
}
462
+
/**
 * Render the 20 most recent transcript entries into "#meeting-captions" and
 * scroll the list to the bottom. Shows the localized empty placeholder when
 * there are no entries; does nothing when the element is not in the document.
 */
function updateCaptions() {
  const list = document.getElementById("meeting-captions");
  if (!list) return;

  const visible = state.transcripts.slice(-20);
  if (visible.length === 0) {
    list.innerHTML = `<div class="meeting-empty">${escHtml(t("captions.empty"))}</div>`;
    return;
  }

  let html = "";
  for (const entry of visible) {
    const clock = new Date(entry.ts).toLocaleTimeString();
    // Caption text comes from STT output, so it is escaped before insertion.
    html += `<div class="meeting-caption"><span class="meeting-ts">${clock}</span> ${escHtml(entry.text)}</div>`;
  }
  list.innerHTML = html;
  list.scrollTop = list.scrollHeight;
}
479
+
/**
 * Render the 10 most recent annotations into "#meeting-annotations", each
 * prefixed with an icon chosen by its `type` ("decision", "action", or
 * anything else). Shows the localized empty placeholder when there are none;
 * does nothing when the element is not in the document.
 */
function updateAnnotations() {
  const list = document.getElementById("meeting-annotations");
  if (!list) return;

  const visible = state.annotations.slice(-10);
  if (visible.length === 0) {
    list.innerHTML = `<div class="meeting-empty">${escHtml(t("annotations.empty"))}</div>`;
    return;
  }

  const icons = new Map([
    ["decision", "📋"],
    ["action", "✅"],
  ]);
  const rows = [];
  for (const note of visible) {
    const icon = icons.get(note.type) || "💡";
    rows.push(`<div class="meeting-annotation">${icon} ${escHtml(note.text)}</div>`);
  }
  list.innerHTML = rows.join("");
}
495
+
/**
 * Escape a value for insertion into HTML markup by letting the DOM serialise
 * it: the value is assigned as the text of a detached <div> and the
 * resulting markup is read back.
 *
 * @param {*} s - Value to escape.
 * @returns {string} The HTML-escaped text.
 */
function escHtml(s) {
  const scratch = document.createElement("div");
  scratch.textContent = s;
  const escaped = scratch.innerHTML;
  return escaped;
}
501
+
/**
 * Pick the first recording MIME type that `MediaRecorder.isTypeSupported`
 * accepts, in order of preference.
 *
 * @returns {string} The chosen MIME type, or "" when none is supported.
 */
function getSupportedMime() {
  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
  const supported = candidates.find((mime) => MediaRecorder.isTypeSupported(mime));
  return supported || "";
}
509
+
/**
 * Rebuild the whole meeting panel inside `container` from `state`.
 *
 * While no meeting is active the panel shows the start/resume button, a hint,
 * the vocabulary editor (hidden in the resume state) and, when resuming, the
 * captions captured so far. While a meeting is active it shows the status
 * header with the stop button, the STT error banner, captions and
 * annotations, and the status ticker is running.
 *
 * @param {HTMLElement} container - Element whose children are replaced.
 */
function renderUI(container) {
  if (!container) return;
  container.replaceChildren();

  const wrapper = createMeetingEl("div", "meeting-panel");
  let restoreCaptions = false;
  if (state.active) {
    buildActiveView(wrapper, container);
  } else {
    restoreCaptions = buildIdleView(wrapper, container);
  }
  container.appendChild(wrapper);

  // The update* helpers look elements up by id, so they run only after the
  // wrapper has been attached to the container.
  if (state.active) {
    updateCaptions();
    updateAnnotations();
    updateSttError();
    startStatusTicker();
  } else {
    stopStatusTicker();
    if (restoreCaptions) updateCaptions();
  }
}

// Create an element with a class name and, optionally, text content.
function createMeetingEl(tag, className, text) {
  const node = document.createElement(tag);
  node.className = className;
  if (text != null) node.textContent = text;
  return node;
}

// Fill `wrapper` with the not-recording view. Returns true when a captions
// list was added and should be filled with the restored transcript.
function buildIdleView(wrapper, container) {
  const startBtn = createMeetingEl(
    "button",
    "meeting-btn meeting-btn-start",
    state.resumable ? t("btn.resume") : t("btn.start")
  );
  startBtn.onclick = () => startMeeting(container);
  wrapper.appendChild(startBtn);

  wrapper.appendChild(
    createMeetingEl("p", "meeting-hint", state.resumable ? t("hint.resume") : t("hint.wake"))
  );

  // Vocabulary can only be set before a meeting starts; once a meeting is
  // in progress (resume state), hide the editor since changes won't apply.
  if (!state.resumable) {
    wrapper.appendChild(buildVocabEditor());
  }

  // When resuming after a refresh, show the captions captured so far.
  if (state.resumable && state.transcripts.length) {
    const captions = createMeetingEl("div", "meeting-captions");
    captions.id = "meeting-captions";
    wrapper.appendChild(captions);
    return true;
  }
  return false;
}

// Build the tag-style vocabulary editor. Terms live in state.vocabulary as a
// comma-separated string and are saved to the backend on every change.
function buildVocabEditor() {
  const section = createMeetingEl("div", "meeting-vocab-section");
  section.appendChild(createMeetingEl("label", "meeting-vocab-label", t("vocab.label")));

  const savedHint = createMeetingEl("span", "meeting-vocab-saved");
  const box = createMeetingEl("div", "meeting-vocab-box");
  const input = createMeetingEl("input", "meeting-vocab-tag-input");
  input.type = "text";
  input.placeholder = t("vocab.placeholder");

  // Accepts both the ASCII and the full-width comma as separators.
  const parseTerms = (s) =>
    String(s || "").split(/[,,]/).map((x) => x.trim()).filter(Boolean);

  const persist = async (terms) => {
    const value = terms.join(", ");
    state.vocabulary = value;
    try {
      await postJson("/vocabulary", { vocabulary: value });
      savedHint.textContent = t("vocab.saved");
      setTimeout(() => { savedHint.textContent = ""; }, 2000);
    } catch (err) {
      savedHint.textContent = t("vocab.saveFailed");
    }
  };

  const renderTags = () => {
    box.querySelectorAll(".meeting-vocab-tag").forEach((el) => el.remove());
    parseTerms(state.vocabulary).forEach((term, i) => {
      const tag = createMeetingEl("span", "meeting-vocab-tag");
      const label = document.createElement("span");
      label.textContent = term;
      const removeBtn = createMeetingEl("button", "meeting-vocab-tag-x", "×");
      removeBtn.type = "button";
      removeBtn.onclick = () => {
        const next = parseTerms(state.vocabulary);
        next.splice(i, 1);
        persist(next);
        renderTags();
      };
      tag.appendChild(label);
      tag.appendChild(removeBtn);
      box.insertBefore(tag, input);
    });
  };

  const addTerm = (raw) => {
    const term = String(raw || "").trim();
    if (!term) return;
    const terms = parseTerms(state.vocabulary);
    if (terms.includes(term)) { input.value = ""; return; }
    terms.push(term);
    persist(terms);
    input.value = "";
    renderTags();
  };

  input.addEventListener("keydown", (e) => {
    // While an IME is composing (e.g. typing Chinese), Enter confirms the
    // candidate rather than the term; acting on it would commit a half-typed
    // word. keyCode 229 covers browsers that do not set isComposing here.
    if (e.isComposing || e.keyCode === 229) return;
    if (e.key === "Enter" || e.key === ",") {
      e.preventDefault();
      addTerm(input.value);
    } else if (e.key === "Backspace" && !input.value) {
      const terms = parseTerms(state.vocabulary);
      if (terms.length) { terms.pop(); persist(terms); renderTags(); }
    }
  });
  input.addEventListener("blur", () => addTerm(input.value));
  box.onclick = () => input.focus();

  box.appendChild(input);
  renderTags();
  section.appendChild(box);
  section.appendChild(savedHint);
  return section;
}

// Fill `wrapper` with the recording view: status header with stop button,
// STT error banner, captions list and annotations section.
function buildActiveView(wrapper, container) {
  const header = createMeetingEl("div", "meeting-header");
  header.appendChild(createMeetingEl("span", "meeting-recording-dot"));
  const status = createMeetingEl("span", "meeting-status");
  status.id = "meeting-status";
  header.appendChild(status);
  const stopBtn = createMeetingEl("button", "meeting-btn meeting-btn-stop", t("btn.stop"));
  stopBtn.onclick = () => stopMeeting(container);
  header.appendChild(stopBtn);
  wrapper.appendChild(header);

  const sttError = createMeetingEl("div", "meeting-stt-error");
  sttError.id = "meeting-stt-error";
  sttError.style.display = "none";
  wrapper.appendChild(sttError);

  const captions = createMeetingEl("div", "meeting-captions");
  captions.id = "meeting-captions";
  wrapper.appendChild(captions);

  const annoSection = createMeetingEl("div", "meeting-annotations-section");
  annoSection.appendChild(createMeetingEl("h4", "", t("annotations.title")));
  const annoList = createMeetingEl("div", "meeting-annotations");
  annoList.id = "meeting-annotations";
  annoSection.appendChild(annoList);
  wrapper.appendChild(annoSection);
}
681
+
// Strip markdown so TTS reads clean prose, not symbols.
/**
 * Reduce a markdown string to plain prose.
 *
 * Removes fenced code blocks and images, keeps the text of inline code and
 * links, drops the characters * _ # > ~ and collapses whitespace.
 *
 * @param {*} md - Markdown source; null/undefined yields "".
 * @returns {string} The plain-text rendering.
 */
function plainText(md) {
  // String(undefined) is "undefined", which would be read aloud verbatim.
  if (md == null) return "";
  return String(md)
    .replace(/```[\s\S]*?```/g, " ")
    .replace(/`([^`]*)`/g, "$1")
    .replace(/!\[[^\]]*\]\([^)]*\)/g, " ")
    .replace(/\[([^\]]*)\]\([^)]*\)/g, "$1")
    .replace(/[*_#>~]/g, "")
    .replace(/\s+/g, " ")
    .trim();
}
693
+
/**
 * Read the agent's answer aloud.
 *
 * The markdown is reduced to plain text, synthesised through "/speak" and
 * played with an Audio element. `state.phase` is "speaking" while the clip
 * plays and returns to "idle" when it ends or fails. Failures are logged and
 * never thrown.
 *
 * @param {*} content - Markdown content of the assistant message.
 * @returns {Promise<void>}
 */
async function speakAnswer(content) {
  const text = plainText(content);
  if (!text) return;
  try {
    const data = await postJson("/speak", { text: text });
    if (!data || !data.audio_base64) return;
    // Synthesis takes a while; do not start talking after the meeting ended.
    if (!state.active) return;

    const audio = new Audio(`data:${data.mime_type || "audio/wav"};base64,${data.audio_base64}`);
    state.phase = "speaking";
    updateStatus();
    const clear = () => { state.phase = "idle"; updateStatus(); };
    audio.onended = clear;
    audio.onerror = clear;
    // play() rejects when playback is blocked; treat that like the clip ending.
    audio.play().catch(clear);
  } catch (e) {
    state.phase = "idle";
    updateStatus();
    console.error("[meeting] TTS failed:", e.message);
  }
}
713
+
// Speak the agent's reply aloud, but only while a meeting is live.
// Any assistant message for this session also releases the "asking" lock, so
// the next wake word can be handled.
Clacky.ext.subscribe("session:assistant-message", function (payload) {
  // Ignore messages that are explicitly tagged with a different session.
  const foreign =
    payload && payload.sessionId && state.sessionId && payload.sessionId !== state.sessionId;
  if (foreign) return;

  state.asking = false;
  // Only replies to a question asked through the meeting are spoken.
  if (!(state.active && state.expectingSpeech)) return;
  state.expectingSpeech = false;
  speakAnswer(payload && payload.content);
});
723
+
// Register as a tab in the session aside panel
Clacky.ext.ui.mount("session.aside", function (ctx) {
  const container = document.createElement("div");
  container.className = "meeting-container";
  state.container = container;
  renderUI(container);

  // Load saved vocabulary, then probe for an in-progress meeting so a page
  // refresh restores the captions instead of silently dropping them.
  const restore = async () => {
    try {
      const res = await fetch(apiUrl("/vocabulary"));
      const saved = await res.json();
      state.vocabulary = (saved && saved.vocabulary) || "";
      renderUI(container);
    } catch (_err) {
      // a failed vocabulary load is ignored; the probe below still runs
    }
    return probeActiveMeeting(container);
  };
  restore();

  // The container is returned right away; restore() fills it in later.
  return container;
}, {
  tab: { id: "meeting", label: () => t("tab.label") },
  order: 200,
});
747
+
// Re-render on language switch so all labels follow the host language.
// Nothing is drawn until the panel has been mounted (state.container set).
document.addEventListener("langchange", function () {
  const mounted = state.container;
  if (!mounted) return;
  renderUI(mounted);
});
752
+
753
+ // Inject minimal styles
754
+ const style = document.createElement("style");
755
+ style.textContent = `
756
+ .meeting-container { padding: 16px; font-size: 13px; color: var(--color-text-secondary); }
757
+ .meeting-btn { padding: 7px 14px; border-radius: var(--radius-sm); border: 1px solid transparent; cursor: pointer; font-size: 13px; font-weight: 500; transition: background .15s, border-color .15s; }
758
+ .meeting-btn-start { background: var(--color-button-primary); color: var(--color-button-primary-text); }
759
+ .meeting-btn-start:hover { background: var(--color-button-primary-hover); }
760
+ .meeting-btn-stop { background: transparent; color: var(--color-error); border-color: var(--color-error-border); margin-left: auto; }
761
+ .meeting-btn-stop:hover { background: var(--color-error-bg); }
762
+ .meeting-hint { color: var(--color-text-tertiary); margin: 10px 0 0; font-size: 12px; line-height: 1.5; }
763
+ .meeting-vocab-section { margin-top: 20px; border-top: 1px solid var(--color-border-primary); padding-top: 16px; }
764
+ .meeting-vocab-label { display: block; font-size: 12px; color: var(--color-text-tertiary); margin-bottom: 6px; }
765
+ .meeting-vocab-box { display: flex; flex-wrap: wrap; gap: 6px; align-items: center; width: 100%; box-sizing: border-box; background: var(--color-bg-input); border: 1px solid var(--color-border-primary); border-radius: var(--radius-sm); padding: 6px 8px; cursor: text; min-height: 36px; }
766
+ .meeting-vocab-box:focus-within { border-color: var(--color-accent-primary); }
767
+ .meeting-vocab-tag { display: inline-flex; align-items: center; gap: 4px; background: var(--color-bg-hover); color: var(--color-text-primary); border: 1px solid var(--color-border-primary); border-radius: var(--radius-sm); padding: 2px 4px 2px 8px; font-size: 12px; line-height: 1.4; }
768
+ .meeting-vocab-tag-x { background: none; border: none; color: var(--color-text-muted); cursor: pointer; font-size: 14px; line-height: 1; padding: 0 2px; }
769
+ .meeting-vocab-tag-x:hover { color: var(--color-text-primary); }
770
+ .meeting-vocab-tag-input { flex: 1; min-width: 80px; background: none; border: none; outline: none; color: var(--color-text-primary); font-size: 12px; font-family: inherit; padding: 2px 0; }
771
+ .meeting-vocab-tag-input::placeholder { color: var(--color-text-muted); }
772
+ .meeting-vocab-saved { display: block; font-size: 12px; color: var(--color-success); margin-top: 8px; min-height: 14px; }
773
+ .meeting-header { display: flex; align-items: center; gap: 8px; margin-bottom: 14px; }
774
+ .meeting-recording-dot { width: 8px; height: 8px; border-radius: 50%; background: var(--color-error); animation: meeting-pulse 1.5s infinite; flex: none; }
775
+ @keyframes meeting-pulse { 0%,100% { opacity: 1; } 50% { opacity: 0.3; } }
776
+ .meeting-status { font-size: 13px; color: var(--color-text-secondary); }
777
+ .meeting-status-thinking, .meeting-status-listening { color: var(--color-accent-primary); }
778
+ .meeting-status-speaking { color: var(--color-success); }
779
+ .meeting-stt-error { background: var(--color-error-bg); color: var(--color-error); border: 1px solid var(--color-error-border); border-radius: var(--radius-sm); padding: 8px 10px; margin-bottom: 12px; font-size: 12px; line-height: 1.4; }
780
+ .meeting-captions { max-height: 300px; overflow-y: auto; border: 1px solid var(--color-border-primary); border-radius: var(--radius-sm); padding: 10px; margin: 14px 0; background: var(--color-bg-secondary); }
781
+ .meeting-caption { margin-bottom: 5px; line-height: 1.5; color: var(--color-text-primary); }
782
+ .meeting-ts { color: var(--color-text-muted); font-size: 11px; margin-right: 6px; }
783
+ .meeting-annotations-section { border-top: 1px solid var(--color-border-primary); padding-top: 12px; }
784
+ .meeting-annotations-section h4 { margin: 0 0 8px; font-size: 12px; font-weight: 600; color: var(--color-text-tertiary); }
785
+ .meeting-annotation { margin-bottom: 5px; font-size: 12px; line-height: 1.5; }
786
+ .meeting-empty { color: var(--color-text-muted); font-size: 12px; padding: 2px 0; }
787
+ .meeting-captions:has(.meeting-empty) { border: none; padding: 0; background: none; }
788
+ `;
789
+ document.head.appendChild(style);
790
+ })();