openbuilder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1613 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * builder-join.ts — Join a Google Meet meeting and capture live captions
4
+ *
5
+ * Usage:
6
+ * npx openbuilder join <meet-url> --auth
7
+ * npx openbuilder join <meet-url> --anon --bot-name "OpenBuilder Bot"
8
+ * npx openbuilder join <meet-url> --auth --duration 60m
9
+ *
10
+ * Extends OpenUtter patterns with OpenBuilder's AI report generation.
11
+ * When a meeting ends, automatically generates an AI-powered report
12
+ * if an API key is configured.
13
+ */
14
+
15
+ import { execSync } from "node:child_process";
16
+ import {
17
+ appendFileSync,
18
+ existsSync,
19
+ mkdirSync,
20
+ readFileSync,
21
+ unlinkSync,
22
+ writeFileSync,
23
+ } from "node:fs";
24
+ import { homedir } from "node:os";
25
+ import { join } from "node:path";
26
+
27
+ import {
28
+ AUTH_FILE,
29
+ OPENBUILDER_DIR,
30
+ PID_FILE,
31
+ SCREENSHOT_READY_FILE,
32
+ TRANSCRIPTS_DIR,
33
+ WORKSPACE_DIR,
34
+ getConfig,
35
+ ensureDirs,
36
+ } from "../src/utils/config.js";
37
+
38
+ type PlaywrightMod = typeof import("playwright-core");
39
+ type Page = import("playwright-core").Page;
40
+ type BrowserContext = import("playwright-core").BrowserContext;
41
+
42
+ // ── Send image/message to user's chat via openclaw ──────────────────────
43
+
44
+ import { exec as execAsync } from "node:child_process";
45
+
46
+ function sendImage(opts: {
47
+ channel?: string;
48
+ target?: string;
49
+ message: string;
50
+ mediaPath: string;
51
+ }): void {
52
+ if (opts.channel && opts.target) {
53
+ try {
54
+ execAsync(
55
+ `openclaw message send --channel ${opts.channel} --target ${JSON.stringify(opts.target)} --message ${JSON.stringify(opts.message)} --media ${JSON.stringify(opts.mediaPath)}`,
56
+ { timeout: 30_000 },
57
+ () => {},
58
+ );
59
+ } catch (err) {
60
+ console.error("Failed to send image:", err instanceof Error ? err.message : String(err));
61
+ }
62
+ }
63
+ }
64
+
65
+ function sendMessage(opts: { channel?: string; target?: string; message: string }): void {
66
+ if (opts.channel && opts.target) {
67
+ try {
68
+ execAsync(
69
+ `openclaw message send --channel ${opts.channel} --target ${JSON.stringify(opts.target)} --message ${JSON.stringify(opts.message)}`,
70
+ { timeout: 30_000 },
71
+ () => {},
72
+ );
73
+ } catch {
74
+ // Best-effort
75
+ }
76
+ }
77
+ }
78
+
79
+ // ── CLI parsing ────────────────────────────────────────────────────────
80
+
81
+ function parseArgs() {
82
+ const args = process.argv.slice(2);
83
+ const meetUrl = args.find((a) => !a.startsWith("--"));
84
+ const headed = args.includes("--headed");
85
+ const useAuth = args.includes("--auth");
86
+ const useAnon = args.includes("--anon");
87
+ const noCamera = !args.includes("--camera");
88
+ const noMic = !args.includes("--mic");
89
+ const verbose = args.includes("--verbose");
90
+ const noReport = args.includes("--no-report");
91
+ const forceAudio = args.includes("--audio");
92
+ const forceCaptions = args.includes("--captions");
93
+
94
+ const durationIdx = args.indexOf("--duration");
95
+ const durationRaw = durationIdx >= 0 ? args[durationIdx + 1] : undefined;
96
+ const botNameIdx = args.indexOf("--bot-name");
97
+ const botName = botNameIdx >= 0 ? args[botNameIdx + 1] : undefined;
98
+ const channelIdx = args.indexOf("--channel");
99
+ const channel = channelIdx >= 0 ? args[channelIdx + 1] : undefined;
100
+ const targetIdx = args.indexOf("--target");
101
+ const target = targetIdx >= 0 ? args[targetIdx + 1] : undefined;
102
+
103
+ if (!meetUrl) {
104
+ console.error(
105
+ "Usage: npx openbuilder join <meet-url> --auth|--anon [--bot-name <name>] [--duration 60m] [--channel <ch>] [--target <id>]",
106
+ );
107
+ process.exit(1);
108
+ }
109
+
110
+ if (!useAuth && !useAnon) {
111
+ console.error("ERROR: You must specify either --auth or --anon.");
112
+ console.error("ASK THE USER which mode they want before retrying. Do NOT choose for them.");
113
+ console.error(" --auth Join using saved Google account (~/.openbuilder/auth.json)");
114
+ console.error(" --anon Join as a guest (no Google account)");
115
+ process.exit(1);
116
+ }
117
+
118
+ if (useAuth && useAnon) {
119
+ console.error("ERROR: Cannot use both --auth and --anon.");
120
+ process.exit(1);
121
+ }
122
+
123
+ if (useAnon && !botName) {
124
+ console.error("ERROR: --anon requires --bot-name <name>.");
125
+ console.error("ASK THE USER what name they want the bot to use. Do NOT choose a default.");
126
+ process.exit(1);
127
+ }
128
+
129
+ if (forceAudio && forceCaptions) {
130
+ console.error("ERROR: Cannot use both --audio and --captions.");
131
+ process.exit(1);
132
+ }
133
+
134
+ // Parse duration to milliseconds
135
+ let durationMs: number | undefined;
136
+ if (durationRaw) {
137
+ const match = durationRaw.match(/^(\d+)(ms|s|m|h)?$/);
138
+ if (match) {
139
+ const value = parseInt(match[1]!, 10);
140
+ const unit = match[2] ?? "ms";
141
+ const multipliers: Record<string, number> = { ms: 1, s: 1000, m: 60_000, h: 3_600_000 };
142
+ durationMs = value * (multipliers[unit] ?? 1);
143
+ }
144
+ }
145
+
146
+ // Determine capture mode
147
+ let captureMode: "audio" | "captions" | "auto" = "auto";
148
+ if (forceAudio) captureMode = "audio";
149
+ if (forceCaptions) captureMode = "captions";
150
+
151
+ return {
152
+ meetUrl,
153
+ headed,
154
+ noAuth: useAnon,
155
+ noCamera,
156
+ noMic,
157
+ verbose,
158
+ noReport,
159
+ durationMs,
160
+ botName,
161
+ channel,
162
+ target,
163
+ captureMode,
164
+ };
165
+ }
166
+
167
+ // ── Google Meet UI automation ──────────────────────────────────────────
168
+
169
+ async function isBlockedFromJoining(page: Page): Promise<boolean> {
170
+ try {
171
+ const blocked = page
172
+ .locator("text=/You can't join this video call/i, text=/can.t join this video call/i")
173
+ .first();
174
+ return await blocked.isVisible({ timeout: 2000 });
175
+ } catch {
176
+ return false;
177
+ }
178
+ }
179
+
180
+ async function dismissOverlays(page: Page): Promise<void> {
181
+ const dismissTexts = ["Got it", "Dismiss", "OK", "Accept all", "Continue without microphone", "No thanks"];
182
+
183
+ for (let round = 0; round < 3; round++) {
184
+ let dismissed = false;
185
+
186
+ for (const text of dismissTexts) {
187
+ try {
188
+ const btn = page.locator(`button:has-text("${text}")`).first();
189
+ if (await btn.isVisible({ timeout: 1500 })) {
190
+ await btn.click();
191
+ console.log(` Dismissed overlay ("${text}")`);
192
+ dismissed = true;
193
+ await page.waitForTimeout(500);
194
+ }
195
+ } catch {
196
+ // Not present
197
+ }
198
+ }
199
+
200
+ // Dismiss Gemini banner
201
+ try {
202
+ const gemini = page.locator("text=/Use Gemini/i").first();
203
+ if (await gemini.isVisible({ timeout: 1000 })) {
204
+ await page.keyboard.press("Escape");
205
+ console.log(" Dismissed Gemini banner");
206
+ dismissed = true;
207
+ await page.waitForTimeout(500);
208
+ }
209
+ } catch {
210
+ // Not present
211
+ }
212
+
213
+ await page.keyboard.press("Escape");
214
+ await page.waitForTimeout(300);
215
+
216
+ if (!dismissed) break;
217
+ }
218
+ }
219
+
220
+ async function dismissPostJoinDialogs(page: Page): Promise<void> {
221
+ await page.waitForTimeout(2000);
222
+
223
+ for (let round = 0; round < 3; round++) {
224
+ let dismissed = false;
225
+
226
+ for (const text of ["Got it", "OK", "Dismiss", "Close"]) {
227
+ try {
228
+ const btn = page.locator(`button:has-text("${text}")`).first();
229
+ if (await btn.isVisible({ timeout: 1000 })) {
230
+ await btn.click();
231
+ console.log(` Dismissed post-join dialog ("${text}")`);
232
+ dismissed = true;
233
+ await page.waitForTimeout(500);
234
+ }
235
+ } catch {
236
+ // Not present
237
+ }
238
+ }
239
+
240
+ await page.keyboard.press("Escape");
241
+ await page.waitForTimeout(300);
242
+
243
+ if (!dismissed) break;
244
+ }
245
+ }
246
+
247
+ async function disableMediaOnPreJoin(page: Page, opts: { noCamera: boolean; noMic: boolean }) {
248
+ if (opts.noMic) {
249
+ try {
250
+ const micBtn = page
251
+ .locator(
252
+ '[aria-label*="microphone" i][data-is-muted="false"], ' +
253
+ 'button[aria-label*="Turn off microphone" i]',
254
+ )
255
+ .first();
256
+ if (await micBtn.isVisible({ timeout: 3000 })) {
257
+ await micBtn.click();
258
+ console.log(" Microphone turned off");
259
+ }
260
+ } catch {
261
+ // Already muted
262
+ }
263
+ }
264
+
265
+ if (opts.noCamera) {
266
+ try {
267
+ const camBtn = page
268
+ .locator(
269
+ '[aria-label*="camera" i][data-is-muted="false"], ' +
270
+ 'button[aria-label*="Turn off camera" i]',
271
+ )
272
+ .first();
273
+ if (await camBtn.isVisible({ timeout: 3000 })) {
274
+ await camBtn.click();
275
+ console.log(" Camera turned off");
276
+ }
277
+ } catch {
278
+ // Already off
279
+ }
280
+ }
281
+ }
282
+
283
+ async function enterNameIfNeeded(page: Page, botName: string): Promise<void> {
284
+ try {
285
+ const nameInput = page
286
+ .locator('input[aria-label="Your name"], input[placeholder*="name" i]')
287
+ .first();
288
+ if (await nameInput.isVisible({ timeout: 3000 })) {
289
+ await nameInput.fill(botName);
290
+ console.log(` Set display name: ${botName}`);
291
+ }
292
+ } catch {
293
+ // Name field not shown
294
+ }
295
+ }
296
+
297
/**
 * Find and click one of Meet's join buttons, retrying up to
 * `maxAttempts` times with a 5 s pause between attempts.
 *
 * @param page        Meet pre-join page.
 * @param maxAttempts retry budget (default 10 ≈ 50 s of polling).
 * @returns true once a join button was clicked; false when a hard
 *          block was detected or no button ever appeared.
 */
async function clickJoinButton(page: Page, maxAttempts = 10): Promise<boolean> {
  // Ordered most-specific → least-specific; the generic /join/i text
  // match is only reached when nothing better is on screen.
  const joinSelectors = [
    'button:has-text("Continue without microphone and camera")',
    'button:has-text("Ask to join")',
    'button:has-text("Join now")',
    'button:has-text("Join meeting")',
    'button:has-text("Join")',
    '[data-idom-class*="join"] button',
    "button >> text=/join/i",
  ];

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Only check for blocks after giving the page time to load (not on first 2 attempts)
    if (attempt >= 2) {
      const isBlocked = await page
        .evaluate(() => {
          const text = document.body.innerText || "";
          return (
            /you can.t join this video call/i.test(text) || /return(ing)? to home screen/i.test(text)
          );
        })
        .catch(() => false); // evaluate failures count as "not blocked"

      if (isBlocked) {
        console.log(" Detected 'can't join' — aborting join attempt");
        return false;
      }
    }

    // Try each selector in priority order; first visible button wins.
    for (const selector of joinSelectors) {
      try {
        const btn = page.locator(selector).first();
        if (await btn.isVisible({ timeout: 2000 })) {
          await btn.click();
          console.log(" Clicked join button");
          return true;
        }
      } catch {
        // Try next
      }
    }

    if (attempt < maxAttempts - 1) {
      console.log(` Join button not found yet, retrying (${attempt + 1}/${maxAttempts})...`);
      // On the first miss, save a screenshot so the operator can see
      // what the pre-join page actually looked like.
      if (attempt === 0) {
        const debugPath = join(WORKSPACE_DIR, "debug-pre-join.png");
        await page.screenshot({ path: debugPath }).catch(() => {});
        console.log(` [OPENBUILDER_DEBUG_IMAGE] ${debugPath}`);
      }
      await page.waitForTimeout(5000);
    }
  }

  return false;
}
352
+
353
/**
 * Wait in the Meet lobby until the bot is admitted to the meeting.
 *
 * Polls every 5 s, checking in order: (1) lobby text → keep waiting;
 * (2) in-meeting markers → success; (3) hard-block text → throw.
 * Hard-block checks start only after a 2-minute grace period and then
 * run at most once per minute, to give the host time to admit.
 *
 * @param page      the Meet page (already past the pre-join screen).
 * @param timeoutMs overall admission timeout (default 10 minutes).
 * @throws when blocked/denied, or when `timeoutMs` elapses.
 */
async function waitUntilInMeeting(page: Page, timeoutMs = 600_000): Promise<void> {
  console.log(" Waiting to be admitted to the meeting (up to 10 min)...");
  const start = Date.now();
  let nextBlockCheck = Date.now() + 120_000; // First block check after 2 MINUTES (give host time to admit)

  while (Date.now() - start < timeoutMs) {
    // FIRST: check if we're still in the lobby — must check this before anything else
    // The lobby also has a "Leave call" button, so we can't use that as an in-meeting signal
    const isInLobby = await page
      .evaluate(() => {
        const text = document.body.innerText || "";
        return (
          /asking to be let in/i.test(text) ||
          /waiting for someone to let you in/i.test(text) ||
          /someone in the meeting/i.test(text) ||
          /the meeting host/i.test(text) ||
          /please wait until/i.test(text) ||
          /meeting host brings you/i.test(text)
        );
      })
      .catch(() => false);

    if (isInLobby) {
      // We're in the lobby — this is expected, keep waiting for admission
      await page.waitForTimeout(5000);
      continue;
    }

    // Now check if we're actually in the meeting (not lobby)
    // Look for meeting-specific elements that only appear when admitted
    try {
      const inMeeting = await page.evaluate(() => {
        const text = document.body.innerText || "";
        // "You're the only one here" = admitted but alone
        if (/you.re the only one here/i.test(text)) return true;
        if (/you.ve been admitted/i.test(text)) return true;
        // Check for participant video tiles (only in actual meeting, not lobby)
        const tiles = document.querySelectorAll('[data-participant-id], [data-self-name]');
        if (tiles.length > 0) return true;
        // Check for caption button or "More options" menu (in-meeting toolbar)
        const captionBtn = document.querySelector('[aria-label*="captions" i], [aria-label*="Turn on captions" i]');
        if (captionBtn) return true;
        // Check for the meeting info / people button (only in actual meeting)
        const peopleBtn = document.querySelector('[aria-label*="Show everyone" i], [aria-label*="people" i]');
        if (peopleBtn) return true;
        return false;
      });
      if (inMeeting) {
        console.log(" Confirmed: admitted to the meeting");
        return;
      }
    } catch {
      // Keep waiting
    }

    // Check for hard blocks only once the grace period has elapsed:
    // first at +2 minutes, then at most once every 60 seconds.
    if (Date.now() >= nextBlockCheck) {
      const isBlocked = await page
        .evaluate(() => {
          const text = document.body.innerText || "";
          return (
            /you can.t join this video call/i.test(text) ||
            /return(ing)? to home screen/i.test(text) ||
            /you have been removed/i.test(text) ||
            /denied your request/i.test(text) ||
            /meeting has been locked/i.test(text)
          );
        })
        .catch(() => false);

      if (isBlocked) {
        throw new Error("Blocked from joining — access denied or meeting unavailable");
      }

      nextBlockCheck = Date.now() + 60_000; // Next check in 60 seconds
    }

    // Wait longer between checks to be patient
    await page.waitForTimeout(5000);
  }

  // NOTE(review): message assumes the default 10-minute timeout even
  // though timeoutMs is a parameter — consider interpolating it.
  throw new Error("Timed out waiting to be admitted (10 minutes)");
}
436
+
437
+ async function clickLeaveButton(page: Page): Promise<void> {
438
+ try {
439
+ const leaveBtn = page
440
+ .locator('[aria-label*="Leave call" i], [aria-label*="leave" i][data-tooltip*="Leave"]')
441
+ .first();
442
+ if (await leaveBtn.isVisible({ timeout: 1000 })) {
443
+ await leaveBtn.click();
444
+ await page.waitForTimeout(1000);
445
+ }
446
+ } catch {
447
+ // Best-effort
448
+ }
449
+ }
450
+
451
/**
 * Estimate how many participants are in the meeting using a cascade of
 * DOM heuristics (count badge → page text → video tiles → participant
 * list items).
 *
 * NOTE(review): despite the original "(excluding the bot)" wording,
 * the heuristics below generally count the bot too (e.g. "only one
 * here" → 1); callers treat <= 1 as "alone", which matches that.
 *
 * @returns the best-guess participant count, or -1 when no heuristic
 *          matched or evaluation failed.
 */
async function getParticipantCount(page: Page): Promise<number> {
  try {
    return await page.evaluate(() => {
      // Method 1: Check the participant count badge/text in the toolbar
      // Google Meet shows participant count near the people icon
      const countEl = document.querySelector('[data-participant-count]');
      if (countEl) {
        const count = parseInt(countEl.getAttribute('data-participant-count') || '0', 10);
        return count;
      }

      // Method 2: Look for "X in call" or participant count text
      const allText = document.body.innerText || '';

      // "You're the only one here" means just the bot
      if (/you.re the only one here/i.test(allText)) return 1;

      // Look for participant count patterns
      const countMatch = allText.match(/(\d+)\s+(?:in call|participant|people|in this call)/i);
      if (countMatch) return parseInt(countMatch[1], 10);

      // Method 3: Count video tiles / participant elements
      // Google Meet uses specific containers for each participant
      const tiles = document.querySelectorAll(
        '[data-participant-id], [data-requested-participant-id]'
      );
      if (tiles.length > 0) return tiles.length;

      // Method 4: Count elements in the participant list if open
      const participantItems = document.querySelectorAll(
        '[role="listitem"][data-participant-id]'
      );
      if (participantItems.length > 0) return participantItems.length;

      return -1; // Unknown
    });
  } catch {
    return -1; // Error — can't determine
  }
}
492
+
493
+ async function waitForMeetingEnd(
494
+ page: Page,
495
+ opts?: {
496
+ durationMs?: number;
497
+ captionIdleTimeoutMs?: number;
498
+ getLastCaptionAt?: () => number;
499
+ verbose?: boolean;
500
+ },
501
+ ): Promise<string> {
502
+ const start = Date.now();
503
+ const durationMs = opts?.durationMs;
504
+ const captionIdleTimeoutMs = opts?.captionIdleTimeoutMs;
505
+ const getLastCaptionAt = opts?.getLastCaptionAt;
506
+ const verbose = opts?.verbose ?? false;
507
+
508
+ // Track when we first detected being alone (to avoid premature exit)
509
+ let aloneDetectedAt: number | null = null;
510
+ const ALONE_GRACE_PERIOD_MS = 45_000; // Wait 45s to confirm everyone left (participant detection can be unreliable)
511
+ let lastParticipantLog = 0;
512
+ let lastLoggedCount = -1;
513
+
514
+ const checkEnded = async (): Promise<string | null> => {
515
+ try {
516
+ const endedText = page
517
+ .locator(
518
+ "text=/meeting has ended/i, text=/removed from/i, text=/You left the meeting/i, text=/You.ve left the call/i",
519
+ )
520
+ .first();
521
+ if (await endedText.isVisible({ timeout: 500 })) {
522
+ return "Meeting ended";
523
+ }
524
+ } catch {
525
+ // Still in meeting
526
+ }
527
+
528
+ if (!page.url().includes("meet.google.com")) {
529
+ return "Navigated away from meeting";
530
+ }
531
+
532
+ // Check if all other participants have left
533
+ const participantCount = await getParticipantCount(page);
534
+
535
+ // Log participant count periodically — only when verbose or count changes
536
+ if (Date.now() - lastParticipantLog > 30_000 && participantCount >= 0) {
537
+ if (verbose || participantCount !== lastLoggedCount) {
538
+ console.log(` [participants] ${participantCount} in meeting`);
539
+ lastLoggedCount = participantCount;
540
+ }
541
+ lastParticipantLog = Date.now();
542
+ }
543
+
544
+ if (participantCount === 1 || participantCount === 0) {
545
+ // Possibly only the bot is left (or count is wrong)
546
+ if (!aloneDetectedAt) {
547
+ aloneDetectedAt = Date.now();
548
+ console.log(` Participant count is ${participantCount} — waiting 45s to confirm alone...`);
549
+ } else if (Date.now() - aloneDetectedAt >= ALONE_GRACE_PERIOD_MS) {
550
+ // Re-check with a screenshot for debugging before leaving
551
+ const recheck = await getParticipantCount(page);
552
+ if (recheck <= 1) {
553
+ await clickLeaveButton(page);
554
+ return "All other participants left";
555
+ } else {
556
+ // False alarm — reset
557
+ aloneDetectedAt = null;
558
+ }
559
+ }
560
+ } else if (participantCount > 1) {
561
+ // Someone is still here — reset the alone timer
562
+ if (aloneDetectedAt) {
563
+ console.log(" Participant rejoined — continuing...");
564
+ aloneDetectedAt = null;
565
+ }
566
+ }
567
+ // participantCount === -1 means unknown, don't act on it
568
+
569
+ return null;
570
+ };
571
+
572
+ while (true) {
573
+ if (durationMs && Date.now() - start >= durationMs) {
574
+ await clickLeaveButton(page);
575
+ return "Duration limit reached";
576
+ }
577
+
578
+ if (captionIdleTimeoutMs && getLastCaptionAt && Date.now() - getLastCaptionAt() >= captionIdleTimeoutMs) {
579
+ await clickLeaveButton(page);
580
+ return "No captions captured for 10 minutes";
581
+ }
582
+
583
+ const reason = await checkEnded();
584
+ if (reason) return reason;
585
+
586
+ await page.waitForTimeout(3000);
587
+ }
588
+ }
589
+
590
+ // ── Stealth patches ────────────────────────────────────────────────────
591
+
592
// Injected into every page before site scripts run. Masks common
// headless-automation fingerprints: navigator.webdriver, a missing
// window.chrome, an empty plugin list, the notification permission
// query quirk, and the WebGL vendor/renderer strings — so Meet treats
// the browser as a regular Chrome.
// NOTE(review): the spoofed WebGL strings claim an Apple M1; assumed
// intentional — confirm it matches the user agent configured at launch.
const STEALTH_SCRIPT = `
Object.defineProperty(navigator, "webdriver", { get: () => false });

if (!window.chrome) {
  window.chrome = { runtime: {} };
}

Object.defineProperty(navigator, "plugins", {
  get: () => [1, 2, 3, 4, 5],
});

Object.defineProperty(navigator, "languages", {
  get: () => ["en-US", "en"],
});

const originalQuery = window.Permissions?.prototype?.query;
if (originalQuery) {
  window.Permissions.prototype.query = function (params) {
    if (params.name === "notifications") {
      return Promise.resolve({ state: "default", onchange: null });
    }
    return originalQuery.call(this, params);
  };
}

const getParameter = WebGLRenderingContext.prototype.getParameter;
WebGLRenderingContext.prototype.getParameter = function (param) {
  if (param === 37445) return "Google Inc. (Apple)";
  if (param === 37446) return "ANGLE (Apple, Apple M1, OpenGL 4.1)";
  return getParameter.call(this, param);
};
`;
624
+
625
+ // ── Screenshot handler ─────────────────────────────────────────────────
626
+
627
+ function registerScreenshotHandler(page: Page): void {
628
+ writeFileSync(PID_FILE, String(process.pid));
629
+
630
+ process.on("SIGUSR1", async () => {
631
+ try {
632
+ const screenshotPath = join(WORKSPACE_DIR, "on-demand-screenshot.png");
633
+ await page.screenshot({ path: screenshotPath });
634
+ const payload = JSON.stringify({ path: screenshotPath, timestamp: Date.now() });
635
+ writeFileSync(SCREENSHOT_READY_FILE, payload);
636
+ console.log(`[OPENBUILDER_SCREENSHOT] ${screenshotPath}`);
637
+ } catch (err) {
638
+ console.error("Screenshot failed:", err instanceof Error ? err.message : String(err));
639
+ }
640
+ });
641
+ }
642
+
643
+ function cleanupPidFile(): void {
644
+ try {
645
+ if (existsSync(PID_FILE)) unlinkSync(PID_FILE);
646
+ } catch {
647
+ // best-effort
648
+ }
649
+ }
650
+
651
+ // ── Caption capture ────────────────────────────────────────────────────
652
+
653
+ function extractMeetingId(meetUrl: string): string {
654
+ try {
655
+ const url = new URL(meetUrl);
656
+ return url.pathname.replace(/^\//, "").replace(/\//g, "-") || "unknown";
657
+ } catch {
658
+ return "unknown";
659
+ }
660
+ }
661
+
662
/**
 * Turn on Google Meet closed captions, trying progressively more
 * invasive strategies: the toolbar CC button at several mouse
 * positions, the 'c' keyboard shortcut (single and repeated), a raw
 * DOM scan for caption buttons, the "More options" menu, and finally
 * the CC icon by data-icon attribute. Verifies after each attempt via
 * `checkCaptions`; if nothing works, dumps visible button labels and a
 * screenshot and logs a warning. Never throws.
 */
async function enableCaptions(page: Page): Promise<void> {
  await page.waitForTimeout(5000);

  // Dismiss overlays aggressively (RecallAI pattern: press Escape many times)
  for (let i = 0; i < 10; i++) {
    await page.keyboard.press("Escape");
    await page.waitForTimeout(200);
  }
  await page.waitForTimeout(1000);

  // Clear any remaining dialogs that Escape did not close.
  for (const text of ["Got it", "Dismiss", "Continue", "OK", "No thanks"]) {
    try {
      const btn = page.locator(`button:has-text("${text}")`).first();
      if (await btn.isVisible({ timeout: 500 })) {
        await btn.click();
        await page.waitForTimeout(300);
      }
    } catch {
      // Not present
    }
  }

  // True when any known "captions are on" DOM marker is present.
  const checkCaptions = async (): Promise<boolean> =>
    page
      .evaluate(`
        !!(document.querySelector('[role="region"][aria-label*="Captions"]') ||
           document.querySelector('[aria-label="Captions are on"]') ||
           document.querySelector('button[aria-label*="Turn off captions" i]') ||
           document.querySelector('[data-is-persistent-caption="true"]') ||
           document.querySelector('[jscontroller][data-caption-id]'))
      `)
      .catch(() => false) as Promise<boolean>;

  if (await checkCaptions()) {
    console.log(" Captions already enabled");
    return;
  }

  // Debug: take screenshot before attempting caption enable
  const debugPath = join(WORKSPACE_DIR, "debug-captions.png");
  await page.screenshot({ path: debugPath }).catch(() => {});
  console.log(` [DEBUG] Pre-caption screenshot saved`);

  // Method 1: Move mouse across bottom toolbar area to reveal it, then click CC
  // Try multiple Y positions since toolbar position varies by viewport
  // NOTE(review): the fixed (640, y) coordinates assume a ~1280x720
  // viewport — confirm against the browser launch options.
  for (const y of [680, 700, 650, 720, 600]) {
    try {
      await page.mouse.move(640, y);
      await page.waitForTimeout(1500);

      const ccButton = page
        .locator(
          'button[aria-label*="Turn on captions" i], ' +
            'button[aria-label*="captions" i][aria-pressed="false"], ' +
            'button[aria-label*="captions (c)" i], ' +
            'button[aria-label*="closed captions" i]',
        )
        .first();
      if (await ccButton.isVisible({ timeout: 2000 })) {
        await ccButton.click();
        await page.waitForTimeout(2000);
        if (await checkCaptions()) {
          console.log(" Captions enabled (clicked CC button)");
          return;
        }
      }
    } catch {
      // Try next position
    }
  }

  // Method 2: Keyboard shortcut 'c' — click body first to ensure focus
  try {
    await page.click("body");
    await page.waitForTimeout(500);
  } catch {}

  await page.keyboard.press("c");
  await page.waitForTimeout(3000);
  if (await checkCaptions()) {
    console.log(" Captions enabled (pressed 'c')");
    return;
  }

  // Method 3: Try pressing 'c' multiple times with focus resets
  for (let i = 0; i < 5; i++) {
    try { await page.click("body"); } catch {}
    await page.waitForTimeout(300);
    await page.keyboard.press("c");
    await page.waitForTimeout(2000);
    if (await checkCaptions()) {
      console.log(` Captions enabled (press 'c', attempt ${i + 1})`);
      return;
    }
  }

  // Method 4: Use JavaScript to find and click the CC button by scanning all buttons
  try {
    const clicked = await page.evaluate(() => {
      const buttons = document.querySelectorAll("button");
      for (const btn of buttons) {
        const label = (btn.getAttribute("aria-label") || "").toLowerCase();
        if (label.includes("caption") && !label.includes("turn off")) {
          (btn as HTMLElement).click();
          return label;
        }
      }
      // Try finding by the closed_caption icon
      const icons = document.querySelectorAll('[data-icon*="caption"]');
      for (const icon of icons) {
        const btn = icon.closest("button");
        if (btn) {
          (btn as HTMLElement).click();
          return "icon-based click";
        }
      }
      return null;
    });
    if (clicked) {
      console.log(` Clicked caption button via JS: ${clicked}`);
      await page.waitForTimeout(3000);
      if (await checkCaptions()) {
        console.log(" Captions enabled (JS click)");
        return;
      }
    }
  } catch {}

  // Method 5: More options / activities menu
  try {
    const moreBtn = page
      .locator(
        'button[aria-label*="more options" i], button[aria-label*="More actions" i], ' +
          'button[aria-label*="activities" i]',
      )
      .first();
    if (await moreBtn.isVisible({ timeout: 2000 })) {
      await moreBtn.click();
      await page.waitForTimeout(1500);
      const captionsItem = page
        .locator(
          '[role="menuitem"]:has-text("Captions"), li:has-text("Captions"), ' +
            '[role="option"]:has-text("Captions")',
        )
        .first();
      if (await captionsItem.isVisible({ timeout: 2000 })) {
        await captionsItem.click();
        await page.waitForTimeout(2000);
        if (await checkCaptions()) {
          console.log(" Captions enabled (via menu)");
          return;
        }
      } else {
        // Close the menu again so it doesn't cover the toolbar.
        await page.keyboard.press("Escape");
      }
    }
  } catch {}

  // Method 6: CC icon by data-icon attribute
  try {
    await page.mouse.move(640, 680);
    await page.waitForTimeout(500);
    const ccByIcon = page
      .locator(
        'button:has([data-icon="closed_caption"]), button:has([data-icon="closed_caption_off"])',
      )
      .first();
    if (await ccByIcon.isVisible({ timeout: 2000 })) {
      await ccByIcon.click();
      await page.waitForTimeout(2000);
      if (await checkCaptions()) {
        console.log(" Captions enabled (clicked CC icon)");
        return;
      }
    }
  } catch {}

  // Last resort: dump all visible button labels for debugging
  await page.screenshot({ path: debugPath }).catch(() => {});
  const allButtons = await page
    .evaluate(() => {
      return Array.from(document.querySelectorAll("button"))
        .map((b) => ({
          label: b.getAttribute("aria-label"),
          text: (b.textContent || "").slice(0, 60),
          visible: b.offsetParent !== null,
        }))
        .filter((b) => b.visible);
    })
    .catch(() => []);
  console.log(" [DEBUG] All visible buttons:", JSON.stringify(allButtons));
  console.log(" WARNING: Could not verify captions are on — caption capture may not work");
  console.log(` [DEBUG] Screenshot: ${debugPath}`);
}
856
+
857
// Caption observer injected into the browser context. Watches the DOM
// for Meet's caption region, strips the speaker badge and inline
// images from each caption node, filters out icon-glyph noise, and
// reports (speaker, text) pairs back to Node via the exposed
// window.__openbuilder_onCaption binding. Written in ES5-style
// JavaScript because it runs inside the page, not in Node.
// NOTE(review): ".NWpY1d, .xoMHSc" are obfuscated Meet class names for
// the speaker badge — these break whenever Google reshuffles them.
const CAPTION_OBSERVER_SCRIPT = `
(function() {
  var BADGE_SEL = ".NWpY1d, .xoMHSc";
  var captionContainer = null;

  var getSpeaker = function(node) {
    if (!node || !node.querySelector) return "";
    var badge = node.querySelector(BADGE_SEL);
    return badge ? badge.textContent.trim() : "";
  };

  var getText = function(node) {
    if (!node || !node.cloneNode) return "";
    var clone = node.cloneNode(true);
    var badges = clone.querySelectorAll ? clone.querySelectorAll(BADGE_SEL) : [];
    for (var i = 0; i < badges.length; i++) badges[i].remove();
    var imgs = clone.querySelectorAll ? clone.querySelectorAll("img") : [];
    for (var j = 0; j < imgs.length; j++) imgs[j].remove();
    return clone.textContent.trim();
  };

  var send = function(node) {
    if (!(node instanceof HTMLElement)) return;

    var el = node;
    var speaker = "";
    for (var depth = 0; depth < 6 && el && el !== document.body; depth++) {
      speaker = getSpeaker(el);
      if (speaker) break;
      el = el.parentElement;
    }

    if (!speaker || !el) return;

    var text = getText(el);
    if (!text || text.length > 500) return;

    if (/^(mic_off|videocam|call_end|more_vert|keyboard|arrow_)/i.test(text)) return;
    if (text.indexOf("extension") !== -1 && text.indexOf("developers.google") !== -1) return;

    try {
      window.__openbuilder_onCaption(speaker, text);
    } catch(e) {}
  };

  new MutationObserver(function(mutations) {
    if (!captionContainer || !document.contains(captionContainer)) {
      captionContainer = document.querySelector('[aria-label="Captions"]') ||
        document.querySelector('[aria-live]');
    }

    for (var i = 0; i < mutations.length; i++) {
      var m = mutations[i];
      if (captionContainer && !captionContainer.contains(m.target)) continue;

      var added = m.addedNodes;
      for (var j = 0; j < added.length; j++) {
        if (added[j] instanceof HTMLElement) send(added[j]);
      }

      if (m.type === "characterData" && m.target && m.target.parentElement) {
        send(m.target.parentElement);
      }
    }
  }).observe(document.body, {
    childList: true,
    characterData: true,
    subtree: true
  });

  console.log("[OpenBuilder] Caption observer active");
})();
`;
931
+
932
+ function normalizeForCompare(text: string): string {
933
+ return text
934
+ .toLowerCase()
935
+ .replace(/[^a-z0-9 ]/g, "")
936
+ .replace(/\s+/g, " ")
937
+ .trim();
938
+ }
939
+
940
/**
 * Wire up live-caption capture on an in-meeting Meet page.
 *
 * Exposes `__openbuilder_onCaption(speaker, text)` to the page (called by the
 * injected CAPTION_OBSERVER_SCRIPT), deduplicates Google Meet's continuously
 * growing caption buffer per speaker, and appends finalized lines of the form
 * `[HH:MM:SS] Speaker: text` to `transcriptPath`.
 *
 * @param page           Playwright page already joined to the meeting.
 * @param transcriptPath File that finalized caption lines are appended to.
 * @param verbose        When true, echo each finalized line to the console.
 * @returns `cleanup` flushes pending captions and stops the settle timer;
 *          `getLastCaptionAt` reports the epoch-ms of the last written line
 *          (used by the idle-timeout watchdog).
 */
async function setupCaptionCapture(
  page: Page,
  transcriptPath: string,
  verbose: boolean,
): Promise<{ cleanup: () => void; getLastCaptionAt: () => number }> {
  // Per-speaker in-flight caption buffer: latest text, last-update time,
  // and the time the utterance started (used for the written timestamp).
  const tracking = new Map<string, { text: string; ts: number; startTs: number }>();
  // Last text actually written to the transcript, per speaker (dedup baseline).
  const lastWritten = new Map<string, string>();
  let lastMinuteKey = "";
  let lastCaptionAt = Date.now();

  // Write one caption line, trimming any part that duplicates what this
  // speaker's previous line already contained.
  const finalizeCaption = (speaker: string, text: string, startTs: number): void => {
    const prevWritten = lastWritten.get(speaker) ?? "";
    const normNew = normalizeForCompare(text);
    const normPrev = normalizeForCompare(prevWritten);

    // Skip if the new text is identical to, a prefix of, or a trivially
    // (<3 chars) longer version of what was already written.
    if (
      normPrev &&
      (normNew === normPrev ||
        normPrev.startsWith(normNew) ||
        (normNew.startsWith(normPrev) && normNew.length - normPrev.length < 3))
    ) {
      return;
    }

    // Extract only the NEW text (deduplicate the accumulating Google Meet CC buffer)
    let textToWrite = text;
    if (prevWritten && text.startsWith(prevWritten)) {
      // The caption buffer is growing — only write the new part
      textToWrite = text.slice(prevWritten.length).replace(/^[\s,.!?;:]+/, "").trim();
      if (!textToWrite) return;
    } else if (prevWritten) {
      // Try normalized comparison for fuzzy prefix matching
      const prevWords = normPrev.split(/\s+/);
      const newWords = normNew.split(/\s+/);
      // Find the longest common prefix by words
      let commonLen = 0;
      for (let i = 0; i < Math.min(prevWords.length, newWords.length); i++) {
        if (prevWords[i] === newWords[i]) commonLen = i + 1;
        else break;
      }
      if (commonLen > 0 && commonLen >= prevWords.length * 0.8) {
        // Most of the previous text is a prefix of the new text — extract only new words
        // NOTE(review): newPart comes from the normalized words, so punctuation
        // and casing are lost for this fuzzy-matched tail.
        const newPart = newWords.slice(commonLen).join(" ").trim();
        if (newPart) textToWrite = newPart;
      }
    }

    // Record the full (untrimmed) text as the new dedup baseline.
    lastWritten.set(speaker, text);

    const d = new Date(startTs);
    const hh = String(d.getHours()).padStart(2, "0");
    const mm = String(d.getMinutes()).padStart(2, "0");
    const ss = String(d.getSeconds()).padStart(2, "0");
    const minuteKey = `${hh}:${mm}`;

    // Blank line between minute groups for readability.
    let prefix = "";
    if (lastMinuteKey && minuteKey !== lastMinuteKey) {
      prefix = "\n";
    }
    lastMinuteKey = minuteKey;

    const line = `[${hh}:${mm}:${ss}] ${speaker}: ${textToWrite}`;
    try {
      appendFileSync(transcriptPath, `${prefix}${line}\n`);
    } catch {
      // Ignore write errors
    }
    lastCaptionAt = Date.now();
    if (verbose) {
      console.log(` [caption] ${line}`);
    }
  };

  // Bridge from the page: called by the injected MutationObserver script
  // each time a caption node changes.
  await page.exposeFunction("__openbuilder_onCaption", (speaker: string, text: string) => {
    const existing = tracking.get(speaker);
    const prevWritten = lastWritten.get(speaker) ?? "";

    // Ignore updates that only repeat what was already written.
    const normNew = normalizeForCompare(text);
    const normWritten = normalizeForCompare(prevWritten);
    if (normWritten && (normNew === normWritten || normWritten.startsWith(normNew))) {
      return;
    }

    if (existing) {
      const normOld = normalizeForCompare(existing.text);

      // Heuristic: treat the update as the same utterance still growing if
      // either text is a prefix of the other, or the new text contains the
      // first ~20 normalized chars of the old one.
      const isGrowing =
        normNew.startsWith(normOld) ||
        normOld.startsWith(normNew) ||
        (normNew.length > normOld.length &&
          normNew.includes(normOld.slice(0, Math.min(20, normOld.length))));

      if (isGrowing) {
        // Keep only the longer snapshot; bump the settle timer.
        if (text.length >= existing.text.length) {
          existing.text = text;
          existing.ts = Date.now();
        }
        return;
      }

      // A new utterance started — flush the previous one first.
      finalizeCaption(speaker, existing.text, existing.startTs);
    }

    tracking.set(speaker, { text, ts: Date.now(), startTs: Date.now() });
  });

  // Flush any utterance that hasn't changed for 5s (speaker paused/finished).
  const settleInterval = setInterval(() => {
    const now = Date.now();
    for (const [speaker, data] of tracking.entries()) {
      if (now - data.ts >= 5000) {
        finalizeCaption(speaker, data.text, data.startTs);
        tracking.delete(speaker);
      }
    }
  }, 1000);

  // Inject the DOM observer that feeds __openbuilder_onCaption.
  await page.evaluate(CAPTION_OBSERVER_SCRIPT);

  return {
    getLastCaptionAt: () => lastCaptionAt,
    cleanup: () => {
      clearInterval(settleInterval);
      // Flush every in-flight caption before shutdown.
      for (const [speaker, data] of tracking.entries()) {
        finalizeCaption(speaker, data.text, data.startTs);
      }
      tracking.clear();
    },
  };
}
1069
+
1070
+ // ── Auto-report generation ─────────────────────────────────────────────
1071
+
1072
+ async function generateAutoReport(
1073
+ transcriptPath: string,
1074
+ meetingId: string,
1075
+ channel?: string,
1076
+ target?: string,
1077
+ ): Promise<void> {
1078
+ const config = getConfig();
1079
+ const hasApiKey = config.anthropicApiKey || config.openaiApiKey;
1080
+
1081
+ if (!hasApiKey) {
1082
+ console.log(" No AI API key configured — skipping auto-report generation");
1083
+ console.log(" Set ANTHROPIC_API_KEY or OPENAI_API_KEY to enable auto-reports");
1084
+ return;
1085
+ }
1086
+
1087
+ const transcriptContent = readFileSync(transcriptPath, "utf-8").trim();
1088
+ if (!transcriptContent) {
1089
+ console.log(" Empty transcript — skipping report generation");
1090
+ return;
1091
+ }
1092
+
1093
+ console.log("Generating AI meeting report...");
1094
+
1095
+ try {
1096
+ // Dynamically import to avoid circular deps and keep the join script lean
1097
+ const { parseTranscript, formatTranscriptForAI, chunkTranscript } = await import(
1098
+ "../src/utils/transcript-parser.js"
1099
+ );
1100
+ const { ClaudeProvider } = await import("../src/ai/claude.js");
1101
+ const { OpenAIProvider } = await import("../src/ai/openai.js");
1102
+ const { getMeetingAnalysisPrompt, getMergeAnalysisPrompt } = await import(
1103
+ "../src/ai/prompts.js"
1104
+ );
1105
+ const { calculateSpeakerStats } = await import("../src/analytics/speaker-stats.js");
1106
+ const { parseAnalysisResponse, generateReport } = await import("../src/report/generator.js");
1107
+ const { REPORTS_DIR } = await import("../src/utils/config.js");
1108
+
1109
+ // Select AI provider
1110
+ const provider = config.aiProvider === "openai" && config.openaiApiKey
1111
+ ? new OpenAIProvider()
1112
+ : new ClaudeProvider();
1113
+
1114
+ const transcript = parseTranscript(transcriptContent);
1115
+ const analytics = calculateSpeakerStats(transcript);
1116
+
1117
+ // Chunk if needed (long meetings)
1118
+ const chunks = chunkTranscript(transcript, 30000);
1119
+ let analysisResponse: string;
1120
+
1121
+ if (chunks.length === 1) {
1122
+ const formatted = formatTranscriptForAI(transcript);
1123
+ analysisResponse = await provider.complete({
1124
+ messages: [
1125
+ { role: "system", content: "You are an expert meeting analyst. Return only valid JSON." },
1126
+ { role: "user", content: getMeetingAnalysisPrompt(formatted) },
1127
+ ],
1128
+ maxTokens: 4096,
1129
+ temperature: 0.3,
1130
+ });
1131
+ } else {
1132
+ // Process chunks and merge
1133
+ const chunkResults: string[] = [];
1134
+ for (let i = 0; i < chunks.length; i++) {
1135
+ console.log(` Processing chunk ${i + 1}/${chunks.length}...`);
1136
+ const result = await provider.complete({
1137
+ messages: [
1138
+ { role: "system", content: "You are an expert meeting analyst. Return only valid JSON." },
1139
+ {
1140
+ role: "user",
1141
+ content: getMeetingAnalysisPrompt(
1142
+ chunks[i]!,
1143
+ `chunk ${i + 1} of ${chunks.length}`,
1144
+ ),
1145
+ },
1146
+ ],
1147
+ maxTokens: 4096,
1148
+ temperature: 0.3,
1149
+ });
1150
+ chunkResults.push(result);
1151
+ }
1152
+
1153
+ console.log(" Merging chunk analyses...");
1154
+ analysisResponse = await provider.complete({
1155
+ messages: [
1156
+ { role: "system", content: "You are an expert meeting analyst. Return only valid JSON." },
1157
+ { role: "user", content: getMergeAnalysisPrompt(chunkResults) },
1158
+ ],
1159
+ maxTokens: 4096,
1160
+ temperature: 0.3,
1161
+ });
1162
+ }
1163
+
1164
+ const analysis = parseAnalysisResponse(analysisResponse);
1165
+ const report = generateReport({
1166
+ meetingId,
1167
+ date: new Date().toISOString().split("T")[0],
1168
+ transcriptPath,
1169
+ analysis,
1170
+ analytics,
1171
+ });
1172
+
1173
+ mkdirSync(REPORTS_DIR, { recursive: true });
1174
+ const reportPath = join(REPORTS_DIR, `${meetingId}-report.md`);
1175
+ writeFileSync(reportPath, report);
1176
+ console.log(`[OPENBUILDER_REPORT] ${reportPath}`);
1177
+ sendMessage({
1178
+ channel,
1179
+ target,
1180
+ message: `Meeting report generated! View at: ${reportPath}`,
1181
+ });
1182
+ } catch (err) {
1183
+ const msg = err instanceof Error ? err.message : String(err);
1184
+ console.error(` Report generation failed: ${msg}`);
1185
+ sendMessage({ channel, target, message: `Report generation failed: ${msg}` });
1186
+ }
1187
+ }
1188
+
1189
+ // ── Main ───────────────────────────────────────────────────────────────
1190
+
1191
/**
 * Join a Google Meet meeting, capture a transcript (audio pipeline or DOM
 * caption scraping), wait until the meeting ends, then optionally generate
 * an AI report.
 *
 * Flow: resolve config → pick capture mode → launch Chromium (authenticated
 * context or persistent guest profile) → navigate/join with retries →
 * capture until the meeting ends → save transcript → auto-report.
 *
 * @param opts.meetUrl     Meet URL to join (required).
 * @param opts.headed      Run the browser with a visible window.
 * @param opts.noAuth      Join as guest even if saved auth exists.
 * @param opts.noCamera    Keep camera off (default true).
 * @param opts.noMic       Keep mic off (default true).
 * @param opts.verbose     Echo captions/progress to the console.
 * @param opts.noReport    Skip AI report generation after the meeting.
 * @param opts.durationMs  Hard cap on meeting time; falls back to config.
 * @param opts.botName     Display name; falls back to config then default.
 * @param opts.channel     openclaw channel for status messages/screenshots.
 * @param opts.target      openclaw target for status messages/screenshots.
 * @param opts.captureMode "audio" | "captions" | "auto" (default "auto").
 * @returns The live browser context/page (caller closes them) and the
 *          reason string describing why the meeting ended.
 * @throws  Error when the bot is blocked or fails to join after all retries
 *          (the context is closed first; a debug screenshot path is included).
 */
export async function joinMeeting(opts: {
  meetUrl: string;
  headed?: boolean;
  noAuth?: boolean;
  noCamera?: boolean;
  noMic?: boolean;
  verbose?: boolean;
  noReport?: boolean;
  durationMs?: number;
  botName?: string;
  channel?: string;
  target?: string;
  captureMode?: "audio" | "captions" | "auto";
}): Promise<{ context: BrowserContext; page: Page; reason: string }> {
  const {
    meetUrl,
    headed = false,
    noAuth = false,
    noCamera = true,
    noMic = true,
    verbose = false,
    noReport = false,
    durationMs,
    botName: botNameOpt,
    channel,
    target,
    captureMode: captureModeOpt,
  } = opts;

  // Resolve bot name from config or arg
  const config = getConfig();
  let botName = botNameOpt ?? config.botName ?? "OpenBuilder Bot";

  // Resolve duration from config if not specified
  // (config.defaultDuration is e.g. "60m"; bare numbers are milliseconds).
  let effectiveDurationMs = durationMs;
  if (!effectiveDurationMs && config.defaultDuration) {
    const match = config.defaultDuration.match(/^(\d+)(ms|s|m|h)?$/);
    if (match) {
      const value = parseInt(match[1]!, 10);
      const unit = match[2] ?? "ms";
      const multipliers: Record<string, number> = { ms: 1, s: 1000, m: 60_000, h: 3_600_000 };
      effectiveDurationMs = value * (multipliers[unit] ?? 1);
    }
  }

  ensureDirs();

  // Resolve capture mode: CLI flag > config > "auto"
  let useAudioCapture = false;
  const resolvedCaptureMode = captureModeOpt ?? config.captureMode ?? "auto";

  if (resolvedCaptureMode === "audio" || resolvedCaptureMode === "auto") {
    const { isAudioCaptureAvailable } = await import("../src/audio/pipeline.js");
    const audioDeps = isAudioCaptureAvailable();
    if (audioDeps.available) {
      useAudioCapture = true;
      if (resolvedCaptureMode === "auto") {
        console.log(" Auto-detected PulseAudio + ffmpeg — using audio capture mode");
      }
    } else if (resolvedCaptureMode === "audio") {
      // Explicit --audio with missing deps is a hard error.
      console.error(`ERROR: --audio requires: ${audioDeps.missing.join(", ")}`);
      console.error("Install the missing dependencies or use --captions instead.");
      process.exit(1);
    } else {
      console.log(` Audio capture not available (missing: ${audioDeps.missing.join(", ")}) — falling back to captions`);
    }
  }

  // Check for OpenAI API key when using audio mode (needed for Whisper)
  if (useAudioCapture && !config.openaiApiKey && !process.env.OPENAI_API_KEY) {
    console.warn(" WARNING: Audio capture requires OPENAI_API_KEY for Whisper transcription.");
    console.warn(" Set it with: npx openbuilder config set openaiApiKey <key>");
    console.warn(" Falling back to captions mode.");
    useAudioCapture = false;
  }

  const meetingId = extractMeetingId(meetUrl);
  // Sanitized PulseAudio sink name derived from the meeting id.
  const audioSinkName = `openbuilder_${meetingId.replace(/[^a-z0-9_-]/gi, "_")}`;

  // If using audio capture, set up PulseAudio routing before browser launch
  if (useAudioCapture) {
    process.env.PULSE_SINK = audioSinkName;
    // Also set PULSE_SERVER so Chromium finds PulseAudio
    if (!process.env.PULSE_SERVER) {
      const { execSync } = await import("node:child_process");
      try {
        const serverInfo = execSync("pactl info 2>/dev/null | grep 'Server String' | cut -d: -f2-", { encoding: "utf-8" }).trim();
        if (serverInfo) process.env.PULSE_SERVER = serverInfo;
      } catch {}
    }
  }

  console.log(`OpenBuilder — Joining meeting: ${meetUrl}`);
  console.log(` Bot name: ${botName}`);
  console.log(` Capture mode: ${useAudioCapture ? "audio (PulseAudio + Whisper)" : "captions (DOM scraping)"}`);
  console.log(` Camera: ${noCamera ? "off" : "on"}, Mic: ${noMic ? "off" : "on"}`);
  if (effectiveDurationMs) {
    console.log(` Max duration: ${Math.round(effectiveDurationMs / 60_000)}m`);
  }

  // playwright-core is loaded lazily so the CLI can give a friendly error.
  let pw: PlaywrightMod;
  try {
    pw = await import("playwright-core");
  } catch {
    console.error("playwright-core not found. Run `npm install` or use `npx openbuilder join ...`.");
    process.exit(1);
  }

  const hasAuth = !noAuth && existsSync(AUTH_FILE);
  if (noAuth) {
    console.log(" Joining as guest (--anon)");
  } else if (hasAuth) {
    console.log(` Using saved auth: ${AUTH_FILE}`);
  } else {
    console.log(" No auth.json found — joining as guest (run `npx openbuilder auth` to sign in)");
  }

  // Flags chosen to reduce automation fingerprinting and auto-grant media.
  const chromiumArgs = [
    "--disable-blink-features=AutomationControlled",
    "--no-first-run",
    "--no-default-browser-check",
    "--disable-sync",
    "--use-fake-ui-for-media-stream",
    "--use-fake-device-for-media-stream",
    "--auto-select-desktop-capture-source=Entire screen",
    "--autoplay-policy=no-user-gesture-required",
    "--disable-dev-shm-usage",
    "--window-size=1280,720",
  ];

  if (!headed) {
    chromiumArgs.push("--headless=new", "--disable-gpu");
  }

  // Launch browser and attempt to join (up to 3 retries with fresh contexts)
  let context: BrowserContext;
  let page: Page;

  if (hasAuth) {
    // For audio capture, use full Chrome (not headless-shell) with headless: false
    // so the browser actually outputs audio to PulseAudio. Xvfb provides the virtual display.
    const useFullChrome = useAudioCapture;
    let fullChromePath: string | undefined;
    if (useFullChrome) {
      try {
        const result = execSync(
          'find ~/.cache/ms-playwright/chromium-*/chrome-linux64 -name "chrome" -type f | head -1',
          { encoding: "utf-8" },
        ).trim();
        fullChromePath = result || undefined;
      } catch { /* fallback to default */ }
    }
    if (useFullChrome && fullChromePath) {
      console.log(` Using full Chrome for audio: ${fullChromePath}`);
    }

    const browser = await pw.chromium.launch({
      headless: useFullChrome ? false : !headed,
      ...(fullChromePath ? { executablePath: fullChromePath } : {}),
      args: chromiumArgs,
      ignoreDefaultArgs: ["--enable-automation", "--mute-audio"],
    });
    context = await browser.newContext({
      storageState: AUTH_FILE,
      viewport: { width: 1280, height: 720 },
      permissions: ["camera", "microphone"],
      userAgent:
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    });
    page = await context.newPage();
  } else {
    // Guest path: persistent profile so Meet sees a stable browser identity.
    const userDataDir = join(OPENBUILDER_DIR, "chrome-profile");
    mkdirSync(userDataDir, { recursive: true });

    // When using audio capture, use full Chrome (not headless-shell) with headless: false
    // so the browser actually outputs audio. Xvfb provides the virtual display.
    const useFullChrome = useAudioCapture;
    const fullChromePath = useFullChrome
      ? (() => {
          try {
            const result = execSync(
              'find ~/.cache/ms-playwright/chromium-*/chrome-linux64 -name "chrome" -type f | head -1',
              { encoding: "utf-8" },
            ).trim();
            return result || undefined;
          } catch {
            return undefined;
          }
        })()
      : undefined;

    if (useFullChrome && fullChromePath) {
      console.log(` Using full Chrome for audio: ${fullChromePath}`);
    }

    context = await pw.chromium.launchPersistentContext(userDataDir, {
      headless: useFullChrome ? false : true,
      ...(fullChromePath ? { executablePath: fullChromePath } : {}),
      args: chromiumArgs,
      ignoreDefaultArgs: ["--enable-automation", "--mute-audio"],
      viewport: { width: 1280, height: 720 },
      permissions: ["camera", "microphone"],
      userAgent:
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    } as Record<string, unknown>);
    page = context.pages()[0] ?? (await context.newPage());
  }

  // Inject anti-detection shims before any page script runs.
  await context.addInitScript(STEALTH_SCRIPT);

  const MAX_JOIN_RETRIES = 2;
  let currentContext = context;
  let currentPage = page;
  let joined = false;

  // Only send essential status messages to avoid Telegram "typing" spam

  for (let attempt = 1; attempt <= MAX_JOIN_RETRIES; attempt++) {
    console.log(`\nNavigating to meeting... (attempt ${attempt}/${MAX_JOIN_RETRIES})`);
    await currentPage.goto(meetUrl, { waitUntil: "domcontentloaded", timeout: 30_000 });
    await currentPage.waitForTimeout(3000);

    await dismissOverlays(currentPage);

    // Only check for blocks on retry attempts (not first attempt)
    if (attempt > 1 && await isBlockedFromJoining(currentPage)) {
      console.warn(` Blocked: "You can't join this video call" (attempt ${attempt})`);

      if (attempt < MAX_JOIN_RETRIES) {
        console.log(` Waiting 60s before retrying...`);
        await currentPage.waitForTimeout(60 * 1000);
        // Just reload the page, don't create fresh context
        continue;
      }

      // Out of retries: capture evidence, notify, and bail.
      const screenshotPath = join(WORKSPACE_DIR, "debug-join-failed.png");
      await currentPage.screenshot({ path: screenshotPath, fullPage: true });
      console.error(`[OPENBUILDER_DEBUG_IMAGE] ${screenshotPath}`);
      sendImage({
        channel,
        target,
        message: "Blocked from joining after multiple attempts. Here's what the bot saw:",
        mediaPath: screenshotPath,
      });
      await currentContext.close();
      throw new Error(
        `Blocked from joining after ${MAX_JOIN_RETRIES} attempts. Debug screenshot: ${screenshotPath}`,
      );
    }

    // First attempt: wait longer before checking for blocks (let page fully load)
    if (attempt === 1) {
      await currentPage.waitForTimeout(5000); // Extra 5s for page to settle
      const earlyBlockCheck = await isBlockedFromJoining(currentPage).catch(() => false);

      if (earlyBlockCheck) {
        console.log(" Page shows 'can't join' — waiting 15s and reloading...");
        await currentPage.waitForTimeout(15000);
        await currentPage.reload({ waitUntil: "domcontentloaded" });
        await currentPage.waitForTimeout(5000);
        await dismissOverlays(currentPage);
      }
    }

    // Pre-join setup: display name + camera/mic toggles.
    await enterNameIfNeeded(currentPage, botName);
    await disableMediaOnPreJoin(currentPage, { noCamera, noMic });
    await currentPage.waitForTimeout(1000);

    console.log("\nAttempting to join...");
    joined = await clickJoinButton(currentPage);

    // Handle 2-step join preview
    if (joined) {
      await currentPage.waitForTimeout(2000);
      try {
        const secondJoin = currentPage.locator('button:has-text("Join now")').first();
        if (await secondJoin.isVisible({ timeout: 2000 })) {
          await secondJoin.click();
          console.log(" Clicked second join button (2-step preview)");
        }
      } catch {
        // Single-step flow
      }
    }

    if (joined) {
      registerScreenshotHandler(currentPage);
      sendMessage({ channel, target, message: `Joining meeting — please admit "${botName}" if prompted` });
      try {
        // Blocks until admitted into the meeting (or throws on rejection).
        await waitUntilInMeeting(currentPage);
        break;
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(` Post-join block: ${msg} (attempt ${attempt})`);
        joined = false;
      }
    }

    if (attempt < MAX_JOIN_RETRIES) {
      console.log(` Waiting 60s before retrying...`);
      await new Promise(r => setTimeout(r, 60 * 1000));
      // Just reload the page, don't create fresh context to avoid bot detection
    }
  }

  if (!joined) {
    // All attempts exhausted: save a debug screenshot, notify, and throw.
    const screenshotPath = join(WORKSPACE_DIR, "debug-join-failed.png");
    await currentPage.screenshot({ path: screenshotPath, fullPage: true }).catch(() => {});
    console.error("Could not join the meeting after all attempts.");
    console.error(`[OPENBUILDER_DEBUG_IMAGE] ${screenshotPath}`);
    sendImage({
      channel,
      target,
      message: "Could not join the meeting. Here is what the bot saw:",
      mediaPath: screenshotPath,
    });
    await currentContext.close();
    throw new Error(
      `Failed to join after ${MAX_JOIN_RETRIES} attempts. Debug screenshot: ${screenshotPath}`,
    );
  }

  // Successfully joined
  const successScreenshotPath = join(WORKSPACE_DIR, "joined-meeting.png");
  await currentPage.screenshot({ path: successScreenshotPath });
  console.log("\nSuccessfully joined the meeting!");
  console.log(`[OPENBUILDER_JOINED] ${meetUrl}`);
  console.log(`[OPENBUILDER_SUCCESS_IMAGE] ${successScreenshotPath}`);
  sendImage({
    channel,
    target,
    message: "Successfully joined the meeting!",
    mediaPath: successScreenshotPath,
  });

  await dismissPostJoinDialogs(currentPage);

  // Fresh (truncated) transcript file for this meeting.
  mkdirSync(TRANSCRIPTS_DIR, { recursive: true });
  const transcriptPath = join(TRANSCRIPTS_DIR, `${meetingId}.txt`);
  writeFileSync(transcriptPath, "");

  let reason: string;

  if (useAudioCapture) {
    // ── Audio capture mode ──────────────────────────────────────────
    console.log("Starting audio capture pipeline...");

    const { startAudioPipeline } = await import("../src/audio/pipeline.js");
    const pipeline = await startAudioPipeline({
      sinkName: audioSinkName,
      transcriptPath,
      apiKey: config.openaiApiKey,
      whisperModel: config.whisperModel,
      verbose,
    });

    console.log("Waiting in meeting... (Ctrl+C to leave)");
    reason = await waitForMeetingEnd(currentPage, {
      durationMs: effectiveDurationMs,
      captionIdleTimeoutMs: 10 * 60_000,
      getLastCaptionAt: pipeline.getLastTranscriptAt,
      verbose,
    });
    console.log(`\nLeaving meeting: ${reason}`);

    await pipeline.stop();
    pipeline.cleanup();
    // Clean up PULSE_SINK env
    delete process.env.PULSE_SINK;
  } else {
    // ── Caption scraping mode (fallback) ────────────────────────────
    await enableCaptions(currentPage);

    const { cleanup: cleanupCaptions, getLastCaptionAt } = await setupCaptionCapture(
      currentPage,
      transcriptPath,
      verbose,
    );

    console.log("Waiting in meeting... (Ctrl+C to leave)");
    reason = await waitForMeetingEnd(currentPage, {
      durationMs: effectiveDurationMs,
      captionIdleTimeoutMs: 10 * 60_000,
      getLastCaptionAt,
      verbose,
    });
    console.log(`\nLeaving meeting: ${reason}`);

    cleanupCaptions();
  }

  if (existsSync(transcriptPath)) {
    const content = readFileSync(transcriptPath, "utf-8").trim();
    console.log(`[OPENBUILDER_TRANSCRIPT] ${transcriptPath}`);
    sendMessage({ channel, target, message: `Meeting ended (${reason}). Transcript saved.` });

    // Auto-generate report if API key is configured
    if (!noReport && content) {
      await generateAutoReport(transcriptPath, meetingId, channel, target);
    }
  } else {
    sendMessage({ channel, target, message: `Meeting ended (${reason}). No transcript was captured.` });
  }

  return { context: currentContext, page: currentPage, reason };
}
1597
+
1598
+ // ── CLI entry ──────────────────────────────────────────────────────────
1599
+ async function main() {
1600
+ const opts = parseArgs();
1601
+ const { context } = await joinMeeting(opts);
1602
+ await context.close();
1603
+ cleanupPidFile();
1604
+ console.log("Done.");
1605
+ }
1606
+
1607
+ const isMain = process.argv[1]?.endsWith("builder-join.ts");
1608
+ if (isMain) {
1609
+ main().catch((err) => {
1610
+ console.error("Fatal:", err instanceof Error ? err.message : String(err));
1611
+ process.exit(1);
1612
+ });
1613
+ }