@askjo/camofox-browser 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Dockerfile CHANGED
@@ -31,8 +31,14 @@ RUN apt-get update && apt-get install -y \
31
31
  ca-certificates \
32
32
  curl \
33
33
  unzip \
34
+ # yt-dlp runtime dependency
35
+ python3-minimal \
34
36
  && rm -rf /var/lib/apt/lists/*
35
37
 
38
+ # Install yt-dlp for YouTube transcript extraction (no browser needed)
39
+ RUN curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp \
40
+ && chmod +x /usr/local/bin/yt-dlp
41
+
36
42
  # Pre-bake Camoufox browser binary into image
37
43
  # This avoids downloading at runtime and pins the version
38
44
  # Note: unzip returns exit code 1 for warnings (Unicode filenames), so we use || true and verify
package/README.md CHANGED
@@ -37,9 +37,20 @@ This project wraps that engine in a REST API built for agents: accessibility sna
37
37
  - **Cookie Import** - inject Netscape-format cookie files for authenticated browsing
38
38
  - **Proxy + GeoIP** - route traffic through residential proxies with automatic locale/timezone
39
39
  - **Structured Logging** - JSON log lines with request IDs for production observability
40
+ - **YouTube Transcripts** - extract captions from any YouTube video via yt-dlp, no API key needed
40
41
  - **Search Macros** - `@google_search`, `@youtube_search`, `@amazon_search`, `@reddit_subreddit`, and 10 more
42
+ - **Snapshot Screenshots** - include a base64 PNG screenshot alongside the accessibility snapshot
43
+ - **Large Page Handling** - automatic snapshot truncation with offset-based pagination
41
44
  - **Deploy Anywhere** - Docker, Fly.io, Railway
42
45
 
46
+ ## Optional Dependencies
47
+
48
+ | Dependency | Purpose | Install |
49
+ |-----------|---------|---------|
50
+ | [yt-dlp](https://github.com/yt-dlp/yt-dlp) | YouTube transcript extraction (fast path) | `pip install yt-dlp` or `brew install yt-dlp` |
51
+
52
+ The Docker image includes yt-dlp. For local dev, install it for the `/youtube/transcript` endpoint. Without it, the endpoint falls back to a slower browser-based method.
53
+
43
54
  ## Quick Start
44
55
 
45
56
  ### OpenClaw Plugin
@@ -252,7 +263,7 @@ curl -X POST http://localhost:9377/tabs/TAB_ID/navigate \
252
263
 
253
264
  | Method | Endpoint | Description |
254
265
  |--------|----------|-------------|
255
- | `GET` | `/tabs/:id/snapshot` | Accessibility snapshot with element refs |
266
+ | `GET` | `/tabs/:id/snapshot` | Accessibility snapshot with element refs. Query params: `includeScreenshot=true` (add base64 PNG), `offset=N` (paginate large snapshots) |
256
267
  | `POST` | `/tabs/:id/click` | Click element by ref or CSS selector |
257
268
  | `POST` | `/tabs/:id/type` | Type text into element |
258
269
  | `POST` | `/tabs/:id/press` | Press a keyboard key |
@@ -265,6 +276,21 @@ curl -X POST http://localhost:9377/tabs/TAB_ID/navigate \
265
276
  | `POST` | `/tabs/:id/forward` | Go forward |
266
277
  | `POST` | `/tabs/:id/refresh` | Refresh page |
267
278
 
279
+ ### YouTube Transcript
280
+
281
+ | Method | Endpoint | Description |
282
+ |--------|----------|-------------|
283
+ | `POST` | `/youtube/transcript` | Extract captions from a YouTube video |
284
+
285
+ ```bash
286
+ curl -X POST http://localhost:9377/youtube/transcript \
287
+ -H 'Content-Type: application/json' \
288
+ -d '{"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "languages": ["en"]}'
289
+ # → { "status": "ok", "transcript": "[00:18] ♪ We're no strangers to love ♪\n...", "video_title": "...", "total_words": 548 }
290
+ ```
291
+
292
+ Uses [yt-dlp](https://github.com/yt-dlp/yt-dlp) when available (fast, no browser needed). Falls back to a browser-based intercept method if yt-dlp is not installed — this is slower and less reliable due to YouTube ad pre-rolls.
293
+
268
294
  ### Server
269
295
 
270
296
  | Method | Endpoint | Description |
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Snapshot windowing — truncate large accessibility snapshots while
3
+ * preserving pagination/navigation links at the tail.
4
+ */
5
+
6
const MAX_SNAPSHOT_CHARS = 80000; // ~20K tokens
const SNAPSHOT_TAIL_CHARS = 5000; // keep last ~5K for pagination/nav links

/**
 * Return a window of the snapshot YAML.
 *
 * Snapshots of at most MAX_SNAPSHOT_CHARS are returned whole. Larger ones are
 * split into a content chunk plus the last SNAPSHOT_TAIL_CHARS characters, so
 * that pagination/navigation refs at the bottom of the page are present in
 * every window.
 *
 * @param {string} yaml - Full snapshot text.
 * @param {number} [offset=0] - Character offset of the content chunk. Values
 *   that are not finite numbers are treated as 0; out-of-range values are
 *   clamped to the start of the tail region.
 * @returns {{text: string, truncated: boolean, totalChars: number,
 *   offset: number, hasMore: boolean, nextOffset: (number|null)}}
 *   `nextOffset` is the offset to request next when `hasMore` is true.
 */
function windowSnapshot(yaml, offset = 0) {
  if (!yaml) {
    return { text: '', truncated: false, totalChars: 0, offset: 0, hasMore: false, nextOffset: null };
  }
  const total = yaml.length;
  if (total <= MAX_SNAPSHOT_CHARS) {
    return { text: yaml, truncated: false, totalChars: total, offset: 0, hasMore: false, nextOffset: null };
  }

  const contentBudget = MAX_SNAPSHOT_CHARS - SNAPSHOT_TAIL_CHARS - 200; // room for marker
  const tailStart = total - SNAPSHOT_TAIL_CHARS;
  const tail = yaml.slice(tailStart);

  // NaN would otherwise propagate through Math.min/Math.max and produce an
  // empty chunk; numeric strings are still accepted via Number().
  const numericOffset = Number(offset);
  const requestedOffset = Number.isFinite(numericOffset) ? Math.floor(numericOffset) : 0;
  const clampedOffset = Math.min(Math.max(0, requestedOffset), tailStart);

  // Stop the chunk where the tail begins: the tail is always appended, so
  // letting the chunk run into it would emit the same text twice.
  const chunkEnd = Math.min(clampedOffset + contentBudget, tailStart);
  const chunk = yaml.slice(clampedOffset, chunkEnd);
  const hasMore = chunkEnd < tailStart;

  const marker = hasMore
    ? `\n[... truncated at char ${chunkEnd} of ${total}. Call snapshot with offset=${chunkEnd} to see more. Pagination links below. ...]\n`
    : '\n';

  return {
    text: chunk + marker + tail,
    truncated: true,
    totalChars: total,
    offset: clampedOffset,
    hasMore,
    nextOffset: hasMore ? chunkEnd : null,
  };
}
40
+
41
+ module.exports = { windowSnapshot, MAX_SNAPSHOT_CHARS, SNAPSHOT_TAIL_CHARS };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@askjo/camofox-browser",
3
- "version": "1.2.0",
3
+ "version": "1.3.0",
4
4
  "description": "Headless browser automation server and OpenClaw plugin for AI agents - anti-detection, element refs, and session isolation",
5
5
  "main": "server.js",
6
6
  "license": "MIT",
@@ -26,7 +26,9 @@
26
26
  "clawdbot",
27
27
  "moltbot",
28
28
  "playwright",
29
- "firefox"
29
+ "firefox",
30
+ "youtube",
31
+ "transcript"
30
32
  ],
31
33
  "engines": {
32
34
  "node": ">=18"
@@ -63,6 +65,7 @@
63
65
  "puppeteer-extra-plugin-stealth": "^2.11.2"
64
66
  },
65
67
  "devDependencies": {
66
- "jest": "^29.7.0"
68
+ "jest": "^29.7.0",
69
+ "pngjs": "^7.0.0"
67
70
  }
68
71
  }
package/plugin.ts CHANGED
@@ -37,7 +37,7 @@ interface PluginConfig {
37
37
  }
38
38
 
39
39
  interface ToolResult {
40
- content: Array<{ type: string; text: string }>;
40
+ content: Array<{ type: string; text?: string; data?: string; mimeType?: string }>;
41
41
  }
42
42
 
43
43
  interface HealthCheckResult {
@@ -238,19 +238,30 @@ export default function register(api: PluginApi) {
238
238
  api.registerTool((ctx: ToolContext) => ({
239
239
  name: "camofox_snapshot",
240
240
  description:
241
- "Get accessibility snapshot of a Camoufox page with element refs (e1, e2, etc.) for interaction. Use with camofox_create_tab.",
241
+ "Get accessibility snapshot of a Camoufox page with element refs (e1, e2, etc.) for interaction, plus a visual screenshot. " +
242
+ "Large pages are truncated with pagination links preserved at the bottom. " +
243
+ "If the response includes hasMore=true and nextOffset, call again with that offset to see more content.",
242
244
  parameters: {
243
245
  type: "object",
244
246
  properties: {
245
247
  tabId: { type: "string", description: "Tab identifier" },
248
+ offset: { type: "number", description: "Character offset for paginated snapshots. Use nextOffset from a previous truncated response." },
246
249
  },
247
250
  required: ["tabId"],
248
251
  },
249
252
  async execute(_id, params) {
250
- const { tabId } = params as { tabId: string };
253
+ const { tabId, offset } = params as { tabId: string; offset?: number };
251
254
  const userId = ctx.agentId || fallbackUserId;
252
- const result = await fetchApi(baseUrl, `/tabs/${tabId}/snapshot?userId=${userId}`);
253
- return toToolResult(result);
255
+ const qs = offset ? `&offset=${offset}` : '';
256
+ const result = await fetchApi(baseUrl, `/tabs/${tabId}/snapshot?userId=${userId}&includeScreenshot=true${qs}`) as Record<string, unknown>;
257
+ const content: ToolResult["content"] = [
258
+ { type: "text", text: JSON.stringify({ url: result.url, refsCount: result.refsCount, snapshot: result.snapshot, truncated: result.truncated, totalChars: result.totalChars, hasMore: result.hasMore, nextOffset: result.nextOffset }, null, 2) },
259
+ ];
260
+ const screenshot = result.screenshot as { data?: string; mimeType?: string } | undefined;
261
+ if (screenshot?.data) {
262
+ content.push({ type: "image", data: screenshot.data, mimeType: screenshot.mimeType || "image/png" });
263
+ }
264
+ return { content };
254
265
  },
255
266
  }));
256
267
 
package/server.js CHANGED
@@ -5,6 +5,7 @@ const crypto = require('crypto');
5
5
  const os = require('os');
6
6
  const { expandMacro } = require('./lib/macros');
7
7
  const { loadConfig } = require('./lib/config');
8
+ const { windowSnapshot } = require('./lib/snapshot');
8
9
 
9
10
  const CONFIG = loadConfig();
10
11
 
@@ -175,9 +176,14 @@ const SESSION_TIMEOUT_MS = parseInt(process.env.SESSION_TIMEOUT_MS) || 1800000;
175
176
  const MAX_SNAPSHOT_NODES = 500;
176
177
  const MAX_SESSIONS = parseInt(process.env.MAX_SESSIONS) || 50;
177
178
  const MAX_TABS_PER_SESSION = parseInt(process.env.MAX_TABS_PER_SESSION) || 10;
179
+ const MAX_TABS_GLOBAL = parseInt(process.env.MAX_TABS_GLOBAL) || 10;
178
180
  const HANDLER_TIMEOUT_MS = parseInt(process.env.HANDLER_TIMEOUT_MS) || 30000;
179
181
  const MAX_CONCURRENT_PER_USER = parseInt(process.env.MAX_CONCURRENT_PER_USER) || 3;
180
182
  const PAGE_CLOSE_TIMEOUT_MS = 5000;
183
+ const NAVIGATE_TIMEOUT_MS = parseInt(process.env.NAVIGATE_TIMEOUT_MS) || 25000;
184
+ const BUILDREFS_TIMEOUT_MS = parseInt(process.env.BUILDREFS_TIMEOUT_MS) || 12000;
185
+ const FAILURE_THRESHOLD = 3;
186
+ const TAB_LOCK_TIMEOUT_MS = 30000;
181
187
 
182
188
  // Per-tab locks to serialize operations on the same tab
183
189
  // tabId -> Promise (the currently executing operation)
@@ -188,9 +194,14 @@ async function withTabLock(tabId, operation) {
188
194
  const pending = tabLocks.get(tabId);
189
195
  if (pending) {
190
196
  try {
191
- await pending;
197
+ await Promise.race([
198
+ pending,
199
+ new Promise((_, reject) => setTimeout(() => reject(new Error('Tab lock timeout')), TAB_LOCK_TIMEOUT_MS))
200
+ ]);
192
201
  } catch (e) {
193
- // Previous operation failed, continue anyway
202
+ if (e.message === 'Tab lock timeout') {
203
+ log('warn', 'tab lock timeout, proceeding', { tabId });
204
+ }
194
205
  }
195
206
  }
196
207
 
@@ -233,9 +244,13 @@ async function withUserLimit(userId, operation) {
233
244
  });
234
245
  }
235
246
  state.active++;
247
+ healthState.activeOps++;
236
248
  try {
237
- return await operation();
249
+ const result = await operation();
250
+ healthState.lastSuccessfulNav = Date.now();
251
+ return result;
238
252
  } finally {
253
+ healthState.activeOps--;
239
254
  state.active--;
240
255
  if (state.queue.length > 0) {
241
256
  const next = state.queue.shift();
@@ -307,6 +322,59 @@ function clearBrowserIdleTimer() {
307
322
  }
308
323
  }
309
324
 
325
+ // --- Browser health tracking ---
326
/**
 * Process-wide browser health counters.
 *  - consecutiveNavFailures: failures recorded since the last success/restart
 *  - lastSuccessfulNav: epoch ms of the most recent recorded success
 *  - isRecovering: true while restartBrowser() is running
 *  - activeOps: operations currently in flight
 */
const healthState = {
  consecutiveNavFailures: 0,
  lastSuccessfulNav: Date.now(),
  isRecovering: false,
  activeOps: 0,
};

/** Reset the failure streak and stamp the time of this success. */
function recordNavSuccess() {
  Object.assign(healthState, {
    consecutiveNavFailures: 0,
    lastSuccessfulNav: Date.now(),
  });
}

/**
 * Count one more consecutive failure.
 * @returns {boolean} true once the streak has reached FAILURE_THRESHOLD.
 */
function recordNavFailure() {
  healthState.consecutiveNavFailures += 1;
  const streak = healthState.consecutiveNavFailures;
  return streak >= FAILURE_THRESHOLD;
}
342
+
343
/**
 * Tear down every session context and the browser, then launch a new browser.
 *
 * Re-entrant calls made while a restart is already running return immediately.
 * Failures during the restart are logged rather than thrown, and the
 * `isRecovering` flag is always cleared on the way out.
 *
 * @param {string} reason - Included in the log line for the restart.
 */
async function restartBrowser(reason) {
  if (healthState.isRecovering) return;

  healthState.isRecovering = true;
  log('error', 'restarting browser', { reason, failures: healthState.consecutiveNavFailures });

  const ignoreCloseError = () => {};
  try {
    // Close contexts one at a time; a context that fails to close is ignored.
    for (const session of sessions.values()) {
      await session.context.close().catch(ignoreCloseError);
    }
    sessions.clear();

    if (browser) {
      await browser.close().catch(ignoreCloseError);
      browser = null;
    }
    browserLaunchPromise = null;

    await ensureBrowser();

    Object.assign(healthState, {
      consecutiveNavFailures: 0,
      lastSuccessfulNav: Date.now(),
    });
    log('info', 'browser restarted successfully');
  } catch (err) {
    log('error', 'browser restart failed', { error: err.message });
  } finally {
    healthState.isRecovering = false;
  }
}
367
+
368
/**
 * Count the tabs held across all sessions.
 * @returns {number} Sum of the sizes of every tab group in every session.
 */
function getTotalTabCount() {
  let count = 0;
  sessions.forEach((session) => {
    session.tabGroups.forEach((group) => {
      count += group.size;
    });
  });
  return count;
}
377
+
310
378
  async function launchBrowserInstance() {
311
379
  const hostOS = getHostOS();
312
380
  const proxy = buildProxyConfig();
@@ -406,7 +474,8 @@ function createTabState(page) {
406
474
  page,
407
475
  refs: new Map(),
408
476
  visitedUrls: new Set(),
409
- toolCalls: 0
477
+ toolCalls: 0,
478
+ lastSnapshot: null,
410
479
  };
411
480
  }
412
481
 
@@ -507,19 +576,47 @@ async function buildRefs(page) {
507
576
  return refs;
508
577
  }
509
578
 
579
+ const start = Date.now();
580
+
581
+ // Hard total timeout on the entire buildRefs operation
582
+ const timeoutPromise = new Promise((_, reject) =>
583
+ setTimeout(() => reject(new Error('buildRefs_timeout')), BUILDREFS_TIMEOUT_MS)
584
+ );
585
+
586
+ try {
587
+ return await Promise.race([
588
+ _buildRefsInner(page, refs, start),
589
+ timeoutPromise
590
+ ]);
591
+ } catch (err) {
592
+ if (err.message === 'buildRefs_timeout') {
593
+ log('warn', 'buildRefs: total timeout exceeded', { elapsed: Date.now() - start });
594
+ return refs;
595
+ }
596
+ throw err;
597
+ }
598
+ }
599
+
600
+ async function _buildRefsInner(page, refs, start) {
510
601
  await waitForPageReady(page, { waitForNetwork: false });
511
602
 
512
- // Get ARIA snapshot including shadow DOM content
513
- // Playwright's ariaSnapshot already traverses shadow roots, but we also
514
- // inject a script to collect shadow DOM elements for additional coverage
603
+ // Budget remaining time for ariaSnapshot
604
+ const elapsed = Date.now() - start;
605
+ const remaining = BUILDREFS_TIMEOUT_MS - elapsed;
606
+ if (remaining < 2000) {
607
+ log('warn', 'buildRefs: insufficient time for ariaSnapshot', { elapsed });
608
+ return refs;
609
+ }
610
+
515
611
  let ariaYaml;
516
612
  try {
517
- ariaYaml = await page.locator('body').ariaSnapshot({ timeout: 5000 });
613
+ ariaYaml = await page.locator('body').ariaSnapshot({ timeout: Math.min(remaining - 1000, 5000) });
518
614
  } catch (err) {
519
615
  log('warn', 'ariaSnapshot failed, retrying');
616
+ const retryBudget = BUILDREFS_TIMEOUT_MS - (Date.now() - start);
617
+ if (retryBudget < 2000) return refs;
520
618
  try {
521
- await page.waitForLoadState('load', { timeout: 3000 }).catch(() => {});
522
- ariaYaml = await page.locator('body').ariaSnapshot({ timeout: 5000 });
619
+ ariaYaml = await page.locator('body').ariaSnapshot({ timeout: Math.min(retryBudget - 500, 5000) });
523
620
  } catch (retryErr) {
524
621
  log('warn', 'ariaSnapshot retry failed, returning empty refs', { error: retryErr.message });
525
622
  return refs;
@@ -593,15 +690,314 @@ function refToLocator(page, ref, refs) {
593
690
  return locator;
594
691
  }
595
692
 
596
- // Health check (passive does not launch browser)
693
+ // --- YouTube transcript extraction via yt-dlp ---
694
+ // POST /youtube/transcript { url, languages? }
695
+ // Uses yt-dlp to extract subtitles — no browser needed, no ads, no playback.
696
+ // yt-dlp handles YouTube's signed caption URLs correctly.
697
+ // Falls back to Camoufox page intercept if yt-dlp is not installed.
698
+
699
+ const { execFile } = require('child_process');
700
+ const { mkdtemp, readFile, readdir, rm } = require('fs/promises');
701
+ const { tmpdir } = require('os');
702
+ const { join } = require('path');
703
+
704
+ // Detect yt-dlp binary at startup
705
// Path of the first yt-dlp binary that answered `--version`, or null when none
// did (or while the startup probe below has not finished yet).
let ytDlpPath = null;
(async () => {
  // Resolve with the trimmed version string, reject if the binary cannot run.
  const probeVersion = (binary) => new Promise((resolve, reject) => {
    execFile(binary, ['--version'], { timeout: 5000 }, (err, stdout) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(stdout.trim());
    });
  });

  const candidates = ['yt-dlp', '/usr/local/bin/yt-dlp', '/usr/bin/yt-dlp'];
  for (const candidate of candidates) {
    try {
      await probeVersion(candidate);
      ytDlpPath = candidate;
      log('info', 'yt-dlp found', { path: candidate });
      break;
    } catch {}
  }

  if (!ytDlpPath) log('warn', 'yt-dlp not found — YouTube transcript endpoint will use browser fallback');
})();
722
+
723
/**
 * POST /youtube/transcript { url, languages? }
 *
 * Validates the request, extracts the 11-character video id, and delegates to
 * ytDlpTranscript() when a yt-dlp binary was detected, otherwise to
 * browserTranscript(). Only the first entry of `languages` is used.
 *
 * Responds 400 for a missing/invalid url or language, 500 on unexpected
 * failure; otherwise the strategy's result object is returned as JSON.
 */
app.post('/youtube/transcript', async (req, res) => {
  const reqId = req.reqId;
  try {
    // Tolerate a missing body instead of failing the destructuring with a 500.
    const { url, languages } = req.body || {};
    if (!url) return res.status(400).json({ error: 'url is required' });
    if (typeof url !== 'string') return res.status(400).json({ error: 'url must be a string' });

    const urlErr = validateUrl(url);
    if (urlErr) return res.status(400).json({ error: urlErr });

    const videoIdMatch = url.match(
      /(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})/
    );
    if (!videoIdMatch) {
      return res.status(400).json({ error: 'Could not extract YouTube video ID from URL' });
    }
    const videoId = videoIdMatch[1];

    // Accept a bare string as a one-element list (indexing it would otherwise
    // yield just its first character); anything that is not a list falls back
    // to the default language.
    const requestedLanguages = typeof languages === 'string' ? [languages] : languages;
    const firstLanguage = Array.isArray(requestedLanguages) ? requestedLanguages[0] : undefined;
    const lang = firstLanguage || 'en';
    if (typeof lang !== 'string') {
      // A non-string would otherwise surface later as a process-spawn TypeError.
      return res.status(400).json({ error: 'languages must be an array of language codes' });
    }

    log('info', 'youtube transcript: starting', { reqId, videoId, lang, method: ytDlpPath ? 'yt-dlp' : 'browser' });

    const result = ytDlpPath
      ? await ytDlpTranscript(reqId, url, videoId, lang)
      : await browserTranscript(reqId, url, videoId, lang);

    log('info', 'youtube transcript: done', { reqId, videoId, status: result.status, words: result.total_words });
    res.json(result);
  } catch (err) {
    log('error', 'youtube transcript failed', { reqId, error: err.message, stack: err.stack });
    res.status(500).json({ error: safeError(err) });
  }
});
757
+
758
+ // Strategy 1: yt-dlp (preferred — fast, no browser, no ads)
759
/**
 * Extract a transcript by running yt-dlp twice: once to print the title and
 * once to write the subtitle file into a throwaway temp directory, which is
 * removed before returning.
 *
 * @param {string} reqId - Request id (kept for signature parity; not used here).
 * @param {string} url - Video URL handed to yt-dlp.
 * @param {string} videoId - 11-character video id, used to pick the subtitle file.
 * @param {string} lang - Subtitle language passed to `--sub-lang`.
 * @returns {Promise<object>} `{ status: 'ok', transcript, ... }` on success, or
 *   `{ status: 'error', code: 404, ... }` when no usable captions were produced.
 * @throws {Error} When either yt-dlp invocation fails or times out.
 */
async function ytDlpTranscript(reqId, url, videoId, lang) {
  const tmpDir = await mkdtemp(join(tmpdir(), 'yt-'));
  try {
    // Step 1: Get title via --print (fast, no download).
    // --no-playlist: a watch URL that also carries a list= parameter must not
    // expand into the whole playlist. `--` ends option parsing so the URL can
    // never be read as a flag.
    const title = await new Promise((resolve, reject) => {
      execFile(ytDlpPath, [
        '--skip-download', '--no-warnings', '--no-playlist',
        '--print', '%(title)s',
        '--', url,
      ], { timeout: 15000 }, (err, stdout) => {
        if (err) return reject(new Error(`yt-dlp metadata failed: ${err.message}`));
        resolve(stdout.trim().split('\n')[0] || '');
      });
    });

    // Step 2: Download subtitles to temp dir
    await new Promise((resolve, reject) => {
      execFile(ytDlpPath, [
        '--skip-download', '--no-playlist',
        '--write-sub', '--write-auto-sub',
        '--sub-lang', lang,
        '--sub-format', 'json3',
        '-o', join(tmpDir, '%(id)s'),
        '--', url,
      ], { timeout: 30000 }, (err, stdout, stderr) => {
        if (err) return reject(new Error(`yt-dlp subtitle download failed: ${err.message}\n${stderr}`));
        resolve();
      });
    });

    // Find the subtitle file. Prefer files named after the requested video,
    // then json3 over vtt over srv3, instead of relying on directory order.
    const subtitleExtensions = ['.json3', '.vtt', '.srv3'];
    const rankFile = (name) =>
      (name.startsWith(`${videoId}.`) ? 0 : subtitleExtensions.length) +
      subtitleExtensions.findIndex((ext) => name.endsWith(ext));
    const subFile = (await readdir(tmpDir))
      .filter((name) => subtitleExtensions.some((ext) => name.endsWith(ext)))
      .sort((a, b) => rankFile(a) - rankFile(b))[0];
    if (!subFile) {
      return {
        status: 'error', code: 404,
        message: 'No captions available for this video',
        video_url: url, video_id: videoId, title,
      };
    }

    const content = await readFile(join(tmpDir, subFile), 'utf8');
    let transcriptText = null;

    if (subFile.endsWith('.json3')) {
      transcriptText = parseJson3(content);
    } else if (subFile.endsWith('.vtt')) {
      transcriptText = parseVtt(content);
    } else {
      transcriptText = parseXml(content);
    }

    if (!transcriptText || !transcriptText.trim()) {
      return {
        status: 'error', code: 404,
        message: 'Subtitle file found but content was empty',
        video_url: url, video_id: videoId, title,
      };
    }

    // Detect language from filename (e.g., dQw4w9WgXcQ.en.json3)
    const langMatch = subFile.match(/\.([a-z]{2}(?:-[a-zA-Z]+)?)\.(?:json3|vtt|srv3)$/);

    return {
      status: 'ok', transcript: transcriptText,
      video_url: url, video_id: videoId, video_title: title,
      language: langMatch?.[1] || lang,
      total_words: transcriptText.split(/\s+/).length,
    };
  } finally {
    // Best-effort cleanup; a failure to remove the temp dir must not mask the result.
    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
  }
}
830
+
831
+ // Strategy 2: Browser fallback — play video, intercept timedtext network response
832
/**
 * Browser fallback: open the video page, start muted playback, and capture
 * the first non-empty `/api/timedtext` response for this video id.
 *
 * Runs under withUserLimit() using the dedicated '__yt_transcript__' user; the
 * page is always closed afterwards.
 *
 * @param {string} reqId - Request id used in log lines.
 * @param {string} url - Video page URL to open.
 * @param {string} videoId - Video id the intercepted caption URL must contain.
 * @param {string} lang - Requested language; reported as `language` only when
 *   the intercepted caption URL does not name one.
 * @returns {Promise<object>} `{ status: 'ok', transcript, ... }` on success, or
 *   `{ status: 'error', code: 404, ... }` when nothing usable was captured.
 */
async function browserTranscript(reqId, url, videoId, lang) {
  return await withUserLimit('__yt_transcript__', async () => {
    await ensureBrowser();
    const session = await getSession('__yt_transcript__');
    const page = await session.context.newPage();

    try {
      // Mute audio
      await page.addInitScript(() => {
        const origPlay = HTMLMediaElement.prototype.play;
        HTMLMediaElement.prototype.play = function() { this.volume = 0; this.muted = true; return origPlay.call(this); };
      });

      // Intercept timedtext responses — filter by video ID to skip ad captions
      let interceptedCaptions = null;
      let interceptedLang = null;
      page.on('response', async (response) => {
        const respUrl = response.url();
        if (interceptedCaptions || !respUrl.includes('/api/timedtext') || !respUrl.includes(`v=${videoId}`)) return;
        try {
          const body = await response.text();
          // Re-check after the await: another response may have been captured
          // while this body was still being read.
          if (body && body.length > 0 && !interceptedCaptions) {
            interceptedCaptions = body;
            // NOTE(review): assumes the caption URL names its language in the
            // `lang` (or `tlang` for translated tracks) query parameter — confirm.
            const params = new URL(respUrl).searchParams;
            interceptedLang = params.get('tlang') || params.get('lang');
          }
        } catch {
          // Best-effort: an unreadable response is skipped and the wait loop
          // below keeps polling for the next one.
        }
      });

      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: NAVIGATE_TIMEOUT_MS });
      await page.waitForTimeout(2000);

      // Extract metadata from ytInitialPlayerResponse
      const meta = await page.evaluate(() => {
        const r = window.ytInitialPlayerResponse || (typeof ytInitialPlayerResponse !== 'undefined' ? ytInitialPlayerResponse : null);
        // Always return a languages array so available_languages has a stable shape.
        if (!r) return { title: '', languages: [] };
        const tracks = r?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
        return {
          title: r?.videoDetails?.title || '',
          languages: tracks.map(t => ({ code: t.languageCode, name: t.name?.simpleText || t.languageCode, kind: t.kind || 'manual' })),
        };
      });

      // Start playback to trigger caption loading
      await page.evaluate(() => {
        const v = document.querySelector('video');
        if (v) { v.muted = true; v.play().catch(() => {}); }
      }).catch(() => {});

      // Wait up to 20s for the target video's captions (may need to sit through an ad)
      for (let i = 0; i < 40 && !interceptedCaptions; i++) {
        await page.waitForTimeout(500);
      }

      if (!interceptedCaptions) {
        return {
          status: 'error', code: 404,
          message: 'No captions loaded during playback (video may have no captions, or ad blocked it)',
          video_url: url, video_id: videoId, title: meta.title,
        };
      }

      log('info', 'youtube transcript: intercepted captions', { reqId, len: interceptedCaptions.length });

      // Sniff the payload format from its content.
      let transcriptText = null;
      if (interceptedCaptions.trimStart().startsWith('{')) transcriptText = parseJson3(interceptedCaptions);
      else if (interceptedCaptions.includes('WEBVTT')) transcriptText = parseVtt(interceptedCaptions);
      else if (interceptedCaptions.includes('<text')) transcriptText = parseXml(interceptedCaptions);

      if (!transcriptText || !transcriptText.trim()) {
        return {
          status: 'error', code: 404,
          message: 'Caption data intercepted but could not be parsed',
          video_url: url, video_id: videoId, title: meta.title,
        };
      }

      return {
        status: 'ok', transcript: transcriptText,
        video_url: url, video_id: videoId, video_title: meta.title,
        // Report the language of the track that was actually captured when known.
        language: interceptedLang || lang,
        total_words: transcriptText.split(/\s+/).length,
        available_languages: meta.languages,
      };
    } finally {
      await safePageClose(page);
    }
  });
}
916
+
917
+ // --- YouTube transcript parsers ---
918
+
919
/**
 * Convert a json3 caption payload into `[mm:ss] text` lines.
 *
 * Events with no segments, or whose joined segment text is blank, are skipped.
 * Minutes are not wrapped at 60, so captions past the first hour read `[62:05]`.
 *
 * @param {string} content - Raw json3 document.
 * @returns {string|null} Newline-joined transcript, or null when the payload
 *   cannot be parsed or walked.
 */
function parseJson3(content) {
  const twoDigits = (value) => String(value).padStart(2, '0');

  try {
    const { events } = JSON.parse(content);
    const transcript = [];

    for (const entry of events || []) {
      const pieces = entry.segs || [];
      if (!pieces.length) continue;

      const caption = pieces.map(({ utf8 }) => utf8 || '').join('').trim();
      if (!caption) continue;

      const wholeSeconds = Math.floor((entry.tStartMs || 0) / 1000);
      const minutes = Math.floor(wholeSeconds / 60);
      const seconds = wholeSeconds % 60;
      transcript.push(`[${twoDigits(minutes)}:${twoDigits(seconds)}] ${caption}`);
    }

    return transcript.join('\n');
  } catch (e) {
    return null;
  }
}
940
+
941
/**
 * Convert a WebVTT document into transcript lines.
 *
 * The first text line after each cue timing line is prefixed with `[mm:ss]`
 * (via formatVttTs); further text lines of the same cue are emitted without a
 * prefix. Header lines (`WEBVTT`, `Kind:`, `Language:`), `NOTE` lines and
 * blank lines are dropped, inline tags are stripped and basic entities decoded.
 *
 * @param {string} content - Raw WebVTT text.
 * @returns {string} Newline-joined transcript ('' when nothing was found).
 */
function parseVtt(content) {
  const result = [];
  let currentTimestamp = '';
  let seenCue = false;

  for (const line of content.split('\n')) {
    const stripped = line.trim();
    if (!stripped) continue;

    // The signature line may carry trailing text ("WEBVTT - title"); treat it
    // as a header only before the first cue so caption text is never dropped.
    const isSignature = stripped === 'WEBVTT' || (!seenCue && /^WEBVTT[ \t]/.test(stripped));
    if (isSignature || stripped.startsWith('Kind:') || stripped.startsWith('Language:') || stripped.startsWith('NOTE')) continue;

    if (stripped.includes(' --> ')) {
      seenCue = true;
      const parts = stripped.split(' --> ');
      if (parts[0]) currentTimestamp = formatVttTs(parts[0].trim());
      continue;
    }

    // Strip tags first, then decode entities. `&amp;` is decoded first, so a
    // double-escaped entity such as `&amp;#39;` collapses all the way to `'`.
    const text = stripped
      .replace(/<[^>]+>/g, '')
      .replace(/&amp;/g, '&')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .trim();
    if (!text) continue;

    if (currentTimestamp) {
      result.push(`[${currentTimestamp}] ${text}`);
      currentTimestamp = ''; // only the first line of a cue gets the timestamp
    } else {
      result.push(text);
    }
  }
  return result.join('\n');
}
959
+
960
/**
 * Convert XML captions into `[mm:ss] text` lines.
 *
 * Two cue shapes are recognised:
 *  - `<text start="SECONDS" ...>…</text>`
 *  - `<p t="MILLISECONDS" ...>…</p>` — the shape used by srv3 files, which
 *    previously never matched and therefore always parsed to an empty string.
 * Cues without a time attribute or without text are skipped. Nested tags are
 * stripped and basic entities decoded.
 *
 * @param {string} content - Raw XML caption document.
 * @returns {string} Newline-joined transcript ('' when no cue matched).
 */
function parseXml(content) {
  const lines = [];
  const cueRegex = /<(text|p)\b([^>]*)>([\s\S]*?)<\/\1>/g;
  let match;
  while ((match = cueRegex.exec(content)) !== null) {
    const [, tagName, attrs, inner] = match;
    const isTextCue = tagName === 'text';

    const timeAttr = (isTextCue ? /\bstart="([^"]*)"/ : /\bt="([^"]*)"/).exec(attrs);
    if (!timeAttr) continue;
    const rawTime = parseFloat(timeAttr[1]) || 0;
    const startSec = isTextCue ? rawTime : rawTime / 1000;

    // Strip tags first, then decode entities. `&amp;` is decoded first, so a
    // double-escaped entity such as `&amp;#39;` collapses all the way to `'`.
    const text = inner
      .replace(/<[^>]+>/g, '')
      .replace(/&amp;/g, '&')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .trim();
    if (!text) continue;

    const mm = Math.floor(startSec / 60);
    const ss = Math.floor(startSec % 60);
    lines.push(`[${String(mm).padStart(2, '0')}:${String(ss).padStart(2, '0')}] ${text}`);
  }
  return lines.join('\n');
}
974
+
975
/**
 * Reduce a WebVTT timestamp to `mm:ss`.
 *
 * `hh:mm:ss.mmm` folds the hours into the minutes (`01:02:03.456` → `62:03`);
 * `mm:ss.mmm` keeps its minutes. The fractional part is dropped. Hour/minute
 * fields that are not numeric count as 0 rather than producing `NaN`. Input
 * without a colon is returned unchanged.
 *
 * @param {string} ts - Timestamp text taken from a cue timing line.
 * @returns {string} `mm:ss`, or `ts` itself when it has no colon.
 */
function formatVttTs(ts) {
  const toInt = (field) => Number.parseInt(field, 10) || 0;
  const wholeSeconds = (field) => (field || '00').split('.')[0];

  const parts = ts.split(':');
  if (parts.length >= 3) {
    const totalMin = toInt(parts[0]) * 60 + toInt(parts[1]);
    return `${String(totalMin).padStart(2, '0')}:${wholeSeconds(parts[2])}`;
  }
  if (parts.length === 2) {
    return `${String(toInt(parts[0])).padStart(2, '0')}:${wholeSeconds(parts[1])}`;
  }
  return ts;
}
988
+
597
989
  app.get('/health', (req, res) => {
990
+ if (healthState.isRecovering) {
991
+ return res.status(503).json({ ok: false, engine: 'camoufox', recovering: true });
992
+ }
598
993
  const running = browser !== null && (browser.isConnected?.() ?? false);
599
994
  res.json({
600
995
  ok: true,
601
996
  engine: 'camoufox',
602
997
  browserConnected: running,
603
998
  browserRunning: running,
604
- sessions: sessions.size,
999
+ activeTabs: getTotalTabCount(),
1000
+ consecutiveFailures: healthState.consecutiveNavFailures,
605
1001
  });
606
1002
  });
607
1003
 
@@ -658,23 +1054,46 @@ app.post('/tabs/:tabId/navigate', async (req, res) => {
658
1054
  let session = sessions.get(normalizeUserId(userId));
659
1055
  let found = session && findTab(session, tabId);
660
1056
 
1057
+ let tabState;
661
1058
  if (!found) {
662
1059
  const resolvedSessionKey = sessionKey || listItemId || 'default';
663
1060
  session = await getSession(userId);
664
- let totalTabs = 0;
665
- for (const g of session.tabGroups.values()) totalTabs += g.size;
666
- if (totalTabs >= MAX_TABS_PER_SESSION) {
667
- throw new Error('Maximum tabs per session reached');
1061
+ let sessionTabs = 0;
1062
+ for (const g of session.tabGroups.values()) sessionTabs += g.size;
1063
+ if (getTotalTabCount() >= MAX_TABS_GLOBAL || sessionTabs >= MAX_TABS_PER_SESSION) {
1064
+ // Reuse oldest tab in session instead of rejecting
1065
+ let oldestTab = null;
1066
+ let oldestGroup = null;
1067
+ let oldestTabId = null;
1068
+ for (const [gKey, group] of session.tabGroups) {
1069
+ for (const [tid, ts] of group) {
1070
+ if (!oldestTab || ts.toolCalls < oldestTab.toolCalls) {
1071
+ oldestTab = ts;
1072
+ oldestGroup = group;
1073
+ oldestTabId = tid;
1074
+ }
1075
+ }
1076
+ }
1077
+ if (oldestTab) {
1078
+ tabState = oldestTab;
1079
+ const group = getTabGroup(session, resolvedSessionKey);
1080
+ if (oldestGroup) oldestGroup.delete(oldestTabId);
1081
+ group.set(tabId, tabState);
1082
+ tabLocks.delete(oldestTabId);
1083
+ log('info', 'tab recycled (limit reached)', { reqId: req.reqId, tabId, recycledFrom: oldestTabId, userId });
1084
+ } else {
1085
+ throw new Error('Maximum tabs per session reached');
1086
+ }
1087
+ } else {
1088
+ const page = await session.context.newPage();
1089
+ tabState = createTabState(page);
1090
+ const group = getTabGroup(session, resolvedSessionKey);
1091
+ group.set(tabId, tabState);
1092
+ log('info', 'tab auto-created on navigate', { reqId: req.reqId, tabId, userId });
668
1093
  }
669
- const page = await session.context.newPage();
670
- const newTabState = createTabState(page);
671
- const group = getTabGroup(session, resolvedSessionKey);
672
- group.set(tabId, newTabState);
673
- found = { tabState: newTabState, listItemId: resolvedSessionKey, group };
674
- log('info', 'tab auto-created on navigate', { reqId: req.reqId, tabId, userId });
1094
+ } else {
1095
+ tabState = found.tabState;
675
1096
  }
676
-
677
- const { tabState } = found;
678
1097
  tabState.toolCalls++;
679
1098
 
680
1099
  let targetUrl = url;
@@ -690,8 +1109,9 @@ app.post('/tabs/:tabId/navigate', async (req, res) => {
690
1109
  return await withTabLock(tabId, async () => {
691
1110
  await tabState.page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
692
1111
  tabState.visitedUrls.add(targetUrl);
1112
+ tabState.lastSnapshot = null;
693
1113
  tabState.refs = await buildRefs(tabState.page);
694
- return { ok: true, tabId, url: tabState.page.url() };
1114
+ return { ok: true, tabId, url: tabState.page.url(), refsAvailable: tabState.refs.size > 0 };
695
1115
  });
696
1116
  })(), HANDLER_TIMEOUT_MS, 'navigate'));
697
1117
 
@@ -699,7 +1119,8 @@ app.post('/tabs/:tabId/navigate', async (req, res) => {
699
1119
  res.json(result);
700
1120
  } catch (err) {
701
1121
  log('error', 'navigate failed', { reqId: req.reqId, tabId, error: err.message });
702
- res.status(500).json({ error: safeError(err) });
1122
+ const status = err.message && err.message.startsWith('Blocked URL scheme') ? 400 : 500;
1123
+ res.status(status).json({ error: safeError(err) });
703
1124
  }
704
1125
  });
705
1126
 
@@ -709,6 +1130,7 @@ app.get('/tabs/:tabId/snapshot', async (req, res) => {
709
1130
  const userId = req.query.userId;
710
1131
  if (!userId) return res.status(400).json({ error: 'userId required' });
711
1132
  const format = req.query.format || 'text';
1133
+ const offset = parseInt(req.query.offset) || 0;
712
1134
  const session = sessions.get(normalizeUserId(userId));
713
1135
  const found = session && findTab(session, req.params.tabId);
714
1136
  if (!found) return res.status(404).json({ error: 'Tab not found' });
@@ -716,6 +1138,18 @@ app.get('/tabs/:tabId/snapshot', async (req, res) => {
716
1138
  const { tabState } = found;
717
1139
  tabState.toolCalls++;
718
1140
 
1141
+ // Cached chunk retrieval for offset>0 requests
1142
+ if (offset > 0 && tabState.lastSnapshot) {
1143
+ const win = windowSnapshot(tabState.lastSnapshot, offset);
1144
+ const response = { url: tabState.page.url(), snapshot: win.text, refsCount: tabState.refs.size, truncated: win.truncated, totalChars: win.totalChars, hasMore: win.hasMore, nextOffset: win.nextOffset };
1145
+ if (req.query.includeScreenshot === 'true') {
1146
+ const pngBuffer = await tabState.page.screenshot({ type: 'png' });
1147
+ response.screenshot = { data: pngBuffer.toString('base64'), mimeType: 'image/png' };
1148
+ }
1149
+ log('info', 'snapshot (cached offset)', { reqId: req.reqId, tabId: req.params.tabId, offset, totalChars: win.totalChars });
1150
+ return res.json(response);
1151
+ }
1152
+
719
1153
  const result = await withUserLimit(userId, () => withTimeout((async () => {
720
1154
  tabState.refs = await buildRefs(tabState.page);
721
1155
  const ariaYaml = await getAriaSnapshot(tabState.page);
@@ -754,14 +1188,28 @@ app.get('/tabs/:tabId/snapshot', async (req, res) => {
754
1188
  }).join('\n');
755
1189
  }
756
1190
 
757
- return {
1191
+ tabState.lastSnapshot = annotatedYaml;
1192
+ const win = windowSnapshot(annotatedYaml, 0);
1193
+
1194
+ const response = {
758
1195
  url: tabState.page.url(),
759
- snapshot: annotatedYaml,
760
- refsCount: tabState.refs.size
1196
+ snapshot: win.text,
1197
+ refsCount: tabState.refs.size,
1198
+ truncated: win.truncated,
1199
+ totalChars: win.totalChars,
1200
+ hasMore: win.hasMore,
1201
+ nextOffset: win.nextOffset,
761
1202
  };
1203
+
1204
+ if (req.query.includeScreenshot === 'true') {
1205
+ const pngBuffer = await tabState.page.screenshot({ type: 'png' });
1206
+ response.screenshot = { data: pngBuffer.toString('base64'), mimeType: 'image/png' };
1207
+ }
1208
+
1209
+ return response;
762
1210
  })(), HANDLER_TIMEOUT_MS, 'snapshot'));
763
1211
 
764
- log('info', 'snapshot', { reqId: req.reqId, tabId: req.params.tabId, url: result.url, snapshotLen: result.snapshot?.length, refsCount: result.refsCount });
1212
+ log('info', 'snapshot', { reqId: req.reqId, tabId: req.params.tabId, url: result.url, snapshotLen: result.snapshot?.length, refsCount: result.refsCount, hasScreenshot: !!result.screenshot, truncated: result.truncated });
765
1213
  res.json(result);
766
1214
  } catch (err) {
767
1215
  log('error', 'snapshot failed', { reqId: req.reqId, tabId: req.params.tabId, error: err.message });
@@ -844,7 +1292,7 @@ app.post('/tabs/:tabId/click', async (req, res) => {
844
1292
  log('warn', 'force click failed, trying mouse sequence');
845
1293
  await dispatchMouseSequence(locator);
846
1294
  }
847
- } else if (err.message.includes('not visible') || err.message.includes('timeout')) {
1295
+ } else if (err.message.includes('not visible') || err.message.toLowerCase().includes('timeout')) {
848
1296
  // Fallback 2: Element not responding to click, try mouse sequence
849
1297
  log('warn', 'click timeout, trying mouse sequence');
850
1298
  await dispatchMouseSequence(locator);
@@ -855,7 +1303,13 @@ app.post('/tabs/:tabId/click', async (req, res) => {
855
1303
  };
856
1304
 
857
1305
  if (ref) {
858
- const locator = refToLocator(tabState.page, ref, tabState.refs);
1306
+ let locator = refToLocator(tabState.page, ref, tabState.refs);
1307
+ if (!locator && tabState.refs.size === 0) {
1308
+ // Auto-refresh refs on stale state before failing
1309
+ log('info', 'auto-refreshing stale refs before click', { ref });
1310
+ tabState.refs = await buildRefs(tabState.page);
1311
+ locator = refToLocator(tabState.page, ref, tabState.refs);
1312
+ }
859
1313
  if (!locator) {
860
1314
  const maxRef = tabState.refs.size > 0 ? `e${tabState.refs.size}` : 'none';
861
1315
  throw new Error(`Unknown ref: ${ref} (valid refs: e1-${maxRef}, ${tabState.refs.size} total). Refs reset after navigation - call snapshot first.`);
@@ -866,11 +1320,12 @@ app.post('/tabs/:tabId/click', async (req, res) => {
866
1320
  }
867
1321
 
868
1322
  await tabState.page.waitForTimeout(500);
1323
+ tabState.lastSnapshot = null;
869
1324
  tabState.refs = await buildRefs(tabState.page);
870
1325
 
871
1326
  const newUrl = tabState.page.url();
872
1327
  tabState.visitedUrls.add(newUrl);
873
- return { ok: true, url: newUrl };
1328
+ return { ok: true, url: newUrl, refsAvailable: tabState.refs.size > 0 };
874
1329
  }), HANDLER_TIMEOUT_MS, 'click'));
875
1330
 
876
1331
  log('info', 'clicked', { reqId: req.reqId, tabId, url: result.url });
@@ -1215,7 +1670,6 @@ app.get('/', (req, res) => {
1215
1670
  engine: 'camoufox',
1216
1671
  browserConnected: running,
1217
1672
  browserRunning: running,
1218
- sessions: sessions.size,
1219
1673
  });
1220
1674
  });
1221
1675
 
@@ -1364,6 +1818,7 @@ app.post('/navigate', async (req, res) => {
1364
1818
  app.get('/snapshot', async (req, res) => {
1365
1819
  try {
1366
1820
  const { targetId, userId, format = 'text' } = req.query;
1821
+ const offset = parseInt(req.query.offset) || 0;
1367
1822
  if (!userId) {
1368
1823
  return res.status(400).json({ error: 'userId is required' });
1369
1824
  }
@@ -1376,6 +1831,18 @@ app.get('/snapshot', async (req, res) => {
1376
1831
 
1377
1832
  const { tabState } = found;
1378
1833
  tabState.toolCalls++;
1834
+
1835
+ // Cached chunk retrieval
1836
+ if (offset > 0 && tabState.lastSnapshot) {
1837
+ const win = windowSnapshot(tabState.lastSnapshot, offset);
1838
+ const response = { ok: true, format: 'aria', targetId, url: tabState.page.url(), snapshot: win.text, refsCount: tabState.refs.size, truncated: win.truncated, totalChars: win.totalChars, hasMore: win.hasMore, nextOffset: win.nextOffset };
1839
+ if (req.query.includeScreenshot === 'true') {
1840
+ const pngBuffer = await tabState.page.screenshot({ type: 'png' });
1841
+ response.screenshot = { data: pngBuffer.toString('base64'), mimeType: 'image/png' };
1842
+ }
1843
+ return res.json(response);
1844
+ }
1845
+
1379
1846
  tabState.refs = await buildRefs(tabState.page);
1380
1847
 
1381
1848
  const ariaYaml = await getAriaSnapshot(tabState.page);
@@ -1404,14 +1871,28 @@ app.get('/snapshot', async (req, res) => {
1404
1871
  }).join('\n');
1405
1872
  }
1406
1873
 
1407
- res.json({
1874
+ tabState.lastSnapshot = annotatedYaml;
1875
+ const win = windowSnapshot(annotatedYaml, 0);
1876
+
1877
+ const response = {
1408
1878
  ok: true,
1409
1879
  format: 'aria',
1410
1880
  targetId,
1411
1881
  url: tabState.page.url(),
1412
- snapshot: annotatedYaml,
1413
- refsCount: tabState.refs.size
1414
- });
1882
+ snapshot: win.text,
1883
+ refsCount: tabState.refs.size,
1884
+ truncated: win.truncated,
1885
+ totalChars: win.totalChars,
1886
+ hasMore: win.hasMore,
1887
+ nextOffset: win.nextOffset,
1888
+ };
1889
+
1890
+ if (req.query.includeScreenshot === 'true') {
1891
+ const pngBuffer = await tabState.page.screenshot({ type: 'png' });
1892
+ response.screenshot = { data: pngBuffer.toString('base64'), mimeType: 'image/png' };
1893
+ }
1894
+
1895
+ res.json(response);
1415
1896
  } catch (err) {
1416
1897
  log('error', 'openclaw snapshot failed', { reqId: req.reqId, error: err.message });
1417
1898
  res.status(500).json({ error: safeError(err) });
@@ -1584,6 +2065,32 @@ setInterval(() => {
1584
2065
  });
1585
2066
  }, 5 * 60_000);
1586
2067
 
2068
+ // Active health probe — detect hung browser even when isConnected() lies
2069
+ setInterval(async () => {
2070
+ if (!browser || healthState.isRecovering) return;
2071
+ // Skip probe if operations are in flight
2072
+ if (healthState.activeOps > 0) {
2073
+ log('info', 'health probe skipped, operations active', { activeOps: healthState.activeOps });
2074
+ return;
2075
+ }
2076
+ const timeSinceSuccess = Date.now() - healthState.lastSuccessfulNav;
2077
+ if (timeSinceSuccess < 120000) return;
2078
+
2079
+ let testContext;
2080
+ try {
2081
+ testContext = await browser.newContext();
2082
+ const page = await testContext.newPage();
2083
+ await page.goto('about:blank', { timeout: 5000 });
2084
+ await page.close();
2085
+ await testContext.close();
2086
+ healthState.lastSuccessfulNav = Date.now();
2087
+ } catch (err) {
2088
+ log('warn', 'health probe failed', { error: err.message, timeSinceSuccessMs: timeSinceSuccess });
2089
+ if (testContext) await testContext.close().catch(() => {});
2090
+ restartBrowser('health probe failed').catch(() => {});
2091
+ }
2092
+ }, 60_000);
2093
+
1587
2094
  // Crash logging
1588
2095
  process.on('uncaughtException', (err) => {
1589
2096
  log('error', 'uncaughtException', { error: err.message, stack: err.stack });