@phi-code-admin/camofox-browser 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +571 -571
- package/Dockerfile +86 -86
- package/LICENSE +21 -21
- package/README.md +691 -691
- package/camofox.config.json +10 -10
- package/lib/auth.js +134 -134
- package/lib/camoufox-executable.js +189 -189
- package/lib/config.js +153 -153
- package/lib/cookies.js +119 -119
- package/lib/downloads.js +168 -168
- package/lib/extract.js +74 -74
- package/lib/fly.js +54 -54
- package/lib/images.js +88 -88
- package/lib/inflight.js +16 -16
- package/lib/launcher.js +47 -47
- package/lib/macros.js +31 -31
- package/lib/metrics.js +184 -184
- package/lib/openapi.js +105 -105
- package/lib/persistence.js +89 -89
- package/lib/plugins.js +178 -175
- package/lib/proxy.js +277 -277
- package/lib/reporter.js +1102 -1102
- package/lib/request-utils.js +59 -59
- package/lib/resources.js +76 -76
- package/lib/snapshot.js +41 -41
- package/lib/tmp-cleanup.js +108 -108
- package/lib/tracing.js +137 -137
- package/openclaw.plugin.json +268 -268
- package/package.json +148 -148
- package/plugin.ts +758 -758
- package/plugins/persistence/AGENTS.md +37 -37
- package/plugins/persistence/README.md +48 -48
- package/plugins/persistence/index.js +124 -124
- package/plugins/vnc/AGENTS.md +42 -42
- package/plugins/vnc/README.md +165 -165
- package/plugins/vnc/apt.txt +7 -7
- package/plugins/vnc/index.js +142 -142
- package/plugins/vnc/spawn.js +8 -8
- package/plugins/vnc/vnc-launcher.js +64 -64
- package/plugins/vnc/vnc-watcher.sh +82 -82
- package/plugins/youtube/AGENTS.md +25 -25
- package/plugins/youtube/apt.txt +1 -1
- package/plugins/youtube/index.js +206 -206
- package/plugins/youtube/post-install.sh +5 -5
- package/plugins/youtube/youtube.js +301 -301
- package/run.sh +37 -37
- package/scripts/exec.js +8 -8
- package/scripts/generate-openapi.js +24 -24
- package/scripts/install-plugin-deps.sh +63 -63
- package/scripts/plugin.js +342 -342
- package/scripts/sync-version.js +25 -25
- package/server.js +6062 -6059
- package/tsconfig.json +12 -12
package/plugins/youtube/index.js
CHANGED
|
@@ -1,206 +1,206 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* YouTube transcript plugin.
|
|
3
|
-
*
|
|
4
|
-
* Extracts video transcripts via yt-dlp (preferred) with browser fallback.
|
|
5
|
-
* Registers POST /youtube/transcript.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
import { detectYtDlp, hasYtDlp, ensureYtDlp, ytDlpTranscript, parseJson3, parseVtt, parseXml } from './youtube.js';
|
|
9
|
-
import { classifyError } from '../../lib/request-utils.js';
|
|
10
|
-
|
|
11
|
-
/**
 * Choose the caption language for a request.
 *
 * Accepts the `languages` value from the request body, which callers send as
 * an array of language codes. A bare string is tolerated as a single code.
 * Anything else (missing, null, empty array, non-string entry) falls back to
 * 'en' instead of throwing.
 *
 * @param {unknown} languages - Raw `languages` value from the request body.
 * @returns {string} The first usable language code, or 'en'.
 */
function pickLanguage(languages) {
  const first = Array.isArray(languages) ? languages[0] : languages;
  return typeof first === 'string' && first !== '' ? first : 'en';
}

/**
 * Decode a raw caption payload by sniffing its format and delegating to the
 * matching parser imported from ./youtube.js.
 *
 * @param {string} raw - Caption response body.
 * @returns {*} Whatever the selected parser returns (callers treat it as a
 *   string or a falsy value), or null when no known format marker is found.
 */
function parseCaptionPayload(raw) {
  if (raw.trimStart().startsWith('{')) return parseJson3(raw);
  if (raw.includes('WEBVTT')) return parseVtt(raw);
  if (raw.includes('<text')) return parseXml(raw);
  return null;
}

/**
 * Count whitespace-separated words, ignoring leading/trailing whitespace so
 * that padding around the transcript does not inflate the total.
 *
 * @param {string} text
 * @returns {number}
 */
function countWords(text) {
  const trimmed = text.trim();
  return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
}

/**
 * Project caption tracks to the public `available_languages` shape, dropping
 * the internal track URL.
 *
 * @param {Array<{code: string, name: string, kind: string}>} tracks
 * @returns {Array<{code: string, name: string, kind: string}>}
 */
function describeTracks(tracks) {
  return tracks.map(({ code, name, kind }) => ({ code, name, kind }));
}

/**
 * Register the YouTube transcript route on the host application.
 *
 * Adds `POST /youtube/transcript`. The handler validates the URL, extracts
 * the 11-character video id, tries yt-dlp when it is available, and falls
 * back to a browser session when yt-dlp is missing, throws, or returns a
 * non-'ok' result. The transcript result object is sent with `res.json`;
 * validation failures answer 400 and unexpected exceptions answer 500.
 *
 * @param {object} app - Host app exposing `post(path, ...handlers)`.
 * @param {object} ctx - Host context. Reads `log`, `config`, `sessions`,
 *   `ensureBrowser`, `getSession`, `withUserLimit`, `safePageClose`,
 *   `normalizeUserId`, `validateUrl`, `safeError`, `buildProxyUrl`,
 *   `proxyPool`, `failuresTotal` and, when auth is enabled, `auth()`.
 * @param {object} [pluginConfig] - Plugin settings; only `auth` is read.
 * @returns {Promise<void>} Resolves once yt-dlp detection ran and the route
 *   is registered.
 */
export async function register(app, ctx, pluginConfig = {}) {
  const {
    log, config, sessions, ensureBrowser, getSession,
    withUserLimit, safePageClose, normalizeUserId,
    validateUrl, safeError, buildProxyUrl, proxyPool,
    failuresTotal,
  } = ctx;

  const NAVIGATE_TIMEOUT_MS = config.navigateTimeoutMs;

  // Detect yt-dlp binary at load time
  await detectYtDlp(log);

  // Auth is OFF by default: only an explicit { "auth": true } in the plugin
  // config installs ctx.auth(); any other value uses a pass-through middleware.
  const middleware = pluginConfig.auth === true ? ctx.auth() : (_req, _res, next) => next();

  app.post('/youtube/transcript', middleware, async (req, res) => {
    const reqId = req.reqId;
    try {
      // A missing body must produce a 400, not a destructuring TypeError.
      const { url, languages } = req.body ?? {};
      if (!url) return res.status(400).json({ error: 'url is required' });
      if (typeof url !== 'string') return res.status(400).json({ error: 'url must be a string' });

      const urlErr = validateUrl(url);
      if (urlErr) return res.status(400).json({ error: urlErr });

      const videoIdMatch = url.match(
        /(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})/
      );
      if (!videoIdMatch) {
        return res.status(400).json({ error: 'Could not extract YouTube video ID from URL' });
      }
      const videoId = videoIdMatch[1];
      const lang = pickLanguage(languages);

      // Re-detect yt-dlp if startup detection failed (transient issue)
      await ensureYtDlp(log);

      const ytDlpProxyUrl = buildProxyUrl(proxyPool, config.proxy);
      log('info', 'youtube transcript: starting', { reqId, videoId, lang, method: hasYtDlp() ? 'yt-dlp' : 'browser', hasProxy: !!ytDlpProxyUrl });

      let result;
      if (hasYtDlp()) {
        try {
          result = await ytDlpTranscript(reqId, url, videoId, lang, ytDlpProxyUrl);
        } catch (ytErr) {
          log('warn', 'yt-dlp threw, falling back to browser', { reqId, error: ytErr.message });
          result = null;
        }
        // If yt-dlp returned an error result (e.g. no captions) or threw, try browser
        if (!result || result.status !== 'ok') {
          if (result) log('warn', 'yt-dlp returned error, falling back to browser', { reqId, status: result.status, code: result.code });
          result = await browserTranscript(reqId, url, videoId, lang);
        }
      } else {
        result = await browserTranscript(reqId, url, videoId, lang);
      }

      log('info', 'youtube transcript: done', { reqId, videoId, status: result.status, words: result.total_words });
      res.json(result);
    } catch (err) {
      failuresTotal.labels(classifyError(err), 'youtube_transcript').inc();
      log('error', 'youtube transcript failed', { reqId, error: err.message, stack: err.stack });
      res.status(500).json({ error: safeError(err) });
    }
  });

  /**
   * Browser fallback. Opens the video page in the shared '__yt_transcript__'
   * session and tries two strategies in order:
   *   A. fetch a caption track URL listed in ytInitialPlayerResponse;
   *   B. start playback and capture the first /api/timedtext response.
   *
   * @param {string} reqId - Request id, used for log correlation only.
   * @param {string} url - Video page URL to open.
   * @param {string} videoId - Video id used to match timedtext responses.
   * @param {string} lang - Preferred caption language code.
   * @returns {Promise<object>} A `{ status: 'ok', ... }` transcript result or
   *   a `{ status: 'error', code: 404, ... }` result.
   */
  async function browserTranscript(reqId, url, videoId, lang) {
    return await withUserLimit('__yt_transcript__', async () => {
      await ensureBrowser();
      const session = await getSession('__yt_transcript__');
      const page = await session.context.newPage();

      try {
        // Force every media element to start muted before any page script runs.
        await page.addInitScript(() => {
          const origPlay = HTMLMediaElement.prototype.play;
          HTMLMediaElement.prototype.play = function() { this.volume = 0; this.muted = true; return origPlay.call(this); };
        });

        // Listener must be attached before navigation so no caption response is missed.
        let interceptedCaptions = null;
        page.on('response', async (response) => {
          const respUrl = response.url();
          if (respUrl.includes('/api/timedtext') && respUrl.includes(`v=${videoId}`) && !interceptedCaptions) {
            try {
              const body = await response.text();
              if (body && body.length > 0) interceptedCaptions = body;
            } catch {
              // Best-effort capture: an unreadable body is ignored and a
              // later timedtext response may still be captured.
            }
          }
        });

        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: NAVIGATE_TIMEOUT_MS });
        await page.waitForTimeout(2000);

        // Extract caption track URLs and metadata from ytInitialPlayerResponse
        const meta = await page.evaluate(() => {
          const r = window.ytInitialPlayerResponse || (typeof ytInitialPlayerResponse !== 'undefined' ? ytInitialPlayerResponse : null);
          if (!r) return { title: '', tracks: [] };
          const tracks = r?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
          return {
            title: r?.videoDetails?.title || '',
            tracks: tracks.map(t => ({ code: t.languageCode, name: t.name?.simpleText || t.languageCode, kind: t.kind || 'manual', url: t.baseUrl })),
          };
        });

        log('info', 'youtube transcript: extracted caption tracks', { reqId, title: meta.title, trackCount: meta.tracks.length, tracks: meta.tracks.map(t => t.code) });

        // Strategy A: Fetch caption track URL directly from ytInitialPlayerResponse
        if (meta.tracks && meta.tracks.length > 0) {
          // Prefer the requested language; otherwise take the first listed track.
          const track = meta.tracks.find(t => t.code === lang) || meta.tracks[0];
          if (track && track.url) {
            const captionUrl = track.url + (track.url.includes('?') ? '&' : '?') + 'fmt=json3';
            log('info', 'youtube transcript: fetching caption track', { reqId, lang: track.code, url: captionUrl.substring(0, 100) });
            try {
              // Fetch from inside the page so the request carries the page's own context.
              const captionResp = await page.evaluate(async (fetchUrl) => {
                const resp = await fetch(fetchUrl);
                return resp.ok ? await resp.text() : null;
              }, captionUrl);
              if (captionResp && captionResp.length > 0) {
                const transcriptText = parseCaptionPayload(captionResp);
                if (transcriptText && transcriptText.trim()) {
                  return {
                    status: 'ok', transcript: transcriptText,
                    video_url: url, video_id: videoId, video_title: meta.title,
                    language: track.code, total_words: countWords(transcriptText),
                    available_languages: describeTracks(meta.tracks),
                  };
                }
              }
            } catch (fetchErr) {
              // Not fatal: Strategy B below is still attempted.
              log('warn', 'youtube transcript: caption track fetch failed', { reqId, error: fetchErr.message });
            }
          }
        }

        // Strategy B: Play video and intercept timedtext network response
        await page.evaluate(() => {
          const v = document.querySelector('video');
          if (v) { v.muted = true; v.play().catch(() => {}); }
        }).catch(() => {});

        // Poll up to 40 x 500 ms for the response listener to capture captions.
        for (let i = 0; i < 40 && !interceptedCaptions; i++) {
          await page.waitForTimeout(500);
        }

        if (!interceptedCaptions) {
          return {
            status: 'error', code: 404,
            message: 'No captions available for this video',
            video_url: url, video_id: videoId, title: meta.title,
          };
        }

        log('info', 'youtube transcript: intercepted captions', { reqId, len: interceptedCaptions.length });

        const transcriptText = parseCaptionPayload(interceptedCaptions);

        if (!transcriptText || !transcriptText.trim()) {
          return {
            status: 'error', code: 404,
            message: 'Caption data intercepted but could not be parsed',
            video_url: url, video_id: videoId, title: meta.title,
          };
        }

        return {
          status: 'ok', transcript: transcriptText,
          video_url: url, video_id: videoId, video_title: meta.title,
          language: lang, total_words: countWords(transcriptText),
          // `meta` has no `languages` field; derive the list from the tracks,
          // in the same shape Strategy A reports.
          available_languages: describeTracks(meta.tracks),
        };
      } finally {
        await safePageClose(page);
        // Clean up transcript session if no live pages remain
        const ytKey = normalizeUserId('__yt_transcript__');
        const ytSession = sessions.get(ytKey);
        if (ytSession && !ytSession._closing) {
          try {
            const remainingPages = ytSession.context.pages();
            if (remainingPages.length === 0) {
              ytSession._closing = true;
              // Fire-and-forget: close failures are deliberately ignored.
              ytSession.context.close().catch(() => {});
              sessions.delete(ytKey);
            }
          } catch {
            // The context could not be inspected; drop the session entry.
            sessions.delete(ytKey);
          }
        }
      }
    });
  }
}
|
|
1
|
+
/**
|
|
2
|
+
* YouTube transcript plugin.
|
|
3
|
+
*
|
|
4
|
+
* Extracts video transcripts via yt-dlp (preferred) with browser fallback.
|
|
5
|
+
* Registers POST /youtube/transcript.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { detectYtDlp, hasYtDlp, ensureYtDlp, ytDlpTranscript, parseJson3, parseVtt, parseXml } from './youtube.js';
|
|
9
|
+
import { classifyError } from '../../lib/request-utils.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Choose the caption language for a request.
 *
 * Accepts the `languages` value from the request body, which callers send as
 * an array of language codes. A bare string is tolerated as a single code.
 * Anything else (missing, null, empty array, non-string entry) falls back to
 * 'en' instead of throwing.
 *
 * @param {unknown} languages - Raw `languages` value from the request body.
 * @returns {string} The first usable language code, or 'en'.
 */
function pickLanguage(languages) {
  const first = Array.isArray(languages) ? languages[0] : languages;
  return typeof first === 'string' && first !== '' ? first : 'en';
}

/**
 * Decode a raw caption payload by sniffing its format and delegating to the
 * matching parser imported from ./youtube.js.
 *
 * @param {string} raw - Caption response body.
 * @returns {*} Whatever the selected parser returns (callers treat it as a
 *   string or a falsy value), or null when no known format marker is found.
 */
function parseCaptionPayload(raw) {
  if (raw.trimStart().startsWith('{')) return parseJson3(raw);
  if (raw.includes('WEBVTT')) return parseVtt(raw);
  if (raw.includes('<text')) return parseXml(raw);
  return null;
}

/**
 * Count whitespace-separated words, ignoring leading/trailing whitespace so
 * that padding around the transcript does not inflate the total.
 *
 * @param {string} text
 * @returns {number}
 */
function countWords(text) {
  const trimmed = text.trim();
  return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
}

/**
 * Project caption tracks to the public `available_languages` shape, dropping
 * the internal track URL.
 *
 * @param {Array<{code: string, name: string, kind: string}>} tracks
 * @returns {Array<{code: string, name: string, kind: string}>}
 */
function describeTracks(tracks) {
  return tracks.map(({ code, name, kind }) => ({ code, name, kind }));
}

/**
 * Register the YouTube transcript route on the host application.
 *
 * Adds `POST /youtube/transcript`. The handler validates the URL, extracts
 * the 11-character video id, tries yt-dlp when it is available, and falls
 * back to a browser session when yt-dlp is missing, throws, or returns a
 * non-'ok' result. The transcript result object is sent with `res.json`;
 * validation failures answer 400 and unexpected exceptions answer 500.
 *
 * @param {object} app - Host app exposing `post(path, ...handlers)`.
 * @param {object} ctx - Host context. Reads `log`, `config`, `sessions`,
 *   `ensureBrowser`, `getSession`, `withUserLimit`, `safePageClose`,
 *   `normalizeUserId`, `validateUrl`, `safeError`, `buildProxyUrl`,
 *   `proxyPool`, `failuresTotal` and, when auth is enabled, `auth()`.
 * @param {object} [pluginConfig] - Plugin settings; only `auth` is read.
 * @returns {Promise<void>} Resolves once yt-dlp detection ran and the route
 *   is registered.
 */
export async function register(app, ctx, pluginConfig = {}) {
  const {
    log, config, sessions, ensureBrowser, getSession,
    withUserLimit, safePageClose, normalizeUserId,
    validateUrl, safeError, buildProxyUrl, proxyPool,
    failuresTotal,
  } = ctx;

  const NAVIGATE_TIMEOUT_MS = config.navigateTimeoutMs;

  // Detect yt-dlp binary at load time
  await detectYtDlp(log);

  // Auth is OFF by default: only an explicit { "auth": true } in the plugin
  // config installs ctx.auth(); any other value uses a pass-through middleware.
  const middleware = pluginConfig.auth === true ? ctx.auth() : (_req, _res, next) => next();

  app.post('/youtube/transcript', middleware, async (req, res) => {
    const reqId = req.reqId;
    try {
      // A missing body must produce a 400, not a destructuring TypeError.
      const { url, languages } = req.body ?? {};
      if (!url) return res.status(400).json({ error: 'url is required' });
      if (typeof url !== 'string') return res.status(400).json({ error: 'url must be a string' });

      const urlErr = validateUrl(url);
      if (urlErr) return res.status(400).json({ error: urlErr });

      const videoIdMatch = url.match(
        /(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})/
      );
      if (!videoIdMatch) {
        return res.status(400).json({ error: 'Could not extract YouTube video ID from URL' });
      }
      const videoId = videoIdMatch[1];
      const lang = pickLanguage(languages);

      // Re-detect yt-dlp if startup detection failed (transient issue)
      await ensureYtDlp(log);

      const ytDlpProxyUrl = buildProxyUrl(proxyPool, config.proxy);
      log('info', 'youtube transcript: starting', { reqId, videoId, lang, method: hasYtDlp() ? 'yt-dlp' : 'browser', hasProxy: !!ytDlpProxyUrl });

      let result;
      if (hasYtDlp()) {
        try {
          result = await ytDlpTranscript(reqId, url, videoId, lang, ytDlpProxyUrl);
        } catch (ytErr) {
          log('warn', 'yt-dlp threw, falling back to browser', { reqId, error: ytErr.message });
          result = null;
        }
        // If yt-dlp returned an error result (e.g. no captions) or threw, try browser
        if (!result || result.status !== 'ok') {
          if (result) log('warn', 'yt-dlp returned error, falling back to browser', { reqId, status: result.status, code: result.code });
          result = await browserTranscript(reqId, url, videoId, lang);
        }
      } else {
        result = await browserTranscript(reqId, url, videoId, lang);
      }

      log('info', 'youtube transcript: done', { reqId, videoId, status: result.status, words: result.total_words });
      res.json(result);
    } catch (err) {
      failuresTotal.labels(classifyError(err), 'youtube_transcript').inc();
      log('error', 'youtube transcript failed', { reqId, error: err.message, stack: err.stack });
      res.status(500).json({ error: safeError(err) });
    }
  });

  /**
   * Browser fallback. Opens the video page in the shared '__yt_transcript__'
   * session and tries two strategies in order:
   *   A. fetch a caption track URL listed in ytInitialPlayerResponse;
   *   B. start playback and capture the first /api/timedtext response.
   *
   * @param {string} reqId - Request id, used for log correlation only.
   * @param {string} url - Video page URL to open.
   * @param {string} videoId - Video id used to match timedtext responses.
   * @param {string} lang - Preferred caption language code.
   * @returns {Promise<object>} A `{ status: 'ok', ... }` transcript result or
   *   a `{ status: 'error', code: 404, ... }` result.
   */
  async function browserTranscript(reqId, url, videoId, lang) {
    return await withUserLimit('__yt_transcript__', async () => {
      await ensureBrowser();
      const session = await getSession('__yt_transcript__');
      const page = await session.context.newPage();

      try {
        // Force every media element to start muted before any page script runs.
        await page.addInitScript(() => {
          const origPlay = HTMLMediaElement.prototype.play;
          HTMLMediaElement.prototype.play = function() { this.volume = 0; this.muted = true; return origPlay.call(this); };
        });

        // Listener must be attached before navigation so no caption response is missed.
        let interceptedCaptions = null;
        page.on('response', async (response) => {
          const respUrl = response.url();
          if (respUrl.includes('/api/timedtext') && respUrl.includes(`v=${videoId}`) && !interceptedCaptions) {
            try {
              const body = await response.text();
              if (body && body.length > 0) interceptedCaptions = body;
            } catch {
              // Best-effort capture: an unreadable body is ignored and a
              // later timedtext response may still be captured.
            }
          }
        });

        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: NAVIGATE_TIMEOUT_MS });
        await page.waitForTimeout(2000);

        // Extract caption track URLs and metadata from ytInitialPlayerResponse
        const meta = await page.evaluate(() => {
          const r = window.ytInitialPlayerResponse || (typeof ytInitialPlayerResponse !== 'undefined' ? ytInitialPlayerResponse : null);
          if (!r) return { title: '', tracks: [] };
          const tracks = r?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
          return {
            title: r?.videoDetails?.title || '',
            tracks: tracks.map(t => ({ code: t.languageCode, name: t.name?.simpleText || t.languageCode, kind: t.kind || 'manual', url: t.baseUrl })),
          };
        });

        log('info', 'youtube transcript: extracted caption tracks', { reqId, title: meta.title, trackCount: meta.tracks.length, tracks: meta.tracks.map(t => t.code) });

        // Strategy A: Fetch caption track URL directly from ytInitialPlayerResponse
        if (meta.tracks && meta.tracks.length > 0) {
          // Prefer the requested language; otherwise take the first listed track.
          const track = meta.tracks.find(t => t.code === lang) || meta.tracks[0];
          if (track && track.url) {
            const captionUrl = track.url + (track.url.includes('?') ? '&' : '?') + 'fmt=json3';
            log('info', 'youtube transcript: fetching caption track', { reqId, lang: track.code, url: captionUrl.substring(0, 100) });
            try {
              // Fetch from inside the page so the request carries the page's own context.
              const captionResp = await page.evaluate(async (fetchUrl) => {
                const resp = await fetch(fetchUrl);
                return resp.ok ? await resp.text() : null;
              }, captionUrl);
              if (captionResp && captionResp.length > 0) {
                const transcriptText = parseCaptionPayload(captionResp);
                if (transcriptText && transcriptText.trim()) {
                  return {
                    status: 'ok', transcript: transcriptText,
                    video_url: url, video_id: videoId, video_title: meta.title,
                    language: track.code, total_words: countWords(transcriptText),
                    available_languages: describeTracks(meta.tracks),
                  };
                }
              }
            } catch (fetchErr) {
              // Not fatal: Strategy B below is still attempted.
              log('warn', 'youtube transcript: caption track fetch failed', { reqId, error: fetchErr.message });
            }
          }
        }

        // Strategy B: Play video and intercept timedtext network response
        await page.evaluate(() => {
          const v = document.querySelector('video');
          if (v) { v.muted = true; v.play().catch(() => {}); }
        }).catch(() => {});

        // Poll up to 40 x 500 ms for the response listener to capture captions.
        for (let i = 0; i < 40 && !interceptedCaptions; i++) {
          await page.waitForTimeout(500);
        }

        if (!interceptedCaptions) {
          return {
            status: 'error', code: 404,
            message: 'No captions available for this video',
            video_url: url, video_id: videoId, title: meta.title,
          };
        }

        log('info', 'youtube transcript: intercepted captions', { reqId, len: interceptedCaptions.length });

        const transcriptText = parseCaptionPayload(interceptedCaptions);

        if (!transcriptText || !transcriptText.trim()) {
          return {
            status: 'error', code: 404,
            message: 'Caption data intercepted but could not be parsed',
            video_url: url, video_id: videoId, title: meta.title,
          };
        }

        return {
          status: 'ok', transcript: transcriptText,
          video_url: url, video_id: videoId, video_title: meta.title,
          language: lang, total_words: countWords(transcriptText),
          // `meta` has no `languages` field; derive the list from the tracks,
          // in the same shape Strategy A reports.
          available_languages: describeTracks(meta.tracks),
        };
      } finally {
        await safePageClose(page);
        // Clean up transcript session if no live pages remain
        const ytKey = normalizeUserId('__yt_transcript__');
        const ytSession = sessions.get(ytKey);
        if (ytSession && !ytSession._closing) {
          try {
            const remainingPages = ytSession.context.pages();
            if (remainingPages.length === 0) {
              ytSession._closing = true;
              // Fire-and-forget: close failures are deliberately ignored.
              ytSession.context.close().catch(() => {});
              sessions.delete(ytKey);
            }
          } catch {
            // The context could not be inspected; drop the session entry.
            sessions.delete(ytKey);
          }
        }
      }
    });
  }
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
#!/bin/sh
# Post-install step: download the yt-dlp release binary, which is not
# available via apt, and make it executable.
set -e

yt_dlp_url="https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp"
yt_dlp_bin="/usr/local/bin/yt-dlp"

# -f: exit non-zero on HTTP errors; -L: follow redirects.
curl -fL "$yt_dlp_url" -o "$yt_dlp_bin"
chmod +x "$yt_dlp_bin"
|
|
1
|
+
#!/bin/sh
# Post-install step: download the yt-dlp release binary, which is not
# available via apt, and make it executable.
set -e

yt_dlp_url="https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp"
yt_dlp_bin="/usr/local/bin/yt-dlp"

# -f: exit non-zero on HTTP errors; -L: follow redirects.
curl -fL "$yt_dlp_url" -o "$yt_dlp_bin"
chmod +x "$yt_dlp_bin"
|