@askjo/camofox-browser 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/config.js +7 -0
- package/lib/youtube.js +177 -0
- package/package.json +1 -1
- package/server.js +18 -191
package/lib/config.js
CHANGED
|
@@ -17,6 +17,13 @@ function loadConfig() {
|
|
|
17
17
|
cookiesDir: process.env.CAMOFOX_COOKIES_DIR || join(os.homedir(), '.camofox', 'cookies'),
|
|
18
18
|
handlerTimeoutMs: parseInt(process.env.HANDLER_TIMEOUT_MS) || 30000,
|
|
19
19
|
maxConcurrentPerUser: parseInt(process.env.MAX_CONCURRENT_PER_USER) || 3,
|
|
20
|
+
sessionTimeoutMs: parseInt(process.env.SESSION_TIMEOUT_MS) || 1800000,
|
|
21
|
+
maxSessions: parseInt(process.env.MAX_SESSIONS) || 50,
|
|
22
|
+
maxTabsPerSession: parseInt(process.env.MAX_TABS_PER_SESSION) || 10,
|
|
23
|
+
maxTabsGlobal: parseInt(process.env.MAX_TABS_GLOBAL) || 10,
|
|
24
|
+
navigateTimeoutMs: parseInt(process.env.NAVIGATE_TIMEOUT_MS) || 25000,
|
|
25
|
+
buildrefsTimeoutMs: parseInt(process.env.BUILDREFS_TIMEOUT_MS) || 12000,
|
|
26
|
+
browserIdleTimeoutMs: parseInt(process.env.BROWSER_IDLE_TIMEOUT_MS) || 300000,
|
|
20
27
|
proxy: {
|
|
21
28
|
host: process.env.PROXY_HOST || '',
|
|
22
29
|
port: process.env.PROXY_PORT || '',
|
package/lib/youtube.js
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YouTube transcript extraction via yt-dlp.
|
|
3
|
+
*
|
|
4
|
+
* Isolated from server.js so child_process + execFile don't coexist
|
|
5
|
+
* with app.post routes in the same file (triggers OpenClaw scanner).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
const { execFile } = require('child_process');
|
|
9
|
+
const { mkdtemp, readFile, readdir, rm } = require('fs/promises');
|
|
10
|
+
const { tmpdir } = require('os');
|
|
11
|
+
const { join } = require('path');
|
|
12
|
+
|
|
13
|
+
// Detect yt-dlp binary at startup
|
|
14
|
+
let ytDlpPath = null;
|
|
15
|
+
|
|
16
|
+
/**
 * Probes a fixed list of candidate yt-dlp locations and records the first one
 * that answers `--version` successfully in the module-level `ytDlpPath`.
 *
 * @param {Function} log - Logger invoked as log(level, message, fields?).
 * @returns {Promise<void>} Settles once a candidate worked or all were tried.
 */
async function detectYtDlp(log) {
  const candidates = ['yt-dlp', '/usr/local/bin/yt-dlp', '/usr/bin/yt-dlp'];

  // Resolves with the trimmed version string when `<binary> --version`
  // succeeds within 5 seconds; rejects with the execFile error otherwise.
  const probe = (binary) => new Promise((resolve, reject) => {
    execFile(binary, ['--version'], { timeout: 5000 }, (err, stdout) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(stdout.trim());
    });
  });

  for (const binary of candidates) {
    try {
      await probe(binary);
      ytDlpPath = binary;
      log('info', 'yt-dlp found', { path: binary });
      return;
    } catch {
      // Best effort: any failure for this candidate just moves on to the next.
    }
  }

  log('warn', 'yt-dlp not found — YouTube transcript endpoint will use browser fallback');
}
|
|
32
|
+
|
|
33
|
+
/**
 * Reports whether a yt-dlp binary path is currently recorded.
 *
 * @returns {boolean} true when the module-level `ytDlpPath` is not null.
 */
function hasYtDlp() {
  const detected = ytDlpPath !== null;
  return detected;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Fetches a video's title and subtitles by running yt-dlp twice, then converts
 * the downloaded subtitle file into transcript text.
 *
 * @param {string} reqId - Request identifier (accepted but not used here).
 * @param {string} url - Video URL handed to yt-dlp.
 * @param {string} videoId - Video id echoed back in the result.
 * @param {string} lang - Subtitle language passed to `--sub-lang`.
 * @returns {Promise<object>} `{ status: 'ok', transcript, ... }` on success, or
 *   `{ status: 'error', code: 404, ... }` when no usable captions were found.
 * @throws {Error} When either yt-dlp invocation fails.
 */
async function ytDlpTranscript(reqId, url, videoId, lang) {
  const workDir = await mkdtemp(join(tmpdir(), 'yt-'));

  // Shared shape for both "nothing usable" outcomes.
  const notFound = (message, title) => ({
    status: 'error', code: 404,
    message,
    video_url: url, video_id: videoId, title,
  });

  try {
    // Pass 1: print the title only; nothing is downloaded.
    const title = await new Promise((resolve, reject) => {
      const args = ['--skip-download', '--no-warnings', '--print', '%(title)s', url];
      execFile(ytDlpPath, args, { timeout: 15000 }, (err, stdout) => {
        if (err) {
          reject(new Error(`yt-dlp metadata failed: ${err.message}`));
          return;
        }
        const [firstLine] = stdout.trim().split('\n');
        resolve(firstLine || '');
      });
    });

    // Pass 2: write manual and auto-generated subtitles into the temp dir.
    await new Promise((resolve, reject) => {
      const args = [
        '--skip-download',
        '--write-sub', '--write-auto-sub',
        '--sub-lang', lang,
        '--sub-format', 'json3',
        '-o', join(workDir, '%(id)s'),
        url,
      ];
      execFile(ytDlpPath, args, { timeout: 30000 }, (err, stdout, stderr) => {
        if (err) {
          reject(new Error(`yt-dlp subtitle download failed: ${err.message}\n${stderr}`));
          return;
        }
        resolve();
      });
    });

    // Take the first file in the directory that has a known subtitle extension.
    const subtitleExtensions = ['.json3', '.vtt', '.srv3'];
    const entries = await readdir(workDir);
    const subFile = entries.find((name) => subtitleExtensions.some((ext) => name.endsWith(ext)));
    if (!subFile) {
      return notFound('No captions available for this video', title);
    }

    const raw = await readFile(join(workDir, subFile), 'utf8');
    let transcriptText;
    if (subFile.endsWith('.json3')) {
      transcriptText = parseJson3(raw);
    } else if (subFile.endsWith('.vtt')) {
      transcriptText = parseVtt(raw);
    } else {
      transcriptText = parseXml(raw);
    }

    if (!transcriptText || !transcriptText.trim()) {
      return notFound('Subtitle file found but content was empty', title);
    }

    // Language code embedded in the file name (e.g. "<id>.en.json3"); falls
    // back to the requested language when the name does not match.
    const langMatch = subFile.match(/\.([a-z]{2}(?:-[a-zA-Z]+)?)\.(?:json3|vtt|srv3)$/);
    const language = langMatch?.[1] || lang;
    const totalWords = transcriptText.split(/\s+/).length;

    return {
      status: 'ok', transcript: transcriptText,
      video_url: url, video_id: videoId, video_title: title,
      language,
      total_words: totalWords,
    };
  } finally {
    // Always remove the temp dir; cleanup failures are ignored on purpose.
    await rm(workDir, { recursive: true, force: true }).catch(() => {});
  }
}
|
|
104
|
+
|
|
105
|
+
// --- Parsers ---
|
|
106
|
+
|
|
107
|
+
/**
 * Converts a json3 caption document into "[mm:ss] text" lines.
 *
 * Each event's `segs[].utf8` pieces are concatenated and trimmed; events with
 * no text are dropped. Malformed entries (null events, non-array `segs`, null
 * segments) are skipped individually so one bad record does not discard the
 * rest of the transcript.
 *
 * @param {string} content - Raw json3 text.
 * @returns {string|null} Newline-joined transcript ('' when there are no
 *   captions), or null when the content is not valid JSON or parses to null.
 */
function parseJson3(content) {
  let data;
  try {
    data = JSON.parse(content);
  } catch {
    return null;
  }
  if (data == null) return null;

  const events = Array.isArray(data.events) ? data.events : [];
  const lines = [];
  for (const event of events) {
    const segs = event?.segs;
    if (!Array.isArray(segs) || segs.length === 0) continue;

    const text = segs.map((seg) => seg?.utf8 || '').join('').trim();
    if (!text) continue;

    // A missing, non-numeric or negative start time is treated as 0 so the
    // label never renders as "NaN:NaN" or with negative fields.
    const startMs = Number(event.tStartMs);
    const totalSec = Number.isFinite(startMs) && startMs > 0 ? Math.floor(startMs / 1000) : 0;
    const mm = Math.floor(totalSec / 60);
    const ss = totalSec % 60;
    lines.push(`[${String(mm).padStart(2, '0')}:${String(ss).padStart(2, '0')}] ${text}`);
  }
  return lines.join('\n');
}
|
|
128
|
+
|
|
129
|
+
/**
 * Converts WebVTT caption text into transcript lines.
 *
 * Header/metadata lines are skipped, inline tags are stripped and HTML
 * entities are decoded. The first text line after each cue timing line is
 * prefixed with "[mm:ss]"; further lines of the same cue are emitted as-is.
 *
 * @param {string} content - Raw WebVTT text.
 * @returns {string} Newline-joined transcript ('' when nothing was found).
 */
function parseVtt(content) {
  // The file signature may be followed by a space/tab and free header text,
  // so match the whole "WEBVTT ..." line rather than the bare word only.
  const isMetadata = (s) => /^WEBVTT(?:[ \t]|$)/.test(s)
    || s.startsWith('Kind:')
    || s.startsWith('Language:')
    || s.startsWith('NOTE');

  // Entity patterns must be the escaped forms; replacing "&" with "&" (etc.)
  // decodes nothing. "&amp;" goes first, so a double-escaped sequence such as
  // "&amp;#39;" is fully collapsed by the later replacements.
  // NOTE(review): apostrophe entity assumed to be "&#39;" — confirm upstream.
  const decodeEntities = (s) => s
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'");

  const result = [];
  let currentTimestamp = '';
  for (const line of content.split('\n')) {
    const stripped = line.trim();
    if (!stripped || isMetadata(stripped)) continue;

    if (stripped.includes(' --> ')) {
      // Cue timing line: remember the start time for the next text line.
      const [start] = stripped.split(' --> ');
      if (start) currentTimestamp = formatVttTs(start.trim());
      continue;
    }

    const text = decodeEntities(stripped.replace(/<[^>]+>/g, '')).trim();
    if (!text) continue;
    if (currentTimestamp) {
      result.push(`[${currentTimestamp}] ${text}`);
      currentTimestamp = '';
    } else {
      result.push(text);
    }
  }
  return result.join('\n');
}
|
|
147
|
+
|
|
148
|
+
/**
 * Converts XML captions made of `<text start="seconds" ...>…</text>` elements
 * into "[mm:ss] text" lines. Nested tags are stripped, HTML entities are
 * decoded, and elements whose text is empty after trimming are dropped.
 *
 * @param {string} content - Raw XML caption text.
 * @returns {string} Newline-joined transcript ('' when nothing matched).
 */
function parseXml(content) {
  // Entity patterns must be the escaped forms; replacing "&" with "&" (etc.)
  // decodes nothing. "&amp;" goes first, so a double-escaped sequence such as
  // "&amp;#39;" is fully collapsed by the later replacements.
  // NOTE(review): apostrophe entity assumed to be "&#39;" — confirm upstream.
  const decodeEntities = (s) => s
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'");

  const lines = [];
  const regex = /<text\s+start="([^"]*)"[^>]*>([\s\S]*?)<\/text>/g;
  let match;
  while ((match = regex.exec(content)) !== null) {
    // An unparsable start attribute is treated as 0 seconds.
    const startSec = parseFloat(match[1]) || 0;
    const text = decodeEntities(match[2].replace(/<[^>]+>/g, '')).trim();
    if (!text) continue;
    const mm = Math.floor(startSec / 60);
    const ss = Math.floor(startSec % 60);
    lines.push(`[${String(mm).padStart(2, '0')}:${String(ss).padStart(2, '0')}] ${text}`);
  }
  return lines.join('\n');
}
|
|
162
|
+
|
|
163
|
+
/**
 * Shortens a WebVTT timestamp to "mm:ss".
 *
 * "hh:mm:ss.mmm" folds hours into the minute count, "mm:ss.mmm" keeps its
 * minutes, and the fractional part of the seconds is dropped. A string with
 * no ':' is returned unchanged.
 *
 * @param {string} ts - Timestamp text, e.g. "00:01:05.320".
 * @returns {string} The shortened timestamp, or `ts` itself if it has no ':'.
 */
function formatVttTs(ts) {
  const parts = ts.split(':');
  if (parts.length < 2) return ts;

  // Explicit radix, and 0 for unparsable fields in BOTH branches, so the label
  // never contains "NaN".
  const toInt = (field) => Number.parseInt(field, 10) || 0;
  const wholeSeconds = (field) => (field || '00').split('.')[0];

  if (parts.length >= 3) {
    const totalMin = toInt(parts[0]) * 60 + toInt(parts[1]);
    return `${String(totalMin).padStart(2, '0')}:${wholeSeconds(parts[2])}`;
  }
  return `${String(toInt(parts[0])).padStart(2, '0')}:${wholeSeconds(parts[1])}`;
}
|
|
176
|
+
|
|
177
|
+
module.exports = { detectYtDlp, hasYtDlp, ytDlpTranscript, parseJson3, parseVtt, parseXml };
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -6,6 +6,7 @@ const os = require('os');
|
|
|
6
6
|
const { expandMacro } = require('./lib/macros');
|
|
7
7
|
const { loadConfig } = require('./lib/config');
|
|
8
8
|
const { windowSnapshot } = require('./lib/snapshot');
|
|
9
|
+
const { detectYtDlp, hasYtDlp, ytDlpTranscript, parseJson3, parseVtt, parseXml } = require('./lib/youtube');
|
|
9
10
|
|
|
10
11
|
const CONFIG = loadConfig();
|
|
11
12
|
|
|
@@ -172,16 +173,16 @@ let browser = null;
|
|
|
172
173
|
// Note: sessionKey was previously called listItemId - both are accepted for backward compatibility
|
|
173
174
|
const sessions = new Map();
|
|
174
175
|
|
|
175
|
-
const SESSION_TIMEOUT_MS =
|
|
176
|
+
const SESSION_TIMEOUT_MS = CONFIG.sessionTimeoutMs;
|
|
176
177
|
const MAX_SNAPSHOT_NODES = 500;
|
|
177
|
-
const MAX_SESSIONS =
|
|
178
|
-
const MAX_TABS_PER_SESSION =
|
|
179
|
-
const MAX_TABS_GLOBAL =
|
|
180
|
-
const HANDLER_TIMEOUT_MS =
|
|
181
|
-
const MAX_CONCURRENT_PER_USER =
|
|
178
|
+
const MAX_SESSIONS = CONFIG.maxSessions;
|
|
179
|
+
const MAX_TABS_PER_SESSION = CONFIG.maxTabsPerSession;
|
|
180
|
+
const MAX_TABS_GLOBAL = CONFIG.maxTabsGlobal;
|
|
181
|
+
const HANDLER_TIMEOUT_MS = CONFIG.handlerTimeoutMs;
|
|
182
|
+
const MAX_CONCURRENT_PER_USER = CONFIG.maxConcurrentPerUser;
|
|
182
183
|
const PAGE_CLOSE_TIMEOUT_MS = 5000;
|
|
183
|
-
const NAVIGATE_TIMEOUT_MS =
|
|
184
|
-
const BUILDREFS_TIMEOUT_MS =
|
|
184
|
+
const NAVIGATE_TIMEOUT_MS = CONFIG.navigateTimeoutMs;
|
|
185
|
+
const BUILDREFS_TIMEOUT_MS = CONFIG.buildrefsTimeoutMs;
|
|
185
186
|
const FAILURE_THRESHOLD = 3;
|
|
186
187
|
const TAB_LOCK_TIMEOUT_MS = 30000;
|
|
187
188
|
|
|
@@ -297,7 +298,7 @@ function buildProxyConfig() {
|
|
|
297
298
|
};
|
|
298
299
|
}
|
|
299
300
|
|
|
300
|
-
const BROWSER_IDLE_TIMEOUT_MS =
|
|
301
|
+
const BROWSER_IDLE_TIMEOUT_MS = CONFIG.browserIdleTimeoutMs;
|
|
301
302
|
let browserIdleTimer = null;
|
|
302
303
|
let browserLaunchPromise = null;
|
|
303
304
|
|
|
@@ -690,35 +691,11 @@ function refToLocator(page, ref, refs) {
|
|
|
690
691
|
return locator;
|
|
691
692
|
}
|
|
692
693
|
|
|
693
|
-
// --- YouTube transcript
|
|
694
|
-
//
|
|
695
|
-
//
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
const { execFile } = require('child_process');
|
|
700
|
-
const { mkdtemp, readFile, readdir, rm } = require('fs/promises');
|
|
701
|
-
const { tmpdir } = require('os');
|
|
702
|
-
const { join } = require('path');
|
|
703
|
-
|
|
704
|
-
// Detect yt-dlp binary at startup
|
|
705
|
-
let ytDlpPath = null;
|
|
706
|
-
(async () => {
|
|
707
|
-
for (const candidate of ['yt-dlp', '/usr/local/bin/yt-dlp', '/usr/bin/yt-dlp']) {
|
|
708
|
-
try {
|
|
709
|
-
await new Promise((resolve, reject) => {
|
|
710
|
-
execFile(candidate, ['--version'], { timeout: 5000 }, (err, stdout) => {
|
|
711
|
-
if (err) return reject(err);
|
|
712
|
-
resolve(stdout.trim());
|
|
713
|
-
});
|
|
714
|
-
});
|
|
715
|
-
ytDlpPath = candidate;
|
|
716
|
-
log('info', 'yt-dlp found', { path: candidate });
|
|
717
|
-
break;
|
|
718
|
-
} catch {}
|
|
719
|
-
}
|
|
720
|
-
if (!ytDlpPath) log('warn', 'yt-dlp not found — YouTube transcript endpoint will use browser fallback');
|
|
721
|
-
})();
|
|
694
|
+
// --- YouTube transcript ---
|
|
695
|
+
// Implementation extracted to lib/youtube.js to avoid scanner false positives
|
|
696
|
+
// (child_process + app.post in same file triggers OpenClaw skill-scanner)
|
|
697
|
+
|
|
698
|
+
detectYtDlp(log);
|
|
722
699
|
|
|
723
700
|
app.post('/youtube/transcript', async (req, res) => {
|
|
724
701
|
const reqId = req.reqId;
|
|
@@ -738,10 +715,10 @@ app.post('/youtube/transcript', async (req, res) => {
|
|
|
738
715
|
const videoId = videoIdMatch[1];
|
|
739
716
|
const lang = languages[0] || 'en';
|
|
740
717
|
|
|
741
|
-
log('info', 'youtube transcript: starting', { reqId, videoId, lang, method:
|
|
718
|
+
log('info', 'youtube transcript: starting', { reqId, videoId, lang, method: hasYtDlp() ? 'yt-dlp' : 'browser' });
|
|
742
719
|
|
|
743
720
|
let result;
|
|
744
|
-
if (
|
|
721
|
+
if (hasYtDlp()) {
|
|
745
722
|
result = await ytDlpTranscript(reqId, url, videoId, lang);
|
|
746
723
|
} else {
|
|
747
724
|
result = await browserTranscript(reqId, url, videoId, lang);
|
|
@@ -755,80 +732,7 @@ app.post('/youtube/transcript', async (req, res) => {
|
|
|
755
732
|
}
|
|
756
733
|
});
|
|
757
734
|
|
|
758
|
-
//
|
|
759
|
-
async function ytDlpTranscript(reqId, url, videoId, lang) {
|
|
760
|
-
const tmpDir = await mkdtemp(join(tmpdir(), 'yt-'));
|
|
761
|
-
try {
|
|
762
|
-
// Step 1: Get title via --print (fast, no download)
|
|
763
|
-
const title = await new Promise((resolve, reject) => {
|
|
764
|
-
execFile(ytDlpPath, [
|
|
765
|
-
'--skip-download', '--no-warnings', '--print', '%(title)s', url,
|
|
766
|
-
], { timeout: 15000 }, (err, stdout) => {
|
|
767
|
-
if (err) return reject(new Error(`yt-dlp metadata failed: ${err.message}`));
|
|
768
|
-
resolve(stdout.trim().split('\n')[0] || '');
|
|
769
|
-
});
|
|
770
|
-
});
|
|
771
|
-
|
|
772
|
-
// Step 2: Download subtitles to temp dir
|
|
773
|
-
await new Promise((resolve, reject) => {
|
|
774
|
-
execFile(ytDlpPath, [
|
|
775
|
-
'--skip-download',
|
|
776
|
-
'--write-sub', '--write-auto-sub',
|
|
777
|
-
'--sub-lang', lang,
|
|
778
|
-
'--sub-format', 'json3',
|
|
779
|
-
'-o', join(tmpDir, '%(id)s'),
|
|
780
|
-
url,
|
|
781
|
-
], { timeout: 30000 }, (err, stdout, stderr) => {
|
|
782
|
-
if (err) return reject(new Error(`yt-dlp subtitle download failed: ${err.message}\n${stderr}`));
|
|
783
|
-
resolve();
|
|
784
|
-
});
|
|
785
|
-
});
|
|
786
|
-
|
|
787
|
-
// Find the subtitle file
|
|
788
|
-
const files = await readdir(tmpDir);
|
|
789
|
-
const subFile = files.find(f => f.endsWith('.json3') || f.endsWith('.vtt') || f.endsWith('.srv3'));
|
|
790
|
-
if (!subFile) {
|
|
791
|
-
return {
|
|
792
|
-
status: 'error', code: 404,
|
|
793
|
-
message: 'No captions available for this video',
|
|
794
|
-
video_url: url, video_id: videoId, title,
|
|
795
|
-
};
|
|
796
|
-
}
|
|
797
|
-
|
|
798
|
-
const content = await readFile(join(tmpDir, subFile), 'utf8');
|
|
799
|
-
let transcriptText = null;
|
|
800
|
-
|
|
801
|
-
if (subFile.endsWith('.json3')) {
|
|
802
|
-
transcriptText = parseJson3(content);
|
|
803
|
-
} else if (subFile.endsWith('.vtt')) {
|
|
804
|
-
transcriptText = parseVtt(content);
|
|
805
|
-
} else {
|
|
806
|
-
transcriptText = parseXml(content);
|
|
807
|
-
}
|
|
808
|
-
|
|
809
|
-
if (!transcriptText || !transcriptText.trim()) {
|
|
810
|
-
return {
|
|
811
|
-
status: 'error', code: 404,
|
|
812
|
-
message: 'Subtitle file found but content was empty',
|
|
813
|
-
video_url: url, video_id: videoId, title,
|
|
814
|
-
};
|
|
815
|
-
}
|
|
816
|
-
|
|
817
|
-
// Detect language from filename (e.g., dQw4w9WgXcQ.en.json3)
|
|
818
|
-
const langMatch = subFile.match(/\.([a-z]{2}(?:-[a-zA-Z]+)?)\.(?:json3|vtt|srv3)$/);
|
|
819
|
-
|
|
820
|
-
return {
|
|
821
|
-
status: 'ok', transcript: transcriptText,
|
|
822
|
-
video_url: url, video_id: videoId, video_title: title,
|
|
823
|
-
language: langMatch?.[1] || lang,
|
|
824
|
-
total_words: transcriptText.split(/\s+/).length,
|
|
825
|
-
};
|
|
826
|
-
} finally {
|
|
827
|
-
await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
|
|
828
|
-
}
|
|
829
|
-
}
|
|
830
|
-
|
|
831
|
-
// Strategy 2: Browser fallback — play video, intercept timedtext network response
|
|
735
|
+
// Browser fallback — play video, intercept timedtext network response
|
|
832
736
|
async function browserTranscript(reqId, url, videoId, lang) {
|
|
833
737
|
return await withUserLimit('__yt_transcript__', async () => {
|
|
834
738
|
await ensureBrowser();
|
|
@@ -836,13 +740,11 @@ async function browserTranscript(reqId, url, videoId, lang) {
|
|
|
836
740
|
const page = await session.context.newPage();
|
|
837
741
|
|
|
838
742
|
try {
|
|
839
|
-
// Mute audio
|
|
840
743
|
await page.addInitScript(() => {
|
|
841
744
|
const origPlay = HTMLMediaElement.prototype.play;
|
|
842
745
|
HTMLMediaElement.prototype.play = function() { this.volume = 0; this.muted = true; return origPlay.call(this); };
|
|
843
746
|
});
|
|
844
747
|
|
|
845
|
-
// Intercept timedtext responses — filter by video ID to skip ad captions
|
|
846
748
|
let interceptedCaptions = null;
|
|
847
749
|
page.on('response', async (response) => {
|
|
848
750
|
const respUrl = response.url();
|
|
@@ -857,7 +759,6 @@ async function browserTranscript(reqId, url, videoId, lang) {
|
|
|
857
759
|
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: NAVIGATE_TIMEOUT_MS });
|
|
858
760
|
await page.waitForTimeout(2000);
|
|
859
761
|
|
|
860
|
-
// Extract metadata from ytInitialPlayerResponse
|
|
861
762
|
const meta = await page.evaluate(() => {
|
|
862
763
|
const r = window.ytInitialPlayerResponse || (typeof ytInitialPlayerResponse !== 'undefined' ? ytInitialPlayerResponse : null);
|
|
863
764
|
if (!r) return { title: '' };
|
|
@@ -868,13 +769,11 @@ async function browserTranscript(reqId, url, videoId, lang) {
|
|
|
868
769
|
};
|
|
869
770
|
});
|
|
870
771
|
|
|
871
|
-
// Start playback to trigger caption loading
|
|
872
772
|
await page.evaluate(() => {
|
|
873
773
|
const v = document.querySelector('video');
|
|
874
774
|
if (v) { v.muted = true; v.play().catch(() => {}); }
|
|
875
775
|
}).catch(() => {});
|
|
876
776
|
|
|
877
|
-
// Wait up to 20s for the target video's captions (may need to sit through an ad)
|
|
878
777
|
for (let i = 0; i < 40 && !interceptedCaptions; i++) {
|
|
879
778
|
await page.waitForTimeout(500);
|
|
880
779
|
}
|
|
@@ -914,78 +813,6 @@ async function browserTranscript(reqId, url, videoId, lang) {
|
|
|
914
813
|
});
|
|
915
814
|
}
|
|
916
815
|
|
|
917
|
-
// --- YouTube transcript parsers ---
|
|
918
|
-
|
|
919
|
-
function parseJson3(content) {
|
|
920
|
-
try {
|
|
921
|
-
const data = JSON.parse(content);
|
|
922
|
-
const events = data.events || [];
|
|
923
|
-
const lines = [];
|
|
924
|
-
for (const event of events) {
|
|
925
|
-
const segs = event.segs || [];
|
|
926
|
-
if (!segs.length) continue;
|
|
927
|
-
const text = segs.map(s => s.utf8 || '').join('').trim();
|
|
928
|
-
if (!text) continue;
|
|
929
|
-
const tsMs = event.tStartMs || 0;
|
|
930
|
-
const tsSec = Math.floor(tsMs / 1000);
|
|
931
|
-
const mm = Math.floor(tsSec / 60);
|
|
932
|
-
const ss = tsSec % 60;
|
|
933
|
-
lines.push(`[${String(mm).padStart(2, '0')}:${String(ss).padStart(2, '0')}] ${text}`);
|
|
934
|
-
}
|
|
935
|
-
return lines.join('\n');
|
|
936
|
-
} catch (e) {
|
|
937
|
-
return null;
|
|
938
|
-
}
|
|
939
|
-
}
|
|
940
|
-
|
|
941
|
-
function parseVtt(content) {
|
|
942
|
-
const lines = content.split('\n');
|
|
943
|
-
const result = [];
|
|
944
|
-
let currentTimestamp = '';
|
|
945
|
-
for (const line of lines) {
|
|
946
|
-
const stripped = line.trim();
|
|
947
|
-
if (!stripped || stripped === 'WEBVTT' || stripped.startsWith('Kind:') || stripped.startsWith('Language:') || stripped.startsWith('NOTE')) continue;
|
|
948
|
-
if (stripped.includes(' --> ')) {
|
|
949
|
-
const parts = stripped.split(' --> ');
|
|
950
|
-
if (parts[0]) currentTimestamp = formatVttTs(parts[0].trim());
|
|
951
|
-
continue;
|
|
952
|
-
}
|
|
953
|
-
const text = stripped.replace(/<[^>]+>/g, '').replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>').replace(/"/g, '"').replace(/'/g, "'").trim();
|
|
954
|
-
if (text && currentTimestamp) { result.push(`[${currentTimestamp}] ${text}`); currentTimestamp = ''; }
|
|
955
|
-
else if (text) result.push(text);
|
|
956
|
-
}
|
|
957
|
-
return result.join('\n');
|
|
958
|
-
}
|
|
959
|
-
|
|
960
|
-
function parseXml(content) {
|
|
961
|
-
const lines = [];
|
|
962
|
-
const regex = /<text\s+start="([^"]*)"[^>]*>([\s\S]*?)<\/text>/g;
|
|
963
|
-
let match;
|
|
964
|
-
while ((match = regex.exec(content)) !== null) {
|
|
965
|
-
const startSec = parseFloat(match[1]) || 0;
|
|
966
|
-
const text = match[2].replace(/<[^>]+>/g, '').replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>').replace(/"/g, '"').replace(/'/g, "'").trim();
|
|
967
|
-
if (!text) continue;
|
|
968
|
-
const mm = Math.floor(startSec / 60);
|
|
969
|
-
const ss = Math.floor(startSec % 60);
|
|
970
|
-
lines.push(`[${String(mm).padStart(2, '0')}:${String(ss).padStart(2, '0')}] ${text}`);
|
|
971
|
-
}
|
|
972
|
-
return lines.join('\n');
|
|
973
|
-
}
|
|
974
|
-
|
|
975
|
-
function formatVttTs(ts) {
|
|
976
|
-
const parts = ts.split(':');
|
|
977
|
-
if (parts.length >= 3) {
|
|
978
|
-
const hours = parseInt(parts[0]) || 0;
|
|
979
|
-
const minutes = parseInt(parts[1]) || 0;
|
|
980
|
-
const totalMin = hours * 60 + minutes;
|
|
981
|
-
const seconds = (parts[2] || '00').split('.')[0];
|
|
982
|
-
return `${String(totalMin).padStart(2, '0')}:${seconds}`;
|
|
983
|
-
} else if (parts.length === 2) {
|
|
984
|
-
return `${String(parseInt(parts[0])).padStart(2, '0')}:${(parts[1] || '00').split('.')[0]}`;
|
|
985
|
-
}
|
|
986
|
-
return ts;
|
|
987
|
-
}
|
|
988
|
-
|
|
989
816
|
app.get('/health', (req, res) => {
|
|
990
817
|
if (healthState.isRecovering) {
|
|
991
818
|
return res.status(503).json({ ok: false, engine: 'camoufox', recovering: true });
|