webpeel 0.20.9 → 0.20.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/core/youtube.js
CHANGED
|
@@ -224,8 +224,15 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
224
224
|
const preferredLang = options.language ?? 'en';
|
|
225
225
|
const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
|
|
226
226
|
// --- Path 1: Try simpleFetch (fast, no browser overhead) ---
|
|
227
|
+
// YouTube serves consent/challenge pages to server IPs without cookies.
|
|
228
|
+
// Setting SOCS consent cookie bypasses this — same approach as youtube-transcript npm.
|
|
229
|
+
const ytUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
|
|
230
|
+
const ytHeaders = {
|
|
231
|
+
'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
|
|
232
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
233
|
+
};
|
|
227
234
|
try {
|
|
228
|
-
const fetchResult = await simpleFetch(videoUrl,
|
|
235
|
+
const fetchResult = await simpleFetch(videoUrl, ytUserAgent, 15000, ytHeaders);
|
|
229
236
|
const html = fetchResult.html;
|
|
230
237
|
if (!html.includes('ytInitialPlayerResponse') && !html.includes('ytInitialData')) {
|
|
231
238
|
throw new Error('YouTube served non-video page (likely challenge/consent)');
|
|
@@ -245,11 +252,12 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
245
252
|
throw new Error('No captions available');
|
|
246
253
|
const availableLanguages = captionTracks.map(t => t.languageCode);
|
|
247
254
|
const selectedTrack = selectBestTrack(captionTracks, preferredLang);
|
|
248
|
-
const captionXml = await fetchCaptionXml(selectedTrack.baseUrl);
|
|
255
|
+
// Pass same cookies + user-agent to caption fetch — URL is session-locked
|
|
256
|
+
const captionXml = await fetchCaptionXml(selectedTrack.baseUrl, ytUserAgent, ytHeaders);
|
|
249
257
|
const segments = parseCaptionXml(captionXml);
|
|
250
258
|
if (segments.length === 0) {
|
|
251
259
|
// Caption URL returned empty content (common when ip=0.0.0.0 in signature)
|
|
252
|
-
// Fall through to browser intercept path
|
|
260
|
+
// Fall through to yt-dlp / browser intercept path
|
|
253
261
|
throw new Error('Caption XML returned empty — session-locked URL');
|
|
254
262
|
}
|
|
255
263
|
const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
|
|
@@ -671,10 +679,10 @@ function selectBestTrack(tracks, preferredLang) {
|
|
|
671
679
|
}
|
|
672
680
|
/**
|
|
673
681
|
* Fetch the caption XML from YouTube's timedtext API.
|
|
674
|
-
*
|
|
682
|
+
* Must use same cookies/UA as the page fetch — URLs are session-locked.
|
|
675
683
|
*/
|
|
676
|
-
async function fetchCaptionXml(baseUrl) {
|
|
677
|
-
const result = await simpleFetch(baseUrl,
|
|
684
|
+
async function fetchCaptionXml(baseUrl, userAgent, headers) {
|
|
685
|
+
const result = await simpleFetch(baseUrl, userAgent, 10000, headers);
|
|
678
686
|
return result.html;
|
|
679
687
|
}
|
|
680
688
|
/**
|
package/dist/server/app.js
CHANGED
|
@@ -94,6 +94,7 @@ export function createApp(config = {}) {
|
|
|
94
94
|
app.use((req, res, next) => {
|
|
95
95
|
const path = req.path;
|
|
96
96
|
let timeoutMs = 30000; // 30s default
|
|
97
|
+
const urlParam = req.query?.url || '';
|
|
97
98
|
if (path.includes('/crawl') || path.includes('/map'))
|
|
98
99
|
timeoutMs = 300000; // 5min for crawls
|
|
99
100
|
else if (path.includes('/batch'))
|
|
@@ -102,6 +103,8 @@ export function createApp(config = {}) {
|
|
|
102
103
|
timeoutMs = 60000; // 1min for screenshots
|
|
103
104
|
else if (req.query?.render === 'true')
|
|
104
105
|
timeoutMs = 60000; // 1min for rendered fetches
|
|
106
|
+
else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
|
|
107
|
+
timeoutMs = 45000; // 45s for YouTube (transcript extraction)
|
|
105
108
|
req.setTimeout(timeoutMs);
|
|
106
109
|
res.setTimeout(timeoutMs, () => {
|
|
107
110
|
if (!res.headersSent) {
|
|
@@ -516,6 +516,8 @@ export function createFetchRouter(authStore) {
|
|
|
516
516
|
// SECURITY: Sanitize error messages to prevent information disclosure
|
|
517
517
|
if (err.code) {
|
|
518
518
|
// WebPeelError from core library - safe to expose with helpful context
|
|
519
|
+
if (res.headersSent)
|
|
520
|
+
return; // Timeout middleware already responded
|
|
519
521
|
const safeMessage = err.message.replace(/[<>"']/g, ''); // Remove HTML chars
|
|
520
522
|
const statusCode = err.code === 'TIMEOUT' ? 504
|
|
521
523
|
: err.code === 'BLOCKED' ? 403
|
|
@@ -540,6 +542,8 @@ export function createFetchRouter(authStore) {
|
|
|
540
542
|
else {
|
|
541
543
|
// Unexpected error - generic message only
|
|
542
544
|
console.error('Fetch error:', err); // Log full error server-side
|
|
545
|
+
if (res.headersSent)
|
|
546
|
+
return; // Timeout middleware already responded
|
|
543
547
|
res.status(500).json({
|
|
544
548
|
success: false,
|
|
545
549
|
error: {
|
|
@@ -1000,6 +1004,8 @@ export function createFetchRouter(authStore) {
|
|
|
1000
1004
|
catch (error) {
|
|
1001
1005
|
const err = error;
|
|
1002
1006
|
console.error('POST fetch/scrape error:', err);
|
|
1007
|
+
if (res.headersSent)
|
|
1008
|
+
return; // Timeout middleware already responded
|
|
1003
1009
|
if (err.code) {
|
|
1004
1010
|
const safeMessage = err.message.replace(/[<>"']/g, '');
|
|
1005
1011
|
const statusCode = err.code === 'TIMEOUT' ? 504
|
|
@@ -4,13 +4,31 @@
|
|
|
4
4
|
* so it's never blocked by rate limiting (Render hits it every ~30s).
|
|
5
5
|
*/
|
|
6
6
|
import { Router } from 'express';
|
|
7
|
+
import { readFileSync } from 'fs';
|
|
8
|
+
import { join, dirname } from 'path';
|
|
9
|
+
import { fileURLToPath } from 'url';
|
|
7
10
|
const startTime = Date.now();
|
|
11
|
+
// Read version once at startup
|
|
12
|
+
let version = 'unknown';
|
|
13
|
+
try {
|
|
14
|
+
const pkgPath = join(dirname(fileURLToPath(import.meta.url)), '..', '..', '..', 'package.json');
|
|
15
|
+
version = JSON.parse(readFileSync(pkgPath, 'utf-8')).version;
|
|
16
|
+
}
|
|
17
|
+
catch {
|
|
18
|
+
// Fallback for bundled/Docker environments
|
|
19
|
+
try {
|
|
20
|
+
const altPath = join(process.cwd(), 'package.json');
|
|
21
|
+
version = JSON.parse(readFileSync(altPath, 'utf-8')).version;
|
|
22
|
+
}
|
|
23
|
+
catch { /* keep 'unknown' */ }
|
|
24
|
+
}
|
|
8
25
|
export function createHealthRouter() {
|
|
9
26
|
const router = Router();
|
|
10
27
|
router.get('/health', (_req, res) => {
|
|
11
28
|
const uptime = Math.floor((Date.now() - startTime) / 1000);
|
|
12
29
|
res.json({
|
|
13
30
|
status: 'healthy',
|
|
31
|
+
version,
|
|
14
32
|
uptime,
|
|
15
33
|
timestamp: new Date().toISOString(),
|
|
16
34
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.20.9",
|
|
3
|
+
"version": "0.20.11",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|