webpeel 0.20.8 → 0.20.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/core/youtube.js
CHANGED
|
@@ -224,8 +224,15 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
224
224
|
const preferredLang = options.language ?? 'en';
|
|
225
225
|
const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
|
|
226
226
|
// --- Path 1: Try simpleFetch (fast, no browser overhead) ---
|
|
227
|
+
// YouTube serves consent/challenge pages to server IPs without cookies.
|
|
228
|
+
// Setting SOCS consent cookie bypasses this — same approach as youtube-transcript npm.
|
|
229
|
+
const ytUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
|
|
230
|
+
const ytHeaders = {
|
|
231
|
+
'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
|
|
232
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
233
|
+
};
|
|
227
234
|
try {
|
|
228
|
-
const fetchResult = await simpleFetch(videoUrl,
|
|
235
|
+
const fetchResult = await simpleFetch(videoUrl, ytUserAgent, 15000, ytHeaders);
|
|
229
236
|
const html = fetchResult.html;
|
|
230
237
|
if (!html.includes('ytInitialPlayerResponse') && !html.includes('ytInitialData')) {
|
|
231
238
|
throw new Error('YouTube served non-video page (likely challenge/consent)');
|
package/dist/server/app.js
CHANGED
|
@@ -94,6 +94,7 @@ export function createApp(config = {}) {
|
|
|
94
94
|
app.use((req, res, next) => {
|
|
95
95
|
const path = req.path;
|
|
96
96
|
let timeoutMs = 30000; // 30s default
|
|
97
|
+
const urlParam = req.query?.url || '';
|
|
97
98
|
if (path.includes('/crawl') || path.includes('/map'))
|
|
98
99
|
timeoutMs = 300000; // 5min for crawls
|
|
99
100
|
else if (path.includes('/batch'))
|
|
@@ -102,6 +103,8 @@ export function createApp(config = {}) {
|
|
|
102
103
|
timeoutMs = 60000; // 1min for screenshots
|
|
103
104
|
else if (req.query?.render === 'true')
|
|
104
105
|
timeoutMs = 60000; // 1min for rendered fetches
|
|
106
|
+
else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
|
|
107
|
+
timeoutMs = 45000; // 45s for YouTube (transcript extraction)
|
|
105
108
|
req.setTimeout(timeoutMs);
|
|
106
109
|
res.setTimeout(timeoutMs, () => {
|
|
107
110
|
if (!res.headersSent) {
|
|
@@ -516,6 +516,8 @@ export function createFetchRouter(authStore) {
|
|
|
516
516
|
// SECURITY: Sanitize error messages to prevent information disclosure
|
|
517
517
|
if (err.code) {
|
|
518
518
|
// WebPeelError from core library - safe to expose with helpful context
|
|
519
|
+
if (res.headersSent)
|
|
520
|
+
return; // Timeout middleware already responded
|
|
519
521
|
const safeMessage = err.message.replace(/[<>"']/g, ''); // Remove HTML chars
|
|
520
522
|
const statusCode = err.code === 'TIMEOUT' ? 504
|
|
521
523
|
: err.code === 'BLOCKED' ? 403
|
|
@@ -540,6 +542,8 @@ export function createFetchRouter(authStore) {
|
|
|
540
542
|
else {
|
|
541
543
|
// Unexpected error - generic message only
|
|
542
544
|
console.error('Fetch error:', err); // Log full error server-side
|
|
545
|
+
if (res.headersSent)
|
|
546
|
+
return; // Timeout middleware already responded
|
|
543
547
|
res.status(500).json({
|
|
544
548
|
success: false,
|
|
545
549
|
error: {
|
|
@@ -1000,6 +1004,8 @@ export function createFetchRouter(authStore) {
|
|
|
1000
1004
|
catch (error) {
|
|
1001
1005
|
const err = error;
|
|
1002
1006
|
console.error('POST fetch/scrape error:', err);
|
|
1007
|
+
if (res.headersSent)
|
|
1008
|
+
return; // Timeout middleware already responded
|
|
1003
1009
|
if (err.code) {
|
|
1004
1010
|
const safeMessage = err.message.replace(/[<>"']/g, '');
|
|
1005
1011
|
const statusCode = err.code === 'TIMEOUT' ? 504
|
|
@@ -4,13 +4,31 @@
|
|
|
4
4
|
* so it's never blocked by rate limiting (Render hits it every ~30s).
|
|
5
5
|
*/
|
|
6
6
|
import { Router } from 'express';
|
|
7
|
+
import { readFileSync } from 'fs';
|
|
8
|
+
import { join, dirname } from 'path';
|
|
9
|
+
import { fileURLToPath } from 'url';
|
|
7
10
|
const startTime = Date.now();
|
|
11
|
+
// Read version once at startup
|
|
12
|
+
let version = 'unknown';
|
|
13
|
+
try {
|
|
14
|
+
const pkgPath = join(dirname(fileURLToPath(import.meta.url)), '..', '..', '..', 'package.json');
|
|
15
|
+
version = JSON.parse(readFileSync(pkgPath, 'utf-8')).version;
|
|
16
|
+
}
|
|
17
|
+
catch {
|
|
18
|
+
// Fallback for bundled/Docker environments
|
|
19
|
+
try {
|
|
20
|
+
const altPath = join(process.cwd(), 'package.json');
|
|
21
|
+
version = JSON.parse(readFileSync(altPath, 'utf-8')).version;
|
|
22
|
+
}
|
|
23
|
+
catch { /* keep 'unknown' */ }
|
|
24
|
+
}
|
|
8
25
|
export function createHealthRouter() {
|
|
9
26
|
const router = Router();
|
|
10
27
|
router.get('/health', (_req, res) => {
|
|
11
28
|
const uptime = Math.floor((Date.now() - startTime) / 1000);
|
|
12
29
|
res.json({
|
|
13
30
|
status: 'healthy',
|
|
31
|
+
version,
|
|
14
32
|
uptime,
|
|
15
33
|
timestamp: new Date().toISOString(),
|
|
16
34
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.20.8",
|
|
3
|
+
"version": "0.20.10",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|