webpeel 0.20.8 → 0.20.10

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -224,8 +224,15 @@ export async function getYouTubeTranscript(url, options = {}) {
224
224
  const preferredLang = options.language ?? 'en';
225
225
  const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
226
226
  // --- Path 1: Try simpleFetch (fast, no browser overhead) ---
227
+ // YouTube serves consent/challenge pages to server IPs without cookies.
228
+ // Setting SOCS consent cookie bypasses this — same approach as youtube-transcript npm.
229
+ const ytUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
230
+ const ytHeaders = {
231
+ 'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
232
+ 'Accept-Language': 'en-US,en;q=0.9',
233
+ };
227
234
  try {
228
- const fetchResult = await simpleFetch(videoUrl, undefined, 15000);
235
+ const fetchResult = await simpleFetch(videoUrl, ytUserAgent, 15000, ytHeaders);
229
236
  const html = fetchResult.html;
230
237
  if (!html.includes('ytInitialPlayerResponse') && !html.includes('ytInitialData')) {
231
238
  throw new Error('YouTube served non-video page (likely challenge/consent)');
@@ -94,6 +94,7 @@ export function createApp(config = {}) {
94
94
  app.use((req, res, next) => {
95
95
  const path = req.path;
96
96
  let timeoutMs = 30000; // 30s default
97
+ const urlParam = req.query?.url || '';
97
98
  if (path.includes('/crawl') || path.includes('/map'))
98
99
  timeoutMs = 300000; // 5min for crawls
99
100
  else if (path.includes('/batch'))
@@ -102,6 +103,8 @@ export function createApp(config = {}) {
102
103
  timeoutMs = 60000; // 1min for screenshots
103
104
  else if (req.query?.render === 'true')
104
105
  timeoutMs = 60000; // 1min for rendered fetches
106
+ else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
107
+ timeoutMs = 45000; // 45s for YouTube (transcript extraction)
105
108
  req.setTimeout(timeoutMs);
106
109
  res.setTimeout(timeoutMs, () => {
107
110
  if (!res.headersSent) {
@@ -516,6 +516,8 @@ export function createFetchRouter(authStore) {
516
516
  // SECURITY: Sanitize error messages to prevent information disclosure
517
517
  if (err.code) {
518
518
  // WebPeelError from core library - safe to expose with helpful context
519
+ if (res.headersSent)
520
+ return; // Timeout middleware already responded
519
521
  const safeMessage = err.message.replace(/[<>"']/g, ''); // Remove HTML chars
520
522
  const statusCode = err.code === 'TIMEOUT' ? 504
521
523
  : err.code === 'BLOCKED' ? 403
@@ -540,6 +542,8 @@ export function createFetchRouter(authStore) {
540
542
  else {
541
543
  // Unexpected error - generic message only
542
544
  console.error('Fetch error:', err); // Log full error server-side
545
+ if (res.headersSent)
546
+ return; // Timeout middleware already responded
543
547
  res.status(500).json({
544
548
  success: false,
545
549
  error: {
@@ -1000,6 +1004,8 @@ export function createFetchRouter(authStore) {
1000
1004
  catch (error) {
1001
1005
  const err = error;
1002
1006
  console.error('POST fetch/scrape error:', err);
1007
+ if (res.headersSent)
1008
+ return; // Timeout middleware already responded
1003
1009
  if (err.code) {
1004
1010
  const safeMessage = err.message.replace(/[<>"']/g, '');
1005
1011
  const statusCode = err.code === 'TIMEOUT' ? 504
@@ -4,13 +4,31 @@
4
4
  * so it's never blocked by rate limiting (Render hits it every ~30s).
5
5
  */
6
6
  import { Router } from 'express';
7
+ import { readFileSync } from 'fs';
8
+ import { join, dirname } from 'path';
9
+ import { fileURLToPath } from 'url';
7
10
  const startTime = Date.now();
11
+ // Read version once at startup
12
+ let version = 'unknown';
13
+ try {
14
+ const pkgPath = join(dirname(fileURLToPath(import.meta.url)), '..', '..', '..', 'package.json');
15
+ version = JSON.parse(readFileSync(pkgPath, 'utf-8')).version;
16
+ }
17
+ catch {
18
+ // Fallback for bundled/Docker environments
19
+ try {
20
+ const altPath = join(process.cwd(), 'package.json');
21
+ version = JSON.parse(readFileSync(altPath, 'utf-8')).version;
22
+ }
23
+ catch { /* keep 'unknown' */ }
24
+ }
8
25
  export function createHealthRouter() {
9
26
  const router = Router();
10
27
  router.get('/health', (_req, res) => {
11
28
  const uptime = Math.floor((Date.now() - startTime) / 1000);
12
29
  res.json({
13
30
  status: 'healthy',
31
+ version,
14
32
  uptime,
15
33
  timestamp: new Date().toISOString(),
16
34
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.20.8",
3
+ "version": "0.20.10",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",