webpeel 0.21.0 → 0.21.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -40,6 +40,21 @@ const DNS_WARMUP_DOMAINS = [
40
40
  'tools.ietf.org',
41
41
  'unicode.org',
42
42
  'www.bbc.com',
43
+ 'bbc.co.uk',
44
+ 'stripe.com',
45
+ 'docs.stripe.com',
46
+ 'vuejs.org',
47
+ 'angular.io',
48
+ 'www.washingtonpost.com',
49
+ 'www.theguardian.com',
50
+ 'techcrunch.com',
51
+ 'www.wired.com',
52
+ 'arstechnica.com',
53
+ 'docs.google.com',
54
+ 'drive.google.com',
55
+ 'www.notion.so',
56
+ 'www.producthunt.com',
57
+ 'www.crunchbase.com',
43
58
  'news.google.com',
44
59
  'www.youtube.com',
45
60
  'example.com',
@@ -42,11 +42,13 @@ const HTTP_STATUS_TEXT = {
42
42
  // ── HTTP connection pool ──────────────────────────────────────────────────────
43
43
  function createHttpPool() {
44
44
  return new Agent({
45
- connections: 20,
46
- pipelining: 6,
45
+ connections: 50,
46
+ pipelining: 10,
47
47
  keepAliveTimeout: 60000,
48
48
  keepAliveMaxTimeout: 60000,
49
49
  allowH2: true,
50
+ headersTimeout: 10000,
51
+ bodyTimeout: 30000,
50
52
  connect: {
51
53
  lookup: cachedLookup,
52
54
  },
@@ -75,6 +75,8 @@ export interface PipelineContext {
75
75
  domainApiHandled?: boolean;
76
76
  /** True when server returned pre-rendered markdown (Content-Type: text/markdown) */
77
77
  serverMarkdown?: boolean;
78
+ /** True when a non-rendered HTTP fetch returned status 200 with HTML content (> 200 chars) in < 500ms — enables fast path (skip challenge detection) */
79
+ fastPath?: boolean;
78
80
  /** Non-fatal warnings accumulated during the pipeline run */
79
81
  warnings: string[];
80
82
  /** Raw HTML size in characters (measured from fetched content before any conversion) */
@@ -399,7 +399,17 @@ export async function fetchContent(ctx) {
399
399
  }
400
400
  throw fetchError;
401
401
  }
402
- ctx.timer.end('fetch');
402
+ const fetchDuration = ctx.timer.end('fetch');
403
+ // Fast path: if a plain HTTP fetch completed quickly with real HTML content,
404
+ // mark it so post-processing can skip expensive heuristics (challenge detection).
405
+ // Only applies to non-browser fetches that succeeded with HTML content.
406
+ if (fetchDuration < 500 &&
407
+ !ctx.render &&
408
+ fetchResult.statusCode === 200 &&
409
+ (fetchResult.contentType || '').includes('html') &&
410
+ (fetchResult.html?.length || 0) > 200) {
411
+ ctx.fastPath = true;
412
+ }
403
413
  // Auto-scroll to load lazy content, then grab fresh HTML
404
414
  if (needsAutoScroll && fetchResult.page) {
405
415
  try {
@@ -927,7 +937,9 @@ export async function postProcess(ctx) {
927
937
  // === Challenge / bot-protection page detection ===
928
938
  // If the extracted content looks like a challenge page (not real content),
929
939
  // mark it and try the search-as-proxy fallback to get the real info.
930
- if (ctx.content && ctx.content.length < 2000) {
940
+ // Fast path: skip this check for HTTP fetches that completed in < 500ms —
941
+ // a fast successful response is virtually never a challenge page.
942
+ if (!ctx.fastPath && ctx.content && ctx.content.length < 2000) {
931
943
  const lowerContent = ctx.content.toLowerCase();
932
944
  const challengeSignals = [
933
945
  'please verify you are a human',
@@ -229,7 +229,10 @@ export function createFetchRouter(authStore) {
229
229
  const cacheAge = Date.now() - cached.timestamp;
230
230
  if (cacheAge < maxAgeMs && cacheAge < cacheTtlMs) {
231
231
  res.setHeader('X-Cache', 'HIT');
232
+ res.setHeader('X-Cache-Status', 'HIT');
232
233
  res.setHeader('X-Cache-Age', Math.floor(cacheAge / 1000).toString());
234
+ // Cache-Control: allow Cloudflare edge to cache successful GET responses for 60s
235
+ res.setHeader('Cache-Control', 'public, s-maxage=60, stale-while-revalidate=300');
233
236
  if (wantsEnvelope(req)) {
234
237
  successResponse(res, cached.result, {
235
238
  requestId: req.requestId,
@@ -467,9 +470,19 @@ export function createFetchRouter(authStore) {
467
470
  : undefined;
468
471
  // Add usage headers (kept for backward compat; also surfaced in envelope metadata)
469
472
  res.setHeader('X-Cache', 'MISS');
473
+ res.setHeader('X-Cache-Status', 'MISS');
470
474
  res.setHeader('X-Credits-Used', '1');
471
475
  res.setHeader('X-Processing-Time', elapsed.toString());
472
476
  res.setHeader('X-Fetch-Type', fetchType);
477
+ // Cache-Control: allow Cloudflare edge to cache successful GET responses for 60s
478
+ res.setHeader('Cache-Control', 'public, s-maxage=60, stale-while-revalidate=300');
479
+ // Response timing headers — let customers see exactly where time is spent
480
+ const timingFetch = result.timing?.fetch ?? 0;
481
+ const timingParse = (result.timing?.convert ?? 0) + (result.timing?.metadata ?? 0) + (result.timing?.prune ?? 0);
482
+ res.setHeader('X-Response-Time', `${elapsed}ms`);
483
+ res.setHeader('X-Fetch-Time', `${timingFetch}ms`);
484
+ res.setHeader('X-Parse-Time', `${timingParse}ms`);
485
+ res.setHeader('Server-Timing', `fetch;dur=${timingFetch}, parse;dur=${timingParse}, total;dur=${elapsed}`);
473
486
  // Build response — extend result with optional answer/summary fields
474
487
  const getResponseBody = { ...result };
475
488
  if (getAnswerResult !== undefined)
@@ -676,6 +689,7 @@ export function createFetchRouter(authStore) {
676
689
  const cacheAge = Date.now() - cached.timestamp;
677
690
  if (cacheAge < postCacheTtlMs) {
678
691
  res.setHeader('X-Cache', 'HIT');
692
+ res.setHeader('X-Cache-Status', 'HIT');
679
693
  res.setHeader('X-Cache-Age', Math.floor(cacheAge / 1000).toString());
680
694
  if (wantsEnvelope(req)) {
681
695
  successResponse(res, cached.result, {
@@ -932,9 +946,17 @@ export function createFetchRouter(authStore) {
932
946
  // --- Build response ------------------------------------------------------
933
947
  // Headers kept for backward compat; also surfaced in envelope metadata.
934
948
  res.setHeader('X-Cache', 'MISS');
949
+ res.setHeader('X-Cache-Status', 'MISS');
935
950
  res.setHeader('X-Credits-Used', '1');
936
951
  res.setHeader('X-Processing-Time', elapsed.toString());
937
952
  res.setHeader('X-Fetch-Type', fetchType);
953
+ // Response timing headers — let customers see exactly where time is spent
954
+ const postTimingFetch = result.timing?.fetch ?? 0;
955
+ const postTimingParse = (result.timing?.convert ?? 0) + (result.timing?.metadata ?? 0) + (result.timing?.prune ?? 0);
956
+ res.setHeader('X-Response-Time', `${elapsed}ms`);
957
+ res.setHeader('X-Fetch-Time', `${postTimingFetch}ms`);
958
+ res.setHeader('X-Parse-Time', `${postTimingParse}ms`);
959
+ res.setHeader('Server-Timing', `fetch;dur=${postTimingFetch}, parse;dur=${postTimingParse}, total;dur=${elapsed}`);
938
960
  const responseBody = { ...result };
939
961
  if (jsonData !== undefined) {
940
962
  responseBody.json = jsonData;
@@ -96,6 +96,11 @@ export function createReaderRouter() {
96
96
  selector: targetSelector,
97
97
  waitSelector: waitForSelector,
98
98
  });
99
+ // Cache-Control: this endpoint is public and heavily cacheable.
100
+ // Cloudflare edge caches for 2 min; serves stale for up to 10 min while revalidating.
101
+ res.setHeader('Cache-Control', 'public, s-maxage=120, stale-while-revalidate=600');
102
+ // Vary on Accept so different content-type representations are cached separately.
103
+ res.setHeader('Vary', 'Accept');
99
104
  // Return based on format
100
105
  const responseFormat = format.toLowerCase();
101
106
  if (responseFormat === 'text') {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.0",
3
+ "version": "0.21.1",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",