webpeel 0.21.24 → 0.21.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,10 +12,10 @@
12
12
  * In production with no API keys configured, getBestSearchProvider() returns
13
13
  * StealthSearchProvider since DDG HTTP is often blocked on datacenter IPs.
14
14
  */
15
- import { fetch as undiciFetch } from 'undici';
15
+ import { fetch as undiciFetch, ProxyAgent } from 'undici';
16
16
  import { load } from 'cheerio';
17
17
  import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
18
- import { getWebshareProxy } from './proxy-config.js';
18
+ import { getWebshareProxy, getWebshareProxyUrl } from './proxy-config.js';
19
19
  import { createLogger } from './logger.js';
20
20
  const log = createLogger('search');
21
21
  function decodeHtmlEntities(input) {
@@ -626,7 +626,9 @@ export class DuckDuckGoProvider {
626
626
  const { count, signal } = options;
627
627
  const searchUrl = this.buildSearchUrl(query, options);
628
628
  // Use realistic browser headers to avoid DDG bot detection on datacenter IPs
629
- const response = await undiciFetch(searchUrl, {
629
+ // Route through residential proxy when available (datacenter IPs are blocked)
630
+ const proxyUrl = getWebshareProxyUrl();
631
+ const fetchOpts = {
630
632
  headers: {
631
633
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
632
634
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
@@ -641,7 +643,11 @@ export class DuckDuckGoProvider {
641
643
  'Referer': 'https://duckduckgo.com/',
642
644
  },
643
645
  signal,
644
- });
646
+ };
647
+ if (proxyUrl) {
648
+ fetchOpts.dispatcher = new ProxyAgent(proxyUrl);
649
+ }
650
+ const response = await undiciFetch(searchUrl, fetchOpts);
645
651
  if (!response.ok) {
646
652
  throw new Error(`Search failed: HTTP ${response.status}`);
647
653
  }
@@ -701,7 +707,8 @@ export class DuckDuckGoProvider {
701
707
  const { count, signal } = options;
702
708
  const params = new URLSearchParams();
703
709
  params.set('q', query);
704
- const response = await undiciFetch(`https://lite.duckduckgo.com/lite/?${params.toString()}`, {
710
+ const liteProxyUrl = getWebshareProxyUrl();
711
+ const liteFetchOpts = {
705
712
  headers: {
706
713
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
707
714
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -709,7 +716,11 @@ export class DuckDuckGoProvider {
709
716
  'Referer': 'https://lite.duckduckgo.com/',
710
717
  },
711
718
  signal,
712
- });
719
+ };
720
+ if (liteProxyUrl) {
721
+ liteFetchOpts.dispatcher = new ProxyAgent(liteProxyUrl);
722
+ }
723
+ const response = await undiciFetch(`https://lite.duckduckgo.com/lite/?${params.toString()}`, liteFetchOpts);
713
724
  if (!response.ok)
714
725
  return [];
715
726
  const html = await response.text();
@@ -30,7 +30,7 @@ export interface ExtractionResult {
30
30
  * @param llmConfig Optional LLM config (if omitted, uses heuristic fallback)
31
31
  * @param prompt Optional user guidance added to the LLM prompt
32
32
  */
33
- export declare function extractStructured(content: string, schema: ExtractionSchema, llmConfig?: LLMConfig, prompt?: string): Promise<ExtractionResult>;
33
+ export declare function extractStructured(content: string, schema: ExtractionSchema, llmConfig?: LLMConfig, prompt?: string, domainHints?: Record<string, unknown>): Promise<ExtractionResult>;
34
34
  /**
35
35
  * Convert a shorthand schema `{ field: "string", active: "boolean" }` to a
36
36
  * full ExtractionSchema. Useful for CLI --extract flag.
@@ -422,7 +422,7 @@ async function heuristicExtract(content, schema) {
422
422
  * @param llmConfig Optional LLM config (if omitted, uses heuristic fallback)
423
423
  * @param prompt Optional user guidance added to the LLM prompt
424
424
  */
425
- export async function extractStructured(content, schema, llmConfig, prompt) {
425
+ export async function extractStructured(content, schema, llmConfig, prompt, domainHints) {
426
426
  // Guard: empty content
427
427
  if (!content || content.trim().length === 0) {
428
428
  return { data: {}, confidence: 0, tokensUsed: 0 };
@@ -495,7 +495,35 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
495
495
  }
496
496
  }
497
497
  // ── Heuristic extraction ─────────────────────────────────────────────────
498
- return heuristicExtract(content, schema);
498
+ const heuristic = await heuristicExtract(content, schema);
499
+ // ── Domain hints overlay ─────────────────────────────────────────────────
500
+ // If domain-api pre-extracted fields (e.g. GitHub stars/language), merge them
501
+ // into the result. Domain-api data is authoritative — prefer over heuristic.
502
+ if (domainHints && Object.keys(domainHints).length > 0) {
503
+ const props = schema.properties;
504
+ let hintMerged = 0;
505
+ for (const [field, hintValue] of Object.entries(domainHints)) {
506
+ if (field in props && hintValue !== null && hintValue !== undefined) {
507
+ const expected = props[field].type;
508
+ const actual = typeof hintValue;
509
+ // Merge only if the type matches, a numeric string targets a number field, or a non-object value targets a string field
510
+ if (actual === expected ||
511
+ (expected === 'number' && actual === 'string' && !isNaN(Number(hintValue))) ||
512
+ (expected === 'string' && actual !== 'object')) {
513
+ heuristic.data[field] =
514
+ expected === 'number' ? Number(hintValue) : hintValue;
515
+ hintMerged++;
516
+ }
517
+ }
518
+ }
519
+ if (hintMerged > 0) {
520
+ // Boost confidence since we have authoritative domain-api data
521
+ const filled = Object.values(heuristic.data).filter(v => v !== null && v !== undefined).length;
522
+ const total = Object.keys(props).length;
523
+ heuristic.confidence = parseFloat(Math.min(0.90, 0.65 + (filled / total) * 0.25).toFixed(2));
524
+ }
525
+ }
526
+ return heuristic;
499
527
  }
500
528
  // ---------------------------------------------------------------------------
501
529
  // Helper: convert simple { field: "type" } map → ExtractionSchema
@@ -186,12 +186,24 @@ export function createExtractRouter() {
186
186
  const peelResult = await peel(url, {
187
187
  format: 'markdown',
188
188
  render: useRender,
189
+ noEscalate: !useRender, // prevent OOM: only use the browser when render=true is set explicitly
189
190
  timeout: 30000,
190
191
  readable: true,
191
192
  });
192
193
  const content = peelResult.content || '';
193
194
  // ── Extract structured data ─────────────────────────────────────────
194
- const extractResult = await extractStructured(content, schema, llmConfig, typeof prompt === 'string' ? prompt : undefined);
195
+ // Seed hints from domain-api structured data (GitHub stars/language, etc.)
196
+ // This lets heuristic extraction use pre-parsed structured fields as ground truth.
197
+ const domainHints = {};
198
+ const rawDomainData = peelResult.domainData?.structured;
199
+ if (rawDomainData && typeof rawDomainData === 'object') {
200
+ for (const [k, v] of Object.entries(rawDomainData)) {
201
+ if (v !== null && v !== undefined && v !== '') {
202
+ domainHints[k] = v;
203
+ }
204
+ }
205
+ }
206
+ const extractResult = await extractStructured(content, schema, llmConfig, typeof prompt === 'string' ? prompt : undefined, Object.keys(domainHints).length > 0 ? domainHints : undefined);
195
207
  const method = llmConfig ? 'llm' : 'heuristic';
196
208
  res.json({
197
209
  success: true,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.24",
3
+ "version": "0.21.26",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",