webpeel 0.21.23 → 0.21.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -844,20 +844,27 @@ export class DuckDuckGoProvider {
|
|
|
844
844
|
// Stage 4: Stealth multi-engine (DDG + Bing + Ecosia in parallel)
|
|
845
845
|
// Bypasses bot-detection on datacenter IPs. This is the reliable
|
|
846
846
|
// last resort — but it spins up a browser so it takes a few seconds.
|
|
847
|
+
// DISABLED on memory-constrained servers (512MB) — Playwright OOM kills.
|
|
848
|
+
// Set NO_BROWSER_SEARCH=1 to skip this stage entirely.
|
|
847
849
|
// -----------------------------------------------------------
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
850
|
+
if (!process.env.NO_BROWSER_SEARCH) {
|
|
851
|
+
log.debug('Trying stealth browser search (DDG + Bing + Ecosia)...');
|
|
852
|
+
try {
|
|
853
|
+
const stealthProvider = new StealthSearchProvider();
|
|
854
|
+
// StealthSearchProvider already applies filterRelevantResults internally.
|
|
855
|
+
const stealthResults = await stealthProvider.searchWeb(query, options);
|
|
856
|
+
if (stealthResults.length > 0) {
|
|
857
|
+
log.debug(`source=stealth returned ${stealthResults.length} results`);
|
|
858
|
+
return stealthResults;
|
|
859
|
+
}
|
|
860
|
+
log.debug('Stealth search returned 0 results');
|
|
861
|
+
}
|
|
862
|
+
catch (e) {
|
|
863
|
+
log.debug('Stealth search failed:', e instanceof Error ? e.message : e);
|
|
856
864
|
}
|
|
857
|
-
log.debug('Stealth search returned 0 results');
|
|
858
865
|
}
|
|
859
|
-
|
|
860
|
-
log.debug('Stealth search failed:', e instanceof Error ? e.message : e);
|
|
866
|
+
else {
|
|
867
|
+
log.debug('Stealth browser search skipped (NO_BROWSER_SEARCH=1)');
|
|
861
868
|
}
|
|
862
869
|
return [];
|
|
863
870
|
}
|
|
@@ -30,7 +30,7 @@ export interface ExtractionResult {
|
|
|
30
30
|
* @param llmConfig Optional LLM config (if omitted, uses heuristic fallback)
|
|
31
31
|
* @param prompt Optional user guidance added to the LLM prompt
|
|
32
32
|
*/
|
|
33
|
-
export declare function extractStructured(content: string, schema: ExtractionSchema, llmConfig?: LLMConfig, prompt?: string): Promise<ExtractionResult>;
|
|
33
|
+
export declare function extractStructured(content: string, schema: ExtractionSchema, llmConfig?: LLMConfig, prompt?: string, domainHints?: Record<string, unknown>): Promise<ExtractionResult>;
|
|
34
34
|
/**
|
|
35
35
|
* Convert a shorthand schema `{ field: "string", active: "boolean" }` to a
|
|
36
36
|
* full ExtractionSchema. Useful for CLI --extract flag.
|
|
@@ -422,7 +422,7 @@ async function heuristicExtract(content, schema) {
|
|
|
422
422
|
* @param llmConfig Optional LLM config (if omitted, uses heuristic fallback)
|
|
423
423
|
* @param prompt Optional user guidance added to the LLM prompt
|
|
424
424
|
*/
|
|
425
|
-
export async function extractStructured(content, schema, llmConfig, prompt) {
|
|
425
|
+
export async function extractStructured(content, schema, llmConfig, prompt, domainHints) {
|
|
426
426
|
// Guard: empty content
|
|
427
427
|
if (!content || content.trim().length === 0) {
|
|
428
428
|
return { data: {}, confidence: 0, tokensUsed: 0 };
|
|
@@ -495,7 +495,35 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
|
|
|
495
495
|
}
|
|
496
496
|
}
|
|
497
497
|
// ── Heuristic extraction ─────────────────────────────────────────────────
|
|
498
|
-
|
|
498
|
+
const heuristic = await heuristicExtract(content, schema);
|
|
499
|
+
// ── Domain hints overlay ─────────────────────────────────────────────────
|
|
500
|
+
// If domain-api pre-extracted fields (e.g. GitHub stars/language), merge them
|
|
501
|
+
// into the result. Domain-api data is authoritative — prefer over heuristic.
|
|
502
|
+
if (domainHints && Object.keys(domainHints).length > 0) {
|
|
503
|
+
const props = schema.properties;
|
|
504
|
+
let hintMerged = 0;
|
|
505
|
+
for (const [field, hintValue] of Object.entries(domainHints)) {
|
|
506
|
+
if (field in props && hintValue !== null && hintValue !== undefined) {
|
|
507
|
+
const expected = props[field].type;
|
|
508
|
+
const actual = typeof hintValue;
|
|
509
|
+
// Only merge if type matches (or number vs string coercion)
|
|
510
|
+
if (actual === expected ||
|
|
511
|
+
(expected === 'number' && actual === 'string' && !isNaN(Number(hintValue))) ||
|
|
512
|
+
(expected === 'string' && actual !== 'object')) {
|
|
513
|
+
heuristic.data[field] =
|
|
514
|
+
expected === 'number' ? Number(hintValue) : hintValue;
|
|
515
|
+
hintMerged++;
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
if (hintMerged > 0) {
|
|
520
|
+
// Boost confidence since we have authoritative domain-api data
|
|
521
|
+
const filled = Object.values(heuristic.data).filter(v => v !== null && v !== undefined).length;
|
|
522
|
+
const total = Object.keys(props).length;
|
|
523
|
+
heuristic.confidence = parseFloat(Math.min(0.90, 0.65 + (filled / total) * 0.25).toFixed(2));
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
return heuristic;
|
|
499
527
|
}
|
|
500
528
|
// ---------------------------------------------------------------------------
|
|
501
529
|
// Helper: convert simple { field: "type" } map → ExtractionSchema
|
|
@@ -186,12 +186,24 @@ export function createExtractRouter() {
|
|
|
186
186
|
const peelResult = await peel(url, {
|
|
187
187
|
format: 'markdown',
|
|
188
188
|
render: useRender,
|
|
189
|
+
noEscalate: !useRender, // prevent OOM: only browser when render=true explicitly
|
|
189
190
|
timeout: 30000,
|
|
190
191
|
readable: true,
|
|
191
192
|
});
|
|
192
193
|
const content = peelResult.content || '';
|
|
193
194
|
// ── Extract structured data ─────────────────────────────────────────
|
|
194
|
-
|
|
195
|
+
// Seed hints from domain-api structured data (GitHub stars/language, etc.)
|
|
196
|
+
// This lets heuristic extraction use pre-parsed structured fields as ground truth.
|
|
197
|
+
const domainHints = {};
|
|
198
|
+
const rawDomainData = peelResult.domainData?.structured;
|
|
199
|
+
if (rawDomainData && typeof rawDomainData === 'object') {
|
|
200
|
+
for (const [k, v] of Object.entries(rawDomainData)) {
|
|
201
|
+
if (v !== null && v !== undefined && v !== '') {
|
|
202
|
+
domainHints[k] = v;
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
const extractResult = await extractStructured(content, schema, llmConfig, typeof prompt === 'string' ? prompt : undefined, Object.keys(domainHints).length > 0 ? domainHints : undefined);
|
|
195
207
|
const method = llmConfig ? 'llm' : 'heuristic';
|
|
196
208
|
res.json({
|
|
197
209
|
success: true,
|
|
@@ -341,6 +341,10 @@ export function createFetchRouter(authStore) {
|
|
|
341
341
|
lite: lite === 'true',
|
|
342
342
|
timeout: timeout ? parseInt(timeout, 10) : undefined,
|
|
343
343
|
captionImages: captionImages === 'true',
|
|
344
|
+
// Prevent auto-escalation to browser unless render=true is explicitly requested.
|
|
345
|
+
// On 512MB containers, surprise browser launches cause OOM kills.
|
|
346
|
+
// Domain extractors (GitHub, Wikipedia, npm etc.) use HTTP APIs, not the browser.
|
|
347
|
+
noEscalate: !shouldRender,
|
|
344
348
|
};
|
|
345
349
|
// Auto-budget: default to 4000 tokens for API requests when no budget specified
|
|
346
350
|
// Opt-out: budget=0 explicitly disables. Lite mode disables auto-budget.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.25",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|