webpeel 0.21.53 → 0.21.56
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/core/pipeline.js
CHANGED
|
@@ -1217,52 +1217,6 @@ export async function finalize(ctx) {
|
|
|
1217
1217
|
log.error('Change tracking failed:', error);
|
|
1218
1218
|
}
|
|
1219
1219
|
}
|
|
1220
|
-
// ── Auto-escalation: retry thin content with browser rendering ──────────────
|
|
1221
|
-
// If simple fetch returned very little content and user didn't explicitly disable render,
|
|
1222
|
-
// automatically retry with browser rendering to handle JS-heavy/paywalled sites.
|
|
1223
|
-
const preEscalationWords = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1224
|
-
const escalationFetchMethod = fetchResult?.method || 'unknown';
|
|
1225
|
-
const alreadyTriedBrowser = escalationFetchMethod === 'browser' || escalationFetchMethod === 'stealth'
|
|
1226
|
-
|| options.render || options.stealth;
|
|
1227
|
-
const userDisabledRender = options.render === false;
|
|
1228
|
-
const escalationCandidate = preEscalationWords < 200 && preEscalationWords > 0
|
|
1229
|
-
&& escalationFetchMethod === 'simple' && !alreadyTriedBrowser && !userDisabledRender
|
|
1230
|
-
&& !ctx._escalated;
|
|
1231
|
-
if (escalationCandidate) {
|
|
1232
|
-
log.info(`thin content (${preEscalationWords}w) from simple fetch, auto-escalating to browser render for ${ctx.url}`);
|
|
1233
|
-
ctx._escalated = true;
|
|
1234
|
-
try {
|
|
1235
|
-
const { smartFetch } = await import('./strategies.js');
|
|
1236
|
-
const browserResult = await smartFetch(ctx.url, {
|
|
1237
|
-
forceBrowser: true,
|
|
1238
|
-
stealth: false,
|
|
1239
|
-
timeoutMs: options.timeout || 15000,
|
|
1240
|
-
proxy: options.proxy,
|
|
1241
|
-
});
|
|
1242
|
-
if (browserResult.html && browserResult.html.length > (fetchResult?.html?.length || 0)) {
|
|
1243
|
-
const { htmlToMarkdown } = await import('./markdown.js');
|
|
1244
|
-
const browserContent = htmlToMarkdown(browserResult.html);
|
|
1245
|
-
const browserWords = browserContent.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1246
|
-
if (browserWords > preEscalationWords) {
|
|
1247
|
-
log.info(`browser escalation improved content: ${preEscalationWords}w → ${browserWords}w`);
|
|
1248
|
-
ctx.content = browserContent;
|
|
1249
|
-
ctx.fetchResult = browserResult;
|
|
1250
|
-
ctx.fetchResult.method = 'browser-escalation';
|
|
1251
|
-
}
|
|
1252
|
-
else {
|
|
1253
|
-
log.debug(`browser escalation did not improve (${browserWords}w vs ${preEscalationWords}w)`);
|
|
1254
|
-
}
|
|
1255
|
-
// Always clean up browser resources
|
|
1256
|
-
if (browserResult.page)
|
|
1257
|
-
await browserResult.page.close().catch(() => { });
|
|
1258
|
-
if (browserResult.browser)
|
|
1259
|
-
await browserResult.browser.close().catch(() => { });
|
|
1260
|
-
}
|
|
1261
|
-
}
|
|
1262
|
-
catch (e) {
|
|
1263
|
-
log.debug('browser escalation failed:', e instanceof Error ? e.message : e);
|
|
1264
|
-
}
|
|
1265
|
-
}
|
|
1266
1220
|
// Generate AI summary if requested
|
|
1267
1221
|
if (options.summary && options.llm) {
|
|
1268
1222
|
try {
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Source credibility scoring — lightweight, zero dependencies.
|
|
3
|
+
*
|
|
4
|
+
* Classifies URLs by trustworthiness:
|
|
5
|
+
* - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
|
|
6
|
+
* - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
|
|
7
|
+
* - General (★): Everything else
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Result of classifying a source URL by trustworthiness.
 */
export interface SourceCredibility {
    /** Trust bucket: `official` is the highest tier, `general` the fallback. */
    tier: 'official' | 'verified' | 'general';
    /** Star rating paired with the tier (3 = official, 2 = verified, 1 = general). */
    stars: number;
    /** Display label for the tier: 'OFFICIAL SOURCE', 'VERIFIED' or 'UNVERIFIED'. */
    label: string;
}
/**
 * Classify a source URL into a credibility tier.
 *
 * @param url - URL of the source to assess.
 * @returns The tier, star rating and display label for the URL's hostname.
 *          A URL that cannot be parsed is reported as the `general` tier.
 */
export declare function getSourceCredibility(url: string): SourceCredibility;
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Source credibility scoring — lightweight, zero dependencies.
|
|
3
|
+
*
|
|
4
|
+
* Classifies URLs by trustworthiness:
|
|
5
|
+
* - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
|
|
6
|
+
* - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
|
|
7
|
+
* - General (★): Everything else
|
|
8
|
+
*/
|
|
9
|
+
/** Official TLDs and hostnames that indicate high-authority sources */
const OFFICIAL_TLDS = new Set(['.gov', '.edu', '.mil']);
const OFFICIAL_HOSTNAMES = new Set([
    // Academic / research
    'arxiv.org', 'scholar.google.com', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
    'jstor.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
    'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
    // International organisations
    'who.int', 'un.org', 'worldbank.org', 'imf.org', 'oecd.org', 'europa.eu',
    // Official tech documentation
    'docs.python.org', 'developer.mozilla.org', 'nodejs.org', 'rust-lang.org',
    'docs.microsoft.com', 'learn.microsoft.com', 'developer.apple.com',
    'developer.android.com', 'php.net', 'ruby-lang.org', 'golang.org', 'go.dev',
    // Health / medicine
    'cdc.gov', 'nih.gov', 'fda.gov', 'mayoclinic.org', 'clevelandclinic.org',
    'webmd.com', 'medlineplus.gov',
    // Standards / specs
    'w3.org', 'ietf.org', 'rfc-editor.org', 'iso.org',
]);
/** Hostnames of well-known, generally reliable (but not authoritative) sources */
const VERIFIED_HOSTNAMES = new Set([
    // Encyclopaedia / reference
    'wikipedia.org', 'en.wikipedia.org', 'britannica.com',
    // Reputable news agencies
    'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'nytimes.com',
    'washingtonpost.com', 'theguardian.com', 'economist.com', 'ft.com',
    'cnn.com', 'npr.org', 'pbs.org',
    // Developer resources
    'github.com', 'stackoverflow.com', 'npmjs.com', 'pypi.org',
    'crates.io', 'docs.rs', 'packagist.org', 'rubygems.org',
    // Official cloud / vendor docs
    'docs.aws.amazon.com', 'cloud.google.com', 'docs.github.com',
    'azure.microsoft.com', 'registry.terraform.io',
    // Reputable tech publications
    'arstechnica.com', 'wired.com', 'techcrunch.com', 'theverge.com',
    // National Geographic, Smithsonian
    'nationalgeographic.com', 'smithsonianmag.com',
]);
/**
 * List a hostname followed by each of its parent domains, most specific first,
 * stopping at two labels (e.g. `news.bbc.co.uk` → `news.bbc.co.uk`, `bbc.co.uk`, `co.uk`).
 */
function domainSuffixes(hostname) {
    const labels = hostname.split('.');
    const suffixes = [hostname];
    for (let i = 1; i <= labels.length - 2; i++) {
        suffixes.push(labels.slice(i).join('.'));
    }
    return suffixes;
}
/**
 * Assess the credibility of a source URL.
 *
 * The hostname is lower-cased and stripped of a trailing root dot and a leading
 * `www.`, then classified in this order:
 *   1. ends with an official TLD (.gov, .edu, .mil)      → official
 *   2. it, or any parent domain, is a known official host → official
 *   3. it, or any parent domain, is a known verified host → verified
 *   4. anything else, including unparseable URLs          → general
 *
 * @param url URL of the source to assess.
 * @returns `{ tier, stars, label }` describing the source's credibility.
 */
export function getSourceCredibility(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    }
    catch {
        // Not a parseable URL: treat as an unverified source rather than throwing.
        return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
    }
    // Drop the root dot of fully-qualified names ("cdc.gov.") so suffix checks still match.
    hostname = hostname.replace(/\.$/, '').replace(/^www\./, '');
    for (const tld of OFFICIAL_TLDS) {
        if (hostname.endsWith(tld)) {
            return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
        }
    }
    // Walk every parent domain, not just the last two labels, so sub-domains of
    // multi-label entries (bbc.co.uk, developer.apple.com, ...) are recognised.
    const candidates = domainSuffixes(hostname);
    if (candidates.some((name) => OFFICIAL_HOSTNAMES.has(name))) {
        return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
    }
    if (candidates.some((name) => VERIFIED_HOSTNAMES.has(name))) {
        return { tier: 'verified', stars: 2, label: 'VERIFIED' };
    }
    // Everything else
    return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
}
|
|
@@ -9,6 +9,7 @@ import { peel } from '../../index.js';
|
|
|
9
9
|
import { simpleFetch } from '../../core/fetcher.js';
|
|
10
10
|
import { searchCache } from '../../core/fetch-cache.js';
|
|
11
11
|
import { getSearchProvider, getBestSearchProvider, } from '../../core/search-provider.js';
|
|
12
|
+
import { getSourceCredibility } from '../../core/source-credibility.js';
|
|
12
13
|
export function createSearchRouter(authStore) {
|
|
13
14
|
const router = Router();
|
|
14
15
|
// LRU cache: 15 minute TTL, max 500 entries, 50MB total size
|
|
@@ -210,6 +211,19 @@ export function createSearchRouter(authStore) {
|
|
|
210
211
|
}
|
|
211
212
|
}
|
|
212
213
|
}
|
|
214
|
+
// Add credibility scores and sort by trustworthiness
|
|
215
|
+
const tierOrder = { official: 0, verified: 1, general: 2 };
|
|
216
|
+
results = results
|
|
217
|
+
.map(r => {
|
|
218
|
+
const cred = getSourceCredibility(r.url);
|
|
219
|
+
return { ...r, credibility: cred };
|
|
220
|
+
})
|
|
221
|
+
.sort((a, b) => {
|
|
222
|
+
const aTier = tierOrder[a.credibility?.tier || 'general'] ?? 2;
|
|
223
|
+
const bTier = tierOrder[b.credibility?.tier || 'general'] ?? 2;
|
|
224
|
+
return aTier - bTier; // Official first, then verified, then general
|
|
225
|
+
})
|
|
226
|
+
.map((r, i) => ({ ...r, rank: i + 1 }));
|
|
213
227
|
data.web = results;
|
|
214
228
|
}
|
|
215
229
|
// Fetch news results (DDG only — Brave news is not supported via HTML scraping)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.53",
|
|
3
|
+
"version": "0.21.56",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|