webpeel 0.21.9 → 0.21.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -409,6 +409,28 @@ export async function runFetch(url, options) {
|
|
|
409
409
|
}
|
|
410
410
|
// Suppress spinner when --progress is active (progress lines replace it)
|
|
411
411
|
const spinner = (options.silent || options.progress) ? null : ora('Fetching...').start();
|
|
412
|
+
// Auto progress: after 3 s, update spinner text with elapsed time + method hints
|
|
413
|
+
// Updated every 2 s so the user knows we're still working.
|
|
414
|
+
const autoProgressStart = Date.now();
|
|
415
|
+
const autoProgressSteps = [
|
|
416
|
+
{ afterMs: 3000, text: '⏳ Fetching... (slow response)' },
|
|
417
|
+
{ afterMs: 6000, text: '⏳ Fetching with browser... ({s}s)' },
|
|
418
|
+
{ afterMs: 12000, text: '⏳ Fetching with browser... ({s}s — stealth may be needed)' },
|
|
419
|
+
{ afterMs: 20000, text: '⏳ Fetching with stealth browser + proxy... ({s}s)' },
|
|
420
|
+
];
|
|
421
|
+
let autoProgressStepIdx = 0;
|
|
422
|
+
const autoProgressInterval = spinner ? setInterval(() => {
|
|
423
|
+
const elapsed = Date.now() - autoProgressStart;
|
|
424
|
+
const secs = Math.round(elapsed / 1000);
|
|
425
|
+
while (autoProgressStepIdx < autoProgressSteps.length &&
|
|
426
|
+
elapsed >= autoProgressSteps[autoProgressStepIdx].afterMs) {
|
|
427
|
+
autoProgressStepIdx++;
|
|
428
|
+
}
|
|
429
|
+
if (autoProgressStepIdx > 0 && spinner) {
|
|
430
|
+
const tmpl = autoProgressSteps[autoProgressStepIdx - 1].text;
|
|
431
|
+
spinner.text = tmpl.replace('{s}', String(secs));
|
|
432
|
+
}
|
|
433
|
+
}, 2000) : null;
|
|
412
434
|
try {
|
|
413
435
|
// Validate options
|
|
414
436
|
if (options.wait && (options.wait < 0 || options.wait > 60000)) {
|
|
@@ -661,11 +683,13 @@ export async function runFetch(url, options) {
|
|
|
661
683
|
if (resolvedProfileName) {
|
|
662
684
|
touchProfile(resolvedProfileName);
|
|
663
685
|
}
|
|
664
|
-
// Stop progress
|
|
686
|
+
// Stop progress intervals and show final result
|
|
665
687
|
if (progressInterval) {
|
|
666
688
|
clearInterval(progressInterval);
|
|
667
689
|
progressInterval = undefined;
|
|
668
690
|
}
|
|
691
|
+
if (autoProgressInterval)
|
|
692
|
+
clearInterval(autoProgressInterval);
|
|
669
693
|
if (options.progress) {
|
|
670
694
|
const method = result.method || 'simple';
|
|
671
695
|
const elapsedSec = ((result.elapsed || (Date.now() - progressStart)) / 1000).toFixed(1);
|
|
@@ -1067,6 +1091,8 @@ export async function runFetch(url, options) {
|
|
|
1067
1091
|
process.exit(0);
|
|
1068
1092
|
}
|
|
1069
1093
|
catch (error) {
|
|
1094
|
+
if (autoProgressInterval)
|
|
1095
|
+
clearInterval(autoProgressInterval);
|
|
1070
1096
|
if (spinner) {
|
|
1071
1097
|
spinner.fail('Failed to fetch');
|
|
1072
1098
|
}
|
|
@@ -308,6 +308,7 @@ async function twitterExtractor(html, url) {
|
|
|
308
308
|
if (fxData && fxData.code === 200 && fxData.user) {
|
|
309
309
|
const u = fxData.user;
|
|
310
310
|
const structured = {
|
|
311
|
+
title: `${u.name || ''} (@${u.screen_name || ''}) on X/Twitter`,
|
|
311
312
|
name: u.name || '',
|
|
312
313
|
handle: '@' + (u.screen_name || ''),
|
|
313
314
|
bio: u.description || '',
|
|
@@ -1500,6 +1501,7 @@ async function npmExtractor(_html, url) {
|
|
|
1500
1501
|
}
|
|
1501
1502
|
catch { /* optional */ }
|
|
1502
1503
|
const structured = {
|
|
1504
|
+
title: `${data.name}@${latest || 'unknown'}`,
|
|
1503
1505
|
name: data.name,
|
|
1504
1506
|
description: data.description || '',
|
|
1505
1507
|
version: latest || 'unknown',
|
|
@@ -17,6 +17,8 @@ export interface WebSearchResult {
|
|
|
17
17
|
title: string;
|
|
18
18
|
url: string;
|
|
19
19
|
snippet: string;
|
|
20
|
+
/** Relevance score (0–1) based on keyword overlap with query. Added by filterRelevantResults. */
|
|
21
|
+
relevanceScore?: number;
|
|
20
22
|
}
|
|
21
23
|
export interface WebSearchOptions {
|
|
22
24
|
/** Number of results (1-10) */
|
|
@@ -71,6 +73,18 @@ declare class ProviderStatsTracker {
|
|
|
71
73
|
* (e.g. in tests) and to log diagnostics.
|
|
72
74
|
*/
|
|
73
75
|
export declare const providerStats: ProviderStatsTracker;
|
|
76
|
+
/**
|
|
77
|
+
* Filter and rank results by relevance to the original query.
|
|
78
|
+
*
|
|
79
|
+
* 1. Extract meaningful keywords from the query (remove stop words).
|
|
80
|
+
* 2. Score each result by keyword overlap with title + URL + snippet.
|
|
81
|
+
* 3. Remove results with zero overlap (completely irrelevant).
|
|
82
|
+
* 4. Sort descending by score, keeping original index as tiebreaker.
|
|
83
|
+
* 5. Attach `relevanceScore` (0–1) to each surviving result.
|
|
84
|
+
*
|
|
85
|
+
* If the query yields no keywords, the input results are returned as-is (no `relevanceScore` is attached).
|
|
86
|
+
*/
|
|
87
|
+
export declare function filterRelevantResults(results: WebSearchResult[], query: string): WebSearchResult[];
|
|
74
88
|
/**
|
|
75
89
|
* StealthSearchProvider — self-hosted multi-engine search
|
|
76
90
|
*
|
|
@@ -115,6 +129,11 @@ export declare class DuckDuckGoProvider implements SearchProvider {
|
|
|
115
129
|
*/
|
|
116
130
|
private searchLite;
|
|
117
131
|
searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
|
|
132
|
+
/**
|
|
133
|
+
* Exposed for testing: score and filter a pre-fetched result list against a query.
|
|
134
|
+
* Equivalent to calling filterRelevantResults() directly.
|
|
135
|
+
*/
|
|
136
|
+
filterResults(results: WebSearchResult[], query: string): WebSearchResult[];
|
|
118
137
|
}
|
|
119
138
|
export declare class BraveSearchProvider implements SearchProvider {
|
|
120
139
|
readonly id: SearchProviderId;
|
|
@@ -195,6 +195,91 @@ function normalizeUrlForDedupe(rawUrl) {
|
|
|
195
195
|
.replace(/\/+$/g, '');
|
|
196
196
|
}
|
|
197
197
|
}
|
|
198
|
+
// ============================================================
|
|
199
|
+
// Result Relevance Filtering
|
|
200
|
+
// Lightweight keyword-overlap scoring — no external deps.
|
|
201
|
+
// Applied after fetching raw results to remove completely off-
|
|
202
|
+
// topic hits (e.g., a grammar article returned for "used cars").
|
|
203
|
+
// ============================================================
|
|
204
|
+
/**
 * Words ignored when extracting keywords from a search query: articles,
 * auxiliaries, prepositions, question words and generic intent words
 * ('best', 'top', 'buy', ...). All entries are lowercase because queries are
 * lowercased before lookup.
 */
const STOP_WORDS = new Set([
    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'how', 'what', 'where', 'when', 'why', 'best', 'top', 'most',
    'and', 'or', 'but', 'not', 'do', 'does', 'did', 'be', 'been', 'have', 'has',
    'buy', 'get', 'find', 'about', 'from', 'by', 'its', 'it', 'this', 'that',
]);
/**
 * Extract meaningful keywords from a search query.
 *
 * The query is lowercased, punctuation and symbols are replaced by spaces, and
 * the remainder is split on whitespace. Tokens shorter than two characters and
 * tokens listed in STOP_WORDS are discarded.
 *
 * @param query Raw search query string.
 * @returns Lowercase keywords, deduplicated, in order of first appearance.
 */
function extractKeywords(query) {
    const tokens = query
        .toLowerCase()
        // Keep letters, combining marks and digits from ANY script. An ASCII-only
        // class would shred accented or non-Latin words into junk fragments
        // (e.g. "crème" -> "cr", "me") that then match unrelated results.
        .replace(/[^\p{L}\p{M}\p{N}\s]/gu, ' ')
        .split(/\s+/)
        .filter(w => w.length >= 2 && !STOP_WORDS.has(w));
    // A Set preserves insertion order, so this dedupes while keeping first occurrences.
    return [...new Set(tokens)];
}
|
|
228
|
+
/**
 * Score one search result against the extracted query keywords.
 *
 * For each of title, URL and snippet, the fraction of keywords that occur as a
 * case-insensitive substring is computed; the three fractions are then combined
 * with weights 0.5 (title), 0.3 (URL) and 0.2 (snippet). Missing or empty
 * fields contribute zero.
 *
 * @param result Search result with optional `title`, `url` and `snippet` strings.
 * @param keywords Lowercase keywords to look for.
 * @returns The weighted score; 1 when `keywords` is empty (nothing to match against).
 */
function scoreResult(result, keywords) {
    const total = keywords.length;
    if (total === 0) {
        return 1;
    }
    // Fraction of keywords found (as substrings) in a single text field.
    const hitRatio = (text) => {
        const haystack = (text || '').toLowerCase();
        const hits = keywords.filter((kw) => haystack.includes(kw)).length;
        return hits / total;
    };
    return hitRatio(result.title) * 0.5 + hitRatio(result.url) * 0.3 + hitRatio(result.snippet) * 0.2;
}
|
|
254
|
+
/**
 * Drop off-topic search results and rank the rest by relevance to `query`.
 *
 * Keywords are extracted from the query and every result is scored against
 * them. Results scoring zero are removed; the remainder are ordered by score
 * (highest first), with ties keeping their original relative order. Each
 * returned item is a shallow copy of the input result with `relevanceScore`
 * (capped at 1) attached.
 *
 * When the query yields no keywords, the input array itself is returned
 * untouched and no `relevanceScore` is attached.
 *
 * @param results Raw search results.
 * @param query Original search query.
 * @returns Filtered, ranked results.
 */
export function filterRelevantResults(results, query) {
    const keywords = extractKeywords(query);
    if (keywords.length === 0) {
        return results; // nothing to filter on
    }
    // Score every result, keeping only those with some keyword overlap.
    const ranked = [];
    results.forEach((result, idx) => {
        const score = scoreResult(result, keywords);
        if (score > 0) {
            ranked.push({ result, score, idx });
        }
    });
    // Highest score first; equal scores fall back to original position.
    ranked.sort((a, b) => {
        if (a.score === b.score) {
            return a.idx - b.idx;
        }
        return b.score - a.score;
    });
    return ranked.map(({ result, score }) => ({
        ...result,
        relevanceScore: Math.min(1, score),
    }));
}
|
|
198
283
|
/**
|
|
199
284
|
* StealthSearchProvider — self-hosted multi-engine search
|
|
200
285
|
*
|
|
@@ -470,7 +555,10 @@ export class StealthSearchProvider {
|
|
|
470
555
|
if (deduped.length >= count)
|
|
471
556
|
break;
|
|
472
557
|
}
|
|
473
|
-
|
|
558
|
+
// Relevance filtering: remove completely off-topic results, score the rest
|
|
559
|
+
const filtered = filterRelevantResults(deduped, query);
|
|
560
|
+
// Respect the original count limit after filtering
|
|
561
|
+
return filtered.slice(0, count);
|
|
474
562
|
}
|
|
475
563
|
}
|
|
476
564
|
export class DuckDuckGoProvider {
|
|
@@ -688,7 +776,9 @@ export class DuckDuckGoProvider {
|
|
|
688
776
|
providerStats.record('ddg-http', true);
|
|
689
777
|
log.debug(`source=ddg-http returned ${results.length} results` +
|
|
690
778
|
(ddgTimeoutMs < 8_000 ? ` (fast-timeout ${ddgTimeoutMs}ms)` : ''));
|
|
691
|
-
|
|
779
|
+
// Apply relevance filtering before returning
|
|
780
|
+
const filtered = filterRelevantResults(results, query);
|
|
781
|
+
return filtered.length > 0 ? filtered : results; // fallback to unfiltered if all removed
|
|
692
782
|
}
|
|
693
783
|
ddgSucceeded = true; // connected OK, just 0 results
|
|
694
784
|
}
|
|
@@ -721,7 +811,9 @@ export class DuckDuckGoProvider {
|
|
|
721
811
|
providerStats.record('ddg-lite', true);
|
|
722
812
|
log.debug(`source=ddg-lite returned ${liteResults.length} results` +
|
|
723
813
|
(liteTimeoutMs < 8_000 ? ` (fast-timeout ${liteTimeoutMs}ms)` : ''));
|
|
724
|
-
|
|
814
|
+
// Apply relevance filtering before returning
|
|
815
|
+
const filteredLite = filterRelevantResults(liteResults, query);
|
|
816
|
+
return filteredLite.length > 0 ? filteredLite : liteResults;
|
|
725
817
|
}
|
|
726
818
|
providerStats.record('ddg-lite', false);
|
|
727
819
|
log.debug('DDG Lite also returned 0 results');
|
|
@@ -756,6 +848,7 @@ export class DuckDuckGoProvider {
|
|
|
756
848
|
log.debug('Trying stealth browser search (DDG + Bing + Ecosia)...');
|
|
757
849
|
try {
|
|
758
850
|
const stealthProvider = new StealthSearchProvider();
|
|
851
|
+
// StealthSearchProvider already applies filterRelevantResults internally.
|
|
759
852
|
const stealthResults = await stealthProvider.searchWeb(query, options);
|
|
760
853
|
if (stealthResults.length > 0) {
|
|
761
854
|
log.debug(`source=stealth returned ${stealthResults.length} results`);
|
|
@@ -768,6 +861,13 @@ export class DuckDuckGoProvider {
|
|
|
768
861
|
}
|
|
769
862
|
return [];
|
|
770
863
|
}
|
|
864
|
+
/**
|
|
865
|
+
* Exposed for testing: score and filter a pre-fetched result list against a query.
|
|
866
|
+
* Equivalent to calling filterRelevantResults() directly.
|
|
867
|
+
*/
|
|
868
|
+
filterResults(results, query) {
|
|
869
|
+
return filterRelevantResults(results, query);
|
|
870
|
+
}
|
|
771
871
|
}
|
|
772
872
|
export class BraveSearchProvider {
|
|
773
873
|
id = 'brave';
|
|
@@ -296,7 +296,13 @@ function heuristicExtractNumber(fieldName, content) {
|
|
|
296
296
|
}
|
|
297
297
|
// Year
|
|
298
298
|
if (/year/.test(lf)) {
|
|
299
|
-
|
|
299
|
+
// Match 4-digit years (1900-2099), prefer explicit "Year: YYYY" pattern first
|
|
300
|
+
const explicit = content.match(/\bYear[:\s]+(\d{4})\b/i);
|
|
301
|
+
if (explicit?.[1]) {
|
|
302
|
+
const n = parseInt(explicit[1]);
|
|
303
|
+
return isNaN(n) ? null : n;
|
|
304
|
+
}
|
|
305
|
+
const m = content.match(/\b((?:19|20)\d{2})\b/);
|
|
300
306
|
if (m?.[1]) {
|
|
301
307
|
const n = parseInt(m[1]);
|
|
302
308
|
return isNaN(n) ? null : n;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.9",
|
|
3
|
+
"version": "0.21.11",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|