webpeel 0.21.9 → 0.21.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -409,6 +409,28 @@ export async function runFetch(url, options) {
409
409
  }
410
410
  // Suppress spinner when --progress is active (progress lines replace it)
411
411
  const spinner = (options.silent || options.progress) ? null : ora('Fetching...').start();
412
+ // Auto progress: after 3 s, update spinner text with elapsed time + method hints
413
+ // Updated every 2 s so the user knows we're still working.
414
+ const autoProgressStart = Date.now();
415
+ const autoProgressSteps = [
416
+ { afterMs: 3000, text: '⏳ Fetching... (slow response)' },
417
+ { afterMs: 6000, text: '⏳ Fetching with browser... ({s}s)' },
418
+ { afterMs: 12000, text: '⏳ Fetching with browser... ({s}s — stealth may be needed)' },
419
+ { afterMs: 20000, text: '⏳ Fetching with stealth browser + proxy... ({s}s)' },
420
+ ];
421
+ let autoProgressStepIdx = 0;
422
+ const autoProgressInterval = spinner ? setInterval(() => {
423
+ const elapsed = Date.now() - autoProgressStart;
424
+ const secs = Math.round(elapsed / 1000);
425
+ while (autoProgressStepIdx < autoProgressSteps.length &&
426
+ elapsed >= autoProgressSteps[autoProgressStepIdx].afterMs) {
427
+ autoProgressStepIdx++;
428
+ }
429
+ if (autoProgressStepIdx > 0 && spinner) {
430
+ const tmpl = autoProgressSteps[autoProgressStepIdx - 1].text;
431
+ spinner.text = tmpl.replace('{s}', String(secs));
432
+ }
433
+ }, 2000) : null;
412
434
  try {
413
435
  // Validate options
414
436
  if (options.wait && (options.wait < 0 || options.wait > 60000)) {
@@ -661,11 +683,13 @@ export async function runFetch(url, options) {
661
683
  if (resolvedProfileName) {
662
684
  touchProfile(resolvedProfileName);
663
685
  }
664
- // Stop progress interval and show final result
686
+ // Stop progress intervals and show final result
665
687
  if (progressInterval) {
666
688
  clearInterval(progressInterval);
667
689
  progressInterval = undefined;
668
690
  }
691
+ if (autoProgressInterval)
692
+ clearInterval(autoProgressInterval);
669
693
  if (options.progress) {
670
694
  const method = result.method || 'simple';
671
695
  const elapsedSec = ((result.elapsed || (Date.now() - progressStart)) / 1000).toFixed(1);
@@ -1067,6 +1091,8 @@ export async function runFetch(url, options) {
1067
1091
  process.exit(0);
1068
1092
  }
1069
1093
  catch (error) {
1094
+ if (autoProgressInterval)
1095
+ clearInterval(autoProgressInterval);
1070
1096
  if (spinner) {
1071
1097
  spinner.fail('Failed to fetch');
1072
1098
  }
@@ -308,6 +308,7 @@ async function twitterExtractor(html, url) {
308
308
  if (fxData && fxData.code === 200 && fxData.user) {
309
309
  const u = fxData.user;
310
310
  const structured = {
311
+ title: `${u.name || ''} (@${u.screen_name || ''}) on X/Twitter`,
311
312
  name: u.name || '',
312
313
  handle: '@' + (u.screen_name || ''),
313
314
  bio: u.description || '',
@@ -1500,6 +1501,7 @@ async function npmExtractor(_html, url) {
1500
1501
  }
1501
1502
  catch { /* optional */ }
1502
1503
  const structured = {
1504
+ title: `${data.name}@${latest || 'unknown'}`,
1503
1505
  name: data.name,
1504
1506
  description: data.description || '',
1505
1507
  version: latest || 'unknown',
@@ -17,6 +17,8 @@ export interface WebSearchResult {
17
17
  title: string;
18
18
  url: string;
19
19
  snippet: string;
20
+ /** Relevance score (0–1) based on keyword overlap with query. Added by filterRelevantResults. */
21
+ relevanceScore?: number;
20
22
  }
21
23
  export interface WebSearchOptions {
22
24
  /** Number of results (1-10) */
@@ -71,6 +73,18 @@ declare class ProviderStatsTracker {
71
73
  * (e.g. in tests) and to log diagnostics.
72
74
  */
73
75
  export declare const providerStats: ProviderStatsTracker;
76
+ /**
77
+ * Filter and rank results by relevance to the original query.
78
+ *
79
+ * 1. Extract meaningful keywords from the query (remove stop words).
80
+ * 2. Score each result by keyword overlap with title + URL + snippet.
81
+ * 3. Remove results with zero overlap (completely irrelevant).
82
+ * 4. Sort descending by score, keeping original index as tiebreaker.
83
+ * 5. Attach `relevanceScore` (0–1) to each surviving result.
84
+ *
85
+ * If the query yields no keywords after stop-word removal, the input results are returned as-is (no `relevanceScore` is attached).
86
+ */
87
+ export declare function filterRelevantResults(results: WebSearchResult[], query: string): WebSearchResult[];
74
88
  /**
75
89
  * StealthSearchProvider — self-hosted multi-engine search
76
90
  *
@@ -115,6 +129,11 @@ export declare class DuckDuckGoProvider implements SearchProvider {
115
129
  */
116
130
  private searchLite;
117
131
  searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
132
+ /**
133
+ * Exposed for testing: score and filter a pre-fetched result list against a query.
134
+ * Equivalent to calling filterRelevantResults() directly.
135
+ */
136
+ filterResults(results: WebSearchResult[], query: string): WebSearchResult[];
118
137
  }
119
138
  export declare class BraveSearchProvider implements SearchProvider {
120
139
  readonly id: SearchProviderId;
@@ -195,6 +195,91 @@ function normalizeUrlForDedupe(rawUrl) {
195
195
  .replace(/\/+$/g, '');
196
196
  }
197
197
  }
198
+ // ============================================================
199
+ // Result Relevance Filtering
200
+ // Lightweight keyword-overlap scoring — no external deps.
201
+ // Applied after fetching raw results to remove completely off-
202
+ // topic hits (e.g., a grammar article returned for "used cars").
203
+ // ============================================================
204
+ const STOP_WORDS = new Set([
205
+ 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'in', 'on', 'at', 'to', 'for',
206
+ 'of', 'with', 'how', 'what', 'where', 'when', 'why', 'best', 'top', 'most',
207
+ 'and', 'or', 'but', 'not', 'do', 'does', 'did', 'be', 'been', 'have', 'has',
208
+ 'buy', 'get', 'find', 'about', 'from', 'by', 'its', 'it', 'this', 'that',
209
+ ]);
210
+ /**
211
+ * Extract meaningful keywords from a search query by stripping stop words and
212
+ * short tokens. Returns lowercase tokens, deduped.
213
+ */
214
+ function extractKeywords(query) {
215
+ const seen = new Set();
216
+ return query
217
+ .toLowerCase()
218
+ .replace(/[^a-z0-9\s]/g, ' ')
219
+ .split(/\s+/)
220
+ .filter(w => w.length >= 2 && !STOP_WORDS.has(w))
221
+ .filter(w => {
222
+ if (seen.has(w))
223
+ return false;
224
+ seen.add(w);
225
+ return true;
226
+ });
227
+ }
228
+ /**
229
+ * Compute a [0, 1] relevance score for a single result against extracted keywords.
230
+ * Weights: title 0.5, URL 0.3, snippet 0.2.
231
+ */
232
+ function scoreResult(result, keywords) {
233
+ if (keywords.length === 0)
234
+ return 1;
235
+ const titleLower = (result.title || '').toLowerCase();
236
+ const urlLower = (result.url || '').toLowerCase();
237
+ const snippetLower = (result.snippet || '').toLowerCase();
238
+ let titleHits = 0;
239
+ let urlHits = 0;
240
+ let snippetHits = 0;
241
+ for (const kw of keywords) {
242
+ if (titleLower.includes(kw))
243
+ titleHits++;
244
+ if (urlLower.includes(kw))
245
+ urlHits++;
246
+ if (snippetLower.includes(kw))
247
+ snippetHits++;
248
+ }
249
+ const titleScore = titleHits / keywords.length;
250
+ const urlScore = urlHits / keywords.length;
251
+ const snippetScore = snippetHits / keywords.length;
252
+ return titleScore * 0.5 + urlScore * 0.3 + snippetScore * 0.2;
253
+ }
254
+ /**
255
+ * Filter and rank results by relevance to the original query.
256
+ *
257
+ * 1. Extract meaningful keywords from the query (remove stop words).
258
+ * 2. Score each result by keyword overlap with title + URL + snippet.
259
+ * 3. Remove results with zero overlap (completely irrelevant).
260
+ * 4. Sort descending by score, keeping original index as tiebreaker.
261
+ * 5. Attach `relevanceScore` (0–1) to each surviving result.
262
+ *
263
+ * Results without any scores (query produced no keywords) are returned as-is.
264
+ */
265
+ export function filterRelevantResults(results, query) {
266
+ const keywords = extractKeywords(query);
267
+ if (keywords.length === 0)
268
+ return results; // no keywords to filter on
269
+ const scored = results.map((r, idx) => ({
270
+ result: r,
271
+ score: scoreResult(r, keywords),
272
+ idx,
273
+ }));
274
+ // Drop results with zero overlap
275
+ const relevant = scored.filter(s => s.score > 0);
276
+ // Sort by score descending, original order as tiebreaker
277
+ relevant.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.idx - b.idx));
278
+ return relevant.map(s => ({
279
+ ...s.result,
280
+ relevanceScore: Math.min(1, s.score),
281
+ }));
282
+ }
198
283
  /**
199
284
  * StealthSearchProvider — self-hosted multi-engine search
200
285
  *
@@ -470,7 +555,10 @@ export class StealthSearchProvider {
470
555
  if (deduped.length >= count)
471
556
  break;
472
557
  }
473
- return deduped;
558
+ // Relevance filtering: remove completely off-topic results, score the rest
559
+ const filtered = filterRelevantResults(deduped, query);
560
+ // Respect the original count limit after filtering
561
+ return filtered.slice(0, count);
474
562
  }
475
563
  }
476
564
  export class DuckDuckGoProvider {
@@ -688,7 +776,9 @@ export class DuckDuckGoProvider {
688
776
  providerStats.record('ddg-http', true);
689
777
  log.debug(`source=ddg-http returned ${results.length} results` +
690
778
  (ddgTimeoutMs < 8_000 ? ` (fast-timeout ${ddgTimeoutMs}ms)` : ''));
691
- return results;
779
+ // Apply relevance filtering before returning
780
+ const filtered = filterRelevantResults(results, query);
781
+ return filtered.length > 0 ? filtered : results; // fallback to unfiltered if all removed
692
782
  }
693
783
  ddgSucceeded = true; // connected OK, just 0 results
694
784
  }
@@ -721,7 +811,9 @@ export class DuckDuckGoProvider {
721
811
  providerStats.record('ddg-lite', true);
722
812
  log.debug(`source=ddg-lite returned ${liteResults.length} results` +
723
813
  (liteTimeoutMs < 8_000 ? ` (fast-timeout ${liteTimeoutMs}ms)` : ''));
724
- return liteResults;
814
+ // Apply relevance filtering before returning
815
+ const filteredLite = filterRelevantResults(liteResults, query);
816
+ return filteredLite.length > 0 ? filteredLite : liteResults;
725
817
  }
726
818
  providerStats.record('ddg-lite', false);
727
819
  log.debug('DDG Lite also returned 0 results');
@@ -756,6 +848,7 @@ export class DuckDuckGoProvider {
756
848
  log.debug('Trying stealth browser search (DDG + Bing + Ecosia)...');
757
849
  try {
758
850
  const stealthProvider = new StealthSearchProvider();
851
+ // StealthSearchProvider already applies filterRelevantResults internally.
759
852
  const stealthResults = await stealthProvider.searchWeb(query, options);
760
853
  if (stealthResults.length > 0) {
761
854
  log.debug(`source=stealth returned ${stealthResults.length} results`);
@@ -768,6 +861,13 @@ export class DuckDuckGoProvider {
768
861
  }
769
862
  return [];
770
863
  }
864
+ /**
865
+ * Exposed for testing: score and filter a pre-fetched result list against a query.
866
+ * Equivalent to calling filterRelevantResults() directly.
+ *
+ * @param results Pre-fetched search results to score and filter.
+ * @param query Search query the results are scored against.
+ * @returns Whatever filterRelevantResults(results, query) returns.
867
+ */
868
+ filterResults(results, query) {
869
+ return filterRelevantResults(results, query);
870
+ }
771
871
  }
772
872
  export class BraveSearchProvider {
773
873
  id = 'brave';
@@ -296,7 +296,13 @@ function heuristicExtractNumber(fieldName, content) {
296
296
  }
297
297
  // Year
298
298
  if (/year/.test(lf)) {
299
- const m = content.match(/\b(20\d{2})\b/);
299
+ // Prefer an explicit "Year: YYYY" label (any four digits); otherwise fall back to the first standalone year in 1900-2099
300
+ const explicit = content.match(/\bYear[:\s]+(\d{4})\b/i);
301
+ if (explicit?.[1]) {
302
+ const n = parseInt(explicit[1]);
303
+ return isNaN(n) ? null : n;
304
+ }
305
+ const m = content.match(/\b((?:19|20)\d{2})\b/);
300
306
  if (m?.[1]) {
301
307
  const n = parseInt(m[1]);
302
308
  return isNaN(n) ? null : n;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.9",
3
+ "version": "0.21.11",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",