crawlforge-mcp-server 4.2.12 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CLAUDE.md +19 -7
  2. package/README.md +11 -3
  3. package/package.json +3 -2
  4. package/server.js +195 -22
  5. package/src/cli/commands/init.js +107 -0
  6. package/src/cli/index.js +2 -0
  7. package/src/constants/config.js +5 -0
  8. package/src/core/ActionExecutor.js +13 -1
  9. package/src/core/AgentOrchestrator.js +300 -0
  10. package/src/core/AuthManager.js +21 -1
  11. package/src/core/ChangeTracker.js +8 -5
  12. package/src/core/LLMsTxtAnalyzer.js +71 -47
  13. package/src/core/LocalizationManager.js +7 -4
  14. package/src/core/ResearchOrchestrator.js +10 -6
  15. package/src/core/StealthBrowserManager.js +52 -13
  16. package/src/core/analysis/ContentAnalyzer.js +2 -2
  17. package/src/core/crawlers/BFSCrawler.js +23 -12
  18. package/src/core/processing/ContentProcessor.js +19 -3
  19. package/src/core/processing/PDFProcessor.js +72 -23
  20. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  21. package/src/tools/advanced/batchScrape/index.js +3 -1
  22. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  23. package/src/tools/advanced/batchScrape/worker.js +6 -1
  24. package/src/tools/agent/agent.js +71 -0
  25. package/src/tools/basic/_fetch.js +78 -5
  26. package/src/tools/basic/extractLinks.js +1 -1
  27. package/src/tools/basic/extractMetadata.js +65 -1
  28. package/src/tools/basic/extractText.js +73 -5
  29. package/src/tools/basic/scrapeStructured.js +48 -10
  30. package/src/tools/crawl/crawlDeep.js +13 -5
  31. package/src/tools/crawl/mapSite.js +53 -52
  32. package/src/tools/extract/analyzeContent.js +11 -6
  33. package/src/tools/extract/extractContent.js +23 -5
  34. package/src/tools/extract/extractStructured.js +65 -16
  35. package/src/tools/extract/extractWithLlm.js +192 -11
  36. package/src/tools/extract/listOllamaModels.js +19 -8
  37. package/src/tools/extract/processDocument.js +10 -4
  38. package/src/tools/extract/summarizeContent.js +58 -1
  39. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  40. package/src/tools/research/deepResearch.js +43 -4
  41. package/src/tools/scrape/unifiedScrape.js +314 -0
  42. package/src/tools/search/providers/searxng.js +2 -2
  43. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  44. package/src/tools/search/ranking/ResultRanker.js +13 -4
  45. package/src/tools/search/searchWeb.js +5 -5
  46. package/src/tools/templates/TemplateRegistry.js +3 -2
  47. package/src/tools/tracking/trackChanges/differ.js +33 -1
  48. package/src/utils/htmlToMarkdown.js +5 -1
@@ -13,7 +13,8 @@ const GenerateLLMsTxtSchema = z.object({
13
13
  maxPages: z.number().min(10).max(500).optional().default(100).describe('Maximum pages to analyze'),
14
14
  detectAPIs: z.boolean().optional().default(true).describe('Whether to detect API endpoints'),
15
15
  analyzeContent: z.boolean().optional().default(true).describe('Whether to analyze content types'),
16
- checkSecurity: z.boolean().optional().default(true).describe('Whether to check security boundaries'),
16
+ checkSecurity: z.boolean().optional().default(false).describe('Whether to probe security-sensitive paths (opt-in; sends requests to /admin, /login, etc.)'),
17
+ probeRateLimit: z.boolean().optional().default(false).describe('Whether to send repeated probe requests to estimate rate limits (opt-in; fires ~5 requests)'),
17
18
  respectRobots: z.boolean().optional().default(true).describe('Whether to respect robots.txt')
18
19
  }).optional().default({}),
19
20
 
@@ -23,7 +24,8 @@ const GenerateLLMsTxtSchema = z.object({
23
24
  contactEmail: z.string().email().optional().describe('Contact email for the LLMs.txt'),
24
25
  organizationName: z.string().optional().describe('Organization name'),
25
26
  customGuidelines: z.array(z.string()).optional().describe('Additional custom guidelines'),
26
- customRestrictions: z.array(z.string()).optional().describe('Additional restrictions')
27
+ customRestrictions: z.array(z.string()).optional().describe('Additional restrictions'),
28
+ robotsStyle: z.boolean().optional().default(false).describe('Emit legacy robots.txt-style directives instead of spec-compliant llmstxt.org markdown')
27
29
  }).optional().default({}),
28
30
 
29
31
  complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe('Compliance level for generated guidelines'),
@@ -120,9 +122,128 @@ export class GenerateLLMsTxtTool {
120
122
  }
121
123
 
122
124
  /**
123
- * Generate standard LLMs.txt content
125
+ * Generate LLMs.txt content. By default emits spec-compliant llmstxt.org markdown;
126
+ * set outputOptions.robotsStyle=true for the legacy robots.txt-style directives.
124
127
  */
125
128
  generateLLMsTxt(analysis, outputOptions, complianceLevel) {
129
+ if (outputOptions.robotsStyle) {
130
+ return this.generateRobotsStyleTxt(analysis, outputOptions, complianceLevel);
131
+ }
132
+ return this.generateSpecLLMsTxt(analysis, outputOptions);
133
+ }
134
+
135
+ /**
136
+ * Generate spec-compliant llms.txt per https://llmstxt.org/ :
137
+ * # Title
138
+ * > one-line summary (blockquote)
139
+ * optional detail paragraph(s)
140
+ * ## Section
141
+ * - [name](url): optional notes
142
+ */
143
+ generateSpecLLMsTxt(analysis, outputOptions) {
144
+ const lines = [];
145
+ const baseUrl = analysis.metadata.baseUrl;
146
+ let host = baseUrl;
147
+ try { host = new URL(baseUrl).hostname; } catch { /* keep baseUrl */ }
148
+
149
+ // H1 title (required)
150
+ const title = outputOptions.organizationName || host;
151
+ lines.push(`# ${title}`);
152
+ lines.push('');
153
+
154
+ // Blockquote summary (required by spec)
155
+ const summary = `Site map and key resources for ${baseUrl}, generated to help LLMs locate relevant content.`;
156
+ lines.push(`> ${summary}`);
157
+ lines.push('');
158
+
159
+ // Optional detail paragraph(s)
160
+ const details = [];
161
+ if (analysis.structure?.totalPages) {
162
+ details.push(`This site has approximately ${analysis.structure.totalPages} discoverable pages.`);
163
+ }
164
+ if (outputOptions.contactEmail) {
165
+ details.push(`Contact: ${outputOptions.contactEmail}.`);
166
+ }
167
+ if (Array.isArray(outputOptions.customGuidelines) && outputOptions.customGuidelines.length > 0) {
168
+ details.push(...outputOptions.customGuidelines);
169
+ }
170
+ if (details.length > 0) {
171
+ lines.push(details.join(' '));
172
+ lines.push('');
173
+ }
174
+
175
+ // Helper: emit a "## Section" with a list of [name](url) links.
176
+ const linkLabel = (u) => {
177
+ try {
178
+ const p = new URL(u).pathname.replace(/\/+$/, '');
179
+ if (!p || p === '') return 'Home';
180
+ const seg = p.split('/').filter(Boolean).pop() || p;
181
+ return seg.replace(/[-_]/g, ' ').replace(/\.[a-z0-9]+$/i, '').trim() || p;
182
+ } catch {
183
+ return u;
184
+ }
185
+ };
186
+ const emitSection = (heading, urls) => {
187
+ // Coerce to an array: sitemap/sections may arrive as a flat array, a
188
+ // grouped object ({path: [...]}), or a single value.
189
+ let arr = [];
190
+ if (Array.isArray(urls)) {
191
+ arr = urls;
192
+ } else if (urls && typeof urls === 'object') {
193
+ arr = Object.values(urls).flat();
194
+ }
195
+ const list = arr
196
+ .map((u) => (typeof u === 'string' ? u : (u?.url || u?.loc)))
197
+ .filter(Boolean)
198
+ .slice(0, 25);
199
+ if (list.length === 0) return;
200
+ lines.push(`## ${heading}`);
201
+ lines.push('');
202
+ for (const u of list) {
203
+ lines.push(`- [${linkLabel(u)}](${u})`);
204
+ }
205
+ lines.push('');
206
+ };
207
+
208
+ // Sections derived from the categorized site structure.
209
+ const sections = analysis.structure?.sections || {};
210
+ const flatten = (cat) => {
211
+ const v = sections[cat];
212
+ if (!Array.isArray(v)) return [];
213
+ // categorizeSections may produce either flat URLs or {path, urls:[...]} groups.
214
+ return v.flatMap((entry) =>
215
+ typeof entry === 'string' ? [entry] : (Array.isArray(entry?.urls) ? entry.urls : []));
216
+ };
217
+
218
+ emitSection('Documentation', flatten('documentation'));
219
+ emitSection('Content', flatten('content'));
220
+ emitSection('Tools', flatten('tools'));
221
+ emitSection('Navigation', flatten('navigation'));
222
+
223
+ // APIs as their own section.
224
+ if (Array.isArray(analysis.apis) && analysis.apis.length > 0) {
225
+ lines.push('## APIs');
226
+ lines.push('');
227
+ for (const api of analysis.apis.slice(0, 25)) {
228
+ const note = api.type ? `: ${api.type}` : '';
229
+ lines.push(`- [${linkLabel(api.url)}](${api.url})${note}`);
230
+ }
231
+ lines.push('');
232
+ }
233
+
234
+ // Fallback: if no categorized sections produced output, list the raw sitemap.
235
+ const hasBody = lines.some((l) => l.startsWith('## '));
236
+ if (!hasBody) {
237
+ emitSection('Pages', analysis.structure?.sitemap || []);
238
+ }
239
+
240
+ return lines.join('\n').replace(/\n{3,}/g, '\n\n').trimEnd() + '\n';
241
+ }
242
+
243
+ /**
244
+ * Generate legacy robots.txt-style content (opt-in via outputOptions.robotsStyle).
245
+ */
246
+ generateRobotsStyleTxt(analysis, outputOptions, complianceLevel) {
126
247
  const lines = [];
127
248
  const baseUrl = analysis.metadata.baseUrl;
128
249
 
@@ -379,18 +379,57 @@ export class DeepResearchTool {
379
379
  * Format research results according to output preferences
380
380
  */
381
381
  formatResults(results, params) {
382
- // Raw evidence mode (no LLM configured): pass through the clean shape
383
- // designed for the calling LLM to synthesize.
382
+ // Raw evidence mode (no LLM configured): apply lightweight formatting so
383
+ // outputFormat is not silently ignored, and rank sources by credibility.
384
384
  if (results.synthesisMode === 'raw_evidence') {
385
- return {
385
+ const rankedSources = (results.sources || [])
386
+ .slice()
387
+ .sort((a, b) => (b.credibility || 0) - (a.credibility || 0));
388
+
389
+ const base = {
386
390
  synthesisMode: 'raw_evidence',
387
391
  note: results.note,
388
- sources: results.sources,
389
392
  researchSummary: results.researchSummary,
390
393
  metadata: results.metadata,
391
394
  performance: results.performance,
392
395
  activityLog: params.includeActivityLog ? results.activityLog : undefined
393
396
  };
397
+
398
+ switch (params.outputFormat) {
399
+ case 'summary':
400
+ return {
401
+ ...base,
402
+ sources: rankedSources.slice(0, 5)
403
+ };
404
+
405
+ case 'citations_only':
406
+ return {
407
+ ...base,
408
+ sources: rankedSources.map(s => ({
409
+ title: s.title,
410
+ url: s.url,
411
+ credibility: s.credibility
412
+ })),
413
+ citationCount: rankedSources.length,
414
+ citationSummary: this.generateCitationSummary(rankedSources)
415
+ };
416
+
417
+ case 'conflicts_focus':
418
+ // Without LLM there is no conflict detection; return ranked sources
419
+ // with a note so the caller knows what happened.
420
+ return {
421
+ ...base,
422
+ sources: rankedSources,
423
+ conflictsNote: 'Conflict detection requires an LLM (OPENAI_API_KEY or ANTHROPIC_API_KEY). Sources are ranked by credibility for manual review.'
424
+ };
425
+
426
+ case 'comprehensive':
427
+ default:
428
+ return {
429
+ ...base,
430
+ sources: rankedSources
431
+ };
432
+ }
394
433
  }
395
434
 
396
435
  const formatted = {
@@ -0,0 +1,314 @@
1
+ /**
2
+ * unifiedScrape — single-fetch, multi-format scraping tool.
3
+ *
4
+ * One call, one fetch. formats[] drives what is returned.
5
+ * Mirrors the output shape of ScrapeWithActionsTool.generateFormats():
6
+ * content.html, content.rawHtml, content.text, content.markdown,
7
+ * content.links, content.metadata, content.screenshots, content.json
8
+ *
9
+ * onlyMainContent maps to Readability boilerplate removal (same as extractContent).
10
+ * Partial success: per-format warnings[] never fail the whole call.
11
+ */
12
+
13
+ import { z } from 'zod';
14
+ import { JSDOM } from 'jsdom';
15
+ import { Readability } from '@mozilla/readability';
16
+ import { fetchAndParse } from '../extract/_fetchAndParse.js';
17
+ import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
18
+ import { extractBlockText, readabilityToMarkdown } from '../basic/extractText.js';
19
+
20
+ // ── Schema ────────────────────────────────────────────────────────────────────
21
+
22
/** Object form of a requested format: `{ type: 'json' }` plus an optional schema and/or prompt. */
const JsonFormatSchema = z.object({
  type: z.literal('json'),
  schema: z.record(z.any()).optional(),
  prompt: z.string().optional()
});

/** A single requested format: one of the plain string names, or the `json` object form. */
const FormatSchema = z
  .enum(['markdown', 'html', 'rawHtml', 'text', 'links', 'metadata', 'screenshot'])
  .or(JsonFormatSchema);

/**
 * Input accepted by the unified scrape tool.
 *  - url: page to fetch (must be a valid URL)
 *  - formats: at least one format; defaults to ['markdown']
 *  - onlyMainContent: defaults to true
 *  - timeoutMs: 1000..60000, defaults to 15000
 */
export const UnifiedScrapeSchema = z.object({
  url: z.string().url(),
  formats: z.array(FormatSchema).min(1).default(['markdown']),
  onlyMainContent: z.boolean().optional().default(true),
  // Pass-through to fetchAndParse
  timeoutMs: z.number().min(1000).max(60000).optional().default(15000)
});
40
+
41
+ // ── Helpers ───────────────────────────────────────────────────────────────────
42
+
43
/**
 * Collect the unique hyperlinks of a page from its `<a href>` elements.
 *
 * Fragment-only (`#...`) and `javascript:` hrefs are skipped, as are hrefs that
 * cannot be parsed as a URL. Hrefs that literally start with `http://` or
 * `https://` are reported unchanged; everything else is resolved against
 * `pageUrl`. A link is external when the origin of the resolved URL differs
 * from the page origin, so protocol-relative links (`//cdn.example.com/x`) and
 * non-web schemes such as `mailto:` are classified by where they actually
 * point instead of always being counted as internal.
 *
 * @param {Function} $ - cheerio-style selector function for the loaded page.
 * @param {string} pageUrl - URL of the page; base for relative hrefs and for
 *   the internal/external decision.
 * @returns {{links: Array<{href: string, text: string, is_external: boolean, original_href: string}>,
 *   total_count: number, internal_count: number, external_count: number}}
 */
function extractLinksFromDom($, pageUrl) {
  const links = [];
  const seen = new Set();
  let pageOrigin = '';
  try { pageOrigin = new URL(pageUrl).origin; } catch { /* ignore */ }

  $('a[href]').each((_, el) => {
    const href = $(el).attr('href');
    if (!href) return;
    const text = $(el).text().trim();

    // Scheme checks ignore surrounding whitespace and letter case, matching how
    // URL parsing itself treats them.
    const probe = href.trim().toLowerCase();
    if (probe.startsWith('#') || probe.startsWith('javascript:')) return;

    try {
      let absoluteUrl;
      let isExternal;
      if (href.startsWith('http://') || href.startsWith('https://')) {
        // Already absolute: keep the author's spelling, only validate and classify.
        absoluteUrl = href;
        isExternal = new URL(href).origin !== pageOrigin;
      } else {
        const resolved = new URL(href, pageUrl);
        absoluteUrl = resolved.toString();
        isExternal = resolved.origin !== pageOrigin;
      }
      if (!seen.has(absoluteUrl)) {
        seen.add(absoluteUrl);
        links.push({ href: absoluteUrl, text, is_external: isExternal, original_href: href });
      }
    } catch { /* skip hrefs that are not valid URLs */ }
  });

  const externalCount = links.filter((link) => link.is_external).length;
  return {
    links,
    total_count: links.length,
    internal_count: links.length - externalCount,
    external_count: externalCount
  };
}
82
+
83
/**
 * Extract page metadata from a loaded cheerio-style `$`.
 *
 * Gathers JSON-LD blocks (unparseable ones are skipped), microdata items,
 * Open Graph and Twitter tags, and a handful of standard `<meta>`/`<link>`
 * values. Property and tag names come from the page, so they are accumulated
 * in Maps and only converted to plain objects at the end: a name such as
 * `constructor` or `__proto__` then becomes an ordinary own key instead of
 * colliding with members inherited from Object.prototype (which previously
 * made the whole extraction throw or silently dropped the value).
 *
 * @param {Function} $ - cheerio-style selector function for the loaded page.
 * @param {string} pageUrl - URL reported back in the `url` field.
 * @returns {object} title, description, keywords, canonical_url, author,
 *   robots, viewport, og_tags, twitter_tags, json_ld, microdata, url.
 */
function extractMetadataFromDom($, pageUrl) {
  // JSON-LD
  const jsonLd = [];
  $('script[type="application/ld+json"]').each((_, el) => {
    try {
      const raw = $(el).html();
      if (raw) jsonLd.push(JSON.parse(raw));
    } catch { /* skip blocks that are not valid JSON */ }
  });

  // Microdata
  const microdata = [];
  $('[itemscope]').each((_, el) => {
    const $el = $(el);
    const properties = new Map();
    $el.find('[itemprop]').each((__, prop) => {
      const $prop = $(prop);
      const name = $prop.attr('itemprop');
      if (!name) return;

      // The value source depends on the element type.
      const tag = ($prop.get(0).tagName || '').toLowerCase();
      let value;
      if (tag === 'meta') value = $prop.attr('content');
      else if (tag === 'a' || tag === 'link') value = $prop.attr('href');
      else if (tag === 'img') value = $prop.attr('src');
      else if (tag === 'time') value = $prop.attr('datetime') || $prop.text().trim();
      else value = $prop.text().trim();
      if (!value) return;

      if (!properties.has(name)) properties.set(name, []);
      properties.get(name).push(value);
    });
    microdata.push({
      type: $el.attr('itemtype') || null,
      properties: Object.fromEntries(properties)
    });
  });

  const title =
    $('meta[property="og:title"]').attr('content') ||
    $('title').text().trim() ||
    $('h1').first().text().trim() || '';

  const ogTags = new Map();
  $('meta[property^="og:"]').each((_, el) => {
    const property = $(el).attr('property');
    const content = $(el).attr('content');
    if (property && content) ogTags.set(property.replace('og:', ''), content);
  });

  const twitterTags = new Map();
  $('meta[name^="twitter:"]').each((_, el) => {
    const name = $(el).attr('name');
    const content = $(el).attr('content');
    if (name && content) twitterTags.set(name.replace('twitter:', ''), content);
  });

  // Read a <meta name="..."> content attribute, '' when absent.
  const metaByName = (name) => $(`meta[name="${name}"]`).attr('content') || '';

  return {
    title,
    description: metaByName('description') || $('meta[property="og:description"]').attr('content') || '',
    keywords: metaByName('keywords').split(',').map((k) => k.trim()).filter(Boolean),
    canonical_url: $('link[rel="canonical"]').attr('href') || '',
    author: metaByName('author'),
    robots: metaByName('robots'),
    viewport: metaByName('viewport'),
    og_tags: Object.fromEntries(ogTags),
    twitter_tags: Object.fromEntries(twitterTags),
    json_ld: jsonLd,
    microdata,
    url: pageUrl
  };
}
151
+
152
+ // ── Tool class ────────────────────────────────────────────────────────────────
153
+
154
/**
 * Single-fetch, multi-format scraper.
 *
 * The page is fetched once; every entry of `formats` is then derived from that
 * one response. A failure while producing an individual format is recorded in
 * `warnings` and does not fail the call; only a failed fetch rejects.
 */
export class UnifiedScrapeTool {
  /**
   * @param {object} [options]
   * @param {object} [options.llmConfig] - Passed to ExtractWithLlm when a
   *   `json` format is requested.
   */
  constructor(options = {}) {
    this._extractWithLlm = null;
    this._extractWithLlmConfig = options.llmConfig || {};
  }

  /** Lazy-load ExtractWithLlm to avoid pulling in heavy deps unless needed. */
  async _getExtractWithLlm() {
    if (!this._extractWithLlm) {
      const { ExtractWithLlm } = await import('../extract/extractWithLlm.js');
      this._extractWithLlm = new ExtractWithLlm(this._extractWithLlmConfig);
    }
    return this._extractWithLlm;
  }

  /**
   * Execute a unified scrape.
   * @param {object} params - UnifiedScrapeSchema-compatible input
   * @returns {Promise<object>} `{ success, url, content, warnings }`; `warnings`
   *   is undefined when every requested format was produced cleanly.
   * @throws {Error} When validation fails or the page cannot be fetched. The
   *   fetch error is attached as `cause`.
   */
  async execute(params) {
    const validated = UnifiedScrapeSchema.parse(params);
    const { url, formats, onlyMainContent, timeoutMs } = validated;

    // Single fetch
    let html, $, finalUrl;
    try {
      ({ html, $, finalUrl } = await fetchAndParse(url, {
        timeoutMs,
        stripTags: [] // we handle boilerplate ourselves
      }));
    } catch (err) {
      // Keep the message callers already see, but preserve the original error.
      throw new Error(`scrape: fetch failed for ${url}: ${err.message}`, { cause: err });
    }

    // For onlyMainContent: extract main-content html via Readability once and
    // reuse it for every format that needs it.
    let mainHtml = null;
    const getMainHtml = () => {
      if (mainHtml !== null) return mainHtml;
      try {
        const dom = new JSDOM(html, { url: finalUrl });
        const article = new Readability(dom.window.document).parse();
        mainHtml = article ? article.content : html;
      } catch {
        // Readability could not process the page: fall back to the full document.
        mainHtml = html;
      }
      return mainHtml;
    };

    const content = {};
    const warnings = [];

    for (const fmt of formats) {
      // JSON format object
      if (fmt && typeof fmt === 'object' && fmt.type === 'json') {
        try {
          const extractWithLlm = await this._getExtractWithLlm();
          const text = onlyMainContent
            ? htmlToMarkdown(getMainHtml())
            : $('body').text().replace(/\s+/g, ' ').trim();
          const result = await extractWithLlm.execute({
            content: text,
            prompt: fmt.prompt || 'Extract structured data from this page content.',
            schema: fmt.schema,
            provider: 'auto'
          });
          content.json = result.success ? result.data : { error: result.error };
          if (!result.success) {
            warnings.push(`json: extraction failed — ${result.error}`);
          }
        } catch (err) {
          content.json = { error: err.message };
          warnings.push(`json: ${err.message}`);
        }
        continue;
      }

      // String formats
      switch (fmt) {
        case 'markdown':
          try {
            content.markdown = onlyMainContent
              ? readabilityToMarkdown(html, finalUrl)
              : htmlToMarkdown($.html('body') || html);
          } catch (err) {
            content.markdown = '';
            warnings.push(`markdown: ${err.message}`);
          }
          break;

        case 'html':
          try {
            content.html = onlyMainContent ? getMainHtml() : ($.html('body') || html);
          } catch (err) {
            content.html = '';
            warnings.push(`html: ${err.message}`);
          }
          break;

        case 'rawHtml':
          content.rawHtml = html;
          break;

        case 'text':
          try {
            // Always strip <script>/<style> from a private copy. Removing them
            // from the shared `$` would change what formats processed later in
            // this same call see (e.g. JSON-LD for 'metadata', or 'html'),
            // making the result depend on the order of `formats`.
            const { load } = await import('cheerio');
            const $text = load(onlyMainContent ? getMainHtml() : $.html());
            $text('script, style').remove();
            content.text = extractBlockText($text);
          } catch (err) {
            content.text = '';
            warnings.push(`text: ${err.message}`);
          }
          break;

        case 'links':
          try {
            content.links = extractLinksFromDom($, finalUrl);
          } catch (err) {
            content.links = { links: [], total_count: 0, internal_count: 0, external_count: 0 };
            warnings.push(`links: ${err.message}`);
          }
          break;

        case 'metadata':
          try {
            content.metadata = extractMetadataFromDom($, finalUrl);
          } catch (err) {
            content.metadata = {};
            warnings.push(`metadata: ${err.message}`);
          }
          break;

        case 'screenshot':
          // Screenshot requires a browser; not available in the basic scrape path.
          content.screenshots = [];
          warnings.push('screenshot: browser screenshots are not available in the scrape tool; use scrape_with_actions for screenshots');
          break;

        default:
          warnings.push(`unknown format: ${String(fmt)}`);
      }
    }

    return {
      success: true,
      url: finalUrl,
      content,
      warnings: warnings.length > 0 ? warnings : undefined
    };
  }
}
313
+
314
+ export default UnifiedScrapeTool;
@@ -117,8 +117,8 @@ export async function searchViaSearxng(opts = {}) {
117
117
  return {
118
118
  items,
119
119
  searchInformation: {
120
- totalResults: String(rawResults.length),
121
- searchTime: data.answers ? 0 : 0
120
+ totalResults: rawResults.length,
121
+ searchTime: 0
122
122
  },
123
123
  queries: {},
124
124
  context: {}
@@ -545,27 +545,50 @@ export class ResultDeduplicator {
545
545
  }
546
546
 
547
547
  /**
548
- * SimHash implementation for content similarity
548
+ * SimHash implementation for content similarity.
549
+ * Uses two independent 32-bit FNV-1a hashes (seeded differently) to produce
550
+ * independent high/low 32-bit words, giving a true 64-bit fingerprint so that
551
+ * bits 32-63 are not a duplicate of bits 0-31.
549
552
  */
550
553
  simHash(text, bits = 64) {
551
554
  const tokens = text.split(/\s+/);
552
555
  const hashBits = new Array(bits).fill(0);
553
-
556
+
554
557
  for (const token of tokens) {
555
- const hash = this.stringHash(token);
556
-
557
- for (let i = 0; i < bits; i++) {
558
- const bit = (hash >> i) & 1;
559
- hashBits[i] += bit ? 1 : -1;
558
+ const lo = this._fnv1a32(token, 0x811c9dc5);
559
+ const hi = this._fnv1a32(token, 0x84222325); // different seed
560
+
561
+ for (let i = 0; i < 32; i++) {
562
+ hashBits[i] += ((lo >>> i) & 1) ? 1 : -1;
563
+ }
564
+ for (let i = 0; i < 32; i++) {
565
+ hashBits[32 + i] += ((hi >>> i) & 1) ? 1 : -1;
560
566
  }
561
567
  }
562
-
568
+
563
569
  // Convert to binary string
564
570
  return hashBits.map(bit => bit > 0 ? '1' : '0').join('');
565
571
  }
566
572
 
567
573
  /**
568
- * String hash function
574
+ * FNV-1a 32-bit hash with a configurable offset basis (seed).
575
+ * @param {string} str
576
+ * @param {number} seed - 32-bit unsigned offset basis
577
+ * @returns {number} 32-bit unsigned integer
578
+ */
579
+ _fnv1a32(str, seed) {
580
+ const FNV_PRIME = 0x01000193;
581
+ let hash = seed >>> 0;
582
+ for (let i = 0; i < str.length; i++) {
583
+ hash ^= str.charCodeAt(i);
584
+ // Multiply by FNV prime using 32-bit arithmetic
585
+ hash = Math.imul(hash, FNV_PRIME) >>> 0;
586
+ }
587
+ return hash;
588
+ }
589
+
590
+ /**
591
+ * String hash function (kept for hashResults / cache-key use)
569
592
  */
570
593
  stringHash(str) {
571
594
  let hash = 0;
@@ -191,18 +191,27 @@ export class ResultRanker {
191
191
  // Calculate term frequencies
192
192
  const termFreqs = this.getTermFrequencies(contentTerms);
193
193
 
194
+ // Build per-term document frequency across all results
195
+ const docFreqs = {};
196
+ for (const r of allResults) {
197
+ const rContent = [r.title || '', r.snippet || '', r.htmlSnippet || ''].join(' ');
198
+ const rTerms = new Set(this.tokenize(rContent.toLowerCase()));
199
+ for (const t of rTerms) {
200
+ docFreqs[t] = (docFreqs[t] || 0) + 1;
201
+ }
202
+ }
203
+
194
204
  let score = 0;
195
205
  for (const term of queryTerms) {
196
206
  const tf = termFreqs[term] || 0;
197
207
  if (tf > 0) {
198
- // Document frequency (simplified - assume term appears in some docs)
199
- const df = Math.min(allResults.length * 0.1, 1); // Conservative estimate
208
+ const df = docFreqs[term] || 1;
200
209
  const idf = Math.log((allResults.length - df + 0.5) / (df + 0.5));
201
-
210
+
202
211
  // BM25 formula
203
212
  const numerator = tf * (k1 + 1);
204
213
  const denominator = tf + k1 * (1 - b + b * (contentLength / avgDocLength));
205
-
214
+
206
215
  score += idf * (numerator / denominator);
207
216
  }
208
217
  }
@@ -280,14 +280,14 @@ export class SearchWebTool {
280
280
  // Clean up results based on detail level requested
281
281
  if (!validated.include_ranking_details) {
282
282
  processedResults = processedResults.map(result => {
283
- const { rankingDetails, ...cleanResult } = result;
283
+ const { rankingDetails, finalScore, originalIndex, scores, ...cleanResult } = result;
284
284
  return cleanResult;
285
285
  });
286
286
  }
287
-
287
+
288
288
  if (!validated.include_deduplication_details) {
289
289
  processedResults = processedResults.map(result => {
290
- const { deduplicationInfo, ...cleanResult } = result;
290
+ const { deduplicationInfo, contentHash, normalizedUrl, titleTokens, ...cleanResult } = result;
291
291
  return cleanResult;
292
292
  });
293
293
  }
@@ -407,10 +407,10 @@ export class SearchWebTool {
407
407
  }
408
408
 
409
409
  if (!validated.include_ranking_details) {
410
- processedResults = processedResults.map(({ rankingDetails, ...r }) => r);
410
+ processedResults = processedResults.map(({ rankingDetails, finalScore, originalIndex, scores, ...r }) => r);
411
411
  }
412
412
  if (!validated.include_deduplication_details) {
413
- processedResults = processedResults.map(({ deduplicationInfo: _d, ...r }) => r);
413
+ processedResults = processedResults.map(({ deduplicationInfo: _d, contentHash, normalizedUrl, titleTokens, ...r }) => r);
414
414
  }
415
415
 
416
416
  return {
@@ -162,8 +162,9 @@ const TEMPLATES = [
162
162
  const stories = [];
163
163
  $('tr.athing').each((_, el) => {
164
164
  const $row = $(el);
165
- const $score = $row.next('.spacer').find('.score');
166
- const $subtext = $row.next('.spacer').find('.subtext');
165
+ // The metadata row (".subtext") is the sibling row immediately after tr.athing.
166
+ const $subtext = $row.next('tr').find('.subtext');
167
+ const $score = $subtext.find('.score');
167
168
  const $titleLink = $row.find('.titleline > a');
168
169
  stories.push({
169
170
  id: $row.attr('id'),