crawlforge-mcp-server 4.2.11 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +2 -1
  2. package/server.js +152 -21
  3. package/src/constants/config.js +5 -0
  4. package/src/core/ActionExecutor.js +13 -1
  5. package/src/core/ChangeTracker.js +8 -5
  6. package/src/core/LLMsTxtAnalyzer.js +71 -47
  7. package/src/core/LocalizationManager.js +7 -4
  8. package/src/core/ResearchOrchestrator.js +10 -6
  9. package/src/core/StealthBrowserManager.js +111 -40
  10. package/src/core/analysis/ContentAnalyzer.js +2 -2
  11. package/src/core/crawlers/BFSCrawler.js +23 -12
  12. package/src/core/processing/ContentProcessor.js +19 -3
  13. package/src/core/processing/PDFProcessor.js +72 -23
  14. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  15. package/src/tools/advanced/batchScrape/index.js +3 -1
  16. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  17. package/src/tools/advanced/batchScrape/worker.js +6 -1
  18. package/src/tools/basic/_fetch.js +78 -5
  19. package/src/tools/basic/extractLinks.js +1 -1
  20. package/src/tools/basic/extractMetadata.js +65 -1
  21. package/src/tools/basic/extractText.js +61 -5
  22. package/src/tools/basic/scrapeStructured.js +48 -10
  23. package/src/tools/crawl/crawlDeep.js +13 -5
  24. package/src/tools/crawl/mapSite.js +24 -51
  25. package/src/tools/extract/analyzeContent.js +11 -6
  26. package/src/tools/extract/extractContent.js +23 -5
  27. package/src/tools/extract/extractStructured.js +65 -16
  28. package/src/tools/extract/extractWithLlm.js +192 -11
  29. package/src/tools/extract/listOllamaModels.js +19 -8
  30. package/src/tools/extract/processDocument.js +10 -4
  31. package/src/tools/extract/summarizeContent.js +58 -1
  32. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  33. package/src/tools/research/deepResearch.js +43 -4
  34. package/src/tools/search/providers/searxng.js +2 -2
  35. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  36. package/src/tools/search/ranking/ResultRanker.js +13 -4
  37. package/src/tools/search/searchWeb.js +5 -5
  38. package/src/tools/templates/TemplateRegistry.js +3 -2
  39. package/src/tools/tracking/trackChanges/differ.js +33 -1
  40. package/src/utils/htmlToMarkdown.js +5 -1
@@ -13,7 +13,8 @@ const GenerateLLMsTxtSchema = z.object({
13
13
  maxPages: z.number().min(10).max(500).optional().default(100).describe('Maximum pages to analyze'),
14
14
  detectAPIs: z.boolean().optional().default(true).describe('Whether to detect API endpoints'),
15
15
  analyzeContent: z.boolean().optional().default(true).describe('Whether to analyze content types'),
16
- checkSecurity: z.boolean().optional().default(true).describe('Whether to check security boundaries'),
16
+ checkSecurity: z.boolean().optional().default(false).describe('Whether to probe security-sensitive paths (opt-in; sends requests to /admin, /login, etc.)'),
17
+ probeRateLimit: z.boolean().optional().default(false).describe('Whether to send repeated probe requests to estimate rate limits (opt-in; fires ~5 requests)'),
17
18
  respectRobots: z.boolean().optional().default(true).describe('Whether to respect robots.txt')
18
19
  }).optional().default({}),
19
20
 
@@ -23,7 +24,8 @@ const GenerateLLMsTxtSchema = z.object({
23
24
  contactEmail: z.string().email().optional().describe('Contact email for the LLMs.txt'),
24
25
  organizationName: z.string().optional().describe('Organization name'),
25
26
  customGuidelines: z.array(z.string()).optional().describe('Additional custom guidelines'),
26
- customRestrictions: z.array(z.string()).optional().describe('Additional restrictions')
27
+ customRestrictions: z.array(z.string()).optional().describe('Additional restrictions'),
28
+ robotsStyle: z.boolean().optional().default(false).describe('Emit legacy robots.txt-style directives instead of spec-compliant llmstxt.org markdown')
27
29
  }).optional().default({}),
28
30
 
29
31
  complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe('Compliance level for generated guidelines'),
@@ -120,9 +122,128 @@ export class GenerateLLMsTxtTool {
120
122
  }
121
123
 
122
124
  /**
123
- * Generate standard LLMs.txt content
125
+ * Generate LLMs.txt content. By default emits spec-compliant llmstxt.org markdown;
126
+ * set outputOptions.robotsStyle=true for the legacy robots.txt-style directives.
124
127
  */
125
128
  generateLLMsTxt(analysis, outputOptions, complianceLevel) {
129
+ if (outputOptions.robotsStyle) {
130
+ return this.generateRobotsStyleTxt(analysis, outputOptions, complianceLevel);
131
+ }
132
+ return this.generateSpecLLMsTxt(analysis, outputOptions);
133
+ }
134
+
135
+ /**
136
+ * Generate spec-compliant llms.txt per https://llmstxt.org/ :
137
+ * # Title
138
+ * > one-line summary (blockquote)
139
+ * optional detail paragraph(s)
140
+ * ## Section
141
+ * - [name](url): optional notes
142
+ */
143
+ generateSpecLLMsTxt(analysis, outputOptions) {
144
+ const lines = [];
145
+ const baseUrl = analysis.metadata.baseUrl;
146
+ let host = baseUrl;
147
+ try { host = new URL(baseUrl).hostname; } catch { /* keep baseUrl */ }
148
+
149
+ // H1 title (required)
150
+ const title = outputOptions.organizationName || host;
151
+ lines.push(`# ${title}`);
152
+ lines.push('');
153
+
154
+ // Blockquote summary (required by spec)
155
+ const summary = `Site map and key resources for ${baseUrl}, generated to help LLMs locate relevant content.`;
156
+ lines.push(`> ${summary}`);
157
+ lines.push('');
158
+
159
+ // Optional detail paragraph(s)
160
+ const details = [];
161
+ if (analysis.structure?.totalPages) {
162
+ details.push(`This site has approximately ${analysis.structure.totalPages} discoverable pages.`);
163
+ }
164
+ if (outputOptions.contactEmail) {
165
+ details.push(`Contact: ${outputOptions.contactEmail}.`);
166
+ }
167
+ if (Array.isArray(outputOptions.customGuidelines) && outputOptions.customGuidelines.length > 0) {
168
+ details.push(...outputOptions.customGuidelines);
169
+ }
170
+ if (details.length > 0) {
171
+ lines.push(details.join(' '));
172
+ lines.push('');
173
+ }
174
+
175
+ // Helper: emit a "## Section" with a list of [name](url) links.
176
+ const linkLabel = (u) => {
177
+ try {
178
+ const p = new URL(u).pathname.replace(/\/+$/, '');
179
+ if (!p || p === '') return 'Home';
180
+ const seg = p.split('/').filter(Boolean).pop() || p;
181
+ return seg.replace(/[-_]/g, ' ').replace(/\.[a-z0-9]+$/i, '').trim() || p;
182
+ } catch {
183
+ return u;
184
+ }
185
+ };
186
+ const emitSection = (heading, urls) => {
187
+ // Coerce to an array: sitemap/sections may arrive as a flat array, a
188
+ // grouped object ({path: [...]}), or a single value.
189
+ let arr = [];
190
+ if (Array.isArray(urls)) {
191
+ arr = urls;
192
+ } else if (urls && typeof urls === 'object') {
193
+ arr = Object.values(urls).flat();
194
+ }
195
+ const list = arr
196
+ .map((u) => (typeof u === 'string' ? u : (u?.url || u?.loc)))
197
+ .filter(Boolean)
198
+ .slice(0, 25);
199
+ if (list.length === 0) return;
200
+ lines.push(`## ${heading}`);
201
+ lines.push('');
202
+ for (const u of list) {
203
+ lines.push(`- [${linkLabel(u)}](${u})`);
204
+ }
205
+ lines.push('');
206
+ };
207
+
208
+ // Sections derived from the categorized site structure.
209
+ const sections = analysis.structure?.sections || {};
210
+ const flatten = (cat) => {
211
+ const v = sections[cat];
212
+ if (!Array.isArray(v)) return [];
213
+ // categorizeSections may produce either flat URLs or {path, urls:[...]} groups.
214
+ return v.flatMap((entry) =>
215
+ typeof entry === 'string' ? [entry] : (Array.isArray(entry?.urls) ? entry.urls : []));
216
+ };
217
+
218
+ emitSection('Documentation', flatten('documentation'));
219
+ emitSection('Content', flatten('content'));
220
+ emitSection('Tools', flatten('tools'));
221
+ emitSection('Navigation', flatten('navigation'));
222
+
223
+ // APIs as their own section.
224
+ if (Array.isArray(analysis.apis) && analysis.apis.length > 0) {
225
+ lines.push('## APIs');
226
+ lines.push('');
227
+ for (const api of analysis.apis.slice(0, 25)) {
228
+ const note = api.type ? `: ${api.type}` : '';
229
+ lines.push(`- [${linkLabel(api.url)}](${api.url})${note}`);
230
+ }
231
+ lines.push('');
232
+ }
233
+
234
+ // Fallback: if no categorized sections produced output, list the raw sitemap.
235
+ const hasBody = lines.some((l) => l.startsWith('## '));
236
+ if (!hasBody) {
237
+ emitSection('Pages', analysis.structure?.sitemap || []);
238
+ }
239
+
240
+ return lines.join('\n').replace(/\n{3,}/g, '\n\n').trimEnd() + '\n';
241
+ }
242
+
243
+ /**
244
+ * Generate legacy robots.txt-style content (opt-in via outputOptions.robotsStyle).
245
+ */
246
+ generateRobotsStyleTxt(analysis, outputOptions, complianceLevel) {
126
247
  const lines = [];
127
248
  const baseUrl = analysis.metadata.baseUrl;
128
249
 
@@ -379,18 +379,57 @@ export class DeepResearchTool {
379
379
  * Format research results according to output preferences
380
380
  */
381
381
  formatResults(results, params) {
382
- // Raw evidence mode (no LLM configured): pass through the clean shape
383
- // designed for the calling LLM to synthesize.
382
+ // Raw evidence mode (no LLM configured): apply lightweight formatting so
383
+ // outputFormat is not silently ignored, and rank sources by credibility.
384
384
  if (results.synthesisMode === 'raw_evidence') {
385
- return {
385
+ const rankedSources = (results.sources || [])
386
+ .slice()
387
+ .sort((a, b) => (b.credibility || 0) - (a.credibility || 0));
388
+
389
+ const base = {
386
390
  synthesisMode: 'raw_evidence',
387
391
  note: results.note,
388
- sources: results.sources,
389
392
  researchSummary: results.researchSummary,
390
393
  metadata: results.metadata,
391
394
  performance: results.performance,
392
395
  activityLog: params.includeActivityLog ? results.activityLog : undefined
393
396
  };
397
+
398
+ switch (params.outputFormat) {
399
+ case 'summary':
400
+ return {
401
+ ...base,
402
+ sources: rankedSources.slice(0, 5)
403
+ };
404
+
405
+ case 'citations_only':
406
+ return {
407
+ ...base,
408
+ sources: rankedSources.map(s => ({
409
+ title: s.title,
410
+ url: s.url,
411
+ credibility: s.credibility
412
+ })),
413
+ citationCount: rankedSources.length,
414
+ citationSummary: this.generateCitationSummary(rankedSources)
415
+ };
416
+
417
+ case 'conflicts_focus':
418
+ // Without LLM there is no conflict detection; return ranked sources
419
+ // with a note so the caller knows what happened.
420
+ return {
421
+ ...base,
422
+ sources: rankedSources,
423
+ conflictsNote: 'Conflict detection requires an LLM (OPENAI_API_KEY or ANTHROPIC_API_KEY). Sources are ranked by credibility for manual review.'
424
+ };
425
+
426
+ case 'comprehensive':
427
+ default:
428
+ return {
429
+ ...base,
430
+ sources: rankedSources
431
+ };
432
+ }
394
433
  }
395
434
 
396
435
  const formatted = {
@@ -117,8 +117,8 @@ export async function searchViaSearxng(opts = {}) {
117
117
  return {
118
118
  items,
119
119
  searchInformation: {
120
- totalResults: String(rawResults.length),
121
- searchTime: data.answers ? 0 : 0
120
+ totalResults: rawResults.length,
121
+ searchTime: 0
122
122
  },
123
123
  queries: {},
124
124
  context: {}
@@ -545,27 +545,50 @@ export class ResultDeduplicator {
545
545
  }
546
546
 
547
547
  /**
548
- * SimHash implementation for content similarity
548
+ * SimHash implementation for content similarity.
549
+ * Uses two independent 32-bit FNV-1a hashes (seeded differently) to produce
550
+ * independent high/low 32-bit words, giving a true 64-bit fingerprint so that
551
+ * bits 32-63 are not a duplicate of bits 0-31.
549
552
  */
550
553
  simHash(text, bits = 64) {
551
554
  const tokens = text.split(/\s+/);
552
555
  const hashBits = new Array(bits).fill(0);
553
-
556
+
554
557
  for (const token of tokens) {
555
- const hash = this.stringHash(token);
556
-
557
- for (let i = 0; i < bits; i++) {
558
- const bit = (hash >> i) & 1;
559
- hashBits[i] += bit ? 1 : -1;
558
+ const lo = this._fnv1a32(token, 0x811c9dc5);
559
+ const hi = this._fnv1a32(token, 0x84222325); // different seed
560
+
561
+ for (let i = 0; i < 32; i++) {
562
+ hashBits[i] += ((lo >>> i) & 1) ? 1 : -1;
563
+ }
564
+ for (let i = 0; i < 32; i++) {
565
+ hashBits[32 + i] += ((hi >>> i) & 1) ? 1 : -1;
560
566
  }
561
567
  }
562
-
568
+
563
569
  // Convert to binary string
564
570
  return hashBits.map(bit => bit > 0 ? '1' : '0').join('');
565
571
  }
566
572
 
567
573
  /**
568
- * String hash function
574
+ * FNV-1a 32-bit hash with a configurable offset basis (seed).
575
+ * @param {string} str
576
+ * @param {number} seed - 32-bit unsigned offset basis
577
+ * @returns {number} 32-bit unsigned integer
578
+ */
579
+ _fnv1a32(str, seed) {
580
+ const FNV_PRIME = 0x01000193;
581
+ let hash = seed >>> 0;
582
+ for (let i = 0; i < str.length; i++) {
583
+ hash ^= str.charCodeAt(i);
584
+ // Multiply by FNV prime using 32-bit arithmetic
585
+ hash = Math.imul(hash, FNV_PRIME) >>> 0;
586
+ }
587
+ return hash;
588
+ }
589
+
590
+ /**
591
+ * String hash function (kept for hashResults / cache-key use)
569
592
  */
570
593
  stringHash(str) {
571
594
  let hash = 0;
@@ -191,18 +191,27 @@ export class ResultRanker {
191
191
  // Calculate term frequencies
192
192
  const termFreqs = this.getTermFrequencies(contentTerms);
193
193
 
194
+ // Build per-term document frequency across all results
195
+ const docFreqs = {};
196
+ for (const r of allResults) {
197
+ const rContent = [r.title || '', r.snippet || '', r.htmlSnippet || ''].join(' ');
198
+ const rTerms = new Set(this.tokenize(rContent.toLowerCase()));
199
+ for (const t of rTerms) {
200
+ docFreqs[t] = (docFreqs[t] || 0) + 1;
201
+ }
202
+ }
203
+
194
204
  let score = 0;
195
205
  for (const term of queryTerms) {
196
206
  const tf = termFreqs[term] || 0;
197
207
  if (tf > 0) {
198
- // Document frequency (simplified - assume term appears in some docs)
199
- const df = Math.min(allResults.length * 0.1, 1); // Conservative estimate
208
+ const df = docFreqs[term] || 1;
200
209
  const idf = Math.log((allResults.length - df + 0.5) / (df + 0.5));
201
-
210
+
202
211
  // BM25 formula
203
212
  const numerator = tf * (k1 + 1);
204
213
  const denominator = tf + k1 * (1 - b + b * (contentLength / avgDocLength));
205
-
214
+
206
215
  score += idf * (numerator / denominator);
207
216
  }
208
217
  }
@@ -280,14 +280,14 @@ export class SearchWebTool {
280
280
  // Clean up results based on detail level requested
281
281
  if (!validated.include_ranking_details) {
282
282
  processedResults = processedResults.map(result => {
283
- const { rankingDetails, ...cleanResult } = result;
283
+ const { rankingDetails, finalScore, originalIndex, scores, ...cleanResult } = result;
284
284
  return cleanResult;
285
285
  });
286
286
  }
287
-
287
+
288
288
  if (!validated.include_deduplication_details) {
289
289
  processedResults = processedResults.map(result => {
290
- const { deduplicationInfo, ...cleanResult } = result;
290
+ const { deduplicationInfo, contentHash, normalizedUrl, titleTokens, ...cleanResult } = result;
291
291
  return cleanResult;
292
292
  });
293
293
  }
@@ -407,10 +407,10 @@ export class SearchWebTool {
407
407
  }
408
408
 
409
409
  if (!validated.include_ranking_details) {
410
- processedResults = processedResults.map(({ rankingDetails, ...r }) => r);
410
+ processedResults = processedResults.map(({ rankingDetails, finalScore, originalIndex, scores, ...r }) => r);
411
411
  }
412
412
  if (!validated.include_deduplication_details) {
413
- processedResults = processedResults.map(({ deduplicationInfo: _d, ...r }) => r);
413
+ processedResults = processedResults.map(({ deduplicationInfo: _d, contentHash, normalizedUrl, titleTokens, ...r }) => r);
414
414
  }
415
415
 
416
416
  return {
@@ -162,8 +162,9 @@ const TEMPLATES = [
162
162
  const stories = [];
163
163
  $('tr.athing').each((_, el) => {
164
164
  const $row = $(el);
165
- const $score = $row.next('.spacer').find('.score');
166
- const $subtext = $row.next('.spacer').find('.subtext');
165
+ // The metadata row (".subtext") is the sibling row immediately after tr.athing.
166
+ const $subtext = $row.next('tr').find('.subtext');
167
+ const $score = $subtext.find('.score');
167
168
  const $titleLink = $row.find('.titleline > a');
168
169
  stories.push({
169
170
  id: $row.attr('id'),
@@ -3,6 +3,38 @@
3
3
  * URL content fetching and history/stat helper functions.
4
4
  */
5
5
 
6
+ /**
7
+ * Default Jaccard similarity threshold below which a change is considered
8
+ * meaningful (i.e. worth flagging). 0.85 means content must be at least 85 %
9
+ * similar to be treated as "no significant change".
10
+ */
11
+ export const DEFAULT_CHANGE_THRESHOLD = 0.85;
12
+
13
+ /**
14
+ * Compute token-based Jaccard similarity between two text strings.
15
+ * Tokenises on whitespace; returns a value in [0, 1] where 1 is identical.
16
+ *
17
+ * @param {string} text1
18
+ * @param {string} text2
19
+ * @returns {number}
20
+ */
21
+ export function calculateSimilarity(text1, text2) {
22
+ if (!text1 && !text2) return 1;
23
+ if (!text1 || !text2) return 0;
24
+
25
+ const tokenise = (str) => new Set(str.toLowerCase().split(/\s+/).filter(Boolean));
26
+ const setA = tokenise(text1);
27
+ const setB = tokenise(text2);
28
+
29
+ let intersection = 0;
30
+ for (const token of setA) {
31
+ if (setB.has(token)) intersection++;
32
+ }
33
+
34
+ const union = setA.size + setB.size - intersection;
35
+ return union === 0 ? 1 : intersection / union;
36
+ }
37
+
6
38
  /**
7
39
  * Fetch the HTML/text content of a URL with change-tracking headers.
8
40
  * @param {string} url
@@ -18,7 +50,7 @@ export async function fetchContent(url) {
18
50
  'Accept-Encoding': 'gzip, deflate',
19
51
  'Cache-Control': 'no-cache'
20
52
  },
21
- timeout: 30000
53
+ signal: AbortSignal.timeout(30000)
22
54
  });
23
55
 
24
56
  if (!response.ok) {
@@ -11,10 +11,11 @@
11
11
  * headingStyle: 'atx' -> # H1 / ## H2 instead of underline style
12
12
  * codeBlockStyle: 'fenced' -> triple-backtick fences
13
13
  * bulletListMarker: '-'
14
- * - Tables fall back to prose (no GFM plugin loaded by default).
14
+ * - GFM plugin enabled for table support (turndown-plugin-gfm).
15
15
  */
16
16
 
17
17
  import TurndownService from 'turndown';
18
+ import { gfm } from 'turndown-plugin-gfm';
18
19
 
19
20
  let _td = null;
20
21
 
@@ -30,6 +31,9 @@ function getTurndown() {
30
31
  linkStyle: 'inlined'
31
32
  });
32
33
 
34
+ // Enable GFM extensions (tables, strikethrough, task lists)
35
+ _td.use(gfm);
36
+
33
37
  // Remove boilerplate elements before converting
34
38
  _td.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
35
39
  }