crawlforge-mcp-server 4.2.12 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/server.js +138 -20
- package/src/constants/config.js +5 -0
- package/src/core/ActionExecutor.js +13 -1
- package/src/core/ChangeTracker.js +8 -5
- package/src/core/LLMsTxtAnalyzer.js +71 -47
- package/src/core/LocalizationManager.js +7 -4
- package/src/core/ResearchOrchestrator.js +10 -6
- package/src/core/StealthBrowserManager.js +52 -13
- package/src/core/analysis/ContentAnalyzer.js +2 -2
- package/src/core/crawlers/BFSCrawler.js +23 -12
- package/src/core/processing/ContentProcessor.js +19 -3
- package/src/core/processing/PDFProcessor.js +72 -23
- package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
- package/src/tools/advanced/batchScrape/index.js +3 -1
- package/src/tools/advanced/batchScrape/reporter.js +5 -1
- package/src/tools/advanced/batchScrape/worker.js +6 -1
- package/src/tools/basic/_fetch.js +78 -5
- package/src/tools/basic/extractLinks.js +1 -1
- package/src/tools/basic/extractMetadata.js +65 -1
- package/src/tools/basic/extractText.js +61 -5
- package/src/tools/basic/scrapeStructured.js +48 -10
- package/src/tools/crawl/crawlDeep.js +13 -5
- package/src/tools/crawl/mapSite.js +24 -51
- package/src/tools/extract/analyzeContent.js +11 -6
- package/src/tools/extract/extractContent.js +23 -5
- package/src/tools/extract/extractStructured.js +65 -16
- package/src/tools/extract/extractWithLlm.js +192 -11
- package/src/tools/extract/listOllamaModels.js +19 -8
- package/src/tools/extract/processDocument.js +10 -4
- package/src/tools/extract/summarizeContent.js +58 -1
- package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
- package/src/tools/research/deepResearch.js +43 -4
- package/src/tools/search/providers/searxng.js +2 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
- package/src/tools/search/ranking/ResultRanker.js +13 -4
- package/src/tools/search/searchWeb.js +5 -5
- package/src/tools/templates/TemplateRegistry.js +3 -2
- package/src/tools/tracking/trackChanges/differ.js +33 -1
- package/src/utils/htmlToMarkdown.js +5 -1
|
@@ -13,7 +13,8 @@ const GenerateLLMsTxtSchema = z.object({
|
|
|
13
13
|
maxPages: z.number().min(10).max(500).optional().default(100).describe('Maximum pages to analyze'),
|
|
14
14
|
detectAPIs: z.boolean().optional().default(true).describe('Whether to detect API endpoints'),
|
|
15
15
|
analyzeContent: z.boolean().optional().default(true).describe('Whether to analyze content types'),
|
|
16
|
-
checkSecurity: z.boolean().optional().default(
|
|
16
|
+
checkSecurity: z.boolean().optional().default(false).describe('Whether to probe security-sensitive paths (opt-in; sends requests to /admin, /login, etc.)'),
|
|
17
|
+
probeRateLimit: z.boolean().optional().default(false).describe('Whether to send repeated probe requests to estimate rate limits (opt-in; fires ~5 requests)'),
|
|
17
18
|
respectRobots: z.boolean().optional().default(true).describe('Whether to respect robots.txt')
|
|
18
19
|
}).optional().default({}),
|
|
19
20
|
|
|
@@ -23,7 +24,8 @@ const GenerateLLMsTxtSchema = z.object({
|
|
|
23
24
|
contactEmail: z.string().email().optional().describe('Contact email for the LLMs.txt'),
|
|
24
25
|
organizationName: z.string().optional().describe('Organization name'),
|
|
25
26
|
customGuidelines: z.array(z.string()).optional().describe('Additional custom guidelines'),
|
|
26
|
-
customRestrictions: z.array(z.string()).optional().describe('Additional restrictions')
|
|
27
|
+
customRestrictions: z.array(z.string()).optional().describe('Additional restrictions'),
|
|
28
|
+
robotsStyle: z.boolean().optional().default(false).describe('Emit legacy robots.txt-style directives instead of spec-compliant llmstxt.org markdown')
|
|
27
29
|
}).optional().default({}),
|
|
28
30
|
|
|
29
31
|
complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe('Compliance level for generated guidelines'),
|
|
@@ -120,9 +122,128 @@ export class GenerateLLMsTxtTool {
|
|
|
120
122
|
}
|
|
121
123
|
|
|
122
124
|
/**
|
|
123
|
-
* Generate
|
|
125
|
+
* Generate LLMs.txt content. By default emits spec-compliant llmstxt.org markdown;
|
|
126
|
+
* set outputOptions.robotsStyle=true for the legacy robots.txt-style directives.
|
|
124
127
|
*/
|
|
125
128
|
generateLLMsTxt(analysis, outputOptions, complianceLevel) {
|
|
129
|
+
if (outputOptions.robotsStyle) {
|
|
130
|
+
return this.generateRobotsStyleTxt(analysis, outputOptions, complianceLevel);
|
|
131
|
+
}
|
|
132
|
+
return this.generateSpecLLMsTxt(analysis, outputOptions);
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Generate spec-compliant llms.txt per https://llmstxt.org/ :
|
|
137
|
+
* # Title
|
|
138
|
+
* > one-line summary (blockquote)
|
|
139
|
+
* optional detail paragraph(s)
|
|
140
|
+
* ## Section
|
|
141
|
+
* - [name](url): optional notes
|
|
142
|
+
*/
|
|
143
|
+
generateSpecLLMsTxt(analysis, outputOptions) {
|
|
144
|
+
const lines = [];
|
|
145
|
+
const baseUrl = analysis.metadata.baseUrl;
|
|
146
|
+
let host = baseUrl;
|
|
147
|
+
try { host = new URL(baseUrl).hostname; } catch { /* keep baseUrl */ }
|
|
148
|
+
|
|
149
|
+
// H1 title (required)
|
|
150
|
+
const title = outputOptions.organizationName || host;
|
|
151
|
+
lines.push(`# ${title}`);
|
|
152
|
+
lines.push('');
|
|
153
|
+
|
|
154
|
+
// Blockquote summary (required by spec)
|
|
155
|
+
const summary = `Site map and key resources for ${baseUrl}, generated to help LLMs locate relevant content.`;
|
|
156
|
+
lines.push(`> ${summary}`);
|
|
157
|
+
lines.push('');
|
|
158
|
+
|
|
159
|
+
// Optional detail paragraph(s)
|
|
160
|
+
const details = [];
|
|
161
|
+
if (analysis.structure?.totalPages) {
|
|
162
|
+
details.push(`This site has approximately ${analysis.structure.totalPages} discoverable pages.`);
|
|
163
|
+
}
|
|
164
|
+
if (outputOptions.contactEmail) {
|
|
165
|
+
details.push(`Contact: ${outputOptions.contactEmail}.`);
|
|
166
|
+
}
|
|
167
|
+
if (Array.isArray(outputOptions.customGuidelines) && outputOptions.customGuidelines.length > 0) {
|
|
168
|
+
details.push(...outputOptions.customGuidelines);
|
|
169
|
+
}
|
|
170
|
+
if (details.length > 0) {
|
|
171
|
+
lines.push(details.join(' '));
|
|
172
|
+
lines.push('');
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
// Helper: emit a "## Section" with a list of [name](url) links.
|
|
176
|
+
const linkLabel = (u) => {
|
|
177
|
+
try {
|
|
178
|
+
const p = new URL(u).pathname.replace(/\/+$/, '');
|
|
179
|
+
if (!p || p === '') return 'Home';
|
|
180
|
+
const seg = p.split('/').filter(Boolean).pop() || p;
|
|
181
|
+
return seg.replace(/[-_]/g, ' ').replace(/\.[a-z0-9]+$/i, '').trim() || p;
|
|
182
|
+
} catch {
|
|
183
|
+
return u;
|
|
184
|
+
}
|
|
185
|
+
};
|
|
186
|
+
const emitSection = (heading, urls) => {
|
|
187
|
+
// Coerce to an array: sitemap/sections may arrive as a flat array, a
|
|
188
|
+
// grouped object ({path: [...]}), or a single value.
|
|
189
|
+
let arr = [];
|
|
190
|
+
if (Array.isArray(urls)) {
|
|
191
|
+
arr = urls;
|
|
192
|
+
} else if (urls && typeof urls === 'object') {
|
|
193
|
+
arr = Object.values(urls).flat();
|
|
194
|
+
}
|
|
195
|
+
const list = arr
|
|
196
|
+
.map((u) => (typeof u === 'string' ? u : (u?.url || u?.loc)))
|
|
197
|
+
.filter(Boolean)
|
|
198
|
+
.slice(0, 25);
|
|
199
|
+
if (list.length === 0) return;
|
|
200
|
+
lines.push(`## ${heading}`);
|
|
201
|
+
lines.push('');
|
|
202
|
+
for (const u of list) {
|
|
203
|
+
lines.push(`- [${linkLabel(u)}](${u})`);
|
|
204
|
+
}
|
|
205
|
+
lines.push('');
|
|
206
|
+
};
|
|
207
|
+
|
|
208
|
+
// Sections derived from the categorized site structure.
|
|
209
|
+
const sections = analysis.structure?.sections || {};
|
|
210
|
+
const flatten = (cat) => {
|
|
211
|
+
const v = sections[cat];
|
|
212
|
+
if (!Array.isArray(v)) return [];
|
|
213
|
+
// categorizeSections may produce either flat URLs or {path, urls:[...]} groups.
|
|
214
|
+
return v.flatMap((entry) =>
|
|
215
|
+
typeof entry === 'string' ? [entry] : (Array.isArray(entry?.urls) ? entry.urls : []));
|
|
216
|
+
};
|
|
217
|
+
|
|
218
|
+
emitSection('Documentation', flatten('documentation'));
|
|
219
|
+
emitSection('Content', flatten('content'));
|
|
220
|
+
emitSection('Tools', flatten('tools'));
|
|
221
|
+
emitSection('Navigation', flatten('navigation'));
|
|
222
|
+
|
|
223
|
+
// APIs as their own section.
|
|
224
|
+
if (Array.isArray(analysis.apis) && analysis.apis.length > 0) {
|
|
225
|
+
lines.push('## APIs');
|
|
226
|
+
lines.push('');
|
|
227
|
+
for (const api of analysis.apis.slice(0, 25)) {
|
|
228
|
+
const note = api.type ? `: ${api.type}` : '';
|
|
229
|
+
lines.push(`- [${linkLabel(api.url)}](${api.url})${note}`);
|
|
230
|
+
}
|
|
231
|
+
lines.push('');
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
// Fallback: if no categorized sections produced output, list the raw sitemap.
|
|
235
|
+
const hasBody = lines.some((l) => l.startsWith('## '));
|
|
236
|
+
if (!hasBody) {
|
|
237
|
+
emitSection('Pages', analysis.structure?.sitemap || []);
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
return lines.join('\n').replace(/\n{3,}/g, '\n\n').trimEnd() + '\n';
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/**
|
|
244
|
+
* Generate legacy robots.txt-style content (opt-in via outputOptions.robotsStyle).
|
|
245
|
+
*/
|
|
246
|
+
generateRobotsStyleTxt(analysis, outputOptions, complianceLevel) {
|
|
126
247
|
const lines = [];
|
|
127
248
|
const baseUrl = analysis.metadata.baseUrl;
|
|
128
249
|
|
|
@@ -379,18 +379,57 @@ export class DeepResearchTool {
|
|
|
379
379
|
* Format research results according to output preferences
|
|
380
380
|
*/
|
|
381
381
|
formatResults(results, params) {
|
|
382
|
-
// Raw evidence mode (no LLM configured):
|
|
383
|
-
//
|
|
382
|
+
// Raw evidence mode (no LLM configured): apply lightweight formatting so
|
|
383
|
+
// outputFormat is not silently ignored, and rank sources by credibility.
|
|
384
384
|
if (results.synthesisMode === 'raw_evidence') {
|
|
385
|
-
|
|
385
|
+
const rankedSources = (results.sources || [])
|
|
386
|
+
.slice()
|
|
387
|
+
.sort((a, b) => (b.credibility || 0) - (a.credibility || 0));
|
|
388
|
+
|
|
389
|
+
const base = {
|
|
386
390
|
synthesisMode: 'raw_evidence',
|
|
387
391
|
note: results.note,
|
|
388
|
-
sources: results.sources,
|
|
389
392
|
researchSummary: results.researchSummary,
|
|
390
393
|
metadata: results.metadata,
|
|
391
394
|
performance: results.performance,
|
|
392
395
|
activityLog: params.includeActivityLog ? results.activityLog : undefined
|
|
393
396
|
};
|
|
397
|
+
|
|
398
|
+
switch (params.outputFormat) {
|
|
399
|
+
case 'summary':
|
|
400
|
+
return {
|
|
401
|
+
...base,
|
|
402
|
+
sources: rankedSources.slice(0, 5)
|
|
403
|
+
};
|
|
404
|
+
|
|
405
|
+
case 'citations_only':
|
|
406
|
+
return {
|
|
407
|
+
...base,
|
|
408
|
+
sources: rankedSources.map(s => ({
|
|
409
|
+
title: s.title,
|
|
410
|
+
url: s.url,
|
|
411
|
+
credibility: s.credibility
|
|
412
|
+
})),
|
|
413
|
+
citationCount: rankedSources.length,
|
|
414
|
+
citationSummary: this.generateCitationSummary(rankedSources)
|
|
415
|
+
};
|
|
416
|
+
|
|
417
|
+
case 'conflicts_focus':
|
|
418
|
+
// Without LLM there is no conflict detection; return ranked sources
|
|
419
|
+
// with a note so the caller knows what happened.
|
|
420
|
+
return {
|
|
421
|
+
...base,
|
|
422
|
+
sources: rankedSources,
|
|
423
|
+
conflictsNote: 'Conflict detection requires an LLM (OPENAI_API_KEY or ANTHROPIC_API_KEY). Sources are ranked by credibility for manual review.'
|
|
424
|
+
};
|
|
425
|
+
|
|
426
|
+
case 'comprehensive':
|
|
427
|
+
default:
|
|
428
|
+
return {
|
|
429
|
+
...base,
|
|
430
|
+
sources: rankedSources
|
|
431
|
+
};
|
|
432
|
+
}
|
|
394
433
|
}
|
|
395
434
|
|
|
396
435
|
const formatted = {
|
|
@@ -117,8 +117,8 @@ export async function searchViaSearxng(opts = {}) {
|
|
|
117
117
|
return {
|
|
118
118
|
items,
|
|
119
119
|
searchInformation: {
|
|
120
|
-
totalResults:
|
|
121
|
-
searchTime:
|
|
120
|
+
totalResults: rawResults.length,
|
|
121
|
+
searchTime: 0
|
|
122
122
|
},
|
|
123
123
|
queries: {},
|
|
124
124
|
context: {}
|
|
@@ -545,27 +545,50 @@ export class ResultDeduplicator {
|
|
|
545
545
|
}
|
|
546
546
|
|
|
547
547
|
/**
|
|
548
|
-
* SimHash implementation for content similarity
|
|
548
|
+
* SimHash implementation for content similarity.
|
|
549
|
+
* Uses two independent 32-bit FNV-1a hashes (seeded differently) to produce
|
|
550
|
+
* independent high/low 32-bit words, giving a true 64-bit fingerprint so that
|
|
551
|
+
* bits 32-63 are not a duplicate of bits 0-31.
|
|
549
552
|
*/
|
|
550
553
|
simHash(text, bits = 64) {
|
|
551
554
|
const tokens = text.split(/\s+/);
|
|
552
555
|
const hashBits = new Array(bits).fill(0);
|
|
553
|
-
|
|
556
|
+
|
|
554
557
|
for (const token of tokens) {
|
|
555
|
-
const
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
hashBits[i]
|
|
558
|
+
const lo = this._fnv1a32(token, 0x811c9dc5);
|
|
559
|
+
const hi = this._fnv1a32(token, 0x84222325); // different seed
|
|
560
|
+
|
|
561
|
+
for (let i = 0; i < 32; i++) {
|
|
562
|
+
hashBits[i] += ((lo >>> i) & 1) ? 1 : -1;
|
|
563
|
+
}
|
|
564
|
+
for (let i = 0; i < 32; i++) {
|
|
565
|
+
hashBits[32 + i] += ((hi >>> i) & 1) ? 1 : -1;
|
|
560
566
|
}
|
|
561
567
|
}
|
|
562
|
-
|
|
568
|
+
|
|
563
569
|
// Convert to binary string
|
|
564
570
|
return hashBits.map(bit => bit > 0 ? '1' : '0').join('');
|
|
565
571
|
}
|
|
566
572
|
|
|
567
573
|
/**
|
|
568
|
-
*
|
|
574
|
+
* FNV-1a 32-bit hash with a configurable offset basis (seed).
|
|
575
|
+
* @param {string} str
|
|
576
|
+
* @param {number} seed - 32-bit unsigned offset basis
|
|
577
|
+
* @returns {number} 32-bit unsigned integer
|
|
578
|
+
*/
|
|
579
|
+
_fnv1a32(str, seed) {
|
|
580
|
+
const FNV_PRIME = 0x01000193;
|
|
581
|
+
let hash = seed >>> 0;
|
|
582
|
+
for (let i = 0; i < str.length; i++) {
|
|
583
|
+
hash ^= str.charCodeAt(i);
|
|
584
|
+
// Multiply by FNV prime using 32-bit arithmetic
|
|
585
|
+
hash = Math.imul(hash, FNV_PRIME) >>> 0;
|
|
586
|
+
}
|
|
587
|
+
return hash;
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
/**
|
|
591
|
+
* String hash function (kept for hashResults / cache-key use)
|
|
569
592
|
*/
|
|
570
593
|
stringHash(str) {
|
|
571
594
|
let hash = 0;
|
|
@@ -191,18 +191,27 @@ export class ResultRanker {
|
|
|
191
191
|
// Calculate term frequencies
|
|
192
192
|
const termFreqs = this.getTermFrequencies(contentTerms);
|
|
193
193
|
|
|
194
|
+
// Build per-term document frequency across all results
|
|
195
|
+
const docFreqs = {};
|
|
196
|
+
for (const r of allResults) {
|
|
197
|
+
const rContent = [r.title || '', r.snippet || '', r.htmlSnippet || ''].join(' ');
|
|
198
|
+
const rTerms = new Set(this.tokenize(rContent.toLowerCase()));
|
|
199
|
+
for (const t of rTerms) {
|
|
200
|
+
docFreqs[t] = (docFreqs[t] || 0) + 1;
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
194
204
|
let score = 0;
|
|
195
205
|
for (const term of queryTerms) {
|
|
196
206
|
const tf = termFreqs[term] || 0;
|
|
197
207
|
if (tf > 0) {
|
|
198
|
-
|
|
199
|
-
const df = Math.min(allResults.length * 0.1, 1); // Conservative estimate
|
|
208
|
+
const df = docFreqs[term] || 1;
|
|
200
209
|
const idf = Math.log((allResults.length - df + 0.5) / (df + 0.5));
|
|
201
|
-
|
|
210
|
+
|
|
202
211
|
// BM25 formula
|
|
203
212
|
const numerator = tf * (k1 + 1);
|
|
204
213
|
const denominator = tf + k1 * (1 - b + b * (contentLength / avgDocLength));
|
|
205
|
-
|
|
214
|
+
|
|
206
215
|
score += idf * (numerator / denominator);
|
|
207
216
|
}
|
|
208
217
|
}
|
|
@@ -280,14 +280,14 @@ export class SearchWebTool {
|
|
|
280
280
|
// Clean up results based on detail level requested
|
|
281
281
|
if (!validated.include_ranking_details) {
|
|
282
282
|
processedResults = processedResults.map(result => {
|
|
283
|
-
const { rankingDetails, ...cleanResult } = result;
|
|
283
|
+
const { rankingDetails, finalScore, originalIndex, scores, ...cleanResult } = result;
|
|
284
284
|
return cleanResult;
|
|
285
285
|
});
|
|
286
286
|
}
|
|
287
|
-
|
|
287
|
+
|
|
288
288
|
if (!validated.include_deduplication_details) {
|
|
289
289
|
processedResults = processedResults.map(result => {
|
|
290
|
-
const { deduplicationInfo, ...cleanResult } = result;
|
|
290
|
+
const { deduplicationInfo, contentHash, normalizedUrl, titleTokens, ...cleanResult } = result;
|
|
291
291
|
return cleanResult;
|
|
292
292
|
});
|
|
293
293
|
}
|
|
@@ -407,10 +407,10 @@ export class SearchWebTool {
|
|
|
407
407
|
}
|
|
408
408
|
|
|
409
409
|
if (!validated.include_ranking_details) {
|
|
410
|
-
processedResults = processedResults.map(({ rankingDetails, ...r }) => r);
|
|
410
|
+
processedResults = processedResults.map(({ rankingDetails, finalScore, originalIndex, scores, ...r }) => r);
|
|
411
411
|
}
|
|
412
412
|
if (!validated.include_deduplication_details) {
|
|
413
|
-
processedResults = processedResults.map(({ deduplicationInfo: _d, ...r }) => r);
|
|
413
|
+
processedResults = processedResults.map(({ deduplicationInfo: _d, contentHash, normalizedUrl, titleTokens, ...r }) => r);
|
|
414
414
|
}
|
|
415
415
|
|
|
416
416
|
return {
|
|
@@ -162,8 +162,9 @@ const TEMPLATES = [
|
|
|
162
162
|
const stories = [];
|
|
163
163
|
$('tr.athing').each((_, el) => {
|
|
164
164
|
const $row = $(el);
|
|
165
|
-
|
|
166
|
-
const $subtext = $row.next('
|
|
165
|
+
// The metadata row (".subtext") is the sibling row immediately after tr.athing.
|
|
166
|
+
const $subtext = $row.next('tr').find('.subtext');
|
|
167
|
+
const $score = $subtext.find('.score');
|
|
167
168
|
const $titleLink = $row.find('.titleline > a');
|
|
168
169
|
stories.push({
|
|
169
170
|
id: $row.attr('id'),
|
|
@@ -3,6 +3,38 @@
|
|
|
3
3
|
* URL content fetching and history/stat helper functions.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
+
/**
|
|
7
|
+
* Default Jaccard similarity threshold below which a change is considered
|
|
8
|
+
* meaningful (i.e. worth flagging). 0.85 means content must be at least 85 %
|
|
9
|
+
* similar to be treated as "no significant change".
|
|
10
|
+
*/
|
|
11
|
+
export const DEFAULT_CHANGE_THRESHOLD = 0.85; // a ratio (0.85 = 85 %), not a percentage value
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Compute token-based Jaccard similarity between two text strings.
|
|
15
|
+
* Tokenises on whitespace; returns a value in [0, 1] where 1 is identical.
|
|
16
|
+
*
|
|
17
|
+
* @param {string} text1
|
|
18
|
+
* @param {string} text2
|
|
19
|
+
* @returns {number}
|
|
20
|
+
*/
|
|
21
|
+
export function calculateSimilarity(text1, text2) {
  // Token-set Jaccard similarity: |A ∩ B| / |A ∪ B| over lower-cased,
  // whitespace-separated tokens. Returns a number in [0, 1].
  //
  // Emptiness is decided on the token sets rather than on the raw strings, so
  // '' and whitespace-only text are treated alike: two token-less inputs are
  // identical (1), and a token-less input against real text shares nothing (0).
  // Falsy inputs (null, undefined, '') count as empty text.
  const tokenise = (value) =>
    new Set(String(value || '').toLowerCase().split(/\s+/).filter(Boolean));

  const setA = tokenise(text1);
  const setB = tokenise(text2);

  // Nothing on either side: define as identical and avoid dividing by zero.
  if (setA.size === 0 && setB.size === 0) return 1;

  let shared = 0;
  for (const token of setA) {
    if (setB.has(token)) shared++;
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|; non-zero here because a set is non-empty.
  return shared / (setA.size + setB.size - shared);
}
|
|
37
|
+
|
|
6
38
|
/**
|
|
7
39
|
* Fetch the HTML/text content of a URL with change-tracking headers.
|
|
8
40
|
* @param {string} url
|
|
@@ -18,7 +50,7 @@ export async function fetchContent(url) {
|
|
|
18
50
|
'Accept-Encoding': 'gzip, deflate',
|
|
19
51
|
'Cache-Control': 'no-cache'
|
|
20
52
|
},
|
|
21
|
-
|
|
53
|
+
signal: AbortSignal.timeout(30000)
|
|
22
54
|
});
|
|
23
55
|
|
|
24
56
|
if (!response.ok) {
|
|
@@ -11,10 +11,11 @@
|
|
|
11
11
|
* headingStyle: 'atx' -> # H1 / ## H2 instead of underline style
|
|
12
12
|
* codeBlockStyle: 'fenced' -> triple-backtick fences
|
|
13
13
|
* bulletListMarker: '-'
|
|
14
|
-
* -
|
|
14
|
+
* - GFM plugin enabled for table support (turndown-plugin-gfm).
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
17
|
import TurndownService from 'turndown';
|
|
18
|
+
import { gfm } from 'turndown-plugin-gfm';
|
|
18
19
|
|
|
19
20
|
let _td = null;
|
|
20
21
|
|
|
@@ -30,6 +31,9 @@ function getTurndown() {
|
|
|
30
31
|
linkStyle: 'inlined'
|
|
31
32
|
});
|
|
32
33
|
|
|
34
|
+
// Enable GFM extensions (tables, strikethrough, task lists)
|
|
35
|
+
_td.use(gfm);
|
|
36
|
+
|
|
33
37
|
// Remove boilerplate elements before converting
|
|
34
38
|
_td.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
|
|
35
39
|
}
|