crawlforge-mcp-server 4.2.12 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/server.js +138 -20
- package/src/constants/config.js +5 -0
- package/src/core/ActionExecutor.js +13 -1
- package/src/core/ChangeTracker.js +8 -5
- package/src/core/LLMsTxtAnalyzer.js +71 -47
- package/src/core/LocalizationManager.js +7 -4
- package/src/core/ResearchOrchestrator.js +10 -6
- package/src/core/StealthBrowserManager.js +52 -13
- package/src/core/analysis/ContentAnalyzer.js +2 -2
- package/src/core/crawlers/BFSCrawler.js +23 -12
- package/src/core/processing/ContentProcessor.js +19 -3
- package/src/core/processing/PDFProcessor.js +72 -23
- package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
- package/src/tools/advanced/batchScrape/index.js +3 -1
- package/src/tools/advanced/batchScrape/reporter.js +5 -1
- package/src/tools/advanced/batchScrape/worker.js +6 -1
- package/src/tools/basic/_fetch.js +78 -5
- package/src/tools/basic/extractLinks.js +1 -1
- package/src/tools/basic/extractMetadata.js +65 -1
- package/src/tools/basic/extractText.js +61 -5
- package/src/tools/basic/scrapeStructured.js +48 -10
- package/src/tools/crawl/crawlDeep.js +13 -5
- package/src/tools/crawl/mapSite.js +24 -51
- package/src/tools/extract/analyzeContent.js +11 -6
- package/src/tools/extract/extractContent.js +23 -5
- package/src/tools/extract/extractStructured.js +65 -16
- package/src/tools/extract/extractWithLlm.js +192 -11
- package/src/tools/extract/listOllamaModels.js +19 -8
- package/src/tools/extract/processDocument.js +10 -4
- package/src/tools/extract/summarizeContent.js +58 -1
- package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
- package/src/tools/research/deepResearch.js +43 -4
- package/src/tools/search/providers/searxng.js +2 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
- package/src/tools/search/ranking/ResultRanker.js +13 -4
- package/src/tools/search/searchWeb.js +5 -5
- package/src/tools/templates/TemplateRegistry.js +3 -2
- package/src/tools/tracking/trackChanges/differ.js +33 -1
- package/src/utils/htmlToMarkdown.js +5 -1
|
@@ -3,28 +3,43 @@
|
|
|
3
3
|
* Applies an AbortController timeout and a default User-Agent.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
+
import { config } from '../../constants/config.js';
|
|
7
|
+
import { createRequire } from 'module';
|
|
8
|
+
|
|
9
|
+
// Derive User-Agent from package version so it reflects the actual release.
|
|
10
|
+
const _require = createRequire(import.meta.url);
|
|
11
|
+
const _pkg = _require('../../../package.json');
|
|
12
|
+
const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
|
|
13
|
+
|
|
6
14
|
/**
|
|
7
|
-
* Fetch a URL with a configurable timeout.
|
|
15
|
+
* Fetch a URL with a configurable timeout and body-size cap.
|
|
16
|
+
*
|
|
17
|
+
* Content-Length is checked before the body is read; if absent or lying, the
|
|
18
|
+
* accumulated byte count is checked during streaming. Both checks use the
|
|
19
|
+
* configurable cap from config.fetch.maxBodySize (env MAX_FETCH_BODY_SIZE,
|
|
20
|
+
* default 25 MB).
|
|
21
|
+
*
|
|
8
22
|
* @param {string} url
|
|
9
23
|
* @param {{ timeout?: number, headers?: Record<string,string> }} [options]
|
|
10
|
-
* @returns {Promise<Response>}
|
|
24
|
+
* @returns {Promise<Response & { _body: string }>}
|
|
11
25
|
*/
|
|
12
26
|
export async function fetchWithTimeout(url, options = {}) {
|
|
13
27
|
const { timeout = 10000, headers = {} } = options;
|
|
28
|
+
const maxBodySize = config.fetch.maxBodySize;
|
|
14
29
|
|
|
15
30
|
const controller = new AbortController();
|
|
16
31
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
17
32
|
|
|
33
|
+
let response;
|
|
18
34
|
try {
|
|
19
|
-
|
|
35
|
+
response = await fetch(url, {
|
|
20
36
|
signal: controller.signal,
|
|
21
37
|
headers: {
|
|
22
|
-
'User-Agent':
|
|
38
|
+
'User-Agent': CRAWLFORGE_UA,
|
|
23
39
|
...headers
|
|
24
40
|
}
|
|
25
41
|
});
|
|
26
42
|
clearTimeout(timeoutId);
|
|
27
|
-
return response;
|
|
28
43
|
} catch (error) {
|
|
29
44
|
clearTimeout(timeoutId);
|
|
30
45
|
if (error.name === 'AbortError') {
|
|
@@ -32,4 +47,62 @@ export async function fetchWithTimeout(url, options = {}) {
|
|
|
32
47
|
}
|
|
33
48
|
throw error;
|
|
34
49
|
}
|
|
50
|
+
|
|
51
|
+
// --- Body-size cap ---
|
|
52
|
+
|
|
53
|
+
// Early rejection via Content-Length (servers may omit or lie — guard below
|
|
54
|
+
// handles that case). Optional-chained so non-standard responses (e.g. test
|
|
55
|
+
// mocks) without a Headers object don't throw.
|
|
56
|
+
const contentLengthHeader = response.headers?.get?.('content-length') ?? null;
|
|
57
|
+
if (contentLengthHeader !== null) {
|
|
58
|
+
const declared = parseInt(contentLengthHeader, 10);
|
|
59
|
+
if (!isNaN(declared) && declared > maxBodySize) {
|
|
60
|
+
throw new Error(
|
|
61
|
+
`Response body too large: Content-Length ${declared} exceeds limit of ${maxBodySize} bytes`
|
|
62
|
+
);
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// Only the streaming byte-count guard requires a readable body. Responses
|
|
67
|
+
// without a ReadableStream body (already-buffered responses, test mocks)
|
|
68
|
+
// are returned unchanged so callers' native .text()/.json() still work.
|
|
69
|
+
if (!response.body || typeof response.body.getReader !== 'function') {
|
|
70
|
+
return response;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// Stream the body and abort if accumulated bytes exceed the cap.
|
|
74
|
+
const reader = response.body.getReader();
|
|
75
|
+
const chunks = [];
|
|
76
|
+
let totalBytes = 0;
|
|
77
|
+
|
|
78
|
+
while (true) {
|
|
79
|
+
const { done, value } = await reader.read();
|
|
80
|
+
if (done) break;
|
|
81
|
+
totalBytes += value.byteLength;
|
|
82
|
+
if (totalBytes > maxBodySize) {
|
|
83
|
+
reader.cancel();
|
|
84
|
+
throw new Error(
|
|
85
|
+
`Response body too large: exceeded limit of ${maxBodySize} bytes`
|
|
86
|
+
);
|
|
87
|
+
}
|
|
88
|
+
chunks.push(value);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Reassemble and expose as a response-like object that callers can use.
|
|
92
|
+
const bodyText = new TextDecoder().decode(
|
|
93
|
+
chunks.reduce((acc, chunk) => {
|
|
94
|
+
const merged = new Uint8Array(acc.byteLength + chunk.byteLength);
|
|
95
|
+
merged.set(acc, 0);
|
|
96
|
+
merged.set(chunk, acc.byteLength);
|
|
97
|
+
return merged;
|
|
98
|
+
}, new Uint8Array(0))
|
|
99
|
+
);
|
|
100
|
+
|
|
101
|
+
// Attach the pre-read text so callers can call .text() on the result.
|
|
102
|
+
// We wrap it in a minimal compatible object.
|
|
103
|
+
return Object.assign(response, {
|
|
104
|
+
text: () => Promise.resolve(bodyText),
|
|
105
|
+
json: () => Promise.resolve(JSON.parse(bodyText)),
|
|
106
|
+
_body: bodyText
|
|
107
|
+
});
|
|
35
108
|
}
|
|
@@ -41,7 +41,7 @@ export async function extractLinksHandler({ url, filter_external, base_url }) {
|
|
|
41
41
|
isExternal = false;
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
if (filter_external && isExternal) return;
|
|
44
|
+
if (filter_external && !isExternal) return;
|
|
45
45
|
|
|
46
46
|
links.push({ href: absoluteUrl, text, is_external: isExternal, original_href: href });
|
|
47
47
|
} catch {
|
|
@@ -1,11 +1,64 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* extract_metadata — Extract page metadata (title, description, OG tags, etc.).
|
|
3
3
|
* Extracted from server.js inline handler.
|
|
4
|
+
* B1: Parse JSON-LD and microdata; stronger title fallback chain (og:title → <title> → h1).
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
7
|
import { load } from 'cheerio';
|
|
7
8
|
import { fetchWithTimeout } from './_fetch.js';
|
|
8
9
|
|
|
10
|
+
/**
 * Collect every JSON-LD payload embedded in the document.
 *
 * Each `<script type="application/ld+json">` element is read and parsed
 * independently; blocks that are empty or fail to parse are skipped so one
 * malformed block never hides the valid ones.
 *
 * @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
 * @returns {Array} parsed JSON-LD values, in document order
 */
function parseJsonLd($) {
  const parsedBlocks = [];

  const collect = (_index, scriptEl) => {
    let parsed;
    try {
      const source = $(scriptEl).html();
      if (!source) return;
      parsed = JSON.parse(source);
    } catch {
      // Invalid block: ignore it and keep going with the next one.
      return;
    }
    parsedBlocks.push(parsed);
  };

  $('script[type="application/ld+json"]').each(collect);
  return parsedBlocks;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Parse microdata items (elements with `itemscope`).
 *
 * One result is produced per `itemscope` element, in document order. A
 * property is attributed only to the item that is its nearest `itemscope`
 * ancestor, so the properties of a nested item are not duplicated onto the
 * outer item. An `itemprop` attribute may list several space-separated names;
 * the value is recorded under each of them.
 *
 * Property names come from the (untrusted) page, so `properties` is a
 * prototype-less dictionary: names such as "constructor" or "__proto__" are
 * stored as ordinary keys instead of colliding with Object.prototype.
 *
 * @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
 * @returns {Array<{ type: string|null, properties: Record<string, string[]> }>}
 */
function parseMicrodata($) {
  // Elements whose property value is taken from an attribute rather than text.
  const valueAttributes = new Map([
    ['meta', 'content'],
    ['a', 'href'], ['area', 'href'], ['link', 'href'],
    ['img', 'src'], ['audio', 'src'], ['embed', 'src'], ['iframe', 'src'],
    ['source', 'src'], ['track', 'src'], ['video', 'src'],
    ['object', 'data'],
    ['data', 'value'], ['meter', 'value'],
    ['time', 'datetime']
  ]);

  const results = [];

  $('[itemscope]').each((_, el) => {
    const $el = $(el);
    const properties = Object.create(null);

    $el.find('[itemprop]').each((_i, prop) => {
      const $prop = $(prop);

      // Skip properties that belong to a nested item: their nearest
      // itemscope ancestor is that nested item, not `el`.
      if ($prop.parents('[itemscope]').get(0) !== el) return;

      const names = ($prop.attr('itemprop') || '').split(/\s+/).filter(Boolean);
      if (names.length === 0) return;

      const tag = (prop.tagName || '').toLowerCase();
      const attrName = valueAttributes.get(tag);
      let value;
      if (attrName === undefined) {
        value = $prop.text().trim();
      } else if (tag === 'time') {
        // <time> falls back to its text when no machine-readable datetime is set.
        value = $prop.attr(attrName) || $prop.text().trim();
      } else {
        value = $prop.attr(attrName);
      }
      if (!value) return;

      for (const name of names) {
        if (!properties[name]) properties[name] = [];
        properties[name].push(value);
      }
    });

    results.push({
      type: $el.attr('itemtype') || null,
      properties
    });
  });

  return results;
}
|
|
61
|
+
|
|
9
62
|
/**
|
|
10
63
|
* @param {{ url: string }} params
|
|
11
64
|
*/
|
|
@@ -19,7 +72,13 @@ export async function extractMetadataHandler({ url }) {
|
|
|
19
72
|
const html = await response.text();
|
|
20
73
|
const $ = load(html);
|
|
21
74
|
|
|
22
|
-
|
|
75
|
+
// Stronger title fallback: og:title → <title> → h1
|
|
76
|
+
const title =
|
|
77
|
+
$('meta[property="og:title"]').attr('content') ||
|
|
78
|
+
$('title').text().trim() ||
|
|
79
|
+
$('h1').first().text().trim() ||
|
|
80
|
+
'';
|
|
81
|
+
|
|
23
82
|
const description =
|
|
24
83
|
$('meta[name="description"]').attr('content') ||
|
|
25
84
|
$('meta[property="og:description"]').attr('content') || '';
|
|
@@ -47,6 +106,9 @@ export async function extractMetadataHandler({ url }) {
|
|
|
47
106
|
$('meta[charset]').attr('charset') ||
|
|
48
107
|
$('meta[http-equiv="Content-Type"]').attr('content') || '';
|
|
49
108
|
|
|
109
|
+
const jsonLd = parseJsonLd($);
|
|
110
|
+
const microdata = parseMicrodata($);
|
|
111
|
+
|
|
50
112
|
return {
|
|
51
113
|
content: [{
|
|
52
114
|
type: 'text',
|
|
@@ -61,6 +123,8 @@ export async function extractMetadataHandler({ url }) {
|
|
|
61
123
|
charset,
|
|
62
124
|
og_tags: ogTags,
|
|
63
125
|
twitter_tags: twitterTags,
|
|
126
|
+
json_ld: jsonLd,
|
|
127
|
+
microdata,
|
|
64
128
|
url: response.url
|
|
65
129
|
}, null, 2)
|
|
66
130
|
}]
|
|
@@ -2,12 +2,56 @@
|
|
|
2
2
|
* extract_text — Extract clean text content from HTML.
|
|
3
3
|
* Extracted from server.js inline handler.
|
|
4
4
|
* D3.1: Added output_format:"markdown" option backed by Turndown.
|
|
5
|
+
* B1: Preserve block structure for text mode; use Readability + GFM for markdown mode.
|
|
5
6
|
*/
|
|
6
7
|
|
|
7
8
|
import { load } from 'cheerio';
|
|
9
|
+
import { JSDOM } from 'jsdom';
|
|
10
|
+
import { Readability } from '@mozilla/readability';
|
|
8
11
|
import { fetchWithTimeout } from './_fetch.js';
|
|
9
12
|
import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
|
|
10
13
|
|
|
14
|
+
// Block-level elements whose boundaries should become paragraph breaks
const BLOCK_ELEMENTS = new Set([
  'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'li', 'blockquote', 'pre', 'td', 'th', 'dt', 'dd',
  'article', 'section', 'figure', 'figcaption', 'aside',
  'header', 'footer', 'main', 'nav', 'form', 'fieldset',
  'table', 'tr', 'caption'
]);

/**
 * Extract plain text from a cheerio root preserving block-element paragraph breaks.
 *
 * - Every block element (see BLOCK_ELEMENTS) is surrounded by a blank line.
 * - `<br>` becomes a single line break.
 * - Runs of whitespace inside text collapse to one space. Whitespace-only text
 *   nodes between inline elements are kept as a single space so adjacent
 *   words are not glued together.
 * - Nodes that are neither text nor regular tags are ignored.
 *
 * The tree is walked with an explicit stack rather than recursion, so very
 * deeply nested markup cannot overflow the call stack.
 *
 * @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
 * @returns {string} extracted text, or '' when the document has no <body>
 */
function extractBlockText($) {
  const body = $('body').get(0);
  if (!body) return '';

  const PARAGRAPH_BREAK = '\n\n';
  const parts = [];
  // Pending work: DOM nodes to visit, plus string markers that are emitted
  // verbatim once all children of a block element have been processed.
  const stack = [];

  const pushChildren = (node) => {
    const children = node.children || [];
    // Reverse order so the first child is popped (visited) first.
    for (let i = children.length - 1; i >= 0; i -= 1) stack.push(children[i]);
  };

  pushChildren(body);

  while (stack.length > 0) {
    const node = stack.pop();

    if (typeof node === 'string') {
      parts.push(node);
      continue;
    }
    if (node.type === 'text') {
      parts.push(node.data.replace(/[ \t\r\n]+/g, ' '));
      continue;
    }
    if (node.type !== 'tag') continue;

    const tag = node.tagName ? node.tagName.toLowerCase() : '';
    if (tag === 'br') {
      parts.push('\n');
      continue;
    }
    if (BLOCK_ELEMENTS.has(tag)) {
      parts.push(PARAGRAPH_BREAK);
      // Closing break: sits below the children on the stack, so it is
      // emitted after the whole subtree.
      stack.push(PARAGRAPH_BREAK);
    }
    pushChildren(node);
  }

  return parts
    .join('')
    .replace(/ *\n */g, '\n') // drop spaces hugging line breaks
    .replace(/\n{3,}/g, '\n\n') // at most one blank line between blocks
    .replace(/ {2,}/g, ' ') // spaces from adjacent text nodes
    .trim();
}
|
|
54
|
+
|
|
11
55
|
/**
|
|
12
56
|
* @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean, output_format?: "text"|"markdown" }} params
|
|
13
57
|
*/
|
|
@@ -26,20 +70,32 @@ export async function extractTextHandler({ url, remove_scripts, remove_styles, o
|
|
|
26
70
|
|
|
27
71
|
$('nav, header, footer, aside, .advertisement, .ad, .sidebar').remove();
|
|
28
72
|
|
|
29
|
-
const text = $('body').text().replace(/\s+/g, ' ').trim();
|
|
30
|
-
|
|
31
73
|
const result = {
|
|
32
|
-
word_count: text.split(/\s+/).filter(w => w.length > 0).length,
|
|
33
|
-
char_count: text.length,
|
|
34
74
|
url: response.url
|
|
35
75
|
};
|
|
36
76
|
|
|
37
77
|
if (output_format === 'markdown') {
|
|
38
|
-
|
|
78
|
+
// Run Readability first to get main content, then convert to GFM markdown
|
|
79
|
+
let articleHtml;
|
|
80
|
+
try {
|
|
81
|
+
const dom = new JSDOM(html, { url: response.url });
|
|
82
|
+
const reader = new Readability(dom.window.document);
|
|
83
|
+
const article = reader.parse();
|
|
84
|
+
articleHtml = article ? article.content : $.html('body');
|
|
85
|
+
} catch {
|
|
86
|
+
articleHtml = $.html('body');
|
|
87
|
+
}
|
|
88
|
+
result.markdown = htmlToMarkdown(articleHtml);
|
|
39
89
|
result.output_format = 'markdown';
|
|
90
|
+
const plainText = result.markdown.replace(/[#*`_\[\]]/g, '').replace(/\s+/g, ' ').trim();
|
|
91
|
+
result.word_count = plainText.split(/\s+/).filter(w => w.length > 0).length;
|
|
92
|
+
result.char_count = plainText.length;
|
|
40
93
|
} else {
|
|
94
|
+
const text = extractBlockText($);
|
|
41
95
|
result.text = text;
|
|
42
96
|
result.output_format = 'text';
|
|
97
|
+
result.word_count = text.split(/\s+/).filter(w => w.length > 0).length;
|
|
98
|
+
result.char_count = text.length;
|
|
43
99
|
}
|
|
44
100
|
|
|
45
101
|
return {
|
|
@@ -1,15 +1,33 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* scrape_structured — Extract structured data using CSS selectors.
|
|
3
3
|
* Extracted from server.js inline handler.
|
|
4
|
+
* B1: Support attribute extraction (selector@attr), add max_results,
|
|
5
|
+
* fix elements_found to report real per-field DOM match counts.
|
|
4
6
|
*/
|
|
5
7
|
|
|
6
8
|
import { load } from 'cheerio';
|
|
7
9
|
import { fetchWithTimeout } from './_fetch.js';
|
|
8
10
|
|
|
9
11
|
/**
 * Parse a selector string that may include an attribute suffix: "css@attr"
 *   e.g. "a.link@href" -> { selector: "a.link", attribute: "href" }
 *        "img@src"     -> { selector: "img", attribute: "src" }
 *        "h1"          -> { selector: "h1", attribute: null }
 *
 * Only a trailing "@name" is treated as the attribute suffix. An "@" that is
 * part of the CSS selector itself is left alone:
 *   'a[href*="@"]'      -> { selector: 'a[href*="@"]', attribute: null }
 *   '.sm\\@hidden'      -> whole string is the selector (escaped identifier)
 *   'a[href*="@"]@href' -> { selector: 'a[href*="@"]', attribute: "href" }
 * An empty suffix ("a@") yields attribute null.
 *
 * @param {string} raw
 * @returns {{ selector: string, attribute: string|null }}
 */
function parseSelectorSpec(raw) {
  const atIdx = raw.lastIndexOf('@');

  // No "@" at all, or nothing before it to act as the selector.
  if (atIdx <= 0) return { selector: raw, attribute: null };

  // "\@" is an escaped character inside a CSS identifier, not a separator.
  if (raw[atIdx - 1] === '\\') return { selector: raw, attribute: null };

  const attribute = raw.slice(atIdx + 1).trim();
  if (attribute === '') return { selector: raw.slice(0, atIdx), attribute: null };

  // A real attribute name never contains whitespace, quotes, brackets or
  // parentheses; if the tail does, the "@" sat inside the selector (e.g. in
  // an attribute-selector value) and the whole string is the selector.
  if (!/^[^\s"'<>\/=\[\]()]+$/.test(attribute)) {
    return { selector: raw, attribute: null };
  }

  return { selector: raw.slice(0, atIdx), attribute };
}
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* @param {{ url: string, selectors: Record<string, string>, max_results?: number }} params
|
|
29
|
+
*/
|
|
30
|
+
export async function scrapeStructuredHandler({ url, selectors, max_results }) {
|
|
13
31
|
try {
|
|
14
32
|
const response = await fetchWithTimeout(url);
|
|
15
33
|
if (!response.ok) {
|
|
@@ -19,22 +37,42 @@ export async function scrapeStructuredHandler({ url, selectors }) {
|
|
|
19
37
|
const html = await response.text();
|
|
20
38
|
const $ = load(html);
|
|
21
39
|
const results = {};
|
|
40
|
+
const matchCounts = {};
|
|
22
41
|
|
|
23
|
-
for (const [fieldName,
|
|
42
|
+
for (const [fieldName, rawSelector] of Object.entries(selectors)) {
|
|
24
43
|
try {
|
|
25
|
-
const
|
|
26
|
-
|
|
44
|
+
const { selector, attribute } = parseSelectorSpec(rawSelector);
|
|
45
|
+
let elements = $(selector);
|
|
46
|
+
const domCount = elements.length;
|
|
47
|
+
matchCounts[fieldName] = domCount;
|
|
48
|
+
|
|
49
|
+
if (domCount === 0) {
|
|
27
50
|
results[fieldName] = null;
|
|
28
|
-
} else if (elements.length === 1) {
|
|
29
|
-
results[fieldName] = elements.text().trim();
|
|
30
51
|
} else {
|
|
31
|
-
|
|
52
|
+
// Apply max_results cap if specified
|
|
53
|
+
if (max_results != null && max_results > 0 && domCount > max_results) {
|
|
54
|
+
elements = elements.slice(0, max_results);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
const extract = (el) => {
|
|
58
|
+
if (attribute) {
|
|
59
|
+
return $(el).attr(attribute) ?? null;
|
|
60
|
+
}
|
|
61
|
+
return $(el).text().trim();
|
|
62
|
+
};
|
|
63
|
+
|
|
64
|
+
if (elements.length === 1) {
|
|
65
|
+
results[fieldName] = extract(elements.get(0));
|
|
66
|
+
} else {
|
|
67
|
+
results[fieldName] = elements.map((_, el) => extract(el)).get();
|
|
68
|
+
}
|
|
32
69
|
}
|
|
33
70
|
} catch (selectorError) {
|
|
34
71
|
results[fieldName] = {
|
|
35
|
-
error: `Invalid selector: ${
|
|
72
|
+
error: `Invalid selector: ${rawSelector}`,
|
|
36
73
|
message: selectorError.message
|
|
37
74
|
};
|
|
75
|
+
matchCounts[fieldName] = 0;
|
|
38
76
|
}
|
|
39
77
|
}
|
|
40
78
|
|
|
@@ -44,7 +82,7 @@ export async function scrapeStructuredHandler({ url, selectors }) {
|
|
|
44
82
|
text: JSON.stringify({
|
|
45
83
|
data: results,
|
|
46
84
|
selectors_used: selectors,
|
|
47
|
-
elements_found:
|
|
85
|
+
elements_found: matchCounts,
|
|
48
86
|
url: response.url
|
|
49
87
|
}, null, 2)
|
|
50
88
|
}]
|
|
@@ -14,6 +14,7 @@ const CrawlDeepSchema = z.object({
|
|
|
14
14
|
follow_external: z.boolean().optional().default(false),
|
|
15
15
|
respect_robots: z.boolean().optional().default(true),
|
|
16
16
|
extract_content: z.boolean().optional().default(true),
|
|
17
|
+
content_max_length: z.number().min(1).max(100000).optional().default(500),
|
|
17
18
|
concurrency: z.number().min(1).max(20).optional().default(10),
|
|
18
19
|
enable_link_analysis: z.boolean().optional().default(true),
|
|
19
20
|
link_analysis_options: z.object({
|
|
@@ -217,7 +218,7 @@ export class CrawlDeepTool {
|
|
|
217
218
|
errors: results.errors.length,
|
|
218
219
|
duration_ms: duration,
|
|
219
220
|
pages_per_second: results.urls.length / (duration / 1000),
|
|
220
|
-
results: this.formatResults(results.results, validated.extract_content),
|
|
221
|
+
results: this.formatResults(results.results, validated.extract_content, validated.content_max_length),
|
|
221
222
|
errors: results.errors,
|
|
222
223
|
stats: results.stats,
|
|
223
224
|
site_structure: this.analyzeSiteStructure(results.urls),
|
|
@@ -240,7 +241,7 @@ export class CrawlDeepTool {
|
|
|
240
241
|
}
|
|
241
242
|
}
|
|
242
243
|
|
|
243
|
-
formatResults(results, includeContent) {
|
|
244
|
+
formatResults(results, includeContent, contentMaxLength = 500) {
|
|
244
245
|
return results.map(result => {
|
|
245
246
|
const formatted = {
|
|
246
247
|
url: result.url,
|
|
@@ -250,12 +251,19 @@ export class CrawlDeepTool {
|
|
|
250
251
|
content_length: result.contentLength,
|
|
251
252
|
timestamp: result.timestamp
|
|
252
253
|
};
|
|
253
|
-
|
|
254
|
+
|
|
254
255
|
if (includeContent) {
|
|
255
|
-
|
|
256
|
+
const raw = result.content || '';
|
|
257
|
+
if (raw.length > contentMaxLength) {
|
|
258
|
+
formatted.content = raw.substring(0, contentMaxLength);
|
|
259
|
+
formatted.truncated = true;
|
|
260
|
+
} else {
|
|
261
|
+
formatted.content = raw;
|
|
262
|
+
formatted.truncated = false;
|
|
263
|
+
}
|
|
256
264
|
formatted.metadata = result.metadata;
|
|
257
265
|
}
|
|
258
|
-
|
|
266
|
+
|
|
259
267
|
return formatted;
|
|
260
268
|
});
|
|
261
269
|
}
|
|
@@ -3,6 +3,7 @@ import { load } from 'cheerio';
|
|
|
3
3
|
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
4
4
|
import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
|
|
5
5
|
import { CacheManager } from '../../core/cache/CacheManager.js';
|
|
6
|
+
import { SitemapParser } from '../../utils/sitemapParser.js';
|
|
6
7
|
|
|
7
8
|
const MapSiteSchema = z.object({
|
|
8
9
|
url: z.string().url(),
|
|
@@ -33,6 +34,7 @@ export class MapSiteTool {
|
|
|
33
34
|
this.timeout = timeout;
|
|
34
35
|
// Per-session result cache: avoids redundant site maps for the same root URL
|
|
35
36
|
this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
|
|
37
|
+
this.sitemapParser = new SitemapParser({ userAgent, timeout, enableCaching: cacheEnabled, cacheTTL });
|
|
36
38
|
}
|
|
37
39
|
|
|
38
40
|
async execute(params) {
|
|
@@ -131,61 +133,32 @@ export class MapSiteTool {
|
|
|
131
133
|
}
|
|
132
134
|
|
|
133
135
|
async fetchSitemapUrls(baseUrl, domainFilter = null) {
|
|
136
|
+
// Discover sitemaps via robots.txt and common paths, then parse with full
|
|
137
|
+
// SitemapParser support (sitemap-index recursion, gzip, CDATA/entities).
|
|
138
|
+
const discovered = await this.sitemapParser.discoverSitemaps(baseUrl, {
|
|
139
|
+
checkRobotsTxt: true,
|
|
140
|
+
checkCommonPaths: true,
|
|
141
|
+
checkSitemapIndex: false
|
|
142
|
+
});
|
|
143
|
+
|
|
134
144
|
const urls = new Set();
|
|
135
|
-
const
|
|
136
|
-
`${baseUrl}/sitemap.xml`,
|
|
137
|
-
`${baseUrl}/sitemap_index.xml`,
|
|
138
|
-
`${baseUrl}/sitemap-index.xml`,
|
|
139
|
-
`${baseUrl}/sitemaps.xml`
|
|
140
|
-
];
|
|
141
|
-
|
|
142
|
-
for (const sitemapUrl of sitemapUrls) {
|
|
145
|
+
for (const sitemapUrl of discovered) {
|
|
143
146
|
try {
|
|
144
|
-
const
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
147
|
+
const parsed = await this.sitemapParser.parseSitemap(sitemapUrl, {
|
|
148
|
+
includeMetadata: false,
|
|
149
|
+
followIndexes: true
|
|
150
|
+
});
|
|
151
|
+
if (parsed.success) {
|
|
152
|
+
for (const entry of parsed.urls) {
|
|
153
|
+
const url = entry.loc || entry;
|
|
151
154
|
if (!domainFilter || domainFilter.isAllowed(url).allowed) {
|
|
152
155
|
urls.add(url);
|
|
153
156
|
}
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
// If we found a sitemap, don't try others
|
|
157
|
-
if (urls.size > 0) break;
|
|
157
|
+
}
|
|
158
158
|
}
|
|
159
|
+
if (urls.size > 0) break;
|
|
159
160
|
} catch {
|
|
160
|
-
// Continue to next sitemap
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
return Array.from(urls);
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
parseSitemap(xml) {
|
|
168
|
-
const urls = new Set();
|
|
169
|
-
|
|
170
|
-
// Extract URLs from sitemap
|
|
171
|
-
const urlMatches = xml.match(/<loc>([^<]+)<\/loc>/g);
|
|
172
|
-
if (urlMatches) {
|
|
173
|
-
urlMatches.forEach(match => {
|
|
174
|
-
const url = match.replace(/<\/?loc>/g, '').trim();
|
|
175
|
-
if (url) urls.add(url);
|
|
176
|
-
});
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
// Check for nested sitemaps (sitemap index)
|
|
180
|
-
const sitemapMatches = xml.match(/<sitemap>[\s\S]*?<\/sitemap>/g);
|
|
181
|
-
if (sitemapMatches) {
|
|
182
|
-
for (const sitemapMatch of sitemapMatches) {
|
|
183
|
-
const locMatch = sitemapMatch.match(/<loc>([^<]+)<\/loc>/);
|
|
184
|
-
if (locMatch && locMatch[1]) {
|
|
185
|
-
// We could recursively fetch nested sitemaps here
|
|
186
|
-
// For now, just add the sitemap URL itself
|
|
187
|
-
urls.add(locMatch[1]);
|
|
188
|
-
}
|
|
161
|
+
// Continue to next discovered sitemap
|
|
189
162
|
}
|
|
190
163
|
}
|
|
191
164
|
|
|
@@ -362,7 +335,7 @@ export class MapSiteTool {
|
|
|
362
335
|
max_depth: 0,
|
|
363
336
|
average_depth: 0,
|
|
364
337
|
url_lengths: {
|
|
365
|
-
min:
|
|
338
|
+
min: null,
|
|
366
339
|
max: 0,
|
|
367
340
|
average: 0
|
|
368
341
|
}
|
|
@@ -374,7 +347,7 @@ export class MapSiteTool {
|
|
|
374
347
|
for (const url of urls) {
|
|
375
348
|
try {
|
|
376
349
|
const urlObj = new URL(url);
|
|
377
|
-
|
|
350
|
+
|
|
378
351
|
// Count secure URLs
|
|
379
352
|
if (urlObj.protocol === 'https:') {
|
|
380
353
|
stats.secure_urls++;
|
|
@@ -396,7 +369,7 @@ export class MapSiteTool {
|
|
|
396
369
|
// Track URL lengths
|
|
397
370
|
const length = url.length;
|
|
398
371
|
totalLength += length;
|
|
399
|
-
stats.url_lengths.min = Math.min(stats.url_lengths.min, length);
|
|
372
|
+
stats.url_lengths.min = stats.url_lengths.min === null ? length : Math.min(stats.url_lengths.min, length);
|
|
400
373
|
stats.url_lengths.max = Math.max(stats.url_lengths.max, length);
|
|
401
374
|
|
|
402
375
|
// Track file extensions
|
|
@@ -266,9 +266,9 @@ export class AnalyzeContentTool {
|
|
|
266
266
|
};
|
|
267
267
|
|
|
268
268
|
const keywordStr = keywords.join(' ').toLowerCase();
|
|
269
|
-
|
|
269
|
+
|
|
270
270
|
for (const [category, categoryKeywords] of Object.entries(categories)) {
|
|
271
|
-
const matches = categoryKeywords.filter(word =>
|
|
271
|
+
const matches = categoryKeywords.filter(word => new RegExp(`\\b${word}\\b`).test(keywordStr));
|
|
272
272
|
if (matches.length > 0) {
|
|
273
273
|
return category;
|
|
274
274
|
}
|
|
@@ -394,13 +394,18 @@ export class AnalyzeContentTool {
|
|
|
394
394
|
anticipation: ['excited', 'eager', 'looking forward', 'anticipating', 'expecting']
|
|
395
395
|
};
|
|
396
396
|
|
|
397
|
-
const
|
|
397
|
+
const lowerText = text.toLowerCase();
|
|
398
|
+
const words = lowerText.split(/\s+/);
|
|
398
399
|
const emotions = [];
|
|
399
400
|
|
|
400
401
|
for (const [emotion, emotionKeywords] of Object.entries(emotionWords)) {
|
|
401
|
-
const
|
|
402
|
-
|
|
403
|
-
const
|
|
402
|
+
const matchCount = emotionKeywords.reduce((count, keyword) => {
|
|
403
|
+
const re = new RegExp(`\\b${keyword}\\b`, 'g');
|
|
404
|
+
const found = lowerText.match(re);
|
|
405
|
+
return count + (found ? found.length : 0);
|
|
406
|
+
}, 0);
|
|
407
|
+
if (matchCount > 0) {
|
|
408
|
+
const intensity = Math.min(1, matchCount / Math.max(words.length / 100, 1));
|
|
404
409
|
emotions.push({
|
|
405
410
|
emotion,
|
|
406
411
|
intensity: Math.round(intensity * 100) / 100
|