crawlforge-mcp-server 4.2.12 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +19 -7
- package/README.md +11 -3
- package/package.json +3 -2
- package/server.js +195 -22
- package/src/cli/commands/init.js +107 -0
- package/src/cli/index.js +2 -0
- package/src/constants/config.js +5 -0
- package/src/core/ActionExecutor.js +13 -1
- package/src/core/AgentOrchestrator.js +300 -0
- package/src/core/AuthManager.js +21 -1
- package/src/core/ChangeTracker.js +8 -5
- package/src/core/LLMsTxtAnalyzer.js +71 -47
- package/src/core/LocalizationManager.js +7 -4
- package/src/core/ResearchOrchestrator.js +10 -6
- package/src/core/StealthBrowserManager.js +52 -13
- package/src/core/analysis/ContentAnalyzer.js +2 -2
- package/src/core/crawlers/BFSCrawler.js +23 -12
- package/src/core/processing/ContentProcessor.js +19 -3
- package/src/core/processing/PDFProcessor.js +72 -23
- package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
- package/src/tools/advanced/batchScrape/index.js +3 -1
- package/src/tools/advanced/batchScrape/reporter.js +5 -1
- package/src/tools/advanced/batchScrape/worker.js +6 -1
- package/src/tools/agent/agent.js +71 -0
- package/src/tools/basic/_fetch.js +78 -5
- package/src/tools/basic/extractLinks.js +1 -1
- package/src/tools/basic/extractMetadata.js +65 -1
- package/src/tools/basic/extractText.js +73 -5
- package/src/tools/basic/scrapeStructured.js +48 -10
- package/src/tools/crawl/crawlDeep.js +13 -5
- package/src/tools/crawl/mapSite.js +53 -52
- package/src/tools/extract/analyzeContent.js +11 -6
- package/src/tools/extract/extractContent.js +23 -5
- package/src/tools/extract/extractStructured.js +65 -16
- package/src/tools/extract/extractWithLlm.js +192 -11
- package/src/tools/extract/listOllamaModels.js +19 -8
- package/src/tools/extract/processDocument.js +10 -4
- package/src/tools/extract/summarizeContent.js +58 -1
- package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
- package/src/tools/research/deepResearch.js +43 -4
- package/src/tools/scrape/unifiedScrape.js +314 -0
- package/src/tools/search/providers/searxng.js +2 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
- package/src/tools/search/ranking/ResultRanker.js +13 -4
- package/src/tools/search/searchWeb.js +5 -5
- package/src/tools/templates/TemplateRegistry.js +3 -2
- package/src/tools/tracking/trackChanges/differ.js +33 -1
- package/src/utils/htmlToMarkdown.js +5 -1
|
@@ -3,28 +3,43 @@
|
|
|
3
3
|
* Applies an AbortController timeout and a default User-Agent.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
+
import { config } from '../../constants/config.js';
|
|
7
|
+
import { createRequire } from 'module';
|
|
8
|
+
|
|
9
|
+
// Derive User-Agent from package version so it reflects the actual release.
|
|
10
|
+
const _require = createRequire(import.meta.url);
|
|
11
|
+
const _pkg = _require('../../../package.json');
|
|
12
|
+
const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
|
|
13
|
+
|
|
6
14
|
/**
|
|
7
|
-
* Fetch a URL with a configurable timeout.
|
|
15
|
+
* Fetch a URL with a configurable timeout and body-size cap.
|
|
16
|
+
*
|
|
17
|
+
* Content-Length is checked before the body is read; if absent or lying, the
|
|
18
|
+
* accumulated byte count is checked during streaming. Both checks use the
|
|
19
|
+
* configurable cap from config.fetch.maxBodySize (env MAX_FETCH_BODY_SIZE,
|
|
20
|
+
* default 25 MB).
|
|
21
|
+
*
|
|
8
22
|
* @param {string} url
|
|
9
23
|
* @param {{ timeout?: number, headers?: Record<string,string> }} [options]
|
|
10
|
-
* @returns {Promise<Response>}
|
|
24
|
+
* @returns {Promise<Response & { _body: string }>}
|
|
11
25
|
*/
|
|
12
26
|
export async function fetchWithTimeout(url, options = {}) {
|
|
13
27
|
const { timeout = 10000, headers = {} } = options;
|
|
28
|
+
const maxBodySize = config.fetch.maxBodySize;
|
|
14
29
|
|
|
15
30
|
const controller = new AbortController();
|
|
16
31
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
17
32
|
|
|
33
|
+
let response;
|
|
18
34
|
try {
|
|
19
|
-
|
|
35
|
+
response = await fetch(url, {
|
|
20
36
|
signal: controller.signal,
|
|
21
37
|
headers: {
|
|
22
|
-
'User-Agent':
|
|
38
|
+
'User-Agent': CRAWLFORGE_UA,
|
|
23
39
|
...headers
|
|
24
40
|
}
|
|
25
41
|
});
|
|
26
42
|
clearTimeout(timeoutId);
|
|
27
|
-
return response;
|
|
28
43
|
} catch (error) {
|
|
29
44
|
clearTimeout(timeoutId);
|
|
30
45
|
if (error.name === 'AbortError') {
|
|
@@ -32,4 +47,62 @@ export async function fetchWithTimeout(url, options = {}) {
|
|
|
32
47
|
}
|
|
33
48
|
throw error;
|
|
34
49
|
}
|
|
50
|
+
|
|
51
|
+
// --- Body-size cap ---
|
|
52
|
+
|
|
53
|
+
// Early rejection via Content-Length (servers may omit or lie — guard below
|
|
54
|
+
// handles that case). Optional-chained so non-standard responses (e.g. test
|
|
55
|
+
// mocks) without a Headers object don't throw.
|
|
56
|
+
const contentLengthHeader = response.headers?.get?.('content-length') ?? null;
|
|
57
|
+
if (contentLengthHeader !== null) {
|
|
58
|
+
const declared = parseInt(contentLengthHeader, 10);
|
|
59
|
+
if (!isNaN(declared) && declared > maxBodySize) {
|
|
60
|
+
throw new Error(
|
|
61
|
+
`Response body too large: Content-Length ${declared} exceeds limit of ${maxBodySize} bytes`
|
|
62
|
+
);
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// Only the streaming byte-count guard requires a readable body. Responses
|
|
67
|
+
// without a ReadableStream body (already-buffered responses, test mocks)
|
|
68
|
+
// are returned unchanged so callers' native .text()/.json() still work.
|
|
69
|
+
if (!response.body || typeof response.body.getReader !== 'function') {
|
|
70
|
+
return response;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// Stream the body and abort if accumulated bytes exceed the cap.
|
|
74
|
+
const reader = response.body.getReader();
|
|
75
|
+
const chunks = [];
|
|
76
|
+
let totalBytes = 0;
|
|
77
|
+
|
|
78
|
+
while (true) {
|
|
79
|
+
const { done, value } = await reader.read();
|
|
80
|
+
if (done) break;
|
|
81
|
+
totalBytes += value.byteLength;
|
|
82
|
+
if (totalBytes > maxBodySize) {
|
|
83
|
+
reader.cancel();
|
|
84
|
+
throw new Error(
|
|
85
|
+
`Response body too large: exceeded limit of ${maxBodySize} bytes`
|
|
86
|
+
);
|
|
87
|
+
}
|
|
88
|
+
chunks.push(value);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Reassemble and expose as a response-like object that callers can use.
|
|
92
|
+
const bodyText = new TextDecoder().decode(
|
|
93
|
+
chunks.reduce((acc, chunk) => {
|
|
94
|
+
const merged = new Uint8Array(acc.byteLength + chunk.byteLength);
|
|
95
|
+
merged.set(acc, 0);
|
|
96
|
+
merged.set(chunk, acc.byteLength);
|
|
97
|
+
return merged;
|
|
98
|
+
}, new Uint8Array(0))
|
|
99
|
+
);
|
|
100
|
+
|
|
101
|
+
// Attach the pre-read text so callers can call .text() on the result.
|
|
102
|
+
// We wrap it in a minimal compatible object.
|
|
103
|
+
return Object.assign(response, {
|
|
104
|
+
text: () => Promise.resolve(bodyText),
|
|
105
|
+
json: () => Promise.resolve(JSON.parse(bodyText)),
|
|
106
|
+
_body: bodyText
|
|
107
|
+
});
|
|
35
108
|
}
|
|
@@ -41,7 +41,7 @@ export async function extractLinksHandler({ url, filter_external, base_url }) {
|
|
|
41
41
|
isExternal = false;
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
if (filter_external && isExternal) return;
|
|
44
|
+
if (filter_external && !isExternal) return;
|
|
45
45
|
|
|
46
46
|
links.push({ href: absoluteUrl, text, is_external: isExternal, original_href: href });
|
|
47
47
|
} catch {
|
|
@@ -1,11 +1,64 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* extract_metadata — Extract page metadata (title, description, OG tags, etc.).
|
|
3
3
|
* Extracted from server.js inline handler.
|
|
4
|
+
* B1: Parse JSON-LD and microdata; stronger title fallback chain (og:title → <title> → h1).
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
7
|
import { load } from 'cheerio';
|
|
7
8
|
import { fetchWithTimeout } from './_fetch.js';
|
|
8
9
|
|
|
10
|
+
/**
 * Parse all JSON-LD blocks from the document.
 *
 * Every `<script type="application/ld+json">` element is parsed on its own, so
 * one empty or malformed block never prevents the remaining blocks from being
 * returned. Pages sometimes wrap the JSON payload in an HTML comment
 * (`<!-- ... -->`) or a CDATA section (`//<![CDATA[ ... //]]>`); those wrappers
 * are removed before parsing because they would otherwise make `JSON.parse`
 * fail and the block would be dropped.
 *
 * @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
 * @returns {Array} parsed JSON-LD values in document order (invalid blocks omitted)
 */
function parseJsonLd($) {
  const results = [];

  $('script[type="application/ld+json"]').each((_, el) => {
    const raw = $(el).html();
    if (!raw) return;

    // Peel off optional CDATA / HTML-comment wrappers around the payload.
    const json = raw
      .trim()
      .replace(/^(?:\/\/\s*)?<!\[CDATA\[/, '')
      .replace(/(?:\/\/\s*)?\]\]>$/, '')
      .replace(/^<!--/, '')
      .replace(/-->$/, '')
      .trim();
    if (!json) return;

    try {
      results.push(JSON.parse(json));
    } catch {
      // Deliberately best-effort: skip blocks that are not valid JSON.
    }
  });

  return results;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Parse microdata items (elements with itemscope).
 *
 * One result object is produced per `[itemscope]` element, in document order.
 * Each result carries the element's `itemtype` (or null) and a `properties`
 * map from `itemprop` name to the list of values found for it.
 *
 * A property is attributed only to its nearest `itemscope` ancestor. Without
 * that check the properties of a nested item would also be reported on every
 * enclosing item, because `find('[itemprop]')` returns all descendants.
 *
 * Value extraction by element type:
 *   - meta          -> `content` attribute
 *   - a, link       -> `href` attribute
 *   - img           -> `src` attribute
 *   - time          -> `datetime` attribute, falling back to trimmed text
 *   - anything else -> trimmed text content
 * Empty or missing values are skipped.
 *
 * @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
 * @returns {Array} list of `{ type, properties }` objects
 */
function parseMicrodata($) {
  const results = [];

  $('[itemscope]').each((_, el) => {
    const $el = $(el);
    const item = {
      type: $el.attr('itemtype') || null,
      properties: {}
    };

    $el.find('[itemprop]').each((__, prop) => {
      const $prop = $(prop);

      // Start the lookup at the parent: an element that is itself a nested
      // item (itemprop + itemscope) is still a property of the enclosing item.
      const owner = $prop.parent().closest('[itemscope]').get(0);
      if (owner !== el) return;

      const name = $prop.attr('itemprop');
      if (!name) return;

      const tag = ($prop.get(0).tagName || '').toLowerCase();
      let value;
      switch (tag) {
        case 'meta':
          value = $prop.attr('content');
          break;
        case 'a':
        case 'link':
          value = $prop.attr('href');
          break;
        case 'img':
          value = $prop.attr('src');
          break;
        case 'time':
          value = $prop.attr('datetime') || $prop.text().trim();
          break;
        default:
          value = $prop.text().trim();
      }

      if (value) {
        if (!item.properties[name]) item.properties[name] = [];
        item.properties[name].push(value);
      }
    });

    results.push(item);
  });

  return results;
}
|
|
61
|
+
|
|
9
62
|
/**
|
|
10
63
|
* @param {{ url: string }} params
|
|
11
64
|
*/
|
|
@@ -19,7 +72,13 @@ export async function extractMetadataHandler({ url }) {
|
|
|
19
72
|
const html = await response.text();
|
|
20
73
|
const $ = load(html);
|
|
21
74
|
|
|
22
|
-
|
|
75
|
+
// Stronger title fallback: og:title → <title> → h1
|
|
76
|
+
const title =
|
|
77
|
+
$('meta[property="og:title"]').attr('content') ||
|
|
78
|
+
$('title').text().trim() ||
|
|
79
|
+
$('h1').first().text().trim() ||
|
|
80
|
+
'';
|
|
81
|
+
|
|
23
82
|
const description =
|
|
24
83
|
$('meta[name="description"]').attr('content') ||
|
|
25
84
|
$('meta[property="og:description"]').attr('content') || '';
|
|
@@ -47,6 +106,9 @@ export async function extractMetadataHandler({ url }) {
|
|
|
47
106
|
$('meta[charset]').attr('charset') ||
|
|
48
107
|
$('meta[http-equiv="Content-Type"]').attr('content') || '';
|
|
49
108
|
|
|
109
|
+
const jsonLd = parseJsonLd($);
|
|
110
|
+
const microdata = parseMicrodata($);
|
|
111
|
+
|
|
50
112
|
return {
|
|
51
113
|
content: [{
|
|
52
114
|
type: 'text',
|
|
@@ -61,6 +123,8 @@ export async function extractMetadataHandler({ url }) {
|
|
|
61
123
|
charset,
|
|
62
124
|
og_tags: ogTags,
|
|
63
125
|
twitter_tags: twitterTags,
|
|
126
|
+
json_ld: jsonLd,
|
|
127
|
+
microdata,
|
|
64
128
|
url: response.url
|
|
65
129
|
}, null, 2)
|
|
66
130
|
}]
|
|
@@ -2,12 +2,77 @@
|
|
|
2
2
|
* extract_text — Extract clean text content from HTML.
|
|
3
3
|
* Extracted from server.js inline handler.
|
|
4
4
|
* D3.1: Added output_format:"markdown" option backed by Turndown.
|
|
5
|
+
* B1: Preserve block structure for text mode; use Readability + GFM for markdown mode.
|
|
5
6
|
*/
|
|
6
7
|
|
|
7
8
|
import { load } from 'cheerio';
|
|
9
|
+
import { JSDOM } from 'jsdom';
|
|
10
|
+
import { Readability } from '@mozilla/readability';
|
|
8
11
|
import { fetchWithTimeout } from './_fetch.js';
|
|
9
12
|
import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
|
|
10
13
|
|
|
14
|
+
// Block-level elements whose boundaries should become paragraph breaks
const BLOCK_ELEMENTS = new Set([
  'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'li', 'blockquote', 'pre', 'td', 'th', 'dt', 'dd',
  'article', 'section', 'figure', 'figcaption', 'aside',
  'header', 'footer', 'main', 'nav', 'form', 'fieldset',
  'table', 'tr', 'caption'
]);

/**
 * Extract plain text from a cheerio root preserving block-element paragraph breaks.
 *
 * The `<body>` subtree is walked depth-first. Runs of whitespace inside a text
 * node collapse to one space, every element listed in BLOCK_ELEMENTS is
 * surrounded by a blank line, and `<br>` becomes a single line break. Nodes
 * whose type is neither 'text' nor 'tag' are ignored.
 *
 * Whitespace-only text nodes are kept as a single space so that words in
 * adjacent inline elements (`<a>Hello</a> <a>World</a>`) are not glued
 * together; spaces that end up next to a line break are removed afterwards.
 *
 * @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
 * @returns {string} extracted text, or '' when the document has no body
 */
export function extractBlockText($) {
  const parts = [];

  function walk(node) {
    if (node.type === 'text') {
      const collapsed = node.data.replace(/[ \t\r\n]+/g, ' ');
      if (collapsed) parts.push(collapsed);
      return;
    }
    if (node.type !== 'tag') return;

    const tag = node.tagName ? node.tagName.toLowerCase() : '';
    if (tag === 'br') {
      parts.push('\n');
      return;
    }

    const isBlock = BLOCK_ELEMENTS.has(tag);
    if (isBlock) parts.push('\n\n');
    for (const child of (node.children || [])) {
      walk(child);
    }
    if (isBlock) parts.push('\n\n');
  }

  const body = $('body').get(0);
  if (body) {
    for (const child of (body.children || [])) walk(child);
  }

  return parts
    .join('')
    .replace(/ {2,}/g, ' ')       // adjacent text nodes can each contribute a space
    .replace(/ ?\n ?/g, '\n')     // no leading/trailing spaces on a line
    .replace(/\n{3,}/g, '\n\n')   // at most one blank line between paragraphs
    .trim();
}
|
|
54
|
+
|
|
55
|
+
/**
 * Convert raw HTML to GFM markdown using Readability + Turndown.
 *
 * The HTML is loaded into a JSDOM document and handed to Readability. When
 * Readability yields article content, that content is converted; otherwise
 * the original HTML is converted as-is. This covers three cases: JSDOM or
 * Readability throws, `parse()` returns null, or the parsed article has no
 * content. The JSDOM window is always closed afterwards so the document's
 * resources are released.
 *
 * @param {string} html - raw HTML
 * @param {string} pageUrl - URL of the page (passed to JSDOM as the document URL)
 * @returns {string} markdown produced by htmlToMarkdown
 */
export function readabilityToMarkdown(html, pageUrl) {
  let articleHtml = html;
  let dom;

  try {
    dom = new JSDOM(html, { url: pageUrl });
    const article = new Readability(dom.window.document).parse();
    // Guard against a null article and against null/empty content.
    if (article && article.content) {
      articleHtml = article.content;
    }
  } catch {
    // Best-effort extraction: fall back to converting the full page.
    articleHtml = html;
  } finally {
    // `dom` stays undefined when the JSDOM constructor itself threw.
    if (dom) dom.window.close();
  }

  return htmlToMarkdown(articleHtml);
}
|
|
75
|
+
|
|
11
76
|
/**
|
|
12
77
|
* @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean, output_format?: "text"|"markdown" }} params
|
|
13
78
|
*/
|
|
@@ -26,20 +91,23 @@ export async function extractTextHandler({ url, remove_scripts, remove_styles, o
|
|
|
26
91
|
|
|
27
92
|
$('nav, header, footer, aside, .advertisement, .ad, .sidebar').remove();
|
|
28
93
|
|
|
29
|
-
const text = $('body').text().replace(/\s+/g, ' ').trim();
|
|
30
|
-
|
|
31
94
|
const result = {
|
|
32
|
-
word_count: text.split(/\s+/).filter(w => w.length > 0).length,
|
|
33
|
-
char_count: text.length,
|
|
34
95
|
url: response.url
|
|
35
96
|
};
|
|
36
97
|
|
|
37
98
|
if (output_format === 'markdown') {
|
|
38
|
-
|
|
99
|
+
// Run Readability first to get main content, then convert to GFM markdown
|
|
100
|
+
result.markdown = readabilityToMarkdown(html, response.url);
|
|
39
101
|
result.output_format = 'markdown';
|
|
102
|
+
const plainText = result.markdown.replace(/[#*`_\[\]]/g, '').replace(/\s+/g, ' ').trim();
|
|
103
|
+
result.word_count = plainText.split(/\s+/).filter(w => w.length > 0).length;
|
|
104
|
+
result.char_count = plainText.length;
|
|
40
105
|
} else {
|
|
106
|
+
const text = extractBlockText($);
|
|
41
107
|
result.text = text;
|
|
42
108
|
result.output_format = 'text';
|
|
109
|
+
result.word_count = text.split(/\s+/).filter(w => w.length > 0).length;
|
|
110
|
+
result.char_count = text.length;
|
|
43
111
|
}
|
|
44
112
|
|
|
45
113
|
return {
|
|
@@ -1,15 +1,33 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* scrape_structured — Extract structured data using CSS selectors.
|
|
3
3
|
* Extracted from server.js inline handler.
|
|
4
|
+
* B1: Support attribute extraction (selector@attr), add max_results,
|
|
5
|
+
* fix elements_found to report real per-field DOM match counts.
|
|
4
6
|
*/
|
|
5
7
|
|
|
6
8
|
import { load } from 'cheerio';
|
|
7
9
|
import { fetchWithTimeout } from './_fetch.js';
|
|
8
10
|
|
|
9
11
|
// Characters that cannot appear in the attribute suffix of a "css@attr" spec.
// Quotes, brackets, parentheses, '=' and whitespace indicate that the '@'
// was part of the CSS selector itself (e.g. inside an attribute value).
const SELECTOR_ATTRIBUTE_SUFFIX = /^[^\s"'<>\/=\[\]()]+$/;

/**
 * Parse a selector string that may include an attribute suffix: "css@attr"
 *   e.g. "a.link@href"                -> { selector: "a.link", attribute: "href" }
 *        "img@src"                    -> { selector: "img", attribute: "src" }
 *        "h1"                         -> { selector: "h1", attribute: null }
 *        'a[href*="@example.com"]'    -> selector unchanged, attribute: null
 *
 * The split happens at the LAST '@', and only when the text after it looks
 * like an attribute name. This keeps selectors whose attribute values contain
 * '@' (mailto links, e-mail addresses) intact instead of cutting them in half.
 * A leading '@' is never treated as a separator because it would leave an
 * empty selector. A trailing '@' is dropped and yields no attribute.
 *
 * @param {string} raw
 * @returns {{ selector: string, attribute: string|null }}
 */
function parseSelectorSpec(raw) {
  const atIdx = raw.lastIndexOf('@');
  if (atIdx <= 0) {
    return { selector: raw, attribute: null };
  }

  const selector = raw.slice(0, atIdx);
  const suffix = raw.slice(atIdx + 1);

  if (suffix === '') {
    // "img@": nothing to extract, so callers fall back to text extraction.
    return { selector, attribute: null };
  }
  if (!SELECTOR_ATTRIBUTE_SUFFIX.test(suffix)) {
    // The '@' belongs to the selector (e.g. inside [href*="@..."]).
    return { selector: raw, attribute: null };
  }
  return { selector, attribute: suffix };
}
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* @param {{ url: string, selectors: Record<string, string>, max_results?: number }} params
|
|
29
|
+
*/
|
|
30
|
+
export async function scrapeStructuredHandler({ url, selectors, max_results }) {
|
|
13
31
|
try {
|
|
14
32
|
const response = await fetchWithTimeout(url);
|
|
15
33
|
if (!response.ok) {
|
|
@@ -19,22 +37,42 @@ export async function scrapeStructuredHandler({ url, selectors }) {
|
|
|
19
37
|
const html = await response.text();
|
|
20
38
|
const $ = load(html);
|
|
21
39
|
const results = {};
|
|
40
|
+
const matchCounts = {};
|
|
22
41
|
|
|
23
|
-
for (const [fieldName,
|
|
42
|
+
for (const [fieldName, rawSelector] of Object.entries(selectors)) {
|
|
24
43
|
try {
|
|
25
|
-
const
|
|
26
|
-
|
|
44
|
+
const { selector, attribute } = parseSelectorSpec(rawSelector);
|
|
45
|
+
let elements = $(selector);
|
|
46
|
+
const domCount = elements.length;
|
|
47
|
+
matchCounts[fieldName] = domCount;
|
|
48
|
+
|
|
49
|
+
if (domCount === 0) {
|
|
27
50
|
results[fieldName] = null;
|
|
28
|
-
} else if (elements.length === 1) {
|
|
29
|
-
results[fieldName] = elements.text().trim();
|
|
30
51
|
} else {
|
|
31
|
-
|
|
52
|
+
// Apply max_results cap if specified
|
|
53
|
+
if (max_results != null && max_results > 0 && domCount > max_results) {
|
|
54
|
+
elements = elements.slice(0, max_results);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
const extract = (el) => {
|
|
58
|
+
if (attribute) {
|
|
59
|
+
return $(el).attr(attribute) ?? null;
|
|
60
|
+
}
|
|
61
|
+
return $(el).text().trim();
|
|
62
|
+
};
|
|
63
|
+
|
|
64
|
+
if (elements.length === 1) {
|
|
65
|
+
results[fieldName] = extract(elements.get(0));
|
|
66
|
+
} else {
|
|
67
|
+
results[fieldName] = elements.map((_, el) => extract(el)).get();
|
|
68
|
+
}
|
|
32
69
|
}
|
|
33
70
|
} catch (selectorError) {
|
|
34
71
|
results[fieldName] = {
|
|
35
|
-
error: `Invalid selector: ${
|
|
72
|
+
error: `Invalid selector: ${rawSelector}`,
|
|
36
73
|
message: selectorError.message
|
|
37
74
|
};
|
|
75
|
+
matchCounts[fieldName] = 0;
|
|
38
76
|
}
|
|
39
77
|
}
|
|
40
78
|
|
|
@@ -44,7 +82,7 @@ export async function scrapeStructuredHandler({ url, selectors }) {
|
|
|
44
82
|
text: JSON.stringify({
|
|
45
83
|
data: results,
|
|
46
84
|
selectors_used: selectors,
|
|
47
|
-
elements_found:
|
|
85
|
+
elements_found: matchCounts,
|
|
48
86
|
url: response.url
|
|
49
87
|
}, null, 2)
|
|
50
88
|
}]
|
|
@@ -14,6 +14,7 @@ const CrawlDeepSchema = z.object({
|
|
|
14
14
|
follow_external: z.boolean().optional().default(false),
|
|
15
15
|
respect_robots: z.boolean().optional().default(true),
|
|
16
16
|
extract_content: z.boolean().optional().default(true),
|
|
17
|
+
content_max_length: z.number().min(1).max(100000).optional().default(500),
|
|
17
18
|
concurrency: z.number().min(1).max(20).optional().default(10),
|
|
18
19
|
enable_link_analysis: z.boolean().optional().default(true),
|
|
19
20
|
link_analysis_options: z.object({
|
|
@@ -217,7 +218,7 @@ export class CrawlDeepTool {
|
|
|
217
218
|
errors: results.errors.length,
|
|
218
219
|
duration_ms: duration,
|
|
219
220
|
pages_per_second: results.urls.length / (duration / 1000),
|
|
220
|
-
results: this.formatResults(results.results, validated.extract_content),
|
|
221
|
+
results: this.formatResults(results.results, validated.extract_content, validated.content_max_length),
|
|
221
222
|
errors: results.errors,
|
|
222
223
|
stats: results.stats,
|
|
223
224
|
site_structure: this.analyzeSiteStructure(results.urls),
|
|
@@ -240,7 +241,7 @@ export class CrawlDeepTool {
|
|
|
240
241
|
}
|
|
241
242
|
}
|
|
242
243
|
|
|
243
|
-
formatResults(results, includeContent) {
|
|
244
|
+
formatResults(results, includeContent, contentMaxLength = 500) {
|
|
244
245
|
return results.map(result => {
|
|
245
246
|
const formatted = {
|
|
246
247
|
url: result.url,
|
|
@@ -250,12 +251,19 @@ export class CrawlDeepTool {
|
|
|
250
251
|
content_length: result.contentLength,
|
|
251
252
|
timestamp: result.timestamp
|
|
252
253
|
};
|
|
253
|
-
|
|
254
|
+
|
|
254
255
|
if (includeContent) {
|
|
255
|
-
|
|
256
|
+
const raw = result.content || '';
|
|
257
|
+
if (raw.length > contentMaxLength) {
|
|
258
|
+
formatted.content = raw.substring(0, contentMaxLength);
|
|
259
|
+
formatted.truncated = true;
|
|
260
|
+
} else {
|
|
261
|
+
formatted.content = raw;
|
|
262
|
+
formatted.truncated = false;
|
|
263
|
+
}
|
|
256
264
|
formatted.metadata = result.metadata;
|
|
257
265
|
}
|
|
258
|
-
|
|
266
|
+
|
|
259
267
|
return formatted;
|
|
260
268
|
});
|
|
261
269
|
}
|
|
@@ -3,6 +3,15 @@ import { load } from 'cheerio';
|
|
|
3
3
|
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
4
4
|
import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
|
|
5
5
|
import { CacheManager } from '../../core/cache/CacheManager.js';
|
|
6
|
+
import { SitemapParser } from '../../utils/sitemapParser.js';
|
|
7
|
+
import { ResultRanker } from '../search/ranking/ResultRanker.js';
|
|
8
|
+
|
|
9
|
+
// Shared ResultRanker, created on first use and reused afterwards (lazy
// singleton) — avoids creating a CacheManager timer per request.
let _ranker = null;

/**
 * Return the module-wide ResultRanker, constructing it with caching disabled
 * the first time it is requested.
 */
function getRanker() {
  if (_ranker) {
    return _ranker;
  }
  _ranker = new ResultRanker({ cacheEnabled: false });
  return _ranker;
}
|
|
6
15
|
|
|
7
16
|
const MapSiteSchema = z.object({
|
|
8
17
|
url: z.string().url(),
|
|
@@ -17,7 +26,8 @@ const MapSiteSchema = z.object({
|
|
|
17
26
|
include_patterns: z.array(z.string()).optional().default([]),
|
|
18
27
|
exclude_patterns: z.array(z.string()).optional().default([])
|
|
19
28
|
}).optional(),
|
|
20
|
-
import_filter_config: z.string().optional() // JSON string of exported config
|
|
29
|
+
import_filter_config: z.string().optional(), // JSON string of exported config
|
|
30
|
+
search: z.string().optional() // when set, rank URLs by relevance and emit ranked_urls
|
|
21
31
|
});
|
|
22
32
|
|
|
23
33
|
export class MapSiteTool {
|
|
@@ -33,6 +43,7 @@ export class MapSiteTool {
|
|
|
33
43
|
this.timeout = timeout;
|
|
34
44
|
// Per-session result cache: avoids redundant site maps for the same root URL
|
|
35
45
|
this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
|
|
46
|
+
this.sitemapParser = new SitemapParser({ userAgent, timeout, enableCaching: cacheEnabled, cacheTTL });
|
|
36
47
|
}
|
|
37
48
|
|
|
38
49
|
async execute(params) {
|
|
@@ -118,6 +129,25 @@ export class MapSiteTool {
|
|
|
118
129
|
filter_stats: domainFilter ? domainFilter.getStats() : null
|
|
119
130
|
};
|
|
120
131
|
|
|
132
|
+
// Optional: rank URLs by relevance to a search string
|
|
133
|
+
if (validated.search) {
|
|
134
|
+
try {
|
|
135
|
+
const rankerInput = urlArray.map(url => {
|
|
136
|
+
let title = url;
|
|
137
|
+
try {
|
|
138
|
+
const { pathname } = new URL(url);
|
|
139
|
+
title = decodeURIComponent(pathname).replace(/[-_/]/g, ' ').trim();
|
|
140
|
+
} catch { /* keep raw url */ }
|
|
141
|
+
return { link: url, title, snippet: '' };
|
|
142
|
+
});
|
|
143
|
+
const ranked = await getRanker().rankResults(rankerInput, validated.search);
|
|
144
|
+
result.ranked_urls = ranked.map(r => ({ url: r.link, score: r.finalScore ?? 0 }));
|
|
145
|
+
} catch {
|
|
146
|
+
// ranking is best-effort; don't fail the whole call
|
|
147
|
+
result.ranked_urls = urlArray.map(u => ({ url: u, score: 0 }));
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
121
151
|
// Store in cache before returning
|
|
122
152
|
if (this.cache) {
|
|
123
153
|
const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
|
|
@@ -131,61 +161,32 @@ export class MapSiteTool {
|
|
|
131
161
|
}
|
|
132
162
|
|
|
133
163
|
async fetchSitemapUrls(baseUrl, domainFilter = null) {
|
|
164
|
+
// Discover sitemaps via robots.txt and common paths, then parse with full
|
|
165
|
+
// SitemapParser support (sitemap-index recursion, gzip, CDATA/entities).
|
|
166
|
+
const discovered = await this.sitemapParser.discoverSitemaps(baseUrl, {
|
|
167
|
+
checkRobotsTxt: true,
|
|
168
|
+
checkCommonPaths: true,
|
|
169
|
+
checkSitemapIndex: false
|
|
170
|
+
});
|
|
171
|
+
|
|
134
172
|
const urls = new Set();
|
|
135
|
-
const
|
|
136
|
-
`${baseUrl}/sitemap.xml`,
|
|
137
|
-
`${baseUrl}/sitemap_index.xml`,
|
|
138
|
-
`${baseUrl}/sitemap-index.xml`,
|
|
139
|
-
`${baseUrl}/sitemaps.xml`
|
|
140
|
-
];
|
|
141
|
-
|
|
142
|
-
for (const sitemapUrl of sitemapUrls) {
|
|
173
|
+
for (const sitemapUrl of discovered) {
|
|
143
174
|
try {
|
|
144
|
-
const
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
175
|
+
const parsed = await this.sitemapParser.parseSitemap(sitemapUrl, {
|
|
176
|
+
includeMetadata: false,
|
|
177
|
+
followIndexes: true
|
|
178
|
+
});
|
|
179
|
+
if (parsed.success) {
|
|
180
|
+
for (const entry of parsed.urls) {
|
|
181
|
+
const url = entry.loc || entry;
|
|
151
182
|
if (!domainFilter || domainFilter.isAllowed(url).allowed) {
|
|
152
183
|
urls.add(url);
|
|
153
184
|
}
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
// If we found a sitemap, don't try others
|
|
157
|
-
if (urls.size > 0) break;
|
|
185
|
+
}
|
|
158
186
|
}
|
|
187
|
+
if (urls.size > 0) break;
|
|
159
188
|
} catch {
|
|
160
|
-
// Continue to next sitemap
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
return Array.from(urls);
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
parseSitemap(xml) {
|
|
168
|
-
const urls = new Set();
|
|
169
|
-
|
|
170
|
-
// Extract URLs from sitemap
|
|
171
|
-
const urlMatches = xml.match(/<loc>([^<]+)<\/loc>/g);
|
|
172
|
-
if (urlMatches) {
|
|
173
|
-
urlMatches.forEach(match => {
|
|
174
|
-
const url = match.replace(/<\/?loc>/g, '').trim();
|
|
175
|
-
if (url) urls.add(url);
|
|
176
|
-
});
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
// Check for nested sitemaps (sitemap index)
|
|
180
|
-
const sitemapMatches = xml.match(/<sitemap>[\s\S]*?<\/sitemap>/g);
|
|
181
|
-
if (sitemapMatches) {
|
|
182
|
-
for (const sitemapMatch of sitemapMatches) {
|
|
183
|
-
const locMatch = sitemapMatch.match(/<loc>([^<]+)<\/loc>/);
|
|
184
|
-
if (locMatch && locMatch[1]) {
|
|
185
|
-
// We could recursively fetch nested sitemaps here
|
|
186
|
-
// For now, just add the sitemap URL itself
|
|
187
|
-
urls.add(locMatch[1]);
|
|
188
|
-
}
|
|
189
|
+
// Continue to next discovered sitemap
|
|
189
190
|
}
|
|
190
191
|
}
|
|
191
192
|
|
|
@@ -362,7 +363,7 @@ export class MapSiteTool {
|
|
|
362
363
|
max_depth: 0,
|
|
363
364
|
average_depth: 0,
|
|
364
365
|
url_lengths: {
|
|
365
|
-
min:
|
|
366
|
+
min: null,
|
|
366
367
|
max: 0,
|
|
367
368
|
average: 0
|
|
368
369
|
}
|
|
@@ -374,7 +375,7 @@ export class MapSiteTool {
|
|
|
374
375
|
for (const url of urls) {
|
|
375
376
|
try {
|
|
376
377
|
const urlObj = new URL(url);
|
|
377
|
-
|
|
378
|
+
|
|
378
379
|
// Count secure URLs
|
|
379
380
|
if (urlObj.protocol === 'https:') {
|
|
380
381
|
stats.secure_urls++;
|
|
@@ -396,7 +397,7 @@ export class MapSiteTool {
|
|
|
396
397
|
// Track URL lengths
|
|
397
398
|
const length = url.length;
|
|
398
399
|
totalLength += length;
|
|
399
|
-
stats.url_lengths.min = Math.min(stats.url_lengths.min, length);
|
|
400
|
+
stats.url_lengths.min = stats.url_lengths.min === null ? length : Math.min(stats.url_lengths.min, length);
|
|
400
401
|
stats.url_lengths.max = Math.max(stats.url_lengths.max, length);
|
|
401
402
|
|
|
402
403
|
// Track file extensions
|
|
@@ -266,9 +266,9 @@ export class AnalyzeContentTool {
|
|
|
266
266
|
};
|
|
267
267
|
|
|
268
268
|
const keywordStr = keywords.join(' ').toLowerCase();
|
|
269
|
-
|
|
269
|
+
|
|
270
270
|
for (const [category, categoryKeywords] of Object.entries(categories)) {
|
|
271
|
-
const matches = categoryKeywords.filter(word =>
|
|
271
|
+
const matches = categoryKeywords.filter(word => new RegExp(`\\b${word}\\b`).test(keywordStr));
|
|
272
272
|
if (matches.length > 0) {
|
|
273
273
|
return category;
|
|
274
274
|
}
|
|
@@ -394,13 +394,18 @@ export class AnalyzeContentTool {
|
|
|
394
394
|
anticipation: ['excited', 'eager', 'looking forward', 'anticipating', 'expecting']
|
|
395
395
|
};
|
|
396
396
|
|
|
397
|
-
const
|
|
397
|
+
const lowerText = text.toLowerCase();
|
|
398
|
+
const words = lowerText.split(/\s+/);
|
|
398
399
|
const emotions = [];
|
|
399
400
|
|
|
400
401
|
for (const [emotion, emotionKeywords] of Object.entries(emotionWords)) {
|
|
401
|
-
const
|
|
402
|
-
|
|
403
|
-
const
|
|
402
|
+
const matchCount = emotionKeywords.reduce((count, keyword) => {
|
|
403
|
+
const re = new RegExp(`\\b${keyword}\\b`, 'g');
|
|
404
|
+
const found = lowerText.match(re);
|
|
405
|
+
return count + (found ? found.length : 0);
|
|
406
|
+
}, 0);
|
|
407
|
+
if (matchCount > 0) {
|
|
408
|
+
const intensity = Math.min(1, matchCount / Math.max(words.length / 100, 1));
|
|
404
409
|
emotions.push({
|
|
405
410
|
emotion,
|
|
406
411
|
intensity: Math.round(intensity * 100) / 100
|