crawlforge-mcp-server 3.0.18 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/server.js +192 -1277
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +127 -14
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +365 -0
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* batchScrape — worker module.
|
|
3
|
+
* URL fetching, content extraction, format generation.
|
|
4
|
+
* Used by queue.js (the Semaphore-based batch runner).
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { load } from 'cheerio';
|
|
8
|
+
|
|
9
|
+
const USER_AGENT = 'MCP-WebScraper-BatchTool/1.0.0';

/**
 * Fetch a URL, aborting the request if no response arrives within `timeout` ms.
 *
 * The timer only guards the wait for the response object returned by `fetch`;
 * it is cleared as soon as that promise settles, before any body is read.
 *
 * @param {string} url - Address to request.
 * @param {{ timeout?: number, headers?: Record<string, string> }} [options]
 *   `timeout` defaults to 15000 ms. `headers` are spread after the default
 *   User-Agent, so a caller-supplied `User-Agent` replaces it.
 * @returns {Promise<Response>} The response exactly as returned by `fetch`.
 * @throws {Error} `Request timeout after <n>ms` when the request is aborted
 *   (the abort error is attached as `cause`); any other failure is rethrown
 *   unchanged.
 */
export async function fetchUrl(url, options = {}) {
  const { timeout = 15000, headers = {} } = options;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    // `return await` keeps the request inside this try so `finally` runs
    // only after it has settled.
    return await fetch(url, {
      signal: controller.signal,
      headers: { 'User-Agent': USER_AGENT, ...headers }
    });
  } catch (error) {
    if (error.name === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }
    throw error;
  } finally {
    // Single cleanup point for both the success and the failure path.
    clearTimeout(timeoutId);
  }
}
|
|
31
|
+
|
|
32
|
+
/**
 * Scrape a single URL and return a result object.
 *
 * Failures (network errors, non-2xx responses, errors raised while parsing or
 * formatting) are caught and reported as `{ success: false, error, ... }`
 * instead of being thrown.
 *
 * @param {Object} config - Per-URL settings. Reads `url`, `headers`,
 *   `timeout`, `selectors` and `metadata`.
 * @param {Object} options - Batch-wide settings. Reads `extractionSchema`
 *   and `formats` (passed straight to `generateFormats`).
 * @param {number} defaultTimeout - Timeout in ms used when `config.timeout`
 *   is falsy.
 * @returns {Promise<Object>} On success: `success`, `url`, `timestamp`,
 *   `executionTime`, `metadata`, `content` and, when selectors were supplied,
 *   `extracted`. On failure: `success: false`, `url`, `error`, `timestamp`,
 *   `executionTime`, `metadata`.
 */
export async function scrapeUrl(config, options, defaultTimeout) {
  const startTime = Date.now();
  try {
    const response = await fetchUrl(config.url, {
      headers: config.headers,
      timeout: config.timeout || defaultTimeout
    });

    if (!response.ok) throw new Error(`HTTP ${response.status}: ${response.statusText}`);

    const html = await response.text();
    const $ = load(html);

    const result = {
      success: true,
      url: config.url,
      timestamp: Date.now(),
      // Placeholder keeps the key order stable; the real value is set once
      // extraction and formatting have finished.
      executionTime: 0,
      metadata: {
        status: response.status,
        contentType: response.headers.get('content-type'),
        contentLength: html.length,
        // Spread last: caller-supplied metadata overrides the fields above.
        ...(config.metadata || {})
      }
    };

    if (options.extractionSchema || config.selectors) {
      // Batch-level schema entries win over per-URL selectors with the same key.
      result.extracted = extractStructuredData($, { ...config.selectors, ...options.extractionSchema });
    }

    result.content = generateFormats($, html, options.formats);

    // Measured at the very end so the figure covers fetch, parse, extraction
    // and format generation rather than the fetch alone.
    result.executionTime = Date.now() - startTime;
    return result;
  } catch (error) {
    return {
      success: false,
      url: config.url,
      error: error.message,
      timestamp: Date.now(),
      executionTime: Date.now() - startTime,
      metadata: config.metadata || {}
    };
  }
}
|
|
78
|
+
|
|
79
|
+
/**
 * Evaluate a map of named CSS selectors against a parsed document.
 *
 * Per field: no match yields `null`, exactly one match yields its trimmed
 * text, several matches yield an array of trimmed texts. A selector that makes
 * the query throw is reported as `{ error: 'Invalid selector: …' }` for that
 * field only.
 *
 * @param {Function} $ - Document query function (as returned by cheerio `load`).
 * @param {Object} selectors - Field name → CSS selector.
 * @returns {Object} Field name → extracted value.
 */
function extractStructuredData($, selectors) {
  const readField = (selector) => {
    const matches = $(selector);
    switch (matches.length) {
      case 0:
        return null;
      case 1:
        return matches.text().trim();
      default:
        return matches.map((_, node) => $(node).text().trim()).get();
    }
  };

  const output = {};
  Object.entries(selectors).forEach(([field, selector]) => {
    try {
      output[field] = readField(selector);
    } catch {
      output[field] = { error: `Invalid selector: ${selector}` };
    }
  });
  return output;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Produce the requested representations of a fetched page.
 *
 * Recognised format names are `html`, `text`, `markdown` and `json`; any other
 * entry in `formats` is ignored. Keys appear in that fixed order regardless of
 * the order in `formats`.
 *
 * @param {Function} $ - Document query function for the page.
 * @param {string} html - Raw page source.
 * @param {string[]} formats - Names of the representations to build.
 * @returns {Object} Format name → generated content.
 */
function generateFormats($, html, formats) {
  const builders = {
    html: () => html,
    text: () => $('body').text().replace(/\s+/g, ' ').trim(),
    markdown: () => buildMarkdown($),
    json: () => ({
      title: $('title').text().trim(),
      headings: extractHeadings($),
      links: extractLinks($),
      images: extractImages($),
      metadata: extractMetadata($)
    })
  };

  const content = {};
  for (const [format, build] of Object.entries(builders)) {
    if (formats.includes(format)) content[format] = build();
  }
  return content;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Render a simplified Markdown view of a page.
 *
 * The page `<title>` (when non-empty) becomes a leading `#` heading. Content is
 * then read from the first of several well-known content containers that
 * matches, falling back to `<body>`. Elements are emitted grouped by kind —
 * all h1, then h2, h3, paragraphs, unordered items, ordered items — not in
 * document order.
 *
 * @param {Function} $ - Document query function for the page.
 * @returns {string} Markdown text with surrounding whitespace trimmed.
 */
function buildMarkdown($) {
  const containers = ['article', 'main', '.content', '#content', '.post-content', '.entry-content'];
  const pieces = [];

  const pageTitle = $('title').text().trim();
  if (pageTitle) pieces.push(`# ${pageTitle}\n\n`);

  // First container selector with at least one match wins.
  const preferred = containers.find((candidate) => $(candidate).length > 0);
  const $root = preferred === undefined ? $('body') : $(preferred);

  // Append one rendered chunk per matching element; `null` means "skip".
  const emit = (selector, render) => {
    $root.find(selector).each((_, node) => {
      const chunk = render($(node).text().trim());
      if (chunk !== null) pieces.push(chunk);
    });
  };

  emit('h1', (text) => `# ${text}\n\n`);
  emit('h2', (text) => `## ${text}\n\n`);
  emit('h3', (text) => `### ${text}\n\n`);
  emit('p', (text) => (text ? `${text}\n\n` : null)); // empty paragraphs are dropped
  emit('ul li', (text) => `- ${text}\n`);
  emit('ol li', (text) => `1. ${text}\n`);

  return pieces.join('').trim();
}
|
|
133
|
+
|
|
134
|
+
/**
 * List every h1–h6 element of the page.
 *
 * @param {Function} $ - Document query function for the page.
 * @returns {Array<{ level: number, text: string, id: (string|null) }>}
 *   One entry per heading, in the order the query returns them. `level` is
 *   the digit from the tag name; `id` is `null` when the attribute is absent
 *   or empty.
 */
function extractHeadings($) {
  const collected = [];
  $('h1, h2, h3, h4, h5, h6').each((_, node) => {
    const $heading = $(node);
    const level = Number.parseInt(node.name.slice(1), 10);
    collected.push({
      level,
      text: $heading.text().trim(),
      id: $heading.attr('id') || null
    });
  });
  return collected;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Collect the anchors of a page that have both an href and visible text.
 *
 * Hrefs are returned as written in the document (not resolved to absolute
 * URLs); duplicates are kept.
 *
 * @param {Function} $ - Document query function for the page.
 * @returns {Array<{ href: string, text: string, title: (string|null) }>}
 */
function extractLinks($) {
  const anchors = [];
  $('a[href]').each((_, node) => {
    const $anchor = $(node);
    const target = $anchor.attr('href');
    const label = $anchor.text().trim();
    // Anchors with an empty href or no text carry nothing useful.
    if (!target || !label) return;
    anchors.push({ href: target, text: label, title: $anchor.attr('title') || null });
  });
  return anchors;
}
|
|
151
|
+
|
|
152
|
+
/**
 * Describe every `<img>` element that has a `src` attribute.
 *
 * @param {Function} $ - Document query function for the page.
 * @returns {Array<Object>} One object per image with `src`, `alt`, `title`,
 *   `width` and `height`. `src` is returned as written; the other four are
 *   `null` when the attribute is missing or empty.
 */
function extractImages($) {
  const optionalAttributes = ['alt', 'title', 'width', 'height'];
  const found = [];
  $('img[src]').each((_, node) => {
    const $image = $(node);
    const record = { src: $image.attr('src') };
    for (const attribute of optionalAttributes) {
      record[attribute] = $image.attr(attribute) || null;
    }
    found.push(record);
  });
  return found;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Gather basic page metadata: title, meta description, Open Graph tags and
 * Twitter card tags.
 *
 * @param {Function} $ - Document query function for the page.
 * @returns {{ title: string, description: string, og: Object, twitter: Object }}
 *   `og` and `twitter` map the tag name without its `og:` / `twitter:` prefix
 *   to the tag's `content` attribute (which may be undefined).
 */
function extractMetadata($) {
  // Build { <name without prefix>: <content> } for every element matching `selector`.
  const collectPrefixed = (selector, attribute, prefix) => {
    const bag = {};
    $(selector).each((_, node) => {
      const $meta = $(node);
      bag[$meta.attr(attribute).replace(prefix, '')] = $meta.attr('content');
    });
    return bag;
  };

  return {
    title: $('title').text().trim(),
    description: $('meta[name="description"]').attr('content') || '',
    og: collectPrefixed('meta[property^="og:"]', 'property', 'og:'),
    twitter: collectPrefixed('meta[name^="twitter:"]', 'name', 'twitter:')
  };
}
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* recorder.js — Recording and replay support for scrape_with_actions.
|
|
3
|
+
*
|
|
4
|
+
* Responsibilities:
|
|
5
|
+
* - Validate recording names (path-traversal prevention)
|
|
6
|
+
* - Persist recorded action sequences to disk (atomic write)
|
|
7
|
+
* - Load saved recordings for replay
|
|
8
|
+
* - List available recordings
|
|
9
|
+
*
|
|
10
|
+
* The recordings directory is resolved from:
|
|
11
|
+
* 1. process.env.CRAWLFORGE_HOME_OVERRIDE (tests only)
|
|
12
|
+
* 2. os.homedir()
|
|
13
|
+
*
|
|
14
|
+
* File layout: <homeDir>/.crawlforge/recordings/<name>.json
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import os from 'os';
|
|
18
|
+
import fs from 'fs/promises';
|
|
19
|
+
import path from 'path';
|
|
20
|
+
|
|
21
|
+
// Whitelist for recording names: 1 to 64 ASCII letters, digits, underscores or
// hyphens. Path separators and dots cannot match, so a valid name can never
// point outside the recordings directory.
const VALID_NAME_RE = /^[A-Za-z0-9_-]{1,64}$/;
|
|
23
|
+
|
|
24
|
+
/**
 * Resolve the base directory under which CrawlForge keeps its files.
 *
 * A non-empty CRAWLFORGE_HOME_OVERRIDE environment variable takes precedence
 * over the operating-system home directory.
 *
 * @returns {string}
 */
function homeDir() {
  const override = process.env.CRAWLFORGE_HOME_OVERRIDE;
  return override ? override : os.homedir();
}
|
|
31
|
+
|
|
32
|
+
/**
 * Build the path of the directory that holds saved recordings:
 * `<homeDir>/.crawlforge/recordings`. The directory is not created here.
 *
 * @returns {string}
 */
function recordingsDir() {
  const segments = [homeDir(), '.crawlforge', 'recordings'];
  return path.join(...segments);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Ensure a recording name is safe to use as a file name.
 *
 * @param {string} name - Candidate recording name.
 * @returns {void}
 * @throws {Error} when `name` is not a string or does not match VALID_NAME_RE.
 */
export function validateRecordingName(name) {
  const acceptable = typeof name === 'string' && VALID_NAME_RE.test(name);
  if (acceptable) return;

  throw new Error(
    `Invalid recording name "${name}". ` +
    'Names must be 1–64 characters and contain only letters, digits, underscores, or hyphens.'
  );
}
|
|
53
|
+
|
|
54
|
+
/**
 * Persist a recorded action sequence to disk atomically.
 *
 * The JSON payload is written to a temporary file next to the destination and
 * then renamed over `<recordingsDir>/<name>.json`. The temporary name is
 * unique per call, so overlapping saves of the same recording never write to
 * the same temp file, and it is removed again if the write or rename fails.
 *
 * @param {string} name - Recording name (validated before writing)
 * @param {Array<Object>} recordedActions - Array of annotated action entries
 * @param {Object} [meta] - Optional metadata (original url, timestamp, etc.).
 *   Spread after `name` and `savedAt`, so keys with those names override
 *   them; `recordedActions` is written last and cannot be overridden.
 * @returns {Promise<string>} Resolved file path
 * @throws {Error} if the name is invalid, or with the underlying filesystem
 *   error when the directory, temp file or rename fails.
 */
export async function saveRecording(name, recordedActions, meta = {}) {
  validateRecordingName(name);

  const dir = recordingsDir();
  await fs.mkdir(dir, { recursive: true });

  const filePath = path.join(dir, `${name}.json`);
  // Still ends in ".tmp" (never ".json"), so listings keep ignoring it.
  const uniqueSuffix = `${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}`;
  const tmpPath = `${filePath}.${uniqueSuffix}.tmp`;

  const payload = JSON.stringify(
    {
      name,
      savedAt: new Date().toISOString(),
      ...meta,
      recordedActions
    },
    null,
    2
  );

  // Atomic write: write to .tmp then rename
  try {
    await fs.writeFile(tmpPath, payload, 'utf8');
    await fs.rename(tmpPath, filePath);
  } catch (err) {
    // Best-effort cleanup; the original failure is what the caller needs to
    // see, so a secondary error from the removal is deliberately ignored.
    await fs.rm(tmpPath, { force: true }).catch(() => {});
    throw err;
  }

  return filePath;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Load a saved recording from disk.
 *
 * @param {string} name - Recording name
 * @returns {Promise<Object>} Parsed recording object (includes `recordedActions`)
 * @throws {Error} if the name is invalid, the recording does not exist, the
 *   file cannot be read, the content is not valid JSON, or the parsed value
 *   has no `recordedActions` array. Read and parse failures carry the
 *   underlying error as `cause`.
 */
export async function loadRecording(name) {
  validateRecordingName(name);

  const filePath = path.join(recordingsDir(), `${name}.json`);

  let raw;
  try {
    raw = await fs.readFile(filePath, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      throw new Error(
        `Recording "${name}" not found. Use replayRecording: "__list__" to see available recordings.`,
        { cause: err }
      );
    }
    throw new Error(`Failed to read recording "${name}": ${err.message}`, { cause: err });
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    throw new Error(`Recording "${name}" is corrupted (invalid JSON): ${err.message}`, { cause: err });
  }

  // A file holding the JSON literal `null` parses successfully; guard it
  // explicitly so it gets the same format error as any other wrong shape
  // instead of a TypeError from the property access.
  if (parsed === null || !Array.isArray(parsed.recordedActions)) {
    throw new Error(`Recording "${name}" has an unexpected format (missing recordedActions array).`);
  }

  return parsed;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Enumerate the recordings currently stored on disk.
 *
 * Every directory entry whose name ends in `.json` counts as a recording;
 * other entries are ignored. A missing recordings directory is treated as
 * "no recordings".
 *
 * @returns {Promise<string[]>} Recording names without the `.json` suffix, sorted.
 * @throws {Error} when the directory exists but cannot be read.
 */
export async function listRecordings() {
  const suffix = '.json';

  let entries;
  try {
    entries = await fs.readdir(recordingsDir());
  } catch (err) {
    if (err.code !== 'ENOENT') {
      throw new Error(`Failed to list recordings: ${err.message}`);
    }
    return [];
  }

  const names = [];
  for (const fileName of entries) {
    if (fileName.endsWith(suffix)) {
      names.push(fileName.slice(0, -suffix.length));
    }
  }
  return names.sort();
}
|
|
148
|
+
|
|
149
|
+
/**
 * Create the persisted form of one executed action.
 *
 * The entry always carries the action `type` and its offset from the start of
 * the recording session. Of the remaining action properties, only a fixed set
 * of replay-relevant ones is copied, and only when the value is not
 * `undefined`.
 *
 * @param {Object} action - Original action object
 * @param {number} timestampMsSinceStart - ms since recording session started
 * @returns {Object} Entry with `type`, `timestamp_ms_since_start` and any of
 *   `selector`, `text`, `key`, `duration`, `url`, `value`, `direction`,
 *   `distance`, `description` that were set on the action.
 */
export function buildRecordedEntry(action, timestampMsSinceStart) {
  const replayFields = [
    'selector',
    'text',
    'key',
    'duration',
    'url',
    'value',
    'direction',
    'distance',
    'description'
  ];

  const entry = { type: action.type, timestamp_ms_since_start: timestampMsSinceStart };
  for (const field of replayFields) {
    if (action[field] !== undefined) entry[field] = action[field];
  }
  return entry;
}
|
|
177
|
+
|
|
178
|
+
/**
 * Turn a persisted entry back into a plain action object.
 *
 * The result is a shallow copy of the entry without the recording-only
 * `timestamp_ms_since_start` property; the entry itself is not modified.
 *
 * @param {Object} entry - Recorded entry
 * @returns {Object} Action object
 */
export function recordedEntryToAction(entry) {
  const { timestamp_ms_since_start: _droppedTimestamp, ...replayable } = entry;
  return replayable;
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared HTTP fetch helper for basic tools.
|
|
3
|
+
* Applies an AbortController timeout and a default User-Agent.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Fetch a URL with a configurable timeout.
 *
 * The timer only guards the wait for the response object returned by `fetch`;
 * it is cleared as soon as that promise settles, before any body is read.
 *
 * @param {string} url
 * @param {{ timeout?: number, headers?: Record<string,string> }} [options]
 *   `timeout` defaults to 10000 ms. `headers` are spread after the default
 *   `User-Agent: CrawlForge/1.0.0`, so a caller-supplied value replaces it.
 * @returns {Promise<Response>}
 * @throws {Error} `Request timeout after <n>ms` when the request is aborted
 *   (the abort error is attached as `cause`); any other failure is rethrown
 *   unchanged.
 */
export async function fetchWithTimeout(url, options = {}) {
  const { timeout = 10000, headers = {} } = options;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    // `return await` keeps the request inside this try so `finally` runs
    // only after it has settled.
    return await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'CrawlForge/1.0.0',
        ...headers
      }
    });
  } catch (error) {
    if (error.name === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }
    throw error;
  } finally {
    // Single cleanup point for both the success and the failure path.
    clearTimeout(timeoutId);
  }
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* extract_links — Extract all links from a webpage with optional filtering.
|
|
3
|
+
* Extracted from server.js inline handler.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { load } from 'cheerio';
|
|
7
|
+
import { fetchWithTimeout } from './_fetch.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Tool handler: fetch a page and list the distinct links it contains.
 *
 * Resolution rules:
 * - hrefs starting with `http://` or `https://` are kept exactly as written;
 * - every other href is resolved against `base_url` when given, otherwise
 *   against the URL of the page itself, so `next.html` on `/docs/a/` becomes
 *   `/docs/a/next.html`;
 * - a link is external when its href names a host (absolute http(s) or
 *   protocol-relative `//host/...`) whose origin differs from the page's
 *   origin. Other hrefs (paths, fragments, `mailto:` etc.) count as internal.
 * Hrefs that cannot be parsed are skipped. Links are de-duplicated on the
 * resolved href, keeping the first occurrence.
 *
 * @param {{ url: string, filter_external?: boolean, base_url?: string }} params
 *   `filter_external` drops external links from the result.
 * @returns {Promise<Object>} `{ content: [{ type: 'text', text }] }` where
 *   `text` is JSON with `links`, `total_count`, `internal_count`,
 *   `external_count` and `base_url` (the given `base_url`, else the page
 *   origin). On any failure: the same shape with an error message and
 *   `isError: true`.
 */
export async function extractLinksHandler({ url, filter_external, base_url }) {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = load(html);

    const pageUrl = new URL(url);
    const baseUrl = base_url || pageUrl.origin;
    // Relative references are defined relative to the document, not the
    // site root, unless the caller supplied an explicit base.
    const resolutionBase = base_url || pageUrl.href;

    const seenHrefs = new Set();
    const uniqueLinks = [];
    let externalCount = 0;

    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      const text = $(element).text().trim();

      if (!href) return;

      try {
        const isAbsoluteHttp = href.startsWith('http://') || href.startsWith('https://');
        // Absolute hrefs are parsed on their own so a bad base_url cannot
        // cause them to be skipped.
        const resolved = isAbsoluteHttp ? new URL(href) : new URL(href, resolutionBase);
        const absoluteUrl = isAbsoluteHttp ? href : resolved.toString();

        const namesHost = isAbsoluteHttp || href.startsWith('//');
        const isExternal = namesHost && resolved.origin !== pageUrl.origin;

        if (filter_external && isExternal) return;
        if (seenHrefs.has(absoluteUrl)) return;
        seenHrefs.add(absoluteUrl);

        if (isExternal) externalCount += 1;
        uniqueLinks.push({ href: absoluteUrl, text, is_external: isExternal, original_href: href });
      } catch {
        // skip invalid URLs
      }
    });

    return {
      content: [{
        type: 'text',
        text: JSON.stringify({
          links: uniqueLinks,
          total_count: uniqueLinks.length,
          internal_count: uniqueLinks.length - externalCount,
          external_count: externalCount,
          base_url: baseUrl
        }, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to extract links: ${error.message}` }],
      isError: true
    };
  }
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* extract_metadata — Extract page metadata (title, description, OG tags, etc.).
|
|
3
|
+
* Extracted from server.js inline handler.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { load } from 'cheerio';
|
|
7
|
+
import { fetchWithTimeout } from './_fetch.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Tool handler: fetch a page and report its head metadata.
 *
 * Fallbacks: the title falls back to the first `<h1>`, the description to
 * `og:description`, and the charset to the `http-equiv="Content-Type"` meta.
 * Open Graph and Twitter tags are keyed without their prefix, and tags with
 * an empty name or content are left out.
 *
 * @param {{ url: string }} params
 * @returns {Promise<Object>} `{ content: [{ type: 'text', text }] }` where
 *   `text` is JSON with `title`, `description`, `keywords` (array),
 *   `canonical_url`, `author`, `robots`, `viewport`, `charset`, `og_tags`,
 *   `twitter_tags` and `url` (taken from the response). On any failure: the
 *   same shape with an error message and `isError: true`.
 */
export async function extractMetadataHandler({ url }) {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    // `content` attribute of the first element matching `selector`.
    const contentOf = (selector) => $(selector).attr('content');

    // Map "<prefix>name" → content for every matching meta element.
    const prefixedTags = (selector, attribute, prefix) => {
      const tags = {};
      $(selector).each((_, node) => {
        const key = $(node).attr(attribute);
        const value = $(node).attr('content');
        if (key && value) tags[key.replace(prefix, '')] = value;
      });
      return tags;
    };

    const rawKeywords = contentOf('meta[name="keywords"]') || '';

    const summary = {
      title: $('title').text().trim() || $('h1').first().text().trim(),
      description:
        contentOf('meta[name="description"]') ||
        contentOf('meta[property="og:description"]') || '',
      keywords: rawKeywords.split(',').map(k => k.trim()).filter(Boolean),
      canonical_url: $('link[rel="canonical"]').attr('href') || '',
      author: contentOf('meta[name="author"]') || '',
      robots: contentOf('meta[name="robots"]') || '',
      viewport: contentOf('meta[name="viewport"]') || '',
      charset:
        $('meta[charset]').attr('charset') ||
        contentOf('meta[http-equiv="Content-Type"]') || '',
      og_tags: prefixedTags('meta[property^="og:"]', 'property', 'og:'),
      twitter_tags: prefixedTags('meta[name^="twitter:"]', 'name', 'twitter:'),
      url: response.url
    };

    return {
      content: [{ type: 'text', text: JSON.stringify(summary, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to extract metadata: ${error.message}` }],
      isError: true
    };
  }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* extract_text — Extract clean text content from HTML.
|
|
3
|
+
* Extracted from server.js inline handler.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { load } from 'cheerio';
|
|
7
|
+
import { fetchWithTimeout } from './_fetch.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Tool handler: fetch a page and return its visible text.
 *
 * `<script>` and `<style>` elements are removed unless the matching flag is
 * explicitly `false`. Navigation, header, footer, aside and common ad/sidebar
 * containers are always removed. Whitespace runs in the remaining body text
 * are collapsed to single spaces.
 *
 * @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean }} params
 * @returns {Promise<Object>} `{ content: [{ type: 'text', text }] }` where
 *   `text` is JSON with `text`, `word_count`, `char_count` and `url` (taken
 *   from the response). On any failure: the same shape with an error message
 *   and `isError: true`.
 */
export async function extractTextHandler({ url, remove_scripts, remove_styles }) {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = load(html);

    // [enabled, selector] pairs, applied in order.
    const removals = [
      [remove_scripts !== false, 'script'],
      [remove_styles !== false, 'style'],
      [true, 'nav, header, footer, aside, .advertisement, .ad, .sidebar']
    ];
    for (const [enabled, selector] of removals) {
      if (enabled) $(selector).remove();
    }

    const text = $('body').text().replace(/\s+/g, ' ').trim();
    const words = text.split(/\s+/).filter(w => w.length > 0);

    const summary = {
      text,
      word_count: words.length,
      char_count: text.length,
      url: response.url
    };

    return {
      content: [{ type: 'text', text: JSON.stringify(summary, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to extract text: ${error.message}` }],
      isError: true
    };
  }
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* fetch_url — Basic URL fetching with headers and response handling.
|
|
3
|
+
* Extracted from server.js inline handler.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { fetchWithTimeout } from './_fetch.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Tool handler: fetch a URL and return the raw response details.
 *
 * Non-2xx responses are not treated as errors here; their status and body are
 * returned like any other response. Only a failed request (network error,
 * timeout, unreadable body) produces `isError: true`.
 *
 * @param {{ url: string, headers?: Record<string,string>, timeout?: number }} params
 *   A falsy `timeout` falls back to 10000 ms; falsy `headers` to `{}`.
 * @returns {Promise<Object>} `{ content: [{ type: 'text', text }] }` where
 *   `text` is JSON with `status`, `statusText`, `headers`, `body`,
 *   `contentType` (`'unknown'` when the header is absent), `size` (length of
 *   the body string) and `url` (taken from the response).
 */
export async function fetchUrlHandler({ url, headers, timeout }) {
  const requestOptions = {
    timeout: timeout || 10000,
    headers: headers || {}
  };

  try {
    const response = await fetchWithTimeout(url, requestOptions);
    const body = await response.text();

    // Flatten the Headers object into a plain, JSON-serialisable record.
    const headerRecord = {};
    response.headers.forEach((headerValue, headerName) => {
      headerRecord[headerName] = headerValue;
    });

    const summary = {
      status: response.status,
      statusText: response.statusText,
      headers: headerRecord,
      body,
      contentType: response.headers.get('content-type') || 'unknown',
      size: body.length,
      url: response.url
    };

    return {
      content: [{ type: 'text', text: JSON.stringify(summary, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to fetch URL: ${error.message}` }],
      isError: true
    };
  }
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scrape_structured — Extract structured data using CSS selectors.
|
|
3
|
+
* Extracted from server.js inline handler.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { load } from 'cheerio';
|
|
7
|
+
import { fetchWithTimeout } from './_fetch.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Tool handler: fetch a page and pull out fields using CSS selectors.
 *
 * Per field: no match yields `null`, exactly one match yields its trimmed
 * text, several matches yield an array of trimmed texts. A selector that makes
 * the query throw is reported for that field as `{ error, message }` without
 * failing the whole call.
 *
 * @param {{ url: string, selectors: Record<string, string> }} params
 * @returns {Promise<Object>} `{ content: [{ type: 'text', text }] }` where
 *   `text` is JSON with `data`, `selectors_used`, `elements_found` (the number
 *   of fields in `data`, including `null` ones) and `url` (taken from the
 *   response). On any failure: the same shape with an error message and
 *   `isError: true`.
 */
export async function scrapeStructuredHandler({ url, selectors }) {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = load(html);

    const pick = (selector) => {
      const matched = $(selector);
      if (matched.length === 0) return null;
      if (matched.length === 1) return matched.text().trim();
      return matched.map((_, node) => $(node).text().trim()).get();
    };

    const data = {};
    for (const [fieldName, selector] of Object.entries(selectors)) {
      try {
        data[fieldName] = pick(selector);
      } catch (selectorError) {
        data[fieldName] = {
          error: `Invalid selector: ${selector}`,
          message: selectorError.message
        };
      }
    }

    const report = {
      data,
      selectors_used: selectors,
      elements_found: Object.keys(data).length,
      url: response.url
    };

    return {
      content: [{ type: 'text', text: JSON.stringify(report, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to scrape structured data: ${error.message}` }],
      isError: true
    };
  }
}
|