crawlforge-mcp-server 3.0.18 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/package.json +5 -2
  2. package/server.js +192 -1277
  3. package/src/core/ActionExecutor.js +2 -43
  4. package/src/core/AuthManager.js +127 -14
  5. package/src/core/BrowserContextPool.js +187 -0
  6. package/src/core/JobManager.js +7 -5
  7. package/src/core/LocalizationManager.js +14 -125
  8. package/src/core/StealthBrowserManager.js +26 -18
  9. package/src/core/cache/CacheManager.js +4 -1
  10. package/src/core/crawlers/BFSCrawler.js +19 -5
  11. package/src/observability/metrics.js +137 -0
  12. package/src/observability/tracing.js +74 -0
  13. package/src/server/auth/oauth.js +388 -0
  14. package/src/server/registerTool.js +41 -0
  15. package/src/server/schemas/common.js +29 -0
  16. package/src/server/transports/http.js +22 -0
  17. package/src/server/transports/stdio.js +16 -0
  18. package/src/server/transports/streamableHttp.js +226 -0
  19. package/src/server/withAuth.js +121 -0
  20. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  21. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  22. package/src/tools/advanced/batchScrape/index.js +328 -0
  23. package/src/tools/advanced/batchScrape/queue.js +91 -0
  24. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  25. package/src/tools/advanced/batchScrape/schema.js +37 -0
  26. package/src/tools/advanced/batchScrape/worker.js +179 -0
  27. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  28. package/src/tools/basic/_fetch.js +35 -0
  29. package/src/tools/basic/extractLinks.js +74 -0
  30. package/src/tools/basic/extractMetadata.js +74 -0
  31. package/src/tools/basic/extractText.js +46 -0
  32. package/src/tools/basic/fetchUrl.js +44 -0
  33. package/src/tools/basic/scrapeStructured.js +58 -0
  34. package/src/tools/crawl/_sessionContext.js +234 -0
  35. package/src/tools/crawl/crawlDeep.js +55 -5
  36. package/src/tools/crawl/mapSite.js +23 -2
  37. package/src/tools/extract/_fetchAndParse.js +57 -0
  38. package/src/tools/extract/extractStructured.js +3 -19
  39. package/src/tools/extract/extractWithLlm.js +365 -0
  40. package/src/tools/search/providers/searxng.js +126 -0
  41. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  42. package/src/tools/search/ranking/ResultRanker.js +17 -10
  43. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  44. package/src/tools/search/searchWeb.js +112 -6
  45. package/src/tools/tracking/trackChanges/differ.js +98 -0
  46. package/src/tools/tracking/trackChanges/index.js +432 -0
  47. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  48. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  49. package/src/tools/tracking/trackChanges/schema.js +127 -0
  50. package/src/tools/tracking/trackChanges.js +12 -1374
@@ -0,0 +1,179 @@
1
+ /**
2
+ * batchScrape — worker module.
3
+ * URL fetching, content extraction, format generation.
4
+ * Used by queue.js (the Semaphore-based batch runner).
5
+ */
6
+
7
+ import { load } from 'cheerio';
8
+
9
const USER_AGENT = 'MCP-WebScraper-BatchTool/1.0.0';

/**
 * Fetch a URL, aborting the request when no response arrives within `timeout` ms.
 *
 * The timer is cleared as soon as `fetch` settles, so it bounds only the wait
 * for the response object; reading the body afterwards is not time-limited.
 *
 * @param {string} url - Address to request.
 * @param {{ timeout?: number, headers?: Record<string, string> }} [options]
 *   `timeout` defaults to 15000 ms; `headers` are merged over the default
 *   User-Agent, so a caller-supplied `User-Agent` wins.
 * @returns {Promise<Response>} The response as returned by `fetch`.
 * @throws {Error} `Request timeout after <timeout>ms` when the request was
 *   aborted (the original AbortError is attached as `cause`); any other fetch
 *   error is rethrown unchanged.
 */
export async function fetchUrl(url, options = {}) {
  const { timeout = 15000, headers = {} } = options;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    return await fetch(url, {
      signal: controller.signal,
      headers: { 'User-Agent': USER_AGENT, ...headers }
    });
  } catch (error) {
    if (error.name === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }
    throw error;
  } finally {
    // Runs on success and on failure, so the timer can never outlive the call.
    clearTimeout(timeoutId);
  }
}
31
+
32
/**
 * Scrape one URL and describe the outcome as a plain result object.
 *
 * Never rejects for fetch/parse problems: any error raised while fetching,
 * parsing or formatting is reported as `{ success: false, error }`.
 *
 * @param {Object} config - Per-URL settings: `url`, optional `headers`,
 *   `timeout`, `selectors` and `metadata`.
 * @param {Object} options - Batch-wide settings: `formats` and optional
 *   `extractionSchema` (its keys override `config.selectors`).
 * @param {number} defaultTimeout - Used when `config.timeout` is falsy.
 * @returns {Promise<Object>} Success or failure result for this URL.
 */
export async function scrapeUrl(config, options, defaultTimeout) {
  const startedAt = Date.now();

  try {
    const requestOptions = {
      headers: config.headers,
      timeout: config.timeout || defaultTimeout
    };
    const response = await fetchUrl(config.url, requestOptions);

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = load(html);

    // Caller-supplied metadata is spread last, so it may override the
    // status/contentType/contentLength fields.
    const metadata = {
      status: response.status,
      contentType: response.headers.get('content-type'),
      contentLength: html.length,
      ...(config.metadata || {})
    };

    const result = {
      success: true,
      url: config.url,
      timestamp: Date.now(),
      executionTime: Date.now() - startedAt,
      metadata
    };

    const wantsExtraction = Boolean(options.extractionSchema || config.selectors);
    if (wantsExtraction) {
      const mergedSelectors = { ...config.selectors, ...options.extractionSchema };
      result.extracted = extractStructuredData($, mergedSelectors);
    }

    result.content = generateFormats($, html, options.formats);
    return result;
  } catch (error) {
    const failure = {
      success: false,
      url: config.url,
      error: error.message,
      timestamp: Date.now(),
      executionTime: Date.now() - startedAt,
      metadata: config.metadata || {}
    };
    return failure;
  }
}
78
+
79
/**
 * Evaluate a map of CSS selectors against a loaded document.
 *
 * Per field: `null` when nothing matches, a trimmed string for exactly one
 * match, an array of trimmed strings otherwise. A selector that throws is
 * reported as `{ error }` for that field only.
 *
 * @param {Function} $ - Cheerio-style query function for the document.
 * @param {Record<string, string>} selectors - Field name → CSS selector.
 * @returns {Record<string, null|string|string[]|{error: string}>}
 */
function extractStructuredData($, selectors) {
  const readField = (selector) => {
    const matches = $(selector);
    switch (matches.length) {
      case 0:
        return null;
      case 1:
        return matches.text().trim();
      default:
        return matches.map((_, node) => $(node).text().trim()).get();
    }
  };

  const extracted = {};
  Object.entries(selectors).forEach(([key, selector]) => {
    try {
      extracted[key] = readField(selector);
    } catch {
      extracted[key] = { error: `Invalid selector: ${selector}` };
    }
  });
  return extracted;
}
93
+
94
/**
 * Build the `content` object holding one entry per requested output format.
 *
 * Recognised formats are `html`, `text`, `markdown` and `json`; anything else
 * in `formats` is ignored.
 *
 * @param {Function} $ - Cheerio-style query function for the document.
 * @param {string} html - Raw HTML, returned verbatim for the `html` format.
 * @param {string[]} formats - Requested format names.
 * @returns {Object} Keys are the requested (recognised) format names.
 */
function generateFormats($, html, formats) {
  // Ordered so the output keys appear as html, text, markdown, json.
  const builders = [
    ['html', () => html],
    ['text', () => $('body').text().replace(/\s+/g, ' ').trim()],
    ['markdown', () => buildMarkdown($)],
    ['json', () => ({
      title: $('title').text().trim(),
      headings: extractHeadings($),
      links: extractLinks($),
      images: extractImages($),
      metadata: extractMetadata($)
    })]
  ];

  const content = {};
  for (const [format, build] of builders) {
    if (formats.includes(format)) {
      content[format] = build();
    }
  }
  return content;
}
110
+
111
/**
 * Produce a simple Markdown rendition of the page.
 *
 * The page title comes first. Content is then taken from the first matching
 * container among a list of common "main content" selectors, falling back to
 * `<body>`. Elements are emitted grouped by kind (all h1, then h2, h3,
 * paragraphs, unordered items, ordered items), not in document order.
 *
 * @param {Function} $ - Cheerio-style query function for the document.
 * @returns {string} Trimmed Markdown text.
 */
function buildMarkdown($) {
  const chunks = [];

  const pageTitle = $('title').text().trim();
  if (pageTitle) chunks.push(`# ${pageTitle}\n\n`);

  const candidates = ['article', 'main', '.content', '#content', '.post-content', '.entry-content'];
  const matched = candidates.find((candidate) => $(candidate).length > 0);
  const $root = matched === undefined ? $('body') : $(matched);

  const headingPrefixes = [['h1', '#'], ['h2', '##'], ['h3', '###']];
  for (const [tag, hashes] of headingPrefixes) {
    $root.find(tag).each((_, el) => {
      chunks.push(`${hashes} ${$(el).text().trim()}\n\n`);
    });
  }

  // Empty paragraphs are skipped; headings and list items are not.
  $root.find('p').each((_, el) => {
    const paragraph = $(el).text().trim();
    if (paragraph) chunks.push(`${paragraph}\n\n`);
  });

  $root.find('ul li').each((_, el) => {
    chunks.push(`- ${$(el).text().trim()}\n`);
  });
  $root.find('ol li').each((_, el) => {
    chunks.push(`1. ${$(el).text().trim()}\n`);
  });

  return chunks.join('').trim();
}
133
+
134
/**
 * List every h1–h6 element in the order the query returns them.
 *
 * @param {Function} $ - Cheerio-style query function for the document.
 * @returns {Array<{level: number, text: string, id: string|null}>}
 */
function extractHeadings($) {
  const headings = [];
  $('h1, h2, h3, h4, h5, h6').each((_, el) => {
    const $heading = $(el);
    // Tag names are "h1".."h6"; the digit after the "h" is the level.
    const level = Number.parseInt(el.name.substring(1), 10);
    headings.push({
      level,
      text: $heading.text().trim(),
      id: $heading.attr('id') || null
    });
  });
  return headings;
}
141
+
142
/**
 * Collect anchors that have both an href and visible text.
 *
 * Hrefs are returned as written in the document (not resolved).
 *
 * @param {Function} $ - Cheerio-style query function for the document.
 * @returns {Array<{href: string, text: string, title: string|null}>}
 */
function extractLinks($) {
  const collected = [];
  $('a[href]').each((_, anchor) => {
    const $anchor = $(anchor);
    const href = $anchor.attr('href');
    const text = $anchor.text().trim();
    if (!href || !text) return;
    collected.push({ href, text, title: $anchor.attr('title') || null });
  });
  return collected;
}
151
+
152
/**
 * Collect every `<img>` that carries a `src` attribute.
 *
 * @param {Function} $ - Cheerio-style query function for the document.
 * @returns {Array<{src: string, alt: string|null, title: string|null,
 *   width: string|null, height: string|null}>} Missing or empty optional
 *   attributes are reported as `null`.
 */
function extractImages($) {
  const images = [];
  $('img[src]').each((_, img) => {
    const $img = $(img);
    const optional = (attribute) => $img.attr(attribute) || null;
    images.push({
      src: $img.attr('src'),
      alt: optional('alt'),
      title: optional('title'),
      width: optional('width'),
      height: optional('height')
    });
  });
  return images;
}
165
+
166
/**
 * Gather title, description, Open Graph and Twitter card metadata.
 *
 * @param {Function} $ - Cheerio-style query function for the document.
 * @returns {{title: string, description: string, og: Object, twitter: Object}}
 *   `og` / `twitter` keys are the attribute values with the `og:` /
 *   `twitter:` prefix removed; values are the tags' `content` attributes.
 */
function extractMetadata($) {
  // Reads every tag matched by `selector`, keyed by `keyAttr` minus `prefix`.
  const collect = (selector, keyAttr, prefix) => {
    const bag = {};
    $(selector).each((_, el) => {
      const $meta = $(el);
      bag[$meta.attr(keyAttr).replace(prefix, '')] = $meta.attr('content');
    });
    return bag;
  };

  return {
    title: $('title').text().trim(),
    description: $('meta[name="description"]').attr('content') || '',
    og: collect('meta[property^="og:"]', 'property', 'og:'),
    twitter: collect('meta[name^="twitter:"]', 'name', 'twitter:')
  };
}
@@ -0,0 +1,188 @@
1
+ /**
2
+ * recorder.js — Recording and replay support for scrape_with_actions.
3
+ *
4
+ * Responsibilities:
5
+ * - Validate recording names (path-traversal prevention)
6
+ * - Persist recorded action sequences to disk (atomic write)
7
+ * - Load saved recordings for replay
8
+ * - List available recordings
9
+ *
10
+ * The recordings directory is resolved from:
11
+ * 1. process.env.CRAWLFORGE_HOME_OVERRIDE (tests only)
12
+ * 2. os.homedir()
13
+ *
14
+ * File layout: <homeDir>/.crawlforge/recordings/<name>.json
15
+ */
16
+
17
+ import os from 'os';
18
+ import fs from 'fs/promises';
19
+ import path from 'path';
20
+
21
+ // Allow-list for recording names: 1-64 characters of [a-zA-Z0-9_-]; path separators and dots never match, so a valid name cannot traverse directories.
22
+ const VALID_NAME_RE = /^[a-zA-Z0-9_-]{1,64}$/;
23
+
24
/**
 * Resolve the base directory under which CrawlForge keeps its data.
 *
 * A non-empty CRAWLFORGE_HOME_OVERRIDE environment variable takes precedence
 * over the current user's home directory.
 *
 * @returns {string}
 */
function homeDir() {
  const override = process.env.CRAWLFORGE_HOME_OVERRIDE;
  if (override) {
    return override;
  }
  return os.homedir();
}
31
+
32
/**
 * Build the path of the directory that holds saved recordings:
 * `<homeDir>/.crawlforge/recordings`. The directory is not created here.
 *
 * @returns {string}
 */
function recordingsDir() {
  const segments = ['.crawlforge', 'recordings'];
  return path.join(homeDir(), ...segments);
}
39
+
40
/**
 * Ensure a recording name is safe to use as a file name.
 *
 * @param {string} name - Candidate recording name.
 * @returns {void}
 * @throws {Error} when `name` is not a string or does not match VALID_NAME_RE.
 */
export function validateRecordingName(name) {
  const isValid = typeof name === 'string' && VALID_NAME_RE.test(name);
  if (isValid) {
    return;
  }

  const reason = `Invalid recording name "${name}". `;
  const rule = 'Names must be 1–64 characters and contain only letters, digits, underscores, or hyphens.';
  throw new Error(reason + rule);
}
53
+
54
/**
 * Persist a recorded action sequence to `<recordingsDir>/<name>.json`.
 *
 * The payload is written to a uniquely named temporary file in the same
 * directory and then renamed over the target, so readers never observe a
 * half-written recording and concurrent saves of the same name cannot write
 * into each other's temporary file. If writing or renaming fails, the
 * temporary file is removed (best effort) before the error is rethrown.
 *
 * Stored object: `{ name, savedAt, ...meta, recordedActions }`. Because `meta`
 * is spread after `name`/`savedAt`, keys of the same name in `meta` override
 * them; `recordedActions` always comes from the argument.
 *
 * @param {string} name - Recording name (validated before anything is written)
 * @param {Array<Object>} recordedActions - Array of annotated action entries
 * @param {Object} [meta] - Optional metadata (original url, timestamp, etc.)
 * @returns {Promise<string>} Path of the written recording file
 * @throws {Error} if the name is invalid or a filesystem operation fails
 */
export async function saveRecording(name, recordedActions, meta = {}) {
  validateRecordingName(name);

  const dir = recordingsDir();
  await fs.mkdir(dir, { recursive: true });

  const filePath = path.join(dir, `${name}.json`);
  // Unique per call; still ends in ".tmp" and never in ".json", so listings
  // that look for "*.json" keep ignoring it.
  const uniqueSuffix = `${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}`;
  const tmpPath = `${filePath}.${uniqueSuffix}.tmp`;

  const payload = JSON.stringify(
    {
      name,
      savedAt: new Date().toISOString(),
      ...meta,
      recordedActions
    },
    null,
    2
  );

  try {
    await fs.writeFile(tmpPath, payload, 'utf8');
    await fs.rename(tmpPath, filePath);
  } catch (err) {
    try {
      await fs.rm(tmpPath, { force: true });
    } catch {
      // Best-effort cleanup only; the original failure below is what matters.
    }
    throw err;
  }

  return filePath;
}
88
+
89
/**
 * Load a saved recording from `<recordingsDir>/<name>.json`.
 *
 * @param {string} name - Recording name (validated before the file is read)
 * @returns {Promise<Object>} Parsed recording object (includes `recordedActions`)
 * @throws {Error} if the name is invalid, the file is missing or unreadable,
 *   the content is not valid JSON, or the parsed value is not an object with
 *   a `recordedActions` array. Wrapped errors carry the underlying error as
 *   `cause`.
 */
export async function loadRecording(name) {
  validateRecordingName(name);

  const filePath = path.join(recordingsDir(), `${name}.json`);

  let raw;
  try {
    raw = await fs.readFile(filePath, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      throw new Error(
        `Recording "${name}" not found. Use replayRecording: "__list__" to see available recordings.`,
        { cause: err }
      );
    }
    throw new Error(`Failed to read recording "${name}": ${err.message}`, { cause: err });
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    throw new Error(`Recording "${name}" is corrupted (invalid JSON): ${err.message}`, { cause: err });
  }

  // Valid JSON can still be `null` or a primitive; guard before touching a
  // property so such files get the format error instead of a raw TypeError.
  const isObject = parsed !== null && typeof parsed === 'object';
  if (!isObject || !Array.isArray(parsed.recordedActions)) {
    throw new Error(`Recording "${name}" has an unexpected format (missing recordedActions array).`);
  }

  return parsed;
}
124
+
125
/**
 * Enumerate the names of all saved recordings.
 *
 * A missing recordings directory is treated as "no recordings" rather than
 * an error.
 *
 * @returns {Promise<string[]>} Sorted recording names (file names without the
 *   `.json` extension)
 * @throws {Error} if the directory exists but cannot be read
 */
export async function listRecordings() {
  let entries;
  try {
    entries = await fs.readdir(recordingsDir());
  } catch (err) {
    if (err.code !== 'ENOENT') {
      throw new Error(`Failed to list recordings: ${err.message}`);
    }
    return [];
  }

  const names = [];
  for (const fileName of entries) {
    const isRecording = fileName.endsWith('.json') && !fileName.endsWith('.tmp');
    if (isRecording) {
      // Drop the trailing ".json" (5 characters).
      names.push(fileName.slice(0, -5));
    }
  }
  names.sort();
  return names;
}
148
+
149
/**
 * Create the entry stored in a recording for one executed action.
 *
 * The entry always carries `type` and `timestamp_ms_since_start`. Of the
 * remaining action properties, only a fixed list of fields is copied, and
 * only when the field is not `undefined` on the action.
 *
 * @param {Object} action - Original action object
 * @param {number} timestampMsSinceStart - ms since recording session started
 * @returns {Object} Recorded entry
 */
export function buildRecordedEntry(action, timestampMsSinceStart) {
  // Order matters only for the key order of the serialized entry.
  const replayFields = [
    'selector',
    'text',
    'key',
    'duration',
    'url',
    'value',
    'direction',
    'distance',
    'description'
  ];

  const entry = {
    type: action.type,
    timestamp_ms_since_start: timestampMsSinceStart
  };

  for (const field of replayFields) {
    if (action[field] !== undefined) {
      entry[field] = action[field];
    }
  }

  return entry;
}
177
+
178
/**
 * Turn a recorded entry back into an action object.
 *
 * Returns a shallow copy of the entry without the recording-only
 * `timestamp_ms_since_start` field; the entry itself is not modified.
 *
 * @param {Object} entry - Recorded entry
 * @returns {Object} Action object
 */
export function recordedEntryToAction(entry) {
  const action = { ...entry };
  delete action.timestamp_ms_since_start;
  return action;
}
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Shared HTTP fetch helper for basic tools.
3
+ * Applies an AbortController timeout and a default User-Agent.
4
+ */
5
+
6
/**
 * Fetch a URL, aborting the request when no response arrives within `timeout` ms.
 *
 * The timer is cleared as soon as `fetch` settles, so it bounds only the wait
 * for the response object; reading the body afterwards is not time-limited.
 *
 * @param {string} url
 * @param {{ timeout?: number, headers?: Record<string,string> }} [options]
 *   `timeout` defaults to 10000 ms; `headers` are merged over the default
 *   User-Agent, so a caller-supplied `User-Agent` wins.
 * @returns {Promise<Response>}
 * @throws {Error} `Request timeout after <timeout>ms` when the request was
 *   aborted (the original AbortError is attached as `cause`); any other fetch
 *   error is rethrown unchanged.
 */
export async function fetchWithTimeout(url, options = {}) {
  const { timeout = 10000, headers = {} } = options;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    return await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'CrawlForge/1.0.0',
        ...headers
      }
    });
  } catch (error) {
    if (error.name === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }
    throw error;
  } finally {
    // Runs on success and on failure, so the timer can never outlive the call.
    clearTimeout(timeoutId);
  }
}
@@ -0,0 +1,74 @@
1
+ /**
2
+ * extract_links — Extract all links from a webpage with optional filtering.
3
+ * Extracted from server.js inline handler.
4
+ */
5
+
6
+ import { load } from 'cheerio';
7
+ import { fetchWithTimeout } from './_fetch.js';
8
+
9
/**
 * extract_links handler: fetch a page and list its unique links.
 *
 * Resolution rules:
 * - `http://` / `https://` hrefs are reported exactly as written.
 * - Every other href is resolved against `base_url` when given, otherwise
 *   against the page URL itself, so path-relative links such as `next.html`
 *   land next to the page rather than at the site root.
 * - A link is external when it is an absolute http(s) or protocol-relative
 *   (`//host/...`) href whose origin differs from the page's origin.
 * - Hrefs that cannot be parsed as URLs are skipped.
 *
 * Links are de-duplicated by resolved href; the first occurrence wins.
 *
 * @param {{ url: string, filter_external?: boolean, base_url?: string }} params
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError?: boolean}>}
 *   On success `text` is a JSON document with `links`, `total_count`,
 *   `internal_count`, `external_count` and `base_url` (the given `base_url`
 *   or the page origin). Failures are returned with `isError: true`, never
 *   thrown.
 */
export async function extractLinksHandler({ url, filter_external, base_url }) {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = load(html);

    const pageUrl = new URL(url);
    const baseUrl = base_url || pageUrl.origin;
    const resolutionBase = base_url || pageUrl.href;

    // Keyed by resolved href: O(1) duplicate detection, insertion order kept.
    const linksByHref = new Map();

    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      if (!href) return;

      const text = $(element).text().trim();

      let resolved;
      try {
        resolved = new URL(href, resolutionBase);
      } catch {
        // skip invalid URLs
        return;
      }

      const isAbsoluteHttp = href.startsWith('http://') || href.startsWith('https://');
      const namesItsOwnHost = isAbsoluteHttp || href.startsWith('//');
      const isExternal = namesItsOwnHost && resolved.origin !== pageUrl.origin;

      if (filter_external && isExternal) return;

      const absoluteUrl = isAbsoluteHttp ? href : resolved.toString();
      if (!linksByHref.has(absoluteUrl)) {
        linksByHref.set(absoluteUrl, {
          href: absoluteUrl,
          text,
          is_external: isExternal,
          original_href: href
        });
      }
    });

    const uniqueLinks = [...linksByHref.values()];
    const externalCount = uniqueLinks.filter((link) => link.is_external).length;

    return {
      content: [{
        type: 'text',
        text: JSON.stringify({
          links: uniqueLinks,
          total_count: uniqueLinks.length,
          internal_count: uniqueLinks.length - externalCount,
          external_count: externalCount,
          base_url: baseUrl
        }, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to extract links: ${error.message}` }],
      isError: true
    };
  }
}
@@ -0,0 +1,74 @@
1
+ /**
2
+ * extract_metadata — Extract page metadata (title, description, OG tags, etc.).
3
+ * Extracted from server.js inline handler.
4
+ */
5
+
6
+ import { load } from 'cheerio';
7
+ import { fetchWithTimeout } from './_fetch.js';
8
+
9
/**
 * extract_metadata handler: fetch a page and report its metadata.
 *
 * Reported fields: title (falls back to the first `<h1>`), description
 * (falls back to `og:description`), keywords (split on commas), canonical
 * URL, author, robots, viewport, charset, Open Graph tags and Twitter tags.
 *
 * `charset` comes from `<meta charset>`. Otherwise it is the `charset=`
 * parameter of `<meta http-equiv="Content-Type">`, or that tag's whole
 * content value when it carries no charset parameter.
 *
 * @param {{ url: string }} params
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError?: boolean}>}
 *   On success `text` is a JSON document with the fields above plus the final
 *   response `url`. Failures are returned with `isError: true`, never thrown.
 */
export async function extractMetadataHandler({ url }) {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = load(html);

    // `content` attribute of the first match, or '' when absent/empty.
    const metaContent = (selector) => $(selector).attr('content') || '';

    // Collects <meta> tags whose `keyAttr` starts with `prefix`; tags lacking
    // either the key attribute or a content value are ignored.
    const collectPrefixed = (selector, keyAttr, prefix) => {
      const tags = {};
      $(selector).each((_, el) => {
        const key = $(el).attr(keyAttr);
        const content = $(el).attr('content');
        if (key && content) tags[key.replace(prefix, '')] = content;
      });
      return tags;
    };

    const title = $('title').text().trim() || $('h1').first().text().trim();
    const description =
      $('meta[name="description"]').attr('content') ||
      $('meta[property="og:description"]').attr('content') || '';
    const keywords = metaContent('meta[name="keywords"]');
    const canonical = $('link[rel="canonical"]').attr('href') || '';

    const ogTags = collectPrefixed('meta[property^="og:"]', 'property', 'og:');
    const twitterTags = collectPrefixed('meta[name^="twitter:"]', 'name', 'twitter:');

    const author = metaContent('meta[name="author"]');
    const robots = metaContent('meta[name="robots"]');
    const viewport = metaContent('meta[name="viewport"]');

    // The http-equiv form looks like "text/html; charset=utf-8"; report only
    // the charset token rather than the whole media type.
    const contentTypeMeta = metaContent('meta[http-equiv="Content-Type"]');
    const charsetParam = /charset\s*=\s*["']?([^\s;"']+)/i.exec(contentTypeMeta);
    const charset =
      $('meta[charset]').attr('charset') ||
      (charsetParam ? charsetParam[1] : contentTypeMeta);

    return {
      content: [{
        type: 'text',
        text: JSON.stringify({
          title,
          description,
          keywords: keywords.split(',').map(k => k.trim()).filter(Boolean),
          canonical_url: canonical,
          author,
          robots,
          viewport,
          charset,
          og_tags: ogTags,
          twitter_tags: twitterTags,
          url: response.url
        }, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to extract metadata: ${error.message}` }],
      isError: true
    };
  }
}
@@ -0,0 +1,46 @@
1
+ /**
2
+ * extract_text — Extract clean text content from HTML.
3
+ * Extracted from server.js inline handler.
4
+ */
5
+
6
+ import { load } from 'cheerio';
7
+ import { fetchWithTimeout } from './_fetch.js';
8
+
9
/**
 * extract_text handler: fetch a page and return its readable text.
 *
 * `<script>` and `<style>` elements are dropped unless the matching flag is
 * explicitly `false`. Navigation, header, footer, aside and common ad/sidebar
 * containers are always dropped. Whitespace runs in the remaining body text
 * are collapsed to single spaces.
 *
 * @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean }} params
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError?: boolean}>}
 *   On success `text` is a JSON document with `text`, `word_count`,
 *   `char_count` and the final response `url`. Failures are returned with
 *   `isError: true`, never thrown.
 */
export async function extractTextHandler({ url, remove_scripts, remove_styles }) {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    // Only an explicit `false` keeps scripts/styles; undefined means remove.
    const removable = [];
    if (remove_scripts !== false) removable.push('script');
    if (remove_styles !== false) removable.push('style');
    removable.push('nav, header, footer, aside, .advertisement, .ad, .sidebar');
    for (const selector of removable) {
      $(selector).remove();
    }

    const text = $('body').text().replace(/\s+/g, ' ').trim();
    const words = text.split(/\s+/).filter((word) => word.length > 0);

    const summary = {
      text,
      word_count: words.length,
      char_count: text.length,
      url: response.url
    };

    return {
      content: [{ type: 'text', text: JSON.stringify(summary, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to extract text: ${error.message}` }],
      isError: true
    };
  }
}
@@ -0,0 +1,44 @@
1
+ /**
2
+ * fetch_url — Basic URL fetching with headers and response handling.
3
+ * Extracted from server.js inline handler.
4
+ */
5
+
6
+ import { fetchWithTimeout } from './_fetch.js';
7
+
8
/**
 * fetch_url handler: fetch a URL and report the raw response.
 *
 * The HTTP status is not checked: non-2xx responses are reported like any
 * other, with their status, headers and body.
 *
 * @param {{ url: string, headers?: Record<string,string>, timeout?: number }} params
 *   A falsy `timeout` falls back to 10000 ms; falsy `headers` to `{}`.
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError?: boolean}>}
 *   On success `text` is a JSON document with `status`, `statusText`,
 *   `headers`, `body`, `contentType`, `size` (length of the body string) and
 *   the final `url`. Failures are returned with `isError: true`, never thrown.
 */
export async function fetchUrlHandler({ url, headers, timeout }) {
  try {
    const requestOptions = {
      timeout: timeout || 10000,
      headers: headers || {}
    };
    const response = await fetchWithTimeout(url, requestOptions);
    const body = await response.text();

    // Flatten the Headers collection into a plain object for serialization.
    const responseHeaders = {};
    for (const [key, value] of response.headers) {
      responseHeaders[key] = value;
    }

    const report = {
      status: response.status,
      statusText: response.statusText,
      headers: responseHeaders,
      body,
      contentType: response.headers.get('content-type') || 'unknown',
      size: body.length,
      url: response.url
    };

    return {
      content: [{ type: 'text', text: JSON.stringify(report, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to fetch URL: ${error.message}` }],
      isError: true
    };
  }
}
@@ -0,0 +1,58 @@
1
+ /**
2
+ * scrape_structured — Extract structured data using CSS selectors.
3
+ * Extracted from server.js inline handler.
4
+ */
5
+
6
+ import { load } from 'cheerio';
7
+ import { fetchWithTimeout } from './_fetch.js';
8
+
9
/**
 * scrape_structured handler: fetch a page and evaluate CSS selectors on it.
 *
 * Per field: `null` when nothing matches, a trimmed string for exactly one
 * match, an array of trimmed strings otherwise. A selector that throws is
 * reported as `{ error, message }` for that field only.
 *
 * @param {{ url: string, selectors: Record<string, string> }} params
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError?: boolean}>}
 *   On success `text` is a JSON document with `data`, `selectors_used`,
 *   `elements_found` (the number of fields in `data`, including null ones)
 *   and the final response `url`. Failures are returned with `isError: true`,
 *   never thrown.
 */
export async function scrapeStructuredHandler({ url, selectors }) {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    const readField = (selector) => {
      const matches = $(selector);
      switch (matches.length) {
        case 0:
          return null;
        case 1:
          return matches.text().trim();
        default:
          return matches.map((_, node) => $(node).text().trim()).get();
      }
    };

    const results = {};
    Object.entries(selectors).forEach(([fieldName, selector]) => {
      try {
        results[fieldName] = readField(selector);
      } catch (selectorError) {
        results[fieldName] = {
          error: `Invalid selector: ${selector}`,
          message: selectorError.message
        };
      }
    });

    const report = {
      data: results,
      selectors_used: selectors,
      elements_found: Object.keys(results).length,
      url: response.url
    };

    return {
      content: [{ type: 'text', text: JSON.stringify(report, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: 'text', text: `Failed to scrape structured data: ${error.message}` }],
      isError: true
    };
  }
}