pse-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,420 @@
1
+ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
2
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
3
+ import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
4
+ import { GoogleSearchService } from './services/google-search.service.js';
5
+ import { ContentExtractor } from './services/content-extractor.service.js';
6
/**
 * Single source of truth for the tools this server exposes.
 *
 * Each entry has the shape returned by the ListTools handler
 * (`name`, `description`, `inputSchema`). The same data, keyed by name, is
 * passed as `capabilities.tools` when the Server is constructed, so the two
 * views can never drift apart.
 */
const TOOL_DEFINITIONS = [
    {
        name: 'google_search',
        description: 'Search Google and return relevant results from the web. This tool finds web pages, articles, and information on specific topics using Google\'s search engine. Results include titles, snippets, and URLs that can be analyzed further using extract_webpage_content.',
        inputSchema: {
            type: 'object',
            properties: {
                query: {
                    type: 'string',
                    description: 'Search query - be specific and use quotes for exact matches. For best results, use clear keywords and avoid very long queries.'
                },
                num_results: {
                    type: 'number',
                    description: 'Number of results to return (default: 5, max: 10). Increase for broader coverage, decrease for faster response.'
                },
                site: {
                    type: 'string',
                    description: 'Limit search results to a specific website domain (e.g., "wikipedia.org" or "nytimes.com").'
                },
                language: {
                    type: 'string',
                    description: 'Filter results by language using ISO 639-1 codes (e.g., "en" for English, "es" for Spanish, "fr" for French).'
                },
                dateRestrict: {
                    type: 'string',
                    description: 'Filter results by date using Google\'s date restriction format: "d[number]" for past days, "w[number]" for past weeks, "m[number]" for past months, or "y[number]" for past years. Example: "m6" for results from the past 6 months.'
                },
                exactTerms: {
                    type: 'string',
                    description: 'Search for results that contain this exact phrase. This is equivalent to putting the terms in quotes in the search query.'
                },
                resultType: {
                    type: 'string',
                    description: 'Specify the type of results to return. Options include "image" (or "images"), "news", and "video" (or "videos"). Default is general web results.'
                },
                page: {
                    type: 'number',
                    description: 'Page number for paginated results (starts at 1). Use in combination with resultsPerPage to navigate through large result sets.'
                },
                resultsPerPage: {
                    type: 'number',
                    description: 'Number of results to show per page (default: 5, max: 10). Controls how many results are returned for each page.'
                },
                sort: {
                    type: 'string',
                    description: 'Sorting method for search results. Options: "relevance" (default) or "date" (most recent first).'
                }
            },
            required: ['query']
        }
    },
    {
        name: 'extract_webpage_content',
        description: 'Extract and analyze content from a webpage, converting it to readable text. This tool fetches the main content while removing ads, navigation elements, and other clutter. Use it to get detailed information from specific pages found via google_search. Works with most common webpage formats including articles, blogs, and documentation.',
        inputSchema: {
            type: 'object',
            properties: {
                url: {
                    type: 'string',
                    description: 'Full URL of the webpage to extract content from (must start with http:// or https://). Ensure the URL is from a public webpage and not behind authentication.'
                },
                format: {
                    type: 'string',
                    description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
                }
            },
            required: ['url']
        }
    },
    {
        name: 'extract_multiple_webpages',
        description: 'Extract and analyze content from multiple webpages in a single request. This tool is ideal for comparing information across different sources or gathering comprehensive information on a topic. Limited to 5 URLs per request to maintain performance.',
        inputSchema: {
            type: 'object',
            properties: {
                urls: {
                    type: 'array',
                    items: { type: 'string' },
                    description: 'Array of webpage URLs to extract content from. Each URL must be public and start with http:// or https://. Maximum 5 URLs per request.'
                },
                format: {
                    type: 'string',
                    description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
                }
            },
            required: ['urls']
        }
    }
];

/** Maximum number of URLs accepted by a single extract_multiple_webpages call. */
const MAX_BATCH_URLS = 5;

/** Number of preview characters shown per page in a batch response. */
const BATCH_PREVIEW_CHARS = 150;

/** True when `value` is a non-null object, i.e. usable as a tool-argument bag. */
function isArgumentObject(value) {
    return typeof value === 'object' && value !== null;
}

/** Coerces a truthy value to a string; falsy values become `undefined`. */
function optionalString(value) {
    return value ? String(value) : undefined;
}

/** Passes numbers through unchanged; every other type becomes `undefined`. */
function optionalNumber(value) {
    return typeof value === 'number' ? value : undefined;
}

/** Builds a successful tool result carrying a single text item. */
function textResult(text) {
    return { content: [{ type: 'text', text }] };
}

/** Builds a failed tool result (`isError: true`) carrying a single text item. */
function errorResult(text) {
    return { content: [{ type: 'text', text }], isError: true };
}

/**
 * MCP server exposing three tools: `google_search`,
 * `extract_webpage_content` and `extract_multiple_webpages`.
 *
 * Searching is delegated to `GoogleSearchService` and page extraction to
 * `ContentExtractor`; this class validates tool arguments, dispatches calls
 * and renders the service results as plain text.
 */
class GoogleSearchServer {
    /**
     * Creates the backing services and the MCP `Server`, then registers the
     * ListTools and CallTool request handlers.
     */
    constructor() {
        this.searchService = new GoogleSearchService();
        this.contentExtractor = new ContentExtractor();
        this.server = new Server({
            name: 'google-search',
            version: '1.0.0'
        }, {
            capabilities: {
                // Same catalogue as the ListTools response, keyed by tool name.
                tools: Object.fromEntries(TOOL_DEFINITIONS.map(({ name, ...definition }) => [name, definition]))
            }
        });
        // Register tool list handler
        this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
            tools: TOOL_DEFINITIONS
        }));
        // Register tool call handler
        this.server.setRequestHandler(CallToolRequestSchema, (request) => this.handleToolCall(request));
    }

    /**
     * Validates the arguments of a CallTool request and routes it to the
     * matching handler.
     *
     * @param {{params: {name: string, arguments?: unknown}}} request - Incoming CallTool request.
     * @returns {Promise<object>} The tool result produced by the handler.
     * @throws {Error} When the tool name is unknown or the required argument
     *   (`query`, `url` or an array `urls`) is missing.
     */
    async handleToolCall(request) {
        const { name, arguments: args } = request.params;
        switch (name) {
            case 'google_search':
                if (isArgumentObject(args) && 'query' in args) {
                    return this.handleSearch({
                        query: String(args.query),
                        num_results: optionalNumber(args.num_results),
                        filters: {
                            site: optionalString(args.site),
                            language: optionalString(args.language),
                            dateRestrict: optionalString(args.dateRestrict),
                            exactTerms: optionalString(args.exactTerms),
                            resultType: optionalString(args.resultType),
                            page: optionalNumber(args.page),
                            resultsPerPage: optionalNumber(args.resultsPerPage),
                            sort: optionalString(args.sort)
                        }
                    });
                }
                throw new Error('Invalid arguments for google_search tool');
            case 'extract_webpage_content':
                if (isArgumentObject(args) && 'url' in args) {
                    return this.handleAnalyzeWebpage({
                        url: String(args.url),
                        format: args.format ? String(args.format) : 'markdown'
                    });
                }
                throw new Error('Invalid arguments for extract_webpage_content tool');
            case 'extract_multiple_webpages':
                if (isArgumentObject(args) && 'urls' in args && Array.isArray(args.urls)) {
                    return this.handleBatchAnalyzeWebpages({
                        urls: args.urls.map(String),
                        format: args.format ? String(args.format) : 'markdown'
                    });
                }
                throw new Error('Invalid arguments for extract_multiple_webpages tool');
            default:
                throw new Error(`Unknown tool: ${name}`);
        }
    }

    /**
     * Runs a search and renders the results as a numbered text list with
     * optional category, pagination and navigation lines.
     *
     * @param {{query: string, num_results?: number, filters?: object}} args
     * @returns {Promise<object>} Text result; `isError: true` when the search
     *   throws or yields no results.
     */
    async handleSearch(args) {
        try {
            const { results, pagination, categories } = await this.searchService.search(args.query, args.num_results, args.filters);
            if (results.length === 0) {
                return errorResult('No results found. Try:\n- Using different keywords\n- Removing quotes from non-exact phrases\n- Using more general terms');
            }
            let responseText = `Search results for "${args.query}":\n\n`;
            // Add category summary if available
            if (categories && categories.length > 0) {
                const categorySummary = categories.map((c) => `${c.name} (${c.count})`).join(', ');
                responseText += `Categories: ${categorySummary}\n\n`;
            }
            // Add pagination info
            if (pagination) {
                responseText += `Showing page ${pagination.currentPage}${pagination.totalResults ? ` of approximately ${pagination.totalResults} results` : ''}\n\n`;
            }
            // Add each result in a readable format
            results.forEach((result, index) => {
                responseText += `${index + 1}. ${result.title}\n`;
                responseText += ` URL: ${result.link}\n`;
                responseText += ` ${result.snippet}\n\n`;
            });
            // Add navigation hints if pagination exists
            if (pagination && (pagination.hasNextPage || pagination.hasPreviousPage)) {
                responseText += 'Navigation: ';
                if (pagination.hasPreviousPage) {
                    responseText += `Use 'page: ${pagination.currentPage - 1}' for previous results. `;
                }
                if (pagination.hasNextPage) {
                    responseText += `Use 'page: ${pagination.currentPage + 1}' for more results.`;
                }
                responseText += '\n';
            }
            return textResult(responseText);
        }
        catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error during search';
            return errorResult(message);
        }
    }

    /**
     * Extracts one webpage and renders its title, description, stats, summary
     * and content preview as text.
     *
     * @param {{url: string, format: string}} args
     * @returns {Promise<object>} Text result; `isError: true` with
     *   troubleshooting hints when extraction throws.
     */
    async handleAnalyzeWebpage(args) {
        try {
            const content = await this.contentExtractor.extractContent(args.url, args.format);
            let responseText = `Content from: ${content.url}\n\n`;
            responseText += `Title: ${content.title}\n`;
            if (content.description) {
                responseText += `Description: ${content.description}\n`;
            }
            responseText += `\nStats: ${content.stats.word_count} words, ${content.stats.approximate_chars} characters\n\n`;
            // Add the summary if available
            if (content.summary) {
                responseText += `Summary: ${content.summary}\n\n`;
            }
            // Add a preview of the content
            responseText += `Content Preview:\n${content.content_preview.first_500_chars}\n\n`;
            // Add a note about requesting specific information
            responseText += `Note: This is a preview of the content. For specific information, please ask about particular aspects of this webpage.`;
            return textResult(responseText);
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
            const helpText = 'Common issues:\n- Check if the URL is accessible in a browser\n- Ensure the webpage is public\n- Try again if it\'s a temporary network issue';
            return errorResult(`${errorMessage}\n\n${helpText}`);
        }
    }

    /**
     * Extracts several webpages and renders a short per-URL report. Per-URL
     * failures (entries carrying an `error` property) are listed inline
     * instead of failing the whole call.
     *
     * @param {{urls: string[], format: string}} args
     * @returns {Promise<object>} Text result; `isError: true` when the URL
     *   list is empty, exceeds the batch limit, or extraction throws.
     */
    async handleBatchAnalyzeWebpages(args) {
        if (args.urls.length === 0) {
            return errorResult('At least one URL is required.');
        }
        if (args.urls.length > MAX_BATCH_URLS) {
            return errorResult('Maximum 5 URLs allowed per request to maintain performance. Please reduce the number of URLs.');
        }
        try {
            const results = await this.contentExtractor.batchExtractContent(args.urls, args.format);
            let responseText = `Content from ${args.urls.length} webpages:\n\n`;
            for (const [url, result] of Object.entries(results)) {
                responseText += `URL: ${url}\n`;
                if ('error' in result) {
                    responseText += `Error: ${result.error}\n\n`;
                    continue;
                }
                responseText += `Title: ${result.title}\n`;
                if (result.description) {
                    responseText += `Description: ${result.description}\n`;
                }
                responseText += `Stats: ${result.stats.word_count} words\n`;
                // Add summary if available
                if (result.summary) {
                    responseText += `Summary: ${result.summary}\n`;
                }
                // Only mark the preview as truncated when text was actually cut.
                const preview = result.content_preview.first_500_chars;
                const shortPreview = preview.length > BATCH_PREVIEW_CHARS
                    ? `${preview.substring(0, BATCH_PREVIEW_CHARS)}...`
                    : preview;
                responseText += `Preview: ${shortPreview}\n\n`;
            }
            responseText += `Note: These are previews of the content. To analyze the full content of a specific URL, use the extract_webpage_content tool with that URL.`;
            return textResult(responseText);
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
            const helpText = 'Common issues:\n- Check if all URLs are accessible in a browser\n- Ensure all webpages are public\n- Try again if it\'s a temporary network issue\n- Consider reducing the number of URLs';
            return errorResult(`${errorMessage}\n\n${helpText}`);
        }
    }

    /**
     * Connects the server to a stdio transport and installs a SIGINT handler
     * for shutdown. A startup failure is logged to stderr and the process
     * exits with code 1.
     */
    async start() {
        try {
            const transport = new StdioServerTransport();
            await this.server.connect(transport);
            console.error('Google Search MCP server running');
            // Exit only after close() has settled; exiting immediately would
            // abandon the pending close. `once` restores the default SIGINT
            // behaviour afterwards, so a second Ctrl-C still terminates the
            // process if close() hangs.
            process.once('SIGINT', () => {
                this.server.close()
                    .catch((error) => console.error(error))
                    .finally(() => process.exit(0));
            });
        }
        catch (error) {
            if (error instanceof Error) {
                console.error('Failed to start MCP server:', error.message);
            }
            else {
                console.error('Failed to start MCP server: Unknown error');
            }
            process.exit(1);
        }
    }
}
418
// Entry point: construct the MCP server and begin serving.
// Any rejection escaping start() is reported on stderr.
const server = new GoogleSearchServer();
server.start().catch((error) => {
    console.error(error);
});
@@ -0,0 +1,195 @@
1
+ import axios from 'axios';
2
+ import * as cheerio from 'cheerio';
3
+ import { Readability } from '@mozilla/readability';
4
+ import { JSDOM } from 'jsdom';
5
+ import MarkdownIt from 'markdown-it';
6
+ import TurndownService from 'turndown';
7
/** Maximum number of entries kept in the content cache. */
const CONTENT_CACHE_MAX_ENTRIES = 50;

/** URL schemes the extractor is willing to fetch. */
const FETCHABLE_PROTOCOLS = ['http:', 'https:'];

/**
 * Fetches webpages and converts their main content to markdown, HTML or
 * plain text, with a small in-memory cache keyed by URL and format.
 */
export class ContentExtractor {
    constructor() {
        // Cache for webpage content (key: url + format, value: { timestamp, content })
        this.contentCache = new Map();
        // Cache expiration time in milliseconds (30 minutes)
        this.cacheTTL = 30 * 60 * 1000;
        this.md = new MarkdownIt();
        this.turndownService = new TurndownService({
            headingStyle: 'atx',
            codeBlockStyle: 'fenced'
        });
    }

    /**
     * Collapses runs of three or more line breaks (with optional whitespace
     * between them) to one blank line, collapses runs of spaces to a single
     * space, and trims the result.
     * @param {string} text
     * @returns {string}
     */
    cleanText(text) {
        // Remove multiple blank lines
        text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
        // Remove excessive spaces
        text = text.replace(/ +/g, ' ');
        return text.trim();
    }

    /**
     * Applies `cleanText` and makes sure heading markers are followed by a
     * space.
     *
     * Only `#` runs at the start of a line are touched, so URL fragments
     * (`page#section`) and anchor links (`[top](#top)`) are left intact.
     * @param {string} text
     * @returns {string}
     */
    cleanMarkdown(text) {
        const cleanedText = this.cleanText(text);
        // Ensure headers have space after #
        return cleanedText.replace(/^( {0,3}#{1,6})(?=[A-Za-z0-9])/gm, '$1 ');
    }

    /**
     * Converts HTML to cleaned markdown using the configured Turndown service.
     * @param {string} html
     * @returns {string}
     */
    htmlToMarkdown(html) {
        return this.cleanMarkdown(this.turndownService.turndown(html));
    }

    /**
     * Converts HTML to cleaned plain text via the body's `textContent`.
     * @param {string} html
     * @returns {string}
     */
    htmlToPlainText(html) {
        const dom = new JSDOM(html);
        return this.cleanText(dom.window.document.body.textContent || '');
    }

    /**
     * Returns true only for parseable absolute URLs using http or https, the
     * schemes the tool descriptions promise to accept.
     * @param {string} url
     * @returns {boolean}
     */
    isValidUrl(url) {
        try {
            const { protocol } = new URL(url);
            return FETCHABLE_PROTOCOLS.includes(protocol);
        }
        catch {
            return false;
        }
    }

    /**
     * Generate a cache key from URL and format
     */
    generateCacheKey(url, format) {
        return `${url}|${format}`;
    }

    /**
     * Check if a cache entry is still valid
     */
    isCacheValid(entry) {
        const now = Date.now();
        return now - entry.timestamp < this.cacheTTL;
    }

    /**
     * Store webpage content in cache, evicting the oldest entry once the
     * cache holds more than the maximum number of entries.
     */
    cacheContent(url, format, content) {
        const cacheKey = this.generateCacheKey(url, format);
        // Delete before set so a refreshed key moves to the end of the Map's
        // insertion order; the first key is then always the oldest entry.
        this.contentCache.delete(cacheKey);
        this.contentCache.set(cacheKey, {
            timestamp: Date.now(),
            content
        });
        // Limit cache size to prevent memory issues
        while (this.contentCache.size > CONTENT_CACHE_MAX_ENTRIES) {
            const oldestKey = this.contentCache.keys().next().value;
            this.contentCache.delete(oldestKey);
        }
    }

    /**
     * Generates a concise summary of the content by taking leading sentences
     * while they fit within `maxLength`.
     *
     * When even the first sentence is longer than `maxLength`, the content is
     * hard-truncated instead of producing an empty summary. A trailing `...`
     * is added whenever part of the content was left out.
     * @param {string} content The content to summarize
     * @param {number} [maxLength=300] Maximum length of the summary
     * @returns {string} A summary of the content
     */
    generateSummary(content, maxLength = 300) {
        const sentences = content.split(/(?<=[.!?])\s+/);
        const kept = [];
        // Running length of the kept sentences, each followed by one space.
        let usedLength = 0;
        for (const sentence of sentences) {
            if (usedLength + sentence.length > maxLength) {
                break;
            }
            kept.push(sentence);
            usedLength += sentence.length + 1;
        }
        if (kept.length === 0) {
            return `${content.slice(0, maxLength).trimEnd()}...`;
        }
        const summary = kept.join(' ').trim();
        return kept.length < sentences.length ? `${summary}...` : summary;
    }

    /**
     * Fetches a webpage and extracts its main content in the requested format.
     *
     * Fresh cache entries are returned without fetching; stale ones are
     * dropped. Formats other than 'html' and 'text' are rendered as markdown.
     * @param {string} url Absolute http(s) URL of the page
     * @param {string} [format='markdown'] 'markdown', 'html' or 'text'
     * @returns {Promise<object>} Object with url, title, description, content,
     *   format, meta_tags, stats, content_preview and summary
     * @throws {Error} If the URL is invalid, the fetch fails, or Readability
     *   cannot parse the page
     */
    async extractContent(url, format = 'markdown') {
        if (!this.isValidUrl(url)) {
            throw new Error('Invalid URL provided');
        }
        // Check cache first
        const cacheKey = this.generateCacheKey(url, format);
        const cachedContent = this.contentCache.get(cacheKey);
        if (cachedContent) {
            if (this.isCacheValid(cachedContent)) {
                console.error(`Using cached content for ${url}`);
                return cachedContent.content;
            }
            // Expired: drop it so stale pages do not occupy cache slots.
            this.contentCache.delete(cacheKey);
        }
        try {
            // Fetch webpage content
            const response = await axios.get(url, {
                headers: {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                },
                timeout: 10000
            });
            // Parse with Cheerio for metadata
            const $ = cheerio.load(response.data);
            const metaTags = {};
            // Only extract the most important meta tags to reduce data volume
            const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];
            $('meta').each((_, element) => {
                const name = $(element).attr('name') || $(element).attr('property') || '';
                const content = $(element).attr('content') || '';
                if (name && content && importantMetaTags.some((tag) => name.includes(tag))) {
                    metaTags[name] = content;
                }
            });
            // Use Readability for main content extraction. Passing the page
            // URL gives the document a base URI so relative links resolve.
            const dom = new JSDOM(response.data, { url });
            const reader = new Readability(dom.window.document);
            const article = reader.parse();
            if (!article) {
                throw new Error('Failed to extract content from webpage');
            }
            // Convert content based on requested format
            let contentStr;
            switch (format) {
                case 'html':
                    contentStr = article.content || '';
                    break;
                case 'text':
                    contentStr = this.htmlToPlainText(article.content || '');
                    break;
                case 'markdown':
                default:
                    contentStr = this.htmlToMarkdown(article.content || '');
                    break;
            }
            // Calculate content stats
            const wordCount = contentStr.split(/\s+/).filter((word) => word.length > 0).length;
            // Generate a summary of the content
            const summary = this.generateSummary(contentStr);
            // Use only the first <title>: pages with inline SVG can contain
            // several, and .text() on the full match would concatenate them.
            const pageTitle = $('title').first().text().trim();
            const content = {
                url,
                title: pageTitle || article.title || '',
                description: metaTags['description'] || '',
                content: contentStr,
                format: format,
                meta_tags: metaTags,
                stats: {
                    word_count: wordCount,
                    approximate_chars: contentStr.length
                },
                content_preview: {
                    first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
                },
                summary: summary
            };
            // Cache the content before returning
            this.cacheContent(url, format, content);
            return content;
        }
        catch (error) {
            if (axios.isAxiosError(error)) {
                throw new Error(`Failed to fetch webpage: ${error.message}`, { cause: error });
            }
            throw error;
        }
    }

    /**
     * Extracts several pages concurrently. A failure for one URL is recorded
     * as `{ error: message }` under that URL instead of rejecting the batch.
     * @param {string[]} urls
     * @param {string} [format='markdown']
     * @returns {Promise<Object<string, object>>} Results keyed by URL
     */
    async batchExtractContent(urls, format = 'markdown') {
        const results = {};
        await Promise.all(urls.map(async (url) => {
            try {
                results[url] = await this.extractContent(url, format);
            }
            catch (error) {
                results[url] = {
                    error: error instanceof Error ? error.message : 'Unknown error occurred'
                };
            }
        }));
        return results;
    }
}