pse-mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/GEMINI.md +72 -0
- package/License.md +3 -0
- package/MCP Documents/README.md +1 -0
- package/MCP Documents/mcp-client-guide.txt +736 -0
- package/MCP Documents/mcp-complete-guide.txt +522 -0
- package/MCP Documents/mcp-enhanced-instructions.md +297 -0
- package/MCP Documents/mcp-server-guide.md +415 -0
- package/MCP Documents/mcp-windows.txt +161 -0
- package/QWEN.md +207 -0
- package/README.md +220 -0
- package/dist/content-fetcher.js +36 -0
- package/dist/google-search.js +421 -0
- package/dist/services/content-extractor.service.js +195 -0
- package/dist/services/google-search.service.js +244 -0
- package/dist/types.js +1 -0
- package/dist-package/README.md +210 -0
- package/dist-package/dist/content-fetcher.js +36 -0
- package/dist-package/dist/google-search.js +420 -0
- package/dist-package/dist/services/content-extractor.service.js +195 -0
- package/dist-package/dist/services/google-search.service.js +244 -0
- package/dist-package/dist/types.js +1 -0
- package/dist-package/package-lock.json +3104 -0
- package/dist-package/package.json +23 -0
- package/license +4 -0
- package/package.json +40 -0
- package/src/google-search.ts +477 -0
- package/src/mcp.d.ts +36 -0
- package/src/services/content-extractor.service.ts +232 -0
- package/src/services/google-search.service.ts +305 -0
- package/src/types.ts +64 -0
- package/tasks.md +141 -0
- package/tsconfig.json +16 -0
@@ -0,0 +1,420 @@
|
|
1
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
2
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
3
|
+
import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
|
4
|
+
import { GoogleSearchService } from './services/google-search.service.js';
|
5
|
+
import { ContentExtractor } from './services/content-extractor.service.js';
|
6
|
+
/**
 * MCP server that exposes three tools over stdio:
 *  - google_search: runs a query through GoogleSearchService
 *  - extract_webpage_content: extracts one page through ContentExtractor
 *  - extract_multiple_webpages: extracts up to 5 pages in one request
 *
 * Tool handlers never throw for downstream failures; they return an MCP tool
 * result with `isError: true` and a human-readable message instead. Only
 * malformed requests (unknown tool, missing required argument) throw.
 */
class GoogleSearchServer {
    /**
     * Single source of truth for the tool metadata. The same definitions are
     * advertised both in the server capabilities and by the ListTools handler,
     * so they are built in one place to keep the two from drifting apart.
     *
     * @returns {Array<{name: string, description: string, inputSchema: object}>}
     *   A freshly built list (callers may mutate it safely).
     */
    static getToolDefinitions() {
        return [
            {
                name: 'google_search',
                description: 'Search Google and return relevant results from the web. This tool finds web pages, articles, and information on specific topics using Google\'s search engine. Results include titles, snippets, and URLs that can be analyzed further using extract_webpage_content.',
                inputSchema: {
                    type: 'object',
                    properties: {
                        query: {
                            type: 'string',
                            description: 'Search query - be specific and use quotes for exact matches. For best results, use clear keywords and avoid very long queries.'
                        },
                        num_results: {
                            type: 'number',
                            description: 'Number of results to return (default: 5, max: 10). Increase for broader coverage, decrease for faster response.'
                        },
                        site: {
                            type: 'string',
                            description: 'Limit search results to a specific website domain (e.g., "wikipedia.org" or "nytimes.com").'
                        },
                        language: {
                            type: 'string',
                            description: 'Filter results by language using ISO 639-1 codes (e.g., "en" for English, "es" for Spanish, "fr" for French).'
                        },
                        dateRestrict: {
                            type: 'string',
                            description: 'Filter results by date using Google\'s date restriction format: "d[number]" for past days, "w[number]" for past weeks, "m[number]" for past months, or "y[number]" for past years. Example: "m6" for results from the past 6 months.'
                        },
                        exactTerms: {
                            type: 'string',
                            description: 'Search for results that contain this exact phrase. This is equivalent to putting the terms in quotes in the search query.'
                        },
                        resultType: {
                            type: 'string',
                            description: 'Specify the type of results to return. Options include "image" (or "images"), "news", and "video" (or "videos"). Default is general web results.'
                        },
                        page: {
                            type: 'number',
                            description: 'Page number for paginated results (starts at 1). Use in combination with resultsPerPage to navigate through large result sets.'
                        },
                        resultsPerPage: {
                            type: 'number',
                            description: 'Number of results to show per page (default: 5, max: 10). Controls how many results are returned for each page.'
                        },
                        sort: {
                            type: 'string',
                            description: 'Sorting method for search results. Options: "relevance" (default) or "date" (most recent first).'
                        }
                    },
                    required: ['query']
                }
            },
            {
                name: 'extract_webpage_content',
                description: 'Extract and analyze content from a webpage, converting it to readable text. This tool fetches the main content while removing ads, navigation elements, and other clutter. Use it to get detailed information from specific pages found via google_search. Works with most common webpage formats including articles, blogs, and documentation.',
                inputSchema: {
                    type: 'object',
                    properties: {
                        url: {
                            type: 'string',
                            description: 'Full URL of the webpage to extract content from (must start with http:// or https://). Ensure the URL is from a public webpage and not behind authentication.'
                        },
                        format: {
                            type: 'string',
                            description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
                        }
                    },
                    required: ['url']
                }
            },
            {
                name: 'extract_multiple_webpages',
                description: 'Extract and analyze content from multiple webpages in a single request. This tool is ideal for comparing information across different sources or gathering comprehensive information on a topic. Limited to 5 URLs per request to maintain performance.',
                inputSchema: {
                    type: 'object',
                    properties: {
                        urls: {
                            type: 'array',
                            items: { type: 'string' },
                            description: 'Array of webpage URLs to extract content from. Each URL must be public and start with http:// or https://. Maximum 5 URLs per request.'
                        },
                        format: {
                            type: 'string',
                            description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
                        }
                    },
                    required: ['urls']
                }
            }
        ];
    }

    /**
     * Creates the backing services and the MCP server, then registers the
     * ListTools and CallTool request handlers.
     */
    constructor() {
        this.searchService = new GoogleSearchService();
        this.contentExtractor = new ContentExtractor();

        // Capabilities keep their historical shape: an object keyed by tool name.
        const toolCapabilities = {};
        for (const { name, description, inputSchema } of GoogleSearchServer.getToolDefinitions()) {
            toolCapabilities[name] = { description, inputSchema };
        }

        this.server = new Server({
            name: 'google-search',
            version: '1.0.0'
        }, {
            capabilities: {
                tools: toolCapabilities
            }
        });

        // Register tool list handler
        this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
            tools: GoogleSearchServer.getToolDefinitions()
        }));

        // Register tool call handler
        this.server.setRequestHandler(CallToolRequestSchema, async (request) => this.handleToolCall(request));
    }

    /**
     * Validates a CallTool request and routes it to the matching handler.
     *
     * @param {{params: {name: string, arguments?: object}}} request Incoming CallTool request.
     * @returns {Promise<object>} The MCP tool result produced by the handler.
     * @throws {Error} When the tool name is unknown or a required argument is
     *   missing, null, or blank.
     */
    async handleToolCall(request) {
        const { name, arguments: rawArgs } = request.params;
        const args = typeof rawArgs === 'object' && rawArgs !== null ? rawArgs : null;

        // A required value that is absent/blank used to be coerced to the
        // literal string "undefined" and sent downstream; reject it up front.
        const hasText = (value) => value !== undefined && value !== null && String(value).trim() !== '';
        const optionalString = (value) => (value ? String(value) : undefined);
        const optionalNumber = (value) => (typeof value === 'number' && Number.isFinite(value) ? value : undefined);
        const outputFormat = (value) => (value ? String(value) : 'markdown');

        switch (name) {
            case 'google_search':
                if (args === null || !hasText(args.query)) {
                    throw new Error('Invalid arguments for google_search tool');
                }
                return this.handleSearch({
                    query: String(args.query),
                    num_results: optionalNumber(args.num_results),
                    filters: {
                        site: optionalString(args.site),
                        language: optionalString(args.language),
                        dateRestrict: optionalString(args.dateRestrict),
                        exactTerms: optionalString(args.exactTerms),
                        resultType: optionalString(args.resultType),
                        page: optionalNumber(args.page),
                        resultsPerPage: optionalNumber(args.resultsPerPage),
                        sort: optionalString(args.sort)
                    }
                });
            case 'extract_webpage_content':
                if (args === null || !hasText(args.url)) {
                    throw new Error('Invalid arguments for extract_webpage_content tool');
                }
                return this.handleAnalyzeWebpage({
                    url: String(args.url),
                    format: outputFormat(args.format)
                });
            case 'extract_multiple_webpages':
                if (args === null || !Array.isArray(args.urls)) {
                    throw new Error('Invalid arguments for extract_multiple_webpages tool');
                }
                return this.handleBatchAnalyzeWebpages({
                    urls: args.urls.map(String),
                    format: outputFormat(args.format)
                });
            default:
                throw new Error(`Unknown tool: ${name}`);
        }
    }

    /**
     * Runs a search and renders the results as plain text.
     *
     * @param {{query: string, num_results?: number, filters?: object}} args Search arguments.
     * @returns {Promise<object>} MCP tool result. `isError` is set when the
     *   search fails or (for backward compatibility) when it returns no results.
     */
    async handleSearch(args) {
        try {
            const { results, pagination, categories } = await this.searchService.search(args.query, args.num_results, args.filters);
            if (results.length === 0) {
                return {
                    content: [{
                        type: 'text',
                        text: 'No results found. Try:\n- Using different keywords\n- Removing quotes from non-exact phrases\n- Using more general terms'
                    }],
                    isError: true
                };
            }

            const parts = [`Search results for "${args.query}":\n\n`];

            // Category summary, when the service provides one.
            if (categories && categories.length > 0) {
                const summary = categories.map((c) => `${c.name} (${c.count})`).join(', ');
                parts.push(`Categories: ${summary}\n\n`);
            }

            // Pagination position.
            if (pagination) {
                const total = pagination.totalResults ? ` of approximately ${pagination.totalResults} results` : '';
                parts.push(`Showing page ${pagination.currentPage}${total}\n\n`);
            }

            // One numbered entry per result.
            results.forEach((result, index) => {
                parts.push(`${index + 1}. ${result.title}\n`);
                parts.push(` URL: ${result.link}\n`);
                parts.push(` ${result.snippet}\n\n`);
            });

            // Hints telling the client which `page` value to request next.
            if (pagination && (pagination.hasNextPage || pagination.hasPreviousPage)) {
                parts.push('Navigation: ');
                if (pagination.hasPreviousPage) {
                    parts.push(`Use 'page: ${pagination.currentPage - 1}' for previous results. `);
                }
                if (pagination.hasNextPage) {
                    parts.push(`Use 'page: ${pagination.currentPage + 1}' for more results.`);
                }
                parts.push('\n');
            }

            return {
                content: [
                    {
                        type: 'text',
                        text: parts.join(''),
                    },
                ],
            };
        }
        catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error during search';
            return {
                content: [{ type: 'text', text: message }],
                isError: true
            };
        }
    }

    /**
     * Extracts a single webpage and renders a text preview of it.
     *
     * @param {{url: string, format: string}} args Target URL and output format.
     * @returns {Promise<object>} MCP tool result; `isError` is set on failure.
     */
    async handleAnalyzeWebpage(args) {
        try {
            const content = await this.contentExtractor.extractContent(args.url, args.format);

            let responseText = `Content from: ${content.url}\n\n`;
            responseText += `Title: ${content.title}\n`;
            if (content.description) {
                responseText += `Description: ${content.description}\n`;
            }
            responseText += `\nStats: ${content.stats.word_count} words, ${content.stats.approximate_chars} characters\n\n`;
            if (content.summary) {
                responseText += `Summary: ${content.summary}\n\n`;
            }
            responseText += `Content Preview:\n${content.content_preview.first_500_chars}\n\n`;
            responseText += `Note: This is a preview of the content. For specific information, please ask about particular aspects of this webpage.`;

            return {
                content: [
                    {
                        type: 'text',
                        text: responseText,
                    },
                ],
            };
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
            const helpText = 'Common issues:\n- Check if the URL is accessible in a browser\n- Ensure the webpage is public\n- Try again if it\'s a temporary network issue';
            return {
                content: [
                    {
                        type: 'text',
                        text: `${errorMessage}\n\n${helpText}`,
                    },
                ],
                isError: true,
            };
        }
    }

    /**
     * Extracts several webpages and renders a short preview of each.
     *
     * @param {{urls: string[], format: string}} args URLs (1 to 5) and output format.
     * @returns {Promise<object>} MCP tool result; `isError` is set when the URL
     *   list is empty, exceeds 5 entries, or the batch extraction throws.
     *   Per-URL failures are reported inline and do not set `isError`.
     */
    async handleBatchAnalyzeWebpages(args) {
        if (args.urls.length === 0) {
            return {
                content: [{
                    type: 'text',
                    text: 'No URLs provided. Please supply between 1 and 5 URLs.'
                }],
                isError: true
            };
        }
        if (args.urls.length > 5) {
            return {
                content: [{
                    type: 'text',
                    text: 'Maximum 5 URLs allowed per request to maintain performance. Please reduce the number of URLs.'
                }],
                isError: true
            };
        }
        try {
            const results = await this.contentExtractor.batchExtractContent(args.urls, args.format);

            let responseText = `Content from ${args.urls.length} webpages:\n\n`;
            for (const [url, result] of Object.entries(results)) {
                responseText += `URL: ${url}\n`;
                if ('error' in result) {
                    responseText += `Error: ${result.error}\n\n`;
                    continue;
                }
                responseText += `Title: ${result.title}\n`;
                if (result.description) {
                    responseText += `Description: ${result.description}\n`;
                }
                responseText += `Stats: ${result.stats.word_count} words\n`;
                if (result.summary) {
                    responseText += `Summary: ${result.summary}\n`;
                }
                // Batch output shows only the first 150 characters of each preview.
                responseText += `Preview: ${result.content_preview.first_500_chars.substring(0, 150)}...\n\n`;
            }
            responseText += `Note: These are previews of the content. To analyze the full content of a specific URL, use the extract_webpage_content tool with that URL.`;

            return {
                content: [
                    {
                        type: 'text',
                        text: responseText,
                    },
                ],
            };
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
            const helpText = 'Common issues:\n- Check if all URLs are accessible in a browser\n- Ensure all webpages are public\n- Try again if it\'s a temporary network issue\n- Consider reducing the number of URLs';
            return {
                content: [
                    {
                        type: 'text',
                        text: `${errorMessage}\n\n${helpText}`,
                    },
                ],
                isError: true,
            };
        }
    }

    /**
     * Connects the server to a stdio transport and installs a SIGINT handler.
     * Exits the process with status 1 if the connection cannot be established.
     *
     * @returns {Promise<void>}
     */
    async start() {
        try {
            const transport = new StdioServerTransport();
            await this.server.connect(transport);
            // stdout carries the protocol stream, so status goes to stderr.
            console.error('Google Search MCP server running');
            process.on('SIGINT', () => {
                // Exit only after close() has settled; exiting immediately
                // would abandon the in-flight shutdown.
                this.server.close()
                    .catch(console.error)
                    .finally(() => process.exit(0));
            });
        }
        catch (error) {
            if (error instanceof Error) {
                console.error('Failed to start MCP server:', error.message);
            }
            else {
                console.error('Failed to start MCP server: Unknown error');
            }
            process.exit(1);
        }
    }
}
|
418
|
+
// Entry point: construct the MCP server and begin serving over stdio.
// Any rejection that escapes start() is written to stderr.
const googleSearchServer = new GoogleSearchServer();
googleSearchServer.start().catch((startupError) => console.error(startupError));
|
@@ -0,0 +1,195 @@
|
|
1
|
+
import axios from 'axios';
|
2
|
+
import * as cheerio from 'cheerio';
|
3
|
+
import { Readability } from '@mozilla/readability';
|
4
|
+
import { JSDOM } from 'jsdom';
|
5
|
+
import MarkdownIt from 'markdown-it';
|
6
|
+
import TurndownService from 'turndown';
|
7
|
+
/**
 * Fetches webpages and extracts their main content as markdown, HTML, or
 * plain text, together with a few metadata fields, basic stats, a short
 * summary, and a 500-character preview.
 *
 * Results are memoized in an in-memory cache keyed by URL and output format.
 * Entries expire after `cacheTTL` milliseconds and the cache is capped at 50
 * entries.
 */
export class ContentExtractor {
    constructor() {
        // Cache for webpage content (key: url + format, value: { timestamp, content })
        this.contentCache = new Map();
        // Cache expiration time in milliseconds (30 minutes)
        this.cacheTTL = 30 * 60 * 1000;
        this.md = new MarkdownIt();
        this.turndownService = new TurndownService({
            headingStyle: 'atx',
            codeBlockStyle: 'fenced'
        });
    }

    /**
     * Collapses runs of three or more line breaks into one blank line,
     * collapses repeated spaces, and trims the result.
     *
     * @param {string} text Raw text.
     * @returns {string} Normalized text.
     */
    cleanText(text) {
        return text
            .replace(/\n\s*\n\s*\n/g, '\n\n')
            .replace(/ +/g, ' ')
            .trim();
    }

    /**
     * Applies cleanText and makes sure ATX headings have a space after the
     * leading `#` run.
     *
     * Only `#` runs at the start of a line are touched. Rewriting every `#`
     * would corrupt URL fragments inside links (e.g. `page#section`).
     *
     * @param {string} text Markdown text.
     * @returns {string} Cleaned markdown.
     */
    cleanMarkdown(text) {
        return this.cleanText(text).replace(/^(#{1,6})([A-Za-z0-9])/gm, '$1 $2');
    }

    /**
     * Converts an HTML fragment to cleaned markdown via Turndown.
     *
     * @param {string} html HTML markup.
     * @returns {string} Markdown text.
     */
    htmlToMarkdown(html) {
        return this.cleanMarkdown(this.turndownService.turndown(html));
    }

    /**
     * Converts an HTML fragment to cleaned plain text.
     *
     * @param {string} html HTML markup.
     * @returns {string} The text content of the parsed body.
     */
    htmlToPlainText(html) {
        const dom = new JSDOM(html);
        return this.cleanText(dom.window.document.body.textContent || '');
    }

    /**
     * Reports whether `url` parses as an absolute http(s) URL. Other schemes
     * (file:, ftp:, javascript:, ...) are rejected because the tools only
     * promise to fetch public web pages.
     *
     * @param {string} url Candidate URL.
     * @returns {boolean} True for a well-formed http:// or https:// URL.
     */
    isValidUrl(url) {
        try {
            const { protocol } = new URL(url);
            return protocol === 'http:' || protocol === 'https:';
        }
        catch {
            return false;
        }
    }

    /**
     * Generate a cache key from URL and format
     *
     * @param {string} url Page URL.
     * @param {string} format Output format.
     * @returns {string} `${url}|${format}`.
     */
    generateCacheKey(url, format) {
        return `${url}|${format}`;
    }

    /**
     * Check if a cache entry is still valid
     *
     * @param {{timestamp: number}} entry Cache entry.
     * @returns {boolean} True while the entry is younger than `cacheTTL`.
     */
    isCacheValid(entry) {
        return Date.now() - entry.timestamp < this.cacheTTL;
    }

    /**
     * Store webpage content in cache, evicting the oldest entries once the
     * cache holds more than 50.
     *
     * @param {string} url Page URL.
     * @param {string} format Output format.
     * @param {object} content Extracted content to store.
     */
    cacheContent(url, format, content) {
        const maxEntries = 50;
        const cacheKey = this.generateCacheKey(url, format);
        // Remove first so a refreshed key moves to the end: Map iteration
        // order then equals age order and eviction needs no sorting.
        this.contentCache.delete(cacheKey);
        this.contentCache.set(cacheKey, {
            timestamp: Date.now(),
            content
        });
        while (this.contentCache.size > maxEntries) {
            const oldestKey = this.contentCache.keys().next().value;
            this.contentCache.delete(oldestKey);
        }
    }

    /**
     * Generates a concise summary of the content by taking leading sentences
     * while they fit within `maxLength`.
     *
     * If even the first sentence is too long, the summary falls back to the
     * first `maxLength` characters. An ellipsis is appended only when part of
     * the content was actually left out.
     *
     * @param {string} content The content to summarize
     * @param {number} [maxLength=300] Maximum length of the summary (excluding the ellipsis)
     * @returns {string} A summary of the content
     */
    generateSummary(content, maxLength = 300) {
        const sentences = content.split(/(?<=[.!?])\s+/);
        const kept = [];
        let usedLength = 0;
        for (const sentence of sentences) {
            if (usedLength + sentence.length > maxLength) {
                break;
            }
            kept.push(sentence);
            // +1 accounts for the separating space.
            usedLength += sentence.length + 1;
        }
        if (kept.length === 0) {
            // First sentence alone exceeds the limit: hard-truncate rather
            // than returning a bare ellipsis.
            return `${content.slice(0, maxLength).trimEnd()}...`;
        }
        const wasTruncated = kept.length < sentences.length;
        return kept.join(' ').trim() + (wasTruncated ? '...' : '');
    }

    /**
     * Fetches `url` and extracts its main content.
     *
     * @param {string} url Absolute http(s) URL of the page.
     * @param {string} [format='markdown'] 'markdown', 'html', or 'text'; any
     *   other value is treated as 'markdown'.
     * @returns {Promise<object>} Object with url, title, description, content,
     *   format, meta_tags, stats, content_preview, and summary.
     * @throws {Error} 'Invalid URL provided' for a malformed or non-http(s) URL,
     *   'Failed to fetch webpage: ...' on HTTP/network errors, and
     *   'Failed to extract content from webpage' when Readability finds no article.
     */
    async extractContent(url, format = 'markdown') {
        if (!this.isValidUrl(url)) {
            throw new Error('Invalid URL provided');
        }
        // Unknown formats are rendered as markdown, so report and cache them
        // as markdown instead of under the unrecognized name.
        const outputFormat = format === 'html' || format === 'text' ? format : 'markdown';

        // Check cache first
        const cacheKey = this.generateCacheKey(url, outputFormat);
        const cachedContent = this.contentCache.get(cacheKey);
        if (cachedContent) {
            if (this.isCacheValid(cachedContent)) {
                console.error(`Using cached content for ${url}`);
                return cachedContent.content;
            }
            // Drop the expired entry so it does not linger until eviction.
            this.contentCache.delete(cacheKey);
        }

        try {
            // Fetch webpage content
            const response = await axios.get(url, {
                headers: {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                },
                timeout: 10000
            });

            // Parse with Cheerio for metadata
            const $ = cheerio.load(response.data);
            const metaTags = {};
            // Only extract the most important meta tags to reduce data volume
            const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];
            $('meta').each((_, element) => {
                const name = $(element).attr('name') || $(element).attr('property') || '';
                const content = $(element).attr('content') || '';
                if (name && content && importantMetaTags.some((tag) => name.includes(tag))) {
                    metaTags[name] = content;
                }
            });

            // Use Readability for main content extraction. Passing the page
            // URL lets relative links and images resolve to absolute URLs.
            const dom = new JSDOM(response.data, { url });
            const article = new Readability(dom.window.document).parse();
            if (!article) {
                throw new Error('Failed to extract content from webpage');
            }

            // Convert content based on requested format
            const articleHtml = article.content || '';
            let contentStr;
            switch (outputFormat) {
                case 'html':
                    contentStr = articleHtml;
                    break;
                case 'text':
                    contentStr = this.htmlToPlainText(articleHtml);
                    break;
                default:
                    contentStr = this.htmlToMarkdown(articleHtml);
                    break;
            }

            // Calculate content stats
            const wordCount = contentStr.split(/\s+/).filter((word) => word.length > 0).length;

            const content = {
                url,
                // first(): a page may contain several <title> elements (e.g.
                // inside inline SVG); only the first one is used.
                title: $('title').first().text().trim() || article.title || '',
                description: metaTags['description'] || '',
                content: contentStr,
                format: outputFormat,
                meta_tags: metaTags,
                stats: {
                    word_count: wordCount,
                    approximate_chars: contentStr.length
                },
                content_preview: {
                    first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
                },
                summary: this.generateSummary(contentStr)
            };

            // Cache the content before returning
            this.cacheContent(url, outputFormat, content);
            return content;
        }
        catch (error) {
            if (axios.isAxiosError(error)) {
                throw new Error(`Failed to fetch webpage: ${error.message}`, { cause: error });
            }
            throw error;
        }
    }

    /**
     * Extracts several pages concurrently. A failure for one URL does not
     * affect the others; it is reported as `{ error: message }` for that URL.
     * Duplicate URLs are fetched once. Results are keyed by URL in
     * first-occurrence input order.
     *
     * @param {string[]} urls URLs to extract.
     * @param {string} [format='markdown'] Output format passed to extractContent.
     * @returns {Promise<Object<string, object>>} Map of URL to content or `{ error }`.
     */
    async batchExtractContent(urls, format = 'markdown') {
        const uniqueUrls = [...new Set(urls)];
        const entries = await Promise.all(uniqueUrls.map(async (url) => {
            try {
                return [url, await this.extractContent(url, format)];
            }
            catch (error) {
                return [url, {
                    error: error instanceof Error ? error.message : 'Unknown error occurred'
                }];
            }
        }));
        return Object.fromEntries(entries);
    }
}
|