pse-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
+ {
+   "name": "google-search-mcp",
+   "version": "0.1.0",
+   "description": "MCP server for Google search and webpage analysis",
+   "type": "module",
+   "scripts": {
+     "start": "node dist/google-search.js"
+   },
+   "dependencies": {
+     "@modelcontextprotocol/sdk": "^1.0.1",
+     "@mozilla/readability": "^0.6.0",
+     "@types/turndown": "^5.0.5",
+     "axios": "^1.7.9",
+     "cheerio": "^1.0.0",
+     "dompurify": "^3.2.3",
+     "express": "^4.21.2",
+     "googleapis": "^144.0.0",
+     "jsdom": "^25.0.1",
+     "markdown-it": "^14.1.0",
+     "readability": "^0.1.0",
+     "turndown": "^7.2.0"
+   }
+ }
package/license ADDED
@@ -0,0 +1,4 @@
+ License:
+
+ Free for non commercial use.
+ Want to use it commercially, feel free to contact me.
package/package.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "name": "pse-mcp",
+   "version": "0.1.0",
+   "description": "MCP server for Google search and webpage analysis",
+   "type": "module",
+   "bin": {
+     "pse-mcp": "dist/google-search.js"
+   },
+   "scripts": {
+     "build": "tsc",
+     "start": "node dist/google-search.js",
+     "dev": "tsc -w",
+     "start:python": "concurrently \"python google_search.py\" \"python link_view.py\"",
+     "start:all": "concurrently \"npm run start:python\" \"npm run start\""
+   },
+   "dependencies": {
+     "@modelcontextprotocol/sdk": "^1.0.1",
+     "@mozilla/readability": "^0.6.0",
+     "@types/turndown": "^5.0.5",
+     "axios": "^1.7.9",
+     "cheerio": "^1.0.0",
+     "dompurify": "^3.2.3",
+     "express": "^4.21.2",
+     "googleapis": "^144.0.0",
+     "jsdom": "^25.0.1",
+     "markdown-it": "^14.1.0",
+     "readability": "^0.1.0",
+     "turndown": "^7.2.0"
+   },
+   "devDependencies": {
+     "@types/cheerio": "^0.22.35",
+     "@types/dompurify": "^3.0.5",
+     "@types/express": "^4.17.21",
+     "@types/jsdom": "^21.1.7",
+     "@types/markdown-it": "^14.1.2",
+     "@types/node": "^20.17.21",
+     "concurrently": "^9.1.0",
+     "typescript": "^5.7.2"
+   }
+ }
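
The manifest above launches the compiled server over stdio via `node dist/google-search.js` (also exposed through the `pse-mcp` bin entry). Below is a minimal, hypothetical sketch of connecting to that process from an MCP client, assuming the `@modelcontextprotocol/sdk` 1.x client helpers (`Client`, `StdioClientTransport`, `listTools`); the client name and paths are illustrative and not part of this package.

```typescript
// Hypothetical client-side sketch; not part of the published package.
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';

async function main() {
  // Spawn the built server the same way the "start" script does.
  const transport = new StdioClientTransport({
    command: 'node',
    args: ['dist/google-search.js'],
  });

  const client = new Client(
    { name: 'example-client', version: '0.0.1' }, // illustrative client info
    { capabilities: {} }
  );

  await client.connect(transport);

  // The server registers google_search, extract_webpage_content,
  // and extract_multiple_webpages (see the source below).
  const { tools } = await client.listTools();
  console.log(tools.map((tool) => tool.name));

  await client.close();
}

main().catch(console.error);
```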
@@ -0,0 +1,477 @@
+ #!/usr/bin/env node
+
+ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
+ import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
+ import { GoogleSearchService } from './services/google-search.service.js';
+ import { ContentExtractor } from './services/content-extractor.service.js';
+ import { OutputFormat } from './types.js';
+
+ class GoogleSearchServer {
+   private server: Server;
+   private searchService: GoogleSearchService;
+   private contentExtractor: ContentExtractor;
+
+   constructor() {
+     this.searchService = new GoogleSearchService();
+     this.contentExtractor = new ContentExtractor();
+     this.server = new Server(
+       {
+         name: 'google-search',
+         version: '1.0.0'
+       },
+       {
+         capabilities: {
+           tools: {
+             google_search: {
+               description: 'Search Google and return relevant results from the web. This tool finds web pages, articles, and information on specific topics using Google\'s search engine. Results include titles, snippets, and URLs that can be analyzed further using extract_webpage_content.',
+               inputSchema: {
+                 type: 'object',
+                 properties: {
+                   query: {
+                     type: 'string',
+                     description: 'Search query - be specific and use quotes for exact matches. For best results, use clear keywords and avoid very long queries.'
+                   },
+                   num_results: {
+                     type: 'number',
+                     description: 'Number of results to return (default: 5, max: 10). Increase for broader coverage, decrease for faster response.'
+                   },
+                   site: {
+                     type: 'string',
+                     description: 'Limit search results to a specific website domain (e.g., "wikipedia.org" or "nytimes.com").'
+                   },
+                   language: {
+                     type: 'string',
+                     description: 'Filter results by language using ISO 639-1 codes (e.g., "en" for English, "es" for Spanish, "fr" for French).'
+                   },
+                   dateRestrict: {
+                     type: 'string',
+                     description: 'Filter results by date using Google\'s date restriction format: "d[number]" for past days, "w[number]" for past weeks, "m[number]" for past months, or "y[number]" for past years. Example: "m6" for results from the past 6 months.'
+                   },
+                   exactTerms: {
+                     type: 'string',
+                     description: 'Search for results that contain this exact phrase. This is equivalent to putting the terms in quotes in the search query.'
+                   },
+                   resultType: {
+                     type: 'string',
+                     description: 'Specify the type of results to return. Options include "image" (or "images"), "news", and "video" (or "videos"). Default is general web results.'
+                   },
+                   page: {
+                     type: 'number',
+                     description: 'Page number for paginated results (starts at 1). Use in combination with resultsPerPage to navigate through large result sets.'
+                   },
+                   resultsPerPage: {
+                     type: 'number',
+                     description: 'Number of results to show per page (default: 5, max: 10). Controls how many results are returned for each page.'
+                   },
+                   sort: {
+                     type: 'string',
+                     description: 'Sorting method for search results. Options: "relevance" (default) or "date" (most recent first).'
+                   }
+                 },
+                 required: ['query']
+               }
+             },
+             extract_webpage_content: {
+               description: 'Extract and analyze content from a webpage, converting it to readable text. This tool fetches the main content while removing ads, navigation elements, and other clutter. Use it to get detailed information from specific pages found via google_search. Works with most common webpage formats including articles, blogs, and documentation.',
+               inputSchema: {
+                 type: 'object',
+                 properties: {
+                   url: {
+                     type: 'string',
+                     description: 'Full URL of the webpage to extract content from (must start with http:// or https://). Ensure the URL is from a public webpage and not behind authentication.'
+                   },
+                   format: {
+                     type: 'string',
+                     description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
+                   }
+                 },
+                 required: ['url']
+               }
+             },
+             extract_multiple_webpages: {
+               description: 'Extract and analyze content from multiple webpages in a single request. This tool is ideal for comparing information across different sources or gathering comprehensive information on a topic. Limited to 5 URLs per request to maintain performance.',
+               inputSchema: {
+                 type: 'object',
+                 properties: {
+                   urls: {
+                     type: 'array',
+                     items: { type: 'string' },
+                     description: 'Array of webpage URLs to extract content from. Each URL must be public and start with http:// or https://. Maximum 5 URLs per request.'
+                   },
+                   format: {
+                     type: 'string',
+                     description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
+                   }
+                 },
+                 required: ['urls']
+               }
+             }
+           }
+         }
+       });
+
+     // Register tool list handler
+     this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
+       tools: [
+         {
+           name: 'google_search',
+           description: 'Search Google and return relevant results from the web. This tool finds web pages, articles, and information on specific topics using Google\'s search engine. Results include titles, snippets, and URLs that can be analyzed further using extract_webpage_content.',
+           inputSchema: {
+             type: 'object',
+             properties: {
+               query: {
+                 type: 'string',
+                 description: 'Search query - be specific and use quotes for exact matches. For best results, use clear keywords and avoid very long queries.'
+               },
+               num_results: {
+                 type: 'number',
+                 description: 'Number of results to return (default: 5, max: 10). Increase for broader coverage, decrease for faster response.'
+               },
+               site: {
+                 type: 'string',
+                 description: 'Limit search results to a specific website domain (e.g., "wikipedia.org" or "nytimes.com").'
+               },
+               language: {
+                 type: 'string',
+                 description: 'Filter results by language using ISO 639-1 codes (e.g., "en" for English, "es" for Spanish, "fr" for French).'
+               },
+               dateRestrict: {
+                 type: 'string',
+                 description: 'Filter results by date using Google\'s date restriction format: "d[number]" for past days, "w[number]" for past weeks, "m[number]" for past months, or "y[number]" for past years. Example: "m6" for results from the past 6 months.'
+               },
+               exactTerms: {
+                 type: 'string',
+                 description: 'Search for results that contain this exact phrase. This is equivalent to putting the terms in quotes in the search query.'
+               },
+               resultType: {
+                 type: 'string',
+                 description: 'Specify the type of results to return. Options include "image" (or "images"), "news", and "video" (or "videos"). Default is general web results.'
+               },
+               page: {
+                 type: 'number',
+                 description: 'Page number for paginated results (starts at 1). Use in combination with resultsPerPage to navigate through large result sets.'
+               },
+               resultsPerPage: {
+                 type: 'number',
+                 description: 'Number of results to show per page (default: 5, max: 10). Controls how many results are returned for each page.'
+               },
+               sort: {
+                 type: 'string',
+                 description: 'Sorting method for search results. Options: "relevance" (default) or "date" (most recent first).'
+               }
+             },
+             required: ['query']
+           }
+         },
+         {
+           name: 'extract_webpage_content',
+           description: 'Extract and analyze content from a webpage, converting it to readable text. This tool fetches the main content while removing ads, navigation elements, and other clutter. Use it to get detailed information from specific pages found via google_search. Works with most common webpage formats including articles, blogs, and documentation.',
+           inputSchema: {
+             type: 'object',
+             properties: {
+               url: {
+                 type: 'string',
+                 description: 'Full URL of the webpage to extract content from (must start with http:// or https://). Ensure the URL is from a public webpage and not behind authentication.'
+               },
+               format: {
+                 type: 'string',
+                 description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
+               }
+             },
+             required: ['url']
+           }
+         },
+         {
+           name: 'extract_multiple_webpages',
+           description: 'Extract and analyze content from multiple webpages in a single request. This tool is ideal for comparing information across different sources or gathering comprehensive information on a topic. Limited to 5 URLs per request to maintain performance.',
+           inputSchema: {
+             type: 'object',
+             properties: {
+               urls: {
+                 type: 'array',
+                 items: { type: 'string' },
+                 description: 'Array of webpage URLs to extract content from. Each URL must be public and start with http:// or https://. Maximum 5 URLs per request.'
+               },
+               format: {
+                 type: 'string',
+                 description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
+               }
+             },
+             required: ['urls']
+           }
+         }
+       ]
+     }));
+
+     // Register tool call handler
+     this.server.setRequestHandler(CallToolRequestSchema, async (request: any) => {
+       switch (request.params.name) {
+         case 'google_search':
+           if (typeof request.params.arguments === 'object' && request.params.arguments !== null && 'query' in request.params.arguments) {
+             return this.handleSearch({
+               query: String(request.params.arguments.query),
+               num_results: typeof request.params.arguments.num_results === 'number' ? request.params.arguments.num_results : undefined,
+               filters: {
+                 site: request.params.arguments.site ? String(request.params.arguments.site) : undefined,
+                 language: request.params.arguments.language ? String(request.params.arguments.language) : undefined,
+                 dateRestrict: request.params.arguments.dateRestrict ? String(request.params.arguments.dateRestrict) : undefined,
+                 exactTerms: request.params.arguments.exactTerms ? String(request.params.arguments.exactTerms) : undefined,
+                 resultType: request.params.arguments.resultType ? String(request.params.arguments.resultType) : undefined,
+                 page: typeof request.params.arguments.page === 'number' ? request.params.arguments.page : undefined,
+                 resultsPerPage: typeof request.params.arguments.resultsPerPage === 'number' ? request.params.arguments.resultsPerPage : undefined,
+                 sort: request.params.arguments.sort ? String(request.params.arguments.sort) : undefined
+               }
+             });
+           }
+           throw new Error('Invalid arguments for google_search tool');
+
+         case 'extract_webpage_content':
+           if (typeof request.params.arguments === 'object' && request.params.arguments !== null && 'url' in request.params.arguments) {
+             return this.handleAnalyzeWebpage({
+               url: String(request.params.arguments.url),
+               format: request.params.arguments.format ? String(request.params.arguments.format) as OutputFormat : 'markdown'
+             });
+           }
+           throw new Error('Invalid arguments for extract_webpage_content tool');
+
+         case 'extract_multiple_webpages':
+           if (typeof request.params.arguments === 'object' && request.params.arguments !== null && 'urls' in request.params.arguments && Array.isArray(request.params.arguments.urls)) {
+             return this.handleBatchAnalyzeWebpages({
+               urls: request.params.arguments.urls.map(String),
+               format: request.params.arguments.format ? String(request.params.arguments.format) as OutputFormat : 'markdown'
+             });
+           }
+           throw new Error('Invalid arguments for extract_multiple_webpages tool');
+
+         default:
+           throw new Error(`Unknown tool: ${request.params.name}`);
+       }
+     });
+   }
+
+   private async handleSearch(args: {
+     query: string;
+     num_results?: number;
+     filters?: {
+       site?: string;
+       language?: string;
+       dateRestrict?: string;
+       exactTerms?: string;
+       resultType?: string;
+       page?: number;
+       resultsPerPage?: number;
+       sort?: string;
+     }
+   }) {
+     try {
+       const { results, pagination, categories } = await this.searchService.search(args.query, args.num_results, args.filters);
+
+       if (results.length === 0) {
+         return {
+           content: [{
+             type: 'text',
+             text: 'No results found. Try:\n- Using different keywords\n- Removing quotes from non-exact phrases\n- Using more general terms'
+           }],
+           isError: true
+         };
+       }
+
+       // Format results in a more concise, readable way
+       const formattedResults = results.map(result => ({
+         title: result.title,
+         link: result.link,
+         snippet: result.snippet,
+         category: result.category
+       }));
+
+       // Format results in a more AI-friendly way
+       let responseText = `Search results for "${args.query}":\n\n`;
+
+       // Add category summary if available
+       if (categories && categories.length > 0) {
+         responseText += "Categories: " + categories.map(c => `${c.name} (${c.count})`).join(', ') + "\n\n";
+       }
+
+       // Add pagination info
+       if (pagination) {
+         responseText += `Showing page ${pagination.currentPage}${pagination.totalResults ? ` of approximately ${pagination.totalResults} results` : ''}\n\n`;
+       }
+
+       // Add each result in a readable format
+       formattedResults.forEach((result, index) => {
+         responseText += `${index + 1}. ${result.title}\n`;
+         responseText += ` URL: ${result.link}\n`;
+         responseText += ` ${result.snippet}\n\n`;
+       });
+
+       // Add navigation hints if pagination exists
+       if (pagination && (pagination.hasNextPage || pagination.hasPreviousPage)) {
+         responseText += "Navigation: ";
+         if (pagination.hasPreviousPage) {
+           responseText += "Use 'page: " + (pagination.currentPage - 1) + "' for previous results. ";
+         }
+         if (pagination.hasNextPage) {
+           responseText += "Use 'page: " + (pagination.currentPage + 1) + "' for more results.";
+         }
+         responseText += "\n";
+       }
+
+       return {
+         content: [
+           {
+             type: 'text',
+             text: responseText,
+           },
+         ],
+       };
+     } catch (error) {
+       const message = error instanceof Error ? error.message : 'Unknown error during search';
+       return {
+         content: [{ type: 'text', text: message }],
+         isError: true
+       };
+     }
+   }
+
+   private async handleAnalyzeWebpage(args: { url: string; format?: OutputFormat; summarize?: boolean }) {
+     try {
+       const content = await this.contentExtractor.extractContent(args.url, args.format);
+
+       // Format the response in a more readable, concise way
+       let responseText = `Content from: ${content.url}\n\n`;
+       responseText += `Title: ${content.title}\n`;
+
+       if (content.description) {
+         responseText += `Description: ${content.description}\n`;
+       }
+
+       responseText += `\nStats: ${content.stats.word_count} words, ${content.stats.approximate_chars} characters\n\n`;
+
+       // Add the summary if available
+       if (content.summary) {
+         responseText += `Summary: ${content.summary}\n\n`;
+       }
+
+       // Add a preview of the content
+       responseText += `Content Preview:\n${content.content_preview.first_500_chars}\n\n`;
+
+       // Add a note about requesting specific information
+       responseText += `Note: This is a preview of the content. For specific information, please ask about particular aspects of this webpage.`;
+
+       return {
+         content: [
+           {
+             type: 'text',
+             text: responseText,
+           },
+         ],
+       };
+     } catch (error) {
+       const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
+       const helpText = 'Common issues:\n- Check if the URL is accessible in a browser\n- Ensure the webpage is public\n- Try again if it\'s a temporary network issue';
+
+       return {
+         content: [
+           {
+             type: 'text',
+             text: `${errorMessage}\n\n${helpText}`,
+           },
+         ],
+         isError: true,
+       };
+     }
+   }
+
+   private async handleBatchAnalyzeWebpages(args: { urls: string[]; format?: OutputFormat }) {
+     if (args.urls.length > 5) {
+       return {
+         content: [{
+           type: 'text',
+           text: 'Maximum 5 URLs allowed per request to maintain performance. Please reduce the number of URLs.'
+         }],
+         isError: true
+       };
+     }
+
+     try {
+       const results = await this.contentExtractor.batchExtractContent(args.urls, args.format);
+
+       // Format the response in a more readable, concise way
+       let responseText = `Content from ${args.urls.length} webpages:\n\n`;
+
+       for (const [url, result] of Object.entries(results)) {
+         responseText += `URL: ${url}\n`;
+
+         if ('error' in result) {
+           responseText += `Error: ${result.error}\n\n`;
+           continue;
+         }
+
+         responseText += `Title: ${result.title}\n`;
+
+         if (result.description) {
+           responseText += `Description: ${result.description}\n`;
+         }
+
+         responseText += `Stats: ${result.stats.word_count} words\n`;
+
+         // Add summary if available
+         if (result.summary) {
+           responseText += `Summary: ${result.summary}\n`;
+         }
+
+         responseText += `Preview: ${result.content_preview.first_500_chars.substring(0, 150)}...\n\n`;
+       }
+
+       responseText += `Note: These are previews of the content. To analyze the full content of a specific URL, use the extract_webpage_content tool with that URL.`;
+
+       return {
+         content: [
+           {
+             type: 'text',
+             text: responseText,
+           },
+         ],
+       };
+     } catch (error) {
+       const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
+       const helpText = 'Common issues:\n- Check if all URLs are accessible in a browser\n- Ensure all webpages are public\n- Try again if it\'s a temporary network issue\n- Consider reducing the number of URLs';
+
+       return {
+         content: [
+           {
+             type: 'text',
+             text: `${errorMessage}\n\n${helpText}`,
+           },
+         ],
+         isError: true,
+       };
+     }
+   }
+
+   async start() {
+     try {
+       const transport = new StdioServerTransport();
+       await this.server.connect(transport);
+       console.error('Google Search MCP server running');
+
+       // Keep the process running
+       process.on('SIGINT', () => {
+         this.server.close().catch(console.error);
+         process.exit(0);
+       });
+     } catch (error: unknown) {
+       if (error instanceof Error) {
+         console.error('Failed to start MCP server:', error.message);
+       } else {
+         console.error('Failed to start MCP server: Unknown error');
+       }
+       process.exit(1);
+     }
+   }
+ }
+
+ // Start the server
+ const server = new GoogleSearchServer();
+ server.start().catch(console.error);
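
The CallTool handler above validates the incoming `arguments` object for each tool before dispatching to its private handlers. A hedged sketch of what a caller would send, again assuming the SDK 1.x client API (`callTool`) and a `client` connected as in the earlier sketch; the query and URL values are placeholders.

```typescript
import { Client } from '@modelcontextprotocol/sdk/client/index.js';

// Hypothetical caller; mirrors the argument shapes the handler checks for.
async function demo(client: Client) {
  const searchResult = await client.callTool({
    name: 'google_search',
    arguments: {
      query: 'model context protocol', // required
      num_results: 5,                  // optional (default 5, max 10 per the schema)
      site: 'wikipedia.org',           // optional domain filter
      sort: 'date',                    // 'relevance' (default) or 'date'
    },
  });

  const pageResult = await client.callTool({
    name: 'extract_webpage_content',
    arguments: {
      url: 'https://example.com/article', // must be a public http(s) URL
      format: 'markdown',                 // 'markdown' (default), 'html', or 'text'
    },
  });

  return { searchResult, pageResult };
}
```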
package/src/mcp.d.ts ADDED
@@ -0,0 +1,36 @@
+ declare module '@modelcontextprotocol/sdk/server' {
+   export class Server {
+     constructor(
+       info: { name: string; version: string },
+       config: {
+         capabilities: {
+           tools?: Record<string, {
+             description: string;
+             inputSchema: {
+               type: string;
+               properties: Record<string, any>;
+               required: string[];
+             };
+           }>;
+         };
+       }
+     );
+
+     setToolHandler(
+       name: string,
+       handler: (args: any) => Promise<{
+         content: Array<{ type: string; text: string }>;
+         isError?: boolean;
+       }>
+     ): void;
+
+     connect(transport: any): Promise<void>;
+     close(): Promise<void>;
+   }
+ }
+
+ declare module '@modelcontextprotocol/sdk/server/stdio' {
+   export class StdioServerTransport {
+     constructor();
+   }
+ }