pse-mcp 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -220
- package/dist/google-search.js +4 -190
- package/dist/services/google-search.service.js +1 -1
- package/dist-package/dist/google-search.js +0 -0
- package/dist-package/package.json +22 -22
- package/package.json +40 -40
- package/src/google-search.ts +4 -214
- package/src/services/google-search.service.ts +1 -1
- package/src/types.ts +0 -22
- package/GEMINI.md +0 -72
- package/QWEN.md +0 -207
- package/dist/content-fetcher.js +0 -36
- package/dist/services/content-extractor.service.js +0 -195
- package/src/services/content-extractor.service.ts +0 -232
- package/tasks.md +0 -141
package/package.json
CHANGED
@@ -1,40 +1,40 @@
|
|
1
|
-
{
|
2
|
-
"name": "pse-mcp",
|
3
|
-
"version": "0.1.0",
|
4
|
-
"description": "MCP server for Google search and webpage analysis",
|
5
|
-
"type": "module",
|
6
|
-
"bin": {
|
7
|
-
"pse-mcp": "dist/google-search.js"
|
8
|
-
},
|
9
|
-
"scripts": {
|
10
|
-
"build": "tsc",
|
11
|
-
"start": "node dist/google-search.js",
|
12
|
-
"dev": "tsc -w",
|
13
|
-
"start:python": "concurrently \"python google_search.py\" \"python link_view.py\"",
|
14
|
-
"start:all": "concurrently \"npm run start:python\" \"npm run start\""
|
15
|
-
},
|
16
|
-
"dependencies": {
|
17
|
-
"@modelcontextprotocol/sdk": "^1.0.1",
|
18
|
-
"@mozilla/readability": "^0.6.0",
|
19
|
-
"@types/turndown": "^5.0.5",
|
20
|
-
"axios": "^1.7.9",
|
21
|
-
"cheerio": "^1.0.0",
|
22
|
-
"dompurify": "^3.2.3",
|
23
|
-
"express": "^4.21.2",
|
24
|
-
"googleapis": "^144.0.0",
|
25
|
-
"jsdom": "^25.0.1",
|
26
|
-
"markdown-it": "^14.1.0",
|
27
|
-
"readability": "^0.1.0",
|
28
|
-
"turndown": "^7.2.0"
|
29
|
-
},
|
30
|
-
"devDependencies": {
|
31
|
-
"@types/cheerio": "^0.22.35",
|
32
|
-
"@types/dompurify": "^3.0.5",
|
33
|
-
"@types/express": "^4.17.21",
|
34
|
-
"@types/jsdom": "^21.1.7",
|
35
|
-
"@types/markdown-it": "^14.1.2",
|
36
|
-
"@types/node": "^20.17.21",
|
37
|
-
"concurrently": "^9.1.0",
|
38
|
-
"typescript": "^5.
|
39
|
-
}
|
40
|
-
}
|
1
|
+
{
|
2
|
+
"name": "pse-mcp",
|
3
|
+
"version": "0.1.1",
|
4
|
+
"description": "MCP server for Google search and webpage analysis",
|
5
|
+
"type": "module",
|
6
|
+
"bin": {
|
7
|
+
"pse-mcp": "dist-package/dist/google-search.js"
|
8
|
+
},
|
9
|
+
"scripts": {
|
10
|
+
"build": "tsc",
|
11
|
+
"start": "node dist/google-search.js",
|
12
|
+
"dev": "tsc -w",
|
13
|
+
"start:python": "concurrently \"python google_search.py\" \"python link_view.py\"",
|
14
|
+
"start:all": "concurrently \"npm run start:python\" \"npm run start\""
|
15
|
+
},
|
16
|
+
"dependencies": {
|
17
|
+
"@modelcontextprotocol/sdk": "^1.0.1",
|
18
|
+
"@mozilla/readability": "^0.6.0",
|
19
|
+
"@types/turndown": "^5.0.5",
|
20
|
+
"axios": "^1.7.9",
|
21
|
+
"cheerio": "^1.0.0",
|
22
|
+
"dompurify": "^3.2.3",
|
23
|
+
"express": "^4.21.2",
|
24
|
+
"googleapis": "^144.0.0",
|
25
|
+
"jsdom": "^25.0.1",
|
26
|
+
"markdown-it": "^14.1.0",
|
27
|
+
"readability": "^0.1.0",
|
28
|
+
"turndown": "^7.2.0"
|
29
|
+
},
|
30
|
+
"devDependencies": {
|
31
|
+
"@types/cheerio": "^0.22.35",
|
32
|
+
"@types/dompurify": "^3.0.5",
|
33
|
+
"@types/express": "^4.17.21",
|
34
|
+
"@types/jsdom": "^21.1.7",
|
35
|
+
"@types/markdown-it": "^14.1.2",
|
36
|
+
"@types/node": "^20.17.21",
|
37
|
+
"concurrently": "^9.1.0",
|
38
|
+
"typescript": "^5.9.3"
|
39
|
+
}
|
40
|
+
}
|
package/src/google-search.ts
CHANGED
@@ -4,17 +4,13 @@ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
4
4
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
5
5
|
import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
|
6
6
|
import { GoogleSearchService } from './services/google-search.service.js';
|
7
|
-
import { ContentExtractor } from './services/content-extractor.service.js';
|
8
|
-
import { OutputFormat } from './types.js';
|
9
7
|
|
10
8
|
class GoogleSearchServer {
|
11
9
|
private server: Server;
|
12
10
|
private searchService: GoogleSearchService;
|
13
|
-
private contentExtractor: ContentExtractor;
|
14
11
|
|
15
12
|
constructor() {
|
16
13
|
this.searchService = new GoogleSearchService();
|
17
|
-
this.contentExtractor = new ContentExtractor();
|
18
14
|
this.server = new Server(
|
19
15
|
{
|
20
16
|
name: 'google-search',
|
@@ -34,7 +30,7 @@ class GoogleSearchServer {
|
|
34
30
|
},
|
35
31
|
num_results: {
|
36
32
|
type: 'number',
|
37
|
-
description: 'Number of results to return (default: 5, max: 10). Increase for broader coverage, decrease for faster response.'
|
33
|
+
description: 'Number of results to return (default: 10, max: 10). Increase for broader coverage, decrease for faster response.'
|
38
34
|
},
|
39
35
|
site: {
|
40
36
|
type: 'string',
|
@@ -62,7 +58,7 @@ class GoogleSearchServer {
|
|
62
58
|
},
|
63
59
|
resultsPerPage: {
|
64
60
|
type: 'number',
|
65
|
-
description: 'Number of results to show per page (default:
|
61
|
+
description: 'Number of results to show per page (default: 10, max: 10). Controls how many results are returned for each page.'
|
66
62
|
},
|
67
63
|
sort: {
|
68
64
|
type: 'string',
|
@@ -71,41 +67,6 @@ class GoogleSearchServer {
|
|
71
67
|
},
|
72
68
|
required: ['query']
|
73
69
|
}
|
74
|
-
},
|
75
|
-
extract_webpage_content: {
|
76
|
-
description: 'Extract and analyze content from a webpage, converting it to readable text. This tool fetches the main content while removing ads, navigation elements, and other clutter. Use it to get detailed information from specific pages found via google_search. Works with most common webpage formats including articles, blogs, and documentation.',
|
77
|
-
inputSchema: {
|
78
|
-
type: 'object',
|
79
|
-
properties: {
|
80
|
-
url: {
|
81
|
-
type: 'string',
|
82
|
-
description: 'Full URL of the webpage to extract content from (must start with http:// or https://). Ensure the URL is from a public webpage and not behind authentication.'
|
83
|
-
},
|
84
|
-
format: {
|
85
|
-
type: 'string',
|
86
|
-
description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
|
87
|
-
}
|
88
|
-
},
|
89
|
-
required: ['url']
|
90
|
-
}
|
91
|
-
},
|
92
|
-
extract_multiple_webpages: {
|
93
|
-
description: 'Extract and analyze content from multiple webpages in a single request. This tool is ideal for comparing information across different sources or gathering comprehensive information on a topic. Limited to 5 URLs per request to maintain performance.',
|
94
|
-
inputSchema: {
|
95
|
-
type: 'object',
|
96
|
-
properties: {
|
97
|
-
urls: {
|
98
|
-
type: 'array',
|
99
|
-
items: { type: 'string' },
|
100
|
-
description: 'Array of webpage URLs to extract content from. Each URL must be public and start with http:// or https://. Maximum 5 URLs per request.'
|
101
|
-
},
|
102
|
-
format: {
|
103
|
-
type: 'string',
|
104
|
-
description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
|
105
|
-
}
|
106
|
-
},
|
107
|
-
required: ['urls']
|
108
|
-
}
|
109
70
|
}
|
110
71
|
}
|
111
72
|
}
|
@@ -126,7 +87,7 @@ class GoogleSearchServer {
|
|
126
87
|
},
|
127
88
|
num_results: {
|
128
89
|
type: 'number',
|
129
|
-
description: 'Number of results to return (default: 5, max: 10). Increase for broader coverage, decrease for faster response.'
|
90
|
+
description: 'Number of results to return (default: 10, max: 10). Increase for broader coverage, decrease for faster response.'
|
130
91
|
},
|
131
92
|
site: {
|
132
93
|
type: 'string',
|
@@ -154,7 +115,7 @@ class GoogleSearchServer {
|
|
154
115
|
},
|
155
116
|
resultsPerPage: {
|
156
117
|
type: 'number',
|
157
|
-
description: 'Number of results to show per page (default:
|
118
|
+
description: 'Number of results to show per page (default: 10, max: 10). Controls how many results are returned for each page.'
|
158
119
|
},
|
159
120
|
sort: {
|
160
121
|
type: 'string',
|
@@ -163,43 +124,6 @@ class GoogleSearchServer {
|
|
163
124
|
},
|
164
125
|
required: ['query']
|
165
126
|
}
|
166
|
-
},
|
167
|
-
{
|
168
|
-
name: 'extract_webpage_content',
|
169
|
-
description: 'Extract and analyze content from a webpage, converting it to readable text. This tool fetches the main content while removing ads, navigation elements, and other clutter. Use it to get detailed information from specific pages found via google_search. Works with most common webpage formats including articles, blogs, and documentation.',
|
170
|
-
inputSchema: {
|
171
|
-
type: 'object',
|
172
|
-
properties: {
|
173
|
-
url: {
|
174
|
-
type: 'string',
|
175
|
-
description: 'Full URL of the webpage to extract content from (must start with http:// or https://). Ensure the URL is from a public webpage and not behind authentication.'
|
176
|
-
},
|
177
|
-
format: {
|
178
|
-
type: 'string',
|
179
|
-
description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
|
180
|
-
}
|
181
|
-
},
|
182
|
-
required: ['url']
|
183
|
-
}
|
184
|
-
},
|
185
|
-
{
|
186
|
-
name: 'extract_multiple_webpages',
|
187
|
-
description: 'Extract and analyze content from multiple webpages in a single request. This tool is ideal for comparing information across different sources or gathering comprehensive information on a topic. Limited to 5 URLs per request to maintain performance.',
|
188
|
-
inputSchema: {
|
189
|
-
type: 'object',
|
190
|
-
properties: {
|
191
|
-
urls: {
|
192
|
-
type: 'array',
|
193
|
-
items: { type: 'string' },
|
194
|
-
description: 'Array of webpage URLs to extract content from. Each URL must be public and start with http:// or https://. Maximum 5 URLs per request.'
|
195
|
-
},
|
196
|
-
format: {
|
197
|
-
type: 'string',
|
198
|
-
description: 'Output format for the extracted content. Options: "markdown" (default), "html", or "text".'
|
199
|
-
}
|
200
|
-
},
|
201
|
-
required: ['urls']
|
202
|
-
}
|
203
127
|
}
|
204
128
|
]
|
205
129
|
}));
|
@@ -226,24 +150,6 @@ class GoogleSearchServer {
|
|
226
150
|
}
|
227
151
|
throw new Error('Invalid arguments for google_search tool');
|
228
152
|
|
229
|
-
case 'extract_webpage_content':
|
230
|
-
if (typeof request.params.arguments === 'object' && request.params.arguments !== null && 'url' in request.params.arguments) {
|
231
|
-
return this.handleAnalyzeWebpage({
|
232
|
-
url: String(request.params.arguments.url),
|
233
|
-
format: request.params.arguments.format ? String(request.params.arguments.format) as OutputFormat : 'markdown'
|
234
|
-
});
|
235
|
-
}
|
236
|
-
throw new Error('Invalid arguments for extract_webpage_content tool');
|
237
|
-
|
238
|
-
case 'extract_multiple_webpages':
|
239
|
-
if (typeof request.params.arguments === 'object' && request.params.arguments !== null && 'urls' in request.params.arguments && Array.isArray(request.params.arguments.urls)) {
|
240
|
-
return this.handleBatchAnalyzeWebpages({
|
241
|
-
urls: request.params.arguments.urls.map(String),
|
242
|
-
format: request.params.arguments.format ? String(request.params.arguments.format) as OutputFormat : 'markdown'
|
243
|
-
});
|
244
|
-
}
|
245
|
-
throw new Error('Invalid arguments for extract_multiple_webpages tool');
|
246
|
-
|
247
153
|
default:
|
248
154
|
throw new Error(`Unknown tool: ${request.params.name}`);
|
249
155
|
}
|
@@ -334,122 +240,6 @@ class GoogleSearchServer {
|
|
334
240
|
}
|
335
241
|
}
|
336
242
|
|
337
|
-
private async handleAnalyzeWebpage(args: { url: string; format?: OutputFormat; summarize?: boolean }) {
|
338
|
-
try {
|
339
|
-
const content = await this.contentExtractor.extractContent(args.url, args.format);
|
340
|
-
|
341
|
-
// Format the response in a more readable, concise way
|
342
|
-
let responseText = `Content from: ${content.url}\n\n`;
|
343
|
-
responseText += `Title: ${content.title}\n`;
|
344
|
-
|
345
|
-
if (content.description) {
|
346
|
-
responseText += `Description: ${content.description}\n`;
|
347
|
-
}
|
348
|
-
|
349
|
-
responseText += `\nStats: ${content.stats.word_count} words, ${content.stats.approximate_chars} characters\n\n`;
|
350
|
-
|
351
|
-
// Add the summary if available
|
352
|
-
if (content.summary) {
|
353
|
-
responseText += `Summary: ${content.summary}\n\n`;
|
354
|
-
}
|
355
|
-
|
356
|
-
// Add a preview of the content
|
357
|
-
responseText += `Content Preview:\n${content.content_preview.first_500_chars}\n\n`;
|
358
|
-
|
359
|
-
// Add a note about requesting specific information
|
360
|
-
responseText += `Note: This is a preview of the content. For specific information, please ask about particular aspects of this webpage.`;
|
361
|
-
|
362
|
-
return {
|
363
|
-
content: [
|
364
|
-
{
|
365
|
-
type: 'text',
|
366
|
-
text: responseText,
|
367
|
-
},
|
368
|
-
],
|
369
|
-
};
|
370
|
-
} catch (error) {
|
371
|
-
const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
|
372
|
-
const helpText = 'Common issues:\n- Check if the URL is accessible in a browser\n- Ensure the webpage is public\n- Try again if it\'s a temporary network issue';
|
373
|
-
|
374
|
-
return {
|
375
|
-
content: [
|
376
|
-
{
|
377
|
-
type: 'text',
|
378
|
-
text: `${errorMessage}\n\n${helpText}`,
|
379
|
-
},
|
380
|
-
],
|
381
|
-
isError: true,
|
382
|
-
};
|
383
|
-
}
|
384
|
-
}
|
385
|
-
|
386
|
-
private async handleBatchAnalyzeWebpages(args: { urls: string[]; format?: OutputFormat }) {
|
387
|
-
if (args.urls.length > 5) {
|
388
|
-
return {
|
389
|
-
content: [{
|
390
|
-
type: 'text',
|
391
|
-
text: 'Maximum 5 URLs allowed per request to maintain performance. Please reduce the number of URLs.'
|
392
|
-
}],
|
393
|
-
isError: true
|
394
|
-
};
|
395
|
-
}
|
396
|
-
|
397
|
-
try {
|
398
|
-
const results = await this.contentExtractor.batchExtractContent(args.urls, args.format);
|
399
|
-
|
400
|
-
// Format the response in a more readable, concise way
|
401
|
-
let responseText = `Content from ${args.urls.length} webpages:\n\n`;
|
402
|
-
|
403
|
-
for (const [url, result] of Object.entries(results)) {
|
404
|
-
responseText += `URL: ${url}\n`;
|
405
|
-
|
406
|
-
if ('error' in result) {
|
407
|
-
responseText += `Error: ${result.error}\n\n`;
|
408
|
-
continue;
|
409
|
-
}
|
410
|
-
|
411
|
-
responseText += `Title: ${result.title}\n`;
|
412
|
-
|
413
|
-
if (result.description) {
|
414
|
-
responseText += `Description: ${result.description}\n`;
|
415
|
-
}
|
416
|
-
|
417
|
-
responseText += `Stats: ${result.stats.word_count} words\n`;
|
418
|
-
|
419
|
-
// Add summary if available
|
420
|
-
if (result.summary) {
|
421
|
-
responseText += `Summary: ${result.summary}\n`;
|
422
|
-
}
|
423
|
-
|
424
|
-
responseText += `Preview: ${result.content_preview.first_500_chars.substring(0, 150)}...\n\n`;
|
425
|
-
}
|
426
|
-
|
427
|
-
responseText += `Note: These are previews of the content. To analyze the full content of a specific URL, use the extract_webpage_content tool with that URL.`;
|
428
|
-
|
429
|
-
return {
|
430
|
-
content: [
|
431
|
-
{
|
432
|
-
type: 'text',
|
433
|
-
text: responseText,
|
434
|
-
},
|
435
|
-
],
|
436
|
-
};
|
437
|
-
} catch (error) {
|
438
|
-
const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
|
439
|
-
const helpText = 'Common issues:\n- Check if all URLs are accessible in a browser\n- Ensure all webpages are public\n- Try again if it\'s a temporary network issue\n- Consider reducing the number of URLs';
|
440
|
-
|
441
|
-
return {
|
442
|
-
content: [
|
443
|
-
{
|
444
|
-
type: 'text',
|
445
|
-
text: `${errorMessage}\n\n${helpText}`,
|
446
|
-
},
|
447
|
-
],
|
448
|
-
isError: true,
|
449
|
-
};
|
450
|
-
}
|
451
|
-
}
|
452
|
-
|
453
243
|
async start() {
|
454
244
|
try {
|
455
245
|
const transport = new StdioServerTransport();
|
@@ -79,7 +79,7 @@ export class GoogleSearchService {
|
|
79
79
|
}
|
80
80
|
}
|
81
81
|
|
82
|
-
async search(query: string, numResults: number = 5, filters?: SearchFilters): Promise<{
|
82
|
+
async search(query: string, numResults: number = 10, filters?: SearchFilters): Promise<{
|
83
83
|
results: SearchResult[];
|
84
84
|
pagination?: SearchPaginationInfo;
|
85
85
|
categories?: CategoryInfo[];
|
package/src/types.ts
CHANGED
@@ -40,25 +40,3 @@ export interface SearchResponse {
|
|
40
40
|
categories?: CategoryInfo[];
|
41
41
|
}
|
42
42
|
|
43
|
-
export type OutputFormat = 'markdown' | 'html' | 'text';
|
44
|
-
|
45
|
-
export interface WebpageContent {
|
46
|
-
url: string;
|
47
|
-
title: string;
|
48
|
-
description: string;
|
49
|
-
content: string;
|
50
|
-
format: OutputFormat;
|
51
|
-
meta_tags: Record<string, string>;
|
52
|
-
stats: {
|
53
|
-
word_count: number;
|
54
|
-
approximate_chars: number;
|
55
|
-
};
|
56
|
-
content_preview: {
|
57
|
-
first_500_chars: string;
|
58
|
-
};
|
59
|
-
summary?: string;
|
60
|
-
}
|
61
|
-
|
62
|
-
export interface WebpageAnalysisResponse {
|
63
|
-
[url: string]: WebpageContent | { error: string };
|
64
|
-
}
|
package/GEMINI.md
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# GEMINI.md
|
2
|
-
|
3
|
-
## Project Overview
|
4
|
-
|
5
|
-
This project is a TypeScript-based MCP (Model Context Protocol) server that provides Google search capabilities and webpage content analysis tools. It enables AI models to programmatically perform Google searches and analyze webpage content.
|
6
|
-
|
7
|
-
The server is built with TypeScript and utilizes the `@modelcontextprotocol/sdk` for MCP communication. It exposes three main tools: `google_search`, `extract_webpage_content`, and `extract_multiple_webpages`.
|
8
|
-
|
9
|
-
The core logic is divided into two services:
|
10
|
-
- `GoogleSearchService`: Handles interactions with the Google Custom Search API, including caching, pagination, and result categorization.
|
11
|
-
- `ContentExtractor`: Fetches and extracts the main content from webpages using `axios`, `cheerio`, and `@mozilla/readability`. It supports various output formats and caching.
|
12
|
-
|
13
|
-
## Building and Running
|
14
|
-
|
15
|
-
### Prerequisites
|
16
|
-
|
17
|
-
- Node.js (v16 or higher)
|
18
|
-
- Google Cloud Platform account
|
19
|
-
- Custom Search Engine ID
|
20
|
-
- Google API Key
|
21
|
-
|
22
|
-
### Installation
|
23
|
-
|
24
|
-
1. Install Node.js dependencies:
|
25
|
-
```bash
|
26
|
-
npm install
|
27
|
-
```
|
28
|
-
|
29
|
-
### Environment Variables
|
30
|
-
|
31
|
-
The following environment variables must be set:
|
32
|
-
|
33
|
-
- `GOOGLE_API_KEY`: Your Google API key.
|
34
|
-
- `GOOGLE_SEARCH_ENGINE_ID`: Your Custom Search Engine ID.
|
35
|
-
|
36
|
-
### Building
|
37
|
-
|
38
|
-
To build the TypeScript code, run:
|
39
|
-
|
40
|
-
```bash
|
41
|
-
npm run build
|
42
|
-
```
|
43
|
-
|
44
|
-
This will compile the TypeScript files from `src` into JavaScript files in the `dist` directory.
|
45
|
-
|
46
|
-
### Running
|
47
|
-
|
48
|
-
To start the MCP server, run:
|
49
|
-
|
50
|
-
```bash
|
51
|
-
npm run start
|
52
|
-
```
|
53
|
-
|
54
|
-
This will execute the compiled `dist/google-search.js` file.
|
55
|
-
|
56
|
-
For development, you can use:
|
57
|
-
|
58
|
-
```bash
|
59
|
-
npm run dev
|
60
|
-
```
|
61
|
-
|
62
|
-
This will watch for changes in the `src` directory and automatically recompile the code.
|
63
|
-
|
64
|
-
## Development Conventions
|
65
|
-
|
66
|
-
- **Language:** TypeScript
|
67
|
-
- **Module System:** ES Modules (`"type": "module"` in `package.json`)
|
68
|
-
- **Compiler Target:** ES2020
|
69
|
-
- **Code Style:** The code is well-structured and follows standard TypeScript conventions. It uses classes for services and the main server logic.
|
70
|
-
- **Error Handling:** The code includes error handling for API requests and other operations.
|
71
|
-
- **Caching:** Caching is implemented for both search results and webpage content to improve performance and reduce API calls.
|
72
|
-
- **Testing:** There are no explicit test files in the provided structure.
|
package/QWEN.md
DELETED
@@ -1,207 +0,0 @@
|
|
1
|
-
# Google Search MCP Server - Project Context
|
2
|
-
|
3
|
-
## Project Overview
|
4
|
-
|
5
|
-
The Google Search MCP Server is a Model Context Protocol (MCP) server that provides Google search capabilities and webpage content analysis tools. This server enables AI models to perform Google searches and analyze webpage content programmatically through a standardized MCP interface.
|
6
|
-
|
7
|
-
### Key Features
|
8
|
-
- Google Custom Search integration
|
9
|
-
- Advanced search features (filters, sorting, pagination, categorization)
|
10
|
-
- Webpage content analysis in multiple formats (markdown, HTML, plain text)
|
11
|
-
- Batch webpage analysis
|
12
|
-
- Result categorization and classification
|
13
|
-
- Content summarization
|
14
|
-
- Optimized, human-readable responses
|
15
|
-
- MCP-compliant interface
|
16
|
-
|
17
|
-
### Technology Stack
|
18
|
-
- **Language**: TypeScript
|
19
|
-
- **Framework**: Node.js
|
20
|
-
- **Core Dependencies**:
|
21
|
-
- `@modelcontextprotocol/sdk`: MCP server SDK
|
22
|
-
- `googleapis`: Google API integration
|
23
|
-
- `@mozilla/readability`: Content extraction
|
24
|
-
- `cheerio`: HTML parsing
|
25
|
-
- `jsdom`: DOM manipulation
|
26
|
-
- `turndown`: HTML to Markdown conversion
|
27
|
-
- `axios`: HTTP client
|
28
|
-
- `express`: Web framework (if needed)
|
29
|
-
|
30
|
-
## Project Structure
|
31
|
-
|
32
|
-
```
|
33
|
-
D:\ai\pse-mcp\
|
34
|
-
├── dist/ # Compiled JavaScript files
|
35
|
-
├── dist-package/ # Distribution package
|
36
|
-
├── MCP Documents/ # MCP protocol documentation
|
37
|
-
├── src/ # Source TypeScript files
|
38
|
-
│ ├── services/ # Service implementations
|
39
|
-
│ │ ├── google-search.service.ts
|
40
|
-
│ │ └── content-extractor.service.ts
|
41
|
-
│ ├── google-search.ts # Main server entry point
|
42
|
-
│ ├── mcp.d.ts # MCP type definitions
|
43
|
-
│ └── types.ts # Type definitions
|
44
|
-
├── package.json # Project dependencies and scripts
|
45
|
-
├── tsconfig.json # TypeScript configuration
|
46
|
-
├── README.md # Project documentation
|
47
|
-
└── ...
|
48
|
-
```
|
49
|
-
|
50
|
-
## Building and Running
|
51
|
-
|
52
|
-
### Prerequisites
|
53
|
-
- Node.js (v16 or higher)
|
54
|
-
- Google Cloud Platform account
|
55
|
-
- Custom Search Engine ID
|
56
|
-
- Google API Key
|
57
|
-
|
58
|
-
### Setup Commands
|
59
|
-
1. Install dependencies:
|
60
|
-
```bash
|
61
|
-
npm install
|
62
|
-
```
|
63
|
-
|
64
|
-
2. Build the TypeScript code:
|
65
|
-
```bash
|
66
|
-
npm run build
|
67
|
-
```
|
68
|
-
|
69
|
-
3. Run the server:
|
70
|
-
```bash
|
71
|
-
npm run start
|
72
|
-
```
|
73
|
-
|
74
|
-
### Environment Variables
|
75
|
-
Required environment variables:
|
76
|
-
- `GOOGLE_API_KEY`: Your Google API key
|
77
|
-
- `GOOGLE_SEARCH_ENGINE_ID`: Your Custom Search Engine ID
|
78
|
-
|
79
|
-
### Development Scripts
|
80
|
-
- `npm run build`: Compiles TypeScript to JavaScript
|
81
|
-
- `npm run start`: Runs the compiled server
|
82
|
-
- `npm run dev`: Watches for changes and recompiles (tsc -w)
|
83
|
-
|
84
|
-
## Architecture and Components
|
85
|
-
|
86
|
-
### Main Components
|
87
|
-
1. **GoogleSearchService**: Handles Google API interactions for search functionality
|
88
|
-
2. **ContentExtractor**: Manages webpage content analysis and extraction
|
89
|
-
3. **Main Server (google-search.ts)**: MCP server implementation with tool handlers
|
90
|
-
|
91
|
-
### Services
|
92
|
-
|
93
|
-
#### GoogleSearchService
|
94
|
-
- Integrates with Google Custom Search API
|
95
|
-
- Provides caching mechanism (5-minute TTL, max 100 entries)
|
96
|
-
- Implements advanced search filtering and pagination
|
97
|
-
- Categorizes search results by content type
|
98
|
-
- Handles error management and response formatting
|
99
|
-
|
100
|
-
#### ContentExtractor
|
101
|
-
- Extracts webpage content using Mozilla Readability
|
102
|
-
- Converts content to markdown, HTML, or plain text formats
|
103
|
-
- Implements content caching (30-minute TTL, max 50 entries)
|
104
|
-
- Generates content summaries and statistics
|
105
|
-
- Handles batch processing of multiple webpages
|
106
|
-
|
107
|
-
### Available Tools
|
108
|
-
|
109
|
-
#### 1. google_search
|
110
|
-
Searches Google and returns relevant results with advanced filtering options:
|
111
|
-
- Query string (required)
|
112
|
-
- Number of results (default: 5, max: 10)
|
113
|
-
- Site filtering
|
114
|
-
- Language filtering (ISO 639-1 codes)
|
115
|
-
- Date restrictions
|
116
|
-
- Exact phrase matching
|
117
|
-
- Result type (news, images, videos)
|
118
|
-
- Pagination support
|
119
|
-
- Sorting (relevance or date)
|
120
|
-
|
121
|
-
#### 2. extract_webpage_content
|
122
|
-
Extracts and analyzes content from a single webpage:
|
123
|
-
- URL (required)
|
124
|
-
- Output format (markdown, html, text)
|
125
|
-
- Removes ads, navigation, and clutter
|
126
|
-
- Returns title, description, content stats, and summary
|
127
|
-
|
128
|
-
#### 3. extract_multiple_webpages
|
129
|
-
Extracts content from multiple webpages in a single request:
|
130
|
-
- Array of URLs (max 5 per request)
|
131
|
-
- Output format (markdown, html, text)
|
132
|
-
- Batch processing capability
|
133
|
-
|
134
|
-
## Configuration
|
135
|
-
|
136
|
-
The server configuration needs to be added to the MCP settings file (typically located at `%APPDATA%/Code/User/globalStorage/saoudrizwan.claude-dev/settings/cline_mcp_settings.json`):
|
137
|
-
|
138
|
-
```json
|
139
|
-
{
|
140
|
-
"mcpServers": {
|
141
|
-
"google-search": {
|
142
|
-
"autoApprove": [
|
143
|
-
"google_search",
|
144
|
-
"extract_webpage_content",
|
145
|
-
"extract_multiple_webpages"
|
146
|
-
],
|
147
|
-
"disabled": false,
|
148
|
-
"timeout": 60,
|
149
|
-
"command": "node",
|
150
|
-
"args": [
|
151
|
-
"/path/to/google-search-mcp-server/dist/google-search.js"
|
152
|
-
],
|
153
|
-
"env": {
|
154
|
-
"GOOGLE_API_KEY": "your-google-api-key",
|
155
|
-
"GOOGLE_SEARCH_ENGINE_ID": "your-custom-search-engine-id"
|
156
|
-
},
|
157
|
-
"transportType": "stdio"
|
158
|
-
}
|
159
|
-
}
|
160
|
-
}
|
161
|
-
```
|
162
|
-
|
163
|
-
## Development Conventions
|
164
|
-
|
165
|
-
### Coding Standards
|
166
|
-
- TypeScript with strict mode enabled
|
167
|
-
- Use of async/await for asynchronous operations
|
168
|
-
- Proper error handling with descriptive messages
|
169
|
-
- Input validation for all tool arguments
|
170
|
-
- Caching strategies for performance optimization
|
171
|
-
|
172
|
-
### Type Safety
|
173
|
-
- Comprehensive type definitions in `types.ts`
|
174
|
-
- Strict typing for all function parameters and return values
|
175
|
-
- MCP request/response schema validation
|
176
|
-
|
177
|
-
### Caching Strategy
|
178
|
-
- Search results: 5-minute TTL with max 100 entries
|
179
|
-
- Webpage content: 30-minute TTL with max 50 entries
|
180
|
-
- Cache keys generated from request parameters
|
181
|
-
- Automatic cleanup of oldest entries when limits exceeded
|
182
|
-
|
183
|
-
## MCP Protocol Integration
|
184
|
-
|
185
|
-
The server implements the Model Context Protocol with:
|
186
|
-
- Standardized tool listing and calling
|
187
|
-
- Input schema validation
|
188
|
-
- Error response formatting
|
189
|
-
- Stdio transport for communication with MCP clients
|
190
|
-
|
191
|
-
## Testing and Verification
|
192
|
-
|
193
|
-
To verify the server is working:
|
194
|
-
1. Build with `npm run build`
|
195
|
-
2. Start with `npm run start`
|
196
|
-
3. Use MCP client to call the available tools
|
197
|
-
4. Verify search results and content extraction work as expected
|
198
|
-
|
199
|
-
## Deployment
|
200
|
-
|
201
|
-
For distribution, the project includes a process to create a compiled distribution package:
|
202
|
-
1. Build the TypeScript code
|
203
|
-
2. Create a distribution package with only necessary files
|
204
|
-
3. Include production dependencies only
|
205
|
-
4. Simplified package.json for end users
|
206
|
-
|
207
|
-
The distribution approach allows for shipping compiled JavaScript without exposing source code while maintaining functionality.
|