pse-mcp 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,36 +0,0 @@
1
- import axios from 'axios';
2
/**
 * Turns whatever a failed request threw into a short human-readable reason.
 *
 * Axios errors prefer the `error` field of the response body and fall back
 * to the axios message; other Error instances use their message; anything
 * else becomes "Unknown error".
 *
 * @param {unknown} error - The value caught from the failed request.
 * @returns {string} The reason text to append to the failure prefix.
 */
function describeRequestError(error) {
    if (axios.isAxiosError(error)) {
        // `||` (not `??`) on purpose: an empty `error` string in the body is
        // as useless as a missing one, so fall back to the axios message.
        return error.response?.data?.error || error.message;
    }
    if (error instanceof Error) {
        return error.message;
    }
    return 'Unknown error';
}

/**
 * POSTs a JSON payload and returns the response body, translating any
 * failure into an Error whose message starts with `failurePrefix`.
 *
 * @param {string} endpoint - Absolute URL to POST to.
 * @param {object} payload - Request body.
 * @param {string} failurePrefix - Text placed before the failure reason.
 * @returns {Promise<*>} The `data` field of the axios response.
 * @throws {Error} With message `${failurePrefix}: <reason>`; the original
 *   error is attached as `cause` so the stack and response are not lost.
 */
async function postAndUnwrap(endpoint, payload, failurePrefix) {
    try {
        const response = await axios.post(endpoint, payload);
        return response.data;
    }
    catch (error) {
        throw new Error(`${failurePrefix}: ${describeRequestError(error)}`, { cause: error });
    }
}

/**
 * Client for an HTTP service on localhost that exposes `/analyze` and
 * `/batch_analyze` endpoints.
 */
export class ContentFetcher {
    /**
     * @param {number} [port=5001] - Port of the service on localhost.
     */
    constructor(port = 5001) {
        this.baseUrl = `http://localhost:${port}`;
    }

    /**
     * Sends a single URL to the `/analyze` endpoint.
     *
     * @param {string} url - URL to submit; sent as `{ url }`.
     * @returns {Promise<*>} The response body returned by the service.
     * @throws {Error} "Failed to fetch content: <reason>" on any failure.
     */
    async fetchContent(url) {
        return postAndUnwrap(`${this.baseUrl}/analyze`, { url }, 'Failed to fetch content');
    }

    /**
     * Sends several URLs to the `/batch_analyze` endpoint in one request.
     *
     * @param {string[]} urls - URLs to submit; sent as `{ urls }`.
     * @returns {Promise<*>} The response body returned by the service.
     * @throws {Error} "Failed to batch fetch content: <reason>" on any failure.
     */
    async batchFetchContent(urls) {
        return postAndUnwrap(`${this.baseUrl}/batch_analyze`, { urls }, 'Failed to batch fetch content');
    }
}
@@ -1,195 +0,0 @@
1
- import axios from 'axios';
2
- import * as cheerio from 'cheerio';
3
- import { Readability } from '@mozilla/readability';
4
- import { JSDOM } from 'jsdom';
5
- import MarkdownIt from 'markdown-it';
6
- import TurndownService from 'turndown';
7
/**
 * Downloads web pages and reduces them to their main article content in
 * markdown, HTML or plain text, together with selected meta tags, simple
 * statistics, a preview and a short summary. Results are cached in memory
 * per (url, format) pair.
 */
export class ContentExtractor {
    constructor() {
        // Cache for webpage content (key: url + format, value: { timestamp, content })
        this.contentCache = new Map();
        // Cache expiration time in milliseconds (30 minutes)
        this.cacheTTL = 30 * 60 * 1000;
        this.md = new MarkdownIt();
        this.turndownService = new TurndownService({
            headingStyle: 'atx',
            codeBlockStyle: 'fenced'
        });
    }

    /**
     * Collapses runs of blank lines and runs of spaces, then trims.
     *
     * @param {string} text - Text to normalise.
     * @returns {string} The normalised text.
     */
    cleanText(text) {
        // Three or more line breaks (possibly with whitespace between) -> one blank line
        const withoutExtraBlankLines = text.replace(/\n\s*\n\s*\n/g, '\n\n');
        // Runs of spaces -> a single space
        return withoutExtraBlankLines.replace(/ +/g, ' ').trim();
    }

    /**
     * Applies {@link cleanText} and makes sure ATX heading markers are
     * followed by a space.
     *
     * Only `#` runs at the start of a line (after at most three spaces) are
     * touched, so `#` characters inside URLs (`page#section`), colour codes
     * and similar inline text are left alone.
     *
     * @param {string} text - Markdown to tidy up.
     * @returns {string} The cleaned markdown.
     */
    cleanMarkdown(text) {
        return this.cleanText(text).replace(/^( {0,3}#{1,6})(?=[A-Za-z0-9])/gm, '$1 ');
    }

    /**
     * Converts an HTML fragment to cleaned markdown via Turndown.
     *
     * @param {string} html - HTML to convert.
     * @returns {string} Markdown text.
     */
    htmlToMarkdown(html) {
        return this.cleanMarkdown(this.turndownService.turndown(html));
    }

    /**
     * Converts an HTML fragment to cleaned plain text (the body's text content).
     *
     * @param {string} html - HTML to convert.
     * @returns {string} Plain text.
     */
    htmlToPlainText(html) {
        const dom = new JSDOM(html);
        return this.cleanText(dom.window.document.body.textContent || '');
    }

    /**
     * Checks that `url` parses as an absolute http(s) URL.
     *
     * Other schemes (`file:`, `data:`, `javascript:` ...) are rejected here
     * because the URL is handed to an HTTP client afterwards.
     *
     * @param {string} url - Candidate URL.
     * @returns {boolean} True when the URL is well formed and uses http or https.
     */
    isValidUrl(url) {
        try {
            const { protocol } = new URL(url);
            return protocol === 'http:' || protocol === 'https:';
        }
        catch {
            return false;
        }
    }

    /**
     * Generate a cache key from URL and format
     *
     * @param {string} url - Page URL.
     * @param {string} format - Output format.
     * @returns {string} `url|format`.
     */
    generateCacheKey(url, format) {
        return `${url}|${format}`;
    }

    /**
     * Check if a cache entry is still valid
     *
     * @param {{timestamp: number}} entry - Cache entry to test.
     * @returns {boolean} True while the entry is younger than `cacheTTL`.
     */
    isCacheValid(entry) {
        return Date.now() - entry.timestamp < this.cacheTTL;
    }

    /**
     * Store webpage content in cache
     *
     * @param {string} url - Page URL.
     * @param {string} format - Output format the content was produced in.
     * @param {object} content - Extraction result to store.
     */
    cacheContent(url, format, content) {
        this.contentCache.set(this.generateCacheKey(url, format), {
            timestamp: Date.now(),
            content
        });
        // Limit cache size to prevent memory issues (max 50 entries)
        if (this.contentCache.size <= 50) {
            return;
        }
        // Evict the entry with the smallest timestamp. A single pass is enough;
        // ties resolve to the first entry in map order, as a stable sort would.
        let oldestKey;
        let oldestTimestamp = Infinity;
        for (const [key, entry] of this.contentCache) {
            if (entry.timestamp < oldestTimestamp) {
                oldestTimestamp = entry.timestamp;
                oldestKey = key;
            }
        }
        if (oldestKey !== undefined) {
            this.contentCache.delete(oldestKey);
        }
    }

    /**
     * Generates a concise summary of the content
     *
     * Whole leading sentences are kept while they fit into `maxLength`
     * (joined by single spaces). `...` is appended only when something was
     * left out. When even the first sentence is too long it is cut at
     * `maxLength` instead of producing an empty summary.
     *
     * @param {string} content The content to summarize
     * @param {number} [maxLength=300] Maximum length of the summary
     * @returns {string} A summary of the content
     */
    generateSummary(content, maxLength = 300) {
        // Split after sentence-ending punctuation followed by whitespace
        const sentences = content.trim().split(/(?<=[.!?])\s+/);
        const kept = [];
        let usedLength = 0;
        for (const sentence of sentences) {
            const separator = kept.length > 0 ? 1 : 0;
            const nextLength = usedLength + separator + sentence.length;
            if (nextLength > maxLength) {
                break;
            }
            kept.push(sentence);
            usedLength = nextLength;
        }
        if (kept.length === sentences.length) {
            // Everything fit: nothing was omitted, so no ellipsis
            return kept.join(' ');
        }
        if (kept.length === 0) {
            // First sentence alone is too long (e.g. text without punctuation)
            return `${sentences[0].slice(0, Math.max(0, maxLength)).trimEnd()}...`;
        }
        return `${kept.join(' ')}...`;
    }

    /**
     * Fetches a page and extracts its readable content.
     *
     * @param {string} url - Absolute http(s) URL of the page.
     * @param {string} [format='markdown'] - 'markdown', 'html' or 'text';
     *   any other value is treated as 'markdown'.
     * @returns {Promise<object>} Object with url, title, description, content,
     *   format, meta_tags, stats, content_preview and summary.
     * @throws {Error} 'Invalid URL provided' for malformed or non-http(s) URLs,
     *   'Failed to fetch webpage: ...' when the HTTP request fails (original
     *   axios error attached as `cause`), or 'Failed to extract content from
     *   webpage' when Readability finds no article.
     */
    async extractContent(url, format = 'markdown') {
        if (!this.isValidUrl(url)) {
            throw new Error('Invalid URL provided');
        }
        // Check cache first
        const cachedContent = this.contentCache.get(this.generateCacheKey(url, format));
        if (cachedContent && this.isCacheValid(cachedContent)) {
            // Written to stderr; presumably to keep stdout free for protocol output - TODO confirm
            console.error(`Using cached content for ${url}`);
            return cachedContent.content;
        }
        try {
            // Fetch webpage content
            const response = await axios.get(url, {
                headers: {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                },
                timeout: 10000
            });
            // Parse with Cheerio for metadata
            const $ = cheerio.load(response.data);
            const metaTags = {};
            // Only extract the most important meta tags to reduce data volume
            const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];
            $('meta').each((_, element) => {
                const name = $(element).attr('name') || $(element).attr('property') || '';
                const content = $(element).attr('content') || '';
                if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
                    metaTags[name] = content;
                }
            });
            // Use Readability for main content extraction. The page URL is passed
            // to jsdom so relative links in the article resolve against it.
            const dom = new JSDOM(response.data, { url });
            const article = new Readability(dom.window.document).parse();
            if (!article) {
                throw new Error('Failed to extract content from webpage');
            }
            // Convert content based on requested format
            const articleHtml = article.content || '';
            let contentStr;
            switch (format) {
                case 'html':
                    contentStr = articleHtml;
                    break;
                case 'text':
                    contentStr = this.htmlToPlainText(articleHtml);
                    break;
                case 'markdown':
                default:
                    contentStr = this.htmlToMarkdown(articleHtml);
                    break;
            }
            // Calculate content stats
            const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;
            const content = {
                url,
                title: $('title').text() || article.title || '',
                description: metaTags['description'] || '',
                content: contentStr,
                format: format,
                meta_tags: metaTags,
                stats: {
                    word_count: wordCount,
                    approximate_chars: contentStr.length
                },
                content_preview: {
                    first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
                },
                summary: this.generateSummary(contentStr)
            };
            // Cache the content before returning
            this.cacheContent(url, format, content);
            return content;
        }
        catch (error) {
            if (axios.isAxiosError(error)) {
                // Keep the original error reachable for debugging
                throw new Error(`Failed to fetch webpage: ${error.message}`, { cause: error });
            }
            throw error;
        }
    }

    /**
     * Extracts several pages concurrently. A failure for one URL does not
     * affect the others.
     *
     * @param {string[]} urls - Page URLs.
     * @param {string} [format='markdown'] - Output format for every page.
     * @returns {Promise<Object<string, object>>} Map from URL to either the
     *   extraction result or `{ error: string }`.
     */
    async batchExtractContent(urls, format = 'markdown') {
        const results = {};
        await Promise.all(urls.map(async (url) => {
            try {
                results[url] = await this.extractContent(url, format);
            }
            catch (error) {
                results[url] = {
                    error: error instanceof Error ? error.message : 'Unknown error occurred'
                };
            }
        }));
        return results;
    }
}
@@ -1,232 +0,0 @@
1
- import axios from 'axios';
2
- import * as cheerio from 'cheerio';
3
- import { Readability } from '@mozilla/readability';
4
- import { JSDOM } from 'jsdom';
5
- import MarkdownIt from 'markdown-it';
6
- import { WebpageContent, OutputFormat } from '../types.js';
7
- import TurndownService from 'turndown';
8
-
9
/** One cached extraction result together with the time it was stored. */
interface ContentCacheEntry {
  /** `Date.now()` value at the moment the entry was written. */
  timestamp: number;
  content: WebpageContent;
}

/**
 * Downloads web pages and reduces them to their main article content in
 * markdown, HTML or plain text, together with selected meta tags, simple
 * statistics, a preview and a short summary. Results are cached in memory
 * per (url, format) pair.
 */
export class ContentExtractor {
  private md: MarkdownIt;
  private turndownService: TurndownService;
  // Cache for webpage content (key: url + format, value: content)
  private contentCache: Map<string, ContentCacheEntry> = new Map();
  // Cache expiration time in milliseconds (30 minutes)
  private cacheTTL: number = 30 * 60 * 1000;

  constructor() {
    this.md = new MarkdownIt();
    this.turndownService = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced'
    });
  }

  /**
   * Collapses runs of blank lines and runs of spaces, then trims.
   */
  private cleanText(text: string): string {
    // Three or more line breaks (possibly with whitespace between) -> one blank line
    const withoutExtraBlankLines = text.replace(/\n\s*\n\s*\n/g, '\n\n');
    // Runs of spaces -> a single space
    return withoutExtraBlankLines.replace(/ +/g, ' ').trim();
  }

  /**
   * Applies cleanText and makes sure ATX heading markers are followed by a
   * space.
   *
   * Only `#` runs at the start of a line (after at most three spaces) are
   * touched, so `#` characters inside URLs (`page#section`), colour codes
   * and similar inline text are left alone.
   */
  private cleanMarkdown(text: string): string {
    return this.cleanText(text).replace(/^( {0,3}#{1,6})(?=[A-Za-z0-9])/gm, '$1 ');
  }

  /** Converts an HTML fragment to cleaned markdown via Turndown. */
  private htmlToMarkdown(html: string): string {
    return this.cleanMarkdown(this.turndownService.turndown(html));
  }

  /** Converts an HTML fragment to cleaned plain text (the body's text content). */
  private htmlToPlainText(html: string): string {
    const dom = new JSDOM(html);
    return this.cleanText(dom.window.document.body.textContent || '');
  }

  /**
   * Checks that `url` parses as an absolute http(s) URL.
   *
   * Other schemes (`file:`, `data:`, `javascript:` ...) are rejected here
   * because the URL is handed to an HTTP client afterwards.
   */
  private isValidUrl(url: string): boolean {
    try {
      const { protocol } = new URL(url);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /**
   * Generate a cache key from URL and format
   */
  private generateCacheKey(url: string, format: OutputFormat): string {
    return `${url}|${format}`;
  }

  /**
   * Check if a cache entry is still valid
   */
  private isCacheValid(entry: ContentCacheEntry): boolean {
    return Date.now() - entry.timestamp < this.cacheTTL;
  }

  /**
   * Store webpage content in cache
   */
  private cacheContent(url: string, format: OutputFormat, content: WebpageContent): void {
    this.contentCache.set(this.generateCacheKey(url, format), {
      timestamp: Date.now(),
      content
    });

    // Limit cache size to prevent memory issues (max 50 entries)
    if (this.contentCache.size <= 50) {
      return;
    }

    // Evict the entry with the smallest timestamp. A single pass is enough;
    // ties resolve to the first entry in map order, as a stable sort would.
    let oldestKey: string | undefined;
    let oldestTimestamp = Infinity;
    for (const [key, entry] of this.contentCache) {
      if (entry.timestamp < oldestTimestamp) {
        oldestTimestamp = entry.timestamp;
        oldestKey = key;
      }
    }
    if (oldestKey !== undefined) {
      this.contentCache.delete(oldestKey);
    }
  }

  /**
   * Generates a concise summary of the content
   *
   * Whole leading sentences are kept while they fit into `maxLength`
   * (joined by single spaces). `...` is appended only when something was
   * left out. When even the first sentence is too long it is cut at
   * `maxLength` instead of producing an empty summary.
   *
   * @param content The content to summarize
   * @param maxLength Maximum length of the summary
   * @returns A summary of the content
   */
  private generateSummary(content: string, maxLength: number = 300): string {
    // Split after sentence-ending punctuation followed by whitespace
    const sentences = content.trim().split(/(?<=[.!?])\s+/);
    const kept: string[] = [];
    let usedLength = 0;

    for (const sentence of sentences) {
      const separator = kept.length > 0 ? 1 : 0;
      const nextLength = usedLength + separator + sentence.length;
      if (nextLength > maxLength) {
        break;
      }
      kept.push(sentence);
      usedLength = nextLength;
    }

    if (kept.length === sentences.length) {
      // Everything fit: nothing was omitted, so no ellipsis
      return kept.join(' ');
    }
    if (kept.length === 0) {
      // First sentence alone is too long (e.g. text without punctuation)
      return `${sentences[0].slice(0, Math.max(0, maxLength)).trimEnd()}...`;
    }
    return `${kept.join(' ')}...`;
  }

  /**
   * Fetches a page and extracts its readable content.
   *
   * @param url Absolute http(s) URL of the page
   * @param format 'markdown', 'html' or 'text'; any other value is treated as 'markdown'
   * @returns The extracted content with metadata, stats, preview and summary
   * @throws Error 'Invalid URL provided' for malformed or non-http(s) URLs,
   *   'Failed to fetch webpage: ...' when the HTTP request fails (original
   *   axios error attached as `cause`), or 'Failed to extract content from
   *   webpage' when Readability finds no article
   */
  async extractContent(url: string, format: OutputFormat = 'markdown'): Promise<WebpageContent> {
    if (!this.isValidUrl(url)) {
      throw new Error('Invalid URL provided');
    }

    // Check cache first
    const cachedContent = this.contentCache.get(this.generateCacheKey(url, format));
    if (cachedContent && this.isCacheValid(cachedContent)) {
      // Written to stderr; presumably to keep stdout free for protocol output - TODO confirm
      console.error(`Using cached content for ${url}`);
      return cachedContent.content;
    }

    try {
      // Fetch webpage content
      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        },
        timeout: 10000
      });

      // Parse with Cheerio for metadata
      const $ = cheerio.load(response.data);
      const metaTags: Record<string, string> = {};

      // Only extract the most important meta tags to reduce data volume
      const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];

      $('meta').each((_, element) => {
        const name = $(element).attr('name') || $(element).attr('property') || '';
        const content = $(element).attr('content') || '';
        if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
          metaTags[name] = content;
        }
      });

      // Use Readability for main content extraction. The page URL is passed
      // to jsdom so relative links in the article resolve against it.
      const dom = new JSDOM(response.data, { url });
      const article = new Readability(dom.window.document).parse();

      if (!article) {
        throw new Error('Failed to extract content from webpage');
      }

      // Convert content based on requested format
      const articleHtml = article.content || '';
      let contentStr: string;
      switch (format) {
        case 'html':
          contentStr = articleHtml;
          break;
        case 'text':
          contentStr = this.htmlToPlainText(articleHtml);
          break;
        case 'markdown':
        default:
          contentStr = this.htmlToMarkdown(articleHtml);
          break;
      }

      // Calculate content stats
      const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;

      const content: WebpageContent = {
        url,
        title: ($('title').text() as string) || article.title || '',
        description: metaTags['description'] || '',
        content: contentStr,
        format: format,
        meta_tags: metaTags,
        stats: {
          word_count: wordCount,
          approximate_chars: contentStr.length
        },
        content_preview: {
          first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
        },
        summary: this.generateSummary(contentStr)
      };

      // Cache the content before returning
      this.cacheContent(url, format, content);

      return content;
    } catch (error) {
      if (axios.isAxiosError(error)) {
        // Keep the original error reachable for debugging. Assigned rather than
        // passed as an Error option so this compiles with any `lib` target.
        const wrapped: Error & { cause?: unknown } = new Error(`Failed to fetch webpage: ${error.message}`);
        wrapped.cause = error;
        throw wrapped;
      }
      throw error;
    }
  }

  /**
   * Extracts several pages concurrently. A failure for one URL does not
   * affect the others.
   *
   * @param urls Page URLs
   * @param format Output format for every page
   * @returns Map from URL to either the extraction result or `{ error }`
   */
  async batchExtractContent(urls: string[], format: OutputFormat = 'markdown'): Promise<Record<string, WebpageContent | { error: string }>> {
    const results: Record<string, WebpageContent | { error: string }> = {};

    await Promise.all(
      urls.map(async (url) => {
        try {
          results[url] = await this.extractContent(url, format);
        } catch (error) {
          results[url] = {
            error: error instanceof Error ? error.message : 'Unknown error occurred'
          };
        }
      })
    );

    return results;
  }
}
package/tasks.md DELETED
@@ -1,141 +0,0 @@
1
- # Google Search MCP Server Improvement Plan
2
-
3
- ## Current Implementation
4
- The Google Search MCP Server currently provides three tools:
5
- 1. `google_search` - Searches Google and returns relevant results
6
- 2. `extract_webpage_content` - Extracts content from a single webpage
7
- 3. `extract_multiple_webpages` - Extracts content from multiple webpages
8
-
9
- ## Improvement Roadmap
10
-
11
- ### Phase 1: Enhanced Search Capabilities
12
-
13
- #### Task 1: Add Basic Search Filters
14
- - [x] Add site-specific search parameter (`site:example.com`)
15
- - [x] Add language filter parameter
16
- - [x] Update input schema and documentation
17
- - [x] Test functionality with different filter combinations
18
-
19
- #### Task 2: Add Date Range Filtering
20
- - [x] Add date range parameters (start date, end date)
21
- - [x] Implement date formatting and validation
22
- - [x] Update Google API query construction
23
- - [x] Test functionality with various date ranges
24
-
25
- #### Task 3: Add Result Type Specification
26
- - [x] Add parameter for result type (news, images, videos)
27
- - [x] Implement type-specific query parameters
28
- - [x] Update result processing for each type
29
- - [x] Test different result types
30
-
31
- #### Task 4: Implement Pagination Support
32
- - [x] Add pagination parameters (page number, results per page)
33
- - [x] Implement pagination logic using Google API's `start` parameter
34
- - [x] Add metadata about total results and current page
35
- - [x] Test pagination functionality
36
-
37
- #### Task 5: Add Sorting Options
38
- - [x] Add sorting parameter (relevance, date)
39
- - [x] Implement sort parameter handling
40
- - [x] Test different sorting options
41
-
42
- #### Task 6: Implement Result Categorization
43
- - [x] Design categorization algorithm (by domain, topic, or content type)
44
- - [x] Implement result clustering/categorization
45
- - [x] Add category information to the response
46
- - [x] Test categorization with various search queries
47
-
48
- ### Phase 2: Advanced Content Extraction
49
-
50
- #### Task 7: Support Different Output Formats
51
- - [x] Add parameter for output format (markdown, HTML, plain text)
52
- - [x] Implement format conversion functions
53
- - [x] Test output in different formats
54
-
55
- #### Task 8: Add Content Summarization
56
- - [ ] Research summarization approaches
57
- - [ ] Implement text summarization algorithm
58
- - [ ] Add summary to extraction results
59
- - [ ] Test summarization with various content types
60
-
61
- #### Task 9: Extract Specific Elements
62
- - [ ] Add support for extracting tables
63
- - [ ] Add support for extracting lists
64
- - [ ] Add support for extracting code blocks
65
- - [ ] Test specific element extraction
66
-
67
- #### Task 10: Implement Image Extraction
68
- - [ ] Add functionality to identify and extract images
69
- - [ ] Process image metadata (alt text, dimensions)
70
- - [ ] Return image URLs and metadata
71
- - [ ] Test image extraction from various pages
72
-
73
- #### Task 11: Add Content Translation Support
74
- - [ ] Research translation API options
75
- - [ ] Integrate with a translation service
76
- - [ ] Add target language parameter
77
- - [ ] Test translation functionality
78
-
79
- ### Phase 3: Performance and Infrastructure Improvements
80
-
81
- #### Task 12: Implement Result Caching
82
- - [x] Design cache structure for search results
83
- - [x] Implement cache lookup before making new requests
84
- - [x] Add cache expiration mechanism
85
- - [x] Test cache hit and miss scenarios
86
-
87
- #### Task 13: Add Content Cache Layer
88
- - [x] Design cache structure for webpage content
89
- - [x] Implement content cache lookup and storage
90
- - [x] Add cache invalidation strategy
91
- - [x] Test content caching performance
92
-
93
- #### Task 14: Implement Rate Limiting
94
- - [ ] Add rate limiting configuration
95
- - [ ] Implement request throttling
96
- - [ ] Add rate limit information in responses
97
- - [ ] Test rate limiting behavior
98
-
99
- #### Task 15: Add Concurrent Request Handling
100
- - [ ] Implement batch processing for search requests
101
- - [ ] Add parallel processing for multiple content extractions
102
- - [ ] Optimize resource usage during concurrent operations
103
- - [ ] Test performance with concurrent requests
104
-
105
- #### Task 16: Support Custom User-Agent Strings
106
- - [ ] Add user-agent parameter
107
- - [ ] Implement user-agent validation
108
- - [ ] Update request headers with custom user-agent
109
- - [ ] Test different user-agent strings
110
-
111
- #### Task 17: Add Proxy Support
112
- - [ ] Add proxy configuration options
113
- - [ ] Implement proxy routing for requests
114
- - [ ] Add fallback mechanism for proxy failures
115
- - [ ] Test proxy functionality
116
-
117
- ### Phase 4: Finalization and Documentation
118
-
119
- #### Task 18: Comprehensive Testing
120
- - [ ] Develop automated tests for all new features
121
- - [ ] Perform integration testing
122
- - [ ] Stress test with high volume of requests
123
- - [ ] Fix any identified issues
124
-
125
- #### Task 19: Update Documentation
126
- - [ ] Update server documentation with new capabilities
127
- - [ ] Create examples for each new feature
128
- - [ ] Document best practices and recommendations
129
- - [ ] Create troubleshooting guide
130
-
131
- #### Task 20: Performance Optimization
132
- - [ ] Profile and identify bottlenecks
133
- - [ ] Optimize resource usage
134
- - [ ] Implement additional caching if needed
135
- - [ ] Benchmark performance improvements
136
-
137
- ## Implementation Notes
138
- - Each task should be completed and tested independently
139
- - Regular commits should be made after each feature is implemented
140
- - Follow existing code patterns and naming conventions
141
- - Maintain backward compatibility where possible