pse-mcp 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,244 +0,0 @@
1
- import { google } from 'googleapis';
2
- import { URL } from 'url';
3
/**
 * Thin wrapper around the Google Custom Search JSON API that adds query
 * filters, pagination metadata, result categorisation and a small in-memory
 * result cache.
 */
export class GoogleSearchService {
    /**
     * Configures the Google Custom Search client from the environment.
     *
     * Reads GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID from process.env.
     *
     * @throws {Error} If either environment variable is missing.
     */
    constructor() {
        // Cache for search results (key: query string + filters, value: results)
        this.searchCache = new Map();
        // Cache expiration time in milliseconds (5 minutes)
        this.cacheTTL = 5 * 60 * 1000;
        const apiKey = process.env.GOOGLE_API_KEY;
        const searchEngineId = process.env.GOOGLE_SEARCH_ENGINE_ID;
        if (!apiKey || !searchEngineId) {
            throw new Error('Missing required environment variables: GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID');
        }
        // Initialize Google Custom Search API
        this.customSearch = google.customsearch('v1').cse;
        this.searchEngineId = searchEngineId;
        // Set up the API client (this sets the auth option on the shared googleapis instance)
        google.options({
            auth: apiKey
        });
    }

    /**
     * Generate a cache key from search parameters.
     *
     * @param query The raw query string.
     * @param numResults The requested number of results.
     * @param filters The optional filter object passed to search().
     * @returns A JSON string combining the three arguments.
     */
    generateCacheKey(query, numResults, filters) {
        return JSON.stringify({
            query,
            numResults,
            filters
        });
    }

    /**
     * Check if a cache entry is still valid.
     *
     * @param entry A cache entry with a `timestamp` in epoch milliseconds.
     * @returns true while the entry is younger than `this.cacheTTL`.
     */
    isCacheValid(entry) {
        return Date.now() - entry.timestamp < this.cacheTTL;
    }

    /**
     * Store search results in cache, evicting the oldest entry once the
     * cache holds more than 100 entries.
     *
     * @param cacheKey Key produced by generateCacheKey().
     * @param results The mapped search results.
     * @param pagination The pagination descriptor returned with the results.
     * @param categories The category statistics returned with the results.
     */
    cacheSearchResults(cacheKey, results, pagination, categories) {
        const maxEntries = 100;
        // Delete before set so a refreshed key moves to the end of the Map.
        // Map iteration order then matches entry age, and the oldest entry
        // is simply the first key - no need to sort the whole cache.
        this.searchCache.delete(cacheKey);
        this.searchCache.set(cacheKey, {
            timestamp: Date.now(),
            data: { results, pagination, categories }
        });
        // Limit cache size to prevent memory issues
        if (this.searchCache.size > maxEntries) {
            const oldestKey = this.searchCache.keys().next().value;
            this.searchCache.delete(oldestKey);
        }
    }

    /**
     * Run a search, serving it from the cache when a fresh entry exists.
     *
     * @param query The search terms.
     * @param numResults Number of results wanted when `filters.resultsPerPage`
     *   is not set (default 5).
     * @param filters Optional settings read by this method: `site`,
     *   `exactTerms`, `page`, `resultsPerPage`, `language`, `dateRestrict`,
     *   `resultType` ('image(s)', 'news', 'video(s)') and `sort` ('date').
     * @returns An object `{ results, pagination, categories }`.
     * @throws {Error} Wrapping any failure; the original error is attached as `cause`.
     */
    async search(query, numResults = 5, filters) {
        try {
            const cacheKey = this.generateCacheKey(query, numResults, filters);
            // Check cache first
            const cachedResult = this.searchCache.get(cacheKey);
            if (cachedResult) {
                if (this.isCacheValid(cachedResult)) {
                    console.error('Using cached search results');
                    return cachedResult.data;
                }
                // Drop the expired entry now instead of keeping it until size-based eviction.
                this.searchCache.delete(cacheKey);
            }
            let formattedQuery = query;
            // Apply site filter if provided
            if (filters?.site) {
                formattedQuery += ` site:${filters.site}`;
            }
            // Apply exact terms if provided
            if (filters?.exactTerms) {
                formattedQuery += ` "${filters.exactTerms}"`;
            }
            // Normalise the page to a whole number >= 1; anything else means page 1.
            const requestedPage = Math.floor(Number(filters?.page));
            const page = Number.isFinite(requestedPage) && requestedPage >= 1 ? requestedPage : 1;
            // The API accepts between 1 and 10 results per request, so clamp the
            // requested size into that range (unparseable values fall back to 1).
            const requestedSize = Math.floor(Number(filters?.resultsPerPage || numResults));
            const resultsPerPage = Number.isNaN(requestedSize)
                ? 1
                : Math.min(Math.max(requestedSize, 1), 10);
            // Calculate start index for pagination (Google uses 1-based indexing)
            const startIndex = (page - 1) * resultsPerPage + 1;
            const params = {
                cx: this.searchEngineId,
                q: formattedQuery,
                num: resultsPerPage,
                start: startIndex
            };
            // Apply language filter if provided
            if (filters?.language) {
                params.lr = `lang_${filters.language}`;
            }
            // Apply date restriction if provided
            if (filters?.dateRestrict) {
                params.dateRestrict = filters.dateRestrict;
            }
            // Apply result type filter if provided
            if (filters?.resultType) {
                switch (filters.resultType.toLowerCase()) {
                    case 'image':
                    case 'images':
                        params.searchType = 'image';
                        break;
                    case 'news':
                        // For news, we need to modify the query
                        formattedQuery += ' source:news';
                        params.q = formattedQuery;
                        break;
                    case 'video':
                    case 'videos':
                        // For videos, narrow the query with extra operators
                        formattedQuery += ' filetype:video OR inurl:video OR inurl:watch';
                        params.q = formattedQuery;
                        break;
                }
            }
            // Only 'date' changes the request; relevance is Google's default order.
            if (filters?.sort && filters.sort.toLowerCase() === 'date') {
                params.sort = 'date';
            }
            const response = await this.customSearch.list(params);
            // If no items are found, return empty results with pagination info
            // (empty responses are intentionally not cached).
            if (!response.data.items) {
                return {
                    results: [],
                    pagination: {
                        currentPage: page,
                        resultsPerPage,
                        totalResults: 0,
                        totalPages: 0,
                        hasNextPage: false,
                        hasPreviousPage: page > 1
                    },
                    categories: []
                };
            }
            // Map the search results and categorize them
            const results = response.data.items.map((item) => {
                const result = {
                    title: item.title || '',
                    link: item.link || '',
                    snippet: item.snippet || '',
                    pagemap: item.pagemap || {},
                    datePublished: item.pagemap?.metatags?.[0]?.['article:published_time'] || '',
                    source: 'google_search'
                };
                result.category = this.categorizeResult(result);
                return result;
            });
            // Generate category statistics
            const categories = this.generateCategoryStats(results);
            // Create pagination information (a non-numeric total is treated as 0)
            const totalResults = Number.parseInt(response.data.searchInformation?.totalResults || '0', 10) || 0;
            const totalPages = Math.ceil(totalResults / resultsPerPage);
            const pagination = {
                currentPage: page,
                resultsPerPage,
                totalResults,
                totalPages,
                hasNextPage: page < totalPages,
                hasPreviousPage: page > 1
            };
            // Cache the results before returning
            this.cacheSearchResults(cacheKey, results, pagination, categories);
            return {
                results,
                pagination,
                categories
            };
        }
        catch (error) {
            // Keep the original error reachable through `cause` for debugging.
            if (error instanceof Error) {
                throw new Error(`Google Search API error: ${error.message}`, { cause: error });
            }
            throw new Error('Unknown error during Google search', { cause: error });
        }
    }

    /**
     * Categorizes a search result based on its link's host name and its title.
     *
     * @param result The search result to categorize (reads `link` and `title`).
     * @returns A fixed category name, the capitalised second-level domain
     *   label, or 'Other' when the link cannot be parsed or has no such label.
     */
    categorizeResult(result) {
        try {
            // Extract the domain from the URL
            const url = new URL(result.link);
            const domain = url.hostname.replace(/^www\./, '');
            // A missing title must not abort categorisation of the domain.
            const title = typeof result.title === 'string' ? result.title : '';
            // Check if this is a social media site
            if (/facebook\.com|twitter\.com|instagram\.com|linkedin\.com|pinterest\.com|tiktok\.com|reddit\.com/i.test(domain)) {
                return 'Social Media';
            }
            // Check if this is a video site
            if (/youtube\.com|vimeo\.com|dailymotion\.com|twitch\.tv/i.test(domain)) {
                return 'Video';
            }
            // Check if this is a news site
            if (/news|cnn\.com|bbc\.com|nytimes\.com|wsj\.com|reuters\.com|bloomberg\.com/i.test(domain)) {
                return 'News';
            }
            // Check if this is an educational site
            if (/\.edu$|wikipedia\.org|khan|course|learn|study|academic/i.test(domain)) {
                return 'Educational';
            }
            // Check if this is a documentation site
            if (/docs|documentation|developer|github\.com|gitlab\.com|bitbucket\.org|stackoverflow\.com/i.test(domain) ||
                /docs|documentation|api|reference|manual/i.test(title)) {
                return 'Documentation';
            }
            // Check if this is a shopping site
            if (/amazon\.com|ebay\.com|etsy\.com|walmart\.com|shop|store|buy/i.test(domain)) {
                return 'Shopping';
            }
            // Default category: the label just before the last one ("example" in example.com).
            // Single-label hosts such as "localhost" have none.
            const label = domain.split('.').slice(-2, -1)[0];
            if (!label) {
                return 'Other';
            }
            return label.charAt(0).toUpperCase() + label.slice(1);
        }
        catch {
            // Unparseable link (or any other failure): fall back to the default category
            return 'Other';
        }
    }

    /**
     * Generates category statistics from search results.
     *
     * @param results The search results to analyze (reads `category`).
     * @returns An array of `{ name, count }` objects sorted by descending count.
     */
    generateCategoryStats(results) {
        // A Map avoids collisions with Object.prototype members such as
        // "constructor" or "__proto__" that a plain object counter suffers from.
        const counts = new Map();
        for (const result of results) {
            const category = String(result.category || 'Other');
            counts.set(category, (counts.get(category) ?? 0) + 1);
        }
        return Array.from(counts, ([name, count]) => ({ name, count }))
            .sort((a, b) => b.count - a.count); // Sort by count in descending order
    }
}
package/dist/types.js DELETED
@@ -1 +0,0 @@
1
- export {};
@@ -1,232 +0,0 @@
1
- import axios from 'axios';
2
- import * as cheerio from 'cheerio';
3
- import { Readability } from '@mozilla/readability';
4
- import { JSDOM } from 'jsdom';
5
- import MarkdownIt from 'markdown-it';
6
- import { WebpageContent, OutputFormat } from '../types.js';
7
- import TurndownService from 'turndown';
8
-
9
/** A cached extraction result together with the time it was stored. */
interface ContentCacheEntry {
  /** Epoch milliseconds at which the entry was written. */
  timestamp: number;
  /** The extracted page content. */
  content: WebpageContent;
}

/**
 * Fetches web pages and extracts their main content as markdown, HTML or
 * plain text, with a small in-memory cache keyed by URL and format.
 */
export class ContentExtractor {
  private md: MarkdownIt;
  private turndownService: TurndownService;
  // Cache for webpage content (key: url + format, value: content)
  private contentCache: Map<string, ContentCacheEntry> = new Map();
  // Cache expiration time in milliseconds (30 minutes)
  private cacheTTL: number = 30 * 60 * 1000;

  constructor() {
    this.md = new MarkdownIt();
    this.turndownService = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced'
    });
  }

  /**
   * Collapses runs of blank lines and runs of spaces, then trims.
   *
   * @param text The text to normalise.
   * @returns The normalised text.
   */
  private cleanText(text: string): string {
    // Remove multiple blank lines
    text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
    // Remove excessive spaces
    text = text.replace(/ +/g, ' ');
    return text.trim();
  }

  /**
   * Normalises markdown text and makes sure ATX headings have a space after
   * the leading `#` characters.
   *
   * Only a `#` run at the start of a line is treated as a heading marker, and
   * lines inside fenced code blocks are left alone. This keeps URL fragments
   * (`page#section`), references such as `#42` and code such as `#include`
   * intact.
   *
   * @param text The markdown to clean.
   * @returns The cleaned markdown.
   */
  private cleanMarkdown(text: string): string {
    const cleanedLines: string[] = [];
    let insideFence = false;

    for (const line of this.cleanText(text).split('\n')) {
      if (/^\s*(```|~~~)/.test(line)) {
        // A fence line opens or closes a code block; never rewrite it.
        insideFence = !insideFence;
        cleanedLines.push(line);
      } else if (insideFence) {
        cleanedLines.push(line);
      } else {
        // Ensure headers have space after #
        cleanedLines.push(line.replace(/^( {0,3}#{1,6})([A-Za-z0-9])/, '$1 $2'));
      }
    }

    return cleanedLines.join('\n');
  }

  /** Converts an HTML fragment to cleaned markdown using Turndown. */
  private htmlToMarkdown(html: string): string {
    return this.cleanMarkdown(this.turndownService.turndown(html));
  }

  /** Converts an HTML fragment to cleaned plain text via the DOM's textContent. */
  private htmlToPlainText(html: string): string {
    const dom = new JSDOM(html);
    return this.cleanText(dom.window.document.body.textContent || '');
  }

  /** Returns true when `url` can be parsed by the URL constructor. */
  private isValidUrl(url: string): boolean {
    try {
      new URL(url);
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Generate a cache key from URL and format
   */
  private generateCacheKey(url: string, format: OutputFormat): string {
    return `${url}|${format}`;
  }

  /**
   * Check if a cache entry is still valid
   */
  private isCacheValid(entry: ContentCacheEntry): boolean {
    return Date.now() - entry.timestamp < this.cacheTTL;
  }

  /**
   * Store webpage content in cache, evicting the oldest entry once the cache
   * holds more than 50 entries.
   */
  private cacheContent(url: string, format: OutputFormat, content: WebpageContent): void {
    const maxEntries = 50;
    const cacheKey = this.generateCacheKey(url, format);

    // Delete before set so a refreshed key moves to the end of the Map.
    // Iteration order then matches entry age, and the oldest entry is the
    // first key - no need to sort the whole cache.
    this.contentCache.delete(cacheKey);
    this.contentCache.set(cacheKey, {
      timestamp: Date.now(),
      content
    });

    // Limit cache size to prevent memory issues
    if (this.contentCache.size > maxEntries) {
      const oldest = this.contentCache.keys().next();
      if (!oldest.done) {
        this.contentCache.delete(oldest.value);
      }
    }
  }

  /**
   * Generates a concise summary of the content by taking whole leading
   * sentences while they fit into `maxLength`.
   *
   * When even the first sentence is too long it is cut at `maxLength`, so the
   * summary is never just an ellipsis. `...` is appended only when part of
   * the content was actually left out.
   *
   * @param content The content to summarize
   * @param maxLength Maximum length of the summary (excluding the ellipsis)
   * @returns A summary of the content
   */
  private generateSummary(content: string, maxLength: number = 300): string {
    // Simple summarization: split after sentence-ending punctuation
    const sentences = content.trim().split(/(?<=[.!?])\s+/);
    const kept: string[] = [];
    let length = 0;

    for (const sentence of sentences) {
      // Sentences are re-joined with a single space, so count that separator.
      const nextLength = length + (kept.length > 0 ? 1 : 0) + sentence.length;
      if (nextLength > maxLength) {
        break;
      }
      kept.push(sentence);
      length = nextLength;
    }

    if (kept.length === 0) {
      // The first sentence alone exceeds the budget: hard-truncate it.
      return sentences[0].slice(0, Math.max(maxLength, 0)).trimEnd() + '...';
    }

    const truncated = kept.length < sentences.length;
    return kept.join(' ') + (truncated ? '...' : '');
  }

  /**
   * Fetches a page and extracts its main content.
   *
   * @param url Absolute URL of the page.
   * @param format Output format for the content: 'markdown' (default),
   *   'html' or 'text'.
   * @returns The extracted content with metadata, stats, preview and summary.
   * @throws {Error} If the URL is invalid, the request fails, or Readability
   *   cannot extract an article.
   */
  async extractContent(url: string, format: OutputFormat = 'markdown'): Promise<WebpageContent> {
    if (!this.isValidUrl(url)) {
      throw new Error('Invalid URL provided');
    }

    // Check cache first
    const cacheKey = this.generateCacheKey(url, format);
    const cachedContent = this.contentCache.get(cacheKey);
    if (cachedContent) {
      if (this.isCacheValid(cachedContent)) {
        console.error(`Using cached content for ${url}`);
        return cachedContent.content;
      }
      // Drop the expired entry now instead of keeping it until size-based eviction.
      this.contentCache.delete(cacheKey);
    }

    try {
      // Fetch webpage content
      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        },
        timeout: 10000
      });

      // Parse with Cheerio for metadata
      const $ = cheerio.load(response.data);
      const metaTags: Record<string, string> = {};

      // Only extract the most important meta tags to reduce data volume
      const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];

      $('meta').each((_, element) => {
        const name = $(element).attr('name') || $(element).attr('property') || '';
        const content = $(element).attr('content') || '';
        if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
          metaTags[name] = content;
        }
      });

      // Use Readability for main content extraction. Passing the page URL
      // lets relative links and image sources resolve to absolute URLs.
      const dom = new JSDOM(response.data, { url });
      const reader = new Readability(dom.window.document);
      const article = reader.parse();

      if (!article) {
        throw new Error('Failed to extract content from webpage');
      }

      // Convert content based on requested format
      let contentStr: string;
      switch (format) {
        case 'html':
          contentStr = article.content || '';
          break;
        case 'text':
          contentStr = this.htmlToPlainText(article.content || '');
          break;
        case 'markdown':
        default:
          contentStr = this.htmlToMarkdown(article.content || '');
          break;
      }

      // Calculate content stats
      const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;

      // Generate a summary of the content
      const summary = this.generateSummary(contentStr);

      const content: WebpageContent = {
        url,
        title: ($('title').text() as string) || article.title || '',
        description: metaTags['description'] || '',
        content: contentStr,
        format: format,
        meta_tags: metaTags,
        stats: {
          word_count: wordCount,
          approximate_chars: contentStr.length
        },
        content_preview: {
          first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
        },
        summary: summary
      };

      // Cache the content before returning
      this.cacheContent(url, format, content);

      return content;
    } catch (error) {
      if (axios.isAxiosError(error)) {
        throw new Error(`Failed to fetch webpage: ${error.message}`);
      }
      throw error;
    }
  }

  /**
   * Extracts several pages concurrently.
   *
   * A failure for one URL does not abort the others; it is reported as
   * `{ error }` under that URL's key.
   *
   * @param urls The page URLs to extract.
   * @param format Output format applied to every page (default 'markdown').
   * @returns A record keyed by URL holding either the content or an error.
   */
  async batchExtractContent(urls: string[], format: OutputFormat = 'markdown'): Promise<Record<string, WebpageContent | { error: string }>> {
    const results: Record<string, WebpageContent | { error: string }> = {};

    await Promise.all(
      urls.map(async (url) => {
        try {
          results[url] = await this.extractContent(url, format);
        } catch (error) {
          results[url] = {
            error: error instanceof Error ? error.message : 'Unknown error occurred'
          };
        }
      })
    );

    return results;
  }
}
package/tasks.md DELETED
@@ -1,141 +0,0 @@
1
- # Google Search MCP Server Improvement Plan
2
-
3
- ## Current Implementation
4
- The Google Search MCP Server currently provides three tools:
5
- 1. `google_search` - Searches Google and returns relevant results
6
- 2. `extract_webpage_content` - Extracts content from a single webpage
7
- 3. `extract_multiple_webpages` - Extracts content from multiple webpages
8
-
9
- ## Improvement Roadmap
10
-
11
- ### Phase 1: Enhanced Search Capabilities
12
-
13
- #### Task 1: Add Basic Search Filters
14
- - [x] Add site-specific search parameter (`site:example.com`)
15
- - [x] Add language filter parameter
16
- - [x] Update input schema and documentation
17
- - [x] Test functionality with different filter combinations
18
-
19
- #### Task 2: Add Date Range Filtering
20
- - [x] Add date range parameters (start date, end date)
21
- - [x] Implement date formatting and validation
22
- - [x] Update Google API query construction
23
- - [x] Test functionality with various date ranges
24
-
25
- #### Task 3: Add Result Type Specification
26
- - [x] Add parameter for result type (news, images, videos)
27
- - [x] Implement type-specific query parameters
28
- - [x] Update result processing for each type
29
- - [x] Test different result types
30
-
31
- #### Task 4: Implement Pagination Support
32
- - [x] Add pagination parameters (page number, results per page)
33
- - [x] Implement pagination logic using Google API's `start` parameter
34
- - [x] Add metadata about total results and current page
35
- - [x] Test pagination functionality
36
-
37
- #### Task 5: Add Sorting Options
38
- - [x] Add sorting parameter (relevance, date)
39
- - [x] Implement sort parameter handling
40
- - [x] Test different sorting options
41
-
42
- #### Task 6: Implement Result Categorization
43
- - [x] Design categorization algorithm (by domain, topic, or content type)
44
- - [x] Implement result clustering/categorization
45
- - [x] Add category information to the response
46
- - [x] Test categorization with various search queries
47
-
48
- ### Phase 2: Advanced Content Extraction
49
-
50
- #### Task 7: Support Different Output Formats
51
- - [x] Add parameter for output format (markdown, HTML, plain text)
52
- - [x] Implement format conversion functions
53
- - [x] Test output in different formats
54
-
55
- #### Task 8: Add Content Summarization
56
- - [ ] Research summarization approaches
57
- - [ ] Implement text summarization algorithm
58
- - [ ] Add summary to extraction results
59
- - [ ] Test summarization with various content types
60
-
61
- #### Task 9: Extract Specific Elements
62
- - [ ] Add support for extracting tables
63
- - [ ] Add support for extracting lists
64
- - [ ] Add support for extracting code blocks
65
- - [ ] Test specific element extraction
66
-
67
- #### Task 10: Implement Image Extraction
68
- - [ ] Add functionality to identify and extract images
69
- - [ ] Process image metadata (alt text, dimensions)
70
- - [ ] Return image URLs and metadata
71
- - [ ] Test image extraction from various pages
72
-
73
- #### Task 11: Add Content Translation Support
74
- - [ ] Research translation API options
75
- - [ ] Integrate with a translation service
76
- - [ ] Add target language parameter
77
- - [ ] Test translation functionality
78
-
79
- ### Phase 3: Performance and Infrastructure Improvements
80
-
81
- #### Task 12: Implement Result Caching
82
- - [x] Design cache structure for search results
83
- - [x] Implement cache lookup before making new requests
84
- - [x] Add cache expiration mechanism
85
- - [x] Test cache hit and miss scenarios
86
-
87
- #### Task 13: Add Content Cache Layer
88
- - [x] Design cache structure for webpage content
89
- - [x] Implement content cache lookup and storage
90
- - [x] Add cache invalidation strategy
91
- - [x] Test content caching performance
92
-
93
- #### Task 14: Implement Rate Limiting
94
- - [ ] Add rate limiting configuration
95
- - [ ] Implement request throttling
96
- - [ ] Add rate limit information in responses
97
- - [ ] Test rate limiting behavior
98
-
99
- #### Task 15: Add Concurrent Request Handling
100
- - [ ] Implement batch processing for search requests
101
- - [ ] Add parallel processing for multiple content extractions
102
- - [ ] Optimize resource usage during concurrent operations
103
- - [ ] Test performance with concurrent requests
104
-
105
- #### Task 16: Support Custom User-Agent Strings
106
- - [ ] Add user-agent parameter
107
- - [ ] Implement user-agent validation
108
- - [ ] Update request headers with custom user-agent
109
- - [ ] Test different user-agent strings
110
-
111
- #### Task 17: Add Proxy Support
112
- - [ ] Add proxy configuration options
113
- - [ ] Implement proxy routing for requests
114
- - [ ] Add fallback mechanism for proxy failures
115
- - [ ] Test proxy functionality
116
-
117
- ### Phase 4: Finalization and Documentation
118
-
119
- #### Task 18: Comprehensive Testing
120
- - [ ] Develop automated tests for all new features
121
- - [ ] Perform integration testing
122
- - [ ] Stress test with high volume of requests
123
- - [ ] Fix any identified issues
124
-
125
- #### Task 19: Update Documentation
126
- - [ ] Update server documentation with new capabilities
127
- - [ ] Create examples for each new feature
128
- - [ ] Document best practices and recommendations
129
- - [ ] Create troubleshooting guide
130
-
131
- #### Task 20: Performance Optimization
132
- - [ ] Profile and identify bottlenecks
133
- - [ ] Optimize resource usage
134
- - [ ] Implement additional caching if needed
135
- - [ ] Benchmark performance improvements
136
-
137
- ## Implementation Notes
138
- - Each task should be completed and tested independently
139
- - Regular commits should be made after each feature is implemented
140
- - Follow existing code patterns and naming conventions
141
- - Maintain backward compatibility where possible