pse-mcp 0.1.0 → 0.1.2
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +104 -220
- package/dist-package/dist/google-search.js +0 -0
- package/dist-package/package.json +22 -22
- package/package.json +40 -40
- package/src/google-search.ts +4 -214
- package/src/services/google-search.service.ts +1 -1
- package/src/types.ts +0 -22
- package/GEMINI.md +0 -72
- package/QWEN.md +0 -207
- package/dist/content-fetcher.js +0 -36
- package/dist/google-search.js +0 -421
- package/dist/services/content-extractor.service.js +0 -195
- package/dist/services/google-search.service.js +0 -244
- package/dist/types.js +0 -1
- package/src/services/content-extractor.service.ts +0 -232
- package/tasks.md +0 -141
package/dist/services/google-search.service.js
DELETED
@@ -1,244 +0,0 @@
import { google } from 'googleapis';
import { URL } from 'url';
export class GoogleSearchService {
    constructor() {
        // Cache for search results (key: query string + filters, value: results)
        this.searchCache = new Map();
        // Cache expiration time in milliseconds (5 minutes)
        this.cacheTTL = 5 * 60 * 1000;
        const apiKey = process.env.GOOGLE_API_KEY;
        const searchEngineId = process.env.GOOGLE_SEARCH_ENGINE_ID;
        if (!apiKey || !searchEngineId) {
            throw new Error('Missing required environment variables: GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID');
        }
        // Initialize Google Custom Search API
        this.customSearch = google.customsearch('v1').cse;
        this.searchEngineId = searchEngineId;
        // Set up the API client
        google.options({
            auth: apiKey
        });
    }
    /**
     * Generate a cache key from search parameters
     */
    generateCacheKey(query, numResults, filters) {
        return JSON.stringify({
            query,
            numResults,
            filters
        });
    }
    /**
     * Check if a cache entry is still valid
     */
    isCacheValid(entry) {
        const now = Date.now();
        return now - entry.timestamp < this.cacheTTL;
    }
    /**
     * Store search results in cache
     */
    cacheSearchResults(cacheKey, results, pagination, categories) {
        this.searchCache.set(cacheKey, {
            timestamp: Date.now(),
            data: { results, pagination, categories }
        });
        // Limit cache size to prevent memory issues (max 100 entries)
        if (this.searchCache.size > 100) {
            // Delete oldest entry
            const oldestKey = Array.from(this.searchCache.entries())
                .sort((a, b) => a[1].timestamp - b[1].timestamp)[0][0];
            this.searchCache.delete(oldestKey);
        }
    }
    async search(query, numResults = 5, filters) {
        try {
            // Generate cache key
            const cacheKey = this.generateCacheKey(query, numResults, filters);
            // Check cache first
            const cachedResult = this.searchCache.get(cacheKey);
            if (cachedResult && this.isCacheValid(cachedResult)) {
                console.error('Using cached search results');
                return cachedResult.data;
            }
            let formattedQuery = query;
            // Apply site filter if provided
            if (filters?.site) {
                formattedQuery += ` site:${filters.site}`;
            }
            // Apply exact terms if provided
            if (filters?.exactTerms) {
                formattedQuery += ` "${filters.exactTerms}"`;
            }
            // Set default pagination values if not provided
            const page = filters?.page && filters.page > 0 ? filters.page : 1;
            const resultsPerPage = filters?.resultsPerPage ? Math.min(filters.resultsPerPage, 10) : Math.min(numResults, 10);
            // Calculate start index for pagination (Google uses 1-based indexing)
            const startIndex = (page - 1) * resultsPerPage + 1;
            const params = {
                cx: this.searchEngineId,
                q: formattedQuery,
                num: resultsPerPage,
                start: startIndex
            };
            // Apply language filter if provided
            if (filters?.language) {
                params.lr = `lang_${filters.language}`;
            }
            // Apply date restriction if provided
            if (filters?.dateRestrict) {
                params.dateRestrict = filters.dateRestrict;
            }
            // Apply result type filter if provided
            if (filters?.resultType) {
                switch (filters.resultType.toLowerCase()) {
                    case 'image':
                    case 'images':
                        params.searchType = 'image';
                        break;
                    case 'news':
                        // For news, we need to modify the query
                        formattedQuery += ' source:news';
                        params.q = formattedQuery;
                        break;
                    case 'video':
                    case 'videos':
                        // For videos, we can use a more specific filter
                        formattedQuery += ' filetype:video OR inurl:video OR inurl:watch';
                        params.q = formattedQuery;
                        break;
                }
            }
            // Apply sorting if provided
            if (filters?.sort) {
                switch (filters.sort.toLowerCase()) {
                    case 'date':
                        // Sort by date (most recent first)
                        params.sort = 'date';
                        break;
                    case 'relevance':
                    default:
                        // Google's default sort is by relevance, so we don't need to specify
                        break;
                }
            }
            const response = await this.customSearch.list(params);
            // If no items are found, return empty results with pagination info
            if (!response.data.items) {
                return {
                    results: [],
                    pagination: {
                        currentPage: page,
                        resultsPerPage,
                        totalResults: 0,
                        totalPages: 0,
                        hasNextPage: false,
                        hasPreviousPage: page > 1
                    },
                    categories: []
                };
            }
            // Map the search results and categorize them
            const results = response.data.items.map(item => {
                const result = {
                    title: item.title || '',
                    link: item.link || '',
                    snippet: item.snippet || '',
                    pagemap: item.pagemap || {},
                    datePublished: item.pagemap?.metatags?.[0]?.['article:published_time'] || '',
                    source: 'google_search'
                };
                // Add category to the result
                result.category = this.categorizeResult(result);
                return result;
            });
            // Generate category statistics
            const categories = this.generateCategoryStats(results);
            // Create pagination information
            const totalResults = parseInt(response.data.searchInformation?.totalResults || '0', 10);
            const totalPages = Math.ceil(totalResults / resultsPerPage);
            const pagination = {
                currentPage: page,
                resultsPerPage,
                totalResults,
                totalPages,
                hasNextPage: page < totalPages,
                hasPreviousPage: page > 1
            };
            // Cache the results before returning
            this.cacheSearchResults(cacheKey, results, pagination, categories);
            return {
                results,
                pagination,
                categories
            };
        }
        catch (error) {
            if (error instanceof Error) {
                throw new Error(`Google Search API error: ${error.message}`);
            }
            throw new Error('Unknown error during Google search');
        }
    }
    /**
     * Categorizes a search result based on its content
     * @param result The search result to categorize
     * @returns The category name
     */
    categorizeResult(result) {
        try {
            // Extract the domain from the URL
            const url = new URL(result.link);
            const domain = url.hostname.replace(/^www\./, '');
            // Check if this is a social media site
            if (domain.match(/facebook\.com|twitter\.com|instagram\.com|linkedin\.com|pinterest\.com|tiktok\.com|reddit\.com/i)) {
                return 'Social Media';
            }
            // Check if this is a video site
            if (domain.match(/youtube\.com|vimeo\.com|dailymotion\.com|twitch\.tv/i)) {
                return 'Video';
            }
            // Check if this is a news site
            if (domain.match(/news|cnn\.com|bbc\.com|nytimes\.com|wsj\.com|reuters\.com|bloomberg\.com/i)) {
                return 'News';
            }
            // Check if this is an educational site
            if (domain.match(/\.edu$|wikipedia\.org|khan|course|learn|study|academic/i)) {
                return 'Educational';
            }
            // Check if this is a documentation site
            if (domain.match(/docs|documentation|developer|github\.com|gitlab\.com|bitbucket\.org|stackoverflow\.com/i) ||
                result.title.match(/docs|documentation|api|reference|manual/i)) {
                return 'Documentation';
            }
            // Check if this is a shopping site
            if (domain.match(/amazon\.com|ebay\.com|etsy\.com|walmart\.com|shop|store|buy/i)) {
                return 'Shopping';
            }
            // Default category based on domain
            return domain.split('.').slice(-2, -1)[0].charAt(0).toUpperCase() + domain.split('.').slice(-2, -1)[0].slice(1);
        }
        catch (error) {
            // If there's any error in categorization, return a default category
            return 'Other';
        }
    }
    /**
     * Generates category statistics from search results
     * @param results The search results to analyze
     * @returns An array of category information
     */
    generateCategoryStats(results) {
        // Count results by category
        const categoryCounts = {};
        results.forEach(result => {
            const category = result.category || 'Other';
            categoryCounts[category] = (categoryCounts[category] || 0) + 1;
        });
        // Convert to array of category info objects
        return Object.entries(categoryCounts)
            .map(([name, count]) => ({ name, count }))
            .sort((a, b) => b.count - a.count); // Sort by count in descending order
    }
}
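For orientation, a minimal usage sketch of the deleted GoogleSearchService (hypothetical, not shipped with the package; it assumes GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID are set in the environment and that the import path mirrors the dist layout):

// Hypothetical usage sketch; not part of pse-mcp itself.
// Assumes GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID are exported in the environment.
import { GoogleSearchService } from './services/google-search.service.js';

const service = new GoogleSearchService();
const { results, pagination, categories } = await service.search('model context protocol', 5, {
    site: 'github.com',   // appended to the query as `site:github.com`
    sort: 'date',         // forwarded as the API's `sort` parameter
    page: 1,
    resultsPerPage: 5
});
console.log(`${pagination.totalResults} results across ${pagination.totalPages} pages`);
console.log(results.map(r => `[${r.category}] ${r.title}`));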
package/dist/types.js
DELETED
@@ -1 +0,0 @@
export {};
package/src/services/content-extractor.service.ts
DELETED
@@ -1,232 +0,0 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import MarkdownIt from 'markdown-it';
import { WebpageContent, OutputFormat } from '../types.js';
import TurndownService from 'turndown';

interface ContentCacheEntry {
    timestamp: number;
    content: WebpageContent;
}

export class ContentExtractor {
    private md: MarkdownIt;
    private turndownService: TurndownService;
    // Cache for webpage content (key: url + format, value: content)
    private contentCache: Map<string, ContentCacheEntry> = new Map();
    // Cache expiration time in milliseconds (30 minutes)
    private cacheTTL: number = 30 * 60 * 1000;

    constructor() {
        this.md = new MarkdownIt();
        this.turndownService = new TurndownService({
            headingStyle: 'atx',
            codeBlockStyle: 'fenced'
        });
    }

    private cleanText(text: string): string {
        // Remove multiple blank lines
        text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
        // Remove excessive spaces
        text = text.replace(/ +/g, ' ');
        return text.trim();
    }

    private cleanMarkdown(text: string): string {
        let cleanedText = this.cleanText(text);
        // Ensure headers have space after #
        cleanedText = cleanedText.replace(/#([A-Za-z0-9])/g, '# $1');
        return cleanedText;
    }

    private htmlToMarkdown(html: string): string {
        return this.cleanMarkdown(this.turndownService.turndown(html));
    }

    private htmlToPlainText(html: string): string {
        const dom = new JSDOM(html);
        return this.cleanText(dom.window.document.body.textContent || '');
    }

    private isValidUrl(url: string): boolean {
        try {
            new URL(url);
            return true;
        } catch {
            return false;
        }
    }

    /**
     * Generate a cache key from URL and format
     */
    private generateCacheKey(url: string, format: OutputFormat): string {
        return `${url}|${format}`;
    }

    /**
     * Check if a cache entry is still valid
     */
    private isCacheValid(entry: ContentCacheEntry): boolean {
        const now = Date.now();
        return now - entry.timestamp < this.cacheTTL;
    }

    /**
     * Store webpage content in cache
     */
    private cacheContent(url: string, format: OutputFormat, content: WebpageContent): void {
        const cacheKey = this.generateCacheKey(url, format);
        this.contentCache.set(cacheKey, {
            timestamp: Date.now(),
            content
        });

        // Limit cache size to prevent memory issues (max 50 entries)
        if (this.contentCache.size > 50) {
            // Delete oldest entry
            const oldestKey = Array.from(this.contentCache.entries())
                .sort((a, b) => a[1].timestamp - b[1].timestamp)[0][0];
            this.contentCache.delete(oldestKey);
        }
    }

    /**
     * Generates a concise summary of the content
     * @param content The content to summarize
     * @param maxLength Maximum length of the summary
     * @returns A summary of the content
     */
    private generateSummary(content: string, maxLength: number = 300): string {
        // Simple summarization: take first few sentences up to maxLength
        const sentences = content.split(/(?<=[.!?])\s+/);
        let summary = '';

        for (const sentence of sentences) {
            if ((summary + sentence).length <= maxLength) {
                summary += sentence + ' ';
            } else {
                break;
            }
        }

        return summary.trim() + (summary.length < content.length ? '...' : '');
    }

    async extractContent(url: string, format: OutputFormat = 'markdown'): Promise<WebpageContent> {
        if (!this.isValidUrl(url)) {
            throw new Error('Invalid URL provided');
        }

        // Check cache first
        const cacheKey = this.generateCacheKey(url, format);
        const cachedContent = this.contentCache.get(cacheKey);
        if (cachedContent && this.isCacheValid(cachedContent)) {
            console.error(`Using cached content for ${url}`);
            return cachedContent.content;
        }

        try {
            // Fetch webpage content
            const response = await axios.get(url, {
                headers: {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                },
                timeout: 10000
            });

            // Parse with Cheerio for metadata
            const $ = cheerio.load(response.data);
            const metaTags: Record<string, string> = {};

            // Only extract the most important meta tags to reduce data volume
            const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];

            $('meta').each((_, element) => {
                const name = $(element).attr('name') || $(element).attr('property') || '';
                const content = $(element).attr('content') || '';
                if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
                    metaTags[name] = content;
                }
            });

            // Use Readability for main content extraction
            const dom = new JSDOM(response.data);
            const reader = new Readability(dom.window.document);
            const article = reader.parse();

            if (!article) {
                throw new Error('Failed to extract content from webpage');
            }

            // Convert content based on requested format
            let contentStr: string;
            switch (format) {
                case 'html':
                    contentStr = article.content || '';
                    break;
                case 'text':
                    contentStr = this.htmlToPlainText(article.content || '');
                    break;
                case 'markdown':
                default:
                    contentStr = this.htmlToMarkdown(article.content || '');
                    break;
            }

            // Calculate content stats
            const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;

            // Generate a summary of the content
            const summary = this.generateSummary(contentStr);

            const content: WebpageContent = {
                url,
                title: ($('title').text() as string) || article.title || '',
                description: metaTags['description'] || '',
                content: contentStr,
                format: format,
                meta_tags: metaTags,
                stats: {
                    word_count: wordCount,
                    approximate_chars: contentStr.length
                },
                content_preview: {
                    first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
                },
                summary: summary
            };

            // Cache the content before returning
            this.cacheContent(url, format, content);

            return content;
        } catch (error) {
            if (axios.isAxiosError(error)) {
                throw new Error(`Failed to fetch webpage: ${error.message}`);
            }
            throw error;
        }
    }

    async batchExtractContent(urls: string[], format: OutputFormat = 'markdown'): Promise<Record<string, WebpageContent | { error: string }>> {
        const results: Record<string, WebpageContent | { error: string }> = {};

        await Promise.all(
            urls.map(async (url) => {
                try {
                    results[url] = await this.extractContent(url, format);
                } catch (error) {
                    results[url] = {
                        error: error instanceof Error ? error.message : 'Unknown error occurred'
                    };
                }
            })
        );

        return results;
    }
}
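A similarly hedged sketch of how the deleted ContentExtractor could be driven (hypothetical; the import path and URLs below are placeholders):

// Hypothetical usage sketch; not part of pse-mcp itself.
import { ContentExtractor } from './services/content-extractor.service.js';

const extractor = new ContentExtractor();
// Single page, converted to markdown (the default output format).
const page = await extractor.extractContent('https://example.com/article');
console.log(page.stats.word_count, page.summary);
// Several pages in parallel; per-URL failures come back as { error } entries.
const batch = await extractor.batchExtractContent(
    ['https://example.com/a', 'https://example.com/b'],
    'text'
);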
package/tasks.md
DELETED
@@ -1,141 +0,0 @@
# Google Search MCP Server Improvement Plan

## Current Implementation
The Google Search MCP Server currently provides three tools:
1. `google_search` - Searches Google and returns relevant results
2. `extract_webpage_content` - Extracts content from a single webpage
3. `extract_multiple_webpages` - Extracts content from multiple webpages

## Improvement Roadmap

### Phase 1: Enhanced Search Capabilities

#### Task 1: Add Basic Search Filters
- [x] Add site-specific search parameter (`site:example.com`)
- [x] Add language filter parameter
- [x] Update input schema and documentation
- [x] Test functionality with different filter combinations

#### Task 2: Add Date Range Filtering
- [x] Add date range parameters (start date, end date)
- [x] Implement date formatting and validation
- [x] Update Google API query construction
- [x] Test functionality with various date ranges

#### Task 3: Add Result Type Specification
- [x] Add parameter for result type (news, images, videos)
- [x] Implement type-specific query parameters
- [x] Update result processing for each type
- [x] Test different result types

#### Task 4: Implement Pagination Support
- [x] Add pagination parameters (page number, results per page)
- [x] Implement pagination logic using Google API's `start` parameter
- [x] Add metadata about total results and current page
- [x] Test pagination functionality

#### Task 5: Add Sorting Options
- [x] Add sorting parameter (relevance, date)
- [x] Implement sort parameter handling
- [x] Test different sorting options

#### Task 6: Implement Result Categorization
- [x] Design categorization algorithm (by domain, topic, or content type)
- [x] Implement result clustering/categorization
- [x] Add category information to the response
- [x] Test categorization with various search queries

### Phase 2: Advanced Content Extraction

#### Task 7: Support Different Output Formats
- [x] Add parameter for output format (markdown, HTML, plain text)
- [x] Implement format conversion functions
- [x] Test output in different formats

#### Task 8: Add Content Summarization
- [ ] Research summarization approaches
- [ ] Implement text summarization algorithm
- [ ] Add summary to extraction results
- [ ] Test summarization with various content types

#### Task 9: Extract Specific Elements
- [ ] Add support for extracting tables
- [ ] Add support for extracting lists
- [ ] Add support for extracting code blocks
- [ ] Test specific element extraction

#### Task 10: Implement Image Extraction
- [ ] Add functionality to identify and extract images
- [ ] Process image metadata (alt text, dimensions)
- [ ] Return image URLs and metadata
- [ ] Test image extraction from various pages

#### Task 11: Add Content Translation Support
- [ ] Research translation API options
- [ ] Integrate with a translation service
- [ ] Add target language parameter
- [ ] Test translation functionality

### Phase 3: Performance and Infrastructure Improvements

#### Task 12: Implement Result Caching
- [x] Design cache structure for search results
- [x] Implement cache lookup before making new requests
- [x] Add cache expiration mechanism
- [x] Test cache hit and miss scenarios

#### Task 13: Add Content Cache Layer
- [x] Design cache structure for webpage content
- [x] Implement content cache lookup and storage
- [x] Add cache invalidation strategy
- [x] Test content caching performance

#### Task 14: Implement Rate Limiting
- [ ] Add rate limiting configuration
- [ ] Implement request throttling
- [ ] Add rate limit information in responses
- [ ] Test rate limiting behavior

#### Task 15: Add Concurrent Request Handling
- [ ] Implement batch processing for search requests
- [ ] Add parallel processing for multiple content extractions
- [ ] Optimize resource usage during concurrent operations
- [ ] Test performance with concurrent requests

#### Task 16: Support Custom User-Agent Strings
- [ ] Add user-agent parameter
- [ ] Implement user-agent validation
- [ ] Update request headers with custom user-agent
- [ ] Test different user-agent strings

#### Task 17: Add Proxy Support
- [ ] Add proxy configuration options
- [ ] Implement proxy routing for requests
- [ ] Add fallback mechanism for proxy failures
- [ ] Test proxy functionality

### Phase 4: Finalization and Documentation

#### Task 18: Comprehensive Testing
- [ ] Develop automated tests for all new features
- [ ] Perform integration testing
- [ ] Stress test with high volume of requests
- [ ] Fix any identified issues

#### Task 19: Update Documentation
- [ ] Update server documentation with new capabilities
- [ ] Create examples for each new feature
- [ ] Document best practices and recommendations
- [ ] Create troubleshooting guide

#### Task 20: Performance Optimization
- [ ] Profile and identify bottlenecks
- [ ] Optimize resource usage
- [ ] Implement additional caching if needed
- [ ] Benchmark performance improvements

## Implementation Notes
- Each task should be completed and tested independently
- Regular commits should be made after each feature is implemented
- Follow existing code patterns and naming conventions
- Maintain backward compatibility where possible
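Task 14 (rate limiting) was still unchecked when this file was deleted. A minimal sketch of one way it could have been implemented, assuming a simple in-process token bucket placed in front of the search call (hypothetical; the class and limits below are illustrative, not part of the package):

// Hypothetical rate-limiting sketch; not part of pse-mcp itself.
class TokenBucket {
    private tokens: number;
    private lastRefill = Date.now();

    constructor(private capacity: number, private refillPerSecond: number) {
        this.tokens = capacity;
    }

    // Refill proportionally to elapsed time, then spend one token if available.
    tryAcquire(): boolean {
        const now = Date.now();
        const elapsedSeconds = (now - this.lastRefill) / 1000;
        this.tokens = Math.min(this.capacity, this.tokens + elapsedSeconds * this.refillPerSecond);
        this.lastRefill = now;
        if (this.tokens >= 1) {
            this.tokens -= 1;
            return true;
        }
        return false;
    }
}

// Allow bursts of 20 requests, refilling at 10 per second.
const limiter = new TokenBucket(20, 10);
if (!limiter.tryAcquire()) {
    throw new Error('Rate limit exceeded; please retry later');
}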