pse-mcp 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -220
- package/dist/google-search.js +4 -190
- package/dist/services/google-search.service.js +1 -1
- package/dist-package/dist/google-search.js +0 -0
- package/dist-package/package.json +22 -22
- package/package.json +40 -40
- package/src/google-search.ts +4 -214
- package/src/services/google-search.service.ts +1 -1
- package/src/types.ts +0 -22
- package/GEMINI.md +0 -72
- package/QWEN.md +0 -207
- package/dist/content-fetcher.js +0 -36
- package/dist/services/content-extractor.service.js +0 -195
- package/src/services/content-extractor.service.ts +0 -232
- package/tasks.md +0 -141
package/dist/content-fetcher.js
DELETED
@@ -1,36 +0,0 @@
-import axios from 'axios';
-export class ContentFetcher {
-    constructor(port = 5001) {
-        this.baseUrl = `http://localhost:${port}`;
-    }
-    async fetchContent(url) {
-        try {
-            const response = await axios.post(`${this.baseUrl}/analyze`, { url });
-            return response.data;
-        }
-        catch (error) {
-            if (axios.isAxiosError(error)) {
-                throw new Error(`Failed to fetch content: ${error.response?.data?.error || error.message}`);
-            }
-            if (error instanceof Error) {
-                throw new Error(`Failed to fetch content: ${error.message}`);
-            }
-            throw new Error('Failed to fetch content: Unknown error');
-        }
-    }
-    async batchFetchContent(urls) {
-        try {
-            const response = await axios.post(`${this.baseUrl}/batch_analyze`, { urls });
-            return response.data;
-        }
-        catch (error) {
-            if (axios.isAxiosError(error)) {
-                throw new Error(`Failed to batch fetch content: ${error.response?.data?.error || error.message}`);
-            }
-            if (error instanceof Error) {
-                throw new Error(`Failed to batch fetch content: ${error.message}`);
-            }
-            throw new Error('Failed to batch fetch content: Unknown error');
-        }
-    }
-}
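For orientation, here is a minimal sketch of how the removed ContentFetcher was driven. It assumes, as the deleted constructor implies, a separate analysis service listening on localhost:5001 with POST /analyze and /batch_analyze endpoints; the response shapes are whatever that service returns, and the import path is hypothetical:

```ts
// Hypothetical consumer of the now-removed ContentFetcher (sketch only).
import { ContentFetcher } from './content-fetcher.js';

const fetcher = new ContentFetcher(); // defaults to http://localhost:5001

// Single URL: POSTs { url } to /analyze and returns the response body.
const analysis = await fetcher.fetchContent('https://example.com');

// Multiple URLs: POSTs { urls } to /batch_analyze in one request.
const batch = await fetcher.batchFetchContent([
  'https://example.com',
  'https://example.org',
]);
```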
package/dist/services/content-extractor.service.js
DELETED
@@ -1,195 +0,0 @@
-import axios from 'axios';
-import * as cheerio from 'cheerio';
-import { Readability } from '@mozilla/readability';
-import { JSDOM } from 'jsdom';
-import MarkdownIt from 'markdown-it';
-import TurndownService from 'turndown';
-export class ContentExtractor {
-    constructor() {
-        // Cache for webpage content (key: url + format, value: content)
-        this.contentCache = new Map();
-        // Cache expiration time in milliseconds (30 minutes)
-        this.cacheTTL = 30 * 60 * 1000;
-        this.md = new MarkdownIt();
-        this.turndownService = new TurndownService({
-            headingStyle: 'atx',
-            codeBlockStyle: 'fenced'
-        });
-    }
-    cleanText(text) {
-        // Remove multiple blank lines
-        text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
-        // Remove excessive spaces
-        text = text.replace(/ +/g, ' ');
-        return text.trim();
-    }
-    cleanMarkdown(text) {
-        let cleanedText = this.cleanText(text);
-        // Ensure headers have space after #
-        cleanedText = cleanedText.replace(/#([A-Za-z0-9])/g, '# $1');
-        return cleanedText;
-    }
-    htmlToMarkdown(html) {
-        return this.cleanMarkdown(this.turndownService.turndown(html));
-    }
-    htmlToPlainText(html) {
-        const dom = new JSDOM(html);
-        return this.cleanText(dom.window.document.body.textContent || '');
-    }
-    isValidUrl(url) {
-        try {
-            new URL(url);
-            return true;
-        }
-        catch {
-            return false;
-        }
-    }
-    /**
-     * Generate a cache key from URL and format
-     */
-    generateCacheKey(url, format) {
-        return `${url}|${format}`;
-    }
-    /**
-     * Check if a cache entry is still valid
-     */
-    isCacheValid(entry) {
-        const now = Date.now();
-        return now - entry.timestamp < this.cacheTTL;
-    }
-    /**
-     * Store webpage content in cache
-     */
-    cacheContent(url, format, content) {
-        const cacheKey = this.generateCacheKey(url, format);
-        this.contentCache.set(cacheKey, {
-            timestamp: Date.now(),
-            content
-        });
-        // Limit cache size to prevent memory issues (max 50 entries)
-        if (this.contentCache.size > 50) {
-            // Delete oldest entry
-            const oldestKey = Array.from(this.contentCache.entries())
-                .sort((a, b) => a[1].timestamp - b[1].timestamp)[0][0];
-            this.contentCache.delete(oldestKey);
-        }
-    }
-    /**
-     * Generates a concise summary of the content
-     * @param content The content to summarize
-     * @param maxLength Maximum length of the summary
-     * @returns A summary of the content
-     */
-    generateSummary(content, maxLength = 300) {
-        // Simple summarization: take first few sentences up to maxLength
-        const sentences = content.split(/(?<=[.!?])\s+/);
-        let summary = '';
-        for (const sentence of sentences) {
-            if ((summary + sentence).length <= maxLength) {
-                summary += sentence + ' ';
-            }
-            else {
-                break;
-            }
-        }
-        return summary.trim() + (summary.length < content.length ? '...' : '');
-    }
-    async extractContent(url, format = 'markdown') {
-        if (!this.isValidUrl(url)) {
-            throw new Error('Invalid URL provided');
-        }
-        // Check cache first
-        const cacheKey = this.generateCacheKey(url, format);
-        const cachedContent = this.contentCache.get(cacheKey);
-        if (cachedContent && this.isCacheValid(cachedContent)) {
-            console.error(`Using cached content for ${url}`);
-            return cachedContent.content;
-        }
-        try {
-            // Fetch webpage content
-            const response = await axios.get(url, {
-                headers: {
-                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-                },
-                timeout: 10000
-            });
-            // Parse with Cheerio for metadata
-            const $ = cheerio.load(response.data);
-            const metaTags = {};
-            // Only extract the most important meta tags to reduce data volume
-            const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];
-            $('meta').each((_, element) => {
-                const name = $(element).attr('name') || $(element).attr('property') || '';
-                const content = $(element).attr('content') || '';
-                if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
-                    metaTags[name] = content;
-                }
-            });
-            // Use Readability for main content extraction
-            const dom = new JSDOM(response.data);
-            const reader = new Readability(dom.window.document);
-            const article = reader.parse();
-            if (!article) {
-                throw new Error('Failed to extract content from webpage');
-            }
-            // Convert content based on requested format
-            let contentStr;
-            switch (format) {
-                case 'html':
-                    contentStr = article.content || '';
-                    break;
-                case 'text':
-                    contentStr = this.htmlToPlainText(article.content || '');
-                    break;
-                case 'markdown':
-                default:
-                    contentStr = this.htmlToMarkdown(article.content || '');
-                    break;
-            }
-            // Calculate content stats
-            const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;
-            // Generate a summary of the content
-            const summary = this.generateSummary(contentStr);
-            const content = {
-                url,
-                title: $('title').text() || article.title || '',
-                description: metaTags['description'] || '',
-                content: contentStr,
-                format: format,
-                meta_tags: metaTags,
-                stats: {
-                    word_count: wordCount,
-                    approximate_chars: contentStr.length
-                },
-                content_preview: {
-                    first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
-                },
-                summary: summary
-            };
-            // Cache the content before returning
-            this.cacheContent(url, format, content);
-            return content;
-        }
-        catch (error) {
-            if (axios.isAxiosError(error)) {
-                throw new Error(`Failed to fetch webpage: ${error.message}`);
-            }
-            throw error;
-        }
-    }
-    async batchExtractContent(urls, format = 'markdown') {
-        const results = {};
-        await Promise.all(urls.map(async (url) => {
-            try {
-                results[url] = await this.extractContent(url, format);
-            }
-            catch (error) {
-                results[url] = {
-                    error: error instanceof Error ? error.message : 'Unknown error occurred'
-                };
-            }
-        }));
-        return results;
-    }
-}
package/src/services/content-extractor.service.ts
DELETED
@@ -1,232 +0,0 @@
-import axios from 'axios';
-import * as cheerio from 'cheerio';
-import { Readability } from '@mozilla/readability';
-import { JSDOM } from 'jsdom';
-import MarkdownIt from 'markdown-it';
-import { WebpageContent, OutputFormat } from '../types.js';
-import TurndownService from 'turndown';
-
-interface ContentCacheEntry {
-    timestamp: number;
-    content: WebpageContent;
-}
-
-export class ContentExtractor {
-    private md: MarkdownIt;
-    private turndownService: TurndownService;
-    // Cache for webpage content (key: url + format, value: content)
-    private contentCache: Map<string, ContentCacheEntry> = new Map();
-    // Cache expiration time in milliseconds (30 minutes)
-    private cacheTTL: number = 30 * 60 * 1000;
-
-    constructor() {
-        this.md = new MarkdownIt();
-        this.turndownService = new TurndownService({
-            headingStyle: 'atx',
-            codeBlockStyle: 'fenced'
-        });
-    }
-
-    private cleanText(text: string): string {
-        // Remove multiple blank lines
-        text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
-        // Remove excessive spaces
-        text = text.replace(/ +/g, ' ');
-        return text.trim();
-    }
-
-    private cleanMarkdown(text: string): string {
-        let cleanedText = this.cleanText(text);
-        // Ensure headers have space after #
-        cleanedText = cleanedText.replace(/#([A-Za-z0-9])/g, '# $1');
-        return cleanedText;
-    }
-
-    private htmlToMarkdown(html: string): string {
-        return this.cleanMarkdown(this.turndownService.turndown(html));
-    }
-
-    private htmlToPlainText(html: string): string {
-        const dom = new JSDOM(html);
-        return this.cleanText(dom.window.document.body.textContent || '');
-    }
-
-    private isValidUrl(url: string): boolean {
-        try {
-            new URL(url);
-            return true;
-        } catch {
-            return false;
-        }
-    }
-
-    /**
-     * Generate a cache key from URL and format
-     */
-    private generateCacheKey(url: string, format: OutputFormat): string {
-        return `${url}|${format}`;
-    }
-
-    /**
-     * Check if a cache entry is still valid
-     */
-    private isCacheValid(entry: ContentCacheEntry): boolean {
-        const now = Date.now();
-        return now - entry.timestamp < this.cacheTTL;
-    }
-
-    /**
-     * Store webpage content in cache
-     */
-    private cacheContent(url: string, format: OutputFormat, content: WebpageContent): void {
-        const cacheKey = this.generateCacheKey(url, format);
-        this.contentCache.set(cacheKey, {
-            timestamp: Date.now(),
-            content
-        });
-
-        // Limit cache size to prevent memory issues (max 50 entries)
-        if (this.contentCache.size > 50) {
-            // Delete oldest entry
-            const oldestKey = Array.from(this.contentCache.entries())
-                .sort((a, b) => a[1].timestamp - b[1].timestamp)[0][0];
-            this.contentCache.delete(oldestKey);
-        }
-    }
-
-    /**
-     * Generates a concise summary of the content
-     * @param content The content to summarize
-     * @param maxLength Maximum length of the summary
-     * @returns A summary of the content
-     */
-    private generateSummary(content: string, maxLength: number = 300): string {
-        // Simple summarization: take first few sentences up to maxLength
-        const sentences = content.split(/(?<=[.!?])\s+/);
-        let summary = '';
-
-        for (const sentence of sentences) {
-            if ((summary + sentence).length <= maxLength) {
-                summary += sentence + ' ';
-            } else {
-                break;
-            }
-        }
-
-        return summary.trim() + (summary.length < content.length ? '...' : '');
-    }
-
-    async extractContent(url: string, format: OutputFormat = 'markdown'): Promise<WebpageContent> {
-        if (!this.isValidUrl(url)) {
-            throw new Error('Invalid URL provided');
-        }
-
-        // Check cache first
-        const cacheKey = this.generateCacheKey(url, format);
-        const cachedContent = this.contentCache.get(cacheKey);
-        if (cachedContent && this.isCacheValid(cachedContent)) {
-            console.error(`Using cached content for ${url}`);
-            return cachedContent.content;
-        }
-
-        try {
-            // Fetch webpage content
-            const response = await axios.get(url, {
-                headers: {
-                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-                },
-                timeout: 10000
-            });
-
-            // Parse with Cheerio for metadata
-            const $ = cheerio.load(response.data);
-            const metaTags: Record<string, string> = {};
-
-            // Only extract the most important meta tags to reduce data volume
-            const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];
-
-            $('meta').each((_, element) => {
-                const name = $(element).attr('name') || $(element).attr('property') || '';
-                const content = $(element).attr('content') || '';
-                if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
-                    metaTags[name] = content;
-                }
-            });
-
-            // Use Readability for main content extraction
-            const dom = new JSDOM(response.data);
-            const reader = new Readability(dom.window.document);
-            const article = reader.parse();
-
-            if (!article) {
-                throw new Error('Failed to extract content from webpage');
-            }
-
-            // Convert content based on requested format
-            let contentStr: string;
-            switch (format) {
-                case 'html':
-                    contentStr = article.content || '';
-                    break;
-                case 'text':
-                    contentStr = this.htmlToPlainText(article.content || '');
-                    break;
-                case 'markdown':
-                default:
-                    contentStr = this.htmlToMarkdown(article.content || '');
                    break;
-            }
-
-            // Calculate content stats
-            const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;
-
-            // Generate a summary of the content
-            const summary = this.generateSummary(contentStr);
-
-            const content: WebpageContent = {
-                url,
-                title: ($('title').text() as string) || article.title || '',
-                description: metaTags['description'] || '',
-                content: contentStr,
-                format: format,
-                meta_tags: metaTags,
-                stats: {
-                    word_count: wordCount,
-                    approximate_chars: contentStr.length
-                },
-                content_preview: {
-                    first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
-                },
-                summary: summary
-            };
-
-            // Cache the content before returning
-            this.cacheContent(url, format, content);
-
-            return content;
-        } catch (error) {
-            if (axios.isAxiosError(error)) {
-                throw new Error(`Failed to fetch webpage: ${error.message}`);
-            }
-            throw error;
-        }
-    }
-
-    async batchExtractContent(urls: string[], format: OutputFormat = 'markdown'): Promise<Record<string, WebpageContent | { error: string }>> {
-        const results: Record<string, WebpageContent | { error: string }> = {};
-
-        await Promise.all(
-            urls.map(async (url) => {
-                try {
-                    results[url] = await this.extractContent(url, format);
-                } catch (error) {
-                    results[url] = {
-                        error: error instanceof Error ? error.message : 'Unknown error occurred'
-                    };
-                }
-            })
-        );
-
-        return results;
-    }
-}
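The removed extractor (both this source file and the compiled copy above) was self-contained. A minimal usage sketch based on the deleted source follows; the driver script itself is hypothetical, but the method names, formats, and return shapes come from the code above:

```ts
import { ContentExtractor } from './services/content-extractor.service.js';

const extractor = new ContentExtractor();

// Single page; format defaults to 'markdown', with 'html' and 'text' also accepted.
const page = await extractor.extractContent('https://example.com');
console.log(page.title, page.stats.word_count, page.summary);

// Repeat calls within 30 minutes for the same url+format served from the in-memory cache.

// Batch extraction resolves to a map of url -> WebpageContent or { error }.
const results = await extractor.batchExtractContent(
  ['https://example.com', 'https://example.org'],
  'text'
);
for (const [url, value] of Object.entries(results)) {
  console.log(url, 'error' in value ? value.error : value.stats.word_count);
}
```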
package/tasks.md
DELETED
@@ -1,141 +0,0 @@
-# Google Search MCP Server Improvement Plan
-
-## Current Implementation
-The Google Search MCP Server currently provides three tools:
-1. `google_search` - Searches Google and returns relevant results
-2. `extract_webpage_content` - Extracts content from a single webpage
-3. `extract_multiple_webpages` - Extracts content from multiple webpages
-
-## Improvement Roadmap
-
-### Phase 1: Enhanced Search Capabilities
-
-#### Task 1: Add Basic Search Filters
-- [x] Add site-specific search parameter (`site:example.com`)
-- [x] Add language filter parameter
-- [x] Update input schema and documentation
-- [x] Test functionality with different filter combinations
-
-#### Task 2: Add Date Range Filtering
-- [x] Add date range parameters (start date, end date)
-- [x] Implement date formatting and validation
-- [x] Update Google API query construction
-- [x] Test functionality with various date ranges
-
-#### Task 3: Add Result Type Specification
-- [x] Add parameter for result type (news, images, videos)
-- [x] Implement type-specific query parameters
-- [x] Update result processing for each type
-- [x] Test different result types
-
-#### Task 4: Implement Pagination Support
-- [x] Add pagination parameters (page number, results per page)
-- [x] Implement pagination logic using Google API's `start` parameter
-- [x] Add metadata about total results and current page
-- [x] Test pagination functionality
-
-#### Task 5: Add Sorting Options
-- [x] Add sorting parameter (relevance, date)
-- [x] Implement sort parameter handling
-- [x] Test different sorting options
-
-#### Task 6: Implement Result Categorization
-- [x] Design categorization algorithm (by domain, topic, or content type)
-- [x] Implement result clustering/categorization
-- [x] Add category information to the response
-- [x] Test categorization with various search queries
-
-### Phase 2: Advanced Content Extraction
-
-#### Task 7: Support Different Output Formats
-- [x] Add parameter for output format (markdown, HTML, plain text)
-- [x] Implement format conversion functions
-- [x] Test output in different formats
-
-#### Task 8: Add Content Summarization
-- [ ] Research summarization approaches
-- [ ] Implement text summarization algorithm
-- [ ] Add summary to extraction results
-- [ ] Test summarization with various content types
-
-#### Task 9: Extract Specific Elements
-- [ ] Add support for extracting tables
-- [ ] Add support for extracting lists
-- [ ] Add support for extracting code blocks
-- [ ] Test specific element extraction
-
-#### Task 10: Implement Image Extraction
-- [ ] Add functionality to identify and extract images
-- [ ] Process image metadata (alt text, dimensions)
-- [ ] Return image URLs and metadata
-- [ ] Test image extraction from various pages
-
-#### Task 11: Add Content Translation Support
-- [ ] Research translation API options
-- [ ] Integrate with a translation service
-- [ ] Add target language parameter
-- [ ] Test translation functionality
-
-### Phase 3: Performance and Infrastructure Improvements
-
-#### Task 12: Implement Result Caching
-- [x] Design cache structure for search results
-- [x] Implement cache lookup before making new requests
-- [x] Add cache expiration mechanism
-- [x] Test cache hit and miss scenarios
-
-#### Task 13: Add Content Cache Layer
-- [x] Design cache structure for webpage content
-- [x] Implement content cache lookup and storage
-- [x] Add cache invalidation strategy
-- [x] Test content caching performance
-
-#### Task 14: Implement Rate Limiting
-- [ ] Add rate limiting configuration
-- [ ] Implement request throttling
-- [ ] Add rate limit information in responses
-- [ ] Test rate limiting behavior
-
-#### Task 15: Add Concurrent Request Handling
-- [ ] Implement batch processing for search requests
-- [ ] Add parallel processing for multiple content extractions
-- [ ] Optimize resource usage during concurrent operations
-- [ ] Test performance with concurrent requests
-
-#### Task 16: Support Custom User-Agent Strings
-- [ ] Add user-agent parameter
-- [ ] Implement user-agent validation
-- [ ] Update request headers with custom user-agent
-- [ ] Test different user-agent strings
-
-#### Task 17: Add Proxy Support
-- [ ] Add proxy configuration options
-- [ ] Implement proxy routing for requests
-- [ ] Add fallback mechanism for proxy failures
-- [ ] Test proxy functionality
-
-### Phase 4: Finalization and Documentation
-
-#### Task 18: Comprehensive Testing
-- [ ] Develop automated tests for all new features
-- [ ] Perform integration testing
-- [ ] Stress test with high volume of requests
-- [ ] Fix any identified issues
-
-#### Task 19: Update Documentation
-- [ ] Update server documentation with new capabilities
-- [ ] Create examples for each new feature
-- [ ] Document best practices and recommendations
-- [ ] Create troubleshooting guide
-
-#### Task 20: Performance Optimization
-- [ ] Profile and identify bottlenecks
-- [ ] Optimize resource usage
-- [ ] Implement additional caching if needed
-- [ ] Benchmark performance improvements
-
-## Implementation Notes
-- Each task should be completed and tested independently
-- Regular commits should be made after each feature is implemented
-- Follow existing code patterns and naming conventions
-- Maintain backward compatibility where possible