pse-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
1
+ import axios from 'axios';
2
+ import * as cheerio from 'cheerio';
3
+ import { Readability } from '@mozilla/readability';
4
+ import { JSDOM } from 'jsdom';
5
+ import MarkdownIt from 'markdown-it';
6
+ import { WebpageContent, OutputFormat } from '../types.js';
7
+ import TurndownService from 'turndown';
8
+
9
/**
 * One slot of the extractor's in-memory page cache.
 */
interface ContentCacheEntry {
  /** Extracted page payload that is handed back on a cache hit. */
  content: WebpageContent;
  /** `Date.now()` value recorded when the entry was stored; used for TTL checks and eviction. */
  timestamp: number;
}
13
+
14
/**
 * Fetches web pages, extracts their main article content with Readability and
 * returns it as markdown, HTML or plain text together with metadata, simple
 * statistics and a short summary. Results are kept in a small in-memory cache
 * keyed by URL and output format.
 */
export class ContentExtractor {
  /** Maximum number of cached pages; the oldest entry is evicted beyond this. */
  private static readonly MAX_CACHE_ENTRIES = 50;

  // NOTE(review): initialised in the constructor but not referenced by any method of this class.
  private md: MarkdownIt;
  private turndownService: TurndownService;
  // Cache for webpage content (key: url + format, value: content)
  private contentCache: Map<string, ContentCacheEntry> = new Map();
  // Cache expiration time in milliseconds (30 minutes)
  private cacheTTL: number = 30 * 60 * 1000;

  constructor() {
    this.md = new MarkdownIt();
    this.turndownService = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced'
    });
  }

  /**
   * Collapses runs of three or more line breaks into one blank line, collapses
   * runs of spaces into a single space and trims the result.
   */
  private cleanText(text: string): string {
    // Remove multiple blank lines
    text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
    // Remove excessive spaces
    text = text.replace(/ +/g, ' ');
    return text.trim();
  }

  /**
   * Applies {@link cleanText} and makes sure ATX heading markers are followed
   * by a space.
   */
  private cleanMarkdown(text: string): string {
    const cleanedText = this.cleanText(text);
    // Only touch heading markers at the start of a line. Rewriting every "#x"
    // in the text would corrupt URL fragments (page#section), "C#7", etc.
    return cleanedText.replace(/^(#{1,6})(?=[^\s#])/gm, '$1 ');
  }

  /** Converts an HTML fragment to cleaned-up markdown. */
  private htmlToMarkdown(html: string): string {
    return this.cleanMarkdown(this.turndownService.turndown(html));
  }

  /** Converts an HTML fragment to cleaned-up plain text (body text content only). */
  private htmlToPlainText(html: string): string {
    const dom = new JSDOM(html);
    return this.cleanText(dom.window.document.body.textContent || '');
  }

  /**
   * Returns true only for syntactically valid http(s) URLs. Other schemes
   * (file:, javascript:, ...) are rejected because this class only fetches
   * web pages.
   */
  private isValidUrl(url: string): boolean {
    try {
      const { protocol } = new URL(url);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /**
   * Generate a cache key from URL and format
   */
  private generateCacheKey(url: string, format: OutputFormat): string {
    return `${url}|${format}`;
  }

  /**
   * Check if a cache entry is still valid
   */
  private isCacheValid(entry: ContentCacheEntry): boolean {
    return Date.now() - entry.timestamp < this.cacheTTL;
  }

  /**
   * Store webpage content in cache, evicting the oldest entries once the cache
   * holds more than MAX_CACHE_ENTRIES pages.
   */
  private cacheContent(url: string, format: OutputFormat, content: WebpageContent): void {
    const cacheKey = this.generateCacheKey(url, format);

    // Delete before set so a refreshed key moves to the end of the Map's
    // insertion order; the first key is then always the oldest entry.
    this.contentCache.delete(cacheKey);
    this.contentCache.set(cacheKey, {
      timestamp: Date.now(),
      content
    });

    while (this.contentCache.size > ContentExtractor.MAX_CACHE_ENTRIES) {
      const oldestKey = this.contentCache.keys().next().value;
      if (oldestKey === undefined) {
        break;
      }
      this.contentCache.delete(oldestKey);
    }
  }

  /**
   * Generates a concise summary of the content by taking leading sentences.
   * @param content The content to summarize
   * @param maxLength Maximum length of the summary (excluding the ellipsis)
   * @returns The leading sentences that fit, followed by '...' when content was cut
   */
  private generateSummary(content: string, maxLength: number = 300): string {
    const text = content.trim();
    if (text.length === 0) {
      return '';
    }

    const limit = Math.max(0, maxLength);
    const sentences = text.split(/(?<=[.!?])\s+/);
    const kept: string[] = [];
    let usedLength = 0;

    for (const sentence of sentences) {
      // One joining space is needed before every sentence except the first.
      const cost = sentence.length + (kept.length > 0 ? 1 : 0);
      if (usedLength + cost > limit) {
        break;
      }
      kept.push(sentence);
      usedLength += cost;
    }

    if (kept.length === sentences.length) {
      // Everything fitted: nothing was cut, so no ellipsis.
      return kept.join(' ');
    }
    if (kept.length === 0) {
      // The first sentence alone is too long; hard-truncate instead of
      // returning a bare ellipsis.
      return text.slice(0, limit).trimEnd() + '...';
    }
    return kept.join(' ') + '...';
  }

  /** Renders Readability's article HTML in the requested output format. */
  private renderArticle(articleHtml: string, format: OutputFormat): string {
    switch (format) {
      case 'html':
        return articleHtml;
      case 'text':
        return this.htmlToPlainText(articleHtml);
      case 'markdown':
      default:
        return this.htmlToMarkdown(articleHtml);
    }
  }

  /**
   * Downloads a page and extracts its main content.
   * @param url Absolute http(s) URL of the page
   * @param format Output format of the `content` field (defaults to markdown)
   * @returns The extracted content, metadata, statistics, preview and summary
   * @throws Error when the URL is invalid, the request fails or Readability
   *   cannot extract an article
   */
  async extractContent(url: string, format: OutputFormat = 'markdown'): Promise<WebpageContent> {
    if (!this.isValidUrl(url)) {
      throw new Error('Invalid URL provided');
    }

    // Check cache first
    const cacheKey = this.generateCacheKey(url, format);
    const cachedContent = this.contentCache.get(cacheKey);
    if (cachedContent) {
      if (this.isCacheValid(cachedContent)) {
        console.error(`Using cached content for ${url}`);
        return cachedContent.content;
      }
      // Drop the expired entry so it does not linger until size-based eviction.
      this.contentCache.delete(cacheKey);
    }

    try {
      // Fetch webpage content
      const response = await axios.get<string>(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        },
        timeout: 10000,
        // Keep the body as a string even when the server labels it as JSON.
        responseType: 'text'
      });
      const html = String(response.data ?? '');

      // Parse with Cheerio for metadata
      const $ = cheerio.load(html);
      const metaTags: Record<string, string> = {};

      // Only extract the most important meta tags to reduce data volume
      const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];

      $('meta').each((_, element) => {
        const name = $(element).attr('name') || $(element).attr('property') || '';
        const content = $(element).attr('content') || '';
        if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
          metaTags[name] = content;
        }
      });

      // Use Readability for main content extraction. Passing the page URL lets
      // jsdom resolve relative links and image sources to absolute ones.
      const dom = new JSDOM(html, { url });
      const article = new Readability(dom.window.document).parse();

      if (!article) {
        throw new Error('Failed to extract content from webpage');
      }

      // Convert content based on requested format
      const contentStr = this.renderArticle(article.content || '', format);

      // Calculate content stats
      const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;

      // Only the document's own <title>; a bare $('title') would also pick up
      // <title> elements nested in inline SVGs and concatenate them.
      const pageTitle = $('head > title').first().text().trim();

      const content: WebpageContent = {
        url,
        title: pageTitle || article.title || '',
        description: metaTags['description'] || '',
        content: contentStr,
        format: format,
        meta_tags: metaTags,
        stats: {
          word_count: wordCount,
          approximate_chars: contentStr.length
        },
        content_preview: {
          first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
        },
        summary: this.generateSummary(contentStr)
      };

      // Cache the content before returning
      this.cacheContent(url, format, content);

      return content;
    } catch (error) {
      if (axios.isAxiosError(error)) {
        throw new Error(`Failed to fetch webpage: ${error.message}`);
      }
      throw error;
    }
  }

  /**
   * Extracts several pages concurrently.
   * @param urls URLs to extract; duplicates are fetched only once
   * @param format Output format applied to every page
   * @returns A map from URL to its extracted content, or to `{ error }` when
   *   that page failed; one failure does not abort the others
   */
  async batchExtractContent(urls: string[], format: OutputFormat = 'markdown'): Promise<Record<string, WebpageContent | { error: string }>> {
    const uniqueUrls = Array.from(new Set(urls));

    const outcomes = await Promise.all(
      uniqueUrls.map(async (url): Promise<[string, WebpageContent | { error: string }]> => {
        try {
          return [url, await this.extractContent(url, format)];
        } catch (error) {
          return [url, { error: error instanceof Error ? error.message : 'Unknown error occurred' }];
        }
      })
    );

    // Assemble in input order so the key order does not depend on which
    // request finished first.
    const results: Record<string, WebpageContent | { error: string }> = {};
    for (const [url, outcome] of outcomes) {
      results[url] = outcome;
    }
    return results;
  }
}
@@ -0,0 +1,305 @@
1
+ import { google } from 'googleapis';
2
+ import { SearchResult, SearchFilters, SearchPaginationInfo, CategoryInfo } from '../types.js';
3
+ import { URL } from 'url';
4
+
5
/**
 * One slot of the search service's in-memory result cache.
 */
interface CacheEntry {
  /** The search payload returned to callers on a cache hit. */
  data: {
    results: SearchResult[];
    pagination?: SearchPaginationInfo;
    categories?: CategoryInfo[];
  };
  /** `Date.now()` value recorded when the entry was stored; used for TTL checks and eviction. */
  timestamp: number;
}
13
+
14
/**
 * Wrapper around the Google Custom Search JSON API that applies query filters,
 * computes pagination metadata, assigns a heuristic category to every result
 * and caches responses in memory for a few minutes.
 */
export class GoogleSearchService {
  /** Largest `num` value accepted by the API for a single request. */
  private static readonly MAX_RESULTS_PER_PAGE = 10;
  /** The API never serves results beyond this position, whatever the reported total. */
  private static readonly MAX_ACCESSIBLE_RESULTS = 100;
  /** Maximum number of cached searches; the oldest entry is evicted beyond this. */
  private static readonly MAX_CACHE_ENTRIES = 100;
  /** Labels that commonly form a two-part public suffix with a country code (co.uk, com.au, ...). */
  private static readonly SECOND_LEVEL_LABELS = /^(?:co|com|org|net|gov|edu|ac)$/i;

  /** Ordered category heuristics; the first rule whose domain (or title) pattern matches wins. */
  private static readonly CATEGORY_RULES: ReadonlyArray<{ category: string; domain: RegExp; title?: RegExp }> = [
    { category: 'Social Media', domain: /facebook\.com|twitter\.com|instagram\.com|linkedin\.com|pinterest\.com|tiktok\.com|reddit\.com/i },
    { category: 'Video', domain: /youtube\.com|vimeo\.com|dailymotion\.com|twitch\.tv/i },
    { category: 'News', domain: /news|cnn\.com|bbc\.com|nytimes\.com|wsj\.com|reuters\.com|bloomberg\.com/i },
    { category: 'Educational', domain: /\.edu$|wikipedia\.org|khan|course|learn|study|academic/i },
    {
      category: 'Documentation',
      domain: /docs|documentation|developer|github\.com|gitlab\.com|bitbucket\.org|stackoverflow\.com/i,
      title: /docs|documentation|api|reference|manual/i
    },
    { category: 'Shopping', domain: /amazon\.com|ebay\.com|etsy\.com|walmart\.com|shop|store|buy/i }
  ];

  // Cache for search results (key: query string + filters, value: results)
  private searchCache: Map<string, CacheEntry> = new Map();
  // Cache expiration time in milliseconds (5 minutes)
  private cacheTTL: number = 5 * 60 * 1000;
  private customSearch;
  private searchEngineId: string;

  /**
   * @throws Error when GOOGLE_API_KEY or GOOGLE_SEARCH_ENGINE_ID is not set
   */
  constructor() {
    const apiKey = process.env.GOOGLE_API_KEY;
    const searchEngineId = process.env.GOOGLE_SEARCH_ENGINE_ID;

    if (!apiKey || !searchEngineId) {
      throw new Error('Missing required environment variables: GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID');
    }

    // Initialize Google Custom Search API
    this.customSearch = google.customsearch('v1').cse;
    this.searchEngineId = searchEngineId;

    // Set up the API client (note: this configures googleapis globally)
    google.options({
      auth: apiKey
    });
  }

  /**
   * Generate a cache key from search parameters
   */
  private generateCacheKey(query: string, numResults: number, filters?: SearchFilters): string {
    return JSON.stringify({
      query,
      numResults,
      filters
    });
  }

  /**
   * Check if a cache entry is still valid
   */
  private isCacheValid(entry: CacheEntry): boolean {
    return Date.now() - entry.timestamp < this.cacheTTL;
  }

  /**
   * Store search results in cache, evicting the oldest entries once the cache
   * holds more than MAX_CACHE_ENTRIES searches.
   */
  private cacheSearchResults(
    cacheKey: string,
    results: SearchResult[],
    pagination?: SearchPaginationInfo,
    categories?: CategoryInfo[]
  ): void {
    // Delete before set so a refreshed key moves to the end of the Map's
    // insertion order; the first key is then always the oldest entry.
    this.searchCache.delete(cacheKey);
    this.searchCache.set(cacheKey, {
      timestamp: Date.now(),
      data: { results, pagination, categories }
    });

    while (this.searchCache.size > GoogleSearchService.MAX_CACHE_ENTRIES) {
      const oldestKey = this.searchCache.keys().next().value;
      if (oldestKey === undefined) {
        break;
      }
      this.searchCache.delete(oldestKey);
    }
  }

  /**
   * Normalises the requested page and page size: both become integers, the
   * page is at least 1 and the page size is clamped to 1..MAX_RESULTS_PER_PAGE.
   * `filters.resultsPerPage` takes precedence over `numResults` when set.
   */
  private resolvePaging(numResults: number, filters?: SearchFilters): { page: number; resultsPerPage: number } {
    const requestedSize = filters?.resultsPerPage || numResults;
    // `|| 1` maps NaN and 0 to 1; Math.max handles negative values.
    const resultsPerPage = Math.min(
      Math.max(Math.floor(requestedSize) || 1, 1),
      GoogleSearchService.MAX_RESULTS_PER_PAGE
    );
    const page = Math.max(Math.floor(filters?.page ?? 1) || 1, 1);
    return { page, resultsPerPage };
  }

  /**
   * Builds the final query string: the user query followed by the site,
   * exact-terms and result-type (news/video) operators, in that order.
   */
  private buildQuery(query: string, filters?: SearchFilters): string {
    let formattedQuery = query;

    // Apply site filter if provided
    if (filters?.site) {
      formattedQuery += ` site:${filters.site}`;
    }

    // Apply exact terms if provided
    if (filters?.exactTerms) {
      formattedQuery += ` "${filters.exactTerms}"`;
    }

    // News and video have no dedicated API parameter here; they are
    // expressed through query operators instead.
    switch (filters?.resultType?.toLowerCase()) {
      case 'news':
        formattedQuery += ' source:news';
        break;
      case 'video':
      case 'videos':
        formattedQuery += ' filetype:video OR inurl:video OR inurl:watch';
        break;
    }

    return formattedQuery;
  }

  /**
   * Builds pagination metadata. Page counts are based on the results that can
   * actually be fetched (at most MAX_ACCESSIBLE_RESULTS), so `hasNextPage` is
   * never true for a page the API would refuse to serve.
   */
  private buildPagination(page: number, resultsPerPage: number, totalResults: number): SearchPaginationInfo {
    const reachable = Math.min(totalResults, GoogleSearchService.MAX_ACCESSIBLE_RESULTS);
    const totalPages = Math.ceil(reachable / resultsPerPage);
    return {
      currentPage: page,
      resultsPerPage,
      totalResults,
      totalPages,
      hasNextPage: page < totalPages,
      hasPreviousPage: page > 1
    };
  }

  /** Response used when a page has no items. */
  private emptyPage(page: number, resultsPerPage: number): CacheEntry['data'] {
    return {
      results: [],
      pagination: this.buildPagination(page, resultsPerPage, 0),
      categories: []
    };
  }

  /**
   * Runs a search against the configured custom search engine.
   * @param query The user's search terms
   * @param numResults Page size used when `filters.resultsPerPage` is not set (clamped to 1..10)
   * @param filters Optional site/language/date/type/sort filters and pagination
   * @returns Categorised results plus pagination and per-category counts
   * @throws Error prefixed with "Google Search API error:" when the request fails
   */
  async search(query: string, numResults: number = 5, filters?: SearchFilters): Promise<{
    results: SearchResult[];
    pagination?: SearchPaginationInfo;
    categories?: CategoryInfo[];
  }> {
    try {
      // Check cache first
      const cacheKey = this.generateCacheKey(query, numResults, filters);
      const cachedResult = this.searchCache.get(cacheKey);
      if (cachedResult) {
        if (this.isCacheValid(cachedResult)) {
          console.error('Using cached search results');
          return cachedResult.data;
        }
        // Drop the expired entry so it does not linger until size-based eviction.
        this.searchCache.delete(cacheKey);
      }

      const { page, resultsPerPage } = this.resolvePaging(numResults, filters);

      // Calculate start index for pagination (Google uses 1-based indexing)
      const startIndex = (page - 1) * resultsPerPage + 1;
      const remaining = GoogleSearchService.MAX_ACCESSIBLE_RESULTS - startIndex + 1;
      if (remaining <= 0) {
        // The API rejects requests past its result window; answer with an
        // empty page instead of issuing a call that is known to fail.
        return this.emptyPage(page, resultsPerPage);
      }

      const params: {
        cx: string;
        q: string;
        num: number;
        start: number;
        lr?: string;
        dateRestrict?: string;
        searchType?: string;
        sort?: string;
      } = {
        cx: this.searchEngineId,
        q: this.buildQuery(query, filters),
        // Shrink the last reachable page so start + num stays inside the window.
        num: Math.min(resultsPerPage, remaining),
        start: startIndex
      };

      // Apply language filter if provided
      if (filters?.language) {
        params.lr = `lang_${filters.language}`;
      }

      // Apply date restriction if provided
      if (filters?.dateRestrict) {
        params.dateRestrict = filters.dateRestrict;
      }

      // Image search has a dedicated API parameter
      const resultType = filters?.resultType?.toLowerCase();
      if (resultType === 'image' || resultType === 'images') {
        params.searchType = 'image';
      }

      // Relevance is Google's default order, so only date needs to be requested
      if (filters?.sort?.toLowerCase() === 'date') {
        params.sort = 'date';
      }

      const response = await this.customSearch.list(params);

      // If no items are found, return empty results with pagination info
      if (!response.data.items) {
        return this.emptyPage(page, resultsPerPage);
      }

      // Map the search results and categorize them
      const results = response.data.items.map(item => {
        const result: SearchResult = {
          title: item.title || '',
          link: item.link || '',
          snippet: item.snippet || '',
          pagemap: item.pagemap || {},
          datePublished: item.pagemap?.metatags?.[0]?.['article:published_time'] || '',
          source: 'google_search'
        };
        result.category = this.categorizeResult(result);
        return result;
      });

      // Generate category statistics
      const categories = this.generateCategoryStats(results);

      // Create pagination information
      const totalResults = Number.parseInt(response.data.searchInformation?.totalResults || '0', 10) || 0;
      const pagination = this.buildPagination(page, resultsPerPage, totalResults);

      // Cache the results before returning
      this.cacheSearchResults(cacheKey, results, pagination, categories);

      return {
        results,
        pagination,
        categories
      };
    } catch (error) {
      if (error instanceof Error) {
        throw new Error(`Google Search API error: ${error.message}`);
      }
      throw new Error('Unknown error during Google search');
    }
  }

  /**
   * Derives a fallback category from the registrable part of a domain, e.g.
   * "example.com" -> "Example" and "bbc.co.uk" -> "Bbc" (not "Co").
   * Returns 'Other' when no usable label exists (single-label hosts).
   */
  private defaultCategoryFromDomain(domain: string): string {
    const labels = domain.split('.');
    if (labels.length < 2) {
      return 'Other';
    }

    let index = labels.length - 2;
    const topLevel = labels[labels.length - 1];
    // Heuristic for two-part public suffixes such as co.uk / com.au: step one
    // label further left so the organisation name is used.
    if (
      labels.length >= 3 &&
      topLevel.length === 2 &&
      GoogleSearchService.SECOND_LEVEL_LABELS.test(labels[index])
    ) {
      index -= 1;
    }

    const label = labels[index];
    if (!label) {
      return 'Other';
    }
    return label.charAt(0).toUpperCase() + label.slice(1);
  }

  /**
   * Categorizes a search result based on its domain (and, for documentation,
   * its title).
   * @param result The search result to categorize
   * @returns The category name, or 'Other' when the link cannot be parsed
   */
  private categorizeResult(result: SearchResult): string {
    try {
      // Extract the domain from the URL
      const domain = new URL(result.link).hostname.replace(/^www\./, '');

      for (const rule of GoogleSearchService.CATEGORY_RULES) {
        if (rule.domain.test(domain) || (rule.title?.test(result.title) ?? false)) {
          return rule.category;
        }
      }

      // Default category based on domain
      return this.defaultCategoryFromDomain(domain);
    } catch {
      // If there's any error in categorization, return a default category
      return 'Other';
    }
  }

  /**
   * Generates category statistics from search results
   * @param results The search results to analyze
   * @returns Category names with their counts, most frequent first
   */
  private generateCategoryStats(results: SearchResult[]): CategoryInfo[] {
    // A Map keeps first-seen order and is safe for arbitrary category names
    // (no prototype keys, no integer-key reordering).
    const counts = new Map<string, number>();
    for (const result of results) {
      const category = result.category || 'Other';
      counts.set(category, (counts.get(category) ?? 0) + 1);
    }

    return Array.from(counts, ([name, count]) => ({ name, count }))
      .sort((a, b) => b.count - a.count); // Sort by count in descending order
  }
}
package/src/types.ts ADDED
@@ -0,0 +1,64 @@
1
/**
 * Optional refinements for a search request. Every field may be omitted.
 */
export interface SearchFilters {
  /** Restricts results to one site; appended to the query as a `site:` operator. */
  site?: string;
  /** Language code; sent to the API as `lr=lang_<language>`. */
  language?: string;
  /** Date restriction, passed through to the API's `dateRestrict` parameter unchanged. */
  dateRestrict?: string;
  /** Phrase that must appear verbatim; appended to the query in double quotes. */
  exactTerms?: string;
  /** Kind of result wanted; the search service recognises image(s), news and video(s), case-insensitively. */
  resultType?: string;
  /** 1-based page number; values that are not greater than zero fall back to page 1. */
  page?: number;
  /** Page size; the search service caps it at 10. */
  resultsPerPage?: number;
  /** Sort order; `date` requests date ordering, anything else keeps the default relevance order. */
  sort?: string;
}
11
+
12
/**
 * A single search hit as returned to callers.
 */
export interface SearchResult {
  /** Result title (empty string when the API supplied none). */
  title: string;
  /** Result URL (empty string when the API supplied none). */
  link: string;
  /** Short text excerpt for the result. */
  snippet: string;
  /** Raw `pagemap` object from the API response, or an empty object. */
  pagemap: Record<string, any>;
  /** Value of the `article:published_time` metatag when present, otherwise an empty string. */
  datePublished: string;
  /** Identifier of the backend that produced the result. */
  source: string;
  /** Heuristic category assigned by the search service. */
  category?: string;
}
21
+
22
/**
 * Number of results that fell into one category.
 */
export interface CategoryInfo {
  /** Category label. */
  name: string;
  /** How many results carry that label. */
  count: number;
}
26
+
27
/**
 * Paging metadata that accompanies a page of search results.
 */
export interface SearchPaginationInfo {
  /** 1-based number of the page that was returned. */
  currentPage: number;
  /** Total number of matches reported for the query, when known. */
  totalResults?: number;
  /** Page size used for the request. */
  resultsPerPage: number;
  /** Number of pages derived from the total and the page size, when known. */
  totalPages?: number;
  /** True when a later page exists. */
  hasNextPage: boolean;
  /** True when the current page is not the first one. */
  hasPreviousPage: boolean;
}
35
+
36
/**
 * Complete payload of a search call: the hits plus optional context.
 */
export interface SearchResponse {
  /** The hits for the requested page. */
  results: SearchResult[];
  /** The filters associated with the request, if any. */
  filters?: SearchFilters;
  /** Paging metadata for the returned page. */
  pagination?: SearchPaginationInfo;
  /** Per-category result counts. */
  categories?: CategoryInfo[];
}
42
+
43
/** Formats in which extracted page content can be returned. */
export type OutputFormat =
  | 'markdown'
  | 'html'
  | 'text';
44
+
45
/**
 * Result of extracting the main content of one web page.
 */
export interface WebpageContent {
  /** URL the content was requested from. */
  url: string;
  /** Page title, or an empty string when none was found. */
  title: string;
  /** Value of the page's `description` meta tag, or an empty string. */
  description: string;
  /** Extracted main content, rendered in `format`. */
  content: string;
  /** Format of the `content` field. */
  format: OutputFormat;
  /** Selected meta tags of the page, keyed by their name/property attribute. */
  meta_tags: Record<string, string>;
  /** Simple size figures for `content`. */
  stats: {
    /** Number of whitespace-separated words in `content`. */
    word_count: number;
    /** Length of `content` in characters. */
    approximate_chars: number;
  };
  /** Leading excerpt of `content`. */
  content_preview: {
    /** The first 500 characters of `content`, followed by '...' when it is longer. */
    first_500_chars: string;
  };
  /** Short summary built from the leading sentences of `content`. */
  summary?: string;
}
61
+
62
/**
 * Outcome of analysing several pages: each URL maps either to its extracted
 * content or to an object describing why extraction failed.
 */
export interface WebpageAnalysisResponse {
  [url: string]: WebpageContent | { error: string };
}