pse-mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/GEMINI.md +72 -0
- package/License.md +3 -0
- package/MCP Documents/README.md +1 -0
- package/MCP Documents/mcp-client-guide.txt +736 -0
- package/MCP Documents/mcp-complete-guide.txt +522 -0
- package/MCP Documents/mcp-enhanced-instructions.md +297 -0
- package/MCP Documents/mcp-server-guide.md +415 -0
- package/MCP Documents/mcp-windows.txt +161 -0
- package/QWEN.md +207 -0
- package/README.md +220 -0
- package/dist/content-fetcher.js +36 -0
- package/dist/google-search.js +421 -0
- package/dist/services/content-extractor.service.js +195 -0
- package/dist/services/google-search.service.js +244 -0
- package/dist/types.js +1 -0
- package/dist-package/README.md +210 -0
- package/dist-package/dist/content-fetcher.js +36 -0
- package/dist-package/dist/google-search.js +420 -0
- package/dist-package/dist/services/content-extractor.service.js +195 -0
- package/dist-package/dist/services/google-search.service.js +244 -0
- package/dist-package/dist/types.js +1 -0
- package/dist-package/package-lock.json +3104 -0
- package/dist-package/package.json +23 -0
- package/license +4 -0
- package/package.json +40 -0
- package/src/google-search.ts +477 -0
- package/src/mcp.d.ts +36 -0
- package/src/services/content-extractor.service.ts +232 -0
- package/src/services/google-search.service.ts +305 -0
- package/src/types.ts +64 -0
- package/tasks.md +141 -0
- package/tsconfig.json +16 -0
@@ -0,0 +1,232 @@
|
|
1
|
+
import axios from 'axios';
|
2
|
+
import * as cheerio from 'cheerio';
|
3
|
+
import { Readability } from '@mozilla/readability';
|
4
|
+
import { JSDOM } from 'jsdom';
|
5
|
+
import MarkdownIt from 'markdown-it';
|
6
|
+
import { WebpageContent, OutputFormat } from '../types.js';
|
7
|
+
import TurndownService from 'turndown';
|
8
|
+
|
9
|
+
/**
 * One cached extraction result together with the time it was stored.
 */
interface ContentCacheEntry {
  /** The extracted page that was cached. */
  content: WebpageContent;
  /** `Date.now()` value (milliseconds) captured when the entry was stored. */
  timestamp: number;
}
|
13
|
+
|
14
|
+
/**
 * Downloads web pages and reduces them to their main readable content.
 *
 * A page is fetched with axios, its metadata is read with Cheerio, the main
 * article is isolated with Readability (running on a JSDOM document) and the
 * result is returned as Markdown, HTML or plain text. Successful extractions
 * are kept in an in-memory cache for 30 minutes (at most 50 entries).
 */
export class ContentExtractor {
  /** Upper bound on the number of cached pages. */
  private static readonly MAX_CACHE_ENTRIES = 50;

  // Markdown renderer created in the constructor; no method of this class uses it.
  private md: MarkdownIt;
  private turndownService: TurndownService;
  // Cache for webpage content (key: url + format, value: content)
  private contentCache: Map<string, ContentCacheEntry> = new Map();
  // Cache expiration time in milliseconds (30 minutes)
  private cacheTTL: number = 30 * 60 * 1000;

  constructor() {
    this.md = new MarkdownIt();
    this.turndownService = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced'
    });
  }

  /**
   * Collapses runs of blank lines to a single blank line, collapses runs of
   * spaces to one space and trims the result.
   */
  private cleanText(text: string): string {
    // Remove multiple blank lines
    text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
    // Remove excessive spaces
    text = text.replace(/ +/g, ' ');
    return text.trim();
  }

  /**
   * Applies {@link cleanText} and makes sure ATX heading markers are followed
   * by a space.
   *
   * Only `#` runs at the start of a line are treated as heading markers, so
   * URL fragments (`https://host/#section`) and inline references (`#123`)
   * are left untouched.
   */
  private cleanMarkdown(text: string): string {
    const cleanedText = this.cleanText(text);
    return cleanedText.replace(/^(#{1,6})(?=[A-Za-z0-9])/gm, '$1 ');
  }

  /** Converts an HTML fragment to cleaned-up Markdown. */
  private htmlToMarkdown(html: string): string {
    return this.cleanMarkdown(this.turndownService.turndown(html));
  }

  /** Converts an HTML fragment to cleaned-up plain text. */
  private htmlToPlainText(html: string): string {
    const dom = new JSDOM(html);
    try {
      return this.cleanText(dom.window.document.body.textContent || '');
    } finally {
      // Release the resources held by the JSDOM window.
      dom.window.close();
    }
  }

  /**
   * Returns true when `url` parses as an absolute http(s) URL.
   *
   * Other schemes (`file:`, `ftp:`, `javascript:` ...) are rejected because
   * this class only fetches web pages.
   */
  private isValidUrl(url: string): boolean {
    try {
      const { protocol } = new URL(url);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /**
   * Generate a cache key from URL and format
   */
  private generateCacheKey(url: string, format: OutputFormat): string {
    return `${url}|${format}`;
  }

  /**
   * Check if a cache entry is still valid
   */
  private isCacheValid(entry: ContentCacheEntry): boolean {
    return Date.now() - entry.timestamp < this.cacheTTL;
  }

  /**
   * Store webpage content in cache, evicting the oldest entry once the cache
   * grows beyond its maximum size.
   */
  private cacheContent(url: string, format: OutputFormat, content: WebpageContent): void {
    this.contentCache.set(this.generateCacheKey(url, format), {
      timestamp: Date.now(),
      content
    });

    if (this.contentCache.size <= ContentExtractor.MAX_CACHE_ENTRIES) {
      return;
    }

    // Find the entry with the smallest timestamp (first one wins on ties).
    let oldestKey: string | undefined;
    let oldestTimestamp = Infinity;
    for (const [key, entry] of this.contentCache) {
      if (entry.timestamp < oldestTimestamp) {
        oldestTimestamp = entry.timestamp;
        oldestKey = key;
      }
    }
    if (oldestKey !== undefined) {
      this.contentCache.delete(oldestKey);
    }
  }

  /**
   * Generates a concise summary of the content
   * @param content The content to summarize
   * @param maxLength Maximum length of the summary (excluding the ellipsis)
   * @returns The leading sentences that fit into `maxLength`, followed by
   *   `...` when part of the content was left out. When even the first
   *   sentence is too long, the content is cut at `maxLength` characters.
   */
  private generateSummary(content: string, maxLength: number = 300): string {
    const text = content.trim();
    if (text.length === 0) {
      return '';
    }

    // Split after sentence-ending punctuation that is followed by whitespace.
    const sentences = text.split(/(?<=[.!?])\s+/);
    const picked: string[] = [];
    let usedLength = 0;

    for (const sentence of sentences) {
      const separator = picked.length > 0 ? 1 : 0;
      const nextLength = usedLength + separator + sentence.length;
      if (nextLength > maxLength) {
        break;
      }
      picked.push(sentence);
      usedLength = nextLength;
    }

    if (picked.length === sentences.length) {
      // Everything fitted: no ellipsis.
      return picked.join(' ');
    }
    if (picked.length === 0) {
      // The very first sentence is too long: fall back to a hard cut.
      return text.slice(0, Math.max(0, maxLength)).trimEnd() + '...';
    }
    return picked.join(' ') + '...';
  }

  /**
   * Runs Readability on `html`.
   *
   * The page URL is handed to JSDOM so that relative links inside the
   * extracted article can be resolved against it.
   */
  private parseArticle(html: string, url: string) {
    const dom = new JSDOM(html, { url });
    try {
      return new Readability(dom.window.document).parse();
    } finally {
      dom.window.close();
    }
  }

  /**
   * Fetches `url` and extracts its main content.
   *
   * @param url Absolute http(s) URL of the page.
   * @param format Output format of the `content` field (default `markdown`).
   * @returns The extracted content with metadata, statistics, a preview and
   *   a short summary. Results are served from the cache when a fresh entry
   *   for the same URL and format exists.
   * @throws Error `Invalid URL provided` when the URL is rejected,
   *   `Failed to fetch webpage: ...` on axios errors and
   *   `Failed to extract content from webpage` when Readability finds nothing.
   */
  async extractContent(url: string, format: OutputFormat = 'markdown'): Promise<WebpageContent> {
    if (!this.isValidUrl(url)) {
      throw new Error('Invalid URL provided');
    }

    // Check cache first
    const cacheKey = this.generateCacheKey(url, format);
    const cachedContent = this.contentCache.get(cacheKey);
    if (cachedContent) {
      if (this.isCacheValid(cachedContent)) {
        console.error(`Using cached content for ${url}`);
        return cachedContent.content;
      }
      // Expired: drop it so it does not occupy a cache slot.
      this.contentCache.delete(cacheKey);
    }

    try {
      // Fetch webpage content
      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        },
        timeout: 10000,
        // Ask for the raw text so a JSON body is not handed over as an object.
        responseType: 'text'
      });
      const html: string = typeof response.data === 'string' ? response.data : String(response.data ?? '');

      // Parse with Cheerio for metadata
      const $ = cheerio.load(html);
      const metaTags: Record<string, string> = {};

      // Only extract the most important meta tags to reduce data volume
      const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];

      $('meta').each((_, element) => {
        const name = $(element).attr('name') || $(element).attr('property') || '';
        const content = $(element).attr('content') || '';
        if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
          metaTags[name] = content;
        }
      });

      // Use Readability for main content extraction
      const article = this.parseArticle(html, url);
      if (!article) {
        throw new Error('Failed to extract content from webpage');
      }

      // Convert content based on requested format
      const articleHtml = article.content || '';
      let contentStr: string;
      switch (format) {
        case 'html':
          contentStr = articleHtml;
          break;
        case 'text':
          contentStr = this.htmlToPlainText(articleHtml);
          break;
        case 'markdown':
        default:
          contentStr = this.htmlToMarkdown(articleHtml);
          break;
      }

      // Calculate content stats
      const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;

      // Only the first <title> counts; later ones (e.g. inside inline SVG)
      // would otherwise be concatenated into the page title.
      const pageTitle = ($('title').first().text() as string).trim();

      const content: WebpageContent = {
        url,
        title: pageTitle || article.title || '',
        description:
          metaTags['description'] ||
          metaTags['og:description'] ||
          metaTags['twitter:description'] ||
          '',
        content: contentStr,
        format: format,
        meta_tags: metaTags,
        stats: {
          word_count: wordCount,
          approximate_chars: contentStr.length
        },
        content_preview: {
          first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
        },
        summary: this.generateSummary(contentStr)
      };

      // Cache the content before returning
      this.cacheContent(url, format, content);

      return content;
    } catch (error) {
      if (axios.isAxiosError(error)) {
        throw new Error(`Failed to fetch webpage: ${error.message}`);
      }
      throw error;
    }
  }

  /**
   * Extracts several pages concurrently.
   *
   * @param urls URLs to process; duplicates are fetched only once.
   * @param format Output format applied to every page (default `markdown`).
   * @returns A record keyed by URL holding either the extracted content or
   *   `{ error }` with the failure message for that URL. This method itself
   *   does not reject because of a single failing page.
   */
  async batchExtractContent(urls: string[], format: OutputFormat = 'markdown'): Promise<Record<string, WebpageContent | { error: string }>> {
    const results: Record<string, WebpageContent | { error: string }> = {};
    // Results are keyed by URL, so repeated URLs need only one fetch.
    const uniqueUrls = [...new Set(urls)];

    await Promise.all(
      uniqueUrls.map(async (url) => {
        try {
          results[url] = await this.extractContent(url, format);
        } catch (error) {
          results[url] = {
            error: error instanceof Error ? error.message : 'Unknown error occurred'
          };
        }
      })
    );

    return results;
  }
}
|
@@ -0,0 +1,305 @@
|
|
1
|
+
import { google } from 'googleapis';
|
2
|
+
import { SearchResult, SearchFilters, SearchPaginationInfo, CategoryInfo } from '../types.js';
|
3
|
+
import { URL } from 'url';
|
4
|
+
|
5
|
+
/**
 * One cached search response together with the time it was stored.
 */
interface CacheEntry {
  /** The cached payload, in the same shape that `search` returns. */
  data: {
    results: SearchResult[];
    categories?: CategoryInfo[];
    pagination?: SearchPaginationInfo;
  };
  /** `Date.now()` value (milliseconds) captured when the entry was stored. */
  timestamp: number;
}
|
13
|
+
|
14
|
+
/**
 * Thin wrapper around the Google Custom Search API (`cse.list`).
 *
 * Adds query filters, pagination bookkeeping, a coarse per-result category
 * and a small in-memory cache (5 minutes, at most 100 entries).
 */
export class GoogleSearchService {
  /** Upper bound on the number of cached searches. */
  private static readonly MAX_CACHE_ENTRIES = 100;
  /** Largest `num` value this service sends in one request. */
  private static readonly MAX_RESULTS_PER_REQUEST = 10;
  /** `numResults` default, reused when a non-numeric value is passed. */
  private static readonly DEFAULT_RESULTS = 5;
  /**
   * Second-level labels that belong to the public suffix under two-letter
   * country TLDs (e.g. `example.co.uk`); skipped when naming a site.
   */
  private static readonly SECOND_LEVEL_SUFFIXES = new Set(['co', 'com', 'org', 'net', 'gov', 'ac', 'edu']);

  // Cache for search results (key: query string + filters, value: results)
  private searchCache: Map<string, CacheEntry> = new Map();
  // Cache expiration time in milliseconds (5 minutes)
  private cacheTTL: number = 5 * 60 * 1000;
  private customSearch;
  private searchEngineId: string;

  /**
   * Reads `GOOGLE_API_KEY` and `GOOGLE_SEARCH_ENGINE_ID` from the environment.
   * @throws Error when either variable is missing.
   */
  constructor() {
    const apiKey = process.env.GOOGLE_API_KEY;
    const searchEngineId = process.env.GOOGLE_SEARCH_ENGINE_ID;

    if (!apiKey || !searchEngineId) {
      throw new Error('Missing required environment variables: GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID');
    }

    // Initialize Google Custom Search API
    this.customSearch = google.customsearch('v1').cse;
    this.searchEngineId = searchEngineId;

    // Set up the API client (the key is registered through google.options)
    google.options({
      auth: apiKey
    });
  }

  /**
   * Generate a cache key from search parameters
   */
  private generateCacheKey(query: string, numResults: number, filters?: SearchFilters): string {
    return JSON.stringify({
      query,
      numResults,
      filters
    });
  }

  /**
   * Check if a cache entry is still valid
   */
  private isCacheValid(entry: CacheEntry): boolean {
    return Date.now() - entry.timestamp < this.cacheTTL;
  }

  /**
   * Store search results in cache, evicting the oldest entry once the cache
   * grows beyond its maximum size.
   */
  private cacheSearchResults(
    cacheKey: string,
    results: SearchResult[],
    pagination?: SearchPaginationInfo,
    categories?: CategoryInfo[]
  ): void {
    this.searchCache.set(cacheKey, {
      timestamp: Date.now(),
      data: { results, pagination, categories }
    });

    if (this.searchCache.size <= GoogleSearchService.MAX_CACHE_ENTRIES) {
      return;
    }

    // Find the entry with the smallest timestamp (first one wins on ties).
    let oldestKey: string | undefined;
    let oldestTimestamp = Infinity;
    for (const [key, entry] of this.searchCache) {
      if (entry.timestamp < oldestTimestamp) {
        oldestTimestamp = entry.timestamp;
        oldestKey = key;
      }
    }
    if (oldestKey !== undefined) {
      this.searchCache.delete(oldestKey);
    }
  }

  /**
   * Turns a requested page size into an integer between 1 and
   * `MAX_RESULTS_PER_REQUEST`, so zero, negative or fractional values are
   * never sent to the API.
   */
  private clampResultsPerPage(requested: number): number {
    if (Number.isNaN(requested)) {
      return GoogleSearchService.DEFAULT_RESULTS;
    }
    const whole = Math.trunc(requested);
    return Math.min(Math.max(whole, 1), GoogleSearchService.MAX_RESULTS_PER_REQUEST);
  }

  /**
   * Builds the request parameters for `cse.list` from the query and filters.
   */
  private buildRequestParams(
    query: string,
    resultsPerPage: number,
    startIndex: number,
    filters?: SearchFilters
  ): any {
    let formattedQuery = query;

    // Apply site filter if provided
    if (filters?.site) {
      formattedQuery += ` site:${filters.site}`;
    }

    // Apply exact terms if provided
    if (filters?.exactTerms) {
      formattedQuery += ` "${filters.exactTerms}"`;
    }

    // Apply result type filter if provided
    let searchType: string | undefined;
    if (filters?.resultType) {
      switch (filters.resultType.toLowerCase()) {
        case 'image':
        case 'images':
          searchType = 'image';
          break;
        case 'news':
          // For news, we need to modify the query
          formattedQuery += ' source:news';
          break;
        case 'video':
        case 'videos':
          // For videos, we can use a more specific filter
          formattedQuery += ' filetype:video OR inurl:video OR inurl:watch';
          break;
      }
    }

    const params: any = {
      cx: this.searchEngineId,
      q: formattedQuery,
      num: resultsPerPage,
      start: startIndex
    };

    // Apply language filter if provided
    if (filters?.language) {
      params.lr = `lang_${filters.language}`;
    }

    // Apply date restriction if provided
    if (filters?.dateRestrict) {
      params.dateRestrict = filters.dateRestrict;
    }

    if (searchType) {
      params.searchType = searchType;
    }

    // Sort by date (most recent first) when requested; relevance is the
    // API default and needs no parameter.
    if (filters?.sort && filters.sort.toLowerCase() === 'date') {
      params.sort = 'date';
    }

    return params;
  }

  /**
   * Runs a search and returns categorized results with pagination info.
   *
   * @param query Search terms.
   * @param numResults Page size used when `filters.resultsPerPage` is not
   *   set (default 5); clamped to 1..10.
   * @param filters Optional query filters, paging and sorting.
   * @returns Results, pagination information and per-category counts.
   *   Non-empty responses are cached for 5 minutes per query/filter
   *   combination.
   * @throws Error `Google Search API error: ...` wrapping any failure.
   */
  async search(query: string, numResults: number = 5, filters?: SearchFilters): Promise<{
    results: SearchResult[];
    pagination?: SearchPaginationInfo;
    categories?: CategoryInfo[];
  }> {
    try {
      // Generate cache key
      const cacheKey = this.generateCacheKey(query, numResults, filters);

      // Check cache first
      const cachedResult = this.searchCache.get(cacheKey);
      if (cachedResult) {
        if (this.isCacheValid(cachedResult)) {
          console.error('Using cached search results');
          return cachedResult.data;
        }
        // Expired: drop it so it does not occupy a cache slot.
        this.searchCache.delete(cacheKey);
      }

      // Set default pagination values if not provided
      const page = filters?.page && filters.page > 0 ? Math.max(1, Math.floor(filters.page)) : 1;
      const resultsPerPage = this.clampResultsPerPage(
        filters?.resultsPerPage ? filters.resultsPerPage : numResults
      );

      // Calculate start index for pagination (Google uses 1-based indexing)
      const startIndex = (page - 1) * resultsPerPage + 1;

      const params = this.buildRequestParams(query, resultsPerPage, startIndex, filters);
      const response = await this.customSearch.list(params);

      // If no items are found, return empty results with pagination info
      if (!response.data.items) {
        return {
          results: [],
          pagination: {
            currentPage: page,
            resultsPerPage,
            totalResults: 0,
            totalPages: 0,
            hasNextPage: false,
            hasPreviousPage: page > 1
          },
          categories: []
        };
      }

      // Map the search results and categorize them
      const results = response.data.items.map(item => {
        const result: SearchResult = {
          title: item.title || '',
          link: item.link || '',
          snippet: item.snippet || '',
          pagemap: item.pagemap || {},
          datePublished: item.pagemap?.metatags?.[0]?.['article:published_time'] || '',
          source: 'google_search'
        };

        // Add category to the result
        result.category = this.categorizeResult(result);

        return result;
      });

      // Generate category statistics
      const categories = this.generateCategoryStats(results);

      // Create pagination information
      const totalResults = parseInt(response.data.searchInformation?.totalResults || '0', 10);
      const totalPages = Math.ceil(totalResults / resultsPerPage);

      const pagination: SearchPaginationInfo = {
        currentPage: page,
        resultsPerPage,
        totalResults,
        totalPages,
        hasNextPage: page < totalPages,
        hasPreviousPage: page > 1
      };

      // Cache the results before returning
      this.cacheSearchResults(cacheKey, results, pagination, categories);

      return {
        results,
        pagination,
        categories
      };
    } catch (error) {
      if (error instanceof Error) {
        throw new Error(`Google Search API error: ${error.message}`);
      }
      throw new Error('Unknown error during Google search');
    }
  }

  /**
   * Derives the fallback category from a host name: the capitalized
   * registrable label (`example.com` -> `Example`, `example.co.uk` ->
   * `Example`). Single-label hosts and IPv4 addresses yield `Other`.
   */
  private defaultCategoryFor(domain: string): string {
    if (/^\d{1,3}(\.\d{1,3}){3}$/.test(domain)) {
      return 'Other';
    }

    const labels = domain.split('.');
    if (labels.length < 2) {
      return 'Other';
    }

    let label = labels[labels.length - 2];
    const tld = labels[labels.length - 1];
    if (
      labels.length >= 3 &&
      tld.length === 2 &&
      GoogleSearchService.SECOND_LEVEL_SUFFIXES.has(label.toLowerCase())
    ) {
      // The second-to-last label is part of the suffix; use the one before it.
      label = labels[labels.length - 3];
    }

    if (!label) {
      return 'Other';
    }
    return label.charAt(0).toUpperCase() + label.slice(1);
  }

  /**
   * Categorizes a search result based on its content
   * @param result The search result to categorize
   * @returns The category name; `Other` when the link cannot be parsed
   */
  private categorizeResult(result: SearchResult): string {
    try {
      // Extract the domain from the URL
      const domain = new URL(result.link).hostname.replace(/^www\./, '');

      // Check if this is a social media site
      if (/facebook\.com|twitter\.com|instagram\.com|linkedin\.com|pinterest\.com|tiktok\.com|reddit\.com/i.test(domain)) {
        return 'Social Media';
      }

      // Check if this is a video site
      if (/youtube\.com|vimeo\.com|dailymotion\.com|twitch\.tv/i.test(domain)) {
        return 'Video';
      }

      // Check if this is a news site
      if (/news|cnn\.com|bbc\.com|nytimes\.com|wsj\.com|reuters\.com|bloomberg\.com/i.test(domain)) {
        return 'News';
      }

      // Check if this is an educational site
      if (/\.edu$|wikipedia\.org|khan|course|learn|study|academic/i.test(domain)) {
        return 'Educational';
      }

      // Check if this is a documentation site
      if (/docs|documentation|developer|github\.com|gitlab\.com|bitbucket\.org|stackoverflow\.com/i.test(domain) ||
          /docs|documentation|api|reference|manual/i.test(result.title)) {
        return 'Documentation';
      }

      // Check if this is a shopping site
      if (/amazon\.com|ebay\.com|etsy\.com|walmart\.com|shop|store|buy/i.test(domain)) {
        return 'Shopping';
      }

      // Default category based on domain
      return this.defaultCategoryFor(domain);
    } catch (error) {
      // If there's any error in categorization, return a default category
      return 'Other';
    }
  }

  /**
   * Generates category statistics from search results
   * @param results The search results to analyze
   * @returns Category names with their result counts, most frequent first;
   *   categories with equal counts keep first-seen order
   */
  private generateCategoryStats(results: SearchResult[]): CategoryInfo[] {
    // A Map keeps insertion order and is immune to Object.prototype keys.
    const categoryCounts = new Map<string, number>();

    for (const result of results) {
      const category = result.category || 'Other';
      categoryCounts.set(category, (categoryCounts.get(category) ?? 0) + 1);
    }

    return Array.from(categoryCounts, ([name, count]) => ({ name, count }))
      .sort((a, b) => b.count - a.count); // Sort by count in descending order
  }
}
|
package/src/types.ts
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
/**
 * Optional refinements for a search: query filters, paging and sorting.
 */
export interface SearchFilters {
  /** Restricts results to a site; appended to the query as `site:<value>`. */
  site?: string;
  /** Language code; sent to the API as `lr=lang_<value>`. */
  language?: string;
  /** Date restriction; passed through unchanged as the `dateRestrict` parameter. */
  dateRestrict?: string;
  /** Phrase appended to the query wrapped in double quotes. */
  exactTerms?: string;
  /**
   * Kind of results wanted. The search service recognises `image`/`images`,
   * `news` and `video`/`videos` (case-insensitive).
   */
  resultType?: string;
  /** 1-based page number; non-positive values fall back to page 1. */
  page?: number;
  /** Page size; the search service never requests more than 10 per page. */
  resultsPerPage?: number;
  /** `date` sorts by date; any other value keeps the default relevance order. */
  sort?: string;
}
|
11
|
+
|
12
|
+
/**
 * A single search hit as returned by the search service.
 */
export interface SearchResult {
  /** Result title; empty string when the API supplies none. */
  title: string;
  /** Result URL; empty string when the API supplies none. */
  link: string;
  /** Text snippet; empty string when the API supplies none. */
  snippet: string;
  /** Raw `pagemap` object from the API response; `{}` when absent. */
  pagemap: Record<string, any>;
  /**
   * Value of the `article:published_time` metatag of the first pagemap
   * metatag entry; empty string when absent.
   */
  datePublished: string;
  /** Origin of the result; the search service sets `google_search`. */
  source: string;
  /** Coarse category assigned by the search service (e.g. `News`, `Video`). */
  category?: string;
}
|
21
|
+
|
22
|
+
/**
 * Number of search results that fell into one category.
 */
export interface CategoryInfo {
  /** Category name. */
  name: string;
  /** How many results carry this category. */
  count: number;
}
|
26
|
+
|
27
|
+
/**
 * Paging state that accompanies a page of search results.
 */
export interface SearchPaginationInfo {
  /** 1-based number of the page these results belong to. */
  currentPage: number;
  /** Page size used for the request. */
  resultsPerPage: number;
  /** Total number of results as reported by the search API. */
  totalResults?: number;
  /** `totalResults` divided by `resultsPerPage`, rounded up. */
  totalPages?: number;
  /** True when `currentPage` is below `totalPages`. */
  hasNextPage: boolean;
  /** True when `currentPage` is greater than 1. */
  hasPreviousPage: boolean;
}
|
35
|
+
|
36
|
+
/**
 * Complete payload of a search: the hits plus optional context.
 */
export interface SearchResponse {
  /** The search hits. */
  results: SearchResult[];
  /** Paging state for `results`, when available. */
  pagination?: SearchPaginationInfo;
  /** Per-category result counts, when available. */
  categories?: CategoryInfo[];
  /** Filters associated with this response (presumably those that were applied). */
  filters?: SearchFilters;
}
|
42
|
+
|
43
|
+
/** Formats in which extracted page content can be returned. */
export type OutputFormat =
  | 'markdown'
  | 'html'
  | 'text';
|
44
|
+
|
45
|
+
/**
 * Result of extracting the main content of one web page.
 */
export interface WebpageContent {
  /** URL the content was fetched from. */
  url: string;
  /** Page title; empty string when none could be determined. */
  title: string;
  /** Page description taken from its meta tags; empty string when unavailable. */
  description: string;
  /** Extracted main content, encoded as indicated by `format`. */
  content: string;
  /** Encoding of `content`. */
  format: OutputFormat;
  /** Selected meta tags of the page, keyed by their `name`/`property`. */
  meta_tags: Record<string, string>;
  /** Size figures computed from `content`. */
  stats: {
    /** Number of whitespace-separated words. */
    word_count: number;
    /** Length of `content` in characters. */
    approximate_chars: number;
  };
  /** Short excerpt from the start of `content`. */
  content_preview: {
    /** First 500 characters, followed by `...` when `content` is longer. */
    first_500_chars: string;
  };
  /** Short summary built from the leading sentences of `content`. */
  summary?: string;
}
|
61
|
+
|
62
|
+
/**
 * Outcome of extracting several pages, keyed by URL: each entry is either
 * the extracted content or an object carrying the error message.
 */
export interface WebpageAnalysisResponse {
  [url: string]:
    | WebpageContent
    | { error: string };
}
|