smart-image-scraper-mcp 2.5.2 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,213 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * 全网智能图片抓取 MCP 服务器 - 简化版
5
+ * 模仿主流 MCP 服务器的实现方式
6
+ *
7
+ * 设计原则:
8
+ * - 简洁:最小化基础设施代码
9
+ * - 无状态:每个请求独立处理
10
+ * - 可靠:简单的错误处理,避免资源泄漏
11
+ */
12
+
13
+ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
14
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
15
+ import {
16
+ CallToolRequestSchema,
17
+ ListToolsRequestSchema,
18
+ } from '@modelcontextprotocol/sdk/types.js';
19
+ import { createRequire } from 'module';
20
+
21
+ import { Orchestrator } from './services/orchestrator.js';
22
+ import config from './config/index.js';
23
+
24
// Read the package version from package.json. The file is loaded through a
// require() created for this ES module, so the JSON is parsed on load.
const require = createRequire(import.meta.url);
const packageJson = require('../package.json');

// Name and version handed to the SDK Server constructor.
const serverInfo = {
  name: 'smart-image-scraper',
  version: packageJson.version,
};

// Only the "tools" capability is declared; it carries no extra options.
const serverOptions = {
  capabilities: {
    tools: {},
  },
};

// Create the MCP server with a minimal configuration.
const server = new Server(serverInfo, serverOptions);
40
+
41
/**
 * Definition of the single `smart_scraper` tool exposed by this server.
 *
 * `description` is the natural-language guide shown to the calling model and
 * `inputSchema` is the JSON Schema for the tool arguments. Only `query` and
 * `mode` are required; every other property documents the default that the
 * call handler applies when the argument is missing.
 *
 * The preset list in the `targetSize` description is kept in sync with the
 * "preset size names" section of `description` (this is why `square_512` is
 * listed in both places). NOTE(review): confirm the preset names against the
 * resize implementation, which is not visible from this file.
 */
const SMART_SCRAPER_TOOL = {
  name: 'smart_scraper',
  description: `全网智能图片抓取工具 - 从 Bing/Google 搜索、验证、下载高质量图片。

【核心功能】
1. 搜索图片链接 (mode=link) - 返回验证过的图片URL列表
2. 下载图片 (mode=download) - 下载到本地,自动按质量排序优先高清
3. 尺寸统一 (targetSize) - 下载后自动裁剪/缩放到指定尺寸
4. 宽高比过滤 (aspect) - 横向/竖向/正方形

【参数选择指南】
- 用户要"找/搜索/查找图片" → mode="link"
- 用户要"下载/保存/获取图片" → mode="download"
- 用户要"高清/大图/壁纸" → size="large" 或 "wallpaper"
- 用户要"电脑壁纸/横屏/横向" → aspect="wide"
- 用户要"手机壁纸/竖屏/竖向" → aspect="tall"
- 用户要"统一尺寸/固定大小" → targetSize="1920x1080" 或预设名
- 用户要"多种类型图片" → query="猫,狗,鸟"(英文逗号分隔)

【预设尺寸名称】
- 电脑壁纸: desktop_1080p(1920x1080), desktop_2k(2560x1440), desktop_4k(3840x2160)
- 手机壁纸: mobile_hd(1080x1920), mobile_2k(1440x2560)
- 正方形: square_1080(1080x1080), square_512(512x512)
- 社交媒体: instagram(1080x1080), twitter(1200x675), facebook(1200x630)

【调用示例】
1. 搜索5张猫的图片: {"query":"可爱的猫","mode":"link","count":5}
2. 下载10张高清风景图: {"query":"风景","mode":"download","count":10,"size":"large"}
3. 下载电脑壁纸并统一为1080p: {"query":"风景","mode":"download","count":10,"aspect":"wide","targetSize":"desktop_1080p"}
4. 下载手机壁纸: {"query":"动漫","mode":"download","count":10,"aspect":"tall","targetSize":"mobile_hd"}
5. 批量下载多类图片: {"query":"猫,狗,兔子","mode":"download","count":5}`,
  inputSchema: {
    type: 'object',
    properties: {
      // Search keywords; several keywords are separated by ASCII commas.
      query: {
        type: 'string',
        description: '搜索关键词。批量搜索用英文逗号分隔,如 "猫,狗,鸟"。建议使用具体描述性词语如"可爱的橘猫"而非"猫"',
      },
      // "link" returns validated URLs only; "download" saves files locally.
      mode: {
        type: 'string',
        enum: ['link', 'download'],
        description: "运行模式。link=仅返回验证过的图片URL列表(用户只需要链接时使用);download=下载图片到本地文件系统(用户说下载/保存时使用)",
      },
      // Images per keyword. The bounds mirror the 1-100 range stated in the
      // description, which is also the range the call handler clamps to.
      count: {
        type: 'number',
        description: '每个关键词获取的图片数量。范围1-100,推荐1-20。用户说"几张"用5-10,说"很多"用20-30',
        default: 10,
        minimum: 1,
        maximum: 100,
      },
      // Search engine used to find images.
      source: {
        type: 'string',
        enum: ['bing', 'google'],
        description: '搜索引擎。bing更稳定推荐优先使用,google结果可能更丰富但可能被限制',
        default: 'bing',
      },
      // Size class filter.
      size: {
        type: 'string',
        enum: ['all', 'small', 'medium', 'large', 'wallpaper'],
        description: '图片尺寸。all=不限;small=小图/图标;medium=中图;large=大图/高清;wallpaper=壁纸级别(1080p+)',
        default: 'all',
      },
      // Aspect-ratio filter.
      aspect: {
        type: 'string',
        enum: ['all', 'wide', 'tall', 'square'],
        description: '图片宽高比。all=不限;wide=横向/宽屏(电脑壁纸);tall=纵向/竖屏(手机壁纸);square=正方形',
        default: 'all',
      },
      // Optional post-download resize target: "WIDTHxHEIGHT" or a preset name.
      targetSize: {
        type: 'string',
        description: '目标尺寸,下载后统一裁剪/缩放到此尺寸。格式: "宽x高"(如"1920x1080")或预设名(desktop_1080p/desktop_2k/desktop_4k/mobile_hd/mobile_2k/square_1080/square_512/instagram/twitter/facebook)',
      },
      // How the image is fitted into targetSize.
      fit: {
        type: 'string',
        enum: ['cover', 'contain', 'fill'],
        description: '尺寸处理时的适应模式。cover=裁剪填充(默认,不留白);contain=包含留白;fill=拉伸填充',
        default: 'cover',
      },
      // Safe-search level forwarded with the search.
      safeSearch: {
        type: 'string',
        enum: ['off', 'moderate', 'strict'],
        description: '安全搜索。off=关闭;moderate=中等过滤(默认);strict=严格过滤(儿童/家庭内容)',
        default: 'moderate',
      },
    },
    required: ['query', 'mode'],
  },
};
128
+
129
// Tool listing: the server exposes exactly one tool, so the handler returns
// the static definition and does no other work.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  return { tools: [SMART_SCRAPER_TOOL] };
});
133
+
134
/**
 * Return `value` when it is one of the `allowed` choices, else `fallback`.
 *
 * @param {unknown} value - Raw argument supplied by the caller.
 * @param {string[]} allowed - Accepted values, as declared in the tool schema.
 * @param {string} fallback - Default used for missing or unknown values.
 * @returns {string} A value guaranteed to be in `allowed` (or the fallback).
 */
function pickEnum(value, allowed, fallback) {
  return allowed.includes(value) ? value : fallback;
}

/**
 * Build an MCP tool result that reports a failure as a single text item.
 *
 * @param {string} text - Message shown to the caller.
 * @returns {{content: Array<{type: string, text: string}>, isError: boolean}}
 */
function errorResult(text) {
  return { content: [{ type: 'text', text }], isError: true };
}

// Tool invocation. A new Orchestrator is created for every request so that no
// state is shared between calls.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: args } = request.params;

  if (name !== 'smart_scraper') {
    return errorResult(`未知工具: ${name}`);
  }

  // Fail fast on the two required arguments.
  if (typeof args?.query !== 'string' || !args.query.trim()) {
    return errorResult('错误: 请提供有效的搜索关键词 (query)');
  }

  if (!['link', 'download'].includes(args.mode)) {
    return errorResult("错误: 请指定有效的运行模式 (mode): 'link' 或 'download'");
  }

  try {
    const orchestrator = new Orchestrator();

    // Normalize the arguments. Every enum-typed option is checked against the
    // values declared in the tool schema and replaced by its default when it
    // is missing or unknown, the same way `source` has always been handled.
    const params = {
      query: args.query.trim(),
      mode: args.mode,
      // Non-numeric input falls back to 10; the result is clamped to 1..100.
      count: Math.min(Math.max(Number.parseInt(args.count, 10) || 10, 1), 100),
      source: pickEnum(args.source, ['bing', 'google'], 'bing'),
      size: pickEnum(args.size, ['all', 'small', 'medium', 'large', 'wallpaper'], 'all'),
      aspect: pickEnum(args.aspect, ['all', 'wide', 'tall', 'square'], 'all'),
      targetSize: args.targetSize || null,
      fit: pickEnum(args.fit, ['cover', 'contain', 'fill'], 'cover'),
      safeSearch: pickEnum(args.safeSearch, ['off', 'moderate', 'strict'], 'moderate'),
    };

    // Run the task, then render the result as text for the client.
    const result = await orchestrator.execute(params);
    const formattedResult = orchestrator.formatResult(result);

    return {
      content: [{ type: 'text', text: formattedResult }],
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error; avoid printing
    // "undefined" for strings or other non-Error values.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[MCP Error] ${message}`);
    return errorResult(`## ❌ 执行错误\n\n**错误信息**: ${message}\n\n请检查网络连接或稍后重试。`);
  }
});
198
+
199
/**
 * Start the server: log the version and save root, then connect the MCP
 * server to a stdio transport.
 *
 * All status lines are written with console.error (stderr), presumably so
 * that stdout stays free for the stdio transport's traffic.
 *
 * @returns {Promise<void>} Resolves once `server.connect` has completed.
 */
async function main() {
  const { version } = packageJson;
  console.error(`[MCP] Starting Smart Image Scraper v${version}`);
  console.error(`[MCP] Save root: ${config.SAVE_ROOT}`);

  await server.connect(new StdioServerTransport());

  console.error('[MCP] Server is running');
}
209
+
210
// Entry point. A failure during startup is reported on stderr and the process
// exits with status 1.
function exitOnStartupFailure(error) {
  console.error(`[MCP] Startup error: ${error.message}`);
  process.exit(1);
}

main().catch(exitOnStartupFailure);
@@ -201,6 +201,7 @@ export class SearchCache extends LRUCache {
201
201
  keyword: keyword.toLowerCase().trim(),
202
202
  source,
203
203
  size: options.size || 'all',
204
+ aspect: options.aspect || 'all',
204
205
  safeSearch: options.safeSearch || 'moderate',
205
206
  });
206
207
  }
@@ -6,6 +6,7 @@
6
6
  import logger from './logger.js';
7
7
  import { metrics } from './metrics.js';
8
8
  import { searchCache, validationCache } from './cache.js';
9
+ import { destroyAgents } from './httpClient.js';
9
10
 
10
11
  /**
11
12
  * 优雅关闭管理器
@@ -128,6 +129,9 @@ export class GracefulShutdown {
128
129
  searchCache.clear();
129
130
  validationCache.clear();
130
131
 
132
+ // 销毁 HTTP 连接池
133
+ destroyAgents();
134
+
131
135
  logger.info('Graceful shutdown complete');
132
136
  } catch (error) {
133
137
  logger.error('Error during shutdown', { error: error.message });
@@ -10,24 +10,34 @@ import config from '../config/index.js';
10
10
  import logger from './logger.js';
11
11
 
12
12
  // HTTP 连接池配置 - 复用 TCP 连接,大幅提升性能
13
- // 注意:不设置 timeout,让 axios 控制超时,避免连接被过早关闭
13
+ // 添加 timeout 确保空闲连接被及时释放,避免连接池耗尽
14
14
  const httpAgent = new http.Agent({
15
15
  keepAlive: true, // 启用 Keep-Alive
16
16
  keepAliveMsecs: 1000, // Keep-Alive 探测间隔
17
- maxSockets: 100, // 增加最大并发连接数
18
- maxFreeSockets: 20, // 增加最大空闲连接数
17
+ maxSockets: 50, // 降低最大并发连接数,避免资源耗尽
18
+ maxFreeSockets: 10, // 降低最大空闲连接数
19
19
  scheduling: 'lifo', // 后进先出,优先使用最近的连接
20
+ timeout: 30000, // 空闲连接30秒后关闭
20
21
  });
21
22
 
22
23
  const httpsAgent = new https.Agent({
23
24
  keepAlive: true,
24
25
  keepAliveMsecs: 1000,
25
- maxSockets: 100,
26
- maxFreeSockets: 20,
26
+ maxSockets: 50, // 降低最大并发连接数
27
+ maxFreeSockets: 10, // 降低最大空闲连接数
27
28
  scheduling: 'lifo',
29
+ timeout: 30000, // 空闲连接30秒后关闭
28
30
  rejectUnauthorized: false, // 允许自签名证书
29
31
  });
30
32
 
33
/**
 * Destroy the shared HTTP and HTTPS agents.
 *
 * destroy() tears down every connection an agent holds, including ones that
 * are still in use, so it is no longer called during normal operation; idle
 * connections are left to the agents' `timeout` setting instead. This export
 * exists for manual cleanup, e.g. when the process is shutting down.
 */
export function destroyAgents() {
  for (const agent of [httpAgent, httpsAgent]) {
    agent.destroy();
  }
}
40
+
31
41
  const httpClient = axios.create({
32
42
  timeout: config.REQUEST_TIMEOUT,
33
43
  httpAgent, // ✅ 使用连接池
@@ -126,6 +126,27 @@ export class FileManager {
126
126
  async downloadOne(url, keyword) {
127
127
  let filePath = null;
128
128
  let writer = null;
129
+ let response = null;
130
+ let downloadTimeout = null;
131
+ let resolved = false;
132
+
133
+ // 辅助函数:安全地销毁流
134
+ const safeDestroy = () => {
135
+ try {
136
+ if (response?.data && !response.data.destroyed) {
137
+ response.data.destroy();
138
+ }
139
+ } catch (e) { /* ignore */ }
140
+ try {
141
+ if (writer && !writer.destroyed) {
142
+ writer.destroy();
143
+ }
144
+ } catch (e) { /* ignore */ }
145
+ if (downloadTimeout) {
146
+ clearTimeout(downloadTimeout);
147
+ downloadTimeout = null;
148
+ }
149
+ };
129
150
 
130
151
  try {
131
152
  // 验证 URL 格式
@@ -139,28 +160,28 @@ export class FileManager {
139
160
  return { success: false, url, error: 'Malformed URL' };
140
161
  }
141
162
 
142
- const response = await httpClient.get(url, {
163
+ response = await httpClient.get(url, {
143
164
  responseType: 'stream',
144
- timeout: 30000, // 下载超时30
165
+ timeout: 20000, // 连接超时20
145
166
  maxContentLength: 50 * 1024 * 1024, // 最大50MB
146
167
  maxBodyLength: 50 * 1024 * 1024,
147
168
  });
148
169
 
149
170
  if (response.status !== 200) {
171
+ safeDestroy();
150
172
  return { success: false, url, error: `HTTP ${response.status}` };
151
173
  }
152
174
 
153
175
  const contentType = response.headers['content-type'] || '';
154
176
  if (!contentType.includes('image')) {
155
- // 销毁响应流
156
- response.data.destroy();
177
+ safeDestroy();
157
178
  return { success: false, url, error: 'Not an image' };
158
179
  }
159
180
 
160
181
  // 检查文件大小(如果有 content-length)
161
182
  const contentLength = parseInt(response.headers['content-length'] || '0', 10);
162
183
  if (contentLength > 50 * 1024 * 1024) {
163
- response.data.destroy();
184
+ safeDestroy();
164
185
  return { success: false, url, error: 'File too large (>50MB)' };
165
186
  }
166
187
 
@@ -185,38 +206,60 @@ export class FileManager {
185
206
  let downloadedBytes = 0;
186
207
  const maxBytes = 50 * 1024 * 1024;
187
208
 
209
+ // 下载超时保护(30秒)
210
+ downloadTimeout = setTimeout(() => {
211
+ if (!resolved) {
212
+ resolved = true;
213
+ logger.warn(`Download timeout: ${url}`);
214
+ safeDestroy();
215
+ this._cleanupFile(filePath);
216
+ resolve({ success: false, url, error: 'Download timeout' });
217
+ }
218
+ }, 30000);
219
+
188
220
  response.data.on('data', (chunk) => {
189
221
  downloadedBytes += chunk.length;
190
- if (downloadedBytes > maxBytes) {
191
- response.data.destroy();
192
- writer.destroy();
193
- // 清理临时文件
222
+ if (downloadedBytes > maxBytes && !resolved) {
223
+ resolved = true;
224
+ safeDestroy();
194
225
  this._cleanupFile(filePath);
195
226
  resolve({ success: false, url, error: 'File too large during download' });
196
227
  }
197
228
  });
198
229
 
199
230
  response.data.on('error', (error) => {
200
- writer.destroy();
201
- this._cleanupFile(filePath);
202
- resolve({ success: false, url, error: `Stream error: ${error.message}` });
231
+ if (!resolved) {
232
+ resolved = true;
233
+ safeDestroy();
234
+ this._cleanupFile(filePath);
235
+ resolve({ success: false, url, error: `Stream error: ${error.message}` });
236
+ }
203
237
  });
204
238
 
205
239
  response.data.pipe(writer);
206
240
 
207
241
  writer.on('finish', () => {
208
- logger.debug(`Downloaded: ${filePath}`);
209
- resolve({ success: true, url, path: filePath });
242
+ if (!resolved) {
243
+ resolved = true;
244
+ clearTimeout(downloadTimeout);
245
+ logger.debug(`Downloaded: ${filePath}`);
246
+ resolve({ success: true, url, path: filePath });
247
+ }
210
248
  });
211
249
 
212
250
  writer.on('error', (error) => {
213
- logger.warn(`Write error: ${filePath}`, { error: error.message });
214
- this._cleanupFile(filePath);
215
- resolve({ success: false, url, error: error.message });
251
+ if (!resolved) {
252
+ resolved = true;
253
+ logger.warn(`Write error: ${filePath}`, { error: error.message });
254
+ safeDestroy();
255
+ this._cleanupFile(filePath);
256
+ resolve({ success: false, url, error: error.message });
257
+ }
216
258
  });
217
259
  });
218
260
  } catch (error) {
219
- // 清理可能创建的临时文件
261
+ // 清理可能创建的临时文件和流
262
+ safeDestroy();
220
263
  if (filePath) {
221
264
  this._cleanupFile(filePath);
222
265
  }
@@ -8,8 +8,8 @@ import httpClient from '../infrastructure/httpClient.js';
8
8
  import logger from '../infrastructure/logger.js';
9
9
  import config from '../config/index.js';
10
10
 
11
- // 极速并发验证配置
12
- const MAX_VALIDATE_CONCURRENCY = 30; // 极速验证并发 30
11
+ // 并发验证配置 - 降低并发避免连接池耗尽
12
+ const MAX_VALIDATE_CONCURRENCY = 15; // 验证并发 15
13
13
  const globalValidateLimit = pLimit(MAX_VALIDATE_CONCURRENCY);
14
14
 
15
15
  export class LinkValidator {
@@ -45,11 +45,11 @@ export class LinkValidator {
45
45
  }
46
46
 
47
47
  const controller = new AbortController();
48
- const timeoutId = setTimeout(() => controller.abort(), 2000); // 极速超时 2
48
+ const timeoutId = setTimeout(() => controller.abort(), 3000); // 超时 3
49
49
 
50
50
  try {
51
51
  const response = await httpClient.head(url, {
52
- timeout: 1500, // 极速超时 1.5 秒
52
+ timeout: 2500, // 超时 2.5 秒
53
53
  maxRedirects: 1, // 最多 1 次重定向
54
54
  signal: controller.signal,
55
55
  });
@@ -68,6 +68,10 @@ export class LinkValidator {
68
68
  return { url, valid: false, error: `status=${response.status}` };
69
69
  } catch (error) {
70
70
  clearTimeout(timeoutId);
71
+ // 确保 abort controller 被清理
72
+ if (!controller.signal.aborted) {
73
+ controller.abort();
74
+ }
71
75
  return { url, valid: false, error: 'timeout' };
72
76
  }
73
77
  }
@@ -178,12 +182,12 @@ export class LinkValidator {
178
182
  */
179
183
  async _validateWithGet(url, fetchQuality = false) {
180
184
  const controller = new AbortController();
181
- const timeoutId = setTimeout(() => controller.abort(), 8000);
185
+ const timeoutId = setTimeout(() => controller.abort(), 5000);
182
186
 
183
187
  try {
184
188
  const response = await httpClient.get(url, {
185
- timeout: 5000,
186
- maxRedirects: 3,
189
+ timeout: 4000,
190
+ maxRedirects: 2,
187
191
  responseType: 'arraybuffer',
188
192
  signal: controller.signal,
189
193
  headers: {
@@ -205,7 +209,10 @@ export class LinkValidator {
205
209
  return { url, valid: false, error: `GET validation failed: status=${response.status}` };
206
210
  } catch (error) {
207
211
  clearTimeout(timeoutId);
208
-
212
+ // 确保 abort controller 被清理
213
+ if (!controller.signal.aborted) {
214
+ controller.abort();
215
+ }
209
216
  if (error.name === 'AbortError' || error.code === 'ERR_CANCELED') {
210
217
  return { url, valid: false, error: 'Request timeout' };
211
218
  }