smart-image-scraper-mcp 2.10.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "smart-image-scraper-mcp",
3
- "version": "2.10.0",
3
+ "version": "2.11.0",
4
4
  "description": "全网智能图片抓取 MCP 服务器 - 支持 Bing/Google 图片搜索、验证和下载",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -27,6 +27,19 @@
27
27
  "engines": {
28
28
  "node": ">=18.0.0"
29
29
  },
30
+ "files": [
31
+ "src/index.js",
32
+ "src/config/",
33
+ "src/infrastructure/",
34
+ "src/providers/",
35
+ "src/services/orchestrator.js",
36
+ "src/services/linkValidator.js",
37
+ "src/services/fileManager.js",
38
+ "src/services/imageProcessor.js",
39
+ "src/services/index.js",
40
+ "README.md",
41
+ "LICENSE"
42
+ ],
30
43
  "repository": {
31
44
  "type": "git",
32
45
  "url": ""
package/src/index.js CHANGED
@@ -175,10 +175,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
175
175
  };
176
176
  }
177
177
 
178
+ // MCP 层最外层超时保护(55秒硬限制)
179
+ const MCP_TIMEOUT = 55000;
180
+
181
+ // 主流做法:每个请求创建新的 Orchestrator 实例,确保无状态
182
+ const orchestrator = new Orchestrator();
183
+ let mcpTimeoutId;
184
+
178
185
  try {
179
- // 主流做法:每个请求创建新的 Orchestrator 实例,确保无状态
180
- const orchestrator = new Orchestrator();
181
-
182
186
  // 规范化参数
183
187
  const params = {
184
188
  query: args.query.trim(),
@@ -194,30 +198,44 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
194
198
  minFileSize: ['any', '50kb', '100kb', '200kb', '500kb', '1mb'].includes(args.minFileSize) ? args.minFileSize : 'any',
195
199
  };
196
200
 
197
- // 执行任务
198
- const result = await orchestrator.execute(params);
199
-
200
- // 格式化输出
201
- const formattedResult = orchestrator.formatResult(result);
202
-
203
- // 如果任务失败,标记为错误
204
- if (!result.success) {
205
- return {
206
- content: [{ type: 'text', text: formattedResult }],
207
- isError: true,
208
- };
209
- }
210
-
211
- return {
212
- content: [{ type: 'text', text: formattedResult }],
213
- };
201
+ // 使用 Promise.race 确保一定会在超时内返回
202
+ const result = await Promise.race([
203
+ (async () => {
204
+ const result = await orchestrator.execute(params);
205
+ const formattedResult = orchestrator.formatResult(result);
206
+ if (!result.success) {
207
+ return {
208
+ content: [{ type: 'text', text: formattedResult }],
209
+ isError: true,
210
+ };
211
+ }
212
+ return {
213
+ content: [{ type: 'text', text: formattedResult }],
214
+ };
215
+ })(),
216
+ new Promise((_, reject) => {
217
+ mcpTimeoutId = setTimeout(() => {
218
+ // 超时时中止 orchestrator 的所有操作
219
+ if (orchestrator.abortController) {
220
+ orchestrator.abortController.abort();
221
+ }
222
+ reject(new Error('MCP_TIMEOUT: 请求超时(55秒),请减少关键词数量或稍后重试'));
223
+ }, MCP_TIMEOUT);
224
+ })
225
+ ]);
226
+ clearTimeout(mcpTimeoutId);
227
+ return result;
214
228
  } catch (error) {
215
- // 主流做法:简洁的错误处理,使用 stderr 输出日志
229
+ clearTimeout(mcpTimeoutId);
230
+ // 确保中止所有操作
231
+ if (orchestrator.abortController && !orchestrator.abortController.signal.aborted) {
232
+ orchestrator.abortController.abort();
233
+ }
216
234
  console.error(`[MCP Error] ${error.message}`);
217
235
  return {
218
236
  content: [{
219
237
  type: 'text',
220
- text: `## ❌ 执行错误\n\n**错误信息**: ${error.message}\n\n请检查网络连接或稍后重试。`
238
+ text: `## ❌ 执行错误\n\n**错误信息**: ${error.message}\n\n请减少关键词数量或稍后重试。`
221
239
  }],
222
240
  isError: true,
223
241
  };
@@ -253,10 +253,11 @@ export class ValidationCache extends LRUCache {
253
253
  export const searchCache = new SearchCache();
254
254
  export const validationCache = new ValidationCache();
255
255
 
256
- // 定期清理过期缓存
257
- setInterval(() => {
256
+ // 定期清理过期缓存(unref 避免阻止进程退出)
257
+ const cacheCleanupInterval = setInterval(() => {
258
258
  searchCache.cleanup();
259
259
  validationCache.cleanup();
260
260
  }, 60000); // 每分钟清理一次
261
+ cacheCleanupInterval.unref();
261
262
 
262
263
  export default { LRUCache, SearchCache, ValidationCache, searchCache, validationCache };
@@ -26,7 +26,8 @@ export class GracefulShutdown {
26
26
  * 注册信号处理器
27
27
  */
28
28
  _registerSignalHandlers() {
29
- const signals = ['SIGINT', 'SIGTERM', 'SIGQUIT'];
29
+ // 仅注册 SIGINT SIGTERM(SIGQUIT 在 Windows 上不存在)
30
+ const signals = ['SIGINT', 'SIGTERM'];
30
31
 
31
32
  signals.forEach(signal => {
32
33
  process.on(signal, async () => {
@@ -35,16 +36,15 @@ export class GracefulShutdown {
35
36
  });
36
37
  });
37
38
 
38
- // 处理未捕获的异常
39
- process.on('uncaughtException', async (error) => {
39
+ // 处理未捕获的异常 - 仅记录日志,不退出进程(避免中断 MCP 通信)
40
+ process.on('uncaughtException', (error) => {
40
41
  logger.error('Uncaught exception', { error: error.message, stack: error.stack });
41
- await this.shutdown(1);
42
+ // 不调用 process.exit,让 MCP 连接保持活跃
42
43
  });
43
44
 
44
- // 处理未处理的 Promise 拒绝
45
- process.on('unhandledRejection', async (reason, promise) => {
45
+ // 处理未处理的 Promise 拒绝 - 仅记录日志
46
+ process.on('unhandledRejection', (reason, promise) => {
46
47
  logger.error('Unhandled rejection', { reason: String(reason) });
47
- // 不立即退出,只记录日志
48
48
  });
49
49
  }
50
50
 
@@ -14,19 +14,19 @@ import logger from './logger.js';
14
14
  const httpAgent = new http.Agent({
15
15
  keepAlive: true, // 启用 Keep-Alive
16
16
  keepAliveMsecs: 1000, // Keep-Alive 探测间隔
17
- maxSockets: 50, // 降低最大并发连接数,避免资源耗尽
18
- maxFreeSockets: 10, // 降低最大空闲连接数
17
+ maxSockets: 20, // 降低最大并发连接数,避免资源耗尽
18
+ maxFreeSockets: 5, // 降低最大空闲连接数
19
19
  scheduling: 'lifo', // 后进先出,优先使用最近的连接
20
- timeout: 30000, // 空闲连接30秒后关闭
20
+ timeout: 10000, // 空闲连接10秒后关闭
21
21
  });
22
22
 
23
23
  const httpsAgent = new https.Agent({
24
24
  keepAlive: true,
25
25
  keepAliveMsecs: 1000,
26
- maxSockets: 50, // 降低最大并发连接数
27
- maxFreeSockets: 10, // 降低最大空闲连接数
26
+ maxSockets: 20, // 降低最大并发连接数
27
+ maxFreeSockets: 5, // 降低最大空闲连接数
28
28
  scheduling: 'lifo',
29
- timeout: 30000, // 空闲连接30秒后关闭
29
+ timeout: 10000, // 空闲连接10秒后关闭
30
30
  rejectUnauthorized: false, // 允许自签名证书
31
31
  });
32
32
 
@@ -137,14 +137,12 @@ class Logger {
137
137
  // 输出到 stderr
138
138
  console.error(formatted);
139
139
 
140
- // 输出到文件
140
+ // 输出到文件(异步写入,避免阻塞事件循环)
141
141
  if (this.logFile) {
142
- try {
143
- fs.appendFileSync(this.logFile, formatted + '\n');
144
- this._rotateLogIfNeeded();
145
- } catch (error) {
142
+ fs.appendFile(this.logFile, formatted + '\n', (err) => {
146
143
  // 忽略文件写入错误
147
- }
144
+ if (!err) this._rotateLogIfNeeded();
145
+ });
148
146
  }
149
147
  }
150
148
 
@@ -282,11 +282,12 @@ export class MetricsCollector {
282
282
  // 全局指标收集器
283
283
  export const metrics = new MetricsCollector();
284
284
 
285
- // 定期输出指标日志(每5分钟)
286
- setInterval(() => {
285
+ // 定期输出指标日志(每5分钟,unref 避免阻止进程退出)
286
+ const metricsInterval = setInterval(() => {
287
287
  if (metrics.metrics.requests.total > 0) {
288
288
  metrics.logSummary();
289
289
  }
290
290
  }, 5 * 60 * 1000);
291
+ metricsInterval.unref();
291
292
 
292
293
  export default metrics;
@@ -91,6 +91,7 @@ export class RateLimiter {
91
91
  resolve(false);
92
92
  }
93
93
  }, Math.min(100, this.interval / 10));
94
+ checkInterval.unref(); // 避免阻止进程退出
94
95
  });
95
96
  }
96
97
 
@@ -27,8 +27,9 @@ export class RequestQueue {
27
27
  totalTimeout: 0,
28
28
  };
29
29
 
30
- // 定期清理超时请求
30
+ // 定期清理超时请求(unref 避免阻止进程退出)
31
31
  this.cleanupInterval = setInterval(() => this._cleanupTimeouts(), 5000);
32
+ this.cleanupInterval.unref();
32
33
  }
33
34
 
34
35
  /**
@@ -23,7 +23,6 @@ export class BingScraper extends BaseScraper {
23
23
  * @returns {Promise<string[]>} - 图片URL列表
24
24
  */
25
25
  async search(keyword, limit = 10, options = {}) {
26
- this.options = options;
27
26
  const pageSize = 35;
28
27
 
29
28
  // 计算需要获取的页数(最多3页,避免触发速率限制)
@@ -31,13 +30,19 @@ export class BingScraper extends BaseScraper {
31
30
  logger.info(`[Bing] Searching "${keyword}" - ${pagesNeeded} page(s) for ${limit} images`);
32
31
 
33
32
  try {
33
+ const seen = new Set(); // 去重
34
34
  let allUrls = [];
35
35
 
36
36
  // 顺序获取多页(避免并发触发限制)
37
37
  for (let page = 0; page < pagesNeeded; page++) {
38
38
  const offset = page * pageSize;
39
- const urls = await this._fetchPage(keyword, offset);
40
- allUrls = allUrls.concat(urls);
39
+ const urls = await this._fetchPage(keyword, offset, options);
40
+ for (const url of urls) {
41
+ if (!seen.has(url)) {
42
+ seen.add(url);
43
+ allUrls.push(url);
44
+ }
45
+ }
41
46
 
42
47
  // 如果已经够了就停止
43
48
  if (allUrls.length >= limit) {
@@ -62,8 +67,8 @@ export class BingScraper extends BaseScraper {
62
67
  /**
63
68
  * 获取单页结果
64
69
  */
65
- async _fetchPage(keyword, offset) {
66
- const searchUrl = this._buildSearchUrl(keyword, offset);
70
+ async _fetchPage(keyword, offset, options = {}) {
71
+ const searchUrl = this._buildSearchUrl(keyword, offset, options);
67
72
 
68
73
  try {
69
74
  const response = await withRetry(
@@ -89,7 +94,7 @@ export class BingScraper extends BaseScraper {
89
94
  /**
90
95
  * 构建搜索 URL
91
96
  */
92
- _buildSearchUrl(keyword, offset = 0) {
97
+ _buildSearchUrl(keyword, offset = 0, options = {}) {
93
98
  // 尺寸过滤映射
94
99
  const sizeMap = {
95
100
  'small': '+filterui:imagesize-small',
@@ -114,9 +119,9 @@ export class BingScraper extends BaseScraper {
114
119
  'strict': 'strict',
115
120
  };
116
121
 
117
- const size = this.options?.size || 'all';
118
- const aspect = this.options?.aspect || 'all';
119
- const safeSearch = this.options?.safeSearch || 'moderate';
122
+ const size = options.size || 'all';
123
+ const aspect = options.aspect || 'all';
124
+ const safeSearch = options.safeSearch || 'moderate';
120
125
 
121
126
  let qft = '+filterui:photo-photo';
122
127
  if (sizeMap[size]) {
@@ -23,7 +23,6 @@ export class GoogleScraper extends BaseScraper {
23
23
  * @returns {Promise<string[]>} - 图片URL列表
24
24
  */
25
25
  async search(keyword, limit = 10, options = {}) {
26
- this.options = options;
27
26
  const pageSize = 20; // Google 每页约20张
28
27
 
29
28
  // 计算需要获取的页数(最多3页,避免触发速率限制)
@@ -31,13 +30,19 @@ export class GoogleScraper extends BaseScraper {
31
30
  logger.info(`[Google] Searching "${keyword}" - ${pagesNeeded} page(s) for ${limit} images`);
32
31
 
33
32
  try {
33
+ const seen = new Set(); // 去重
34
34
  let allUrls = [];
35
35
 
36
36
  // 顺序获取多页
37
37
  for (let page = 0; page < pagesNeeded; page++) {
38
38
  const start = page * pageSize;
39
- const urls = await this._fetchPage(keyword, start);
40
- allUrls = allUrls.concat(urls);
39
+ const urls = await this._fetchPage(keyword, start, options);
40
+ for (const url of urls) {
41
+ if (!seen.has(url)) {
42
+ seen.add(url);
43
+ allUrls.push(url);
44
+ }
45
+ }
41
46
 
42
47
  if (allUrls.length >= limit) {
43
48
  break;
@@ -57,16 +62,12 @@ export class GoogleScraper extends BaseScraper {
57
62
  return [];
58
63
  }
59
64
  }
60
-
61
- _delay(ms) {
62
- return new Promise(resolve => setTimeout(resolve, ms));
63
- }
64
65
 
65
66
  /**
66
67
  * 获取单页结果
67
68
  */
68
- async _fetchPage(keyword, start) {
69
- const searchUrl = this._buildSearchUrl(keyword, start);
69
+ async _fetchPage(keyword, start, options = {}) {
70
+ const searchUrl = this._buildSearchUrl(keyword, start, options);
70
71
 
71
72
  try {
72
73
  const response = await withRetry(
@@ -98,7 +99,7 @@ export class GoogleScraper extends BaseScraper {
98
99
  /**
99
100
  * 构建搜索 URL
100
101
  */
101
- _buildSearchUrl(keyword, start = 0) {
102
+ _buildSearchUrl(keyword, start = 0, options = {}) {
102
103
  // 尺寸过滤映射 (Google 使用 tbs 参数)
103
104
  const sizeMap = {
104
105
  'small': 'isz:s',
@@ -123,9 +124,9 @@ export class GoogleScraper extends BaseScraper {
123
124
  'strict': 'active',
124
125
  };
125
126
 
126
- const size = this.options?.size || 'all';
127
- const aspect = this.options?.aspect || 'all';
128
- const safeSearch = this.options?.safeSearch || 'moderate';
127
+ const size = options.size || 'all';
128
+ const aspect = options.aspect || 'all';
129
+ const safeSearch = options.safeSearch || 'moderate';
129
130
 
130
131
  const params = new URLSearchParams({
131
132
  q: keyword,
@@ -158,14 +159,16 @@ export class GoogleScraper extends BaseScraper {
158
159
 
159
160
  try {
160
161
  // 方法1: 使用正则提取图片URL
161
- // Google 图片结果中的原图URL通常在特定的JSON结构中
162
- const patterns = [
162
+ // 每次创建新的 RegExp 实例避免全局标志 lastIndex 状态污染
163
+ const patternDefs = [
163
164
  /\["(https?:\/\/[^"]+\.(?:jpg|jpeg|png|gif|webp)[^"]*)"/gi,
164
165
  /"ou":"(https?:\/\/[^"]+)"/gi,
165
166
  /\["(https?:\/\/[^"]+)",\d+,\d+\]/gi,
166
167
  ];
167
168
 
168
- for (const pattern of patterns) {
169
+ for (const pattern of patternDefs) {
170
+ // 重置 lastIndex 确保每次从头开始匹配
171
+ pattern.lastIndex = 0;
169
172
  let match;
170
173
  while ((match = pattern.exec(html)) !== null) {
171
174
  const url = this._decodeUrl(match[1]);
@@ -240,14 +243,11 @@ export class GoogleScraper extends BaseScraper {
240
243
  // 排除 Google 自身的缩略图和无效链接
241
244
  const invalidPatterns = [
242
245
  'gstatic.com',
243
- 'google.com',
244
- 'googleapis.com',
245
- 'googleusercontent.com/encrypted',
246
+ 'google.com/images',
247
+ 'google.com/logos',
248
+ 'googleapis.com/proxy',
246
249
  'data:image',
247
250
  'base64',
248
- 'favicon',
249
- 'logo',
250
- 'icon',
251
251
  ];
252
252
 
253
253
  for (const pattern of invalidPatterns) {
@@ -263,12 +263,6 @@ export class GoogleScraper extends BaseScraper {
263
263
  return hasImageExt || looksLikeImage || url.length > 50;
264
264
  }
265
265
 
266
- /**
267
- * 延迟函数
268
- */
269
- _delay(ms) {
270
- return new Promise(resolve => setTimeout(resolve, ms));
271
- }
272
266
  }
273
267
 
274
268
  export default GoogleScraper;
@@ -238,7 +238,7 @@ export class FileManager {
238
238
 
239
239
  response.data.pipe(writer);
240
240
 
241
- writer.on('finish', () => {
241
+ writer.on('close', () => {
242
242
  if (!resolved) {
243
243
  resolved = true;
244
244
  clearTimeout(downloadTimeout);
@@ -6,8 +6,12 @@
6
6
  import sharp from 'sharp';
7
7
  import fs from 'fs-extra';
8
8
  import path from 'path';
9
+ import pLimit from 'p-limit';
9
10
  import logger from '../infrastructure/logger.js';
10
11
 
12
+ // sharp 是 CPU 密集型操作,限制并发避免卡死
13
+ const imageProcessLimit = pLimit(2);
14
+
11
15
  export class ImageProcessor {
12
16
  /**
13
17
  * 预设尺寸配置
@@ -80,9 +84,6 @@ export class ImageProcessor {
80
84
  // 替换原文件
81
85
  await fs.move(tempPath, finalOutputPath, { overwrite: true });
82
86
 
83
- // 获取处理后的信息
84
- const newMetadata = await sharp(finalOutputPath).metadata();
85
-
86
87
  logger.debug(`Processed image: ${inputPath} -> ${width}x${height}`);
87
88
 
88
89
  return {
@@ -90,7 +91,7 @@ export class ImageProcessor {
90
91
  path: finalOutputPath,
91
92
  metadata: {
92
93
  original: { width: metadata.width, height: metadata.height },
93
- processed: { width: newMetadata.width, height: newMetadata.height },
94
+ processed: { width, height },
94
95
  },
95
96
  };
96
97
  } catch (error) {
@@ -130,7 +131,7 @@ export class ImageProcessor {
130
131
  */
131
132
  async processMany(files, options = {}) {
132
133
  const results = await Promise.all(
133
- files.map(file => this.processOne(file.path, options))
134
+ files.map(file => imageProcessLimit(() => this.processOne(file.path, options)))
134
135
  );
135
136
 
136
137
  const success = [];
@@ -8,9 +8,8 @@ import httpClient from '../infrastructure/httpClient.js';
8
8
  import logger from '../infrastructure/logger.js';
9
9
  import config from '../config/index.js';
10
10
 
11
- // 并发验证配置 - 降低并发避免连接池耗尽
12
- const MAX_VALIDATE_CONCURRENCY = 15; // 验证并发 15
13
- const globalValidateLimit = pLimit(MAX_VALIDATE_CONCURRENCY);
11
+ // 使用配置中的并发数,避免硬编码与配置不一致
12
+ const globalValidateLimit = pLimit(config.MAX_VALIDATE_CONCURRENCY);
14
13
 
15
14
  export class LinkValidator {
16
15
  constructor() {
@@ -65,13 +64,21 @@ export class LinkValidator {
65
64
  return { url, valid: true, quality };
66
65
  }
67
66
 
67
+ // 某些服务器不支持 HEAD,返回 405/403 时尝试 GET 降级
68
+ if (response.status === 405 || response.status === 403) {
69
+ return await this._validateWithGet(url, fetchQuality);
70
+ }
71
+
68
72
  return { url, valid: false, error: `status=${response.status}` };
69
73
  } catch (error) {
70
74
  clearTimeout(timeoutId);
71
- // 确保 abort controller 被清理
72
75
  if (!controller.signal.aborted) {
73
76
  controller.abort();
74
77
  }
78
+ // HEAD 请求抛出异常且其响应状态为 405/403 时也尝试 GET 降级(某些 CDN 完全拒绝 HEAD);无响应的网络错误/超时不会降级
79
+ if (error.response && (error.response.status === 405 || error.response.status === 403)) {
80
+ return await this._validateWithGet(url, fetchQuality);
81
+ }
75
82
  return { url, valid: false, error: 'timeout' };
76
83
  }
77
84
  }