smart-image-scraper-mcp 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "smart-image-scraper-mcp",
3
- "version": "2.1.0",
3
+ "version": "2.2.0",
4
4
  "description": "全网智能图片抓取 MCP 服务器 - 支持 Bing/Google 图片搜索、验证和下载",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -0,0 +1,260 @@
1
+ /**
2
+ * 请求队列管理器 - 异步并发队列机制(基于单线程事件循环的并发限制,并非多线程)
3
+ * 管理并发请求,确保资源正确释放
4
+ */
5
+
6
+ import logger from './logger.js';
7
+ import { metrics } from './metrics.js';
8
+
9
/**
 * Bounded-concurrency request queue.
 *
 * Tasks are async functions. At most `maxConcurrent` of them run at once,
 * at most `maxQueueSize` wait behind them, and each request is answered
 * within `requestTimeout` milliseconds of being submitted (queue wait
 * included). Concurrency here means overlapping promises on one event
 * loop; no threads are involved.
 *
 * Every request is settled exactly once: completed, failed, timed out, or
 * cancelled by `cleanup()`. Whatever happens afterwards (for example a task
 * that finishes after its timeout) is ignored and does not touch `stats`.
 */
export class RequestQueue {
  /**
   * @param {Object} [options]
   * @param {number} [options.maxConcurrent=5] - Maximum tasks running at once.
   * @param {number} [options.maxQueueSize=20] - Maximum requests waiting for a slot.
   * @param {number} [options.requestTimeout=60000] - Per-request deadline in milliseconds.
   */
  constructor(options = {}) {
    // `||` rather than `??`: any falsy value, including 0, falls back to the default.
    this.maxConcurrent = options.maxConcurrent || 5;
    this.maxQueueSize = options.maxQueueSize || 20;
    this.requestTimeout = options.requestTimeout || 60000;

    this.queue = []; // requests waiting for a free slot, oldest first
    this.active = new Map(); // requestId -> requestInfo for running requests
    this.completed = []; // summaries of the most recent settled requests
    this.maxCompleted = 100;

    // totalProcessed = totalSuccess + totalFailed; timeouts are counted separately.
    this.stats = {
      totalProcessed: 0,
      totalSuccess: 0,
      totalFailed: 0,
      totalTimeout: 0,
    };

    // Safety-net sweep for active requests that have outlived the deadline.
    this.cleanupInterval = setInterval(() => this._cleanupTimeouts(), 5000);
    // The sweep alone must not keep the Node.js process alive.
    if (typeof this.cleanupInterval.unref === 'function') {
      this.cleanupInterval.unref();
    }
  }

  /**
   * Build a request id of the form `req_<epoch ms>_<random base36>`.
   * @returns {string}
   */
  generateRequestId() {
    // slice(2, 11) yields the same 9 characters the deprecated substr(2, 9) did.
    return `req_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Submit a task and wait for its outcome.
   *
   * The returned promise resolves with a result object in every ordinary
   * case: the task's own result (annotated with `requestId` and
   * `processingTime`), or `{ success: false, error, requestId }` when the
   * queue is full, the task throws, or the deadline passes.
   *
   * @param {Function} task - Async function performing the work.
   * @param {Object} [metadata] - Caller-supplied data echoed in `getStatus()`.
   * @returns {Promise<Object>} The outcome described above.
   * @throws {Error} 'Queue shutdown' if `cleanup()` runs while the request is still queued.
   */
  async submit(task, metadata = {}) {
    const requestId = this.generateRequestId();

    // Reject immediately instead of letting the backlog grow without bound.
    if (this.queue.length >= this.maxQueueSize) {
      logger.warn(`[Queue] Request rejected: queue full (${this.queue.length}/${this.maxQueueSize})`);
      metrics.recordError('queue_full');
      return {
        success: false,
        error: `队列已满,请稍后重试。当前队列: ${this.queue.length}`,
        requestId,
        queueStatus: this.getStatus(),
      };
    }

    const requestInfo = {
      id: requestId,
      metadata,
      task,
      createdAt: Date.now(),
      status: 'queued',
      settled: false, // flips to true exactly once, when the caller is answered
      resolve: null,
      reject: null,
    };

    const outcome = new Promise((resolve, reject) => {
      requestInfo.resolve = resolve;
      requestInfo.reject = reject;
    });

    this.queue.push(requestInfo);
    logger.info(`[Queue] Request queued: ${requestId} (queue: ${this.queue.length}, active: ${this.active.size})`);

    this._processQueue();

    // The deadline is measured from submission, so time spent queued counts.
    const deadline = setTimeout(() => this._expireRequest(requestInfo), this.requestTimeout);

    try {
      return await outcome;
    } finally {
      // Always release the timer; otherwise each finished request would
      // leave a pending timeout behind for up to `requestTimeout` ms.
      clearTimeout(deadline);
    }
  }

  /**
   * Start queued requests until the queue is empty or every slot is taken.
   */
  _processQueue() {
    while (this.queue.length > 0 && this.active.size < this.maxConcurrent) {
      const requestInfo = this.queue.shift();
      this._executeRequest(requestInfo);
    }
  }

  /**
   * Run one request's task and settle the request with its outcome.
   * Never rejects: task errors become `{ success: false, ... }` results.
   * @param {Object} requestInfo - Entry created by `submit`.
   * @returns {Promise<void>}
   */
  async _executeRequest(requestInfo) {
    const { id, task } = requestInfo;

    requestInfo.status = 'processing';
    requestInfo.startedAt = Date.now();
    this.active.set(id, requestInfo);

    logger.info(`[Queue] Processing: ${id} (active: ${this.active.size})`);

    try {
      const value = await task();

      const accepted = this._settle(requestInfo, 'completed', (duration) => {
        // Object results are annotated in place (as before). Anything else
        // (undefined, null, primitives) is wrapped so the annotation cannot throw.
        const payload = value !== null && typeof value === 'object'
          ? value
          : { success: true, result: value };
        payload.requestId = id;
        payload.processingTime = duration;
        return payload;
      });

      if (accepted) {
        logger.info(`[Queue] Completed: ${id} in ${requestInfo.duration}ms`);
      } else {
        logger.warn(`[Queue] Late result discarded: ${id} (already ${requestInfo.status})`);
      }
    } catch (error) {
      // Tasks may throw non-Error values; reading `.message` off `null` would itself throw.
      const message = error instanceof Error ? error.message : String(error);
      requestInfo.error = message;

      const accepted = this._settle(requestInfo, 'failed', () => ({
        success: false,
        error: message,
        requestId: id,
      }));

      if (accepted) {
        logger.error(`[Queue] Failed: ${id} - ${message}`);
      }
    } finally {
      // Free the slot (a no-op if a timeout already did) and keep draining.
      this._removeFromActive(id);
      this._processQueue();
    }
  }

  /**
   * Settle a request exactly once: record its final state, update the
   * statistics and history, and answer the caller.
   * @param {Object} requestInfo - Entry created by `submit`.
   * @param {string} status - 'completed', 'failed' or 'timeout'.
   * @param {Function} buildPayload - Receives the run duration in ms
   *   (undefined if the request never started) and returns the result
   *   object. It runs before any state changes, so a throw leaves the
   *   request unsettled.
   * @returns {boolean} false if the request had already been settled.
   */
  _settle(requestInfo, status, buildPayload) {
    if (requestInfo.settled) {
      return false;
    }

    const completedAt = Date.now();
    const duration = requestInfo.startedAt === undefined
      ? undefined
      : completedAt - requestInfo.startedAt;
    const payload = buildPayload(duration);

    requestInfo.settled = true;
    requestInfo.status = status;
    requestInfo.completedAt = completedAt;
    requestInfo.duration = duration;

    if (status === 'timeout') {
      this.stats.totalTimeout++;
    } else {
      this.stats.totalProcessed++;
      if (status === 'completed') {
        this.stats.totalSuccess++;
      } else {
        this.stats.totalFailed++;
      }
    }

    this._addToCompleted(requestInfo);
    requestInfo.resolve(payload);
    return true;
  }

  /**
   * Answer a request with a timeout result and release whatever it holds.
   * A still-queued request is removed so its task never starts. A running
   * task cannot be interrupted, so only its slot is released; its eventual
   * result is discarded.
   * @param {Object} requestInfo - Entry created by `submit`.
   */
  _expireRequest(requestInfo) {
    const { id } = requestInfo;

    const accepted = this._settle(requestInfo, 'timeout', () => ({
      success: false,
      error: '请求超时',
      requestId: id,
    }));
    if (!accepted) {
      return;
    }

    const queuedAt = this.queue.indexOf(requestInfo);
    if (queuedAt !== -1) {
      this.queue.splice(queuedAt, 1);
    }
    this._removeFromActive(id);

    logger.warn(`[Queue] Timeout: ${id}`);

    // A slot may have just opened; do not leave queued work waiting.
    this._processQueue();
  }

  /**
   * Remove a request from the active map (no-op if it is not there).
   * @param {string} requestId
   */
  _removeFromActive(requestId) {
    this.active.delete(requestId);
  }

  /**
   * Append a summary of a settled request to the bounded history.
   * @param {Object} requestInfo
   */
  _addToCompleted(requestInfo) {
    this.completed.push({
      id: requestInfo.id,
      status: requestInfo.status,
      duration: requestInfo.duration,
      completedAt: requestInfo.completedAt,
    });

    // Keep only the newest `maxCompleted` entries.
    if (this.completed.length > this.maxCompleted) {
      this.completed.shift();
    }
  }

  /**
   * Periodic sweep: expire active requests that have been running longer
   * than `requestTimeout`.
   */
  _cleanupTimeouts() {
    const now = Date.now();

    // Iterate over a snapshot: expiring a request mutates `active` and may
    // start queued requests.
    for (const info of Array.from(this.active.values())) {
      if (now - info.startedAt > this.requestTimeout) {
        logger.warn(`[Queue] Timeout cleanup: ${info.id}`);
        this._expireRequest(info);
      }
    }
  }

  /**
   * Snapshot of the queue for monitoring.
   * @returns {Object} Queue length, active count, configured limits, a copy
   *   of the statistics, and one entry per active request.
   */
  getStatus() {
    return {
      queueLength: this.queue.length,
      activeCount: this.active.size,
      maxConcurrent: this.maxConcurrent,
      maxQueueSize: this.maxQueueSize,
      stats: { ...this.stats },
      activeRequests: Array.from(this.active.entries()).map(([id, info]) => ({
        id,
        status: info.status,
        duration: Date.now() - info.startedAt,
        metadata: info.metadata,
      })),
    };
  }

  /**
   * Stop the periodic sweep and cancel every request still waiting in the
   * queue; their `submit()` promises reject with Error('Queue shutdown').
   * Requests that are already running are left to finish.
   */
  cleanup() {
    clearInterval(this.cleanupInterval);

    const waiting = this.queue;
    this.queue = [];
    for (const info of waiting) {
      info.settled = true;
      info.status = 'cancelled';
      info.reject(new Error('Queue shutdown'));
    }

    logger.info('[Queue] Cleanup completed');
  }
}
252
+
253
// Process-wide shared queue instance: up to 5 requests run at once, up to
// 20 more may wait, and each request has a 60-second deadline.
export const requestQueue = new RequestQueue({
  requestTimeout: 60000,
  maxQueueSize: 20,
  maxConcurrent: 5,
});

// The same instance is also the module's default export.
export { requestQueue as default };
@@ -74,16 +74,15 @@ export class LinkValidator {
74
74
 
75
75
  /**
76
76
  * 快速验证 - 仅检查 URL 格式,不发送 HTTP 请求
77
+ * 宽松模式:只要 URL 格式正确就通过
77
78
  */
78
79
  quickValidate(url) {
79
80
  if (!this._isValidUrlFormat(url)) {
80
81
  return { url, valid: false, error: 'Invalid URL' };
81
82
  }
82
- // 检查常见图片扩展名
83
- const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg'];
84
- const urlLower = url.toLowerCase();
85
- const hasImageExt = imageExtensions.some(ext => urlLower.includes(ext));
86
- return { url, valid: hasImageExt, quality: null };
83
+ // 宽松验证:只要 URL 格式正确就认为有效
84
+ // 图片 URL 可能不包含扩展名(动态生成的 URL)
85
+ return { url, valid: true, quality: null };
87
86
  }
88
87
 
89
88
  /**
@@ -18,19 +18,13 @@ import logger from '../infrastructure/logger.js';
18
18
  import config from '../config/index.js';
19
19
  import { searchCache } from '../infrastructure/cache.js';
20
20
  import { metrics } from '../infrastructure/metrics.js';
21
+ import { requestQueue } from '../infrastructure/requestQueue.js';
21
22
 
22
- // 极速并发配置 - 最大化吹吐量
23
- const MAX_CONCURRENT_REQUESTS = 5; // 同时处理 5 个 MCP 请求
23
+ // 极速并发配置
24
24
  const MAX_CONCURRENT_KEYWORDS = 3; // 每个请求内并行 3 个关键词
25
- const MAX_CONCURRENT_SEARCHES = 5; // 并行搜索 5 个
26
25
 
27
- // 全局共享的并发限制器
28
- const globalRequestLimit = pLimit(MAX_CONCURRENT_REQUESTS);
26
+ // 关键词并发限制器
29
27
  const globalKeywordLimit = pLimit(MAX_CONCURRENT_KEYWORDS);
30
- const globalSearchLimit = pLimit(MAX_CONCURRENT_SEARCHES);
31
-
32
- // 请求状态跟踪
33
- let activeRequests = new Map(); // requestId -> { startTime, query, status }
34
28
 
35
29
  export class Orchestrator {
36
30
  constructor() {
@@ -38,7 +32,6 @@ export class Orchestrator {
38
32
  this.fileManager = new FileManager();
39
33
  this.imageProcessor = new ImageProcessor();
40
34
  this.keywordLimit = globalKeywordLimit;
41
- this.requestLimit = globalRequestLimit;
42
35
  }
43
36
 
44
37
  /**
@@ -67,23 +60,24 @@ export class Orchestrator {
67
60
  */
68
61
  async processKeywordLink(keyword, count, source, options = {}) {
69
62
  const startTime = Date.now();
70
- const fastMode = options.fastMode !== false; // 默认开启快速模式
63
+ const fastMode = options.fastMode !== false;
71
64
  const prioritizeQuality = options.prioritizeQuality === true;
72
65
 
73
66
  try {
74
67
  const scraper = getScraper(source);
75
- const searchCount = fastMode ? count : Math.ceil(count * 1.5);
68
+ // 多搜索一些以确保有足够的结果
69
+ const searchCount = Math.max(count * 3, 10);
76
70
 
77
71
  // 尝试从缓存获取
78
72
  const cachedUrls = searchCache.getSearchResult(keyword, source, options);
79
73
  let rawUrls;
80
74
 
81
- if (cachedUrls && cachedUrls.length >= searchCount) {
75
+ if (cachedUrls && cachedUrls.length >= count) {
82
76
  logger.info(`[CACHE] "${keyword}" - ${cachedUrls.length} URLs`);
83
77
  rawUrls = cachedUrls;
84
78
  metrics.recordCacheHit();
85
79
  } else {
86
- logger.info(`[SEARCH] "${keyword}"...`);
80
+ logger.info(`[SEARCH] "${keyword}" (target: ${searchCount})...`);
87
81
  rawUrls = await scraper.search(keyword, searchCount, options);
88
82
  if (rawUrls.length > 0) {
89
83
  searchCache.setSearchResult(keyword, source, options, rawUrls);
@@ -100,20 +94,24 @@ export class Orchestrator {
100
94
  };
101
95
  }
102
96
 
103
- // 快速模式:跳过 HTTP 验证,仅检查 URL 格式
97
+ // 快速模式:直接返回搜索结果(不验证)
104
98
  let resultUrls;
105
99
  if (fastMode && !prioritizeQuality) {
106
- // 快速验证:仅检查 URL 格式和扩展名
107
- const quickResults = rawUrls.map(url => this.linkValidator.quickValidate(url));
108
- resultUrls = quickResults.filter(r => r.valid).slice(0, count).map(r => r.url);
109
- logger.info(`[FAST] "${keyword}" - ${resultUrls.length} URLs (no HTTP validation)`);
100
+ // 快速模式:直接使用搜索结果
101
+ resultUrls = rawUrls.slice(0, count);
102
+ logger.info(`[FAST] "${keyword}" - ${resultUrls.length} URLs`);
110
103
  } else {
111
- // 完整验证
104
+ // 完整验证模式:验证不通过的继续搜索更多
112
105
  const { valid } = await this.linkValidator.validateMany(rawUrls, {
113
106
  fetchQuality: prioritizeQuality,
114
107
  sortByQuality: prioritizeQuality,
115
108
  });
116
109
  resultUrls = valid.slice(0, count).map(v => v.url);
110
+
111
+ // 如果验证通过的不够,记录警告
112
+ if (resultUrls.length < count) {
113
+ logger.warn(`[VALIDATE] "${keyword}" - only ${resultUrls.length}/${count} valid`);
114
+ }
117
115
  }
118
116
 
119
117
  return {
@@ -251,79 +249,26 @@ export class Orchestrator {
251
249
  }
252
250
 
253
251
  /**
254
- * 执行任务 - 高性能入口
255
- * 支持多个 MCP 请求并行处理
252
+ * 执行任务 - 使用请求队列管理
253
+ * 支持多个 MCP 请求并行处理,自动资源释放
256
254
  * @param {Object} params - 任务参数
257
255
  * @returns {Promise<Object>} - 执行结果
258
256
  */
259
257
  async execute(params) {
260
- const requestId = `req_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
261
- const pendingCount = this.requestLimit.pendingCount;
262
- const activeCount = this.requestLimit.activeCount;
263
-
264
- // 记录请求指标
265
258
  metrics.recordRequest();
266
259
 
267
- // 检查是否超过最大并发限制
268
- if (activeCount >= MAX_CONCURRENT_REQUESTS && pendingCount >= MAX_CONCURRENT_REQUESTS) {
269
- logger.warn(`[${requestId}] Request rejected: queue full (${activeCount} active, ${pendingCount} pending)`);
270
- metrics.recordError('queue_full');
271
- return {
272
- success: false,
273
- error: `服务繁忙,已达到最大并发数 ${MAX_CONCURRENT_REQUESTS}。请稍后重试。`,
274
- requestId,
275
- activeRequests: activeCount,
276
- pendingRequests: pendingCount,
277
- };
278
- }
279
-
280
- // 记录活跃请求
281
- activeRequests.set(requestId, {
282
- startTime: Date.now(),
283
- query: params.query,
284
- status: 'queued',
285
- });
286
-
287
- logger.info(`[${requestId}] Request queued: ${activeCount} active, ${pendingCount} pending, query="${params.query}"`);
288
-
289
- try {
290
- // 使用并发限制器,支持多个请求并行
291
- const result = await this.requestLimit(async () => {
292
- activeRequests.get(requestId).status = 'processing';
293
- logger.info(`[${requestId}] Processing started`);
294
- return await this._executeInternal(params, requestId);
295
- });
296
-
297
- result.requestId = requestId;
298
- result.processingTime = Date.now() - activeRequests.get(requestId).startTime;
299
- return result;
300
- } catch (error) {
301
- logger.error(`[${requestId}] Request failed: ${error.message}`);
302
- metrics.recordError(error);
303
- return {
304
- success: false,
305
- error: error.message,
306
- requestId,
307
- };
308
- } finally {
309
- activeRequests.delete(requestId);
310
- }
260
+ // 使用请求队列提交任务
261
+ return requestQueue.submit(
262
+ () => this._executeInternal(params),
263
+ { query: params.query, mode: params.mode }
264
+ );
311
265
  }
312
266
 
313
267
  /**
314
268
  * 获取当前请求状态
315
269
  */
316
270
  static getStatus() {
317
- return {
318
- activeRequests: Array.from(activeRequests.entries()).map(([id, info]) => ({
319
- id,
320
- query: info.query,
321
- status: info.status,
322
- duration: Date.now() - info.startTime,
323
- })),
324
- activeCount: activeRequests.size,
325
- maxConcurrent: MAX_CONCURRENT_REQUESTS,
326
- };
271
+ return requestQueue.getStatus();
327
272
  }
328
273
 
329
274
  /**