smart-image-scraper-mcp 1.1.3 → 2.0.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "smart-image-scraper-mcp",
3
- "version": "1.1.3",
3
+ "version": "2.0.0",
4
4
  "description": "全网智能图片抓取 MCP 服务器 - 支持 Bing/Google 图片搜索、验证和下载",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -120,13 +120,41 @@ export class MetricsCollector {
120
120
  * 记录错误
121
121
  */
122
122
  recordError(error) {
123
- const code = error.code || 'UNKNOWN';
124
- const type = error.name || 'Error';
123
+ const code = typeof error === 'string' ? error : (error.code || 'UNKNOWN');
124
+ const type = typeof error === 'string' ? error : (error.name || 'Error');
125
125
 
126
126
  this.metrics.errors.byCode[code] = (this.metrics.errors.byCode[code] || 0) + 1;
127
127
  this.metrics.errors.byType[type] = (this.metrics.errors.byType[type] || 0) + 1;
128
128
  }
129
129
 
130
+ /**
131
+ * 记录请求
132
+ */
133
+ recordRequest() {
134
+ this.metrics.requests.total++;
135
+ this.metrics.system.lastActivity = Date.now();
136
+ }
137
+
138
+ /**
139
+ * 记录缓存命中
140
+ */
141
+ recordCacheHit() {
142
+ if (!this.metrics.cache) {
143
+ this.metrics.cache = { hits: 0, misses: 0 };
144
+ }
145
+ this.metrics.cache.hits++;
146
+ }
147
+
148
+ /**
149
+ * 记录缓存未命中
150
+ */
151
+ recordCacheMiss() {
152
+ if (!this.metrics.cache) {
153
+ this.metrics.cache = { hits: 0, misses: 0 };
154
+ }
155
+ this.metrics.cache.misses++;
156
+ }
157
+
130
158
  /**
131
159
  * 添加持续时间样本
132
160
  */
@@ -46,8 +46,8 @@ export class BingScraper extends BaseScraper {
46
46
  break;
47
47
  }
48
48
 
49
- // 添加请求间隔,防止被封
50
- await this._delay(200 + Math.random() * 100);
49
+ // 减少请求间隔,提高速度
50
+ await this._delay(100 + Math.random() * 50);
51
51
 
52
52
  const newUrls = this._parseResponse(response.data);
53
53
 
@@ -74,8 +74,8 @@ export class GoogleScraper extends BaseScraper {
74
74
 
75
75
  start += pageSize;
76
76
 
77
- // 添加延迟防止被封
78
- await this._delay(300 + Math.random() * 200);
77
+ // 减少延迟,提高速度
78
+ await this._delay(100 + Math.random() * 100);
79
79
  }
80
80
  } catch (error) {
81
81
  logger.error(`Google search error for "${keyword}"`, { message: error.message });
@@ -8,8 +8,9 @@ import httpClient from '../infrastructure/httpClient.js';
8
8
  import logger from '../infrastructure/logger.js';
9
9
  import config from '../config/index.js';
10
10
 
11
- // 全局共享的并发限制器,避免多个请求同时发起导致资源竞争
12
- const globalValidateLimit = pLimit(5); // 限制验证并发为 5
11
+ // 高性能并发验证配置
12
+ const MAX_VALIDATE_CONCURRENCY = 15; // 提高验证并发到 15
13
+ const globalValidateLimit = pLimit(MAX_VALIDATE_CONCURRENCY);
13
14
 
14
15
  export class LinkValidator {
15
16
  constructor() {
@@ -1,6 +1,12 @@
1
1
  /**
2
- * 编排器
2
+ * 编排器 - 高性能实现
3
3
  * 负责解析批量关键词,分配任务,协调各模块工作
4
+ *
5
+ * 高性能特性:
6
+ * - 支持多个 MCP 请求并行处理
7
+ * - 智能负载均衡和资源分配
8
+ * - 缓存集成减少重复请求
9
+ * - 快速响应模式
4
10
  */
5
11
 
6
12
  import pLimit from 'p-limit';
@@ -10,10 +16,21 @@ import { FileManager } from './fileManager.js';
10
16
  import { ImageProcessor } from './imageProcessor.js';
11
17
  import logger from '../infrastructure/logger.js';
12
18
  import config from '../config/index.js';
19
+ import { searchCache } from '../infrastructure/cache.js';
20
+ import { metrics } from '../infrastructure/metrics.js';
13
21
 
14
- // 全局共享的并发限制器,避免多个 MCP 请求同时发起导致资源竞争
15
- const globalKeywordLimit = pLimit(1); // 严格限制关键词并发
16
- const globalRequestLimit = pLimit(1); // 严格限制同时只处理 1 个 MCP 请求
22
+ // 高性能并发配置
23
+ const MAX_CONCURRENT_REQUESTS = 3; // 同时处理最多 3 个 MCP 请求
24
+ const MAX_CONCURRENT_KEYWORDS = 2; // 每个请求内并行处理 2 个关键词
25
+ const MAX_CONCURRENT_VALIDATIONS = 10; // 并行验证 10 个 URL
26
+
27
+ // 全局共享的并发限制器
28
+ const globalRequestLimit = pLimit(MAX_CONCURRENT_REQUESTS);
29
+ const globalKeywordLimit = pLimit(MAX_CONCURRENT_KEYWORDS);
30
+ const globalValidationLimit = pLimit(MAX_CONCURRENT_VALIDATIONS);
31
+
32
+ // 请求状态跟踪
33
+ let activeRequests = new Map(); // requestId -> { startTime, query, status }
17
34
 
18
35
  export class Orchestrator {
19
36
  constructor() {
@@ -50,18 +67,30 @@ export class Orchestrator {
50
67
  */
51
68
  async processKeywordLink(keyword, count, source, options = {}) {
52
69
  const startTime = Date.now();
53
- // Link 模式默认不进行质量评估(加快速度),除非明确要求
54
70
  const prioritizeQuality = options.prioritizeQuality === true;
55
71
 
56
72
  try {
57
- // 获取搜索源
58
73
  const scraper = getScraper(source);
74
+ const searchCount = Math.ceil(count * 1.5);
59
75
 
60
- logger.info(`Searching for "${keyword}"...`);
76
+ // 尝试从缓存获取
77
+ const cachedUrls = searchCache.getSearchResult(keyword, source, options);
78
+ let rawUrls;
61
79
 
62
- // 搜索图片(多获取一些以弥补验证失败的损失)
63
- const searchCount = Math.ceil(count * 1.5);
64
- const rawUrls = await scraper.search(keyword, searchCount, options);
80
+ if (cachedUrls && cachedUrls.length >= searchCount) {
81
+ logger.info(`[CACHE HIT] "${keyword}" - ${cachedUrls.length} URLs from cache`);
82
+ rawUrls = cachedUrls;
83
+ metrics.recordCacheHit();
84
+ } else {
85
+ logger.info(`[SEARCH] "${keyword}"...`);
86
+ rawUrls = await scraper.search(keyword, searchCount, options);
87
+
88
+ // 缓存搜索结果
89
+ if (rawUrls.length > 0) {
90
+ searchCache.setSearchResult(keyword, source, options, rawUrls);
91
+ }
92
+ metrics.recordCacheMiss();
93
+ }
65
94
 
66
95
  logger.info(`Found ${rawUrls.length} URLs for "${keyword}"`);
67
96
 
@@ -116,15 +145,28 @@ export class Orchestrator {
116
145
  */
117
146
  async processKeywordDownload(keyword, count, source, options = {}) {
118
147
  const startTime = Date.now();
119
- const prioritizeQuality = options.prioritizeQuality !== false; // 默认优先高质量
148
+ const prioritizeQuality = options.prioritizeQuality !== false;
120
149
 
121
150
  try {
122
- // 获取搜索源
123
151
  const scraper = getScraper(source);
124
-
125
- // 搜索图片(多获取一些以弥补下载失败的损失)
126
152
  const searchCount = Math.ceil(count * 2);
127
- const rawUrls = await scraper.search(keyword, searchCount, options);
153
+
154
+ // 尝试从缓存获取
155
+ const cachedUrls = searchCache.getSearchResult(keyword, source, options);
156
+ let rawUrls;
157
+
158
+ if (cachedUrls && cachedUrls.length >= searchCount) {
159
+ logger.info(`[CACHE HIT] "${keyword}" - ${cachedUrls.length} URLs from cache`);
160
+ rawUrls = cachedUrls;
161
+ metrics.recordCacheHit();
162
+ } else {
163
+ logger.info(`[SEARCH] "${keyword}"...`);
164
+ rawUrls = await scraper.search(keyword, searchCount, options);
165
+ if (rawUrls.length > 0) {
166
+ searchCache.setSearchResult(keyword, source, options, rawUrls);
167
+ }
168
+ metrics.recordCacheMiss();
169
+ }
128
170
 
129
171
  if (rawUrls.length === 0) {
130
172
  return {
@@ -207,26 +249,79 @@ export class Orchestrator {
207
249
  }
208
250
 
209
251
  /**
210
- * 执行任务
252
+ * 执行任务 - 高性能入口
253
+ * 支持多个 MCP 请求并行处理
211
254
  * @param {Object} params - 任务参数
212
255
  * @returns {Promise<Object>} - 执行结果
213
256
  */
214
257
  async execute(params) {
215
- // 检查当前队列状态
258
+ const requestId = `req_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
216
259
  const pendingCount = this.requestLimit.pendingCount;
217
260
  const activeCount = this.requestLimit.activeCount;
218
261
 
219
- // 如果队列中已有等待的请求,直接返回提示(避免长时间等待)
220
- if (pendingCount > 0) {
221
- logger.warn(`Request queued: ${pendingCount} pending, ${activeCount} active`);
262
+ // 记录请求指标
263
+ metrics.recordRequest();
264
+
265
+ // 检查是否超过最大并发限制
266
+ if (activeCount >= MAX_CONCURRENT_REQUESTS && pendingCount >= MAX_CONCURRENT_REQUESTS) {
267
+ logger.warn(`[${requestId}] Request rejected: queue full (${activeCount} active, ${pendingCount} pending)`);
268
+ metrics.recordError('queue_full');
222
269
  return {
223
270
  success: false,
224
- error: `当前有 ${pendingCount + activeCount} 个请求正在处理中,请稍后重试。建议一次只发起一个搜索请求。`,
271
+ error: `服务繁忙,已达到最大并发数 ${MAX_CONCURRENT_REQUESTS}。请稍后重试。`,
272
+ requestId,
273
+ activeRequests: activeCount,
274
+ pendingRequests: pendingCount,
225
275
  };
226
276
  }
227
277
 
228
- // 使用全局请求限制器
229
- return this.requestLimit(() => this._executeInternal(params));
278
+ // 记录活跃请求
279
+ activeRequests.set(requestId, {
280
+ startTime: Date.now(),
281
+ query: params.query,
282
+ status: 'queued',
283
+ });
284
+
285
+ logger.info(`[${requestId}] Request queued: ${activeCount} active, ${pendingCount} pending, query="${params.query}"`);
286
+
287
+ try {
288
+ // 使用并发限制器,支持多个请求并行
289
+ const result = await this.requestLimit(async () => {
290
+ activeRequests.get(requestId).status = 'processing';
291
+ logger.info(`[${requestId}] Processing started`);
292
+ return await this._executeInternal(params, requestId);
293
+ });
294
+
295
+ result.requestId = requestId;
296
+ result.processingTime = Date.now() - activeRequests.get(requestId).startTime;
297
+ return result;
298
+ } catch (error) {
299
+ logger.error(`[${requestId}] Request failed: ${error.message}`);
300
+ metrics.recordError(error);
301
+ return {
302
+ success: false,
303
+ error: error.message,
304
+ requestId,
305
+ };
306
+ } finally {
307
+ activeRequests.delete(requestId);
308
+ }
309
+ }
310
+
311
+ /**
312
+ * 获取当前请求状态
313
+ */
314
+ static getStatus() {
315
+ return {
316
+ activeRequests: Array.from(activeRequests.entries()).map(([id, info]) => ({
317
+ id,
318
+ query: info.query,
319
+ status: info.status,
320
+ duration: Date.now() - info.startTime,
321
+ })),
322
+ activeCount: activeRequests.size,
323
+ maxConcurrent: MAX_CONCURRENT_REQUESTS,
324
+ };
230
325
  }
231
326
 
232
327
  /**