smart-image-scraper-mcp 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 请求队列管理器 - 多线程队列机制
|
|
3
|
+
* 管理并发请求,确保资源正确释放
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import logger from './logger.js';
|
|
7
|
+
import { metrics } from './metrics.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Request queue manager.
 *
 * Runs submitted async tasks with a bounded level of concurrency, keeps a
 * bounded FIFO waiting list, enforces a per-request timeout (measured from
 * the moment of submission) and tracks simple success/failure/timeout
 * counters.
 */
export class RequestQueue {
  /**
   * @param {Object} [options]
   * @param {number} [options.maxConcurrent=5] - Maximum number of tasks running at once.
   * @param {number} [options.maxQueueSize=20] - Maximum number of waiting requests.
   * @param {number} [options.requestTimeout=60000] - Per-request timeout in milliseconds.
   */
  constructor(options = {}) {
    this.maxConcurrent = options.maxConcurrent || 5;
    this.maxQueueSize = options.maxQueueSize || 20;
    this.requestTimeout = options.requestTimeout || 60000;

    this.queue = []; // waiting requests (FIFO)
    this.active = new Map(); // running requests: Map<requestId, requestInfo>
    this.completed = []; // summaries of finished requests (most recent `maxCompleted`)
    this.maxCompleted = 100;

    this.stats = {
      totalProcessed: 0,
      totalSuccess: 0,
      totalFailed: 0,
      totalTimeout: 0,
    };

    // Periodic safety net that expires active requests running past the timeout.
    this.cleanupInterval = setInterval(() => this._cleanupTimeouts(), 5000);
    // The housekeeping timer alone must not keep the process alive.
    if (typeof this.cleanupInterval.unref === 'function') {
      this.cleanupInterval.unref();
    }
  }

  /**
   * Generate a request id of the form `req_<timestamp>_<random base36>`.
   * @returns {string}
   */
  generateRequestId() {
    // slice(2, 11) yields the same 9 characters the deprecated substr(2, 9) did.
    return `req_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Submit a task to the queue.
   *
   * Resolves with the task's result (annotated with `requestId` and
   * `processingTime`), or with a `{ success: false, error, requestId }` object
   * when the queue is full, the task throws, or the request times out.
   *
   * @param {Function} task - Async task function, called with no arguments.
   * @param {Object} metadata - Request metadata, echoed in `getStatus()`.
   * @returns {Promise<Object>} - Task result or failure descriptor.
   * @throws {Error} When the request is rejected for a reason other than a
   *   timeout (e.g. 'Queue shutdown' from `cleanup()`).
   */
  async submit(task, metadata = {}) {
    const requestId = this.generateRequestId();

    // Reject immediately when the waiting list is full.
    if (this.queue.length >= this.maxQueueSize) {
      logger.warn(`[Queue] Request rejected: queue full (${this.queue.length}/${this.maxQueueSize})`);
      metrics.recordError('queue_full');
      return {
        success: false,
        error: `队列已满,请稍后重试。当前队列: ${this.queue.length}`,
        requestId,
        queueStatus: this.getStatus(),
      };
    }

    const requestInfo = {
      id: requestId,
      metadata,
      task,
      createdAt: Date.now(),
      status: 'queued',
      timedOut: false,
      resolve: null,
      reject: null,
    };

    // Settled by _executeRequest (resolve) or by _cleanupTimeouts/cleanup (reject).
    const promise = new Promise((resolve, reject) => {
      requestInfo.resolve = resolve;
      requestInfo.reject = reject;
    });

    this.queue.push(requestInfo);
    logger.info(`[Queue] Request queued: ${requestId} (queue: ${this.queue.length}, active: ${this.active.size})`);

    this._processQueue();

    // The timeout covers both the time spent waiting and the time spent running.
    let timer;
    const timeoutPromise = new Promise((_, reject) => {
      timer = setTimeout(() => {
        reject(new Error('Request timeout'));
      }, this.requestTimeout);
    });

    try {
      return await Promise.race([promise, timeoutPromise]);
    } catch (error) {
      if (error && error.message === 'Request timeout') {
        this._markTimedOut(requestInfo);
        return {
          success: false,
          error: '请求超时',
          requestId,
        };
      }
      throw error;
    } finally {
      // Without this every finished request would leave a live timer behind
      // for up to `requestTimeout` milliseconds.
      clearTimeout(timer);
    }
  }

  /**
   * Start waiting requests while there is spare concurrency.
   */
  _processQueue() {
    while (this.queue.length > 0 && this.active.size < this.maxConcurrent) {
      const requestInfo = this.queue.shift();
      this._executeRequest(requestInfo);
    }
  }

  /**
   * Run a single request and settle its promise.
   *
   * Task failures are reported by resolving with a failure object rather than
   * by rejecting. If the request has already been marked as timed out, the
   * late outcome is discarded and does not touch the counters.
   *
   * @param {Object} requestInfo - Request record created by `submit()`.
   */
  async _executeRequest(requestInfo) {
    const { id, task, resolve } = requestInfo;

    // Move into the active set.
    requestInfo.status = 'processing';
    requestInfo.startedAt = Date.now();
    this.active.set(id, requestInfo);

    logger.info(`[Queue] Processing: ${id} (active: ${this.active.size})`);

    try {
      let payload;
      let failed = false;
      let failureMessage;
      let finishedAt;

      // Only the task itself and the result annotation can fail here, so the
      // counters below are updated exactly once per request.
      try {
        const result = await task();
        finishedAt = Date.now();
        // Non-object results cannot carry the bookkeeping fields; wrap them.
        payload = result !== null && typeof result === 'object'
          ? result
          : { success: true, result };
        payload.requestId = id;
        payload.processingTime = finishedAt - requestInfo.startedAt;
      } catch (error) {
        finishedAt = Date.now();
        failed = true;
        failureMessage = this._describeError(error);
      }

      // The caller has already received a timeout response; drop the late outcome.
      if (requestInfo.timedOut) {
        return;
      }

      requestInfo.completedAt = finishedAt;
      requestInfo.duration = finishedAt - requestInfo.startedAt;
      this.stats.totalProcessed++;

      if (failed) {
        requestInfo.status = 'failed';
        requestInfo.error = failureMessage;
        this.stats.totalFailed++;

        resolve({
          success: false,
          error: failureMessage,
          requestId: id,
        });

        logger.error(`[Queue] Failed: ${id} - ${failureMessage}`);
      } else {
        requestInfo.status = 'completed';
        this.stats.totalSuccess++;

        resolve(payload);

        logger.info(`[Queue] Completed: ${id} in ${requestInfo.duration}ms`);
      }
    } finally {
      this._removeFromActive(id);

      // Timed-out requests were already recorded by _markTimedOut.
      if (!requestInfo.timedOut) {
        this._addToCompleted(requestInfo);
      }

      // A slot may have been freed; keep draining the queue.
      this._processQueue();
    }
  }

  /**
   * Turn any thrown value into a message string.
   * @param {*} error - Thrown value (not necessarily an Error).
   * @returns {*} `error.message` when present, otherwise `String(error)`.
   */
  _describeError(error) {
    return error && error.message !== undefined ? error.message : String(error);
  }

  /**
   * Record a request as timed out exactly once, release whatever slot it
   * holds (waiting or active) and let the queue advance.
   * @param {Object} requestInfo - Request record created by `submit()`.
   */
  _markTimedOut(requestInfo) {
    // Both submit() and _cleanupTimeouts() may report the same request.
    if (requestInfo.timedOut) {
      return;
    }

    requestInfo.timedOut = true;
    requestInfo.status = 'timeout';
    requestInfo.completedAt = Date.now();
    if (requestInfo.startedAt !== undefined) {
      requestInfo.duration = requestInfo.completedAt - requestInfo.startedAt;
    }
    this.stats.totalTimeout++;

    // A request that is still waiting must not be started later on.
    const queuedIndex = this.queue.indexOf(requestInfo);
    if (queuedIndex !== -1) {
      this.queue.splice(queuedIndex, 1);
    }

    this._removeFromActive(requestInfo.id);
    this._addToCompleted(requestInfo);
    this._processQueue();
  }

  /**
   * Remove a request from the active set.
   * @param {string} requestId
   */
  _removeFromActive(requestId) {
    this.active.delete(requestId);
  }

  /**
   * Append a summary of the request to the completed list, dropping the
   * oldest entry once the list exceeds `maxCompleted`.
   * @param {Object} requestInfo
   */
  _addToCompleted(requestInfo) {
    this.completed.push({
      id: requestInfo.id,
      status: requestInfo.status,
      duration: requestInfo.duration,
      completedAt: requestInfo.completedAt,
    });

    if (this.completed.length > this.maxCompleted) {
      this.completed.shift();
    }
  }

  /**
   * Expire active requests that have been running longer than `requestTimeout`.
   */
  _cleanupTimeouts() {
    const now = Date.now();

    // Collect first: marking a request as timed out mutates `this.active`.
    const expired = [];
    for (const info of this.active.values()) {
      if (now - info.startedAt > this.requestTimeout) {
        expired.push(info);
      }
    }

    for (const info of expired) {
      logger.warn(`[Queue] Timeout cleanup: ${info.id}`);
      this._markTimedOut(info);
      info.reject(new Error('Request timeout'));
    }
  }

  /**
   * Snapshot of the queue state.
   * @returns {Object} Queue length, active count, limits, a copy of the
   *   counters and one entry per active request.
   */
  getStatus() {
    return {
      queueLength: this.queue.length,
      activeCount: this.active.size,
      maxConcurrent: this.maxConcurrent,
      maxQueueSize: this.maxQueueSize,
      stats: { ...this.stats },
      activeRequests: Array.from(this.active.entries()).map(([id, info]) => ({
        id,
        status: info.status,
        duration: Date.now() - info.startedAt,
        metadata: info.metadata,
      })),
    };
  }

  /**
   * Stop the periodic cleanup and reject every request that is still waiting.
   * Requests that are already running are left to finish.
   */
  cleanup() {
    clearInterval(this.cleanupInterval);

    for (const info of this.queue) {
      info.reject(new Error('Queue shutdown'));
    }
    this.queue = [];

    logger.info('[Queue] Cleanup completed');
  }
}
|
|
252
|
+
|
|
253
|
+
// Global request queue instance.
const globalQueueOptions = {
  maxConcurrent: 5,
  maxQueueSize: 20,
  requestTimeout: 60000,
};

export const requestQueue = new RequestQueue(globalQueueOptions);

export default requestQueue;
|
|
@@ -74,16 +74,15 @@ export class LinkValidator {
|
|
|
74
74
|
|
|
75
75
|
/**
|
|
76
76
|
* 快速验证 - 仅检查 URL 格式,不发送 HTTP 请求
|
|
77
|
+
* 宽松模式:只要 URL 格式正确就通过
|
|
77
78
|
*/
|
|
78
79
|
quickValidate(url) {
|
|
79
80
|
if (!this._isValidUrlFormat(url)) {
|
|
80
81
|
return { url, valid: false, error: 'Invalid URL' };
|
|
81
82
|
}
|
|
82
|
-
//
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
const hasImageExt = imageExtensions.some(ext => urlLower.includes(ext));
|
|
86
|
-
return { url, valid: hasImageExt, quality: null };
|
|
83
|
+
// 宽松验证:只要 URL 格式正确就认为有效
|
|
84
|
+
// 图片 URL 可能不包含扩展名(动态生成的 URL)
|
|
85
|
+
return { url, valid: true, quality: null };
|
|
87
86
|
}
|
|
88
87
|
|
|
89
88
|
/**
|
|
@@ -18,19 +18,13 @@ import logger from '../infrastructure/logger.js';
|
|
|
18
18
|
import config from '../config/index.js';
|
|
19
19
|
import { searchCache } from '../infrastructure/cache.js';
|
|
20
20
|
import { metrics } from '../infrastructure/metrics.js';
|
|
21
|
+
import { requestQueue } from '../infrastructure/requestQueue.js';
|
|
21
22
|
|
|
22
|
-
// 极速并发配置
|
|
23
|
-
const MAX_CONCURRENT_REQUESTS = 5; // 同时处理 5 个 MCP 请求
|
|
23
|
+
// 极速并发配置
|
|
24
24
|
const MAX_CONCURRENT_KEYWORDS = 3; // 每个请求内并行 3 个关键词
|
|
25
|
-
const MAX_CONCURRENT_SEARCHES = 5; // 并行搜索 5 个
|
|
26
25
|
|
|
27
|
-
//
|
|
28
|
-
const globalRequestLimit = pLimit(MAX_CONCURRENT_REQUESTS);
|
|
26
|
+
// 关键词并发限制器
|
|
29
27
|
const globalKeywordLimit = pLimit(MAX_CONCURRENT_KEYWORDS);
|
|
30
|
-
const globalSearchLimit = pLimit(MAX_CONCURRENT_SEARCHES);
|
|
31
|
-
|
|
32
|
-
// 请求状态跟踪
|
|
33
|
-
let activeRequests = new Map(); // requestId -> { startTime, query, status }
|
|
34
28
|
|
|
35
29
|
export class Orchestrator {
|
|
36
30
|
constructor() {
|
|
@@ -38,7 +32,6 @@ export class Orchestrator {
|
|
|
38
32
|
this.fileManager = new FileManager();
|
|
39
33
|
this.imageProcessor = new ImageProcessor();
|
|
40
34
|
this.keywordLimit = globalKeywordLimit;
|
|
41
|
-
this.requestLimit = globalRequestLimit;
|
|
42
35
|
}
|
|
43
36
|
|
|
44
37
|
/**
|
|
@@ -67,23 +60,24 @@ export class Orchestrator {
|
|
|
67
60
|
*/
|
|
68
61
|
async processKeywordLink(keyword, count, source, options = {}) {
|
|
69
62
|
const startTime = Date.now();
|
|
70
|
-
const fastMode = options.fastMode !== false;
|
|
63
|
+
const fastMode = options.fastMode !== false;
|
|
71
64
|
const prioritizeQuality = options.prioritizeQuality === true;
|
|
72
65
|
|
|
73
66
|
try {
|
|
74
67
|
const scraper = getScraper(source);
|
|
75
|
-
|
|
68
|
+
// 多搜索一些以确保有足够的结果
|
|
69
|
+
const searchCount = Math.max(count * 3, 10);
|
|
76
70
|
|
|
77
71
|
// 尝试从缓存获取
|
|
78
72
|
const cachedUrls = searchCache.getSearchResult(keyword, source, options);
|
|
79
73
|
let rawUrls;
|
|
80
74
|
|
|
81
|
-
if (cachedUrls && cachedUrls.length >=
|
|
75
|
+
if (cachedUrls && cachedUrls.length >= count) {
|
|
82
76
|
logger.info(`[CACHE] "${keyword}" - ${cachedUrls.length} URLs`);
|
|
83
77
|
rawUrls = cachedUrls;
|
|
84
78
|
metrics.recordCacheHit();
|
|
85
79
|
} else {
|
|
86
|
-
logger.info(`[SEARCH] "${keyword}"...`);
|
|
80
|
+
logger.info(`[SEARCH] "${keyword}" (target: ${searchCount})...`);
|
|
87
81
|
rawUrls = await scraper.search(keyword, searchCount, options);
|
|
88
82
|
if (rawUrls.length > 0) {
|
|
89
83
|
searchCache.setSearchResult(keyword, source, options, rawUrls);
|
|
@@ -100,20 +94,24 @@ export class Orchestrator {
|
|
|
100
94
|
};
|
|
101
95
|
}
|
|
102
96
|
|
|
103
|
-
//
|
|
97
|
+
// 快速模式:直接返回搜索结果(不验证)
|
|
104
98
|
let resultUrls;
|
|
105
99
|
if (fastMode && !prioritizeQuality) {
|
|
106
|
-
//
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
logger.info(`[FAST] "${keyword}" - ${resultUrls.length} URLs (no HTTP validation)`);
|
|
100
|
+
// 快速模式:直接使用搜索结果
|
|
101
|
+
resultUrls = rawUrls.slice(0, count);
|
|
102
|
+
logger.info(`[FAST] "${keyword}" - ${resultUrls.length} URLs`);
|
|
110
103
|
} else {
|
|
111
|
-
//
|
|
104
|
+
// 完整验证模式:验证不通过的继续搜索更多
|
|
112
105
|
const { valid } = await this.linkValidator.validateMany(rawUrls, {
|
|
113
106
|
fetchQuality: prioritizeQuality,
|
|
114
107
|
sortByQuality: prioritizeQuality,
|
|
115
108
|
});
|
|
116
109
|
resultUrls = valid.slice(0, count).map(v => v.url);
|
|
110
|
+
|
|
111
|
+
// 如果验证通过的不够,记录警告
|
|
112
|
+
if (resultUrls.length < count) {
|
|
113
|
+
logger.warn(`[VALIDATE] "${keyword}" - only ${resultUrls.length}/${count} valid`);
|
|
114
|
+
}
|
|
117
115
|
}
|
|
118
116
|
|
|
119
117
|
return {
|
|
@@ -251,79 +249,26 @@ export class Orchestrator {
|
|
|
251
249
|
}
|
|
252
250
|
|
|
253
251
|
/**
|
|
254
|
-
* 执行任务 -
|
|
255
|
-
* 支持多个 MCP
|
|
252
|
+
* 执行任务 - 使用请求队列管理
|
|
253
|
+
* 支持多个 MCP 请求并行处理,自动资源释放
|
|
256
254
|
* @param {Object} params - 任务参数
|
|
257
255
|
* @returns {Promise<Object>} - 执行结果
|
|
258
256
|
*/
|
|
259
257
|
async execute(params) {
|
|
260
|
-
const requestId = `req_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
|
|
261
|
-
const pendingCount = this.requestLimit.pendingCount;
|
|
262
|
-
const activeCount = this.requestLimit.activeCount;
|
|
263
|
-
|
|
264
|
-
// 记录请求指标
|
|
265
258
|
metrics.recordRequest();
|
|
266
259
|
|
|
267
|
-
//
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
success: false,
|
|
273
|
-
error: `服务繁忙,已达到最大并发数 ${MAX_CONCURRENT_REQUESTS}。请稍后重试。`,
|
|
274
|
-
requestId,
|
|
275
|
-
activeRequests: activeCount,
|
|
276
|
-
pendingRequests: pendingCount,
|
|
277
|
-
};
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
// 记录活跃请求
|
|
281
|
-
activeRequests.set(requestId, {
|
|
282
|
-
startTime: Date.now(),
|
|
283
|
-
query: params.query,
|
|
284
|
-
status: 'queued',
|
|
285
|
-
});
|
|
286
|
-
|
|
287
|
-
logger.info(`[${requestId}] Request queued: ${activeCount} active, ${pendingCount} pending, query="${params.query}"`);
|
|
288
|
-
|
|
289
|
-
try {
|
|
290
|
-
// 使用并发限制器,支持多个请求并行
|
|
291
|
-
const result = await this.requestLimit(async () => {
|
|
292
|
-
activeRequests.get(requestId).status = 'processing';
|
|
293
|
-
logger.info(`[${requestId}] Processing started`);
|
|
294
|
-
return await this._executeInternal(params, requestId);
|
|
295
|
-
});
|
|
296
|
-
|
|
297
|
-
result.requestId = requestId;
|
|
298
|
-
result.processingTime = Date.now() - activeRequests.get(requestId).startTime;
|
|
299
|
-
return result;
|
|
300
|
-
} catch (error) {
|
|
301
|
-
logger.error(`[${requestId}] Request failed: ${error.message}`);
|
|
302
|
-
metrics.recordError(error);
|
|
303
|
-
return {
|
|
304
|
-
success: false,
|
|
305
|
-
error: error.message,
|
|
306
|
-
requestId,
|
|
307
|
-
};
|
|
308
|
-
} finally {
|
|
309
|
-
activeRequests.delete(requestId);
|
|
310
|
-
}
|
|
260
|
+
// 使用请求队列提交任务
|
|
261
|
+
return requestQueue.submit(
|
|
262
|
+
() => this._executeInternal(params),
|
|
263
|
+
{ query: params.query, mode: params.mode }
|
|
264
|
+
);
|
|
311
265
|
}
|
|
312
266
|
|
|
313
267
|
/**
|
|
314
268
|
* 获取当前请求状态
|
|
315
269
|
*/
|
|
316
270
|
static getStatus() {
|
|
317
|
-
return
|
|
318
|
-
activeRequests: Array.from(activeRequests.entries()).map(([id, info]) => ({
|
|
319
|
-
id,
|
|
320
|
-
query: info.query,
|
|
321
|
-
status: info.status,
|
|
322
|
-
duration: Date.now() - info.startTime,
|
|
323
|
-
})),
|
|
324
|
-
activeCount: activeRequests.size,
|
|
325
|
-
maxConcurrent: MAX_CONCURRENT_REQUESTS,
|
|
326
|
-
};
|
|
271
|
+
return requestQueue.getStatus();
|
|
327
272
|
}
|
|
328
273
|
|
|
329
274
|
/**
|