smart-image-scraper-mcp 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -120,13 +120,41 @@ export class MetricsCollector {
|
|
|
120
120
|
* 记录错误
|
|
121
121
|
*/
|
|
122
122
|
recordError(error) {
|
|
123
|
-
const code = error.code || 'UNKNOWN';
|
|
124
|
-
const type = error.name || 'Error';
|
|
123
|
+
const code = typeof error === 'string' ? error : (error.code || 'UNKNOWN');
|
|
124
|
+
const type = typeof error === 'string' ? error : (error.name || 'Error');
|
|
125
125
|
|
|
126
126
|
this.metrics.errors.byCode[code] = (this.metrics.errors.byCode[code] || 0) + 1;
|
|
127
127
|
this.metrics.errors.byType[type] = (this.metrics.errors.byType[type] || 0) + 1;
|
|
128
128
|
}
|
|
129
129
|
|
|
130
|
+
/**
|
|
131
|
+
* 记录请求
|
|
132
|
+
*/
|
|
133
|
+
recordRequest() {
|
|
134
|
+
this.metrics.requests.total++;
|
|
135
|
+
this.metrics.system.lastActivity = Date.now();
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* 记录缓存命中
|
|
140
|
+
*/
|
|
141
|
+
recordCacheHit() {
|
|
142
|
+
if (!this.metrics.cache) {
|
|
143
|
+
this.metrics.cache = { hits: 0, misses: 0 };
|
|
144
|
+
}
|
|
145
|
+
this.metrics.cache.hits++;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/**
|
|
149
|
+
* 记录缓存未命中
|
|
150
|
+
*/
|
|
151
|
+
recordCacheMiss() {
|
|
152
|
+
if (!this.metrics.cache) {
|
|
153
|
+
this.metrics.cache = { hits: 0, misses: 0 };
|
|
154
|
+
}
|
|
155
|
+
this.metrics.cache.misses++;
|
|
156
|
+
}
|
|
157
|
+
|
|
130
158
|
/**
|
|
131
159
|
* 添加持续时间样本
|
|
132
160
|
*/
|
|
@@ -74,8 +74,8 @@ export class GoogleScraper extends BaseScraper {
|
|
|
74
74
|
|
|
75
75
|
start += pageSize;
|
|
76
76
|
|
|
77
|
-
//
|
|
78
|
-
await this._delay(
|
|
77
|
+
// 极速模式:最小延迟
|
|
78
|
+
await this._delay(50);
|
|
79
79
|
}
|
|
80
80
|
} catch (error) {
|
|
81
81
|
logger.error(`Google search error for "${keyword}"`, { message: error.message });
|
|
@@ -8,8 +8,9 @@ import httpClient from '../infrastructure/httpClient.js';
|
|
|
8
8
|
import logger from '../infrastructure/logger.js';
|
|
9
9
|
import config from '../config/index.js';
|
|
10
10
|
|
|
11
|
-
//
|
|
12
|
-
const
|
|
11
|
+
// 极速并发验证配置
|
|
12
|
+
const MAX_VALIDATE_CONCURRENCY = 30; // 极速验证并发 30
|
|
13
|
+
const globalValidateLimit = pLimit(MAX_VALIDATE_CONCURRENCY);
|
|
13
14
|
|
|
14
15
|
export class LinkValidator {
|
|
15
16
|
constructor() {
|
|
@@ -32,7 +33,7 @@ export class LinkValidator {
|
|
|
32
33
|
}
|
|
33
34
|
|
|
34
35
|
/**
|
|
35
|
-
* 验证单个链接
|
|
36
|
+
* 验证单个链接 - 极速版本
|
|
36
37
|
* @param {string} url - 图片URL
|
|
37
38
|
* @param {boolean} fetchQuality - 是否获取质量信息
|
|
38
39
|
* @returns {Promise<{url: string, valid: boolean, error?: string, quality?: Object}>}
|
|
@@ -44,14 +45,12 @@ export class LinkValidator {
|
|
|
44
45
|
}
|
|
45
46
|
|
|
46
47
|
const controller = new AbortController();
|
|
47
|
-
const timeoutId = setTimeout(() =>
|
|
48
|
-
controller.abort();
|
|
49
|
-
}, 5000); // 缩短超时时间到5秒
|
|
48
|
+
const timeoutId = setTimeout(() => controller.abort(), 2000); // 极速超时 2 秒
|
|
50
49
|
|
|
51
50
|
try {
|
|
52
51
|
const response = await httpClient.head(url, {
|
|
53
|
-
timeout:
|
|
54
|
-
maxRedirects:
|
|
52
|
+
timeout: 1500, // 极速超时 1.5 秒
|
|
53
|
+
maxRedirects: 1, // 最多 1 次重定向
|
|
55
54
|
signal: controller.signal,
|
|
56
55
|
});
|
|
57
56
|
|
|
@@ -59,27 +58,32 @@ export class LinkValidator {
|
|
|
59
58
|
|
|
60
59
|
const contentType = response.headers['content-type'] || '';
|
|
61
60
|
const isImage = contentType.toLowerCase().includes('image');
|
|
62
|
-
const isValidStatus = response.status === 200;
|
|
61
|
+
const isValidStatus = response.status === 200 || response.status === 206;
|
|
63
62
|
|
|
64
63
|
if (isValidStatus && isImage) {
|
|
65
|
-
// 获取质量信息
|
|
66
64
|
const quality = fetchQuality ? this._extractQualityInfo(response.headers, url) : null;
|
|
67
65
|
return { url, valid: true, quality };
|
|
68
66
|
}
|
|
69
67
|
|
|
70
|
-
|
|
71
|
-
return { url, valid: false, error: `Invalid: status=${response.status}` };
|
|
68
|
+
return { url, valid: false, error: `status=${response.status}` };
|
|
72
69
|
} catch (error) {
|
|
73
70
|
clearTimeout(timeoutId);
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
71
|
+
return { url, valid: false, error: 'timeout' };
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* 快速验证 - 仅检查 URL 格式,不发送 HTTP 请求
|
|
77
|
+
*/
|
|
78
|
+
quickValidate(url) {
|
|
79
|
+
if (!this._isValidUrlFormat(url)) {
|
|
80
|
+
return { url, valid: false, error: 'Invalid URL' };
|
|
82
81
|
}
|
|
82
|
+
// 检查常见图片扩展名
|
|
83
|
+
const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg'];
|
|
84
|
+
const urlLower = url.toLowerCase();
|
|
85
|
+
const hasImageExt = imageExtensions.some(ext => urlLower.includes(ext));
|
|
86
|
+
return { url, valid: hasImageExt, quality: null };
|
|
83
87
|
}
|
|
84
88
|
|
|
85
89
|
/**
|
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* 编排器
|
|
2
|
+
* 编排器 - 高性能实现
|
|
3
3
|
* 负责解析批量关键词,分配任务,协调各模块工作
|
|
4
|
+
*
|
|
5
|
+
* 高性能特性:
|
|
6
|
+
* - 支持多个 MCP 请求并行处理
|
|
7
|
+
* - 智能负载均衡和资源分配
|
|
8
|
+
* - 缓存集成减少重复请求
|
|
9
|
+
* - 快速响应模式
|
|
4
10
|
*/
|
|
5
11
|
|
|
6
12
|
import pLimit from 'p-limit';
|
|
@@ -10,10 +16,21 @@ import { FileManager } from './fileManager.js';
|
|
|
10
16
|
import { ImageProcessor } from './imageProcessor.js';
|
|
11
17
|
import logger from '../infrastructure/logger.js';
|
|
12
18
|
import config from '../config/index.js';
|
|
19
|
+
import { searchCache } from '../infrastructure/cache.js';
|
|
20
|
+
import { metrics } from '../infrastructure/metrics.js';
|
|
13
21
|
|
|
14
|
-
//
|
|
15
|
-
const
|
|
16
|
-
const
|
|
22
|
+
// 极速并发配置 - 最大化吞吐量
|
|
23
|
+
const MAX_CONCURRENT_REQUESTS = 5; // 同时处理 5 个 MCP 请求
|
|
24
|
+
const MAX_CONCURRENT_KEYWORDS = 3; // 每个请求内并行 3 个关键词
|
|
25
|
+
const MAX_CONCURRENT_SEARCHES = 5; // 并行搜索 5 个
|
|
26
|
+
|
|
27
|
+
// 全局共享的并发限制器
|
|
28
|
+
const globalRequestLimit = pLimit(MAX_CONCURRENT_REQUESTS);
|
|
29
|
+
const globalKeywordLimit = pLimit(MAX_CONCURRENT_KEYWORDS);
|
|
30
|
+
const globalSearchLimit = pLimit(MAX_CONCURRENT_SEARCHES);
|
|
31
|
+
|
|
32
|
+
// 请求状态跟踪
|
|
33
|
+
let activeRequests = new Map(); // requestId -> { startTime, query, status }
|
|
17
34
|
|
|
18
35
|
export class Orchestrator {
|
|
19
36
|
constructor() {
|
|
@@ -50,20 +67,29 @@ export class Orchestrator {
|
|
|
50
67
|
*/
|
|
51
68
|
async processKeywordLink(keyword, count, source, options = {}) {
|
|
52
69
|
const startTime = Date.now();
|
|
53
|
-
//
|
|
70
|
+
const fastMode = options.fastMode !== false; // 默认开启快速模式
|
|
54
71
|
const prioritizeQuality = options.prioritizeQuality === true;
|
|
55
72
|
|
|
56
73
|
try {
|
|
57
|
-
// 获取搜索源
|
|
58
74
|
const scraper = getScraper(source);
|
|
75
|
+
const searchCount = fastMode ? count : Math.ceil(count * 1.5);
|
|
59
76
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
const searchCount = Math.ceil(count * 1.5);
|
|
64
|
-
const rawUrls = await scraper.search(keyword, searchCount, options);
|
|
77
|
+
// 尝试从缓存获取
|
|
78
|
+
const cachedUrls = searchCache.getSearchResult(keyword, source, options);
|
|
79
|
+
let rawUrls;
|
|
65
80
|
|
|
66
|
-
|
|
81
|
+
if (cachedUrls && cachedUrls.length >= searchCount) {
|
|
82
|
+
logger.info(`[CACHE] "${keyword}" - ${cachedUrls.length} URLs`);
|
|
83
|
+
rawUrls = cachedUrls;
|
|
84
|
+
metrics.recordCacheHit();
|
|
85
|
+
} else {
|
|
86
|
+
logger.info(`[SEARCH] "${keyword}"...`);
|
|
87
|
+
rawUrls = await scraper.search(keyword, searchCount, options);
|
|
88
|
+
if (rawUrls.length > 0) {
|
|
89
|
+
searchCache.setSearchResult(keyword, source, options, rawUrls);
|
|
90
|
+
}
|
|
91
|
+
metrics.recordCacheMiss();
|
|
92
|
+
}
|
|
67
93
|
|
|
68
94
|
if (rawUrls.length === 0) {
|
|
69
95
|
return {
|
|
@@ -74,25 +100,30 @@ export class Orchestrator {
|
|
|
74
100
|
};
|
|
75
101
|
}
|
|
76
102
|
|
|
77
|
-
//
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
103
|
+
// 快速模式:跳过 HTTP 验证,仅检查 URL 格式
|
|
104
|
+
let resultUrls;
|
|
105
|
+
if (fastMode && !prioritizeQuality) {
|
|
106
|
+
// 快速验证:仅检查 URL 格式和扩展名
|
|
107
|
+
const quickResults = rawUrls.map(url => this.linkValidator.quickValidate(url));
|
|
108
|
+
resultUrls = quickResults.filter(r => r.valid).slice(0, count).map(r => r.url);
|
|
109
|
+
logger.info(`[FAST] "${keyword}" - ${resultUrls.length} URLs (no HTTP validation)`);
|
|
110
|
+
} else {
|
|
111
|
+
// 完整验证
|
|
112
|
+
const { valid } = await this.linkValidator.validateMany(rawUrls, {
|
|
113
|
+
fetchQuality: prioritizeQuality,
|
|
114
|
+
sortByQuality: prioritizeQuality,
|
|
115
|
+
});
|
|
116
|
+
resultUrls = valid.slice(0, count).map(v => v.url);
|
|
117
|
+
}
|
|
85
118
|
|
|
86
119
|
return {
|
|
87
120
|
keyword,
|
|
88
121
|
success: true,
|
|
89
122
|
mode: 'link',
|
|
90
123
|
totalSearched: rawUrls.length,
|
|
91
|
-
totalValidated: valid.length,
|
|
92
|
-
totalInvalid: invalid.length,
|
|
93
124
|
urls: resultUrls,
|
|
94
125
|
count: resultUrls.length,
|
|
95
|
-
|
|
126
|
+
fastMode,
|
|
96
127
|
duration: Date.now() - startTime,
|
|
97
128
|
};
|
|
98
129
|
} catch (error) {
|
|
@@ -116,15 +147,28 @@ export class Orchestrator {
|
|
|
116
147
|
*/
|
|
117
148
|
async processKeywordDownload(keyword, count, source, options = {}) {
|
|
118
149
|
const startTime = Date.now();
|
|
119
|
-
const prioritizeQuality = options.prioritizeQuality !== false;
|
|
150
|
+
const prioritizeQuality = options.prioritizeQuality !== false;
|
|
120
151
|
|
|
121
152
|
try {
|
|
122
|
-
// 获取搜索源
|
|
123
153
|
const scraper = getScraper(source);
|
|
124
|
-
|
|
125
|
-
// 搜索图片(多获取一些以弥补下载失败的损失)
|
|
126
154
|
const searchCount = Math.ceil(count * 2);
|
|
127
|
-
|
|
155
|
+
|
|
156
|
+
// 尝试从缓存获取
|
|
157
|
+
const cachedUrls = searchCache.getSearchResult(keyword, source, options);
|
|
158
|
+
let rawUrls;
|
|
159
|
+
|
|
160
|
+
if (cachedUrls && cachedUrls.length >= searchCount) {
|
|
161
|
+
logger.info(`[CACHE HIT] "${keyword}" - ${cachedUrls.length} URLs from cache`);
|
|
162
|
+
rawUrls = cachedUrls;
|
|
163
|
+
metrics.recordCacheHit();
|
|
164
|
+
} else {
|
|
165
|
+
logger.info(`[SEARCH] "${keyword}"...`);
|
|
166
|
+
rawUrls = await scraper.search(keyword, searchCount, options);
|
|
167
|
+
if (rawUrls.length > 0) {
|
|
168
|
+
searchCache.setSearchResult(keyword, source, options, rawUrls);
|
|
169
|
+
}
|
|
170
|
+
metrics.recordCacheMiss();
|
|
171
|
+
}
|
|
128
172
|
|
|
129
173
|
if (rawUrls.length === 0) {
|
|
130
174
|
return {
|
|
@@ -207,26 +251,79 @@ export class Orchestrator {
|
|
|
207
251
|
}
|
|
208
252
|
|
|
209
253
|
/**
|
|
210
|
-
* 执行任务
|
|
254
|
+
* 执行任务 - 高性能入口
|
|
255
|
+
* 支持多个 MCP 请求并行处理
|
|
211
256
|
* @param {Object} params - 任务参数
|
|
212
257
|
* @returns {Promise<Object>} - 执行结果
|
|
213
258
|
*/
|
|
214
259
|
async execute(params) {
|
|
215
|
-
|
|
260
|
+
const requestId = `req_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
|
|
216
261
|
const pendingCount = this.requestLimit.pendingCount;
|
|
217
262
|
const activeCount = this.requestLimit.activeCount;
|
|
218
263
|
|
|
219
|
-
//
|
|
220
|
-
|
|
221
|
-
|
|
264
|
+
// 记录请求指标
|
|
265
|
+
metrics.recordRequest();
|
|
266
|
+
|
|
267
|
+
// 检查是否超过最大并发限制
|
|
268
|
+
if (activeCount >= MAX_CONCURRENT_REQUESTS && pendingCount >= MAX_CONCURRENT_REQUESTS) {
|
|
269
|
+
logger.warn(`[${requestId}] Request rejected: queue full (${activeCount} active, ${pendingCount} pending)`);
|
|
270
|
+
metrics.recordError('queue_full');
|
|
222
271
|
return {
|
|
223
272
|
success: false,
|
|
224
|
-
error:
|
|
273
|
+
error: `服务繁忙,已达到最大并发数 ${MAX_CONCURRENT_REQUESTS}。请稍后重试。`,
|
|
274
|
+
requestId,
|
|
275
|
+
activeRequests: activeCount,
|
|
276
|
+
pendingRequests: pendingCount,
|
|
225
277
|
};
|
|
226
278
|
}
|
|
227
279
|
|
|
228
|
-
//
|
|
229
|
-
|
|
280
|
+
// 记录活跃请求
|
|
281
|
+
activeRequests.set(requestId, {
|
|
282
|
+
startTime: Date.now(),
|
|
283
|
+
query: params.query,
|
|
284
|
+
status: 'queued',
|
|
285
|
+
});
|
|
286
|
+
|
|
287
|
+
logger.info(`[${requestId}] Request queued: ${activeCount} active, ${pendingCount} pending, query="${params.query}"`);
|
|
288
|
+
|
|
289
|
+
try {
|
|
290
|
+
// 使用并发限制器,支持多个请求并行
|
|
291
|
+
const result = await this.requestLimit(async () => {
|
|
292
|
+
activeRequests.get(requestId).status = 'processing';
|
|
293
|
+
logger.info(`[${requestId}] Processing started`);
|
|
294
|
+
return await this._executeInternal(params, requestId);
|
|
295
|
+
});
|
|
296
|
+
|
|
297
|
+
result.requestId = requestId;
|
|
298
|
+
result.processingTime = Date.now() - activeRequests.get(requestId).startTime;
|
|
299
|
+
return result;
|
|
300
|
+
} catch (error) {
|
|
301
|
+
logger.error(`[${requestId}] Request failed: ${error.message}`);
|
|
302
|
+
metrics.recordError(error);
|
|
303
|
+
return {
|
|
304
|
+
success: false,
|
|
305
|
+
error: error.message,
|
|
306
|
+
requestId,
|
|
307
|
+
};
|
|
308
|
+
} finally {
|
|
309
|
+
activeRequests.delete(requestId);
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
/**
|
|
314
|
+
* 获取当前请求状态
|
|
315
|
+
*/
|
|
316
|
+
static getStatus() {
|
|
317
|
+
return {
|
|
318
|
+
activeRequests: Array.from(activeRequests.entries()).map(([id, info]) => ({
|
|
319
|
+
id,
|
|
320
|
+
query: info.query,
|
|
321
|
+
status: info.status,
|
|
322
|
+
duration: Date.now() - info.startTime,
|
|
323
|
+
})),
|
|
324
|
+
activeCount: activeRequests.size,
|
|
325
|
+
maxConcurrent: MAX_CONCURRENT_REQUESTS,
|
|
326
|
+
};
|
|
230
327
|
}
|
|
231
328
|
|
|
232
329
|
/**
|