smart-image-scraper-mcp 2.5.2 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/package.json +1 -1
- package/src/config/index.js +4 -4
- package/src/index.backup.js +340 -0
- package/src/index.js +59 -198
- package/src/index.new.js +213 -0
- package/src/index.simple.js +213 -0
- package/src/infrastructure/cache.js +1 -0
- package/src/infrastructure/gracefulShutdown.js +4 -0
- package/src/infrastructure/httpClient.js +15 -5
- package/src/services/fileManager.js +61 -18
- package/src/services/linkValidator.js +15 -8
- package/src/services/orchestrator.js +8 -10
- package/src/services/orchestrator.simple.js +259 -0
|
@@ -8,8 +8,8 @@ import httpClient from '../infrastructure/httpClient.js';
|
|
|
8
8
|
import logger from '../infrastructure/logger.js';
|
|
9
9
|
import config from '../config/index.js';
|
|
10
10
|
|
|
11
|
-
//
|
|
12
|
-
const MAX_VALIDATE_CONCURRENCY =
|
|
11
|
+
// 并发验证配置 - 降低并发避免连接池耗尽
|
|
12
|
+
const MAX_VALIDATE_CONCURRENCY = 15; // 验证并发 15
|
|
13
13
|
const globalValidateLimit = pLimit(MAX_VALIDATE_CONCURRENCY);
|
|
14
14
|
|
|
15
15
|
export class LinkValidator {
|
|
@@ -45,11 +45,11 @@ export class LinkValidator {
|
|
|
45
45
|
}
|
|
46
46
|
|
|
47
47
|
const controller = new AbortController();
|
|
48
|
-
const timeoutId = setTimeout(() => controller.abort(),
|
|
48
|
+
const timeoutId = setTimeout(() => controller.abort(), 3000); // 超时 3 秒
|
|
49
49
|
|
|
50
50
|
try {
|
|
51
51
|
const response = await httpClient.head(url, {
|
|
52
|
-
timeout:
|
|
52
|
+
timeout: 2500, // 超时 2.5 秒
|
|
53
53
|
maxRedirects: 1, // 最多 1 次重定向
|
|
54
54
|
signal: controller.signal,
|
|
55
55
|
});
|
|
@@ -68,6 +68,10 @@ export class LinkValidator {
|
|
|
68
68
|
return { url, valid: false, error: `status=${response.status}` };
|
|
69
69
|
} catch (error) {
|
|
70
70
|
clearTimeout(timeoutId);
|
|
71
|
+
// 确保 abort controller 被清理
|
|
72
|
+
if (!controller.signal.aborted) {
|
|
73
|
+
controller.abort();
|
|
74
|
+
}
|
|
71
75
|
return { url, valid: false, error: 'timeout' };
|
|
72
76
|
}
|
|
73
77
|
}
|
|
@@ -178,12 +182,12 @@ export class LinkValidator {
|
|
|
178
182
|
*/
|
|
179
183
|
async _validateWithGet(url, fetchQuality = false) {
|
|
180
184
|
const controller = new AbortController();
|
|
181
|
-
const timeoutId = setTimeout(() => controller.abort(),
|
|
185
|
+
const timeoutId = setTimeout(() => controller.abort(), 5000);
|
|
182
186
|
|
|
183
187
|
try {
|
|
184
188
|
const response = await httpClient.get(url, {
|
|
185
|
-
timeout:
|
|
186
|
-
maxRedirects:
|
|
189
|
+
timeout: 4000,
|
|
190
|
+
maxRedirects: 2,
|
|
187
191
|
responseType: 'arraybuffer',
|
|
188
192
|
signal: controller.signal,
|
|
189
193
|
headers: {
|
|
@@ -205,7 +209,10 @@ export class LinkValidator {
|
|
|
205
209
|
return { url, valid: false, error: `GET validation failed: status=${response.status}` };
|
|
206
210
|
} catch (error) {
|
|
207
211
|
clearTimeout(timeoutId);
|
|
208
|
-
|
|
212
|
+
// 确保 abort controller 被清理
|
|
213
|
+
if (!controller.signal.aborted) {
|
|
214
|
+
controller.abort();
|
|
215
|
+
}
|
|
209
216
|
if (error.name === 'AbortError' || error.code === 'ERR_CANCELED') {
|
|
210
217
|
return { url, valid: false, error: 'Request timeout' };
|
|
211
218
|
}
|
|
@@ -20,8 +20,8 @@ import { searchCache } from '../infrastructure/cache.js';
|
|
|
20
20
|
import { metrics } from '../infrastructure/metrics.js';
|
|
21
21
|
import { requestQueue } from '../infrastructure/requestQueue.js';
|
|
22
22
|
|
|
23
|
-
//
|
|
24
|
-
const MAX_CONCURRENT_KEYWORDS =
|
|
23
|
+
// 并发配置 - 降低并发避免资源耗尽
|
|
24
|
+
const MAX_CONCURRENT_KEYWORDS = 2; // 每个请求内并行 2 个关键词
|
|
25
25
|
|
|
26
26
|
// 关键词并发限制器
|
|
27
27
|
const globalKeywordLimit = pLimit(MAX_CONCURRENT_KEYWORDS);
|
|
@@ -68,9 +68,8 @@ export class Orchestrator {
|
|
|
68
68
|
// 多搜索一些以确保有足够的结果
|
|
69
69
|
const searchCount = Math.max(count * 3, 10);
|
|
70
70
|
|
|
71
|
-
//
|
|
72
|
-
const
|
|
73
|
-
const cachedUrls = searchCache.getSearchResult(keyword, source, cacheKey);
|
|
71
|
+
// 使用统一的缓存键策略(options 已包含 size, aspect, safeSearch)
|
|
72
|
+
const cachedUrls = searchCache.getSearchResult(keyword, source, options);
|
|
74
73
|
let rawUrls;
|
|
75
74
|
|
|
76
75
|
if (cachedUrls && cachedUrls.length >= count) {
|
|
@@ -81,7 +80,7 @@ export class Orchestrator {
|
|
|
81
80
|
logger.info(`[SEARCH] "${keyword}" (target: ${searchCount})...`);
|
|
82
81
|
rawUrls = await scraper.search(keyword, searchCount, options);
|
|
83
82
|
if (rawUrls.length > 0) {
|
|
84
|
-
searchCache.setSearchResult(keyword, source,
|
|
83
|
+
searchCache.setSearchResult(keyword, source, options, rawUrls);
|
|
85
84
|
}
|
|
86
85
|
metrics.recordCacheMiss();
|
|
87
86
|
}
|
|
@@ -153,9 +152,8 @@ export class Orchestrator {
|
|
|
153
152
|
// 统一搜索数量策略:与 Link 模式一致
|
|
154
153
|
const searchCount = Math.max(count * 3, 10);
|
|
155
154
|
|
|
156
|
-
//
|
|
157
|
-
const
|
|
158
|
-
const cachedUrls = searchCache.getSearchResult(keyword, source, cacheKey);
|
|
155
|
+
// 尝试从缓存获取(options 已包含 size, aspect, safeSearch)
|
|
156
|
+
const cachedUrls = searchCache.getSearchResult(keyword, source, options);
|
|
159
157
|
let rawUrls;
|
|
160
158
|
|
|
161
159
|
if (cachedUrls && cachedUrls.length >= count) {
|
|
@@ -166,7 +164,7 @@ export class Orchestrator {
|
|
|
166
164
|
logger.info(`[SEARCH] "${keyword}" (target: ${searchCount})...`);
|
|
167
165
|
rawUrls = await scraper.search(keyword, searchCount, options);
|
|
168
166
|
if (rawUrls.length > 0) {
|
|
169
|
-
searchCache.setSearchResult(keyword, source,
|
|
167
|
+
searchCache.setSearchResult(keyword, source, options, rawUrls);
|
|
170
168
|
}
|
|
171
169
|
metrics.recordCacheMiss();
|
|
172
170
|
}
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 编排器 - 简化版
|
|
3
|
+
* 模仿主流 MCP 的实现方式:无状态、无全局缓存、每次请求独立
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import pLimit from 'p-limit';
|
|
7
|
+
import { getScraper } from '../providers/index.js';
|
|
8
|
+
import { LinkValidator } from './linkValidator.js';
|
|
9
|
+
import { FileManager } from './fileManager.js';
|
|
10
|
+
import { ImageProcessor } from './imageProcessor.js';
|
|
11
|
+
import config from '../config/index.js';
|
|
12
|
+
|
|
13
|
+
export class Orchestrator {
  /**
   * Creates an orchestrator whose concurrency limiter and service helpers
   * are owned by this instance (no module-level limiter or cache is used).
   */
  constructor() {
    // Per-instance limiter: at most 2 keywords are processed concurrently.
    this.keywordLimit = pLimit(2);
    this.linkValidator = new LinkValidator();
    this.fileManager = new FileManager();
    this.imageProcessor = new ImageProcessor();
  }

  /**
   * Splits a comma-separated query string into trimmed, non-empty keywords.
   *
   * @param {string} query - Raw query, e.g. `"cat, dog"`.
   * @returns {string[]} Keywords in input order; `[]` for empty or non-string input.
   */
  parseKeywords(query) {
    if (!query || typeof query !== 'string') return [];
    return query
      .split(',')
      .map((part) => part.trim())
      .filter((part) => part.length > 0);
  }

  /**
   * Runs one scrape task for every keyword contained in `params.query`.
   *
   * `mode === 'link'` returns image URLs; every other mode value is handled
   * as a download. The top-level `success` flag is `true` whenever at least
   * one keyword was parsed, even if individual keywords failed; per-keyword
   * outcomes are reported in `results`, `successCount` and `failedCount`.
   *
   * @param {object} params - Task parameters (query, mode, count, source,
   *   size, safeSearch, aspect, targetSize, fit, position).
   * @returns {Promise<object>} Aggregated report, or
   *   `{ success: false, error }` when no keyword could be parsed.
   */
  async execute(params) {
    // Tolerate a missing params object: it ends up in the "no keywords" branch.
    const {
      query, mode, count = 10, source = 'bing',
      size = 'all', safeSearch = 'moderate', aspect = 'all',
      targetSize = null, fit = 'cover', position = 'center',
    } = params || {};

    const options = { size, safeSearch, aspect, targetSize, fit, position };
    const startTime = Date.now();
    const keywords = this.parseKeywords(query);

    if (keywords.length === 0) {
      return { success: false, error: '请提供有效的搜索关键词' };
    }

    // Pick the per-keyword handler according to the requested mode.
    const processFunc = mode === 'link'
      ? this._processLink.bind(this)
      : this._processDownload.bind(this);

    // Process keywords concurrently, throttled by the instance-level limiter.
    const results = await Promise.all(
      keywords.map((keyword) =>
        this.keywordLimit(() => processFunc(keyword, count, source, options)),
      ),
    );

    const successCount = results.filter((r) => r.success).length;

    return {
      success: true,
      mode,
      source,
      totalKeywords: keywords.length,
      successCount,
      failedCount: results.length - successCount,
      results,
      duration: Date.now() - startTime,
    };
  }

  /**
   * Searches `source` for `keyword`, requesting more results than needed
   * (`max(count * 2, 10)`), without any caching.
   *
   * @returns {Promise<Array>} The scraper's results; a non-array response is
   *   normalised to `[]` so callers can report "no images" instead of
   *   failing with a TypeError on `.length`.
   */
  async _searchUrls(keyword, count, source, options) {
    const scraper = getScraper(source);
    const searchCount = Math.max(count * 2, 10);
    const found = await scraper.search(keyword, searchCount, options);
    return Array.isArray(found) ? found : [];
  }

  /**
   * Link mode: returns the first `count` search results as-is (fast mode,
   * the URLs are not validated here).
   *
   * @returns {Promise<object>} Per-keyword result; errors are caught and
   *   reported as `{ success: false, error }` rather than thrown.
   */
  async _processLink(keyword, count, source, options) {
    const startTime = Date.now();

    try {
      const rawUrls = await this._searchUrls(keyword, count, source, options);

      if (rawUrls.length === 0) {
        return {
          keyword,
          success: false,
          error: '未找到任何图片',
          duration: Date.now() - startTime,
        };
      }

      const resultUrls = rawUrls.slice(0, count);

      return {
        keyword,
        success: true,
        mode: 'link',
        totalSearched: rawUrls.length,
        urls: resultUrls,
        count: resultUrls.length,
        fastMode: true,
        duration: Date.now() - startTime,
      };
    } catch (error) {
      return {
        keyword,
        success: false,
        error: error.message,
        duration: Date.now() - startTime,
      };
    }
  }

  /**
   * Download mode: searches, downloads the results, optionally post-processes
   * the first `count` downloads to `options.targetSize`, then saves metadata.
   *
   * @returns {Promise<object>} Per-keyword result; errors are caught and
   *   reported as `{ success: false, error }` rather than thrown.
   */
  async _processDownload(keyword, count, source, options) {
    const startTime = Date.now();

    try {
      const rawUrls = await this._searchUrls(keyword, count, source, options);

      if (rawUrls.length === 0) {
        return {
          keyword,
          success: false,
          error: '未找到任何图片',
          duration: Date.now() - startTime,
        };
      }

      const { success, failed } = await this.fileManager.downloadMany(rawUrls, keyword);
      let resultDownloads = success.slice(0, count);

      // Optional post-processing when a target size was requested and parses.
      let processedCount = 0;
      let processFailedCount = 0;
      if (options.targetSize && resultDownloads.length > 0) {
        const targetSize = this.imageProcessor.parseTargetSize(options.targetSize);
        if (targetSize) {
          const processResult = await this.imageProcessor.processMany(resultDownloads, {
            width: targetSize.width,
            height: targetSize.height,
            fit: options.fit || 'cover',
            position: options.position || 'center',
          });
          // Only successfully processed files are kept in the final result.
          resultDownloads = processResult.success;
          processedCount = processResult.success.length;
          processFailedCount = processResult.failed.length;
        }
      }

      // Metadata is written only when at least one file remains.
      let metadataPath = null;
      if (resultDownloads.length > 0) {
        metadataPath = await this.fileManager.saveMetadata(keyword, resultDownloads);
      }

      return {
        keyword,
        success: true,
        mode: 'download',
        totalSearched: rawUrls.length,
        totalDownloaded: success.length,
        totalFailed: failed.length,
        totalProcessed: processedCount,
        totalProcessFailed: processFailedCount,
        files: resultDownloads,
        count: resultDownloads.length,
        saveDir: this.fileManager.getKeywordDir(keyword),
        metadataPath,
        targetSize: options.targetSize || null,
        duration: Date.now() - startTime,
      };
    } catch (error) {
      return {
        keyword,
        success: false,
        error: error.message,
        duration: Date.now() - startTime,
      };
    }
  }

  /**
   * Renders a task result (as returned by `execute`) as a Markdown report.
   *
   * Missing counters are rendered as 0 and missing `urls` / `files` /
   * `results` lists as empty, in both the link and the download branch.
   *
   * @param {object} result - Aggregated result from `execute`.
   * @returns {string} Markdown text.
   */
  formatResult(result) {
    if (!result.success) {
      return `## ❌ 任务失败\n\n**错误原因**: ${result.error}`;
    }

    const lines = [];
    lines.push(`# 📷 图片抓取报告`);
    lines.push('');
    lines.push(`- **模式**: ${result.mode === 'link' ? '链接提取' : '本地下载'}`);
    lines.push(`- **搜索源**: ${result.source}`);
    lines.push(`- **关键词数量**: ${result.totalKeywords}`);
    lines.push(`- **成功**: ${result.successCount} | **失败**: ${result.failedCount}`);
    lines.push(`- **总耗时**: ${(result.duration / 1000).toFixed(2)}秒`);
    lines.push('');

    for (const r of result.results || []) {
      lines.push(`## 🔍 关键词: ${r.keyword}`);
      lines.push('');

      if (!r.success) {
        lines.push(`❌ **失败**: ${r.error}`);
        lines.push('');
        continue;
      }

      if (r.mode === 'link') {
        lines.push(`- 搜索到: ${r.totalSearched || 0} 张`);
        lines.push(`- 返回: ${r.count || 0} 张`);
        lines.push(`- 耗时: ${(r.duration / 1000).toFixed(2)}秒`);
        lines.push('');
        lines.push('### 有效链接');
        lines.push('');
        (r.urls || []).forEach((url, i) => {
          lines.push(`${i + 1}. ${url}`);
        });
      } else {
        lines.push(`- 搜索到: ${r.totalSearched || 0} 张`);
        lines.push(`- 下载成功: ${r.totalDownloaded || 0} 张`);
        lines.push(`- 下载失败: ${r.totalFailed || 0} 张`);
        if (r.targetSize) {
          lines.push(`- 尺寸处理: ${r.totalProcessed || 0} 成功, ${r.totalProcessFailed || 0} 失败`);
          lines.push(`- 目标尺寸: ${r.targetSize}`);
        }
        lines.push(`- 最终保存: ${r.count || 0} 张`);
        lines.push(`- 存储目录: \`${r.saveDir}\``);
        lines.push(`- 耗时: ${(r.duration / 1000).toFixed(2)}秒`);
        lines.push('');
        lines.push('### 已下载文件');
        lines.push('');
        (r.files || []).forEach((file, i) => {
          lines.push(`${i + 1}. \`${file.path}\``);
        });
      }
      lines.push('');
    }

    return lines.join('\n');
  }
}
|
|
258
|
+
|
|
259
|
+
export default Orchestrator;
|