smart-image-scraper-mcp 2.5.2 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,8 +20,8 @@ import { searchCache } from '../infrastructure/cache.js';
20
20
  import { metrics } from '../infrastructure/metrics.js';
21
21
  import { requestQueue } from '../infrastructure/requestQueue.js';
22
22
 
23
- // 极速并发配置
24
- const MAX_CONCURRENT_KEYWORDS = 3; // 每个请求内并行 3 个关键词
23
+ // 并发配置 - 降低并发避免资源耗尽
24
+ const MAX_CONCURRENT_KEYWORDS = 2; // 每个请求内并行 2 个关键词
25
25
 
26
26
  // 关键词并发限制器
27
27
  const globalKeywordLimit = pLimit(MAX_CONCURRENT_KEYWORDS);
@@ -50,6 +50,25 @@ export class Orchestrator {
50
50
  .filter(k => k.length > 0);
51
51
  }
52
52
 
53
+ /**
54
+ * 解析最小文件大小参数
55
+ * @param {string} minFileSize - 最小文件大小字符串
56
+ * @returns {number} - 字节数
57
+ */
58
+ _parseMinFileSize(minFileSize) {
59
+ if (!minFileSize || minFileSize === 'any') return 0;
60
+
61
+ const sizeMap = {
62
+ '50kb': 50 * 1024,
63
+ '100kb': 100 * 1024,
64
+ '200kb': 200 * 1024,
65
+ '500kb': 500 * 1024,
66
+ '1mb': 1024 * 1024,
67
+ };
68
+
69
+ return sizeMap[minFileSize.toLowerCase()] || 0;
70
+ }
71
+
53
72
  /**
54
73
  * 处理单个关键词 - Link 模式
55
74
  * @param {string} keyword - 关键词
@@ -60,17 +79,19 @@ export class Orchestrator {
60
79
  */
61
80
  async processKeywordLink(keyword, count, source, options = {}) {
62
81
  const startTime = Date.now();
63
- const fastMode = options.fastMode !== false;
64
- const prioritizeQuality = options.prioritizeQuality === true;
82
+ // 根据 quality 参数决定模式
83
+ const qualityMode = options.quality || 'balanced';
84
+ const fastMode = qualityMode === 'fast';
85
+ const prioritizeQuality = qualityMode === 'high';
86
+ const minFileSize = this._parseMinFileSize(options.minFileSize);
65
87
 
66
88
  try {
67
89
  const scraper = getScraper(source);
68
90
  // 多搜索一些以确保有足够的结果
69
91
  const searchCount = Math.max(count * 3, 10);
70
92
 
71
- // 使用统一的缓存键策略
72
- const cacheKey = { keyword, source, size: options.size, aspect: options.aspect };
73
- const cachedUrls = searchCache.getSearchResult(keyword, source, cacheKey);
93
+ // 使用统一的缓存键策略(options 已包含 size, aspect, safeSearch)
94
+ const cachedUrls = searchCache.getSearchResult(keyword, source, options);
74
95
  let rawUrls;
75
96
 
76
97
  if (cachedUrls && cachedUrls.length >= count) {
@@ -81,7 +102,7 @@ export class Orchestrator {
81
102
  logger.info(`[SEARCH] "${keyword}" (target: ${searchCount})...`);
82
103
  rawUrls = await scraper.search(keyword, searchCount, options);
83
104
  if (rawUrls.length > 0) {
84
- searchCache.setSearchResult(keyword, source, cacheKey, rawUrls);
105
+ searchCache.setSearchResult(keyword, source, options, rawUrls);
85
106
  }
86
107
  metrics.recordCacheMiss();
87
108
  }
@@ -95,21 +116,36 @@ export class Orchestrator {
95
116
  };
96
117
  }
97
118
 
98
- // 快速模式:直接返回搜索结果(不验证)
119
+ // 根据 quality 模式处理
99
120
  let resultUrls;
100
- if (fastMode && !prioritizeQuality) {
101
- // 快速模式:直接使用搜索结果
121
+ let qualityModeLabel;
122
+
123
+ if (fastMode) {
124
+ // fast 模式:直接使用搜索结果,不验证
102
125
  resultUrls = rawUrls.slice(0, count);
126
+ qualityModeLabel = '快速模式(跳过验证)';
103
127
  logger.info(`[FAST] "${keyword}" - ${resultUrls.length} URLs`);
104
128
  } else {
105
- // 完整验证模式:验证不通过的继续搜索更多
129
+ // balanced 或 high 模式:验证链接
106
130
  const { valid } = await this.linkValidator.validateMany(rawUrls, {
107
131
  fetchQuality: prioritizeQuality,
108
132
  sortByQuality: prioritizeQuality,
133
+ minFileSize: minFileSize,
109
134
  });
110
- resultUrls = valid.slice(0, count).map(v => v.url);
111
135
 
112
- // 如果验证通过的不够,记录警告
136
+ // 过滤最小文件大小
137
+ let filteredValid = valid;
138
+ if (minFileSize > 0) {
139
+ filteredValid = valid.filter(v => {
140
+ const size = v.quality?.contentLength || 0;
141
+ return size >= minFileSize || size === 0; // size=0 表示未知,保留
142
+ });
143
+ logger.info(`[FILTER] minFileSize=${options.minFileSize}: ${valid.length} -> ${filteredValid.length}`);
144
+ }
145
+
146
+ resultUrls = filteredValid.slice(0, count).map(v => v.url);
147
+ qualityModeLabel = prioritizeQuality ? '高质量模式(验证+排序)' : '平衡模式(验证)';
148
+
113
149
  if (resultUrls.length < count) {
114
150
  logger.warn(`[VALIDATE] "${keyword}" - only ${resultUrls.length}/${count} valid`);
115
151
  }
@@ -122,7 +158,8 @@ export class Orchestrator {
122
158
  totalSearched: rawUrls.length,
123
159
  urls: resultUrls,
124
160
  count: resultUrls.length,
125
- fastMode,
161
+ qualityMode,
162
+ qualityModeLabel,
126
163
  duration: Date.now() - startTime,
127
164
  };
128
165
  } catch (error) {
@@ -146,16 +183,18 @@ export class Orchestrator {
146
183
  */
147
184
  async processKeywordDownload(keyword, count, source, options = {}) {
148
185
  const startTime = Date.now();
149
- const prioritizeQuality = options.prioritizeQuality !== false;
186
+ // 根据 quality 参数决定模式(download 模式默认高质量)
187
+ const qualityMode = options.quality || 'balanced';
188
+ const prioritizeQuality = qualityMode !== 'fast';
189
+ const minFileSize = this._parseMinFileSize(options.minFileSize);
150
190
 
151
191
  try {
152
192
  const scraper = getScraper(source);
153
193
  // 统一搜索数量策略:与 Link 模式一致
154
194
  const searchCount = Math.max(count * 3, 10);
155
195
 
156
- // 尝试从缓存获取(使用统一的缓存键策略)
157
- const cacheKey = { keyword, source, size: options.size, aspect: options.aspect };
158
- const cachedUrls = searchCache.getSearchResult(keyword, source, cacheKey);
196
+ // 尝试从缓存获取(options 已包含 size, aspect, safeSearch)
197
+ const cachedUrls = searchCache.getSearchResult(keyword, source, options);
159
198
  let rawUrls;
160
199
 
161
200
  if (cachedUrls && cachedUrls.length >= count) {
@@ -166,7 +205,7 @@ export class Orchestrator {
166
205
  logger.info(`[SEARCH] "${keyword}" (target: ${searchCount})...`);
167
206
  rawUrls = await scraper.search(keyword, searchCount, options);
168
207
  if (rawUrls.length > 0) {
169
- searchCache.setSearchResult(keyword, source, cacheKey, rawUrls);
208
+ searchCache.setSearchResult(keyword, source, options, rawUrls);
170
209
  }
171
210
  metrics.recordCacheMiss();
172
211
  }
@@ -180,17 +219,29 @@ export class Orchestrator {
180
219
  };
181
220
  }
182
221
 
183
- // 先验证链接并按质量排序
222
+ // 根据 quality 模式处理
184
223
  let urlsToDownload = rawUrls.slice(0, searchCount);
185
224
  if (prioritizeQuality) {
186
- logger.info(`Validating and sorting ${urlsToDownload.length} URLs by quality...`);
225
+ const sortByQuality = qualityMode === 'high';
226
+ logger.info(`Validating ${urlsToDownload.length} URLs (quality=${qualityMode})...`);
187
227
  const { valid } = await this.linkValidator.validateMany(urlsToDownload, {
188
- fetchQuality: true,
189
- sortByQuality: true,
228
+ fetchQuality: sortByQuality,
229
+ sortByQuality: sortByQuality,
230
+ minFileSize: minFileSize,
190
231
  });
191
- // 使用排序后的URL列表
192
- urlsToDownload = valid.map(v => v.url);
193
- logger.info(`Quality sorted: ${urlsToDownload.length} valid URLs`);
232
+
233
+ // 过滤最小文件大小
234
+ let filteredValid = valid;
235
+ if (minFileSize > 0) {
236
+ filteredValid = valid.filter(v => {
237
+ const size = v.quality?.contentLength || 0;
238
+ return size >= minFileSize || size === 0;
239
+ });
240
+ logger.info(`[FILTER] minFileSize: ${valid.length} -> ${filteredValid.length}`);
241
+ }
242
+
243
+ urlsToDownload = filteredValid.map(v => v.url);
244
+ logger.info(`Quality filtered: ${urlsToDownload.length} valid URLs`);
194
245
  }
195
246
 
196
247
  // 下载图片(已按质量排序,高质量优先)
@@ -362,11 +413,7 @@ export class Orchestrator {
362
413
 
363
414
  if (r.mode === 'link') {
364
415
  lines.push(`- 搜索到: ${r.totalSearched || 0} 张`);
365
- if (r.fastMode) {
366
- lines.push(`- 模式: 快速模式(跳过验证)`);
367
- } else {
368
- lines.push(`- 验证通过: ${r.totalValidated || r.count || 0} 张`);
369
- }
416
+ lines.push(`- 质量模式: ${r.qualityModeLabel || '快速模式'}`);
370
417
  lines.push(`- 返回: ${r.count || 0} 张`);
371
418
  lines.push(`- 耗时: ${(r.duration / 1000).toFixed(2)}秒`);
372
419
  lines.push('');
@@ -0,0 +1,259 @@
1
+ /**
2
+ * 编排器 - 简化版
3
+ * 模仿主流 MCP 的实现方式:无状态、无全局缓存、每次请求独立
4
+ */
5
+
6
+ import pLimit from 'p-limit';
7
+ import { getScraper } from '../providers/index.js';
8
+ import { LinkValidator } from './linkValidator.js';
9
+ import { FileManager } from './fileManager.js';
10
+ import { ImageProcessor } from './imageProcessor.js';
11
+ import config from '../config/index.js';
12
+
13
/**
 * Orchestrator - simplified edition.
 *
 * Coordinates one scraping request: splits the query into keywords, runs each
 * keyword through either the link pipeline or the download pipeline (with a
 * per-instance concurrency limit), and renders the aggregated result as
 * Markdown. No search cache is used; every request searches afresh.
 */
export class Orchestrator {
  constructor() {
    // Per-instance limiter (at most 2 keywords in flight) instead of a
    // module-level one, so instances do not share concurrency state.
    this.keywordLimit = pLimit(2);
    this.linkValidator = new LinkValidator();
    this.fileManager = new FileManager();
    this.imageProcessor = new ImageProcessor();
  }

  /**
   * Split a comma-separated query string into trimmed, non-empty keywords.
   *
   * @param {string} query - Comma-separated keywords.
   * @returns {string[]} Keywords in input order; [] for empty or non-string input.
   */
  parseKeywords(query) {
    if (!query || typeof query !== 'string') return [];
    return query
      .split(',')
      .map((part) => part.trim())
      .filter((part) => part.length > 0);
  }

  /**
   * Coerce a requested result count to a non-negative integer.
   *
   * Negative or non-numeric values become 0; passing them straight to
   * Array.prototype.slice would either drop items from the END of the list
   * (negative) or produce a NaN search size (non-numeric).
   *
   * @param {*} [count=10] - Requested number of results.
   * @returns {number} A finite integer >= 0.
   */
  _normalizeCount(count = 10) {
    const parsed = Math.trunc(Number(count));
    return Number.isFinite(parsed) && parsed > 0 ? parsed : 0;
  }

  /**
   * Build the per-keyword failure record.
   *
   * @param {string} keyword - Keyword being processed.
   * @param {string} error - Human-readable failure reason.
   * @param {number} startTime - Date.now() timestamp taken when processing began.
   * @returns {{keyword: string, success: false, error: string, duration: number}}
   */
  _failure(keyword, error, startTime) {
    return { keyword, success: false, error, duration: Date.now() - startTime };
  }

  /**
   * Extract a printable message from anything that was thrown.
   * Falls back to String(value) for non-Error throws (strings, numbers, ...),
   * which have no `.message`.
   *
   * @param {*} thrown - The caught value.
   * @returns {string} Failure message.
   */
  _errorMessage(thrown) {
    return thrown?.message ?? String(thrown);
  }

  /**
   * Run a scraping task for every keyword in the query.
   *
   * @param {object} [params] - Request parameters: query, mode ('link' selects
   *   the link pipeline, anything else the download pipeline), count, source,
   *   size, safeSearch, aspect, targetSize, fit, position.
   * @returns {Promise<object>} Aggregated result; `success: false` with an
   *   error message when the query contains no keywords.
   */
  async execute(params) {
    const {
      query, mode, count = 10, source = 'bing',
      size = 'all', safeSearch = 'moderate', aspect = 'all',
      targetSize = null, fit = 'cover', position = 'center',
    } = params ?? {}; // a missing params object is reported as "no keywords" below

    const options = { size, safeSearch, aspect, targetSize, fit, position };
    const startTime = Date.now();
    const keywords = this.parseKeywords(query);

    if (keywords.length === 0) {
      return { success: false, error: '请提供有效的搜索关键词' };
    }

    // Pick the per-keyword pipeline for the requested mode.
    const processKeyword = mode === 'link'
      ? (keyword) => this._processLink(keyword, count, source, options)
      : (keyword) => this._processDownload(keyword, count, source, options);

    // Both pipelines catch their own errors and resolve to a result record,
    // so Promise.all cannot reject because of a single keyword.
    const results = await Promise.all(
      keywords.map((keyword) => this.keywordLimit(() => processKeyword(keyword))),
    );

    const successCount = results.filter((r) => r.success).length;

    return {
      success: true,
      mode,
      source,
      totalKeywords: keywords.length,
      successCount,
      failedCount: results.length - successCount,
      results,
      duration: Date.now() - startTime,
    };
  }

  /**
   * Link mode: search and return the first `count` URLs without validation.
   *
   * @param {string} keyword - Search keyword.
   * @param {number} count - Maximum number of URLs to return.
   * @param {string} source - Scraper/source identifier passed to getScraper.
   * @param {object} [options] - Search options forwarded to the scraper.
   * @returns {Promise<object>} Per-keyword result record (never rejects).
   */
  async _processLink(keyword, count, source, options = {}) {
    const startTime = Date.now();

    try {
      const limit = this._normalizeCount(count);
      const scraper = getScraper(source);
      // Over-fetch (2x, minimum 10) so there are enough candidates.
      const searchCount = Math.max(limit * 2, 10);

      // Search directly; this simplified design has no cache.
      const rawUrls = await scraper.search(keyword, searchCount, options);

      if (!Array.isArray(rawUrls) || rawUrls.length === 0) {
        return this._failure(keyword, '未找到任何图片', startTime);
      }

      // Fast path: hand back the search results as-is.
      const resultUrls = rawUrls.slice(0, limit);

      return {
        keyword,
        success: true,
        mode: 'link',
        totalSearched: rawUrls.length,
        urls: resultUrls,
        count: resultUrls.length,
        fastMode: true,
        duration: Date.now() - startTime,
      };
    } catch (error) {
      return this._failure(keyword, this._errorMessage(error), startTime);
    }
  }

  /**
   * Download mode: search, download, optionally resize, then save metadata.
   *
   * @param {string} keyword - Search keyword.
   * @param {number} count - Maximum number of files to keep.
   * @param {string} source - Scraper/source identifier passed to getScraper.
   * @param {object} [options] - Search options plus targetSize / fit / position.
   * @returns {Promise<object>} Per-keyword result record (never rejects).
   */
  async _processDownload(keyword, count, source, options = {}) {
    const startTime = Date.now();

    try {
      const limit = this._normalizeCount(count);
      const scraper = getScraper(source);
      const searchCount = Math.max(limit * 2, 10);

      const rawUrls = await scraper.search(keyword, searchCount, options);

      if (!Array.isArray(rawUrls) || rawUrls.length === 0) {
        return this._failure(keyword, '未找到任何图片', startTime);
      }

      // Download every candidate, then keep the first `limit` successes.
      const { success, failed } = await this.fileManager.downloadMany(rawUrls, keyword);
      let resultDownloads = success.slice(0, limit);

      // Optional post-processing when a target size was requested.
      let processedCount = 0;
      let processFailedCount = 0;
      if (options.targetSize && resultDownloads.length > 0) {
        const targetSize = this.imageProcessor.parseTargetSize(options.targetSize);
        if (targetSize) {
          const processResult = await this.imageProcessor.processMany(resultDownloads, {
            width: targetSize.width,
            height: targetSize.height,
            fit: options.fit || 'cover',
            position: options.position || 'center',
          });
          resultDownloads = processResult.success;
          processedCount = processResult.success.length;
          processFailedCount = processResult.failed.length;
        }
      }

      // Metadata is written only when at least one file survived.
      let metadataPath = null;
      if (resultDownloads.length > 0) {
        metadataPath = await this.fileManager.saveMetadata(keyword, resultDownloads);
      }

      return {
        keyword,
        success: true,
        mode: 'download',
        totalSearched: rawUrls.length,
        totalDownloaded: success.length,
        totalFailed: failed.length,
        totalProcessed: processedCount,
        totalProcessFailed: processFailedCount,
        files: resultDownloads,
        count: resultDownloads.length,
        saveDir: this.fileManager.getKeywordDir(keyword),
        metadataPath,
        targetSize: options.targetSize || null,
        duration: Date.now() - startTime,
      };
    } catch (error) {
      return this._failure(keyword, this._errorMessage(error), startTime);
    }
  }

  /**
   * Markdown lines describing one successful link-mode keyword result.
   *
   * @param {object} r - Per-keyword link result.
   * @returns {string[]} Report lines.
   */
  _formatLinkResult(r) {
    return [
      `- 搜索到: ${r.totalSearched || 0} 张`,
      `- 返回: ${r.count || 0} 张`,
      `- 耗时: ${(r.duration / 1000).toFixed(2)}秒`,
      '',
      '### 有效链接',
      '',
      ...(r.urls ?? []).map((url, i) => `${i + 1}. ${url}`),
    ];
  }

  /**
   * Markdown lines describing one successful download-mode keyword result.
   *
   * @param {object} r - Per-keyword download result.
   * @returns {string[]} Report lines.
   */
  _formatDownloadResult(r) {
    const lines = [
      `- 搜索到: ${r.totalSearched} 张`,
      `- 下载成功: ${r.totalDownloaded} 张`,
      `- 下载失败: ${r.totalFailed} 张`,
    ];
    if (r.targetSize) {
      lines.push(`- 尺寸处理: ${r.totalProcessed} 成功, ${r.totalProcessFailed} 失败`);
      lines.push(`- 目标尺寸: ${r.targetSize}`);
    }
    lines.push(
      `- 最终保存: ${r.count} 张`,
      `- 存储目录: \`${r.saveDir}\``,
      `- 耗时: ${(r.duration / 1000).toFixed(2)}秒`,
      '',
      '### 已下载文件',
      '',
      // `files` is guarded the same way `urls` is in link mode.
      ...(r.files ?? []).map((file, i) => `${i + 1}. \`${file.path}\``),
    );
    return lines;
  }

  /**
   * Render an execute() result as a Markdown report.
   *
   * @param {object} result - Value returned by execute().
   * @returns {string} Markdown text.
   */
  formatResult(result) {
    if (!result.success) {
      return `## ❌ 任务失败\n\n**错误原因**: ${result.error}`;
    }

    const lines = [
      `# 📷 图片抓取报告`,
      '',
      `- **模式**: ${result.mode === 'link' ? '链接提取' : '本地下载'}`,
      `- **搜索源**: ${result.source}`,
      `- **关键词数量**: ${result.totalKeywords}`,
      `- **成功**: ${result.successCount} | **失败**: ${result.failedCount}`,
      `- **总耗时**: ${(result.duration / 1000).toFixed(2)}秒`,
      '',
    ];

    for (const r of result.results ?? []) {
      lines.push(`## 🔍 关键词: ${r.keyword}`, '');

      if (!r.success) {
        lines.push(`❌ **失败**: ${r.error}`, '');
        continue;
      }

      const section = r.mode === 'link'
        ? this._formatLinkResult(r)
        : this._formatDownloadResult(r);
      lines.push(...section, '');
    }

    return lines.join('\n');
  }
}
258
+
259
+ export default Orchestrator;