smart-image-scraper-mcp 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "smart-image-scraper-mcp",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "description": "全网智能图片抓取 MCP 服务器 - 支持 Bing/Google 图片搜索、验证和下载",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -38,6 +38,7 @@
38
38
  "fs-extra": "^11.2.0",
39
39
  "https-proxy-agent": "^7.0.0",
40
40
  "p-limit": "^5.0.0",
41
+ "sharp": "^0.34.5",
41
42
  "socks-proxy-agent": "^8.0.0",
42
43
  "uuid": "^9.0.0"
43
44
  }
package/src/index.js CHANGED
@@ -46,20 +46,35 @@ const orchestrator = new Orchestrator();
46
46
  // 定义 Tool Schema
47
47
  const SMART_SCRAPER_TOOL = {
48
48
  name: 'smart_scraper',
49
- description: `全网智能图片抓取工具。
49
+ description: `全网智能图片抓取工具 - 从 Bing/Google 搜索、验证、下载高质量图片。
50
50
 
51
- 【功能】从 Bing/Google 搜索图片,支持验证链接和下载到本地。
51
+ 【核心功能】
52
+ 1. 搜索图片链接 (mode=link) - 返回验证过的图片URL列表
53
+ 2. 下载图片 (mode=download) - 下载到本地,自动按质量排序优先高清
54
+ 3. 尺寸统一 (targetSize) - 下载后自动裁剪/缩放到指定尺寸
55
+ 4. 宽高比过滤 (aspect) - 横向/竖向/正方形
52
56
 
53
- 【使用场景】
54
- - 用户说"找图片"、"搜索图片"使用 mode="link"
55
- - 用户说"下载图片"、"保存图片"使用 mode="download"
56
- - 用户说"高清"、"壁纸"使用 size="large" 或 "wallpaper"
57
- - 用户说"猫和狗的图片" → 使用 query="猫,狗"(逗号分隔)
57
+ 【参数选择指南】
58
+ - 用户要"找/搜索/查找图片" → mode="link"
59
+ - 用户要"下载/保存/获取图片" → mode="download"
60
+ - 用户要"高清/大图/壁纸" → size="large" 或 "wallpaper"
61
+ - 用户要"电脑壁纸/横屏/横向" → aspect="wide"
62
+ - 用户要"手机壁纸/竖屏/竖向" → aspect="tall"
63
+ - 用户要"统一尺寸/固定大小" → targetSize="1920x1080" 或预设名
64
+ - 用户要"多种类型图片" → query="猫,狗,鸟"(英文逗号分隔)
58
65
 
59
- 【示例调用】
60
- 1. 搜索10张猫咪图片链接: {"query":"猫咪","mode":"link","count":10}
61
- 2. 下载20张风景壁纸: {"query":"风景","mode":"download","count":20,"size":"wallpaper"}
62
- 3. 批量下载多类图片: {"query":"猫,狗,鸟","mode":"download","count":5}`,
66
+ 【预设尺寸名称】
67
+ - 电脑壁纸: desktop_1080p(1920x1080), desktop_2k(2560x1440), desktop_4k(3840x2160)
68
+ - 手机壁纸: mobile_hd(1080x1920), mobile_2k(1440x2560)
69
+ - 正方形: square_1080(1080x1080), square_512(512x512)
70
+ - 社交媒体: instagram(1080x1080), twitter(1200x675), facebook(1200x630)
71
+
72
+ 【调用示例】
73
+ 1. 搜索5张猫的图片: {"query":"可爱的猫","mode":"link","count":5}
74
+ 2. 下载10张高清风景图: {"query":"风景","mode":"download","count":10,"size":"large"}
75
+ 3. 下载电脑壁纸并统一为1080p: {"query":"风景","mode":"download","count":10,"aspect":"wide","targetSize":"desktop_1080p"}
76
+ 4. 下载手机壁纸: {"query":"动漫","mode":"download","count":10,"aspect":"tall","targetSize":"mobile_hd"}
77
+ 5. 批量下载多类图片: {"query":"猫,狗,兔子","mode":"download","count":5}`,
63
78
  inputSchema: {
64
79
  type: 'object',
65
80
  properties: {
@@ -74,7 +89,7 @@ const SMART_SCRAPER_TOOL = {
74
89
  },
75
90
  count: {
76
91
  type: 'number',
77
- description: '每个关键词获取的图片数量。范围1-100,推荐10-20。用户说"几张"用5-10,说"很多"用20-30',
92
+ description: '每个关键词获取的图片数量。范围1-100,推荐1-20。用户说"几张"用5-10,说"很多"用20-30',
78
93
  default: 10,
79
94
  },
80
95
  source: {
@@ -89,6 +104,22 @@ const SMART_SCRAPER_TOOL = {
89
104
  description: '图片尺寸。all=不限;small=小图/图标;medium=中图;large=大图/高清;wallpaper=壁纸级别(1080p+)',
90
105
  default: 'all',
91
106
  },
107
+ aspect: {
108
+ type: 'string',
109
+ enum: ['all', 'wide', 'tall', 'square'],
110
+ description: '图片宽高比。all=不限;wide=横向/宽屏(电脑壁纸);tall=纵向/竖屏(手机壁纸);square=正方形',
111
+ default: 'all',
112
+ },
113
+ targetSize: {
114
+ type: 'string',
115
+ description: '目标尺寸,下载后统一裁剪/缩放到此尺寸。格式: "宽x高"(如"1920x1080")或预设名(desktop_1080p/desktop_2k/desktop_4k/mobile_hd/mobile_2k/square_1080/instagram/twitter/facebook)',
116
+ },
117
+ fit: {
118
+ type: 'string',
119
+ enum: ['cover', 'contain', 'fill'],
120
+ description: '尺寸处理时的适应模式。cover=裁剪填充(默认,不留白);contain=包含留白;fill=拉伸填充',
121
+ default: 'cover',
122
+ },
92
123
  safeSearch: {
93
124
  type: 'string',
94
125
  enum: ['off', 'moderate', 'strict'],
@@ -221,6 +252,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
221
252
  count: count,
222
253
  source: source,
223
254
  size: args.size || 'all',
255
+ aspect: args.aspect || 'all',
256
+ targetSize: args.targetSize || null,
257
+ fit: args.fit || 'cover',
224
258
  safeSearch: args.safeSearch || 'moderate',
225
259
  }),
226
260
  timeoutPromise,
@@ -34,9 +34,9 @@ export class BingScraper extends BaseScraper {
34
34
  logger.info(`Searching Bing Images: ${keyword}, offset: ${offset}`);
35
35
 
36
36
  const response = await withRetry(
37
- () => httpClient.get(searchUrl),
37
+ () => httpClient.get(searchUrl, { timeout: 15000 }), // 添加超时
38
38
  {
39
- maxRetries: 3,
39
+ maxRetries: 2, // 减少重试次数
40
40
  retryCondition: isRetryableError,
41
41
  }
42
42
  );
@@ -47,7 +47,7 @@ export class BingScraper extends BaseScraper {
47
47
  }
48
48
 
49
49
  // 添加请求间隔,防止被封
50
- await this._delay(300 + Math.random() * 200);
50
+ await this._delay(200 + Math.random() * 100);
51
51
 
52
52
  const newUrls = this._parseResponse(response.data);
53
53
 
@@ -62,10 +62,15 @@ export class BingScraper extends BaseScraper {
62
62
  }
63
63
  });
64
64
 
65
+ // 如果已经获取足够数量,直接跳出
66
+ if (urls.size >= limit) {
67
+ break;
68
+ }
69
+
65
70
  offset += pageSize;
66
71
 
67
72
  // 防止无限循环
68
- if (offset > 500) {
73
+ if (offset > 200) { // 减少最大偏移量
69
74
  logger.warn('Reached maximum offset limit');
70
75
  break;
71
76
  }
@@ -74,6 +79,7 @@ export class BingScraper extends BaseScraper {
74
79
  logger.error(`Bing search error for "${keyword}"`, { message: error.message });
75
80
  }
76
81
 
82
+ logger.info(`Bing search complete: found ${urls.size} URLs for "${keyword}"`);
77
83
  return Array.from(urls);
78
84
  }
79
85
 
@@ -90,6 +96,14 @@ export class BingScraper extends BaseScraper {
90
96
  'all': '',
91
97
  };
92
98
 
99
+ // 宽高比过滤映射
100
+ const aspectMap = {
101
+ 'wide': '+filterui:aspect-wide', // 横向/宽屏 (16:9, 4:3 等)
102
+ 'tall': '+filterui:aspect-tall', // 纵向/竖屏 (9:16, 3:4 等)
103
+ 'square': '+filterui:aspect-square', // 正方形 (1:1)
104
+ 'all': '',
105
+ };
106
+
93
107
  // 安全搜索映射
94
108
  const safeSearchMap = {
95
109
  'off': 'off',
@@ -98,12 +112,16 @@ export class BingScraper extends BaseScraper {
98
112
  };
99
113
 
100
114
  const size = this.options?.size || 'all';
115
+ const aspect = this.options?.aspect || 'all';
101
116
  const safeSearch = this.options?.safeSearch || 'moderate';
102
117
 
103
118
  let qft = '+filterui:photo-photo';
104
119
  if (sizeMap[size]) {
105
120
  qft += sizeMap[size];
106
121
  }
122
+ if (aspectMap[aspect]) {
123
+ qft += aspectMap[aspect];
124
+ }
107
125
 
108
126
  const params = new URLSearchParams({
109
127
  q: keyword,
@@ -29,12 +29,13 @@ export class GoogleScraper extends BaseScraper {
29
29
  const pageSize = 20;
30
30
 
31
31
  try {
32
- while (urls.size < limit && start < 200) {
32
+ while (urls.size < limit && start < 100) { // 减少最大偏移
33
33
  const searchUrl = this._buildSearchUrl(keyword, start);
34
34
  logger.info(`Searching Google Images: ${keyword}, start: ${start}`);
35
35
 
36
36
  const response = await withRetry(
37
37
  () => httpClient.get(searchUrl, {
38
+ timeout: 15000, // 添加超时
38
39
  headers: {
39
40
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
40
41
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
@@ -43,7 +44,7 @@ export class GoogleScraper extends BaseScraper {
43
44
  },
44
45
  }),
45
46
  {
46
- maxRetries: 3,
47
+ maxRetries: 2, // 减少重试次数
47
48
  retryCondition: isRetryableError,
48
49
  }
49
50
  );
@@ -66,15 +67,21 @@ export class GoogleScraper extends BaseScraper {
66
67
  }
67
68
  });
68
69
 
70
+ // 如果已经获取足够数量,直接跳出
71
+ if (urls.size >= limit) {
72
+ break;
73
+ }
74
+
69
75
  start += pageSize;
70
76
 
71
77
  // 添加延迟防止被封
72
- await this._delay(500 + Math.random() * 500);
78
+ await this._delay(300 + Math.random() * 200);
73
79
  }
74
80
  } catch (error) {
75
81
  logger.error(`Google search error for "${keyword}"`, { message: error.message });
76
82
  }
77
83
 
84
+ logger.info(`Google search complete: found ${urls.size} URLs for "${keyword}"`);
78
85
  return Array.from(urls);
79
86
  }
80
87
 
@@ -91,6 +98,14 @@ export class GoogleScraper extends BaseScraper {
91
98
  'all': '',
92
99
  };
93
100
 
101
+ // 宽高比过滤映射 (Google 使用 iar 参数)
102
+ const aspectMap = {
103
+ 'wide': 'iar:w', // 横向/宽屏
104
+ 'tall': 'iar:t', // 纵向/竖屏
105
+ 'square': 'iar:s', // 正方形
106
+ 'all': '',
107
+ };
108
+
94
109
  // 安全搜索映射
95
110
  const safeSearchMap = {
96
111
  'off': 'off',
@@ -99,6 +114,7 @@ export class GoogleScraper extends BaseScraper {
99
114
  };
100
115
 
101
116
  const size = this.options?.size || 'all';
117
+ const aspect = this.options?.aspect || 'all';
102
118
  const safeSearch = this.options?.safeSearch || 'moderate';
103
119
 
104
120
  const params = new URLSearchParams({
@@ -109,9 +125,16 @@ export class GoogleScraper extends BaseScraper {
109
125
  safe: safeSearchMap[safeSearch] || 'medium',
110
126
  });
111
127
 
112
- // 添加尺寸过滤
128
+ // 构建 tbs 参数(尺寸 + 宽高比)
129
+ const tbsParts = [];
113
130
  if (sizeMap[size]) {
114
- params.set('tbs', sizeMap[size]);
131
+ tbsParts.push(sizeMap[size]);
132
+ }
133
+ if (aspectMap[aspect]) {
134
+ tbsParts.push(aspectMap[aspect]);
135
+ }
136
+ if (tbsParts.length > 0) {
137
+ params.set('tbs', tbsParts.join(','));
115
138
  }
116
139
 
117
140
  return `${this.baseUrl}?${params.toString()}`;
@@ -0,0 +1,242 @@
1
+ /**
2
+ * 图片处理器
3
+ * 负责图片的裁剪、缩放、尺寸统一处理
4
+ */
5
+
6
+ import sharp from 'sharp';
7
+ import fs from 'fs-extra';
8
+ import path from 'path';
9
+ import logger from '../infrastructure/logger.js';
10
+
11
export class ImageProcessor {
  /**
   * Named size presets. The keys are the preset names accepted wherever a
   * target size can be given by name.
   */
  static PRESETS = {
    // Desktop wallpapers
    'desktop_1080p': { width: 1920, height: 1080 },
    'desktop_2k': { width: 2560, height: 1440 },
    'desktop_4k': { width: 3840, height: 2160 },
    // Phone wallpapers
    'mobile_hd': { width: 1080, height: 1920 },
    'mobile_2k': { width: 1440, height: 2560 },
    // Squares
    'square_1080': { width: 1080, height: 1080 },
    'square_512': { width: 512, height: 512 },
    // Social media
    'instagram': { width: 1080, height: 1080 },
    'twitter': { width: 1200, height: 675 },
    'facebook': { width: 1200, height: 630 },
  };

  /**
   * Convert a raw width/height pair into a dimensions object.
   * @param {*} rawWidth - Value parsed with base-10 parseInt
   * @param {*} rawHeight - Value parsed with base-10 parseInt
   * @returns {{width: number, height: number} | null} null unless both parse to integers > 0
   */
  static _toDimensions(rawWidth, rawHeight) {
    const width = Number.parseInt(rawWidth, 10);
    const height = Number.parseInt(rawHeight, 10);
    // `> 0` is also false for NaN, so unparsable input is rejected here too.
    if (!(width > 0) || !(height > 0)) {
      return null;
    }
    return { width, height };
  }

  /**
   * Resize a single image file.
   *
   * The result is written to a sibling "<inputPath>.tmp" file first and then
   * moved over the destination, so the source is never read and written at
   * the same time.
   *
   * NOTE(review): the output is always JPEG-encoded, even when the destination
   * path has a different extension - confirm this is intended.
   *
   * @param {string} inputPath - Path of the image to process
   * @param {Object} options - Processing options
   * @param {number} options.width - Target width (required)
   * @param {number} options.height - Target height (required)
   * @param {string} [options.fit='cover'] - Fit mode passed to sharp's resize (cover / contain / fill)
   * @param {string} [options.position='center'] - Position passed to sharp's resize
   * @param {string} [options.outputPath] - Destination path; defaults to overwriting inputPath
   * @param {number} [options.quality=90] - JPEG quality passed to sharp
   * @returns {Promise<{success: boolean, path?: string, error?: string, metadata?: Object}>}
   *   Never rejects for processing errors; failures are reported via `success: false`.
   */
  async processOne(inputPath, options = {}) {
    const {
      width,
      height,
      fit = 'cover',
      position = 'center',
      outputPath = null,
      quality = 90,
    } = options;

    if (!width || !height) {
      return { success: false, path: inputPath, error: '必须指定目标宽度和高度' };
    }

    const tempPath = inputPath + '.tmp';

    try {
      if (!(await fs.pathExists(inputPath))) {
        return { success: false, path: inputPath, error: '文件不存在' };
      }

      // Dimensions before processing, reported back to the caller.
      const metadata = await sharp(inputPath).metadata();

      const finalOutputPath = outputPath || inputPath;

      await sharp(inputPath)
        .resize(width, height, {
          fit,
          position,
          withoutEnlargement: false, // small images may be upscaled
        })
        .jpeg({ quality })
        .toFile(tempPath);

      // Replace the destination with the processed file.
      await fs.move(tempPath, finalOutputPath, { overwrite: true });

      const newMetadata = await sharp(finalOutputPath).metadata();

      logger.debug(`Processed image: ${inputPath} -> ${width}x${height}`);

      return {
        success: true,
        path: finalOutputPath,
        metadata: {
          original: { width: metadata.width, height: metadata.height },
          processed: { width: newMetadata.width, height: newMetadata.height },
        },
      };
    } catch (error) {
      logger.error(`Image processing error: ${inputPath}`, { error: error.message });
      // Best-effort removal of the temp file; a cleanup failure must not mask
      // the original error, so it is only logged.
      try {
        await fs.remove(tempPath);
      } catch (cleanupError) {
        logger.debug(`Failed to remove temp file: ${tempPath}`, { error: cleanupError.message });
      }
      return { success: false, path: inputPath, error: error.message };
    }
  }

  /**
   * Resize an image using one of the named presets.
   * @param {string} inputPath - Path of the image to process
   * @param {string} preset - Key of ImageProcessor.PRESETS
   * @param {Object} extraOptions - Extra processOne options; these override the preset values
   * @returns {Promise<Object>} Same shape as processOne; unknown presets yield `success: false`
   */
  async processWithPreset(inputPath, preset, extraOptions = {}) {
    // Own-property check: names such as "toString" or "constructor" exist on
    // every object via the prototype chain and must not count as presets.
    if (typeof preset !== 'string' || !Object.hasOwn(ImageProcessor.PRESETS, preset)) {
      return { success: false, path: inputPath, error: `未知预设: ${preset}` };
    }

    return this.processOne(inputPath, {
      ...ImageProcessor.PRESETS[preset],
      ...extraOptions,
    });
  }

  /**
   * Resize several files with the same options.
   * @param {Array<{path: string, url?: string}>} files - Files to process
   * @param {Object} options - Options forwarded to processOne for every file
   * @returns {Promise<{success: Array, failed: Array}>} Entries carry the file's url and path;
   *   successes also carry `metadata`, failures carry `error`
   */
  async processMany(files, options = {}) {
    const results = await Promise.all(
      files.map((file) => this.processOne(file.path, options))
    );

    const success = [];
    const failed = [];

    results.forEach((result, index) => {
      const file = files[index];
      if (result.success) {
        success.push({
          url: file.url,
          path: result.path,
          metadata: result.metadata,
        });
      } else {
        failed.push({
          url: file.url,
          path: file.path,
          error: result.error,
        });
      }
    });

    logger.info(`Image processing complete: ${success.length} success, ${failed.length} failed`);
    return { success, failed };
  }

  /**
   * Read basic size information for an image.
   * @param {string} imagePath - Path of the image
   * @returns {Promise<{width: number, height: number, format: string, aspectRatio: number} | null>}
   *   null when the metadata cannot be read (the failure is logged as a warning)
   */
  async getImageInfo(imagePath) {
    try {
      const metadata = await sharp(imagePath).metadata();
      return {
        width: metadata.width,
        height: metadata.height,
        format: metadata.format,
        aspectRatio: metadata.width / metadata.height,
      };
    } catch (error) {
      logger.warn(`Failed to get image info: ${imagePath}`, { error: error.message });
      return null;
    }
  }

  /**
   * Check whether an image's aspect ratio is close to that of a target size.
   * Only the width/height ratio is compared, not the absolute pixel size.
   * @param {string} imagePath - Path of the image
   * @param {number} targetWidth - Target width
   * @param {number} targetHeight - Target height
   * @param {number} tolerance - Allowed relative ratio difference (0-1)
   * @returns {Promise<boolean>} false when the image info cannot be read
   */
  async matchesSize(imagePath, targetWidth, targetHeight, tolerance = 0.1) {
    const info = await this.getImageInfo(imagePath);
    if (!info) return false;

    const targetRatio = targetWidth / targetHeight;
    const ratioDiff = Math.abs(info.aspectRatio - targetRatio) / targetRatio;

    return ratioDiff <= tolerance;
  }

  /**
   * Resolve a target-size argument into concrete dimensions.
   * @param {string|Object} target - Preset name, a "WIDTHxHEIGHT" string
   *   (separator x, X or ×, surrounding whitespace allowed), or a {width, height} object
   * @returns {{width: number, height: number} | null} A fresh object, or null when the
   *   input is missing, unrecognised, or does not describe two positive integers
   */
  parseTargetSize(target) {
    if (!target) return null;

    if (typeof target === 'string') {
      const text = target.trim();

      // Own-property check so inherited names ("constructor", ...) are not
      // mistaken for presets; copy so callers cannot mutate PRESETS.
      if (Object.hasOwn(ImageProcessor.PRESETS, text)) {
        return { ...ImageProcessor.PRESETS[text] };
      }

      const match = text.match(/^(\d+)\s*[xX×]\s*(\d+)$/);
      return match ? ImageProcessor._toDimensions(match[1], match[2]) : null;
    }

    if (typeof target === 'object') {
      return ImageProcessor._toDimensions(target.width, target.height);
    }

    return null;
  }

  /**
   * List all available presets.
   * @returns {Object} A copy of PRESETS whose entries are copies as well
   */
  static getPresets() {
    return Object.fromEntries(
      Object.entries(ImageProcessor.PRESETS).map(([name, size]) => [name, { ...size }])
    );
  }
}
241
+
242
+ export default ImageProcessor;
@@ -4,4 +4,5 @@
4
4
 
5
5
  export { LinkValidator } from './linkValidator.js';
6
6
  export { FileManager } from './fileManager.js';
7
+ export { ImageProcessor } from './imageProcessor.js';
7
8
  export { Orchestrator } from './orchestrator.js';
@@ -31,21 +31,24 @@ export class LinkValidator {
31
31
  /**
32
32
  * 验证单个链接
33
33
  * @param {string} url - 图片URL
34
- * @returns {Promise<{url: string, valid: boolean, error?: string}>}
34
+ * @param {boolean} fetchQuality - 是否获取质量信息
35
+ * @returns {Promise<{url: string, valid: boolean, error?: string, quality?: Object}>}
35
36
  */
36
- async validateOne(url) {
37
+ async validateOne(url, fetchQuality = false) {
37
38
  // 先验证 URL 格式
38
39
  if (!this._isValidUrlFormat(url)) {
39
40
  return { url, valid: false, error: 'Invalid URL format' };
40
41
  }
41
42
 
42
43
  const controller = new AbortController();
43
- const timeoutId = setTimeout(() => controller.abort(), 8000);
44
+ const timeoutId = setTimeout(() => {
45
+ controller.abort();
46
+ }, 5000); // 缩短超时时间到5秒
44
47
 
45
48
  try {
46
49
  const response = await httpClient.head(url, {
47
- timeout: 5000,
48
- maxRedirects: 3,
50
+ timeout: 4000, // 缩短超时
51
+ maxRedirects: 2, // 减少重定向次数
49
52
  signal: controller.signal,
50
53
  });
51
54
 
@@ -56,32 +59,118 @@ export class LinkValidator {
56
59
  const isValidStatus = response.status === 200;
57
60
 
58
61
  if (isValidStatus && isImage) {
59
- return { url, valid: true };
60
- }
61
-
62
- // 如果 HEAD 请求失败,尝试 GET 请求(某些服务器不支持 HEAD)
63
- if (!isValidStatus || !isImage) {
64
- return await this._validateWithGet(url);
62
+ // 获取质量信息
63
+ const quality = fetchQuality ? this._extractQualityInfo(response.headers, url) : null;
64
+ return { url, valid: true, quality };
65
65
  }
66
66
 
67
- return { url, valid: false, error: `Invalid response: status=${response.status}, contentType=${contentType}` };
67
+ // HEAD 失败不再尝试 GET,直接返回失败(加快速度)
68
+ return { url, valid: false, error: `Invalid: status=${response.status}` };
68
69
  } catch (error) {
69
70
  clearTimeout(timeoutId);
70
71
 
71
- // 如果是取消错误,直接返回失败
72
- if (error.name === 'AbortError' || error.code === 'ERR_CANCELED') {
73
- return { url, valid: false, error: 'Request timeout' };
72
+ // 超时或取消,直接返回失败
73
+ if (error.name === 'AbortError' || error.code === 'ERR_CANCELED' || error.code === 'ECONNABORTED') {
74
+ return { url, valid: false, error: 'Timeout' };
74
75
  }
75
76
 
76
- // HEAD 请求失败,尝试 GET
77
- return await this._validateWithGet(url);
77
+ // 其他错误也直接返回失败(不再尝试 GET
78
+ return { url, valid: false, error: error.message || 'Request failed' };
78
79
  }
79
80
  }
80
81
 
82
+ /**
83
+ * 从响应头和URL提取质量信息
84
+ */
85
+ _extractQualityInfo(headers, url) {
86
+ const contentLength = parseInt(headers['content-length'] || '0', 10);
87
+ const contentType = headers['content-type'] || '';
88
+
89
+ // 从 URL 提取可能的尺寸信息
90
+ const sizeHints = this._extractSizeFromUrl(url);
91
+
92
+ // 计算质量分数
93
+ let score = 0;
94
+
95
+ // 文件大小评分(越大通常质量越高,但有上限)
96
+ if (contentLength > 0) {
97
+ if (contentLength > 1024 * 1024) score += 50; // >1MB
98
+ else if (contentLength > 500 * 1024) score += 40; // >500KB
99
+ else if (contentLength > 200 * 1024) score += 30; // >200KB
100
+ else if (contentLength > 100 * 1024) score += 20; // >100KB
101
+ else if (contentLength > 50 * 1024) score += 10; // >50KB
102
+ else score += 5; // <50KB
103
+ }
104
+
105
+ // 格式评分
106
+ if (contentType.includes('png')) score += 10; // PNG 无损
107
+ else if (contentType.includes('webp')) score += 8; // WebP 高效
108
+ else if (contentType.includes('jpeg') || contentType.includes('jpg')) score += 5;
109
+
110
+ // URL 中的尺寸提示评分
111
+ if (sizeHints.width && sizeHints.height) {
112
+ const pixels = sizeHints.width * sizeHints.height;
113
+ if (pixels >= 3840 * 2160) score += 30; // 4K+
114
+ else if (pixels >= 1920 * 1080) score += 25; // 1080p+
115
+ else if (pixels >= 1280 * 720) score += 15; // 720p+
116
+ else if (pixels >= 640 * 480) score += 5; // VGA+
117
+ }
118
+
119
+ // URL 质量关键词评分
120
+ const urlLower = url.toLowerCase();
121
+ if (urlLower.includes('original') || urlLower.includes('full')) score += 15;
122
+ if (urlLower.includes('hd') || urlLower.includes('hq')) score += 10;
123
+ if (urlLower.includes('large') || urlLower.includes('big')) score += 8;
124
+ if (urlLower.includes('thumb') || urlLower.includes('small')) score -= 20;
125
+ if (urlLower.includes('preview') || urlLower.includes('mini')) score -= 15;
126
+
127
+ return {
128
+ contentLength,
129
+ contentType,
130
+ estimatedWidth: sizeHints.width,
131
+ estimatedHeight: sizeHints.height,
132
+ score,
133
+ };
134
+ }
135
+
136
+ /**
137
+ * 从 URL 提取可能的尺寸信息
138
+ */
139
+ _extractSizeFromUrl(url) {
140
+ const result = { width: null, height: null };
141
+
142
+ // 常见模式: 1920x1080, 1920_1080, w=1920&h=1080, width=1920, etc.
143
+ const patterns = [
144
+ /(\d{3,4})[x×X](\d{3,4})/, // 1920x1080
145
+ /(\d{3,4})_(\d{3,4})/, // 1920_1080
146
+ /[wW](?:idth)?[=:](\d{3,4}).*[hH](?:eight)?[=:](\d{3,4})/, // w=1920&h=1080
147
+ /[hH](?:eight)?[=:](\d{3,4}).*[wW](?:idth)?[=:](\d{3,4})/, // h=1080&w=1920
148
+ ];
149
+
150
+ for (const pattern of patterns) {
151
+ const match = url.match(pattern);
152
+ if (match) {
153
+ const num1 = parseInt(match[1], 10);
154
+ const num2 = parseInt(match[2], 10);
155
+ // 确定哪个是宽度哪个是高度
156
+ if (num1 > num2) {
157
+ result.width = num1;
158
+ result.height = num2;
159
+ } else {
160
+ result.width = num2;
161
+ result.height = num1;
162
+ }
163
+ break;
164
+ }
165
+ }
166
+
167
+ return result;
168
+ }
169
+
81
170
  /**
82
171
  * 使用 GET 请求验证(某些服务器不支持 HEAD)
83
172
  */
84
- async _validateWithGet(url) {
173
+ async _validateWithGet(url, fetchQuality = false) {
85
174
  const controller = new AbortController();
86
175
  const timeoutId = setTimeout(() => controller.abort(), 8000);
87
176
 
@@ -103,7 +192,8 @@ export class LinkValidator {
103
192
  const isValidStatus = response.status === 200 || response.status === 206;
104
193
 
105
194
  if (isValidStatus && isImage) {
106
- return { url, valid: true };
195
+ const quality = fetchQuality ? this._extractQualityInfo(response.headers, url) : null;
196
+ return { url, valid: true, quality };
107
197
  }
108
198
 
109
199
  return { url, valid: false, error: `GET validation failed: status=${response.status}` };
@@ -120,27 +210,45 @@ export class LinkValidator {
120
210
  /**
121
211
  * 批量验证链接
122
212
  * @param {string[]} urls - URL列表
123
- * @returns {Promise<{valid: string[], invalid: {url: string, error: string}[]}>}
213
+ * @param {Object} options - 选项
214
+ * @param {boolean} options.fetchQuality - 是否获取质量信息
215
+ * @param {boolean} options.sortByQuality - 是否按质量排序
216
+ * @returns {Promise<{valid: Array, invalid: Array}>}
124
217
  */
125
- async validateMany(urls) {
126
- logger.info(`Validating ${urls.length} URLs with concurrency ${config.MAX_VALIDATE_CONCURRENCY}`);
218
+ async validateMany(urls, options = {}) {
219
+ const { fetchQuality = false, sortByQuality = false } = options;
220
+
221
+ logger.info(`Validating ${urls.length} URLs with concurrency ${config.MAX_VALIDATE_CONCURRENCY}${fetchQuality ? ' (with quality check)' : ''}`);
127
222
 
128
223
  const results = await Promise.all(
129
- urls.map(url => this.limit(() => this.validateOne(url)))
224
+ urls.map(url => this.limit(() => this.validateOne(url, fetchQuality)))
130
225
  );
131
226
 
132
- const valid = [];
227
+ let valid = [];
133
228
  const invalid = [];
134
229
 
135
230
  for (const result of results) {
136
231
  if (result.valid) {
137
- valid.push(result.url);
232
+ valid.push({
233
+ url: result.url,
234
+ quality: result.quality,
235
+ });
138
236
  } else {
139
237
  invalid.push({ url: result.url, error: result.error });
140
238
  logger.debug(`Invalid URL: ${result.url}`, { error: result.error });
141
239
  }
142
240
  }
143
241
 
242
+ // 按质量分数排序(高分优先)
243
+ if (sortByQuality && fetchQuality) {
244
+ valid.sort((a, b) => {
245
+ const scoreA = a.quality?.score || 0;
246
+ const scoreB = b.quality?.score || 0;
247
+ return scoreB - scoreA;
248
+ });
249
+ logger.info(`Sorted ${valid.length} URLs by quality score`);
250
+ }
251
+
144
252
  logger.info(`Validation complete: ${valid.length} valid, ${invalid.length} invalid`);
145
253
  return { valid, invalid };
146
254
  }
@@ -7,6 +7,7 @@ import pLimit from 'p-limit';
7
7
  import { getScraper } from '../providers/index.js';
8
8
  import { LinkValidator } from './linkValidator.js';
9
9
  import { FileManager } from './fileManager.js';
10
+ import { ImageProcessor } from './imageProcessor.js';
10
11
  import logger from '../infrastructure/logger.js';
11
12
  import config from '../config/index.js';
12
13
 
@@ -14,6 +15,7 @@ export class Orchestrator {
14
15
  constructor() {
15
16
  this.linkValidator = new LinkValidator();
16
17
  this.fileManager = new FileManager();
18
+ this.imageProcessor = new ImageProcessor();
17
19
  this.keywordLimit = pLimit(config.MAX_KEYWORD_CONCURRENCY);
18
20
  }
19
21
 
@@ -43,15 +45,21 @@ export class Orchestrator {
43
45
  */
44
46
  async processKeywordLink(keyword, count, source, options = {}) {
45
47
  const startTime = Date.now();
48
+ // Link 模式默认不进行质量评估(加快速度),除非明确要求
49
+ const prioritizeQuality = options.prioritizeQuality === true;
46
50
 
47
51
  try {
48
52
  // 获取搜索源
49
53
  const scraper = getScraper(source);
50
54
 
55
+ logger.info(`Searching for "${keyword}"...`);
56
+
51
57
  // 搜索图片(多获取一些以弥补验证失败的损失)
52
58
  const searchCount = Math.ceil(count * 1.5);
53
59
  const rawUrls = await scraper.search(keyword, searchCount, options);
54
60
 
61
+ logger.info(`Found ${rawUrls.length} URLs for "${keyword}"`);
62
+
55
63
  if (rawUrls.length === 0) {
56
64
  return {
57
65
  keyword,
@@ -62,10 +70,13 @@ export class Orchestrator {
62
70
  }
63
71
 
64
72
  // 验证链接
65
- const { valid, invalid } = await this.linkValidator.validateMany(rawUrls);
73
+ const { valid, invalid } = await this.linkValidator.validateMany(rawUrls, {
74
+ fetchQuality: prioritizeQuality,
75
+ sortByQuality: prioritizeQuality,
76
+ });
66
77
 
67
78
  // 截取需要的数量
68
- const resultUrls = valid.slice(0, count);
79
+ const resultUrls = valid.slice(0, count).map(v => v.url);
69
80
 
70
81
  return {
71
82
  keyword,
@@ -76,6 +87,7 @@ export class Orchestrator {
76
87
  totalInvalid: invalid.length,
77
88
  urls: resultUrls,
78
89
  count: resultUrls.length,
90
+ qualitySorted: prioritizeQuality,
79
91
  duration: Date.now() - startTime,
80
92
  };
81
93
  } catch (error) {
@@ -99,6 +111,7 @@ export class Orchestrator {
99
111
  */
100
112
  async processKeywordDownload(keyword, count, source, options = {}) {
101
113
  const startTime = Date.now();
114
+ const prioritizeQuality = options.prioritizeQuality !== false; // 默认优先高质量
102
115
 
103
116
  try {
104
117
  // 获取搜索源
@@ -117,11 +130,43 @@ export class Orchestrator {
117
130
  };
118
131
  }
119
132
 
120
- // 下载图片
121
- const { success, failed } = await this.fileManager.downloadMany(rawUrls.slice(0, searchCount), keyword);
133
+ // 先验证链接并按质量排序
134
+ let urlsToDownload = rawUrls.slice(0, searchCount);
135
+ if (prioritizeQuality) {
136
+ logger.info(`Validating and sorting ${urlsToDownload.length} URLs by quality...`);
137
+ const { valid } = await this.linkValidator.validateMany(urlsToDownload, {
138
+ fetchQuality: true,
139
+ sortByQuality: true,
140
+ });
141
+ // 使用排序后的URL列表
142
+ urlsToDownload = valid.map(v => v.url);
143
+ logger.info(`Quality sorted: ${urlsToDownload.length} valid URLs`);
144
+ }
145
+
146
+ // 下载图片(已按质量排序,高质量优先)
147
+ const { success, failed } = await this.fileManager.downloadMany(urlsToDownload, keyword);
122
148
 
123
149
  // 截取需要的数量
124
- const resultDownloads = success.slice(0, count);
150
+ let resultDownloads = success.slice(0, count);
151
+
152
+ // 如果指定了目标尺寸,进行后处理
153
+ let processedCount = 0;
154
+ let processFailedCount = 0;
155
+ if (options.targetSize && resultDownloads.length > 0) {
156
+ const targetSize = this.imageProcessor.parseTargetSize(options.targetSize);
157
+ if (targetSize) {
158
+ logger.info(`Processing images to ${targetSize.width}x${targetSize.height}`);
159
+ const processResult = await this.imageProcessor.processMany(resultDownloads, {
160
+ width: targetSize.width,
161
+ height: targetSize.height,
162
+ fit: options.fit || 'cover',
163
+ position: options.position || 'center',
164
+ });
165
+ resultDownloads = processResult.success;
166
+ processedCount = processResult.success.length;
167
+ processFailedCount = processResult.failed.length;
168
+ }
169
+ }
125
170
 
126
171
  // 保存元数据
127
172
  let metadataPath = null;
@@ -136,10 +181,13 @@ export class Orchestrator {
136
181
  totalSearched: rawUrls.length,
137
182
  totalDownloaded: success.length,
138
183
  totalFailed: failed.length,
184
+ totalProcessed: processedCount,
185
+ totalProcessFailed: processFailedCount,
139
186
  files: resultDownloads,
140
187
  count: resultDownloads.length,
141
188
  saveDir: this.fileManager.getKeywordDir(keyword),
142
189
  metadataPath,
190
+ targetSize: options.targetSize || null,
143
191
  duration: Date.now() - startTime,
144
192
  };
145
193
  } catch (error) {
@@ -159,8 +207,8 @@ export class Orchestrator {
159
207
  * @returns {Promise<Object>} - 执行结果
160
208
  */
161
209
  async execute(params) {
162
- const { query, mode, count = config.DEFAULT_COUNT, source = config.DEFAULT_SOURCE, size = 'all', safeSearch = 'moderate' } = params;
163
- const options = { size, safeSearch };
210
+ const { query, mode, count = config.DEFAULT_COUNT, source = config.DEFAULT_SOURCE, size = 'all', safeSearch = 'moderate', aspect = 'all', targetSize = null, fit = 'cover', position = 'center' } = params;
211
+ const options = { size, safeSearch, aspect, targetSize, fit, position };
164
212
 
165
213
  const startTime = Date.now();
166
214
  const keywords = this.parseKeywords(query);
@@ -247,7 +295,11 @@ export class Orchestrator {
247
295
  lines.push(`- 搜索到: ${r.totalSearched} 张`);
248
296
  lines.push(`- 下载成功: ${r.totalDownloaded} 张`);
249
297
  lines.push(`- 下载失败: ${r.totalFailed} 张`);
250
- lines.push(`- 保存: ${r.count} 张`);
298
+ if (r.targetSize) {
299
+ lines.push(`- 尺寸处理: ${r.totalProcessed} 成功, ${r.totalProcessFailed} 失败`);
300
+ lines.push(`- 目标尺寸: ${r.targetSize}`);
301
+ }
302
+ lines.push(`- 最终保存: ${r.count} 张`);
251
303
  lines.push(`- 存储目录: \`${r.saveDir}\``);
252
304
  lines.push(`- 耗时: ${(r.duration / 1000).toFixed(2)}秒`);
253
305
  lines.push('');