smart-image-scraper-mcp 2.13.3 → 2.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -74,9 +74,8 @@ export class BingScraper extends BaseScraper {
|
|
|
74
74
|
const response = await withRetry(
|
|
75
75
|
() => httpClient.get(searchUrl, {
|
|
76
76
|
timeout: 8000, // 8秒超时,快速失败
|
|
77
|
-
signal: AbortSignal.timeout(8000),
|
|
78
77
|
}),
|
|
79
|
-
{ maxRetries: 1, retryCondition:
|
|
78
|
+
{ maxRetries: 1, retryCondition: (e) => e.code === 'ECONNRESET' || (e.response?.status >= 500) } // 只重试真正的网络重置或服务器错误,超时不重试
|
|
80
79
|
);
|
|
81
80
|
|
|
82
81
|
if (response.status !== 200) {
|
|
@@ -73,7 +73,6 @@ export class GoogleScraper extends BaseScraper {
|
|
|
73
73
|
const response = await withRetry(
|
|
74
74
|
() => httpClient.get(searchUrl, {
|
|
75
75
|
timeout: 8000, // 8秒超时,快速失败
|
|
76
|
-
signal: AbortSignal.timeout(8000),
|
|
77
76
|
headers: {
|
|
78
77
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
79
78
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
@@ -81,7 +80,7 @@ export class GoogleScraper extends BaseScraper {
|
|
|
81
80
|
'Referer': 'https://www.google.com/',
|
|
82
81
|
},
|
|
83
82
|
}),
|
|
84
|
-
{ maxRetries: 1, retryCondition:
|
|
83
|
+
{ maxRetries: 1, retryCondition: (e) => e.code === 'ECONNRESET' || (e.response?.status >= 500) } // 只重试真正的网络重置或服务器错误,超时不重试
|
|
85
84
|
);
|
|
86
85
|
|
|
87
86
|
if (response.status !== 200) {
|
|
@@ -11,13 +11,10 @@ import httpClient from '../infrastructure/httpClient.js';
|
|
|
11
11
|
import logger from '../infrastructure/logger.js';
|
|
12
12
|
import config from '../config/index.js';
|
|
13
13
|
|
|
14
|
-
// 全局共享的下载并发限制器(避免每个实例独立限制)
|
|
15
|
-
const globalDownloadLimit = pLimit(config.MAX_DOWNLOAD_CONCURRENCY || 10);
|
|
16
|
-
|
|
17
14
|
export class FileManager {
|
|
18
15
|
constructor(options = {}) {
|
|
19
16
|
this.saveRoot = options.saveRoot || config.SAVE_ROOT;
|
|
20
|
-
this.limit =
|
|
17
|
+
this.limit = pLimit(config.MAX_DOWNLOAD_CONCURRENCY || 3);
|
|
21
18
|
}
|
|
22
19
|
|
|
23
20
|
/**
|
|
@@ -183,7 +180,6 @@ export class FileManager {
|
|
|
183
180
|
maxContentLength: 50 * 1024 * 1024, // 最大50MB
|
|
184
181
|
maxBodyLength: 50 * 1024 * 1024,
|
|
185
182
|
headers: downloadHeaders,
|
|
186
|
-
signal: AbortSignal.timeout(12000), // 12秒硬超时,防止连接挂起
|
|
187
183
|
});
|
|
188
184
|
|
|
189
185
|
if (response.status !== 200) {
|
|
@@ -305,13 +301,22 @@ export class FileManager {
|
|
|
305
301
|
* 批量下载图片
|
|
306
302
|
* @param {string[]} urls - URL列表
|
|
307
303
|
* @param {string} keyword - 搜索关键词
|
|
304
|
+
* @param {number} maxSuccess - 达到此成功数后跳过剩余下载(提前退出)
|
|
308
305
|
* @returns {Promise<{success: Array, failed: Array}>}
|
|
309
306
|
*/
|
|
310
|
-
async downloadMany(urls, keyword) {
|
|
307
|
+
async downloadMany(urls, keyword, maxSuccess = Infinity) {
|
|
311
308
|
logger.info(`Downloading ${urls.length} images for "${keyword}" with concurrency ${config.MAX_DOWNLOAD_CONCURRENCY}`);
|
|
312
309
|
|
|
310
|
+
let successCount = 0;
|
|
313
311
|
const results = await Promise.all(
|
|
314
|
-
urls.map(url => this.limit(() =>
|
|
312
|
+
urls.map(url => this.limit(async () => {
|
|
313
|
+
if (successCount >= maxSuccess) {
|
|
314
|
+
return { success: false, url, error: 'skipped' };
|
|
315
|
+
}
|
|
316
|
+
const result = await this.downloadOne(url, keyword);
|
|
317
|
+
if (result.success) successCount++;
|
|
318
|
+
return result;
|
|
319
|
+
}))
|
|
315
320
|
);
|
|
316
321
|
|
|
317
322
|
const success = [];
|
|
@@ -9,10 +9,12 @@ import path from 'path';
|
|
|
9
9
|
import pLimit from 'p-limit';
|
|
10
10
|
import logger from '../infrastructure/logger.js';
|
|
11
11
|
|
|
12
|
-
// sharp 是 CPU 密集型操作,限制并发避免卡死
|
|
13
|
-
const imageProcessLimit = pLimit(2);
|
|
14
12
|
|
|
15
13
|
export class ImageProcessor {
|
|
14
|
+
constructor() {
|
|
15
|
+
// sharp 是 CPU 密集型操作,限制并发避免卡死;每个实例独立防止跨请求堵塞
|
|
16
|
+
this._limit = pLimit(2);
|
|
17
|
+
}
|
|
16
18
|
/**
|
|
17
19
|
* 预设尺寸配置
|
|
18
20
|
*/
|
|
@@ -64,15 +66,16 @@ export class ImageProcessor {
|
|
|
64
66
|
return { success: false, path: inputPath, error: '文件不存在' };
|
|
65
67
|
}
|
|
66
68
|
|
|
67
|
-
//
|
|
68
|
-
const
|
|
69
|
+
// 用 Buffer 读取,避免 Windows 文件句柄锁导致后续 move 失败
|
|
70
|
+
const inputBuffer = await fs.readFile(inputPath);
|
|
71
|
+
const metadata = await sharp(inputBuffer).metadata();
|
|
69
72
|
|
|
70
73
|
// 确定输出路径
|
|
71
74
|
const finalOutputPath = outputPath || inputPath;
|
|
72
75
|
const tempPath = inputPath + '.tmp';
|
|
73
76
|
|
|
74
77
|
// 处理图片
|
|
75
|
-
await sharp(
|
|
78
|
+
await sharp(inputBuffer)
|
|
76
79
|
.resize(width, height, {
|
|
77
80
|
fit: fit,
|
|
78
81
|
position: position,
|
|
@@ -131,7 +134,7 @@ export class ImageProcessor {
|
|
|
131
134
|
*/
|
|
132
135
|
async processMany(files, options = {}) {
|
|
133
136
|
const results = await Promise.all(
|
|
134
|
-
files.map(file =>
|
|
137
|
+
files.map(file => this._limit(() => this.processOne(file.path, options)))
|
|
135
138
|
);
|
|
136
139
|
|
|
137
140
|
const success = [];
|
|
@@ -8,9 +8,6 @@ import httpClient from '../infrastructure/httpClient.js';
|
|
|
8
8
|
import logger from '../infrastructure/logger.js';
|
|
9
9
|
import config from '../config/index.js';
|
|
10
10
|
|
|
11
|
-
// 使用配置中的并发数,避免硬编码与配置不一致
|
|
12
|
-
const globalValidateLimit = pLimit(config.MAX_VALIDATE_CONCURRENCY);
|
|
13
|
-
|
|
14
11
|
// 已知有严格防盗链保护的域名列表(仅包含确实无法在浏览器直接打开的)
|
|
15
12
|
// 这些域名的图片在浏览器地址栏直接打开会返回 403、替换图或空白
|
|
16
13
|
const HOTLINK_PROTECTED_DOMAINS = [
|
|
@@ -27,7 +24,7 @@ const HOTLINK_PROTECTED_DOMAINS = [
|
|
|
27
24
|
|
|
28
25
|
export class LinkValidator {
|
|
29
26
|
constructor() {
|
|
30
|
-
this.limit =
|
|
27
|
+
this.limit = pLimit(config.MAX_VALIDATE_CONCURRENCY);
|
|
31
28
|
}
|
|
32
29
|
|
|
33
30
|
/**
|
|
@@ -320,8 +320,8 @@ export class Orchestrator {
|
|
|
320
320
|
// 检查是否已中止
|
|
321
321
|
if (signal?.aborted) throw new Error('操作已取消');
|
|
322
322
|
|
|
323
|
-
// 根据 quality
|
|
324
|
-
const maxValidate = Math.min(rawUrls.length, count
|
|
323
|
+
// 根据 quality 模式处理(只多取少量备用,避免下载过多)
|
|
324
|
+
const maxValidate = Math.min(rawUrls.length, count + 3);
|
|
325
325
|
let urlsToDownload = rawUrls.slice(0, maxValidate);
|
|
326
326
|
if (prioritizeQuality) {
|
|
327
327
|
const sortByQuality = qualityMode === 'high';
|
|
@@ -347,8 +347,8 @@ export class Orchestrator {
|
|
|
347
347
|
logger.info(`Quality filtered: ${urlsToDownload.length} valid URLs`);
|
|
348
348
|
}
|
|
349
349
|
|
|
350
|
-
//
|
|
351
|
-
const { success, failed } = await this.fileManager.downloadMany(urlsToDownload, keyword);
|
|
350
|
+
// 下载图片(已按质量排序,高质量优先),达到目标数量后跳过剩余
|
|
351
|
+
const { success, failed } = await this.fileManager.downloadMany(urlsToDownload, keyword, count);
|
|
352
352
|
|
|
353
353
|
// 截取需要的数量
|
|
354
354
|
let resultDownloads = success.slice(0, count);
|
|
@@ -490,11 +490,11 @@ export class Orchestrator {
|
|
|
490
490
|
|
|
491
491
|
if (fastMode) {
|
|
492
492
|
// fast 模式:不验证
|
|
493
|
-
allValidUrls = rawUrls.slice(0, count
|
|
493
|
+
allValidUrls = rawUrls.slice(0, count + 3);
|
|
494
494
|
qualityModeLabel = '快速模式(跳过验证)';
|
|
495
495
|
} else {
|
|
496
496
|
// balanced/high 模式:统一验证所有原始 URL
|
|
497
|
-
const maxValidate = Math.min(rawUrls.length, count
|
|
497
|
+
const maxValidate = Math.min(rawUrls.length, count + 3);
|
|
498
498
|
const urlsToValidate = rawUrls.slice(0, maxValidate);
|
|
499
499
|
const { valid } = await this.linkValidator.validateMany(urlsToValidate, {
|
|
500
500
|
fetchQuality: prioritizeQuality,
|
|
@@ -528,11 +528,11 @@ export class Orchestrator {
|
|
|
528
528
|
}
|
|
529
529
|
|
|
530
530
|
// 下载列表:使用所有验证通过的 URL(不过滤防盗链,下载时加 Referer 绕过)
|
|
531
|
-
downloadUrls = allValidUrls.slice(0, count
|
|
531
|
+
downloadUrls = allValidUrls.slice(0, count + 3);
|
|
532
532
|
|
|
533
533
|
if (signal?.aborted) throw new Error('操作已取消');
|
|
534
534
|
|
|
535
|
-
const { success, failed } = await this.fileManager.downloadMany(downloadUrls, keyword);
|
|
535
|
+
const { success, failed } = await this.fileManager.downloadMany(downloadUrls, keyword, count);
|
|
536
536
|
|
|
537
537
|
let resultDownloads = success.slice(0, count);
|
|
538
538
|
|