npm - smart-image-scraper-mcp - Versions diffs - 2.10.0 → 2.11.0 - Mend

smart-image-scraper-mcp 2.10.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/LICENSE +21 -0
package/package.json +14 -1
package/src/index.js +40 -22
package/src/infrastructure/cache.js +3 -2
package/src/infrastructure/gracefulShutdown.js +7 -7
package/src/infrastructure/httpClient.js +6 -6
package/src/infrastructure/logger.js +4 -6
package/src/infrastructure/metrics.js +3 -2
package/src/infrastructure/rateLimiter.js +1 -0
package/src/infrastructure/requestQueue.js +2 -1
package/src/providers/bingScraper.js +14 -9
package/src/providers/googleScraper.js +22 -28
package/src/services/fileManager.js +1 -1
package/src/services/imageProcessor.js +6 -5
package/src/services/linkValidator.js +11 -4
package/src/services/orchestrator.js +54 -14
package/src/index.backup.js +0 -340
package/src/index.new.js +0 -213
package/src/index.simple.js +0 -213
package/src/services/orchestrator.simple.js +0 -259

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "smart-image-scraper-mcp",
-  "version": "2.10.0",
+  "version": "2.11.0",
   "description": "全网智能图片抓取 MCP 服务器 - 支持 Bing/Google 图片搜索、验证和下载",
   "main": "src/index.js",
   "type": "module",
@@ -27,6 +27,19 @@
   "engines": {
     "node": ">=18.0.0"
   },
+  "files": [
+    "src/index.js",
+    "src/config/",
+    "src/infrastructure/",
+    "src/providers/",
+    "src/services/orchestrator.js",
+    "src/services/linkValidator.js",
+    "src/services/fileManager.js",
+    "src/services/imageProcessor.js",
+    "src/services/index.js",
+    "README.md",
+    "LICENSE"
+  ],
   "repository": {
     "type": "git",
     "url": ""

package/src/index.js CHANGED Viewed

@@ -175,10 +175,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
     };
   }
+  // MCP 层最外层超时保护（55秒硬限制）
+  const MCP_TIMEOUT = 55000;
+  // 主流做法：每个请求创建新的 Orchestrator 实例，确保无状态
+  const orchestrator = new Orchestrator();
+  let mcpTimeoutId;
   try {
-    // 主流做法：每个请求创建新的 Orchestrator 实例，确保无状态
-    const orchestrator = new Orchestrator();
     // 规范化参数
     const params = {
       query: args.query.trim(),
@@ -194,30 +198,44 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
       minFileSize: ['any', '50kb', '100kb', '200kb', '500kb', '1mb'].includes(args.minFileSize) ? args.minFileSize : 'any',
     };
-    // 执行任务
-    const result = await orchestrator.execute(params);
-    // 格式化输出
-    const formattedResult = orchestrator.formatResult(result);
-    // 如果任务失败，标记为错误
-    if (!result.success) {
-      return {
-        content: [{ type: 'text', text: formattedResult }],
-        isError: true,
-      };
-    }
-    return {
-      content: [{ type: 'text', text: formattedResult }],
-    };
+    // 使用 Promise.race 确保一定会在超时内返回
+    const result = await Promise.race([
+      (async () => {
+        const result = await orchestrator.execute(params);
+        const formattedResult = orchestrator.formatResult(result);
+        if (!result.success) {
+          return {
+            content: [{ type: 'text', text: formattedResult }],
+            isError: true,
+          };
+        }
+        return {
+          content: [{ type: 'text', text: formattedResult }],
+        };
+      })(),
+      new Promise((_, reject) => {
+        mcpTimeoutId = setTimeout(() => {
+          // 超时时中止 orchestrator 的所有操作
+          if (orchestrator.abortController) {
+            orchestrator.abortController.abort();
+          }
+          reject(new Error('MCP_TIMEOUT: 请求超时(55秒)，请减少关键词数量或稍后重试'));
+        }, MCP_TIMEOUT);
+      })
+    ]);
+    clearTimeout(mcpTimeoutId);
+    return result;
   } catch (error) {
-    // 主流做法：简洁的错误处理，使用 stderr 输出日志
+    clearTimeout(mcpTimeoutId);
+    // 确保中止所有操作
+    if (orchestrator.abortController && !orchestrator.abortController.signal.aborted) {
+      orchestrator.abortController.abort();
+    }
     console.error(`[MCP Error] ${error.message}`);
     return {
       content: [{
         type: 'text',
-        text: `## ❌ 执行错误\n\n**错误信息**: ${error.message}\n\n请检查网络连接或稍后重试。`
+        text: `## ❌ 执行错误\n\n**错误信息**: ${error.message}\n\n请减少关键词数量或稍后重试。`
       }],
       isError: true,
     };

package/src/infrastructure/cache.js CHANGED Viewed

@@ -253,10 +253,11 @@ export class ValidationCache extends LRUCache {
 export const searchCache = new SearchCache();
 export const validationCache = new ValidationCache();
-// 定期清理过期缓存
-setInterval(() => {
+// 定期清理过期缓存（unref 避免阻止进程退出）
+const cacheCleanupInterval = setInterval(() => {
   searchCache.cleanup();
   validationCache.cleanup();
 }, 60000); // 每分钟清理一次
+cacheCleanupInterval.unref();
 export default { LRUCache, SearchCache, ValidationCache, searchCache, validationCache };

package/src/infrastructure/gracefulShutdown.js CHANGED Viewed

@@ -26,7 +26,8 @@ export class GracefulShutdown {
    * 注册信号处理器
    */
   _registerSignalHandlers() {
-    const signals = ['SIGINT', 'SIGTERM', 'SIGQUIT'];
+    // 仅注册 SIGINT 和 SIGTERM（SIGQUIT 在 Windows 上不存在）
+    const signals = ['SIGINT', 'SIGTERM'];
     signals.forEach(signal => {
       process.on(signal, async () => {
@@ -35,16 +36,15 @@ export class GracefulShutdown {
       });
     });
-    // 处理未捕获的异常
-    process.on('uncaughtException', async (error) => {
+    // 处理未捕获的异常 - 仅记录日志，不退出进程（避免中断 MCP 通信）
+    process.on('uncaughtException', (error) => {
       logger.error('Uncaught exception', { error: error.message, stack: error.stack });
-      await this.shutdown(1);
+      // 不调用 process.exit，让 MCP 连接保持活跃
     });
-    // 处理未处理的 Promise 拒绝
-    process.on('unhandledRejection', async (reason, promise) => {
+    // 处理未处理的 Promise 拒绝 - 仅记录日志
+    process.on('unhandledRejection', (reason, promise) => {
       logger.error('Unhandled rejection', { reason: String(reason) });
-      // 不立即退出，只记录日志
     });
   }

package/src/infrastructure/httpClient.js CHANGED Viewed

@@ -14,19 +14,19 @@ import logger from './logger.js';
 const httpAgent = new http.Agent({
   keepAlive: true,           // 启用 Keep-Alive
   keepAliveMsecs: 1000,      // Keep-Alive 探测间隔
-  maxSockets: 50,            // 降低最大并发连接数，避免资源耗尽
-  maxFreeSockets: 10,        // 降低最大空闲连接数
+  maxSockets: 20,            // 降低最大并发连接数，避免资源耗尽
+  maxFreeSockets: 5,         // 降低最大空闲连接数
   scheduling: 'lifo',        // 后进先出，优先使用最近的连接
-  timeout: 30000,            // 空闲连接30秒后关闭
+  timeout: 10000,            // 空闲连接10秒后关闭
 });
 const httpsAgent = new https.Agent({
   keepAlive: true,
   keepAliveMsecs: 1000,
-  maxSockets: 50,            // 降低最大并发连接数
-  maxFreeSockets: 10,        // 降低最大空闲连接数
+  maxSockets: 20,            // 降低最大并发连接数
+  maxFreeSockets: 5,         // 降低最大空闲连接数
   scheduling: 'lifo',
-  timeout: 30000,            // 空闲连接30秒后关闭
+  timeout: 10000,            // 空闲连接10秒后关闭
   rejectUnauthorized: false, // 允许自签名证书
 });

package/src/infrastructure/logger.js CHANGED Viewed

@@ -137,14 +137,12 @@ class Logger {
     // 输出到 stderr
     console.error(formatted);
-    // 输出到文件
+    // 输出到文件（异步写入，避免阻塞事件循环）
     if (this.logFile) {
-      try {
-        fs.appendFileSync(this.logFile, formatted + '\n');
-        this._rotateLogIfNeeded();
-      } catch (error) {
+      fs.appendFile(this.logFile, formatted + '\n', (err) => {
         // 忽略文件写入错误
-      }
+        if (!err) this._rotateLogIfNeeded();
+      });
     }
   }

package/src/infrastructure/metrics.js CHANGED Viewed

@@ -282,11 +282,12 @@ export class MetricsCollector {
 // 全局指标收集器
 export const metrics = new MetricsCollector();
-// 定期输出指标日志（每5分钟）
-setInterval(() => {
+// 定期输出指标日志（每5分钟，unref 避免阻止进程退出）
+const metricsInterval = setInterval(() => {
   if (metrics.metrics.requests.total > 0) {
     metrics.logSummary();
   }
 }, 5 * 60 * 1000);
+metricsInterval.unref();
 export default metrics;

package/src/infrastructure/rateLimiter.js CHANGED Viewed

@@ -91,6 +91,7 @@ export class RateLimiter {
           resolve(false);
         }
       }, Math.min(100, this.interval / 10));
+      checkInterval.unref(); // 避免阻止进程退出
     });
   }

package/src/infrastructure/requestQueue.js CHANGED Viewed

@@ -27,8 +27,9 @@ export class RequestQueue {
       totalTimeout: 0,
     };
-    // 定期清理超时请求
+    // 定期清理超时请求（unref 避免阻止进程退出）
     this.cleanupInterval = setInterval(() => this._cleanupTimeouts(), 5000);
+    this.cleanupInterval.unref();
   }
   /**

package/src/providers/bingScraper.js CHANGED Viewed

@@ -23,7 +23,6 @@ export class BingScraper extends BaseScraper {
    * @returns {Promise<string[]>} - 图片URL列表
    */
   async search(keyword, limit = 10, options = {}) {
-    this.options = options;
     const pageSize = 35;
     // 计算需要获取的页数（最多3页，避免触发速率限制）
@@ -31,13 +30,19 @@ export class BingScraper extends BaseScraper {
     logger.info(`[Bing] Searching "${keyword}" - ${pagesNeeded} page(s) for ${limit} images`);
     try {
+      const seen = new Set(); // 去重
       let allUrls = [];
       // 顺序获取多页（避免并发触发限制）
       for (let page = 0; page < pagesNeeded; page++) {
         const offset = page * pageSize;
-        const urls = await this._fetchPage(keyword, offset);
-        allUrls = allUrls.concat(urls);
+        const urls = await this._fetchPage(keyword, offset, options);
+        for (const url of urls) {
+          if (!seen.has(url)) {
+            seen.add(url);
+            allUrls.push(url);
+          }
+        }
         // 如果已经够了就停止
         if (allUrls.length >= limit) {
@@ -62,8 +67,8 @@ export class BingScraper extends BaseScraper {
   /**
    * 获取单页结果
    */
-  async _fetchPage(keyword, offset) {
-    const searchUrl = this._buildSearchUrl(keyword, offset);
+  async _fetchPage(keyword, offset, options = {}) {
+    const searchUrl = this._buildSearchUrl(keyword, offset, options);
     try {
       const response = await withRetry(
@@ -89,7 +94,7 @@ export class BingScraper extends BaseScraper {
   /**
    * 构建搜索 URL
    */
-  _buildSearchUrl(keyword, offset = 0) {
+  _buildSearchUrl(keyword, offset = 0, options = {}) {
     // 尺寸过滤映射
     const sizeMap = {
       'small': '+filterui:imagesize-small',
@@ -114,9 +119,9 @@ export class BingScraper extends BaseScraper {
       'strict': 'strict',
     };
-    const size = this.options?.size || 'all';
-    const aspect = this.options?.aspect || 'all';
-    const safeSearch = this.options?.safeSearch || 'moderate';
+    const size = options.size || 'all';
+    const aspect = options.aspect || 'all';
+    const safeSearch = options.safeSearch || 'moderate';
     let qft = '+filterui:photo-photo';
     if (sizeMap[size]) {

package/src/providers/googleScraper.js CHANGED Viewed

@@ -23,7 +23,6 @@ export class GoogleScraper extends BaseScraper {
    * @returns {Promise<string[]>} - 图片URL列表
    */
   async search(keyword, limit = 10, options = {}) {
-    this.options = options;
     const pageSize = 20; // Google 每页约20张
     // 计算需要获取的页数（最多3页，避免触发速率限制）
@@ -31,13 +30,19 @@ export class GoogleScraper extends BaseScraper {
     logger.info(`[Google] Searching "${keyword}" - ${pagesNeeded} page(s) for ${limit} images`);
     try {
+      const seen = new Set(); // 去重
       let allUrls = [];
       // 顺序获取多页
       for (let page = 0; page < pagesNeeded; page++) {
         const start = page * pageSize;
-        const urls = await this._fetchPage(keyword, start);
-        allUrls = allUrls.concat(urls);
+        const urls = await this._fetchPage(keyword, start, options);
+        for (const url of urls) {
+          if (!seen.has(url)) {
+            seen.add(url);
+            allUrls.push(url);
+          }
+        }
         if (allUrls.length >= limit) {
           break;
@@ -57,16 +62,12 @@ export class GoogleScraper extends BaseScraper {
       return [];
     }
   }
-  _delay(ms) {
-    return new Promise(resolve => setTimeout(resolve, ms));
-  }
   /**
    * 获取单页结果
    */
-  async _fetchPage(keyword, start) {
-    const searchUrl = this._buildSearchUrl(keyword, start);
+  async _fetchPage(keyword, start, options = {}) {
+    const searchUrl = this._buildSearchUrl(keyword, start, options);
     try {
       const response = await withRetry(
@@ -98,7 +99,7 @@ export class GoogleScraper extends BaseScraper {
   /**
    * 构建搜索 URL
    */
-  _buildSearchUrl(keyword, start = 0) {
+  _buildSearchUrl(keyword, start = 0, options = {}) {
     // 尺寸过滤映射 (Google 使用 tbs 参数)
     const sizeMap = {
       'small': 'isz:s',
@@ -123,9 +124,9 @@ export class GoogleScraper extends BaseScraper {
       'strict': 'active',
     };
-    const size = this.options?.size || 'all';
-    const aspect = this.options?.aspect || 'all';
-    const safeSearch = this.options?.safeSearch || 'moderate';
+    const size = options.size || 'all';
+    const aspect = options.aspect || 'all';
+    const safeSearch = options.safeSearch || 'moderate';
     const params = new URLSearchParams({
       q: keyword,
@@ -158,14 +159,16 @@ export class GoogleScraper extends BaseScraper {
     try {
       // 方法1: 使用正则提取图片URL
-      // Google 图片结果中的原图URL通常在特定的JSON结构中
-      const patterns = [
+      // 每次创建新的 RegExp 实例避免全局标志 lastIndex 状态污染
+      const patternDefs = [
         /\["(https?:\/\/[^"]+\.(?:jpg|jpeg|png|gif|webp)[^"]*)"/gi,
         /"ou":"(https?:\/\/[^"]+)"/gi,
         /\["(https?:\/\/[^"]+)",\d+,\d+\]/gi,
       ];
-      for (const pattern of patterns) {
+      for (const pattern of patternDefs) {
+        // 重置 lastIndex 确保每次从头开始匹配
+        pattern.lastIndex = 0;
         let match;
         while ((match = pattern.exec(html)) !== null) {
           const url = this._decodeUrl(match[1]);
@@ -240,14 +243,11 @@ export class GoogleScraper extends BaseScraper {
     // 排除 Google 自身的缩略图和无效链接
     const invalidPatterns = [
       'gstatic.com',
-      'google.com',
-      'googleapis.com',
-      'googleusercontent.com/encrypted',
+      'google.com/images',
+      'google.com/logos',
+      'googleapis.com/proxy',
       'data:image',
       'base64',
-      'favicon',
-      'logo',
-      'icon',
     ];
     for (const pattern of invalidPatterns) {
@@ -263,12 +263,6 @@ export class GoogleScraper extends BaseScraper {
     return hasImageExt || looksLikeImage || url.length > 50;
   }
-  /**
-   * 延迟函数
-   */
-  _delay(ms) {
-    return new Promise(resolve => setTimeout(resolve, ms));
-  }
 }
 export default GoogleScraper;

package/src/services/fileManager.js CHANGED Viewed

@@ -238,7 +238,7 @@ export class FileManager {
         response.data.pipe(writer);
-        writer.on('finish', () => {
+        writer.on('close', () => {
           if (!resolved) {
             resolved = true;
             clearTimeout(downloadTimeout);

package/src/services/imageProcessor.js CHANGED Viewed

@@ -6,8 +6,12 @@
 import sharp from 'sharp';
 import fs from 'fs-extra';
 import path from 'path';
+import pLimit from 'p-limit';
 import logger from '../infrastructure/logger.js';
+// sharp 是 CPU 密集型操作，限制并发避免卡死
+const imageProcessLimit = pLimit(2);
 export class ImageProcessor {
   /**
    * 预设尺寸配置
@@ -80,9 +84,6 @@ export class ImageProcessor {
       // 替换原文件
       await fs.move(tempPath, finalOutputPath, { overwrite: true });
-      // 获取处理后的信息
-      const newMetadata = await sharp(finalOutputPath).metadata();
       logger.debug(`Processed image: ${inputPath} -> ${width}x${height}`);
       return {
@@ -90,7 +91,7 @@ export class ImageProcessor {
         path: finalOutputPath,
         metadata: {
           original: { width: metadata.width, height: metadata.height },
-          processed: { width: newMetadata.width, height: newMetadata.height },
+          processed: { width, height },
         },
       };
     } catch (error) {
@@ -130,7 +131,7 @@ export class ImageProcessor {
    */
   async processMany(files, options = {}) {
     const results = await Promise.all(
-      files.map(file => this.processOne(file.path, options))
+      files.map(file => imageProcessLimit(() => this.processOne(file.path, options)))
     );
     const success = [];

package/src/services/linkValidator.js CHANGED Viewed

@@ -8,9 +8,8 @@ import httpClient from '../infrastructure/httpClient.js';
 import logger from '../infrastructure/logger.js';
 import config from '../config/index.js';
-// 并发验证配置 - 降低并发避免连接池耗尽
-const MAX_VALIDATE_CONCURRENCY = 15; // 验证并发 15
-const globalValidateLimit = pLimit(MAX_VALIDATE_CONCURRENCY);
+// 使用配置中的并发数，避免硬编码与配置不一致
+const globalValidateLimit = pLimit(config.MAX_VALIDATE_CONCURRENCY);
 export class LinkValidator {
   constructor() {
@@ -65,13 +64,21 @@ export class LinkValidator {
         return { url, valid: true, quality };
       }
+      // 某些服务器不支持 HEAD，返回 405/403 时尝试 GET 降级
+      if (response.status === 405 || response.status === 403) {
+        return await this._validateWithGet(url, fetchQuality);
+      }
       return { url, valid: false, error: `status=${response.status}` };
     } catch (error) {
       clearTimeout(timeoutId);
-      // 确保 abort controller 被清理
       if (!controller.signal.aborted) {
         controller.abort();
       }
+      // 网络错误时也尝试 GET 降级（某些 CDN 完全拒绝 HEAD）
+      if (error.response && (error.response.status === 405 || error.response.status === 403)) {
+        return await this._validateWithGet(url, fetchQuality);
+      }
       return { url, valid: false, error: 'timeout' };
     }
   }