npm - smart-image-scraper-mcp - Versions diffs - 2.5.2 → 2.7.0 - Mend

smart-image-scraper-mcp 2.5.2 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +22 -4
package/package.json +1 -1
package/src/config/index.js +4 -4
package/src/index.backup.js +340 -0
package/src/index.js +77 -200
package/src/index.new.js +213 -0
package/src/index.simple.js +213 -0
package/src/infrastructure/cache.js +1 -0
package/src/infrastructure/gracefulShutdown.js +4 -0
package/src/infrastructure/httpClient.js +15 -5
package/src/services/fileManager.js +61 -18
package/src/services/linkValidator.js +15 -8
package/src/services/orchestrator.js +79 -32
package/src/services/orchestrator.simple.js +259 -0

package/src/services/orchestrator.js CHANGED Viewed

@@ -20,8 +20,8 @@ import { searchCache } from '../infrastructure/cache.js';
 import { metrics } from '../infrastructure/metrics.js';
 import { requestQueue } from '../infrastructure/requestQueue.js';
-// 极速并发配置
-const MAX_CONCURRENT_KEYWORDS = 3;   // 每个请求内并行 3 个关键词
+// 并发配置 - 降低并发避免资源耗尽
+const MAX_CONCURRENT_KEYWORDS = 2;   // 每个请求内并行 2 个关键词
 // 关键词并发限制器
 const globalKeywordLimit = pLimit(MAX_CONCURRENT_KEYWORDS);
@@ -50,6 +50,25 @@ export class Orchestrator {
       .filter(k => k.length > 0);
   }
+  /**
+   * Convert a minimum-file-size option into a byte count.
+   *
+   * Recognised labels (matched case-insensitively) are '50kb', '100kb',
+   * '200kb', '500kb' and '1mb', with 1 KB = 1024 bytes. A falsy value, the
+   * string 'any', or an unrecognised label yields 0.
+   *
+   * NOTE(review): the lookup goes through a plain object literal, so inherited
+   * keys such as 'constructor' or '__proto__' resolve to truthy non-numeric
+   * values and are returned as-is. A truthy non-string argument throws a
+   * TypeError because `.toLowerCase()` is called unconditionally. Consider
+   * guarding with a `typeof` check and `Object.hasOwn`.
+   *
+   * @param {string} minFileSize - Size label, e.g. '100kb' or 'any'.
+   * @returns {number} Minimum size in bytes; 0 for 'any', empty or unrecognised input.
+   */
+  _parseMinFileSize(minFileSize) {
+    if (!minFileSize || minFileSize === 'any') return 0;
+    const sizeMap = {
+      '50kb': 50 * 1024,
+      '100kb': 100 * 1024,
+      '200kb': 200 * 1024,
+      '500kb': 500 * 1024,
+      '1mb': 1024 * 1024,
+    };
+    return sizeMap[minFileSize.toLowerCase()] || 0;
+  }
   /**
    * 处理单个关键词 - Link 模式
    * @param {string} keyword - 关键词
@@ -60,17 +79,19 @@ export class Orchestrator {
    */
   async processKeywordLink(keyword, count, source, options = {}) {
     const startTime = Date.now();
-    const fastMode = options.fastMode !== false;
-    const prioritizeQuality = options.prioritizeQuality === true;
+    // 根据 quality 参数决定模式
+    const qualityMode = options.quality || 'balanced';
+    const fastMode = qualityMode === 'fast';
+    const prioritizeQuality = qualityMode === 'high';
+    const minFileSize = this._parseMinFileSize(options.minFileSize);
     try {
       const scraper = getScraper(source);
       // 多搜索一些以确保有足够的结果
       const searchCount = Math.max(count * 3, 10);
-      // 使用统一的缓存键策略
-      const cacheKey = { keyword, source, size: options.size, aspect: options.aspect };
-      const cachedUrls = searchCache.getSearchResult(keyword, source, cacheKey);
+      // 使用统一的缓存键策略（options 已包含 size, aspect, safeSearch）
+      const cachedUrls = searchCache.getSearchResult(keyword, source, options);
       let rawUrls;
       if (cachedUrls && cachedUrls.length >= count) {
@@ -81,7 +102,7 @@ export class Orchestrator {
         logger.info(`[SEARCH] "${keyword}" (target: ${searchCount})...`);
         rawUrls = await scraper.search(keyword, searchCount, options);
         if (rawUrls.length > 0) {
-          searchCache.setSearchResult(keyword, source, cacheKey, rawUrls);
+          searchCache.setSearchResult(keyword, source, options, rawUrls);
         }
         metrics.recordCacheMiss();
       }
@@ -95,21 +116,36 @@ export class Orchestrator {
         };
       }
-      // 快速模式：直接返回搜索结果（不验证）
+      // 根据 quality 模式处理
       let resultUrls;
-      if (fastMode && !prioritizeQuality) {
-        // 快速模式：直接使用搜索结果
+      let qualityModeLabel;
+      if (fastMode) {
+        // fast 模式：直接使用搜索结果，不验证
         resultUrls = rawUrls.slice(0, count);
+        qualityModeLabel = '快速模式（跳过验证）';
         logger.info(`[FAST] "${keyword}" - ${resultUrls.length} URLs`);
       } else {
-        // 完整验证模式：验证不通过的继续搜索更多
+        // balanced 或 high 模式：验证链接
         const { valid } = await this.linkValidator.validateMany(rawUrls, {
           fetchQuality: prioritizeQuality,
           sortByQuality: prioritizeQuality,
+          minFileSize: minFileSize,
         });
-        resultUrls = valid.slice(0, count).map(v => v.url);
-        // 如果验证通过的不够，记录警告
+        // 过滤最小文件大小
+        let filteredValid = valid;
+        if (minFileSize > 0) {
+          filteredValid = valid.filter(v => {
+            const size = v.quality?.contentLength || 0;
+            return size >= minFileSize || size === 0; // size=0 表示未知，保留
+          });
+          logger.info(`[FILTER] minFileSize=${options.minFileSize}: ${valid.length} -> ${filteredValid.length}`);
+        }
+        resultUrls = filteredValid.slice(0, count).map(v => v.url);
+        qualityModeLabel = prioritizeQuality ? '高质量模式（验证+排序）' : '平衡模式（验证）';
         if (resultUrls.length < count) {
           logger.warn(`[VALIDATE] "${keyword}" - only ${resultUrls.length}/${count} valid`);
         }
@@ -122,7 +158,8 @@ export class Orchestrator {
         totalSearched: rawUrls.length,
         urls: resultUrls,
         count: resultUrls.length,
-        fastMode,
+        qualityMode,
+        qualityModeLabel,
         duration: Date.now() - startTime,
       };
     } catch (error) {
@@ -146,16 +183,18 @@ export class Orchestrator {
    */
   async processKeywordDownload(keyword, count, source, options = {}) {
     const startTime = Date.now();
-    const prioritizeQuality = options.prioritizeQuality !== false;
+    // 根据 quality 参数决定模式（download 模式默认高质量）
+    const qualityMode = options.quality || 'balanced';
+    const prioritizeQuality = qualityMode !== 'fast';
+    const minFileSize = this._parseMinFileSize(options.minFileSize);
     try {
       const scraper = getScraper(source);
       // 统一搜索数量策略：与 Link 模式一致
       const searchCount = Math.max(count * 3, 10);
-      // 尝试从缓存获取（使用统一的缓存键策略）
-      const cacheKey = { keyword, source, size: options.size, aspect: options.aspect };
-      const cachedUrls = searchCache.getSearchResult(keyword, source, cacheKey);
+      // 尝试从缓存获取（options 已包含 size, aspect, safeSearch）
+      const cachedUrls = searchCache.getSearchResult(keyword, source, options);
       let rawUrls;
       if (cachedUrls && cachedUrls.length >= count) {
@@ -166,7 +205,7 @@ export class Orchestrator {
         logger.info(`[SEARCH] "${keyword}" (target: ${searchCount})...`);
         rawUrls = await scraper.search(keyword, searchCount, options);
         if (rawUrls.length > 0) {
-          searchCache.setSearchResult(keyword, source, cacheKey, rawUrls);
+          searchCache.setSearchResult(keyword, source, options, rawUrls);
         }
         metrics.recordCacheMiss();
       }
@@ -180,17 +219,29 @@ export class Orchestrator {
         };
       }
-      // 先验证链接并按质量排序
+      // 根据 quality 模式处理
       let urlsToDownload = rawUrls.slice(0, searchCount);
       if (prioritizeQuality) {
-        logger.info(`Validating and sorting ${urlsToDownload.length} URLs by quality...`);
+        const sortByQuality = qualityMode === 'high';
+        logger.info(`Validating ${urlsToDownload.length} URLs (quality=${qualityMode})...`);
         const { valid } = await this.linkValidator.validateMany(urlsToDownload, {
-          fetchQuality: true,
-          sortByQuality: true,
+          fetchQuality: sortByQuality,
+          sortByQuality: sortByQuality,
+          minFileSize: minFileSize,
         });
-        // 使用排序后的URL列表
-        urlsToDownload = valid.map(v => v.url);
-        logger.info(`Quality sorted: ${urlsToDownload.length} valid URLs`);
+        // 过滤最小文件大小
+        let filteredValid = valid;
+        if (minFileSize > 0) {
+          filteredValid = valid.filter(v => {
+            const size = v.quality?.contentLength || 0;
+            return size >= minFileSize || size === 0;
+          });
+          logger.info(`[FILTER] minFileSize: ${valid.length} -> ${filteredValid.length}`);
+        }
+        urlsToDownload = filteredValid.map(v => v.url);
+        logger.info(`Quality filtered: ${urlsToDownload.length} valid URLs`);
       }
       // 下载图片（已按质量排序，高质量优先）
@@ -362,11 +413,7 @@ export class Orchestrator {
       if (r.mode === 'link') {
         lines.push(`- 搜索到: ${r.totalSearched || 0} 张`);
-        if (r.fastMode) {
-          lines.push(`- 模式: 快速模式（跳过验证）`);
-        } else {
-          lines.push(`- 验证通过: ${r.totalValidated || r.count || 0} 张`);
-        }
+        lines.push(`- 质量模式: ${r.qualityModeLabel || '快速模式'}`);
         lines.push(`- 返回: ${r.count || 0} 张`);
         lines.push(`- 耗时: ${(r.duration / 1000).toFixed(2)}秒`);
         lines.push('');

package/src/services/orchestrator.simple.js ADDED Viewed

@@ -0,0 +1,259 @@
+/**
+ * Orchestrator - simplified variant.
+ *
+ * Design intent stated by the original author: follow the way mainstream MCP
+ * servers are implemented - stateless, no global cache, every request
+ * handled independently.
+ *
+ * The class is exported both by name and as the module default.
+ *
+ * NOTE(review): `config` is imported below but never referenced in this file,
+ * and `this.linkValidator` is created in the constructor but not used by any
+ * method defined here.
+ */
+import pLimit from 'p-limit';
+import { getScraper } from '../providers/index.js';
+import { LinkValidator } from './linkValidator.js';
+import { FileManager } from './fileManager.js';
+import { ImageProcessor } from './imageProcessor.js';
+import config from '../config/index.js';
+export class Orchestrator {
+  constructor() {
+    // Per-instance limiter (created with a limit of 2) rather than shared global state
+    this.keywordLimit = pLimit(2);
+    this.linkValidator = new LinkValidator();
+    this.fileManager = new FileManager();
+    this.imageProcessor = new ImageProcessor();
+  }
+  /**
+   * Split a comma-separated query string into individual keywords.
+   *
+   * Each piece is trimmed and empty pieces are dropped.
+   *
+   * @param {string} query - Comma-separated keywords, e.g. 'cat, dog'.
+   * @returns {string[]} Keywords in input order; [] when query is falsy or not a string.
+   */
+  parseKeywords(query) {
+    if (!query || typeof query !== 'string') return [];
+    return query.split(',').map(k => k.trim()).filter(k => k.length > 0);
+  }
+  /**
+   * Run one scraping task: parse the keywords, process each of them through
+   * the per-instance limiter, and aggregate the per-keyword results.
+   *
+   * Any `mode` other than the exact string 'link' (including undefined) is
+   * handled as a download. Both per-keyword handlers catch their own errors
+   * and report them as `{ success: false, ... }` entries, so the aggregate
+   * result has `success: true` even when every keyword failed; inspect
+   * `successCount` / `failedCount` for the actual outcome.
+   *
+   * @param {Object} params - Task parameters.
+   * @param {string} params.query - Comma-separated keywords.
+   * @param {string} params.mode - 'link' to return URLs; anything else downloads.
+   * @param {number} [params.count=10] - Requested items per keyword.
+   * @param {string} [params.source='bing'] - Identifier passed to getScraper.
+   * @param {string} [params.size='all'] - Forwarded to the scraper in options.
+   * @param {string} [params.safeSearch='moderate'] - Forwarded to the scraper in options.
+   * @param {string} [params.aspect='all'] - Forwarded to the scraper in options.
+   * @param {?string} [params.targetSize=null] - When set, downloads are post-processed to this size.
+   * @param {string} [params.fit='cover'] - Forwarded to the image processor.
+   * @param {string} [params.position='center'] - Forwarded to the image processor.
+   * @returns {Promise<Object>} `{ success: false, error }` when no keyword could
+   *   be parsed; otherwise `{ success: true, mode, source, totalKeywords,
+   *   successCount, failedCount, results, duration }` with duration in ms.
+   */
+  async execute(params) {
+    const {
+      query, mode, count = 10, source = 'bing',
+      size = 'all', safeSearch = 'moderate', aspect = 'all',
+      targetSize = null, fit = 'cover', position = 'center'
+    } = params;
+    const options = { size, safeSearch, aspect, targetSize, fit, position };
+    const startTime = Date.now();
+    const keywords = this.parseKeywords(query);
+    if (keywords.length === 0) {
+      return { success: false, error: '请提供有效的搜索关键词' };
+    }
+    // Pick the handler that matches the requested mode
+    const processFunc = mode === 'link'
+      ? this._processLink.bind(this)
+      : this._processDownload.bind(this);
+    // Process keywords concurrently (through the per-instance limiter)
+    const results = await Promise.all(
+      keywords.map(keyword =>
+        this.keywordLimit(() => processFunc(keyword, count, source, options))
+      )
+    );
+    const successResults = results.filter(r => r.success);
+    const failedResults = results.filter(r => !r.success);
+    return {
+      success: true,
+      mode,
+      source,
+      totalKeywords: keywords.length,
+      successCount: successResults.length,
+      failedCount: failedResults.length,
+      results,
+      duration: Date.now() - startTime,
+    };
+  }
+  /**
+   * Handle one keyword in link mode: search and return the first `count`
+   * URLs without validating them.
+   *
+   * The search asks for max(count * 2, 10) results. Errors thrown while
+   * searching are caught and reported in the returned object.
+   * Assumes scraper.search resolves to an array of URLs - TODO confirm
+   * against the providers module.
+   *
+   * @param {string} keyword - Keyword to search for.
+   * @param {number} count - Maximum number of URLs to return.
+   * @param {string} source - Identifier passed to getScraper.
+   * @param {Object} options - Passed through to scraper.search.
+   * @returns {Promise<Object>} On success `{ keyword, success: true, mode: 'link',
+   *   totalSearched, urls, count, fastMode: true, duration }`; on an empty
+   *   search or an error `{ keyword, success: false, error, duration }`.
+   */
+  async _processLink(keyword, count, source, options) {
+    const startTime = Date.now();
+    try {
+      const scraper = getScraper(source);
+      const searchCount = Math.max(count * 2, 10);
+      // Search directly, without any cache (simplified design)
+      const rawUrls = await scraper.search(keyword, searchCount, options);
+      if (rawUrls.length === 0) {
+        return {
+          keyword,
+          success: false,
+          error: '未找到任何图片',
+          duration: Date.now() - startTime,
+        };
+      }
+      // Fast mode: return the search results as-is
+      const resultUrls = rawUrls.slice(0, count);
+      return {
+        keyword,
+        success: true,
+        mode: 'link',
+        totalSearched: rawUrls.length,
+        urls: resultUrls,
+        count: resultUrls.length,
+        fastMode: true,
+        duration: Date.now() - startTime,
+      };
+    } catch (error) {
+      return {
+        keyword,
+        success: false,
+        error: error.message,
+        duration: Date.now() - startTime,
+      };
+    }
+  }
+  /**
+   * Handle one keyword in download mode: search, download, optionally resize,
+   * then save metadata for the files that remain.
+   *
+   * The search asks for max(count * 2, 10) results and every returned URL is
+   * handed to fileManager.downloadMany; only the first `count` successful
+   * downloads are kept. `totalDownloaded` and `totalFailed` therefore describe
+   * the whole download attempt, while `count` / `files` describe what is kept.
+   *
+   * When options.targetSize is set and imageProcessor.parseTargetSize returns
+   * a truthy value, the kept downloads are replaced by the successfully
+   * processed ones. If parsing yields a falsy value the resize step is
+   * skipped, yet the result still echoes options.targetSize.
+   *
+   * Errors thrown anywhere in the pipeline are caught and reported in the
+   * returned object.
+   *
+   * @param {string} keyword - Keyword to search for.
+   * @param {number} count - Maximum number of files to keep.
+   * @param {string} source - Identifier passed to getScraper.
+   * @param {Object} options - Passed to scraper.search; targetSize, fit and
+   *   position are also read here for post-processing.
+   * @returns {Promise<Object>} On success `{ keyword, success: true,
+   *   mode: 'download', totalSearched, totalDownloaded, totalFailed,
+   *   totalProcessed, totalProcessFailed, files, count, saveDir, metadataPath,
+   *   targetSize, duration }`; on an empty search or an error
+   *   `{ keyword, success: false, error, duration }`.
+   */
+  async _processDownload(keyword, count, source, options) {
+    const startTime = Date.now();
+    try {
+      const scraper = getScraper(source);
+      const searchCount = Math.max(count * 2, 10);
+      // Search
+      const rawUrls = await scraper.search(keyword, searchCount, options);
+      if (rawUrls.length === 0) {
+        return {
+          keyword,
+          success: false,
+          error: '未找到任何图片',
+          duration: Date.now() - startTime,
+        };
+      }
+      // Download the images
+      const { success, failed } = await this.fileManager.downloadMany(rawUrls, keyword);
+      let resultDownloads = success.slice(0, count);
+      // If a target size was requested, post-process the downloads
+      let processedCount = 0;
+      let processFailedCount = 0;
+      if (options.targetSize && resultDownloads.length > 0) {
+        const targetSize = this.imageProcessor.parseTargetSize(options.targetSize);
+        if (targetSize) {
+          const processResult = await this.imageProcessor.processMany(resultDownloads, {
+            width: targetSize.width,
+            height: targetSize.height,
+            fit: options.fit || 'cover',
+            position: options.position || 'center',
+          });
+          resultDownloads = processResult.success;
+          processedCount = processResult.success.length;
+          processFailedCount = processResult.failed.length;
+        }
+      }
+      // Save metadata
+      let metadataPath = null;
+      if (resultDownloads.length > 0) {
+        metadataPath = await this.fileManager.saveMetadata(keyword, resultDownloads);
+      }
+      return {
+        keyword,
+        success: true,
+        mode: 'download',
+        totalSearched: rawUrls.length,
+        totalDownloaded: success.length,
+        totalFailed: failed.length,
+        totalProcessed: processedCount,
+        totalProcessFailed: processFailedCount,
+        files: resultDownloads,
+        count: resultDownloads.length,
+        saveDir: this.fileManager.getKeywordDir(keyword),
+        metadataPath,
+        targetSize: options.targetSize || null,
+        duration: Date.now() - startTime,
+      };
+    } catch (error) {
+      return {
+        keyword,
+        success: false,
+        error: error.message,
+        duration: Date.now() - startTime,
+      };
+    }
+  }
+  /**
+   * Render the object returned by execute() as a Markdown report.
+   *
+   * A failed task (`result.success` falsy) produces a short failure message
+   * containing `result.error`. Otherwise the report starts with a summary
+   * (mode, source, keyword count, success/failure counts, total duration in
+   * seconds with two decimals) followed by one section per keyword: the error
+   * for a failed keyword, a numbered URL list for link results, or download
+   * statistics plus a numbered list of file paths for download results. The
+   * resize statistics lines are only emitted when `r.targetSize` is truthy.
+   *
+   * @param {Object} result - Aggregate result in the shape produced by execute().
+   * @returns {string} Markdown text with lines joined by '\n'.
+   */
+  formatResult(result) {
+    if (!result.success) {
+      return `## ❌ 任务失败\n\n**错误原因**: ${result.error}`;
+    }
+    const lines = [];
+    lines.push(`# 📷 图片抓取报告`);
+    lines.push('');
+    lines.push(`- **模式**: ${result.mode === 'link' ? '链接提取' : '本地下载'}`);
+    lines.push(`- **搜索源**: ${result.source}`);
+    lines.push(`- **关键词数量**: ${result.totalKeywords}`);
+    lines.push(`- **成功**: ${result.successCount} | **失败**: ${result.failedCount}`);
+    lines.push(`- **总耗时**: ${(result.duration / 1000).toFixed(2)}秒`);
+    lines.push('');
+    for (const r of result.results) {
+      lines.push(`## 🔍 关键词: ${r.keyword}`);
+      lines.push('');
+      if (!r.success) {
+        lines.push(`❌ **失败**: ${r.error}`);
+        lines.push('');
+        continue;
+      }
+      if (r.mode === 'link') {
+        lines.push(`- 搜索到: ${r.totalSearched || 0} 张`);
+        lines.push(`- 返回: ${r.count || 0} 张`);
+        lines.push(`- 耗时: ${(r.duration / 1000).toFixed(2)}秒`);
+        lines.push('');
+        lines.push('### 有效链接');
+        lines.push('');
+        (r.urls || []).forEach((url, i) => {
+          lines.push(`${i + 1}. ${url}`);
+        });
+      } else {
+        lines.push(`- 搜索到: ${r.totalSearched} 张`);
+        lines.push(`- 下载成功: ${r.totalDownloaded} 张`);
+        lines.push(`- 下载失败: ${r.totalFailed} 张`);
+        if (r.targetSize) {
+          lines.push(`- 尺寸处理: ${r.totalProcessed} 成功, ${r.totalProcessFailed} 失败`);
+          lines.push(`- 目标尺寸: ${r.targetSize}`);
+        }
+        lines.push(`- 最终保存: ${r.count} 张`);
+        lines.push(`- 存储目录: \`${r.saveDir}\``);
+        lines.push(`- 耗时: ${(r.duration / 1000).toFixed(2)}秒`);
+        lines.push('');
+        lines.push('### 已下载文件');
+        lines.push('');
+        r.files.forEach((file, i) => {
+          lines.push(`${i + 1}. \`${file.path}\``);
+        });
+      }
+      lines.push('');
+    }
+    return lines.join('\n');
+  }
+}
+export default Orchestrator;