npm - smart-image-scraper-mcp - Versions diffs - 2.5.2 → 2.7.0 - Mend

smart-image-scraper-mcp 2.5.2 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +22 -4
package/package.json +1 -1
package/src/config/index.js +4 -4
package/src/index.backup.js +340 -0
package/src/index.js +77 -200
package/src/index.new.js +213 -0
package/src/index.simple.js +213 -0
package/src/infrastructure/cache.js +1 -0
package/src/infrastructure/gracefulShutdown.js +4 -0
package/src/infrastructure/httpClient.js +15 -5
package/src/services/fileManager.js +61 -18
package/src/services/linkValidator.js +15 -8
package/src/services/orchestrator.js +79 -32
package/src/services/orchestrator.simple.js +259 -0

package/src/index.simple.js ADDED Viewed

@@ -0,0 +1,213 @@
+#!/usr/bin/env node
+/**
+ * Web-wide smart image scraping MCP server - simplified edition
+ * Modeled on how mainstream MCP servers are implemented
+ *
+ * Design principles:
+ * - Simple: minimize infrastructure code
+ * - Stateless: every request is handled independently
+ * - Reliable: simple error handling, avoiding resource leaks
+ */
+import { Server } from '@modelcontextprotocol/sdk/server/index.js';
+import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
+import {
+  CallToolRequestSchema,
+  ListToolsRequestSchema,
+} from '@modelcontextprotocol/sdk/types.js';
+import { createRequire } from 'module';
+import { Orchestrator } from './services/orchestrator.js';
+import config from './config/index.js';
+// Read the version number from package.json (createRequire supplies a require() for this ES module so the JSON file can be loaded)
+const require = createRequire(import.meta.url);
+const packageJson = require('../package.json');
+// Create the MCP server (mainstream practice: simple configuration); it advertises only the "tools" capability
+const server = new Server(
+  {
+    name: 'smart-image-scraper',
+    version: packageJson.version,
+  },
+  {
+    capabilities: {
+      tools: {},
+    },
+  }
+);
+// Tool definition (mainstream practice: a concise schema): name, description and input schema of the single 'smart_scraper' tool
+const SMART_SCRAPER_TOOL = {
+  name: 'smart_scraper',
+  description: `全网智能图片抓取工具 - 从 Bing/Google 搜索、验证、下载高质量图片。
+【核心功能】
+1. 搜索图片链接 (mode=link) - 返回验证过的图片URL列表
+2. 下载图片 (mode=download) - 下载到本地，自动按质量排序优先高清
+3. 尺寸统一 (targetSize) - 下载后自动裁剪/缩放到指定尺寸
+4. 宽高比过滤 (aspect) - 横向/竖向/正方形
+【参数选择指南】
+- 用户要"找/搜索/查找图片" → mode="link"
+- 用户要"下载/保存/获取图片" → mode="download"
+- 用户要"高清/大图/壁纸" → size="large" 或 "wallpaper"
+- 用户要"电脑壁纸/横屏/横向" → aspect="wide"
+- 用户要"手机壁纸/竖屏/竖向" → aspect="tall"
+- 用户要"统一尺寸/固定大小" → targetSize="1920x1080" 或预设名
+- 用户要"多种类型图片" → query="猫,狗,鸟"（英文逗号分隔）
+【预设尺寸名称】
+- 电脑壁纸: desktop_1080p(1920x1080), desktop_2k(2560x1440), desktop_4k(3840x2160)
+- 手机壁纸: mobile_hd(1080x1920), mobile_2k(1440x2560)
+- 正方形: square_1080(1080x1080), square_512(512x512)
+- 社交媒体: instagram(1080x1080), twitter(1200x675), facebook(1200x630)
+【调用示例】
+1. 搜索5张猫的图片: {"query":"可爱的猫","mode":"link","count":5}
+2. 下载10张高清风景图: {"query":"风景","mode":"download","count":10,"size":"large"}
+3. 下载电脑壁纸并统一为1080p: {"query":"风景","mode":"download","count":10,"aspect":"wide","targetSize":"desktop_1080p"}
+4. 下载手机壁纸: {"query":"动漫","mode":"download","count":10,"aspect":"tall","targetSize":"mobile_hd"}
+5. 批量下载多类图片: {"query":"猫,狗,兔子","mode":"download","count":5}`,
+  inputSchema: {
+    type: 'object',
+    properties: {
+      query: {
+        type: 'string',
+        description: '搜索关键词。批量搜索用英文逗号分隔，如 "猫,狗,鸟"。建议使用具体描述性词语如"可爱的橘猫"而非"猫"',
+      },
+      mode: {
+        type: 'string',
+        enum: ['link', 'download'],
+        description: "运行模式。link=仅返回验证过的图片URL列表（用户只需要链接时使用）；download=下载图片到本地文件系统（用户说下载/保存时使用）",
+      },
+      count: {
+        type: 'number',
+        description: '每个关键词获取的图片数量。范围1-100，推荐1-20。用户说"几张"用5-10，说"很多"用20-30',
+        default: 10,
+      },
+      source: {
+        type: 'string',
+        enum: ['bing', 'google'],
+        description: '搜索引擎。bing更稳定推荐优先使用，google结果可能更丰富但可能被限制',
+        default: 'bing',
+      },
+      size: {
+        type: 'string',
+        enum: ['all', 'small', 'medium', 'large', 'wallpaper'],
+        description: '图片尺寸。all=不限；small=小图/图标；medium=中图；large=大图/高清；wallpaper=壁纸级别(1080p+)',
+        default: 'all',
+      },
+      aspect: {
+        type: 'string',
+        enum: ['all', 'wide', 'tall', 'square'],
+        description: '图片宽高比。all=不限；wide=横向/宽屏(电脑壁纸)；tall=纵向/竖屏(手机壁纸)；square=正方形',
+        default: 'all',
+      },
+      targetSize: {
+        type: 'string',
+        description: '目标尺寸，下载后统一裁剪/缩放到此尺寸。格式: "宽x高"(如"1920x1080")或预设名(desktop_1080p/desktop_2k/desktop_4k/mobile_hd/mobile_2k/square_1080/instagram/twitter/facebook)',
+      },
+      fit: {
+        type: 'string',
+        enum: ['cover', 'contain', 'fill'],
+        description: '尺寸处理时的适应模式。cover=裁剪填充(默认,不留白)；contain=包含留白；fill=拉伸填充',
+        default: 'cover',
+      },
+      safeSearch: {
+        type: 'string',
+        enum: ['off', 'moderate', 'strict'],
+        description: '安全搜索。off=关闭；moderate=中等过滤(默认)；strict=严格过滤(儿童/家庭内容)',
+        default: 'moderate',
+      },
+    },
+    required: ['query', 'mode'], // every other property is optional
+  },
+};
+// Register the tool-list handler (mainstream practice: a simple return); it always answers with the single smart_scraper tool
+server.setRequestHandler(ListToolsRequestSchema, async () => ({
+  tools: [SMART_SCRAPER_TOOL],
+}));
+// Register the tool-call handler (mainstream practice: create a new instance per request to avoid state pollution)
+server.setRequestHandler(CallToolRequestSchema, async (request) => {
+  const { name, arguments: args } = request.params;
+  if (name !== 'smart_scraper') { // only one tool is served; any other name gets an error result
+    return {
+      content: [{ type: 'text', text: `未知工具: ${name}` }],
+      isError: true,
+    };
+  }
+  // Argument validation (mainstream practice: fail fast)
+  if (!args?.query || typeof args.query !== 'string' || !args.query.trim()) { // query must be a non-blank string
+    return {
+      content: [{ type: 'text', text: '错误: 请提供有效的搜索关键词 (query)' }],
+      isError: true,
+    };
+  }
+  if (!args.mode || !['link', 'download'].includes(args.mode)) {
+    return {
+      content: [{ type: 'text', text: "错误: 请指定有效的运行模式 (mode): 'link' 或 'download'" }],
+      isError: true,
+    };
+  }
+  try {
+    // Mainstream practice: create a new Orchestrator instance for every request to stay stateless
+    const orchestrator = new Orchestrator();
+    // Normalize the arguments
+    const params = {
+      query: args.query.trim(),
+      mode: args.mode,
+      count: Math.min(Math.max(parseInt(args.count, 10) || 10, 1), 100), // unparsable or 0 falls back to 10; result is clamped to 1..100
+      source: ['bing', 'google'].includes(args.source) ? args.source : 'bing', // anything else falls back to 'bing'
+      size: args.size || 'all', // NOTE(review): size/aspect/fit/safeSearch/targetSize are passed through without being checked against the schema enums
+      aspect: args.aspect || 'all',
+      targetSize: args.targetSize || null,
+      fit: args.fit || 'cover',
+      safeSearch: args.safeSearch || 'moderate',
+    };
+    // Run the task
+    const result = await orchestrator.execute(params);
+    // Format the output
+    const formattedResult = orchestrator.formatResult(result);
+    return {
+      content: [{ type: 'text', text: formattedResult }],
+    };
+  } catch (error) {
+    // Mainstream practice: concise error handling - log it, then return the message as an error result instead of throwing
+    console.error(`[MCP Error] ${error.message}`); // presumably stderr is used because stdout carries the stdio transport traffic - confirm
+    return {
+      content: [{
+        type: 'text',
+        text: `## ❌ 执行错误\n\n**错误信息**: ${error.message}\n\n请检查网络连接或稍后重试。`
+      }],
+      isError: true,
+    };
+  }
+});
+// Start the server (mainstream practice: minimal startup): log the version and save root, then connect the server over a stdio transport
+async function main() {
+  console.error(`[MCP] Starting Smart Image Scraper v${packageJson.version}`);
+  console.error(`[MCP] Save root: ${config.SAVE_ROOT}`);
+  const transport = new StdioServerTransport();
+  await server.connect(transport);
+  console.error('[MCP] Server is running');
+}
+main().catch((error) => { // entry point: a rejected startup is logged and the process exits with status 1
+  console.error(`[MCP] Startup error: ${error.message}`);
+  process.exit(1);
+});

package/src/infrastructure/cache.js CHANGED Viewed

@@ -201,6 +201,7 @@ export class SearchCache extends LRUCache {
       keyword: keyword.toLowerCase().trim(),
       source,
       size: options.size || 'all',
+      aspect: options.aspect || 'all',
       safeSearch: options.safeSearch || 'moderate',
     });
   }

package/src/infrastructure/gracefulShutdown.js CHANGED Viewed

@@ -6,6 +6,7 @@
 import logger from './logger.js';
 import { metrics } from './metrics.js';
 import { searchCache, validationCache } from './cache.js';
+import { destroyAgents } from './httpClient.js';
 /**
  * 优雅关闭管理器
@@ -128,6 +129,9 @@ export class GracefulShutdown {
       searchCache.clear();
       validationCache.clear();
+      // 销毁 HTTP 连接池
+      destroyAgents();
       logger.info('Graceful shutdown complete');
     } catch (error) {
       logger.error('Error during shutdown', { error: error.message });

package/src/infrastructure/httpClient.js CHANGED Viewed

@@ -10,24 +10,34 @@ import config from '../config/index.js';
 import logger from './logger.js';
 // HTTP connection pool configuration - reuse TCP connections to greatly improve performance
-// Note: no timeout is set here; axios controls timeouts so connections are not closed too early
+// Add a timeout so idle connections are released promptly and the connection pool is not exhausted
 const httpAgent = new http.Agent({
   keepAlive: true,           // enable Keep-Alive
   keepAliveMsecs: 1000,      // Keep-Alive probe interval
-  maxSockets: 100,           // raise the maximum number of concurrent connections
-  maxFreeSockets: 20,        // raise the maximum number of idle connections
+  maxSockets: 50,            // lower the maximum number of concurrent connections to avoid resource exhaustion
+  maxFreeSockets: 10,        // lower the maximum number of idle connections
   scheduling: 'lifo',        // last in, first out: prefer the most recently used connection
+  timeout: 30000,            // close idle connections after 30 seconds
 });
 const httpsAgent = new https.Agent({
   keepAlive: true,
   keepAliveMsecs: 1000,
-  maxSockets: 100,
-  maxFreeSockets: 20,
+  maxSockets: 50,            // lower the maximum number of concurrent connections
+  maxFreeSockets: 10,        // lower the maximum number of idle connections
   scheduling: 'lifo',
+  timeout: 30000,            // close idle connections after 30 seconds
   rejectUnauthorized: false, // allow self-signed certificates; NOTE(review): this turns off TLS certificate verification for requests using this agent
 });
+// Note: destroy() is no longer used, because it destroys every connection, including ones still in use
+// The connection pool manages idle-connection timeouts automatically (via the timeout option)
+// For manual cleanup, the exported destroyAgents function can be called at shutdown
+export function destroyAgents() { // NOTE(review): despite the note above, this calls destroy() on both agents
+  httpAgent.destroy();
+  httpsAgent.destroy();
+}
 const httpClient = axios.create({
   timeout: config.REQUEST_TIMEOUT,
   httpAgent,                 // ✅ 使用连接池

package/src/services/fileManager.js CHANGED Viewed

@@ -126,6 +126,27 @@ export class FileManager {
   async downloadOne(url, keyword) {
     let filePath = null;
     let writer = null;
+    let response = null;
+    let downloadTimeout = null;
+    let resolved = false;
+    // 辅助函数：安全地销毁流
+    const safeDestroy = () => {
+      try {
+        if (response?.data && !response.data.destroyed) {
+          response.data.destroy();
+        }
+      } catch (e) { /* ignore */ }
+      try {
+        if (writer && !writer.destroyed) {
+          writer.destroy();
+        }
+      } catch (e) { /* ignore */ }
+      if (downloadTimeout) {
+        clearTimeout(downloadTimeout);
+        downloadTimeout = null;
+      }
+    };
     try {
       // 验证 URL 格式
@@ -139,28 +160,28 @@ export class FileManager {
         return { success: false, url, error: 'Malformed URL' };
       }
-      const response = await httpClient.get(url, {
+      response = await httpClient.get(url, {
         responseType: 'stream',
-        timeout: 30000, // 下载超时30秒
+        timeout: 20000, // 连接超时20秒
         maxContentLength: 50 * 1024 * 1024, // 最大50MB
         maxBodyLength: 50 * 1024 * 1024,
       });
       if (response.status !== 200) {
+        safeDestroy();
         return { success: false, url, error: `HTTP ${response.status}` };
       }
       const contentType = response.headers['content-type'] || '';
       if (!contentType.includes('image')) {
-        // 销毁响应流
-        response.data.destroy();
+        safeDestroy();
         return { success: false, url, error: 'Not an image' };
       }
       // 检查文件大小（如果有 content-length）
       const contentLength = parseInt(response.headers['content-length'] || '0', 10);
       if (contentLength > 50 * 1024 * 1024) {
-        response.data.destroy();
+        safeDestroy();
         return { success: false, url, error: 'File too large (>50MB)' };
       }
@@ -185,38 +206,60 @@ export class FileManager {
         let downloadedBytes = 0;
         const maxBytes = 50 * 1024 * 1024;
+        // 下载超时保护（30秒）
+        downloadTimeout = setTimeout(() => {
+          if (!resolved) {
+            resolved = true;
+            logger.warn(`Download timeout: ${url}`);
+            safeDestroy();
+            this._cleanupFile(filePath);
+            resolve({ success: false, url, error: 'Download timeout' });
+          }
+        }, 30000);
         response.data.on('data', (chunk) => {
           downloadedBytes += chunk.length;
-          if (downloadedBytes > maxBytes) {
-            response.data.destroy();
-            writer.destroy();
-            // 清理临时文件
+          if (downloadedBytes > maxBytes && !resolved) {
+            resolved = true;
+            safeDestroy();
             this._cleanupFile(filePath);
             resolve({ success: false, url, error: 'File too large during download' });
           }
         });
         response.data.on('error', (error) => {
-          writer.destroy();
-          this._cleanupFile(filePath);
-          resolve({ success: false, url, error: `Stream error: ${error.message}` });
+          if (!resolved) {
+            resolved = true;
+            safeDestroy();
+            this._cleanupFile(filePath);
+            resolve({ success: false, url, error: `Stream error: ${error.message}` });
+          }
         });
         response.data.pipe(writer);
         writer.on('finish', () => {
-          logger.debug(`Downloaded: ${filePath}`);
-          resolve({ success: true, url, path: filePath });
+          if (!resolved) {
+            resolved = true;
+            clearTimeout(downloadTimeout);
+            logger.debug(`Downloaded: ${filePath}`);
+            resolve({ success: true, url, path: filePath });
+          }
         });
         writer.on('error', (error) => {
-          logger.warn(`Write error: ${filePath}`, { error: error.message });
-          this._cleanupFile(filePath);
-          resolve({ success: false, url, error: error.message });
+          if (!resolved) {
+            resolved = true;
+            logger.warn(`Write error: ${filePath}`, { error: error.message });
+            safeDestroy();
+            this._cleanupFile(filePath);
+            resolve({ success: false, url, error: error.message });
+          }
         });
       });
     } catch (error) {
-      // 清理可能创建的临时文件
+      // 清理可能创建的临时文件和流
+      safeDestroy();
       if (filePath) {
         this._cleanupFile(filePath);
       }

package/src/services/linkValidator.js CHANGED Viewed

@@ -8,8 +8,8 @@ import httpClient from '../infrastructure/httpClient.js';
 import logger from '../infrastructure/logger.js';
 import config from '../config/index.js';
-// 极速并发验证配置
-const MAX_VALIDATE_CONCURRENCY = 30; // 极速验证并发 30
+// 并发验证配置 - 降低并发避免连接池耗尽
+const MAX_VALIDATE_CONCURRENCY = 15; // 验证并发 15
 const globalValidateLimit = pLimit(MAX_VALIDATE_CONCURRENCY);
 export class LinkValidator {
@@ -45,11 +45,11 @@ export class LinkValidator {
     }
     const controller = new AbortController();
-    const timeoutId = setTimeout(() => controller.abort(), 2000); // 极速超时 2 秒
+    const timeoutId = setTimeout(() => controller.abort(), 3000); // 超时 3 秒
     try {
       const response = await httpClient.head(url, {
-        timeout: 1500, // 极速超时 1.5 秒
+        timeout: 2500, // 超时 2.5 秒
         maxRedirects: 1, // 最多 1 次重定向
         signal: controller.signal,
       });
@@ -68,6 +68,10 @@ export class LinkValidator {
       return { url, valid: false, error: `status=${response.status}` };
     } catch (error) {
       clearTimeout(timeoutId);
+      // 确保 abort controller 被清理
+      if (!controller.signal.aborted) {
+        controller.abort();
+      }
       return { url, valid: false, error: 'timeout' };
     }
   }
@@ -178,12 +182,12 @@ export class LinkValidator {
    */
   async _validateWithGet(url, fetchQuality = false) {
     const controller = new AbortController();
-    const timeoutId = setTimeout(() => controller.abort(), 8000);
+    const timeoutId = setTimeout(() => controller.abort(), 5000);
     try {
       const response = await httpClient.get(url, {
-        timeout: 5000,
-        maxRedirects: 3,
+        timeout: 4000,
+        maxRedirects: 2,
         responseType: 'arraybuffer',
         signal: controller.signal,
         headers: {
@@ -205,7 +209,10 @@ export class LinkValidator {
       return { url, valid: false, error: `GET validation failed: status=${response.status}` };
     } catch (error) {
       clearTimeout(timeoutId);
+      // 确保 abort controller 被清理
+      if (!controller.signal.aborted) {
+        controller.abort();
+      }
       if (error.name === 'AbortError' || error.code === 'ERR_CANCELED') {
         return { url, valid: false, error: 'Request timeout' };
       }