npm - @hzttt/multimodal-rag - Versions diffs - 0.1.1 - Mend

@hzttt/multimodal-rag 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,208 @@
+# Multimodal RAG Plugin
+OpenClaw 多模态 RAG 插件 — 使用本地 AI 模型对图像和音频进行语义索引与时间感知搜索。
+## 功能特性
+- **图像索引**：使用 Qwen3-VL 自动描述图像内容并生成嵌入向量
+- **音频索引**：使用 Whisper 转录音频并生成嵌入向量
+- **语义搜索**：基于向量相似度的语义检索，支持中英文
+- **时间过滤**：按文件创建时间范围过滤搜索结果
+- **自动监听**：实时监听文件夹变化，自动索引新增文件
+- **向量存储**：使用 LanceDB 高效存储和检索
+- **智能去重**：基于文件 SHA256 哈希去重
+## 前置条件
+- [Ollama](https://ollama.ai) 已安装并运行
+- 以下 Ollama 模型已拉取：
+  - `qwen3-vl:2b` (视觉模型，图像描述)
+  - `qwen3-embedding:latest` (嵌入模型，向量生成)
+```bash
+# 安装模型
+ollama pull qwen3-vl:2b
+ollama pull qwen3-embedding:latest
+```
+## 安装
+### 方式一：从 npm 安装（推荐）
+```bash
+openclaw plugins install @hzttt/multimodal-rag
+```
+插件会自动安装到 `~/.openclaw/extensions/multimodal-rag/`，并自动安装所有运行时依赖。
+### 方式二：从 GitHub 安装
+```bash
+openclaw plugins install github:hzttt/multimodal-rag
+```
+### 方式三：从本地路径安装
+```bash
+git clone https://github.com/hzttt/multimodal-rag.git
+openclaw plugins install ./multimodal-rag
+```
+## 配置
+### 交互式配置（推荐）
+安装完成后，运行引导配置向导：
+```bash
+openclaw multimodal-rag setup
+```
+向导将引导你配置**文件监听路径**，其他参数已使用推荐的默认值：
+- **Ollama 地址**: `http://127.0.0.1:11434`
+- **视觉模型**: `qwen3-vl:2b` (图像描述)
+- **嵌入模型**: `qwen3-embedding:latest` (向量生成)
+- **嵌入提供者**: `ollama` (本地)
+- **数据库路径**: `~/.openclaw/multimodal-rag.lance`
+- **启动时索引**: `true` (自动索引已有文件)
+你只需要指定要监听的文件夹路径即可。
+### 手动配置
+如需自定义配置，编辑 `~/.openclaw/openclaw.json`：
+```json
+{
+  "plugins": {
+    "entries": {
+      "multimodal-rag": {
+        "enabled": true,
+        "config": {
+          "watchPaths": ["~/mic-recordings", "/home/lucy/usb_data"],
+          "ollama": {
+            "baseUrl": "http://127.0.0.1:11434",
+            "visionModel": "qwen3-vl:2b",
+            "embedModel": "qwen3-embedding:latest"
+          },
+          "embedding": {
+            "provider": "ollama"
+          },
+          "dbPath": "/home/lucy/.openclaw/multimodal-rag.lance",
+          "indexExistingOnStart": true
+        }
+      }
+    }
+  }
+}
+```
+### 配置项说明
+| 配置项 | 类型 | 默认值 | 说明 |
+|--------|------|--------|------|
+| `watchPaths` | string[] | `[]` | 监听的文件夹路径（支持 `~` 展开） |
+| `ollama.baseUrl` | string | `http://127.0.0.1:11434` | Ollama 服务地址 |
+| `ollama.visionModel` | string | `qwen3-vl:2b` | 用于图像描述的视觉模型 |
+| `ollama.embedModel` | string | `qwen3-embedding:latest` | 用于生成嵌入向量的模型 |
+| `embedding.provider` | string | `ollama` | 嵌入提供者: `ollama` 或 `openai` |
+| `embedding.openaiApiKey` | string | - | OpenAI API Key（仅 openai 时需要） |
+| `embedding.openaiModel` | string | `text-embedding-3-small` | OpenAI 嵌入模型 |
+| `dbPath` | string | `~/.openclaw/multimodal-rag.lance` | LanceDB 数据库路径 |
+| `watchDebounceMs` | number | `1000` | 文件监听去抖延迟（毫秒） |
+| `indexExistingOnStart` | boolean | `true` | 启动时是否索引已有文件 |
+配置完成后，重启 OpenClaw Gateway 使配置生效。
+## 使用方法
+### Agent 工具
+插件注册 4 个 Agent 工具，可以在对话中自然地调用：
+#### `media_search` — 语义搜索
+```
+用户：上周我去东方明珠拍的照片在哪
+Agent：[调用 media_search] → 找到 3 张匹配的照片 → 发送给用户
+```
+#### `media_describe` — 获取媒体描述
+```
+用户：这个录音说了什么
+Agent：[调用 media_describe(filePath)] → 返回音频转录内容
+```
+#### `media_list` — 浏览媒体文件
+```
+用户：列出最近的照片
+Agent：[调用 media_list(type="image")] → 返回最近索引的图片列表
+```
+#### `media_stats` — 查看库统计
+```
+用户：我的媒体库有多少文件
+Agent：[调用 media_stats] → 总计 120 个文件，图片 80，音频 40
+```
+### CLI 命令
+```bash
+# 交互式配置
+openclaw multimodal-rag setup
+# 手动索引文件或文件夹
+openclaw multimodal-rag index ~/Pictures/photo.jpg
+# 语义搜索
+openclaw multimodal-rag search "东方明珠"
+openclaw multimodal-rag search "会议讨论" --type audio --after 2026-01-29
+# 查看索引统计
+openclaw multimodal-rag stats
+# 列出已索引文件
+openclaw multimodal-rag list --type image --limit 10
+# 完整重新索引
+openclaw multimodal-rag reindex --confirm
+# 清空索引
+openclaw multimodal-rag clear --confirm
+```
+## 故障排除
+### Ollama 连接失败
+```bash
+# 确保 Ollama 已启动
+ollama serve
+# 检查连接
+curl http://127.0.0.1:11434/api/tags
+```
+### 嵌入维度不匹配
+切换嵌入模型后需要重建索引：
+```bash
+openclaw multimodal-rag reindex --confirm
+```
+### 文件监听不生效
+检查路径是否正确，以及插件是否已启用：
+```bash
+openclaw plugins list | grep multimodal-rag
+```
+## 许可证
+MIT

package/index.ts ADDED Viewed

@@ -0,0 +1,321 @@
+/**
+ * OpenClaw Multimodal RAG Plugin
+ *
+ * 多模态 RAG 插件，支持图像和音频的语义索引与时间感知搜索。
+ */
+import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
+import { MediaStorage } from "./src/storage.js";
+import { createEmbeddingProvider } from "./src/embeddings.js";
+import { createMediaProcessor } from "./src/processor.js";
+import { MediaWatcher } from "./src/watcher.js";
+import {
+  createMediaSearchTool,
+  createMediaDescribeTool,
+  createMediaListTool,
+  createMediaStatsTool,
+} from "./src/tools.js";
+import { runSetup } from "./src/setup.js";
+import type { PluginConfig } from "./src/types.js";
+/**
+ * Parses a CLI option value as a base-10 integer.
+ *
+ * CLI option values arrive as strings (the defaults declared below are
+ * strings too), so they must be converted before being used in arithmetic.
+ *
+ * @param raw  - Raw option value as received from the CLI parser.
+ * @param flag - Flag name, used only in the error message.
+ * @param min  - Smallest accepted value (inclusive).
+ * @returns The parsed integer.
+ * @throws Error when the value is not an integer or is below `min`.
+ */
+function parseIntOption(raw: unknown, flag: string, min: number): number {
+  const value = Number.parseInt(String(raw), 10);
+  if (!Number.isInteger(value) || value < min) {
+    throw new Error(`${flag} 必须是不小于 ${min} 的整数: ${String(raw)}`);
+  }
+  return value;
+}
+
+/**
+ * Converts an optional CLI date option into an epoch-millisecond timestamp.
+ *
+ * @param raw  - Raw option value; any falsy value means "not provided".
+ * @param flag - Flag name, used only in the error message.
+ * @returns The timestamp, or `undefined` when the option was not provided.
+ * @throws Error when the value cannot be parsed by `Date`.
+ */
+function parseDateOption(raw: unknown, flag: string): number | undefined {
+  if (!raw) {
+    return undefined;
+  }
+  const timestamp = new Date(String(raw)).getTime();
+  if (Number.isNaN(timestamp)) {
+    // Without this check an unparseable date would reach storage as NaN.
+    throw new Error(`${flag} 不是有效的日期: ${String(raw)}`);
+  }
+  return timestamp;
+}
+
+/**
+ * OpenClaw plugin definition for multimodal (image + audio) RAG.
+ *
+ * `register` wires together the embedding provider, the storage, the media
+ * processor and the file watcher, then exposes them as four agent tools, a
+ * `multimodal-rag` CLI command group and a watcher service.
+ */
+const multimodalRagPlugin = {
+  id: "multimodal-rag",
+  name: "Multimodal RAG",
+  description:
+    "多模态 RAG 插件，支持图像和音频的语义索引与时间感知搜索",
+  kind: "rag" as const,
+  /**
+   * Registers tools, CLI commands and the watcher service on the host API.
+   *
+   * @param api - Plugin API supplied by OpenClaw; this method reads
+   *   `pluginConfig`, `resolvePath` and `logger`, and calls `registerTool`,
+   *   `registerCli` and `registerService`.
+   */
+  register(api: OpenClawPluginApi) {
+    // Resolve configuration by merging user values with defaults.
+    // NOTE: `||` is used throughout, so any falsy user value (empty string,
+    // empty-ish number such as a debounce of 0) falls back to the default.
+    const userConfig = (api.pluginConfig || {}) as Partial<PluginConfig>;
+    const cfg: PluginConfig = {
+      watchPaths: userConfig.watchPaths || [],
+      fileTypes: {
+        image: userConfig.fileTypes?.image || [".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic"],
+        audio: userConfig.fileTypes?.audio || [".wav", ".mp3", ".m4a", ".ogg", ".flac", ".aac"],
+      },
+      ollama: {
+        baseUrl: userConfig.ollama?.baseUrl || "http://127.0.0.1:11434",
+        visionModel: userConfig.ollama?.visionModel || "qwen3-vl:2b",
+        embedModel: userConfig.ollama?.embedModel || "qwen3-embedding:latest",
+      },
+      embedding: {
+        provider: userConfig.embedding?.provider || "ollama",
+        openaiApiKey: userConfig.embedding?.openaiApiKey,
+        openaiModel: userConfig.embedding?.openaiModel || "text-embedding-3-small",
+      },
+      dbPath: userConfig.dbPath || "~/.openclaw/multimodal-rag.lance",
+      watchDebounceMs: userConfig.watchDebounceMs || 1000,
+      // Enabled unless the user explicitly sets it to false.
+      indexExistingOnStart: userConfig.indexExistingOnStart !== false,
+    };
+    // Resolve the database path through the host API.
+    const resolvedDbPath = api.resolvePath(cfg.dbPath);
+    // Create the embedding provider.
+    const embeddings = createEmbeddingProvider({
+      provider: cfg.embedding.provider,
+      ollamaBaseUrl: cfg.ollama.baseUrl,
+      ollamaModel: cfg.ollama.embedModel,
+      openaiApiKey: cfg.embedding.openaiApiKey,
+      openaiModel: cfg.embedding.openaiModel,
+    });
+    const vectorDim = embeddings.getDimension();
+    api.logger.info?.(
+      `multimodal-rag: Using ${cfg.embedding.provider} embeddings (dim=${vectorDim})`,
+    );
+    // Create the storage; it is constructed with the embedding dimension.
+    const storage = new MediaStorage(resolvedDbPath, vectorDim);
+    // Create the media processor.
+    const processor = createMediaProcessor({
+      ollamaBaseUrl: cfg.ollama.baseUrl,
+      visionModel: cfg.ollama.visionModel,
+    });
+    // Create the file watcher.
+    const watcher = new MediaWatcher(cfg, storage, embeddings, processor, api.logger);
+    // ========================================================================
+    // Register agent tools
+    // ========================================================================
+    // 1. Stats tool - lets the agent inspect the state of the media library.
+    api.registerTool(createMediaStatsTool(storage, watcher), {
+      name: "media_stats",
+    });
+    // 2. Search tool - the main content lookup tool.
+    api.registerTool(createMediaSearchTool(storage, embeddings), {
+      name: "media_search",
+    });
+    // 3. List tool - browsing and time-based filtering.
+    api.registerTool(createMediaListTool(storage, cfg), {
+      name: "media_list",
+    });
+    // 4. Describe tool - details for a single file.
+    api.registerTool(createMediaDescribeTool(storage, processor, embeddings, watcher), {
+      name: "media_describe",
+    });
+    api.logger.info?.("multimodal-rag: Registered 4 agent tools");
+    // ========================================================================
+    // Register CLI commands
+    // ========================================================================
+    // NOTE(review): `opts` stays `any` in the actions below because the
+    // storage method signatures are not visible from this file; numeric and
+    // date options are validated explicitly before use instead.
+    api.registerCli(({ program }) => {
+      const rag = program
+        .command("multimodal-rag")
+        .description("Multimodal RAG plugin commands");
+      // openclaw multimodal-rag index <path>
+      rag
+        .command("index")
+        .description("手动索引指定路径的媒体文件")
+        .argument("<path>", "文件或文件夹路径")
+        .action(async (path: string) => {
+          try {
+            await watcher.indexPath(path);
+            console.log(`✓ 索引完成: ${path}`);
+          } catch (error) {
+            console.error(`✗ 索引失败: ${String(error)}`);
+            process.exit(1);
+          }
+        });
+      // openclaw multimodal-rag search <query>
+      rag
+        .command("search")
+        .description("搜索媒体文件")
+        .argument("<query>", "搜索查询")
+        .option("--type <type>", "媒体类型: image, audio, all", "all")
+        .option("--after <date>", "开始时间 (ISO 格式)")
+        .option("--before <date>", "结束时间 (ISO 格式)")
+        .option("--limit <n>", "返回数量", "5")
+        .action(async (query: string, opts: any) => {
+          try {
+            // Validate options before embedding so bad input fails fast.
+            const limit = parseIntOption(opts.limit, "--limit", 1);
+            const afterTs = parseDateOption(opts.after, "--after");
+            const beforeTs = parseDateOption(opts.before, "--before");
+            const vector = await embeddings.embed(query);
+            const results = await storage.search(vector, {
+              type: opts.type,
+              after: afterTs,
+              before: beforeTs,
+              limit,
+              minScore: 0.3,
+            });
+            if (results.length === 0) {
+              console.log("未找到相关媒体文件");
+              return;
+            }
+            console.log(`找到 ${results.length} 个相关媒体文件:\n`);
+            for (const r of results) {
+              const date = new Date(r.entry.fileCreatedAt).toLocaleString("zh-CN");
+              const score = (r.score * 100).toFixed(0);
+              const description = r.entry.description;
+              // Only show the ellipsis when the description was truncated.
+              const ellipsis = description.length > 100 ? "..." : "";
+              console.log(`[${r.entry.fileType}] ${r.entry.fileName} (${score}%)`);
+              console.log(`  路径: ${r.entry.filePath}`);
+              console.log(`  时间: ${date}`);
+              console.log(`  描述: ${description.slice(0, 100)}${ellipsis}\n`);
+            }
+          } catch (error) {
+            console.error(`搜索失败: ${String(error)}`);
+            process.exit(1);
+          }
+        });
+      // openclaw multimodal-rag stats
+      rag
+        .command("stats")
+        .description("显示索引统计")
+        .action(async () => {
+          try {
+            // Use count() for every figure so all numbers share one query
+            // path (original author's note: full scan + in-memory filter).
+            const total = await storage.count();
+            const imageCount = await storage.count("image");
+            const audioCount = await storage.count("audio");
+            console.log("媒体库统计:");
+            console.log(`  总计: ${total} 个文件`);
+            console.log(`  图片: ${imageCount} 个`);
+            console.log(`  音频: ${audioCount} 个`);
+            // Integrity check: the per-type counts should add up to the total.
+            if (total !== imageCount + audioCount) {
+              console.log(`  警告: 总数不匹配 (${total} ≠ ${imageCount} + ${audioCount})`);
+            }
+          } catch (error) {
+            console.error(`统计失败: ${String(error)}`);
+            process.exit(1);
+          }
+        });
+      // openclaw multimodal-rag list
+      rag
+        .command("list")
+        .description("列出已索引的媒体文件")
+        .option("--type <type>", "媒体类型: image, audio, all", "all")
+        .option("--after <date>", "开始时间 (ISO 格式)")
+        .option("--before <date>", "结束时间 (ISO 格式)")
+        .option("--limit <n>", "返回数量", "20")
+        .option("--offset <n>", "偏移量", "0")
+        .action(async (opts: any) => {
+          try {
+            // Convert once: using the raw string in `offset + i + 1` would
+            // concatenate ("0" + 0 + 1 === "001") instead of adding.
+            const limit = parseIntOption(opts.limit, "--limit", 1);
+            const offset = parseIntOption(opts.offset, "--offset", 0);
+            const afterTs = parseDateOption(opts.after, "--after");
+            const beforeTs = parseDateOption(opts.before, "--before");
+            const { total, entries } = await storage.list({
+              type: opts.type,
+              after: afterTs,
+              before: beforeTs,
+              limit,
+              offset,
+            });
+            if (entries.length === 0) {
+              console.log("没有找到符合条件的媒体文件");
+              return;
+            }
+            console.log(`已索引 ${total} 个媒体文件:\n`);
+            for (let i = 0; i < entries.length; i++) {
+              const e = entries[i];
+              const date = new Date(e.fileCreatedAt).toLocaleString("zh-CN");
+              console.log(`${offset + i + 1}. [${e.fileType}] ${e.fileName}`);
+              console.log(`   路径: ${e.filePath}`);
+              console.log(`   时间: ${date}`);
+              console.log(`   描述: ${e.description.slice(0, 80)}${e.description.length > 80 ? "..." : ""}\n`);
+            }
+            // Print the displayed range only when more entries remain.
+            if (total > offset + entries.length) {
+              console.log(`（显示 ${offset + 1}-${offset + entries.length}，共 ${total} 个）`);
+            }
+          } catch (error) {
+            console.error(`列表失败: ${String(error)}`);
+            process.exit(1);
+          }
+        });
+      // openclaw multimodal-rag clear
+      rag
+        .command("clear")
+        .description("清空索引（谨慎使用）")
+        .option("--confirm", "确认清空")
+        .action(async (opts: any) => {
+          // Destructive operation: refuse to run without an explicit --confirm.
+          if (!opts.confirm) {
+            console.error("请使用 --confirm 确认清空操作");
+            process.exit(1);
+          }
+          try {
+            await storage.clear();
+            console.log("✓ 索引已清空");
+          } catch (error) {
+            console.error(`清空失败: ${String(error)}`);
+            process.exit(1);
+          }
+        });
+      // openclaw multimodal-rag reindex
+      rag
+        .command("reindex")
+        .description("完整重新索引（清空数据库并重新扫描所有文件）")
+        .option("--confirm", "确认重新索引")
+        .action(async (opts: any) => {
+          // Destructive operation: refuse to run without an explicit --confirm.
+          if (!opts.confirm) {
+            console.error("请使用 --confirm 确认重新索引操作");
+            console.error("警告: 此操作会清空现有索引并重新扫描所有文件");
+            process.exit(1);
+          }
+          try {
+            console.log("开始完整重新索引...");
+            await watcher.reindexAll();
+            console.log("✓ 重新索引完成");
+            console.log("提示: 使用 'openclaw multimodal-rag stats' 查看进度");
+          } catch (error) {
+            console.error(`重新索引失败: ${String(error)}`);
+            process.exit(1);
+          }
+        });
+      // openclaw multimodal-rag setup
+      rag
+        .command("setup")
+        .description("交互式引导配置插件")
+        .action(async () => {
+          // Same failure handling as the other commands: report the error
+          // and exit non-zero instead of leaving a rejected promise.
+          try {
+            await runSetup();
+          } catch (error) {
+            console.error(`配置失败: ${String(error)}`);
+            process.exit(1);
+          }
+        });
+    }, { commands: ["multimodal-rag"] });
+    // ========================================================================
+    // Register the service (file watching)
+    // ========================================================================
+    api.registerService({
+      id: "multimodal-rag-watcher",
+      start: async () => {
+        await watcher.start();
+        api.logger.info?.("multimodal-rag: File watcher started");
+      },
+      stop: async () => {
+        await watcher.stop();
+        api.logger.info?.("multimodal-rag: File watcher stopped");
+      },
+    });
+    api.logger.info?.(
+      `multimodal-rag: Plugin initialized (db: ${resolvedDbPath})`,
+    );
+  },
+};
+export default multimodalRagPlugin;

package/openclaw.plugin.json ADDED Viewed

@@ -0,0 +1,114 @@
+{
+  "id": "multimodal-rag",
+  "name": "Multimodal RAG",
+  "description": "多模态 RAG 插件，使用本地 AI 模型对图像和音频进行语义索引与时间感知搜索",
+  "version": "0.1.0",
+  "kind": "rag",
+  "configSchema": {
+    "type": "object",
+    "properties": {
+      "watchPaths": {
+        "type": "array",
+        "items": { "type": "string" },
+        "default": [],
+        "description": "监听的文件夹路径列表（支持 ~ 展开）"
+      },
+      "fileTypes": {
+        "type": "object",
+        "properties": {
+          "image": {
+            "type": "array",
+            "items": { "type": "string" },
+            "default": [".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic"]
+          },
+          "audio": {
+            "type": "array",
+            "items": { "type": "string" },
+            "default": [".wav", ".mp3", ".m4a", ".ogg", ".flac", ".aac"]
+          }
+        },
+        "default": {}
+      },
+      "ollama": {
+        "type": "object",
+        "properties": {
+          "baseUrl": {
+            "type": "string",
+            "default": "http://127.0.0.1:11434"
+          },
+          "visionModel": {
+            "type": "string",
+            "default": "qwen3-vl:2b",
+            "description": "用于图像描述的视觉模型"
+          },
+          "embedModel": {
+            "type": "string",
+            "default": "qwen3-embedding:latest",
+            "description": "用于生成嵌入向量的模型"
+          }
+        },
+        "default": {}
+      },
+      "embedding": {
+        "type": "object",
+        "properties": {
+          "provider": {
+            "type": "string",
+            "enum": ["ollama", "openai"],
+            "default": "ollama"
+          },
+          "openaiApiKey": {
+            "type": "string",
+            "description": "OpenAI API Key（仅当 provider=openai 时需要）"
+          },
+          "openaiModel": {
+            "type": "string",
+            "default": "text-embedding-3-small"
+          }
+        },
+        "default": {}
+      },
+      "dbPath": {
+        "type": "string",
+        "default": "~/.openclaw/multimodal-rag.lance",
+        "description": "LanceDB 数据库路径"
+      },
+      "watchDebounceMs": {
+        "type": "number",
+        "default": 1000,
+        "description": "文件监听去抖延迟（毫秒）"
+      },
+      "indexExistingOnStart": {
+        "type": "boolean",
+        "default": true,
+        "description": "启动时是否索引现有文件"
+      }
+    }
+  },
+  "uiHints": {
+    "watchPaths": {
+      "label": "监听路径",
+      "placeholder": "~/mic-recordings"
+    },
+    "ollama.visionModel": {
+      "label": "视觉模型",
+      "placeholder": "qwen3-vl:2b"
+    },
+    "ollama.embedModel": {
+      "label": "嵌入模型",
+      "placeholder": "qwen3-embedding:latest"
+    },
+    "embedding.openaiApiKey": {
+      "label": "OpenAI API Key",
+      "sensitive": true
+    },
+    "dbPath": {
+      "label": "数据库路径",
+      "advanced": true
+    },
+    "watchDebounceMs": {
+      "label": "去抖延迟（毫秒）",
+      "advanced": true
+    }
+  }
+}

package/package.json ADDED Viewed

@@ -0,0 +1,49 @@
+{
+  "name": "@hzttt/multimodal-rag",
+  "version": "0.1.1",
+  "description": "OpenClaw plugin for multimodal RAG - semantic indexing and time-aware search for images and audio using local AI models",
+  "type": "module",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/hzttt/multimodal-rag.git"
+  },
+  "scripts": {
+    "build": "tsc",
+    "test": "node test/test-embedding.js"
+  },
+  "keywords": [
+    "openclaw",
+    "plugin",
+    "rag",
+    "multimodal",
+    "vision",
+    "embedding",
+    "lancedb",
+    "ollama",
+    "whisper"
+  ],
+  "author": "hzttt",
+  "license": "MIT",
+  "files": [
+    "index.ts",
+    "src/",
+    "openclaw.plugin.json",
+    "README.md"
+  ],
+  "dependencies": {
+    "@lancedb/lancedb": "^0.14.0",
+    "@sinclair/typebox": "^0.33.0",
+    "chokidar": "^4.0.0"
+  },
+  "peerDependencies": {
+    "openclaw": "*"
+  },
+  "openclaw": {
+    "extensions": [
+      "./index.ts"
+    ],
+    "install": {
+      "npmSpec": "@hzttt/multimodal-rag"
+    }
+  }
+}