npm - @ontos-ai/knowhere-claw - Versions diffs - 0.2.3 → 0.2.5 - Mend

@ontos-ai/knowhere-claw 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/README.md +5 -5
package/dist/client.js +1 -1
package/dist/config.d.ts +8 -0
package/dist/config.js +56 -8
package/dist/connect-builder.d.ts +2 -0
package/dist/connect-builder.js +9 -10
package/dist/graph-builder.d.ts +4 -1
package/dist/graph-builder.js +21 -34
package/dist/index.js +3 -9
package/dist/kg-service.d.ts +0 -2
package/dist/kg-service.js +12 -45
package/dist/parser.d.ts +4 -8
package/dist/parser.js +25 -243
package/dist/store.d.ts +4 -14
package/dist/store.js +21 -106
package/dist/text.js +1 -13
package/dist/tools.js +413 -848
package/dist/types.d.ts +1 -58
package/openclaw.plugin.json +71 -1
package/package.json +2 -3
package/skills/knowhere_memory/SKILL.md +80 -98
package/skills/knowhere/SKILL.md +0 -285
/package/dist/__tests__/{read-result-file-tool.test.d.ts → storage-layout.test.d.ts} +0 -0

package/dist/types.d.ts CHANGED Viewed

@@ -44,53 +44,6 @@ export interface KnowhereManifest {
     statistics?: KnowhereStatistics;
     files?: StringRecord;
 }
-export interface StoredChunk {
-    chunkId: string;
-    type: "text" | "image" | "table";
-    path: string | null;
-    summary: string;
-    content: string;
-    tokens: number | null;
-    keywords: string[];
-    relationships: unknown[];
-    metadata: StringRecord;
-    assetFilePath: string | null;
-    originalName: string | null;
-    tableType: string | null;
-}
-export interface StoredPathRecord {
-    path: string;
-    parentPath: string | null;
-    depth: number;
-    childPaths: string[];
-    chunkIds: string[];
-    directChunkCount: number;
-    chunkCount: number;
-    textChunkCount: number;
-    imageChunkCount: number;
-    tableChunkCount: number;
-}
-export type StoredResultFileKind = "manifest" | "chunks" | "fullMarkdown" | "kbCsv" | "hierarchy" | "hierarchyView" | "image" | "table" | "other";
-export interface StoredResultFileRecord {
-    relativePath: string;
-    kind: StoredResultFileKind;
-    chunkId: string | null;
-    format: string | null;
-    sizeBytes: number | null;
-}
-export interface StoredBrowseIndex {
-    version: number;
-    paths: StoredPathRecord[];
-    chunkOrder: string[];
-    resultFiles: StoredResultFileRecord[];
-}
-export interface KnowhereParseResult {
-    manifest: KnowhereManifest;
-    chunks: StoredChunk[];
-    fullMarkdown: string;
-    hierarchy: unknown;
-    browseIndex: StoredBrowseIndex;
-}
 export interface KnowhereDownloadedResult {
     zipBytes: Buffer;
     rawZipSha1: string;
@@ -175,6 +128,7 @@ export interface KnowhereScope {
     key: string;
     label: string;
     rootDir: string;
+    metadataDir: string;
     documentsDir: string;
     indexPath: string;
 }
@@ -198,17 +152,6 @@ export interface StoredDocumentRecord {
     chunkCount: number;
     statistics: KnowhereStatistics;
 }
-export interface StoredDocumentPayload {
-    version: number;
-    document: StoredDocumentRecord;
-    manifest: KnowhereManifest;
-    jobResult: KnowhereJobResult;
-    fullMarkdown: string;
-    hierarchy: unknown;
-    browseIndex: StoredBrowseIndex;
-    rawZipSha1: string;
-    chunks: StoredChunk[];
-}
 export interface StoredDocumentMetadata {
     version: number;
     document: StoredDocumentRecord;

package/openclaw.plugin.json CHANGED Viewed

@@ -3,7 +3,7 @@
   "name": "Knowhere",
   "description": "Parse documents with Knowhere and expose the stored result as tool-queryable document state for OpenClaw agents.",
   "skills": ["./skills"],
-  "version": "0.2.3",
+  "version": "0.2.5",
   "uiHints": {
     "apiKey": {
       "label": "Knowhere API Key",
@@ -82,6 +82,76 @@
         "minimum": 1000,
         "maximum": 7200000,
         "default": 600000
+      },
+      "knowledgeGraph": {
+        "type": "object",
+        "additionalProperties": false,
+        "properties": {
+          "enabled": {
+            "type": "boolean",
+            "default": true
+          },
+          "kbId": {
+            "type": "string"
+          },
+          "kbIdSource": {
+            "type": "string",
+            "enum": ["disabled", "agent", "agent-session", "global"],
+            "default": "disabled"
+          },
+          "concurrentBuildStrategy": {
+            "type": "string",
+            "enum": ["queue", "skip"],
+            "default": "queue"
+          },
+          "buildTimeout": {
+            "type": "integer",
+            "minimum": 10000,
+            "maximum": 600000,
+            "default": 300000
+          },
+          "notifyOnGraphFailure": {
+            "type": "boolean",
+            "default": false
+          },
+          "connectConfig": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "minKeywordOverlap": {
+                "type": "integer",
+                "minimum": 1,
+                "default": 3
+              },
+              "keywordScoreWeight": {
+                "type": "number",
+                "minimum": 0,
+                "default": 1
+              },
+              "maxConnectionsPerChunk": {
+                "type": "integer",
+                "minimum": 1,
+                "default": 10
+              },
+              "minScoreThreshold": {
+                "type": "number",
+                "minimum": 0,
+                "maximum": 1,
+                "default": 0.8
+              },
+              "crossFileOnly": {
+                "type": "boolean",
+                "default": true
+              },
+              "maxContentOverlap": {
+                "type": "number",
+                "minimum": 0,
+                "maximum": 1,
+                "default": 0.8
+              }
+            }
+          }
+        }
       }
     }
   }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ontos-ai/knowhere-claw",
-  "version": "0.2.3",
+  "version": "0.2.5",
   "description": "OpenClaw plugin for Knowhere-powered document ingestion and automatic grounding.",
   "files": [
     "dist/",
@@ -40,8 +40,7 @@
   "dependencies": {
     "@knowhere-ai/sdk": "^0.1.1",
     "fflate": "^0.8.2",
-    "fs-extra": "^11.2.0",
-    "nodejieba": "^2.6.0"
+    "fs-extra": "^11.2.0"
   },
   "devDependencies": {
     "@changesets/changelog-github": "^0.6.0",

package/skills/knowhere_memory/SKILL.md CHANGED Viewed

@@ -1,12 +1,12 @@
 ---
 name: knowhere_memory
-description: Auto-discover and search knowledge from Knowhere parsed documents. Use when the user asks questions, needs information, or references their knowledge base.
+description: Auto-discover and search knowledge from Knowhere parsed documents. Use when the user asks questions, needs information, or references their knowledge base. Also handles document ingestion when files are uploaded.
 user-invocable: false
 ---
 # Knowhere Knowledge Memory
-This agent has access to a **personal knowledge base** managed by Knowhere.
+This agent has access to a **personal knowledge base** managed by Knowhere. The knowledge base stores parsed documents as structured JSON files under `~/.knowhere/`.
 ## When to Use
@@ -15,37 +15,85 @@ Activate this skill when:
 - The user asks a question that might be answered by their documents
 - The user says "look it up", "help me find", "knowledge base", "my materials", etc.
 - The user asks "what materials do I have" or wants an overview
+- A file is uploaded or attached (trigger ingestion)
-## Data Location
+## Part 1: Ingesting New Documents
+When a file is uploaded or attached (e.g. via Telegram), the agent should parse it into the knowledge base.
+### Attachment markers
+When a prompt contains a marker like:
+```text
+[media attached: /absolute/path/to/file.pdf (application/pdf) | handbook.pdf]
+```
+Use the exact absolute path as `filePath` and the visible filename as `fileName`.
+### Ingestion workflow
+1. Call `knowhere_ingest_document` with the file path
+2. The plugin handles everything automatically:
+   - Uploads the file to Knowhere API for parsing
+   - Polls until parsing completes
+   - Downloads and extracts the result package
+   - **Automatically** copies parsed data to `~/.knowhere/{kb_id}/`
+   - **Automatically** builds/updates `knowledge_graph.json`
+3. After ingest completes, the new document is immediately searchable via the retrieval workflow below
+Supported formats: PDF, DOCX, XLSX, PPTX, TXT, MD, images (JPG, PNG)
+## Part 2: Retrieving Knowledge
+### Data Location
 All knowledge data lives under `~/.knowhere/{kb_id}/`:
 ```text
 ~/.knowhere/
-└── {kb_id}/                          # e.g. "chengke_kb"
-    ├── knowledge_graph.json          # START HERE — file-level overview + cross-file edges
-    ├── chunk_stats.json              # Hit counts / usage stats per chunk
+└── {kb_id}/                          # e.g. "telegram"
+    ├── knowledge_graph.json          # File-level overview + cross-file edges
+    ├── chunk_stats.json              # Usage stats per chunk
     └── {document_name}/              # One subdir per parsed document
-        ├── chunks.json               # All chunks for this document (the actual content)
+        ├── chunks.json               # All chunks (the actual content)
         ├── hierarchy.json            # Document structure tree
-        ├── images/                   # Extracted images (JPEG/PNG)
-        └── tables/                   # Extracted tables (HTML files)
+        ├── images/                   # Extracted images
+        └── tables/                   # Extracted tables (HTML)
 ```
-## File Schema Reference
+### Strategy: Prefer tools, fall back to files
+#### If `knowhere_kg_list` / `knowhere_kg_query` tools are available → use them
+These tools provide efficient access to the knowledge graph:
-### knowledge_graph.json — Global Navigation (read this first)
+1. `knowhere_kg_list` — list all available knowledge bases
+2. `knowhere_kg_query(kbId)` — returns the full knowledge graph (files, keywords, edges)
+3. Then read individual `chunks.json` files with your file reading tool for detailed content
+#### If no KG tools are available → self-navigate using file tools
+Follow this pattern — do NOT explore the filesystem blindly:
+**Step 0: Resolve kb_id**
+- List only the top level of `~/.knowhere/` to discover available KB IDs
+- If exactly one KB → use it. If multiple → ask the user which one
+**Step 1: Read knowledge_graph.json**
+Read `~/.knowhere/{kb_id}/knowledge_graph.json`:
 ```json
 {
   "version": "2.0",
-  "stats": { "total_files": 5, "total_chunks": 327, "total_cross_file_edges": 3 },
+  "stats": { "total_files": 5, "total_chunks": 327 },
   "files": {
     "report.docx": {
       "chunks_count": 198,
       "types": { "text": 135, "table": 21, "image": 42 },
       "top_keywords": ["excavation", "retaining", "construction"],
-      "top_summary": "",
       "importance": 0.85
     }
   },
@@ -54,114 +102,48 @@ All knowledge data lives under `~/.knowhere/{kb_id}/`:
       "source": "file_A.docx",
       "target": "file_B.pdf",
       "connection_count": 20,
-      "avg_score": 0.95,
       "top_connections": [
-        {
-          "source_chunk": "Chapter 3 Safety Measures",
-          "source_id": "abc123-...",
-          "target_chunk": "Safety Management Policy",
-          "target_id": "def456-...",
-          "relation": "related",
-          "score": 1.0
-        }
+        { "source_chunk": "Chapter 3", "target_chunk": "Safety Policy", "score": 1.0 }
       ]
     }
   ]
 }
 ```
-### chunks.json — Document Content (read per-file, on demand)
+Match the user's query against ALL files' `top_keywords`. Prioritize matching files by `importance`.
+**Step 2: Read chunks.json for each candidate file**
-Located at `~/.knowhere/{kb_id}/{document_name}/chunks.json`.
+Read `~/.knowhere/{kb_id}/{document_name}/chunks.json`:
 ```json
 {
   "chunks": [
     {
-      "chunk_id": "34da946a-5938-578c-...",
-      "type": "text",
-      "path": "Default_Root/report.docx/Chapter 1/1.1",
+      "chunk_id": "uuid",
+      "type": "text | table | image",
+      "path": "Default_Root/doc.pdf/Chapter 1/1.1",
       "content": "actual content...",
       "metadata": {
-        "summary": "LLM-generated summary (may be empty)",
-        "keywords": ["Extracted keywords"],
-        "tokens": ["Jieba tokenization"],
-        "length": 1234,
-        "page_nums": "Source pages (PDF/DOCX)"
+        "summary": "LLM-generated summary",
+        "keywords": ["extracted", "keywords"],
+        "length": 1234
       }
     }
   ]
 }
 ```
-**Content format by chunk type:**
-- `text`: Plain text with embedded markers like `IMAGE_uuid_IMAGE` or `TABLE_uuid_TABLE`
-- `table`: Raw HTML (`<table>...</table>`)
-- `image`: Brief description + `IMAGE_uuid_IMAGE` marker; actual image file in `images/` subdir
-### hierarchy.json — Document Structure
-Three sub-trees:
-- `images/`: all extracted images with descriptive names
-- `tables/`: all extracted tables with header-based names
-- `Default_Root/{filename}/`: section hierarchy (chapters → subsections)
-## Retrieval Workflow
-All operations below are **read-only** — use your file reading tools (e.g. `view_file`, `read_file`) to read JSON files directly. Do NOT use shell commands like `cat` — use native file reading tools that don't require user approval.
-Follow this pattern — do NOT explore the filesystem blindly:
-### Before Step 1: Resolve `kb_id`
-- If the user already specified a KB, use that `kb_id`.
-- Otherwise, inspect only the top level of `~/.knowhere/` to discover available KB IDs.
-- If exactly one KB is available, use it.
-- If multiple KBs are available and the user did not specify one, ask which KB to use.
-- Do not explore beyond the top level of `~/.knowhere/` until `kb_id` is known.
-### Step 1: Read knowledge_graph.json (global navigation)
-Read the file `~/.knowhere/{kb_id}/knowledge_graph.json` using your file reading tool.
-From this you get:
-- **File list** with `top_keywords` → match user's question against ALL files, not just one
-- **importance** → prioritize high-value files when multiple match
-- **edges** → note which matched files connect to other files (you'll need these in Step 3)
-**Important**: Identify ALL candidate files whose `top_keywords` are relevant to the query. Do not stop at the first match.
-### Step 2: Search ALL candidate files' chunks.json
-For EACH candidate file identified in Step 1, read `~/.knowhere/{kb_id}/{document_name}/chunks.json`.
-Search the `chunks` array:
-- Match `metadata.summary` or `content` against the user's query
-- Use `metadata.keywords` for topic matching
-- Use `path` to understand where the chunk sits in the document structure
-- Use `chunk_id` to cross-reference with edge `source_id`/`target_id`
-Collect matching chunks from ALL files, not just the first one that hits.
-### Step 3: Expand via edges (required, not optional)
-After finding matches, ALWAYS check the `edges` array from Step 1 for connections:
+Search each chunk's `content` and `metadata.keywords` for matches to the user's query.
-1. Look at edges involving your matched files
-2. Check `top_connections` — if any `source_chunk`/`target_chunk` names are related to the query topic, the connected file likely has relevant content too
-3. If the connected file wasn't already in your candidate set, read its `chunks.json` and search for related content
-4. Use `source_id`/`target_id` to jump directly to specific related chunks
+**Step 3: Expand via edges (do not skip)**
-**Why this matters**: Documents often split related information across files. Edges reveal these connections.
+Check `edges` from Step 1 for cross-document connections. If related files weren't in your candidate set, read their `chunks.json` too.
 ## Response Guidelines
-- **Multi-source**: Synthesize information from ALL matched files, not just one
-- **Cite sources**: Include document name and chunk path for each piece of information
-- **Show connections**: When edges link matched chunks across files, mention the relationship
-- **Distinguish**: Be transparent about what comes from parsed documents vs general knowledge
-- **Use summaries**: When available, `metadata.summary` gives a quick overview without reading full content
+- **Cite sources**: include document name and section path
+- **Multi-source**: synthesize from ALL matched files, not just the first hit
+- **Show connections**: mention cross-file relationships from edges
+- **No internal IDs**: never expose `chunk_id` or UUID paths to the user
+- **User's language**: reply in the same language the user is using