@ontos-ai/knowhere-claw 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/types.d.ts CHANGED
@@ -44,53 +44,6 @@ export interface KnowhereManifest {
44
44
  statistics?: KnowhereStatistics;
45
45
  files?: StringRecord;
46
46
  }
47
- export interface StoredChunk {
48
- chunkId: string;
49
- type: "text" | "image" | "table";
50
- path: string | null;
51
- summary: string;
52
- content: string;
53
- tokens: number | null;
54
- keywords: string[];
55
- relationships: unknown[];
56
- metadata: StringRecord;
57
- assetFilePath: string | null;
58
- originalName: string | null;
59
- tableType: string | null;
60
- }
61
- export interface StoredPathRecord {
62
- path: string;
63
- parentPath: string | null;
64
- depth: number;
65
- childPaths: string[];
66
- chunkIds: string[];
67
- directChunkCount: number;
68
- chunkCount: number;
69
- textChunkCount: number;
70
- imageChunkCount: number;
71
- tableChunkCount: number;
72
- }
73
- export type StoredResultFileKind = "manifest" | "chunks" | "fullMarkdown" | "kbCsv" | "hierarchy" | "hierarchyView" | "image" | "table" | "other";
74
- export interface StoredResultFileRecord {
75
- relativePath: string;
76
- kind: StoredResultFileKind;
77
- chunkId: string | null;
78
- format: string | null;
79
- sizeBytes: number | null;
80
- }
81
- export interface StoredBrowseIndex {
82
- version: number;
83
- paths: StoredPathRecord[];
84
- chunkOrder: string[];
85
- resultFiles: StoredResultFileRecord[];
86
- }
87
- export interface KnowhereParseResult {
88
- manifest: KnowhereManifest;
89
- chunks: StoredChunk[];
90
- fullMarkdown: string;
91
- hierarchy: unknown;
92
- browseIndex: StoredBrowseIndex;
93
- }
94
47
  export interface KnowhereDownloadedResult {
95
48
  zipBytes: Buffer;
96
49
  rawZipSha1: string;
@@ -175,6 +128,7 @@ export interface KnowhereScope {
175
128
  key: string;
176
129
  label: string;
177
130
  rootDir: string;
131
+ metadataDir: string;
178
132
  documentsDir: string;
179
133
  indexPath: string;
180
134
  }
@@ -198,17 +152,6 @@ export interface StoredDocumentRecord {
198
152
  chunkCount: number;
199
153
  statistics: KnowhereStatistics;
200
154
  }
201
- export interface StoredDocumentPayload {
202
- version: number;
203
- document: StoredDocumentRecord;
204
- manifest: KnowhereManifest;
205
- jobResult: KnowhereJobResult;
206
- fullMarkdown: string;
207
- hierarchy: unknown;
208
- browseIndex: StoredBrowseIndex;
209
- rawZipSha1: string;
210
- chunks: StoredChunk[];
211
- }
212
155
  export interface StoredDocumentMetadata {
213
156
  version: number;
214
157
  document: StoredDocumentRecord;
@@ -3,7 +3,7 @@
3
3
  "name": "Knowhere",
4
4
  "description": "Parse documents with Knowhere and expose the stored result as tool-queryable document state for OpenClaw agents.",
5
5
  "skills": ["./skills"],
6
- "version": "0.2.3",
6
+ "version": "0.2.5",
7
7
  "uiHints": {
8
8
  "apiKey": {
9
9
  "label": "Knowhere API Key",
@@ -82,6 +82,76 @@
82
82
  "minimum": 1000,
83
83
  "maximum": 7200000,
84
84
  "default": 600000
85
+ },
86
+ "knowledgeGraph": {
87
+ "type": "object",
88
+ "additionalProperties": false,
89
+ "properties": {
90
+ "enabled": {
91
+ "type": "boolean",
92
+ "default": true
93
+ },
94
+ "kbId": {
95
+ "type": "string"
96
+ },
97
+ "kbIdSource": {
98
+ "type": "string",
99
+ "enum": ["disabled", "agent", "agent-session", "global"],
100
+ "default": "disabled"
101
+ },
102
+ "concurrentBuildStrategy": {
103
+ "type": "string",
104
+ "enum": ["queue", "skip"],
105
+ "default": "queue"
106
+ },
107
+ "buildTimeout": {
108
+ "type": "integer",
109
+ "minimum": 10000,
110
+ "maximum": 600000,
111
+ "default": 300000
112
+ },
113
+ "notifyOnGraphFailure": {
114
+ "type": "boolean",
115
+ "default": false
116
+ },
117
+ "connectConfig": {
118
+ "type": "object",
119
+ "additionalProperties": false,
120
+ "properties": {
121
+ "minKeywordOverlap": {
122
+ "type": "integer",
123
+ "minimum": 1,
124
+ "default": 3
125
+ },
126
+ "keywordScoreWeight": {
127
+ "type": "number",
128
+ "minimum": 0,
129
+ "default": 1
130
+ },
131
+ "maxConnectionsPerChunk": {
132
+ "type": "integer",
133
+ "minimum": 1,
134
+ "default": 10
135
+ },
136
+ "minScoreThreshold": {
137
+ "type": "number",
138
+ "minimum": 0,
139
+ "maximum": 1,
140
+ "default": 0.8
141
+ },
142
+ "crossFileOnly": {
143
+ "type": "boolean",
144
+ "default": true
145
+ },
146
+ "maxContentOverlap": {
147
+ "type": "number",
148
+ "minimum": 0,
149
+ "maximum": 1,
150
+ "default": 0.8
151
+ }
152
+ }
153
+ }
154
+ }
85
155
  }
86
156
  }
87
157
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ontos-ai/knowhere-claw",
3
- "version": "0.2.3",
3
+ "version": "0.2.5",
4
4
  "description": "OpenClaw plugin for Knowhere-powered document ingestion and automatic grounding.",
5
5
  "files": [
6
6
  "dist/",
@@ -40,8 +40,7 @@
40
40
  "dependencies": {
41
41
  "@knowhere-ai/sdk": "^0.1.1",
42
42
  "fflate": "^0.8.2",
43
- "fs-extra": "^11.2.0",
44
- "nodejieba": "^2.6.0"
43
+ "fs-extra": "^11.2.0"
45
44
  },
46
45
  "devDependencies": {
47
46
  "@changesets/changelog-github": "^0.6.0",
@@ -1,12 +1,12 @@
1
1
  ---
2
2
  name: knowhere_memory
3
- description: Auto-discover and search knowledge from Knowhere parsed documents. Use when the user asks questions, needs information, or references their knowledge base.
3
+ description: Auto-discover and search knowledge from Knowhere parsed documents. Use when the user asks questions, needs information, or references their knowledge base. Also handles document ingestion when files are uploaded.
4
4
  user-invocable: false
5
5
  ---
6
6
 
7
7
  # Knowhere Knowledge Memory
8
8
 
9
- This agent has access to a **personal knowledge base** managed by Knowhere.
9
+ This agent has access to a **personal knowledge base** managed by Knowhere. The knowledge base stores parsed documents as structured JSON files under `~/.knowhere/`.
10
10
 
11
11
  ## When to Use
12
12
 
@@ -15,37 +15,85 @@ Activate this skill when:
15
15
  - The user asks a question that might be answered by their documents
16
16
  - The user says "look it up", "help me find", "knowledge base", "my materials", etc.
17
17
  - The user asks "what materials do I have" or wants an overview
18
+ - A file is uploaded or attached (trigger ingestion)
18
19
 
19
- ## Data Location
20
+ ## Part 1: Ingesting New Documents
21
+
22
+ When a file is uploaded or attached (e.g. via Telegram), the agent should parse it into the knowledge base.
23
+
24
+ ### Attachment markers
25
+
26
+ When a prompt contains a marker like:
27
+
28
+ ```text
29
+ [media attached: /absolute/path/to/file.pdf (application/pdf) | handbook.pdf]
30
+ ```
31
+
32
+ Use the exact absolute path as `filePath` and the visible filename as `fileName`.
33
+
34
+ ### Ingestion workflow
35
+
36
+ 1. Call `knowhere_ingest_document` with the file path
37
+ 2. The plugin handles everything automatically:
38
+ - Uploads the file to Knowhere API for parsing
39
+ - Polls until parsing completes
40
+ - Downloads and extracts the result package
41
+ - **Automatically** copies parsed data to `~/.knowhere/{kbId}/`
42
+ - **Automatically** builds/updates `knowledge_graph.json`
43
+ 3. After ingest completes, the new document is immediately searchable via the retrieval workflow below
44
+
45
+ Supported formats: PDF, DOCX, XLSX, PPTX, TXT, MD, images (JPG, PNG)
46
+
47
+ ## Part 2: Retrieving Knowledge
48
+
49
+ ### Data Location
20
50
 
21
51
  All knowledge data lives under `~/.knowhere/{kb_id}/`:
22
52
 
23
53
  ```text
24
54
  ~/.knowhere/
25
- └── {kb_id}/ # e.g. "chengke_kb"
26
- ├── knowledge_graph.json # START HERE — file-level overview + cross-file edges
27
- ├── chunk_stats.json # Hit counts / usage stats per chunk
55
+ └── {kb_id}/ # e.g. "telegram"
56
+ ├── knowledge_graph.json # File-level overview + cross-file edges
57
+ ├── chunk_stats.json # Usage stats per chunk
28
58
  └── {document_name}/ # One subdir per parsed document
29
- ├── chunks.json # All chunks for this document (the actual content)
59
+ ├── chunks.json # All chunks (the actual content)
30
60
  ├── hierarchy.json # Document structure tree
31
- ├── images/ # Extracted images (JPEG/PNG)
32
- └── tables/ # Extracted tables (HTML files)
61
+ ├── images/ # Extracted images
62
+ └── tables/ # Extracted tables (HTML)
33
63
  ```
34
64
 
35
- ## File Schema Reference
65
+ ### Strategy: Prefer tools, fall back to files
66
+
67
+ #### If `knowhere_kg_list` / `knowhere_kg_query` tools are available → use them
68
+
69
+ These tools provide efficient access to the knowledge graph:
36
70
 
37
- ### knowledge_graph.json — Global Navigation (read this first)
71
+ 1. `knowhere_kg_list` — list all available knowledge bases
72
+ 2. `knowhere_kg_query(kbId)` — returns the full knowledge graph (files, keywords, edges)
73
+ 3. Then read individual `chunks.json` files with your file reading tool for detailed content
74
+
75
+ #### If no KG tools are available → self-navigate using file tools
76
+
77
+ Follow this pattern — do NOT explore the filesystem blindly:
78
+
79
+ **Step 0: Resolve kb_id**
80
+
81
+ - List only the top level of `~/.knowhere/` to discover available KB IDs
82
+ - If exactly one KB → use it. If multiple → ask the user which one
83
+
84
+ **Step 1: Read knowledge_graph.json**
85
+
86
+ Read `~/.knowhere/{kb_id}/knowledge_graph.json`:
38
87
 
39
88
  ```json
40
89
  {
41
90
  "version": "2.0",
42
- "stats": { "total_files": 5, "total_chunks": 327, "total_cross_file_edges": 3 },
91
+ "stats": { "total_files": 5, "total_chunks": 327 },
43
92
  "files": {
44
93
  "report.docx": {
45
94
  "chunks_count": 198,
46
95
  "types": { "text": 135, "table": 21, "image": 42 },
47
96
  "top_keywords": ["excavation", "retaining", "construction"],
48
- "top_summary": "",
49
97
  "importance": 0.85
50
98
  }
51
99
  },
@@ -54,114 +102,48 @@ All knowledge data lives under `~/.knowhere/{kb_id}/`:
54
102
  "source": "file_A.docx",
55
103
  "target": "file_B.pdf",
56
104
  "connection_count": 20,
57
- "avg_score": 0.95,
58
105
  "top_connections": [
59
- {
60
- "source_chunk": "Chapter 3 Safety Measures",
61
- "source_id": "abc123-...",
62
- "target_chunk": "Safety Management Policy",
63
- "target_id": "def456-...",
64
- "relation": "related",
65
- "score": 1.0
66
- }
106
+ { "source_chunk": "Chapter 3", "target_chunk": "Safety Policy", "score": 1.0 }
67
107
  ]
68
108
  }
69
109
  ]
70
110
  }
71
111
  ```
72
112
 
73
- ### chunks.json — Document Content (read per-file, on demand)
113
+ Match user query against ALL files' `top_keywords`. Prioritize by `importance`.
114
+
115
+ **Step 2: Read chunks.json for each candidate file**
74
116
 
75
- Located at `~/.knowhere/{kb_id}/{document_name}/chunks.json`.
117
+ Read `~/.knowhere/{kb_id}/{document_name}/chunks.json`:
76
118
 
77
119
  ```json
78
120
  {
79
121
  "chunks": [
80
122
  {
81
- "chunk_id": "34da946a-5938-578c-...",
82
- "type": "text",
83
- "path": "Default_Root/report.docx/Chapter 1/1.1",
123
+ "chunk_id": "uuid",
124
+ "type": "text | table | image",
125
+ "path": "Default_Root/doc.pdf/Chapter 1/1.1",
84
126
  "content": "actual content...",
85
127
  "metadata": {
86
- "summary": "LLM-generated summary (may be empty)",
87
- "keywords": ["Extracted keywords"],
88
- "tokens": ["Jieba tokenization"],
89
- "length": 1234,
90
- "page_nums": "Source pages (PDF/DOCX)"
128
+ "summary": "LLM-generated summary",
129
+ "keywords": ["extracted", "keywords"],
130
+ "length": 1234
91
131
  }
92
132
  }
93
133
  ]
94
134
  }
95
135
  ```
96
136
 
97
- **Content format by chunk type:**
98
-
99
- - `text`: Plain text with embedded markers like `IMAGE_uuid_IMAGE` or `TABLE_uuid_TABLE`
100
- - `table`: Raw HTML (`<table>...</table>`)
101
- - `image`: Brief description + `IMAGE_uuid_IMAGE` marker; actual image file in `images/` subdir
102
-
103
- ### hierarchy.json — Document Structure
104
-
105
- Three sub-trees:
106
-
107
- - `images/`: all extracted images with descriptive names
108
- - `tables/`: all extracted tables with header-based names
109
- - `Default_Root/{filename}/`: section hierarchy (chapters → subsections)
110
-
111
- ## Retrieval Workflow
112
-
113
- All operations below are **read-only** — use your file reading tools (e.g. `view_file`, `read_file`) to read JSON files directly. Do NOT use shell commands like `cat` — use native file reading tools that don't require user approval.
114
-
115
- Follow this pattern — do NOT explore the filesystem blindly:
116
-
117
- ### Before Step 1: Resolve `kb_id`
118
-
119
- - If the user already specified a KB, use that `kb_id`.
120
- - Otherwise, inspect only the top level of `~/.knowhere/` to discover available KB IDs.
121
- - If exactly one KB is available, use it.
122
- - If multiple KBs are available and the user did not specify one, ask which KB to use.
123
- - Do not explore beyond the top level of `~/.knowhere/` until `kb_id` is known.
124
-
125
- ### Step 1: Read knowledge_graph.json (global navigation)
126
-
127
- Read the file `~/.knowhere/{kb_id}/knowledge_graph.json` using your file reading tool.
128
-
129
- From this you get:
130
-
131
- - **File list** with `top_keywords` → match user's question against ALL files, not just one
132
- - **importance** → prioritize high-value files when multiple match
133
- - **edges** → note which matched files connect to other files (you'll need these in Step 3)
134
-
135
- **Important**: Identify ALL candidate files whose `top_keywords` are relevant to the query. Do not stop at the first match.
136
-
137
- ### Step 2: Search ALL candidate files' chunks.json
138
-
139
- For EACH candidate file identified in Step 1, read `~/.knowhere/{kb_id}/{document_name}/chunks.json`.
140
-
141
- Search the `chunks` array:
142
-
143
- - Match `metadata.summary` or `content` against the user's query
144
- - Use `metadata.keywords` for topic matching
145
- - Use `path` to understand where the chunk sits in the document structure
146
- - Use `chunk_id` to cross-reference with edge `source_id`/`target_id`
147
-
148
- Collect matching chunks from ALL files, not just the first one that hits.
149
-
150
- ### Step 3: Expand via edges (required, not optional)
151
-
152
- After finding matches, ALWAYS check the `edges` array from Step 1 for connections:
137
+ Search `content` and `metadata.keywords` against the user's query.
153
138
 
154
- 1. Look at edges involving your matched files
155
- 2. Check `top_connections` — if any `source_chunk`/`target_chunk` names are related to the query topic, the connected file likely has relevant content too
156
- 3. If the connected file wasn't already in your candidate set, read its `chunks.json` and search for related content
157
- 4. Use `source_id`/`target_id` to jump directly to specific related chunks
139
+ **Step 3: Expand via edges (do not skip)**
158
140
 
159
- **Why this matters**: Documents often split related information across files. Edges reveal these connections.
141
+ Check `edges` from Step 1 for cross-document connections. If related files weren't in your candidate set, read their `chunks.json` too.
160
142
 
161
143
  ## Response Guidelines
162
144
 
163
- - **Multi-source**: Synthesize information from ALL matched files, not just one
164
- - **Cite sources**: Include document name and chunk path for each piece of information
165
- - **Show connections**: When edges link matched chunks across files, mention the relationship
166
- - **Distinguish**: Be transparent about what comes from parsed documents vs general knowledge
167
- - **Use summaries**: When available, `metadata.summary` gives a quick overview without reading full content
145
+ - **Cite sources**: include document name and section path
146
+ - **Multi-source**: synthesize from ALL matched files, not just the first hit
147
+ - **Show connections**: mention cross-file relationships from edges
148
+ - **No internal IDs**: never expose `chunk_id` or UUID paths to the user
149
+ - **User's language**: reply in the same language the user is using