@ontos-ai/knowhere-claw 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/connect-builder.d.ts +2 -0
- package/dist/connect-builder.js +9 -10
- package/dist/graph-builder.d.ts +4 -1
- package/dist/graph-builder.js +15 -10
- package/dist/index.js +1 -7
- package/dist/kg-service.js +8 -3
- package/dist/parser.d.ts +4 -8
- package/dist/parser.js +25 -243
- package/dist/store.d.ts +4 -14
- package/dist/store.js +21 -106
- package/dist/text.js +1 -13
- package/dist/tools.js +14 -847
- package/dist/types.d.ts +1 -58
- package/openclaw.plugin.json +71 -1
- package/package.json +1 -1
- package/skills/knowhere_memory/SKILL.md +80 -98
- package/skills/knowhere/SKILL.md +0 -285
- /package/dist/__tests__/{read-result-file-tool.test.d.ts → storage-layout.test.d.ts} +0 -0
package/dist/types.d.ts
CHANGED
|
@@ -44,53 +44,6 @@ export interface KnowhereManifest {
|
|
|
44
44
|
statistics?: KnowhereStatistics;
|
|
45
45
|
files?: StringRecord;
|
|
46
46
|
}
|
|
47
|
-
export interface StoredChunk {
|
|
48
|
-
chunkId: string;
|
|
49
|
-
type: "text" | "image" | "table";
|
|
50
|
-
path: string | null;
|
|
51
|
-
summary: string;
|
|
52
|
-
content: string;
|
|
53
|
-
tokens: number | null;
|
|
54
|
-
keywords: string[];
|
|
55
|
-
relationships: unknown[];
|
|
56
|
-
metadata: StringRecord;
|
|
57
|
-
assetFilePath: string | null;
|
|
58
|
-
originalName: string | null;
|
|
59
|
-
tableType: string | null;
|
|
60
|
-
}
|
|
61
|
-
export interface StoredPathRecord {
|
|
62
|
-
path: string;
|
|
63
|
-
parentPath: string | null;
|
|
64
|
-
depth: number;
|
|
65
|
-
childPaths: string[];
|
|
66
|
-
chunkIds: string[];
|
|
67
|
-
directChunkCount: number;
|
|
68
|
-
chunkCount: number;
|
|
69
|
-
textChunkCount: number;
|
|
70
|
-
imageChunkCount: number;
|
|
71
|
-
tableChunkCount: number;
|
|
72
|
-
}
|
|
73
|
-
export type StoredResultFileKind = "manifest" | "chunks" | "fullMarkdown" | "kbCsv" | "hierarchy" | "hierarchyView" | "image" | "table" | "other";
|
|
74
|
-
export interface StoredResultFileRecord {
|
|
75
|
-
relativePath: string;
|
|
76
|
-
kind: StoredResultFileKind;
|
|
77
|
-
chunkId: string | null;
|
|
78
|
-
format: string | null;
|
|
79
|
-
sizeBytes: number | null;
|
|
80
|
-
}
|
|
81
|
-
export interface StoredBrowseIndex {
|
|
82
|
-
version: number;
|
|
83
|
-
paths: StoredPathRecord[];
|
|
84
|
-
chunkOrder: string[];
|
|
85
|
-
resultFiles: StoredResultFileRecord[];
|
|
86
|
-
}
|
|
87
|
-
export interface KnowhereParseResult {
|
|
88
|
-
manifest: KnowhereManifest;
|
|
89
|
-
chunks: StoredChunk[];
|
|
90
|
-
fullMarkdown: string;
|
|
91
|
-
hierarchy: unknown;
|
|
92
|
-
browseIndex: StoredBrowseIndex;
|
|
93
|
-
}
|
|
94
47
|
export interface KnowhereDownloadedResult {
|
|
95
48
|
zipBytes: Buffer;
|
|
96
49
|
rawZipSha1: string;
|
|
@@ -175,6 +128,7 @@ export interface KnowhereScope {
|
|
|
175
128
|
key: string;
|
|
176
129
|
label: string;
|
|
177
130
|
rootDir: string;
|
|
131
|
+
metadataDir: string;
|
|
178
132
|
documentsDir: string;
|
|
179
133
|
indexPath: string;
|
|
180
134
|
}
|
|
@@ -198,17 +152,6 @@ export interface StoredDocumentRecord {
|
|
|
198
152
|
chunkCount: number;
|
|
199
153
|
statistics: KnowhereStatistics;
|
|
200
154
|
}
|
|
201
|
-
export interface StoredDocumentPayload {
|
|
202
|
-
version: number;
|
|
203
|
-
document: StoredDocumentRecord;
|
|
204
|
-
manifest: KnowhereManifest;
|
|
205
|
-
jobResult: KnowhereJobResult;
|
|
206
|
-
fullMarkdown: string;
|
|
207
|
-
hierarchy: unknown;
|
|
208
|
-
browseIndex: StoredBrowseIndex;
|
|
209
|
-
rawZipSha1: string;
|
|
210
|
-
chunks: StoredChunk[];
|
|
211
|
-
}
|
|
212
155
|
export interface StoredDocumentMetadata {
|
|
213
156
|
version: number;
|
|
214
157
|
document: StoredDocumentRecord;
|
package/openclaw.plugin.json
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
"name": "Knowhere",
|
|
4
4
|
"description": "Parse documents with Knowhere and expose the stored result as tool-queryable document state for OpenClaw agents.",
|
|
5
5
|
"skills": ["./skills"],
|
|
6
|
-
"version": "0.2.3",
|
|
6
|
+
"version": "0.2.4",
|
|
7
7
|
"uiHints": {
|
|
8
8
|
"apiKey": {
|
|
9
9
|
"label": "Knowhere API Key",
|
|
@@ -82,6 +82,76 @@
|
|
|
82
82
|
"minimum": 1000,
|
|
83
83
|
"maximum": 7200000,
|
|
84
84
|
"default": 600000
|
|
85
|
+
},
|
|
86
|
+
"knowledgeGraph": {
|
|
87
|
+
"type": "object",
|
|
88
|
+
"additionalProperties": false,
|
|
89
|
+
"properties": {
|
|
90
|
+
"enabled": {
|
|
91
|
+
"type": "boolean",
|
|
92
|
+
"default": true
|
|
93
|
+
},
|
|
94
|
+
"kbId": {
|
|
95
|
+
"type": "string"
|
|
96
|
+
},
|
|
97
|
+
"kbIdSource": {
|
|
98
|
+
"type": "string",
|
|
99
|
+
"enum": ["disabled", "agent", "agent-session", "global"],
|
|
100
|
+
"default": "disabled"
|
|
101
|
+
},
|
|
102
|
+
"concurrentBuildStrategy": {
|
|
103
|
+
"type": "string",
|
|
104
|
+
"enum": ["queue", "skip"],
|
|
105
|
+
"default": "queue"
|
|
106
|
+
},
|
|
107
|
+
"buildTimeout": {
|
|
108
|
+
"type": "integer",
|
|
109
|
+
"minimum": 10000,
|
|
110
|
+
"maximum": 600000,
|
|
111
|
+
"default": 300000
|
|
112
|
+
},
|
|
113
|
+
"notifyOnGraphFailure": {
|
|
114
|
+
"type": "boolean",
|
|
115
|
+
"default": false
|
|
116
|
+
},
|
|
117
|
+
"connectConfig": {
|
|
118
|
+
"type": "object",
|
|
119
|
+
"additionalProperties": false,
|
|
120
|
+
"properties": {
|
|
121
|
+
"minKeywordOverlap": {
|
|
122
|
+
"type": "integer",
|
|
123
|
+
"minimum": 1,
|
|
124
|
+
"default": 3
|
|
125
|
+
},
|
|
126
|
+
"keywordScoreWeight": {
|
|
127
|
+
"type": "number",
|
|
128
|
+
"minimum": 0,
|
|
129
|
+
"default": 1
|
|
130
|
+
},
|
|
131
|
+
"maxConnectionsPerChunk": {
|
|
132
|
+
"type": "integer",
|
|
133
|
+
"minimum": 1,
|
|
134
|
+
"default": 10
|
|
135
|
+
},
|
|
136
|
+
"minScoreThreshold": {
|
|
137
|
+
"type": "number",
|
|
138
|
+
"minimum": 0,
|
|
139
|
+
"maximum": 1,
|
|
140
|
+
"default": 0.8
|
|
141
|
+
},
|
|
142
|
+
"crossFileOnly": {
|
|
143
|
+
"type": "boolean",
|
|
144
|
+
"default": true
|
|
145
|
+
},
|
|
146
|
+
"maxContentOverlap": {
|
|
147
|
+
"type": "number",
|
|
148
|
+
"minimum": 0,
|
|
149
|
+
"maximum": 1,
|
|
150
|
+
"default": 0.8
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
85
155
|
}
|
|
86
156
|
}
|
|
87
157
|
}
|
package/package.json
CHANGED
|
package/skills/knowhere_memory/SKILL.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: knowhere_memory
|
|
3
|
-
description: Auto-discover and search knowledge from Knowhere parsed documents. Use when the user asks questions, needs information, or references their knowledge base.
|
|
3
|
+
description: Auto-discover and search knowledge from Knowhere parsed documents. Use when the user asks questions, needs information, or references their knowledge base. Also handles document ingestion when files are uploaded.
|
|
4
4
|
user-invocable: false
|
|
5
5
|
---
|
|
6
6
|
|
|
7
7
|
# Knowhere Knowledge Memory
|
|
8
8
|
|
|
9
|
-
This agent has access to a **personal knowledge base** managed by Knowhere.
|
|
9
|
+
This agent has access to a **personal knowledge base** managed by Knowhere. The knowledge base stores parsed documents as structured JSON files under `~/.knowhere/`.
|
|
10
10
|
|
|
11
11
|
## When to Use
|
|
12
12
|
|
|
@@ -15,37 +15,85 @@ Activate this skill when:
|
|
|
15
15
|
- The user asks a question that might be answered by their documents
|
|
16
16
|
- The user says "look it up", "help me find", "knowledge base", "my materials", etc.
|
|
17
17
|
- The user asks "what materials do I have" or wants an overview
|
|
18
|
+
- A file is uploaded or attached (trigger ingestion)
|
|
18
19
|
|
|
19
|
-
##
|
|
20
|
+
## Part 1: Ingesting New Documents
|
|
21
|
+
|
|
22
|
+
When a file is uploaded or attached (e.g. via Telegram), the agent should parse it into the knowledge base.
|
|
23
|
+
|
|
24
|
+
### Attachment markers
|
|
25
|
+
|
|
26
|
+
When a prompt contains a marker like:
|
|
27
|
+
|
|
28
|
+
```text
|
|
29
|
+
[media attached: /absolute/path/to/file.pdf (application/pdf) | handbook.pdf]
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Use the exact absolute path as `filePath` and the visible filename as `fileName`.
|
|
33
|
+
|
|
34
|
+
### Ingestion workflow
|
|
35
|
+
|
|
36
|
+
1. Call `knowhere_ingest_document` with the file path
|
|
37
|
+
2. The plugin handles everything automatically:
|
|
38
|
+
- Uploads the file to Knowhere API for parsing
|
|
39
|
+
- Polls until parsing completes
|
|
40
|
+
- Downloads and extracts the result package
|
|
41
|
+
- **Automatically** copies parsed data to `~/.knowhere/{kbId}/`
|
|
42
|
+
- **Automatically** builds/updates `knowledge_graph.json`
|
|
43
|
+
3. After ingest completes, the new document is immediately searchable via the retrieval workflow below
|
|
44
|
+
|
|
45
|
+
Supported formats: PDF, DOCX, XLSX, PPTX, TXT, MD, images (JPG, PNG)
|
|
46
|
+
|
|
47
|
+
## Part 2: Retrieving Knowledge
|
|
48
|
+
|
|
49
|
+
### Data Location
|
|
20
50
|
|
|
21
51
|
All knowledge data lives under `~/.knowhere/{kb_id}/`:
|
|
22
52
|
|
|
23
53
|
```text
|
|
24
54
|
~/.knowhere/
|
|
25
|
-
└── {kb_id}/ # e.g. "
|
|
26
|
-
├── knowledge_graph.json #
|
|
27
|
-
├── chunk_stats.json #
|
|
55
|
+
└── {kb_id}/ # e.g. "telegram"
|
|
56
|
+
├── knowledge_graph.json # File-level overview + cross-file edges
|
|
57
|
+
├── chunk_stats.json # Usage stats per chunk
|
|
28
58
|
└── {document_name}/ # One subdir per parsed document
|
|
29
|
-
├── chunks.json # All chunks
|
|
59
|
+
├── chunks.json # All chunks (the actual content)
|
|
30
60
|
├── hierarchy.json # Document structure tree
|
|
31
|
-
├── images/ # Extracted images
|
|
32
|
-
└── tables/ # Extracted tables (HTML
|
|
61
|
+
├── images/ # Extracted images
|
|
62
|
+
└── tables/ # Extracted tables (HTML)
|
|
33
63
|
```
|
|
34
64
|
|
|
35
|
-
|
|
65
|
+
### Strategy: Prefer tools, fall back to files
|
|
66
|
+
|
|
67
|
+
#### If `knowhere_kg_list` / `knowhere_kg_query` tools are available → use them
|
|
68
|
+
|
|
69
|
+
These tools provide efficient access to the knowledge graph:
|
|
36
70
|
|
|
37
|
-
|
|
71
|
+
1. `knowhere_kg_list` — list all available knowledge bases
|
|
72
|
+
2. `knowhere_kg_query(kbId)` — returns the full knowledge graph (files, keywords, edges)
|
|
73
|
+
3. Then read individual `chunks.json` files with your file reading tool for detailed content
|
|
74
|
+
|
|
75
|
+
#### If no KG tools are available → self-navigate using file tools
|
|
76
|
+
|
|
77
|
+
Follow this pattern — do NOT explore the filesystem blindly:
|
|
78
|
+
|
|
79
|
+
**Step 0: Resolve kb_id**
|
|
80
|
+
|
|
81
|
+
- List only the top level of `~/.knowhere/` to discover available KB IDs
|
|
82
|
+
- If exactly one KB → use it. If multiple → ask the user which one
|
|
83
|
+
|
|
84
|
+
**Step 1: Read knowledge_graph.json**
|
|
85
|
+
|
|
86
|
+
Read `~/.knowhere/{kb_id}/knowledge_graph.json`:
|
|
38
87
|
|
|
39
88
|
```json
|
|
40
89
|
{
|
|
41
90
|
"version": "2.0",
|
|
42
|
-
"stats": { "total_files": 5, "total_chunks": 327
|
|
91
|
+
"stats": { "total_files": 5, "total_chunks": 327 },
|
|
43
92
|
"files": {
|
|
44
93
|
"report.docx": {
|
|
45
94
|
"chunks_count": 198,
|
|
46
95
|
"types": { "text": 135, "table": 21, "image": 42 },
|
|
47
96
|
"top_keywords": ["excavation", "retaining", "construction"],
|
|
48
|
-
"top_summary": "",
|
|
49
97
|
"importance": 0.85
|
|
50
98
|
}
|
|
51
99
|
},
|
|
@@ -54,114 +102,48 @@ All knowledge data lives under `~/.knowhere/{kb_id}/`:
|
|
|
54
102
|
"source": "file_A.docx",
|
|
55
103
|
"target": "file_B.pdf",
|
|
56
104
|
"connection_count": 20,
|
|
57
|
-
"avg_score": 0.95,
|
|
58
105
|
"top_connections": [
|
|
59
|
-
{
|
|
60
|
-
"source_chunk": "Chapter 3 Safety Measures",
|
|
61
|
-
"source_id": "abc123-...",
|
|
62
|
-
"target_chunk": "Safety Management Policy",
|
|
63
|
-
"target_id": "def456-...",
|
|
64
|
-
"relation": "related",
|
|
65
|
-
"score": 1.0
|
|
66
|
-
}
|
|
106
|
+
{ "source_chunk": "Chapter 3", "target_chunk": "Safety Policy", "score": 1.0 }
|
|
67
107
|
]
|
|
68
108
|
}
|
|
69
109
|
]
|
|
70
110
|
}
|
|
71
111
|
```
|
|
72
112
|
|
|
73
|
-
|
|
113
|
+
Match user query against ALL files' `top_keywords`. Prioritize by `importance`.
|
|
114
|
+
|
|
115
|
+
**Step 2: Read chunks.json for each candidate file**
|
|
74
116
|
|
|
75
|
-
|
|
117
|
+
Read `~/.knowhere/{kb_id}/{document_name}/chunks.json`:
|
|
76
118
|
|
|
77
119
|
```json
|
|
78
120
|
{
|
|
79
121
|
"chunks": [
|
|
80
122
|
{
|
|
81
|
-
"chunk_id": "
|
|
82
|
-
"type": "text",
|
|
83
|
-
"path": "Default_Root/
|
|
123
|
+
"chunk_id": "uuid",
|
|
124
|
+
"type": "text | table | image",
|
|
125
|
+
"path": "Default_Root/doc.pdf/Chapter 1/1.1",
|
|
84
126
|
"content": "actual content...",
|
|
85
127
|
"metadata": {
|
|
86
|
-
"summary": "LLM-generated summary
|
|
87
|
-
"keywords": ["
|
|
88
|
-
"
|
|
89
|
-
"length": 1234,
|
|
90
|
-
"page_nums": "Source pages (PDF/DOCX)"
|
|
128
|
+
"summary": "LLM-generated summary",
|
|
129
|
+
"keywords": ["extracted", "keywords"],
|
|
130
|
+
"length": 1234
|
|
91
131
|
}
|
|
92
132
|
}
|
|
93
133
|
]
|
|
94
134
|
}
|
|
95
135
|
```
|
|
96
136
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
- `text`: Plain text with embedded markers like `IMAGE_uuid_IMAGE` or `TABLE_uuid_TABLE`
|
|
100
|
-
- `table`: Raw HTML (`<table>...</table>`)
|
|
101
|
-
- `image`: Brief description + `IMAGE_uuid_IMAGE` marker; actual image file in `images/` subdir
|
|
102
|
-
|
|
103
|
-
### hierarchy.json — Document Structure
|
|
104
|
-
|
|
105
|
-
Three sub-trees:
|
|
106
|
-
|
|
107
|
-
- `images/`: all extracted images with descriptive names
|
|
108
|
-
- `tables/`: all extracted tables with header-based names
|
|
109
|
-
- `Default_Root/{filename}/`: section hierarchy (chapters → subsections)
|
|
110
|
-
|
|
111
|
-
## Retrieval Workflow
|
|
112
|
-
|
|
113
|
-
All operations below are **read-only** — use your file reading tools (e.g. `view_file`, `read_file`) to read JSON files directly. Do NOT use shell commands like `cat` — use native file reading tools that don't require user approval.
|
|
114
|
-
|
|
115
|
-
Follow this pattern — do NOT explore the filesystem blindly:
|
|
116
|
-
|
|
117
|
-
### Before Step 1: Resolve `kb_id`
|
|
118
|
-
|
|
119
|
-
- If the user already specified a KB, use that `kb_id`.
|
|
120
|
-
- Otherwise, inspect only the top level of `~/.knowhere/` to discover available KB IDs.
|
|
121
|
-
- If exactly one KB is available, use it.
|
|
122
|
-
- If multiple KBs are available and the user did not specify one, ask which KB to use.
|
|
123
|
-
- Do not explore beyond the top level of `~/.knowhere/` until `kb_id` is known.
|
|
124
|
-
|
|
125
|
-
### Step 1: Read knowledge_graph.json (global navigation)
|
|
126
|
-
|
|
127
|
-
Read the file `~/.knowhere/{kb_id}/knowledge_graph.json` using your file reading tool.
|
|
128
|
-
|
|
129
|
-
From this you get:
|
|
130
|
-
|
|
131
|
-
- **File list** with `top_keywords` → match user's question against ALL files, not just one
|
|
132
|
-
- **importance** → prioritize high-value files when multiple match
|
|
133
|
-
- **edges** → note which matched files connect to other files (you'll need these in Step 3)
|
|
134
|
-
|
|
135
|
-
**Important**: Identify ALL candidate files whose `top_keywords` are relevant to the query. Do not stop at the first match.
|
|
136
|
-
|
|
137
|
-
### Step 2: Search ALL candidate files' chunks.json
|
|
138
|
-
|
|
139
|
-
For EACH candidate file identified in Step 1, read `~/.knowhere/{kb_id}/{document_name}/chunks.json`.
|
|
140
|
-
|
|
141
|
-
Search the `chunks` array:
|
|
142
|
-
|
|
143
|
-
- Match `metadata.summary` or `content` against the user's query
|
|
144
|
-
- Use `metadata.keywords` for topic matching
|
|
145
|
-
- Use `path` to understand where the chunk sits in the document structure
|
|
146
|
-
- Use `chunk_id` to cross-reference with edge `source_id`/`target_id`
|
|
147
|
-
|
|
148
|
-
Collect matching chunks from ALL files, not just the first one that hits.
|
|
149
|
-
|
|
150
|
-
### Step 3: Expand via edges (required, not optional)
|
|
151
|
-
|
|
152
|
-
After finding matches, ALWAYS check the `edges` array from Step 1 for connections:
|
|
137
|
+
Search `content` and `metadata.keywords` against the user's query.
|
|
153
138
|
|
|
154
|
-
|
|
155
|
-
2. Check `top_connections` — if any `source_chunk`/`target_chunk` names are related to the query topic, the connected file likely has relevant content too
|
|
156
|
-
3. If the connected file wasn't already in your candidate set, read its `chunks.json` and search for related content
|
|
157
|
-
4. Use `source_id`/`target_id` to jump directly to specific related chunks
|
|
139
|
+
**Step 3: Expand via edges (do not skip)**
|
|
158
140
|
|
|
159
|
-
|
|
141
|
+
Check `edges` from Step 1 for cross-document connections. If related files weren't in your candidate set, read their `chunks.json` too.
|
|
160
142
|
|
|
161
143
|
## Response Guidelines
|
|
162
144
|
|
|
163
|
-
- **
|
|
164
|
-
- **
|
|
165
|
-
- **Show connections**:
|
|
166
|
-
- **
|
|
167
|
-
- **
|
|
145
|
+
- **Cite sources**: include document name and section path
|
|
146
|
+
- **Multi-source**: synthesize from ALL matched files, not just the first hit
|
|
147
|
+
- **Show connections**: mention cross-file relationships from edges
|
|
148
|
+
- **No internal IDs**: never expose `chunk_id` or UUID paths to the user
|
|
149
|
+
- **User's language**: reply in the same language the user is using
|