@ontos-ai/knowhere-claw 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/dist/__tests__/read-chunks-schema-v21.test.d.ts +1 -0
- package/dist/agent-hooks.js +27 -3
- package/dist/index.js +3 -3
- package/dist/kg-service.d.ts +1 -0
- package/dist/kg-service.js +56 -23
- package/dist/tools.d.ts +60 -0
- package/dist/tools.js +204 -96
- package/dist/types.d.ts +13 -2
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/skills/knowhere_memory/SKILL.md +56 -17
package/README.md
CHANGED
|
@@ -122,6 +122,12 @@ Within each scope, the plugin keeps:
|
|
|
122
122
|
4. When needed, the agent can preview structure, search chunks, read raw result
|
|
123
123
|
files, or clear stored documents.
|
|
124
124
|
|
|
125
|
+
## Schema v2.1 Media Handling
|
|
126
|
+
|
|
127
|
+
- `knowhere_read_chunks` now treats `[images/...]` and `[tables/...]` path references in `chunks.json` content as the primary media enrichment path.
|
|
128
|
+
- Standalone `image` and `table` chunks resolve their real asset locations from `metadata.file_path`.
|
|
129
|
+
- Assets without `metadata.file_path` are ignored by the runtime enrichment and delivery pipeline.
|
|
130
|
+
|
|
125
131
|
## Troubleshooting
|
|
126
132
|
|
|
127
133
|
- Missing API key: `apiKey` config is optional. You can set
|
package/dist/__tests__/read-chunks-schema-v21.test.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/agent-hooks.js
CHANGED
|
@@ -10,10 +10,34 @@ const KNOWHERE_PROMPT_CONTEXT = [
|
|
|
10
10
|
"- If the file is in the cloud (e.g. Feishu Drive), first obtain the download URL via the appropriate channel tool, then use the `url` parameter.",
|
|
11
11
|
"- Refer to your **knowhere_memory** skill for the complete step-by-step workflow.",
|
|
12
12
|
"",
|
|
13
|
+
"### ⚠️ Feishu / Lark Cloud Files",
|
|
14
|
+
"**Never** pass a raw `open.feishu.cn` or `feishu.cn/drive/file/...` URL directly to `knowhere_ingest_document`.",
|
|
15
|
+
"These URLs require authentication and will redirect to a login page, causing Knowhere to parse HTML instead of the actual document.",
|
|
16
|
+
"Instead:",
|
|
17
|
+
"1. Use `feishu_drive` (action: `download`) or equivalents to obtain an **authenticated temporary download URL**.",
|
|
18
|
+
"2. Then pass that authenticated URL to `knowhere_ingest_document(url: ...)`.",
|
|
19
|
+
"",
|
|
20
|
+
"### Empty File Rejection",
|
|
21
|
+
"If a parsed result contains 0 usable chunks, it will be **automatically rejected** and not stored.",
|
|
22
|
+
"This typically means the source file was corrupt, empty, or required authentication that was not provided.",
|
|
23
|
+
"",
|
|
13
24
|
"### Knowledge Retrieval",
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
25
|
+
"Use this **single retrieval path** — do not skip steps:",
|
|
26
|
+
"1. `knowhere_get_map` — get the KG overview: which files exist, their keywords, importance, and cross-file edges.",
|
|
27
|
+
"2. `knowhere_get_structure` — inspect the chapter/section hierarchy of a specific document.",
|
|
28
|
+
"3. `knowhere_read_chunks` — fetch content for a specific section (use `sectionPath` to narrow scope).",
|
|
29
|
+
"If you're unsure which file contains the answer, also call `knowhere_discover_files` for keyword-based file discovery.",
|
|
30
|
+
"- ❌ Do NOT use `exec` or shell commands to read files inside `~/.knowhere/`",
|
|
31
|
+
"- ❌ Do NOT skip `knowhere_get_map` and jump directly to `knowhere_read_chunks`",
|
|
32
|
+
"",
|
|
33
|
+
"### 📷 Image Delivery",
|
|
34
|
+
"**`knowhere_read_chunks` has built-in automatic image delivery.** When it returns chunks containing images,",
|
|
35
|
+
"those images are automatically sent to the user's channel (Telegram/Feishu/etc). You do NOT need to send them again.",
|
|
36
|
+
"- The tool result will contain `resolved_assets` with `mode: 'image_sent'` for successfully delivered images.",
|
|
37
|
+
"- If the user asks to **see** or **view** an image from the knowledge base, call `knowhere_read_chunks` with the relevant section — images will be auto-delivered.",
|
|
38
|
+
"- `knowhere_view_image` is for **AI visual analysis only** (it loads image pixels into your context for you to describe/analyze). It does NOT send the image to the user.",
|
|
39
|
+
"- If the user asks you to re-send a specific image, use the `message` tool with the staged file path from `~/.openclaw/knowhere-assets/`.",
|
|
40
|
+
"- **Never tell the user you cannot send images.** You CAN — via `knowhere_read_chunks` auto-delivery or `message` tool."
|
|
17
41
|
].join("\n");
|
|
18
42
|
const KNOWHERE_DIR_PATTERN = ".knowhere";
|
|
19
43
|
const BLOCK_REASON = "Do not use exec to read .knowhere/ directly. Use knowhere retrieval tools instead: knowhere_get_map, knowhere_get_structure, knowhere_read_chunks, knowhere_kg_query.";
|
package/dist/index.js
CHANGED
|
@@ -45,12 +45,12 @@ const plugin = {
|
|
|
45
45
|
"knowhere_get_job_status",
|
|
46
46
|
"knowhere_import_completed_job",
|
|
47
47
|
"knowhere_set_api_key",
|
|
48
|
-
"knowhere_kg_list",
|
|
49
|
-
"knowhere_kg_query",
|
|
50
48
|
"knowhere_get_map",
|
|
51
49
|
"knowhere_get_structure",
|
|
52
50
|
"knowhere_read_chunks",
|
|
53
|
-
"
|
|
51
|
+
"knowhere_view_image",
|
|
52
|
+
"knowhere_discover_files",
|
|
53
|
+
"knowhere_delete_document"
|
|
54
54
|
] });
|
|
55
55
|
}
|
|
56
56
|
};
|
package/dist/kg-service.d.ts
CHANGED
|
@@ -23,6 +23,7 @@ export declare class KnowledgeGraphService {
|
|
|
23
23
|
keywords: string[];
|
|
24
24
|
metadata: Record<string, unknown>;
|
|
25
25
|
}): Promise<void>;
|
|
26
|
+
removeDocumentFromKb(kbId: string, docId: string): Promise<void>;
|
|
26
27
|
scheduleBuild(kbId: string, task: () => Promise<void>): Promise<void>;
|
|
27
28
|
buildKnowledgeGraph(kbId: string): Promise<void>;
|
|
28
29
|
private updateKbMetadata;
|
package/dist/kg-service.js
CHANGED
|
@@ -1,12 +1,20 @@
|
|
|
1
1
|
import { resolveStoredKnowhereResultRoot } from "./parser.js";
|
|
2
2
|
import { buildConnections, init_connect_builder } from "./connect-builder.js";
|
|
3
3
|
import { buildKnowledgeGraph } from "./graph-builder.js";
|
|
4
|
+
import fs from "node:fs/promises";
|
|
4
5
|
import path from "node:path";
|
|
5
6
|
import os from "node:os";
|
|
6
7
|
import { spawn } from "node:child_process";
|
|
7
|
-
import fs from "fs-extra";
|
|
8
|
+
import fs$1 from "fs-extra";
|
|
8
9
|
//#region src/kg-service.ts
|
|
9
10
|
init_connect_builder();
|
|
11
|
+
/**
|
|
12
|
+
* Directories that belong to the Store layer and must be excluded when the KG
|
|
13
|
+
* scans a kb directory for document entries. This matters when the kb
|
|
14
|
+
* directory coincides with the Store root (e.g. both resolve to
|
|
15
|
+
* `~/.knowhere/global/`).
|
|
16
|
+
*/
|
|
17
|
+
const STORE_INFRA_DIRS = new Set(["documents", "metadata"]);
|
|
10
18
|
const DEFAULT_CONNECT_CONFIG = {
|
|
11
19
|
minKeywordOverlap: 3,
|
|
12
20
|
keywordScoreWeight: 1,
|
|
@@ -101,19 +109,43 @@ var KnowledgeGraphService = class {
|
|
|
101
109
|
}
|
|
102
110
|
async ensureKbDirectory(kbId) {
|
|
103
111
|
const kbPath = this.getKbPath(kbId);
|
|
104
|
-
await fs.ensureDir(kbPath);
|
|
112
|
+
await fs$1.ensureDir(kbPath);
|
|
105
113
|
return kbPath;
|
|
106
114
|
}
|
|
107
115
|
async saveDocumentToKb(params) {
|
|
108
116
|
const kbPath = await this.ensureKbDirectory(params.kbId);
|
|
109
|
-
const
|
|
110
|
-
await fs.ensureDir(docDir);
|
|
117
|
+
const linkPath = path.join(kbPath, params.docId);
|
|
111
118
|
const sourceResultRoot = await resolveStoredKnowhereResultRoot(params.sourcePath);
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
119
|
+
try {
|
|
120
|
+
const existingStat = await fs.lstat(linkPath).catch(() => null);
|
|
121
|
+
if (existingStat) {
|
|
122
|
+
if (existingStat.isSymbolicLink()) if (await fs.readlink(linkPath) === sourceResultRoot) this.logger.debug?.(`knowhere: saveDocumentToKb symlink already correct doc=${params.docId}`);
|
|
123
|
+
else {
|
|
124
|
+
await fs.unlink(linkPath);
|
|
125
|
+
await fs.symlink(sourceResultRoot, linkPath);
|
|
126
|
+
}
|
|
127
|
+
else if (existingStat.isDirectory()) {
|
|
128
|
+
await fs$1.remove(linkPath);
|
|
129
|
+
await fs.symlink(sourceResultRoot, linkPath);
|
|
130
|
+
this.logger.info(`knowhere: saveDocumentToKb replaced legacy copy with symlink doc=${params.docId}`);
|
|
131
|
+
}
|
|
132
|
+
} else await fs.symlink(sourceResultRoot, linkPath);
|
|
133
|
+
} catch (symlinkError) {
|
|
134
|
+
this.logger.warn(`knowhere: symlink failed for doc=${params.docId}, falling back to copy: ${formatUnknownError(symlinkError)}`);
|
|
135
|
+
await fs$1.ensureDir(linkPath);
|
|
136
|
+
await fs$1.copy(sourceResultRoot, linkPath, { overwrite: true });
|
|
137
|
+
}
|
|
115
138
|
this.logger.info(`Document saved to knowledge base: kb=${params.kbId} doc=${params.docId}`);
|
|
116
139
|
}
|
|
140
|
+
async removeDocumentFromKb(kbId, docId) {
|
|
141
|
+
const kbPath = this.getKbPath(kbId);
|
|
142
|
+
const docPath = path.join(kbPath, docId);
|
|
143
|
+
const stat = await fs.lstat(docPath).catch(() => null);
|
|
144
|
+
if (!stat) return;
|
|
145
|
+
if (stat.isSymbolicLink()) await fs.unlink(docPath);
|
|
146
|
+
else if (stat.isDirectory()) await fs$1.remove(docPath);
|
|
147
|
+
this.logger.info(`Document removed from knowledge base: kb=${kbId} doc=${docId}`);
|
|
148
|
+
}
|
|
117
149
|
async scheduleBuild(kbId, task) {
|
|
118
150
|
if ((this.config.concurrentBuildStrategy || "queue") === "skip") {
|
|
119
151
|
if (this.buildQueues.has(kbId)) {
|
|
@@ -132,11 +164,12 @@ var KnowledgeGraphService = class {
|
|
|
132
164
|
}
|
|
133
165
|
async buildKnowledgeGraph(kbId) {
|
|
134
166
|
const kbPath = this.getKbPath(kbId);
|
|
135
|
-
const docs = await fs.readdir(kbPath);
|
|
167
|
+
const docs = await fs$1.readdir(kbPath);
|
|
136
168
|
const docDirs = [];
|
|
137
169
|
for (const doc of docs) {
|
|
170
|
+
if (doc.startsWith(".") || STORE_INFRA_DIRS.has(doc)) continue;
|
|
138
171
|
const docPath = path.join(kbPath, doc);
|
|
139
|
-
if ((await fs.stat(docPath))
|
|
172
|
+
if ((await fs$1.stat(docPath).catch(() => null))?.isDirectory() && doc !== "knowledge_graph.json" && doc !== "chunk_stats.json" && doc !== "kb_metadata.json") docDirs.push(doc);
|
|
140
173
|
}
|
|
141
174
|
if (docDirs.length < 1) {
|
|
142
175
|
this.logger.info(`Not enough documents for graph building (need >=2, have ${docDirs.length}), skipping`);
|
|
@@ -147,8 +180,8 @@ var KnowledgeGraphService = class {
|
|
|
147
180
|
const allChunks = [];
|
|
148
181
|
for (const docDir of docDirs) {
|
|
149
182
|
const chunksPath = path.join(kbPath, docDir, "chunks.json");
|
|
150
|
-
if (await fs.pathExists(chunksPath)) {
|
|
151
|
-
const chunksData = await fs.readJSON(chunksPath);
|
|
183
|
+
if (await fs$1.pathExists(chunksPath)) {
|
|
184
|
+
const chunksData = await fs$1.readJSON(chunksPath);
|
|
152
185
|
if (chunksData.chunks && Array.isArray(chunksData.chunks)) allChunks.push(...chunksData.chunks.map((c) => ({
|
|
153
186
|
...c,
|
|
154
187
|
fileKey: docDir
|
|
@@ -164,10 +197,10 @@ var KnowledgeGraphService = class {
|
|
|
164
197
|
this.logger.info(`Built ${connections.length} connections`);
|
|
165
198
|
const chunkStatsPath = path.join(kbPath, "chunk_stats.json");
|
|
166
199
|
let chunkStats = {};
|
|
167
|
-
if (await fs.pathExists(chunkStatsPath)) chunkStats = await fs.readJSON(chunkStatsPath);
|
|
200
|
+
if (await fs$1.pathExists(chunkStatsPath)) chunkStats = await fs$1.readJSON(chunkStatsPath);
|
|
168
201
|
const knowledgeGraph = buildKnowledgeGraph(allChunks, connections, chunkStats, false, this.logger, kbId);
|
|
169
202
|
const graphFile = path.join(kbPath, "knowledge_graph.json");
|
|
170
|
-
await fs.writeJSON(graphFile, knowledgeGraph, { spaces: 2 });
|
|
203
|
+
await fs$1.writeJSON(graphFile, knowledgeGraph, { spaces: 2 });
|
|
171
204
|
this.logger.info(`Knowledge graph saved to ${graphFile}`);
|
|
172
205
|
await this.updateKbMetadata(kbPath, {
|
|
173
206
|
lastUpdated: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -182,34 +215,34 @@ var KnowledgeGraphService = class {
|
|
|
182
215
|
async updateKbMetadata(kbPath, updates) {
|
|
183
216
|
const metadataPath = path.join(kbPath, "kb_metadata.json");
|
|
184
217
|
let metadata = {};
|
|
185
|
-
if (await fs.pathExists(metadataPath)) metadata = await fs.readJSON(metadataPath);
|
|
218
|
+
if (await fs$1.pathExists(metadataPath)) metadata = await fs$1.readJSON(metadataPath);
|
|
186
219
|
const updated = {
|
|
187
220
|
...metadata,
|
|
188
221
|
...updates
|
|
189
222
|
};
|
|
190
|
-
await fs.writeJSON(metadataPath, updated, { spaces: 2 });
|
|
223
|
+
await fs$1.writeJSON(metadataPath, updated, { spaces: 2 });
|
|
191
224
|
}
|
|
192
225
|
async queryGraph(kbId, fileKey) {
|
|
193
226
|
const graphPath = path.join(this.getKbPath(kbId), "knowledge_graph.json");
|
|
194
|
-
if (!await fs.pathExists(graphPath)) return [];
|
|
195
|
-
const graph = await fs.readJSON(graphPath);
|
|
227
|
+
if (!await fs$1.pathExists(graphPath)) return [];
|
|
228
|
+
const graph = await fs$1.readJSON(graphPath);
|
|
196
229
|
if (!fileKey) return graph.edges;
|
|
197
230
|
return graph.edges.filter((edge) => edge.source === fileKey || edge.target === fileKey);
|
|
198
231
|
}
|
|
199
232
|
async getKnowledgeGraph(kbId) {
|
|
200
233
|
const graphPath = path.join(this.getKbPath(kbId), "knowledge_graph.json");
|
|
201
|
-
if (!await fs.pathExists(graphPath)) return null;
|
|
202
|
-
return await fs.readJSON(graphPath);
|
|
234
|
+
if (!await fs$1.pathExists(graphPath)) return null;
|
|
235
|
+
return await fs$1.readJSON(graphPath);
|
|
203
236
|
}
|
|
204
237
|
async listKnowledgeBases() {
|
|
205
238
|
const knowhereRoot = path.join(os.homedir(), ".knowhere");
|
|
206
|
-
if (!await fs.pathExists(knowhereRoot)) return [];
|
|
207
|
-
return (await fs.readdir(knowhereRoot, { withFileTypes: true })).filter((e) => e.isDirectory()).map((e) => e.name);
|
|
239
|
+
if (!await fs$1.pathExists(knowhereRoot)) return [];
|
|
240
|
+
return (await fs$1.readdir(knowhereRoot, { withFileTypes: true })).filter((e) => e.isDirectory()).map((e) => e.name);
|
|
208
241
|
}
|
|
209
242
|
async getKbMetadata(kbId) {
|
|
210
243
|
const metadataPath = path.join(this.getKbPath(kbId), "kb_metadata.json");
|
|
211
|
-
if (!await fs.pathExists(metadataPath)) return null;
|
|
212
|
-
return await fs.readJSON(metadataPath);
|
|
244
|
+
if (!await fs$1.pathExists(metadataPath)) return null;
|
|
245
|
+
return await fs$1.readJSON(metadataPath);
|
|
213
246
|
}
|
|
214
247
|
isEnabled() {
|
|
215
248
|
return this.degradationMode !== "disabled";
|
package/dist/tools.d.ts
CHANGED
|
@@ -2,9 +2,69 @@ import { type AnyAgentTool, type OpenClawPluginApi } from "openclaw/plugin-sdk/c
|
|
|
2
2
|
import { KnowhereStore } from "./store";
|
|
3
3
|
import type { KnowledgeGraphService } from "./kg-service";
|
|
4
4
|
import type { ResolvedKnowhereConfig, ToolRuntimeContext } from "./types";
|
|
5
|
+
interface T2ChunkRelation {
|
|
6
|
+
relation?: string;
|
|
7
|
+
target?: string;
|
|
8
|
+
ref?: string;
|
|
9
|
+
[key: string]: unknown;
|
|
10
|
+
}
|
|
11
|
+
interface T2ChunkMetadata {
|
|
12
|
+
summary?: string;
|
|
13
|
+
keywords?: string[];
|
|
14
|
+
tokens?: string[];
|
|
15
|
+
file_path?: string;
|
|
16
|
+
connect_to?: T2ChunkRelation[];
|
|
17
|
+
[key: string]: unknown;
|
|
18
|
+
}
|
|
19
|
+
interface T2ChunkSlim {
|
|
20
|
+
chunk_id?: string;
|
|
21
|
+
type: string;
|
|
22
|
+
path: string;
|
|
23
|
+
content: string;
|
|
24
|
+
summary: string;
|
|
25
|
+
file_path?: string;
|
|
26
|
+
connect_to?: T2ChunkRelation[];
|
|
27
|
+
metadata?: T2ChunkMetadata;
|
|
28
|
+
}
|
|
29
|
+
interface T2EnrichResult {
|
|
30
|
+
chunks: T2ChunkSlim[];
|
|
31
|
+
/** Image paths that were inlined into text via placeholder replacement (need delivery). */
|
|
32
|
+
inlinedImagePaths: ReadonlySet<string>;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Runtime-only enrichment of chunks returned to the AI:
|
|
36
|
+
* 1. Prefer Schema v2.1 path refs ([images/...], [tables/...]) in text chunks
|
|
37
|
+
* 2. Normalize standalone image/table chunks to file_path-based content
|
|
38
|
+
* 3. Remove standalone table/image chunks that were already inlined into text
|
|
39
|
+
*
|
|
40
|
+
* Does NOT modify chunks.json on disk.
|
|
41
|
+
*/
|
|
42
|
+
declare function t2EnrichChunks(chunks: T2ChunkSlim[], docDir: string): Promise<T2EnrichResult>;
|
|
43
|
+
interface T2ResolvedAsset {
|
|
44
|
+
chunk_id: string;
|
|
45
|
+
type: "image" | "table";
|
|
46
|
+
relative_path: string;
|
|
47
|
+
summary: string;
|
|
48
|
+
mode: "image_sent" | "image_failed" | "table_inline";
|
|
49
|
+
html_content?: string;
|
|
50
|
+
}
|
|
51
|
+
declare function t2ResolveAssets(params: {
|
|
52
|
+
api: OpenClawPluginApi;
|
|
53
|
+
store: KnowhereStore;
|
|
54
|
+
ctx: ToolRuntimeContext;
|
|
55
|
+
docDir: string;
|
|
56
|
+
returnedChunks: T2ChunkSlim[];
|
|
57
|
+
/** Image paths inlined by t2EnrichChunks that still need channel delivery. */
|
|
58
|
+
enrichedImagePaths?: ReadonlySet<string>;
|
|
59
|
+
}): Promise<T2ResolvedAsset[]>;
|
|
5
60
|
export declare function createKnowhereToolFactory(params: {
|
|
6
61
|
api: OpenClawPluginApi;
|
|
7
62
|
config: ResolvedKnowhereConfig;
|
|
8
63
|
store: KnowhereStore;
|
|
9
64
|
kgService: KnowledgeGraphService;
|
|
10
65
|
}): (ctx: ToolRuntimeContext) => AnyAgentTool[];
|
|
66
|
+
export declare const __internal: {
|
|
67
|
+
t2EnrichChunks: typeof t2EnrichChunks;
|
|
68
|
+
t2ResolveAssets: typeof t2ResolveAssets;
|
|
69
|
+
};
|
|
70
|
+
export {};
|
package/dist/tools.js
CHANGED
|
@@ -125,6 +125,11 @@ async function persistIngestedDocument(params) {
|
|
|
125
125
|
jobResult: params.ingestResult.jobResult,
|
|
126
126
|
downloadedResult: params.ingestResult.downloadedResult
|
|
127
127
|
}, { overwrite: params.overwrite });
|
|
128
|
+
if (storedDocument.chunkCount === 0) {
|
|
129
|
+
params.api.logger.warn(`knowhere: rejecting empty document scope=${params.scope.label} docId=${storedDocument.id} title=${JSON.stringify(storedDocument.title)} — chunkCount is 0; removing from store`);
|
|
130
|
+
await params.store.removeDocument(params.scope, storedDocument.id);
|
|
131
|
+
throw new Error(`Parsed result for "${storedDocument.title || storedDocument.id}" contains no usable content (0 chunks). The file may be corrupt, empty, or require authentication to download. For cloud files (e.g. Feishu Drive), make sure to obtain an authenticated download URL first.`);
|
|
132
|
+
}
|
|
128
133
|
params.api.logger.info(`knowhere: knowhere_ingest_document stored document scope=${params.scope.label} jobId=${params.ingestResult.job.job_id} docId=${storedDocument.id}`);
|
|
129
134
|
startKnowledgeGraphBuild({
|
|
130
135
|
api: params.api,
|
|
@@ -959,13 +964,7 @@ async function t2LoadChunks(docDir) {
|
|
|
959
964
|
if (Array.isArray(data)) chunks = data;
|
|
960
965
|
else if (isRecord(data) && Array.isArray(data.chunks)) chunks = data.chunks;
|
|
961
966
|
else continue;
|
|
962
|
-
|
|
963
|
-
type: c.type || "text",
|
|
964
|
-
path: c.path || "",
|
|
965
|
-
content: c.content || "",
|
|
966
|
-
summary: c.metadata?.summary || c.summary || ""
|
|
967
|
-
}));
|
|
968
|
-
return chunks;
|
|
967
|
+
return chunks.map((c) => t2ToSlimChunk(c));
|
|
969
968
|
} catch {
|
|
970
969
|
continue;
|
|
971
970
|
}
|
|
@@ -974,7 +973,68 @@ async function t2LoadChunks(docDir) {
|
|
|
974
973
|
function t2NormalizePath(s) {
|
|
975
974
|
return s.replace(/[\uFF01-\uFF5E]/g, (ch) => String.fromCharCode(ch.charCodeAt(0) - 65248)).replace(/[\s\u3000\u00A0]+/g, "").toLowerCase();
|
|
976
975
|
}
|
|
977
|
-
const
|
|
976
|
+
const PATH_REF_RE = /\[((?:images|tables)\/[^\]\n]+)\]/g;
|
|
977
|
+
function t2ReadConnectTo(value) {
|
|
978
|
+
if (!Array.isArray(value)) return;
|
|
979
|
+
const relations = value.filter(isRecord);
|
|
980
|
+
return relations.length > 0 ? relations : void 0;
|
|
981
|
+
}
|
|
982
|
+
function t2GetChunkFilePath(chunk) {
|
|
983
|
+
if (typeof chunk.file_path === "string" && chunk.file_path) return chunk.file_path;
|
|
984
|
+
if (typeof chunk.metadata?.file_path === "string" && chunk.metadata.file_path) return chunk.metadata.file_path;
|
|
985
|
+
}
|
|
986
|
+
function t2GetChunkAssetPath(chunk) {
|
|
987
|
+
return t2GetChunkFilePath(chunk);
|
|
988
|
+
}
|
|
989
|
+
function t2ToSlimChunk(chunk) {
|
|
990
|
+
const connectTo = t2ReadConnectTo(chunk.metadata?.connect_to);
|
|
991
|
+
const filePath = t2GetChunkFilePath(chunk);
|
|
992
|
+
return {
|
|
993
|
+
chunk_id: chunk.chunk_id || void 0,
|
|
994
|
+
type: chunk.type || "text",
|
|
995
|
+
path: chunk.path || "",
|
|
996
|
+
content: chunk.content || "",
|
|
997
|
+
summary: chunk.metadata?.summary || chunk.summary || "",
|
|
998
|
+
file_path: filePath,
|
|
999
|
+
connect_to: connectTo,
|
|
1000
|
+
metadata: chunk.metadata ? {
|
|
1001
|
+
...chunk.metadata,
|
|
1002
|
+
file_path: filePath,
|
|
1003
|
+
connect_to: connectTo
|
|
1004
|
+
} : void 0
|
|
1005
|
+
};
|
|
1006
|
+
}
|
|
1007
|
+
function t2HydrateChunk(chunk, idToRaw, pathToRaw) {
|
|
1008
|
+
const raw = (chunk.chunk_id ? idToRaw.get(chunk.chunk_id) : void 0) || (chunk.path ? pathToRaw.get(chunk.path) : void 0);
|
|
1009
|
+
if (!raw) return chunk;
|
|
1010
|
+
const rawFilePath = t2GetChunkFilePath(raw);
|
|
1011
|
+
const connectTo = chunk.connect_to || t2ReadConnectTo(raw.metadata?.connect_to);
|
|
1012
|
+
return {
|
|
1013
|
+
...chunk,
|
|
1014
|
+
chunk_id: chunk.chunk_id || raw.chunk_id,
|
|
1015
|
+
file_path: chunk.file_path || rawFilePath,
|
|
1016
|
+
connect_to: connectTo,
|
|
1017
|
+
metadata: {
|
|
1018
|
+
...raw.metadata || {},
|
|
1019
|
+
...chunk.metadata || {},
|
|
1020
|
+
file_path: chunk.file_path || rawFilePath,
|
|
1021
|
+
connect_to: connectTo
|
|
1022
|
+
}
|
|
1023
|
+
};
|
|
1024
|
+
}
|
|
1025
|
+
async function t2ReadTableHtml(docDir, relativePath) {
|
|
1026
|
+
try {
|
|
1027
|
+
return await fs.readFile(path.join(docDir, relativePath), "utf-8");
|
|
1028
|
+
} catch {
|
|
1029
|
+
return null;
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
function t2HasUnresolvedMediaReference(text) {
|
|
1033
|
+
PATH_REF_RE.lastIndex = 0;
|
|
1034
|
+
const hasPathRef = PATH_REF_RE.test(text);
|
|
1035
|
+
PATH_REF_RE.lastIndex = 0;
|
|
1036
|
+
return hasPathRef;
|
|
1037
|
+
}
|
|
978
1038
|
async function t2LoadRawChunks(docDir) {
|
|
979
1039
|
try {
|
|
980
1040
|
const raw = await fs.readFile(path.join(docDir, "chunks.json"), "utf-8");
|
|
@@ -988,110 +1048,93 @@ async function t2LoadRawChunks(docDir) {
|
|
|
988
1048
|
}
|
|
989
1049
|
/**
|
|
990
1050
|
* Runtime-only enrichment of chunks returned to the AI:
|
|
991
|
-
* 1.
|
|
992
|
-
* 2.
|
|
993
|
-
* 3. Remove standalone table chunks that were inlined
|
|
994
|
-
* 4. Strip self-referencing placeholders from image/table chunk content & summary
|
|
1051
|
+
* 1. Prefer Schema v2.1 path refs ([images/...], [tables/...]) in text chunks
|
|
1052
|
+
* 2. Normalize standalone image/table chunks to file_path-based content
|
|
1053
|
+
* 3. Remove standalone table/image chunks that were already inlined into text
|
|
995
1054
|
*
|
|
996
1055
|
* Does NOT modify chunks.json on disk.
|
|
997
1056
|
*/
|
|
998
1057
|
async function t2EnrichChunks(chunks, docDir) {
|
|
999
1058
|
const rawChunks = await t2LoadRawChunks(docDir);
|
|
1000
1059
|
const idToRaw = /* @__PURE__ */ new Map();
|
|
1001
|
-
|
|
1002
|
-
const
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
for (const entry of Array.isArray(files.images) ? files.images : []) if (typeof entry.id === "string" && typeof entry.file_path === "string") manifestPaths.set(entry.id, {
|
|
1008
|
-
type: "image",
|
|
1009
|
-
filePath: entry.file_path
|
|
1010
|
-
});
|
|
1011
|
-
for (const entry of Array.isArray(files.tables) ? files.tables : []) if (typeof entry.id === "string" && typeof entry.file_path === "string") manifestPaths.set(entry.id, {
|
|
1012
|
-
type: "table",
|
|
1013
|
-
filePath: entry.file_path
|
|
1014
|
-
});
|
|
1015
|
-
}
|
|
1016
|
-
} catch {}
|
|
1060
|
+
const pathToRaw = /* @__PURE__ */ new Map();
|
|
1061
|
+
for (const rc of rawChunks) {
|
|
1062
|
+
if (rc.chunk_id) idToRaw.set(rc.chunk_id, rc);
|
|
1063
|
+
if (rc.path) pathToRaw.set(rc.path, rc);
|
|
1064
|
+
}
|
|
1065
|
+
chunks = chunks.map((chunk) => t2HydrateChunk(chunk, idToRaw, pathToRaw));
|
|
1017
1066
|
const inlinedTablePaths = /* @__PURE__ */ new Set();
|
|
1018
1067
|
const inlinedImagePaths = /* @__PURE__ */ new Set();
|
|
1019
1068
|
for (const chunk of chunks) {
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
chunk.
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
chunk.content = chunk.content.replace(PLACEHOLDER_RE, `[📊 ${chunk.path}]`);
|
|
1032
|
-
}
|
|
1033
|
-
else if (chunk.type === "image") chunk.content = chunk.content.replace(PLACEHOLDER_RE, `[📷 ${chunk.path}]`);
|
|
1034
|
-
}
|
|
1069
|
+
const relativePath = t2GetChunkAssetPath(chunk);
|
|
1070
|
+
if (!relativePath) continue;
|
|
1071
|
+
chunk.file_path = relativePath;
|
|
1072
|
+
chunk.metadata = {
|
|
1073
|
+
...chunk.metadata || {},
|
|
1074
|
+
file_path: relativePath,
|
|
1075
|
+
connect_to: chunk.connect_to
|
|
1076
|
+
};
|
|
1077
|
+
if (chunk.type === "image") {
|
|
1078
|
+
chunk.content = `[📷 ${relativePath}]`;
|
|
1079
|
+
continue;
|
|
1035
1080
|
}
|
|
1036
|
-
if (chunk.
|
|
1037
|
-
|
|
1038
|
-
if (
|
|
1039
|
-
|
|
1040
|
-
|
|
1081
|
+
if (chunk.type === "table") {
|
|
1082
|
+
const html = await t2ReadTableHtml(docDir, relativePath);
|
|
1083
|
+
if (html) chunk.content = html.slice(0, 8e3);
|
|
1084
|
+
else if (!chunk.content) chunk.content = `[📊 ${relativePath}]`;
|
|
1085
|
+
}
|
|
1086
|
+
}
|
|
1087
|
+
for (const chunk of chunks) {
|
|
1088
|
+
const relativePath = t2GetChunkAssetPath(chunk);
|
|
1089
|
+
if (chunk.content) {
|
|
1090
|
+
if (chunk.type === "text") chunk.content = await replacePathReferences(chunk.content, docDir, inlinedTablePaths, inlinedImagePaths);
|
|
1091
|
+
if (chunk.type !== "text" && relativePath && t2HasUnresolvedMediaReference(chunk.content)) {
|
|
1092
|
+
if (chunk.type === "table") {
|
|
1093
|
+
const html = await t2ReadTableHtml(docDir, relativePath);
|
|
1094
|
+
chunk.content = html ? html.slice(0, 8e3) : `[📊 ${relativePath}]`;
|
|
1095
|
+
} else if (chunk.type === "image") chunk.content = `[📷 ${relativePath}]`;
|
|
1041
1096
|
}
|
|
1042
1097
|
}
|
|
1098
|
+
if (chunk.summary) chunk.summary = await replacePathReferences(chunk.summary, docDir);
|
|
1043
1099
|
}
|
|
1044
1100
|
chunks = chunks.filter((c) => {
|
|
1045
|
-
|
|
1046
|
-
if (c.type === "
|
|
1101
|
+
const relativePath = t2GetChunkAssetPath(c) || "";
|
|
1102
|
+
if (c.type === "table" && relativePath && inlinedTablePaths.has(relativePath)) return false;
|
|
1103
|
+
if (c.type === "image" && relativePath && inlinedImagePaths.has(relativePath)) return false;
|
|
1047
1104
|
return true;
|
|
1048
1105
|
});
|
|
1049
|
-
return
|
|
1106
|
+
return {
|
|
1107
|
+
chunks,
|
|
1108
|
+
inlinedImagePaths
|
|
1109
|
+
};
|
|
1050
1110
|
}
|
|
1051
|
-
async function
|
|
1111
|
+
async function replacePathReferences(text, docDir, inlinedTablePaths, inlinedImagePaths) {
|
|
1052
1112
|
const matches = [];
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
while ((
|
|
1056
|
-
full:
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
end: m.index + m[0].length
|
|
1113
|
+
let match;
|
|
1114
|
+
PATH_REF_RE.lastIndex = 0;
|
|
1115
|
+
while ((match = PATH_REF_RE.exec(text)) !== null) matches.push({
|
|
1116
|
+
full: match[0],
|
|
1117
|
+
relativePath: match[1],
|
|
1118
|
+
start: match.index,
|
|
1119
|
+
end: match.index + match[0].length
|
|
1061
1120
|
});
|
|
1121
|
+
PATH_REF_RE.lastIndex = 0;
|
|
1062
1122
|
if (matches.length === 0) return text;
|
|
1063
1123
|
const replacements = [];
|
|
1064
|
-
for (const
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
const mEntry = manifestPaths.get(match.id);
|
|
1069
|
-
if (mEntry) resolvedPath = mEntry.filePath;
|
|
1070
|
-
}
|
|
1071
|
-
if (!resolvedPath) {
|
|
1072
|
-
replacements.push(match.full);
|
|
1124
|
+
for (const ref of matches) {
|
|
1125
|
+
if (ref.relativePath.startsWith("images/")) {
|
|
1126
|
+
replacements.push(`[📷 ${ref.relativePath}]`);
|
|
1127
|
+
inlinedImagePaths?.add(ref.relativePath);
|
|
1073
1128
|
continue;
|
|
1074
1129
|
}
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
try {
|
|
1081
|
-
const html = await fs.readFile(htmlPath, "utf-8");
|
|
1082
|
-
replacements.push(`\n${html.slice(0, 8e3)}\n`);
|
|
1083
|
-
inlinedTablePaths?.add(resolvedPath);
|
|
1084
|
-
} catch {
|
|
1085
|
-
const tableContent = raw?.content || "";
|
|
1086
|
-
if (tableContent && tableContent.includes("<")) {
|
|
1087
|
-
replacements.push(`\n${tableContent}\n`);
|
|
1088
|
-
inlinedTablePaths?.add(resolvedPath);
|
|
1089
|
-
} else replacements.push(`[📊 ${resolvedPath}]`);
|
|
1090
|
-
}
|
|
1091
|
-
}
|
|
1130
|
+
const html = await t2ReadTableHtml(docDir, ref.relativePath);
|
|
1131
|
+
if (html) {
|
|
1132
|
+
replacements.push(`\n${html.slice(0, 8e3)}\n`);
|
|
1133
|
+
inlinedTablePaths?.add(ref.relativePath);
|
|
1134
|
+
} else replacements.push(`[📊 ${ref.relativePath}]`);
|
|
1092
1135
|
}
|
|
1093
1136
|
let result = text;
|
|
1094
|
-
for (let
|
|
1137
|
+
for (let index = matches.length - 1; index >= 0; index -= 1) result = result.slice(0, matches[index].start) + replacements[index] + result.slice(matches[index].end);
|
|
1095
1138
|
return result;
|
|
1096
1139
|
}
|
|
1097
1140
|
function t2ComputeTfIdfKeywords(rawChunks, topK = 10) {
|
|
@@ -1213,13 +1256,25 @@ async function t2ResolveAssets(params) {
|
|
|
1213
1256
|
params.api.logger.debug?.(`knowhere: t2ResolveAssets image delivery failed: ${absolutePath} — ${err instanceof Error ? err.message : String(err)}`);
|
|
1214
1257
|
}
|
|
1215
1258
|
};
|
|
1216
|
-
for (const chunk of params.returnedChunks)
|
|
1217
|
-
|
|
1259
|
+
for (const chunk of params.returnedChunks) {
|
|
1260
|
+
const relativePath = t2GetChunkAssetPath(chunk);
|
|
1261
|
+
if ((chunk.type === "image" || chunk.type === "table") && relativePath) {
|
|
1262
|
+
if (chunk.type === "table" && chunk.content && !t2HasUnresolvedMediaReference(chunk.content)) continue;
|
|
1263
|
+
await resolveOne({
|
|
1264
|
+
chunkId: chunk.chunk_id || relativePath,
|
|
1265
|
+
type: chunk.type,
|
|
1266
|
+
relativePath,
|
|
1267
|
+
summary: chunk.summary || chunk.content?.slice(0, 200) || ""
|
|
1268
|
+
});
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1271
|
+
if (params.enrichedImagePaths && params.enrichedImagePaths.size > 0) for (const relativePath of params.enrichedImagePaths) {
|
|
1272
|
+
if (processedPaths.has(path.join(params.docDir, relativePath))) continue;
|
|
1218
1273
|
await resolveOne({
|
|
1219
|
-
chunkId:
|
|
1220
|
-
type:
|
|
1221
|
-
relativePath
|
|
1222
|
-
summary:
|
|
1274
|
+
chunkId: relativePath,
|
|
1275
|
+
type: "image",
|
|
1276
|
+
relativePath,
|
|
1277
|
+
summary: path.basename(relativePath)
|
|
1223
1278
|
});
|
|
1224
1279
|
}
|
|
1225
1280
|
return assets;
|
|
@@ -1436,7 +1491,8 @@ function createReadChunksTool(_params) {
|
|
|
1436
1491
|
await fs.writeFile(kgPath, JSON.stringify(g, null, 2), "utf-8");
|
|
1437
1492
|
}
|
|
1438
1493
|
} catch {}
|
|
1439
|
-
|
|
1494
|
+
const enrichResult = await t2EnrichChunks(chunks, docDir);
|
|
1495
|
+
chunks = enrichResult.chunks;
|
|
1440
1496
|
let resolvedAssets = [];
|
|
1441
1497
|
try {
|
|
1442
1498
|
resolvedAssets = await t2ResolveAssets({
|
|
@@ -1444,7 +1500,8 @@ function createReadChunksTool(_params) {
|
|
|
1444
1500
|
store: _params.store,
|
|
1445
1501
|
ctx: _params.ctx,
|
|
1446
1502
|
docDir,
|
|
1447
|
-
returnedChunks: chunks
|
|
1503
|
+
returnedChunks: chunks,
|
|
1504
|
+
enrichedImagePaths: enrichResult.inlinedImagePaths
|
|
1448
1505
|
});
|
|
1449
1506
|
} catch (err) {
|
|
1450
1507
|
_params.api.logger.debug?.(`knowhere: read_chunks asset resolution failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
@@ -1602,6 +1659,51 @@ function createDiscoverFilesTool(_params) {
|
|
|
1602
1659
|
}
|
|
1603
1660
|
};
|
|
1604
1661
|
}
|
|
1662
|
+
function createDeleteDocumentTool(params) {
|
|
1663
|
+
return {
|
|
1664
|
+
name: "knowhere_delete_document",
|
|
1665
|
+
label: "Knowhere Delete Document",
|
|
1666
|
+
description: "Delete a parsed document from the underlying storage and remove it from the Knowledge Graph mapping. Use this to completely remove a document or file from the user's knowledge base. You must provide the exact docId obtained from knowhere_kg_query or the ingest result.",
|
|
1667
|
+
parameters: {
|
|
1668
|
+
type: "object",
|
|
1669
|
+
additionalProperties: false,
|
|
1670
|
+
properties: { docId: {
|
|
1671
|
+
type: "string",
|
|
1672
|
+
description: "The targeted document ID to delete."
|
|
1673
|
+
} },
|
|
1674
|
+
required: ["docId"]
|
|
1675
|
+
},
|
|
1676
|
+
execute: async (_toolCallId, rawParams) => {
|
|
1677
|
+
const docId = readString((isRecord(rawParams) ? rawParams : {}).docId);
|
|
1678
|
+
if (!docId) throw new Error("docId is required.");
|
|
1679
|
+
const scope = params.store.resolveScope(params.ctx);
|
|
1680
|
+
const kbId = params.kgService.resolveKbId(params.ctx);
|
|
1681
|
+
let wasRemovedFromStore = false;
|
|
1682
|
+
try {
|
|
1683
|
+
if (await params.store.removeDocument(scope, docId)) {
|
|
1684
|
+
wasRemovedFromStore = true;
|
|
1685
|
+
params.api.logger.info(`knowhere: document ${docId} removed from store`);
|
|
1686
|
+
}
|
|
1687
|
+
} catch (error) {
|
|
1688
|
+
params.api.logger.warn(`knowhere: store.removeDocument failed for ${docId}: ${formatErrorMessage(error)}`);
|
|
1689
|
+
}
|
|
1690
|
+
let wasRemovedFromKg = false;
|
|
1691
|
+
if (kbId) try {
|
|
1692
|
+
await params.kgService.removeDocumentFromKb(kbId, docId);
|
|
1693
|
+
wasRemovedFromKg = true;
|
|
1694
|
+
params.kgService.scheduleBuild(kbId, async () => {
|
|
1695
|
+
await params.kgService.buildKnowledgeGraph(kbId);
|
|
1696
|
+
}).catch((e) => {
|
|
1697
|
+
params.api.logger.warn(`knowhere: rebuild failed after doc removal: ${formatErrorMessage(e)}`);
|
|
1698
|
+
});
|
|
1699
|
+
} catch (error) {
|
|
1700
|
+
params.api.logger.warn(`knowhere: kgService.removeDocumentFromKb failed for ${docId}: ${formatErrorMessage(error)}`);
|
|
1701
|
+
}
|
|
1702
|
+
if (wasRemovedFromStore || wasRemovedFromKg) return textResult(`Success: The document "${docId}" has been deleted from the knowledge base.\nThe Knowledge Graph is being rebuilt in the background.`);
|
|
1703
|
+
else return textResult(`Failed: The document "${docId}" could not be found or removed.`);
|
|
1704
|
+
}
|
|
1705
|
+
};
|
|
1706
|
+
}
|
|
1605
1707
|
function createKnowhereToolFactory(params) {
|
|
1606
1708
|
return (ctx) => [
|
|
1607
1709
|
createIngestTool({
|
|
@@ -1652,7 +1754,13 @@ function createKnowhereToolFactory(params) {
|
|
|
1652
1754
|
ctx
|
|
1653
1755
|
}),
|
|
1654
1756
|
createViewImageTool({ api: params.api }),
|
|
1655
|
-
createDiscoverFilesTool({ api: params.api })
|
|
1757
|
+
createDiscoverFilesTool({ api: params.api }),
|
|
1758
|
+
createDeleteDocumentTool({
|
|
1759
|
+
api: params.api,
|
|
1760
|
+
store: params.store,
|
|
1761
|
+
kgService: params.kgService,
|
|
1762
|
+
ctx
|
|
1763
|
+
})
|
|
1656
1764
|
];
|
|
1657
1765
|
}
|
|
1658
1766
|
//#endregion
|
package/dist/types.d.ts
CHANGED
|
@@ -232,7 +232,8 @@ export interface FileEdge {
|
|
|
232
232
|
}>;
|
|
233
233
|
}
|
|
234
234
|
/**
|
|
235
|
-
* File metadata in knowledge graph (matches
|
|
235
|
+
* File metadata in knowledge graph (v2.0 schema — matches graph-builder.ts output).
|
|
236
|
+
* `hit_count` and `last_hit` are maintained at runtime by `knowhere_read_chunks`.
|
|
236
237
|
*/
|
|
237
238
|
export interface FileMetadata {
|
|
238
239
|
chunks_count: number;
|
|
@@ -240,12 +241,22 @@ export interface FileMetadata {
|
|
|
240
241
|
top_keywords: string[];
|
|
241
242
|
top_summary: string;
|
|
242
243
|
importance: number;
|
|
244
|
+
/** ISO timestamp of when this file entry was first created in the graph. */
|
|
245
|
+
created_at: string;
|
|
246
|
+
/** Number of times chunks from this file have been read via knowhere_read_chunks. */
|
|
247
|
+
hit_count?: number;
|
|
248
|
+
/** ISO timestamp of the last knowhere_read_chunks access for this file. */
|
|
249
|
+
last_hit?: string;
|
|
243
250
|
}
|
|
244
251
|
/**
|
|
245
|
-
* Knowledge graph structure (matches
|
|
252
|
+
* Knowledge graph structure (v2.0 schema — matches graph-builder.ts output).
|
|
246
253
|
*/
|
|
247
254
|
export interface KnowledgeGraph {
|
|
248
255
|
version: string;
|
|
256
|
+
/** ISO timestamp of the last graph build or partial update. */
|
|
257
|
+
updated_at: string;
|
|
258
|
+
/** Knowledge base ID this graph belongs to. */
|
|
259
|
+
kb_id: string;
|
|
249
260
|
stats: {
|
|
250
261
|
total_files: number;
|
|
251
262
|
total_chunks: number;
|
package/openclaw.plugin.json
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
"name": "Knowhere",
|
|
4
4
|
"description": "Parse documents with Knowhere and expose the stored result as tool-queryable document state for OpenClaw agents.",
|
|
5
5
|
"skills": ["./skills"],
|
|
6
|
-
"version": "0.2.7",
|
|
6
|
+
"version": "0.2.9",
|
|
7
7
|
"uiHints": {
|
|
8
8
|
"apiKey": {
|
|
9
9
|
"label": "Knowhere API Key",
|
package/package.json
CHANGED
|
@@ -55,8 +55,10 @@ The plugin handles everything automatically:
|
|
|
55
55
|
- Uploads/fetches the file for parsing
|
|
56
56
|
- Polls until parsing completes
|
|
57
57
|
- Downloads and extracts the result package
|
|
58
|
-
-
|
|
58
|
+
- Stores parsed data under `~/.knowhere/global/documents/{docId}/`
|
|
59
|
+
- Creates a symlink in `~/.knowhere/{kbId}/{docId}` → the stored document
|
|
59
60
|
- Builds/updates `knowledge_graph.json`
|
|
61
|
+
- **Rejects** files that parse to 0 chunks (empty, corrupt, or auth-gated)
|
|
60
62
|
|
|
61
63
|
After ingest completes, the new document is immediately searchable via the retrieval workflow below.
|
|
62
64
|
|
|
@@ -70,27 +72,36 @@ All knowledge data lives under `~/.knowhere/{kb_id}/`:
|
|
|
70
72
|
|
|
71
73
|
```text
|
|
72
74
|
~/.knowhere/
|
|
73
|
-
|
|
75
|
+
├── global/ # Store: document storage (scopeMode=global)
|
|
76
|
+
│ ├── index.json # Store document index
|
|
77
|
+
│ ├── documents/
|
|
78
|
+
│ │ └── {docId}/ # One subdir per parsed document
|
|
79
|
+
│ │ ├── chunks.json # All chunks (the actual content)
|
|
80
|
+
│ │ ├── hierarchy.json # Document structure tree
|
|
81
|
+
│ │ ├── images/ # Extracted images
|
|
82
|
+
│ │ └── tables/ # Extracted tables (HTML)
|
|
83
|
+
│ └── metadata/
|
|
84
|
+
│ └── {docId}.json # Document metadata
|
|
85
|
+
└── {kb_id}/ # KG: knowledge graph layer
|
|
74
86
|
├── knowledge_graph.json # File-level overview + cross-file edges
|
|
87
|
+
├── kb_metadata.json # KG metadata
|
|
75
88
|
├── chunk_stats.json # Usage stats per chunk
|
|
76
|
-
└── {
|
|
77
|
-
├── chunks.json # All chunks (the actual content)
|
|
78
|
-
├── hierarchy.json # Document structure tree
|
|
79
|
-
├── images/ # Extracted images
|
|
80
|
-
└── tables/ # Extracted tables (HTML)
|
|
89
|
+
└── {docId} → ../global/documents/{docId} # Symlink to Store
|
|
81
90
|
```
|
|
82
91
|
|
|
83
|
-
### Strategy:
|
|
92
|
+
### Strategy: Use the Tier-2 retrieval tools
|
|
84
93
|
|
|
85
|
-
|
|
94
|
+
The canonical retrieval path is **always** the Tier-2 tool chain — do not skip steps:
|
|
86
95
|
|
|
87
|
-
|
|
96
|
+
1. `knowhere_get_map` — get the full KG overview: which files exist, their keywords, importance scores, and cross-file edges. Pass `kbId` if known, or leave empty to scan all knowledge bases.
|
|
97
|
+
2. `knowhere_discover_files` — if you're unsure which file contains the answer, run a keyword search across all KB documents and merge with the `get_map` results.
|
|
98
|
+
3. `knowhere_get_structure` — inspect the chapter/section hierarchy of the most relevant document.
|
|
99
|
+
4. `knowhere_read_chunks` — fetch the actual content. Use `sectionPath` to narrow to the specific chapter and minimize token usage.
|
|
88
100
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
3. Then read individual `chunks.json` files with your file reading tool for detailed content
|
|
101
|
+
❌ Do **not** use `exec` or shell commands to read `~/.knowhere/` files directly.
|
|
102
|
+
❌ Do **not** skip `knowhere_get_map` and jump straight to `knowhere_read_chunks`.
|
|
92
103
|
|
|
93
|
-
#### If no
|
|
104
|
+
#### If no Knowhere tools are available → self-navigate using file tools
|
|
94
105
|
|
|
95
106
|
Follow this pattern — do NOT explore the filesystem blindly:
|
|
96
107
|
|
|
@@ -106,13 +117,17 @@ Read `~/.knowhere/{kb_id}/knowledge_graph.json`:
|
|
|
106
117
|
```json
|
|
107
118
|
{
|
|
108
119
|
"version": "2.0",
|
|
109
|
-
"
|
|
120
|
+
"updated_at": "2026-04-09T10:00:00.000Z",
|
|
121
|
+
"kb_id": "telegram",
|
|
122
|
+
"stats": { "total_files": 5, "total_chunks": 327, "total_cross_file_edges": 12 },
|
|
110
123
|
"files": {
|
|
111
124
|
"report.docx": {
|
|
112
125
|
"chunks_count": 198,
|
|
113
126
|
"types": { "text": 135, "table": 21, "image": 42 },
|
|
114
127
|
"top_keywords": ["excavation", "retaining", "construction"],
|
|
115
|
-
"
|
|
128
|
+
"top_summary": "Construction safety report for the Lujiazui project.",
|
|
129
|
+
"importance": 0.85,
|
|
130
|
+
"created_at": "2026-04-09T08:00:00.000Z"
|
|
116
131
|
}
|
|
117
132
|
},
|
|
118
133
|
"edges": [
|
|
@@ -120,8 +135,16 @@ Read `~/.knowhere/{kb_id}/knowledge_graph.json`:
|
|
|
120
135
|
"source": "file_A.docx",
|
|
121
136
|
"target": "file_B.pdf",
|
|
122
137
|
"connection_count": 20,
|
|
138
|
+
"avg_score": 0.91,
|
|
123
139
|
"top_connections": [
|
|
124
|
-
{
|
|
140
|
+
{
|
|
141
|
+
"source_chunk": "Chapter 3",
|
|
142
|
+
"source_id": "uuid-a",
|
|
143
|
+
"target_chunk": "Safety Policy",
|
|
144
|
+
"target_id": "uuid-b",
|
|
145
|
+
"relation": "keyword",
|
|
146
|
+
"score": 1.0
|
|
147
|
+
}
|
|
125
148
|
]
|
|
126
149
|
}
|
|
127
150
|
]
|
|
@@ -165,3 +188,19 @@ Check `edges` from Step 1 for cross-document connections. If related files weren
|
|
|
165
188
|
- **Show connections**: mention cross-file relationships from edges
|
|
166
189
|
- **No internal IDs**: never expose `chunk_id` or UUID paths to the user
|
|
167
190
|
- **User's language**: reply in the same language the user is using
|
|
191
|
+
|
|
192
|
+
## Part 3: Deleting Knowledge
|
|
193
|
+
|
|
194
|
+
When the user asks to "delete", "remove", or "forget" a specific document:
|
|
195
|
+
|
|
196
|
+
1. Use `knowhere_get_map` to get an overview of all files in the knowledge base, then identify the correct `docId` that uniquely corresponds to the document the user named.
|
|
197
|
+
2. If the user provided a filename, use it to disambiguate across multiple hits.
|
|
198
|
+
3. Call `knowhere_delete_document` with the discovered `docId`.
|
|
199
|
+
|
|
200
|
+
The `knowhere_delete_document` tool natively handles all internal consistency logic:
|
|
201
|
+
|
|
202
|
+
- Fully cleaning up the document's local `chunks.json`, `images/`, and `tables/` data.
|
|
203
|
+
- Removing the symlink mapping from the knowledge base profile.
|
|
204
|
+
- Dispatching a background rebuild for `knowledge_graph.json` so that the reference disappears from future queries.
|
|
205
|
+
|
|
206
|
+
**Rule:** DO NOT try to execute Unix file deletion (`rm`) commands on `~/.knowhere/` directly. Always use `knowhere_delete_document`.
|