@ontos-ai/knowhere-claw 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/dist/connect-builder.d.ts +2 -0
- package/dist/connect-builder.js +9 -10
- package/dist/graph-builder.d.ts +4 -1
- package/dist/graph-builder.js +15 -10
- package/dist/index.js +1 -7
- package/dist/kg-service.js +8 -3
- package/dist/parser.d.ts +4 -8
- package/dist/parser.js +25 -243
- package/dist/store.d.ts +4 -14
- package/dist/store.js +21 -106
- package/dist/text.js +1 -13
- package/dist/tools.js +135 -879
- package/dist/types.d.ts +1 -58
- package/openclaw.plugin.json +71 -1
- package/package.json +1 -1
- package/skills/knowhere_memory/SKILL.md +80 -98
- package/skills/knowhere/SKILL.md +0 -280
- /package/dist/__tests__/{read-result-file-tool.test.d.ts → storage-layout.test.d.ts} +0 -0
package/README.md
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Knowhere for OpenClaw
|
|
2
2
|
|
|
3
3
|
Knowhere is an OpenClaw plugin that parses documents and URLs with Knowhere,
|
|
4
|
-
stores
|
|
4
|
+
stores extracted Knowhere files in OpenClaw state, and gives agents a
|
|
5
5
|
browse-first toolset for grounded document work.
|
|
6
6
|
|
|
7
7
|
Quick mental model:
|
|
@@ -22,7 +22,7 @@ the machine running that Gateway, then restart that Gateway.
|
|
|
22
22
|
## What You Get
|
|
23
23
|
|
|
24
24
|
- Ingest local files or document URLs with Knowhere
|
|
25
|
-
- Store parsed
|
|
25
|
+
- Store parsed documents inside OpenClaw-managed state
|
|
26
26
|
- Preview document structure, search chunks, and inspect raw result files
|
|
27
27
|
- Reuse stored documents across `session`, `agent`, or `global` scope
|
|
28
28
|
- Ship bundled `knowhere` and `knowhere_memory` skills so agents prefer this
|
|
@@ -73,7 +73,7 @@ Config notes:
|
|
|
73
73
|
- `pollIntervalMs`, `pollTimeoutMs`, `requestTimeoutMs`, `uploadTimeoutMs`:
|
|
74
74
|
optional tuning for job polling, API calls, and large uploads.
|
|
75
75
|
- An explicit `storageDir` such as
|
|
76
|
-
`/home/<user>/.openclaw/plugin-state/knowhere` makes stored
|
|
76
|
+
`/home/<user>/.openclaw/plugin-state/knowhere` makes stored documents
|
|
77
77
|
easier to inspect, back up, or clean up.
|
|
78
78
|
|
|
79
79
|
## How OpenClaw Uses It
|
|
@@ -110,13 +110,13 @@ actually call the plugin tools.
|
|
|
110
110
|
Within each scope, the plugin keeps:
|
|
111
111
|
|
|
112
112
|
- an `index.json` cache of stored document summaries
|
|
113
|
-
-
|
|
114
|
-
- the extracted Knowhere result
|
|
113
|
+
- a `metadata/` directory with one JSON record per stored document
|
|
114
|
+
- the extracted Knowhere result files directly inside each document directory
|
|
115
115
|
|
|
116
116
|
## Common Workflow
|
|
117
117
|
|
|
118
118
|
1. Provide a file path or URL to the agent.
|
|
119
|
-
2. The agent ingests it into Knowhere and
|
|
119
|
+
2. The agent ingests it into Knowhere. By default this starts parsing asynchronously and returns a job ID; when the current turn needs the parsed result immediately, the agent can call `knowhere_ingest_document` with `blockUntilComplete: true`.
|
|
120
120
|
3. Follow-up questions reuse stored results from the current scope.
|
|
121
121
|
4. When needed, the agent can preview structure, search chunks, read raw result
|
|
122
122
|
files, or clear stored documents.
|
package/dist/connect-builder.js
CHANGED
|
@@ -8,15 +8,14 @@ function normalizeKeyword(keyword) {
|
|
|
8
8
|
return keyword.toLowerCase().trim();
|
|
9
9
|
}
|
|
10
10
|
/**
|
|
11
|
-
* Extract file key from a chunk
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* Example: "Default_Root/report.docx/Chapter 1" -> "report.docx"
|
|
11
|
+
* Extract file key from a chunk.
|
|
12
|
+
* Prefers the explicit fileKey field; falls back to path-based extraction for backward compatibility.
|
|
15
13
|
*/
|
|
16
|
-
function
|
|
17
|
-
|
|
14
|
+
function getFileKey(chunk) {
|
|
15
|
+
if (chunk.fileKey) return chunk.fileKey;
|
|
16
|
+
const parts = chunk.path.replace(/-->/g, "/").split("/");
|
|
18
17
|
if (parts.length >= 2) return parts[1];
|
|
19
|
-
return path;
|
|
18
|
+
return chunk.path;
|
|
20
19
|
}
|
|
21
20
|
/**
|
|
22
21
|
* Build inverted keyword index: keyword -> [chunk_ids]
|
|
@@ -97,7 +96,7 @@ function buildConnections(chunks, config, logger) {
|
|
|
97
96
|
for (const chunk of chunks) chunkById.set(chunk.chunk_id, chunk);
|
|
98
97
|
const chunksByFile = /* @__PURE__ */ new Map();
|
|
99
98
|
for (const chunk of chunks) {
|
|
100
|
-
const fileKey =
|
|
99
|
+
const fileKey = getFileKey(chunk);
|
|
101
100
|
if (!chunksByFile.has(fileKey)) chunksByFile.set(fileKey, /* @__PURE__ */ new Set());
|
|
102
101
|
chunksByFile.get(fileKey).add(chunk.chunk_id);
|
|
103
102
|
}
|
|
@@ -120,14 +119,14 @@ function buildConnections(chunks, config, logger) {
|
|
|
120
119
|
const sourceChunk = chunkById.get(sourceId);
|
|
121
120
|
if (!sourceChunk) continue;
|
|
122
121
|
const sourceKeywords = getKeywords(sourceChunk);
|
|
123
|
-
const sourceFileKey =
|
|
122
|
+
const sourceFileKey = getFileKey(sourceChunk);
|
|
124
123
|
for (const targetId of targetIds) {
|
|
125
124
|
const targetChunk = chunkById.get(targetId);
|
|
126
125
|
if (!targetChunk) continue;
|
|
127
126
|
const pairKey = sourceId < targetId ? `${sourceId}::${targetId}` : `${targetId}::${sourceId}`;
|
|
128
127
|
if (seenPairs.has(pairKey)) continue;
|
|
129
128
|
seenPairs.add(pairKey);
|
|
130
|
-
const targetFileKey =
|
|
129
|
+
const targetFileKey = getFileKey(targetChunk);
|
|
131
130
|
if (config.crossFileOnly && sourceFileKey === targetFileKey) continue;
|
|
132
131
|
const contentRatio = sequenceMatcherRatio(sourceChunk.content.slice(0, 500), targetChunk.content.slice(0, 500));
|
|
133
132
|
if (contentRatio >= config.maxContentOverlap) {
|
package/dist/graph-builder.d.ts
CHANGED
|
@@ -32,12 +32,15 @@ interface FileMetadata {
|
|
|
32
32
|
top_keywords: string[];
|
|
33
33
|
top_summary: string;
|
|
34
34
|
importance: number;
|
|
35
|
+
created_at: string;
|
|
35
36
|
}
|
|
36
37
|
/**
|
|
37
38
|
* Complete knowledge graph structure
|
|
38
39
|
*/
|
|
39
40
|
export interface KnowledgeGraph {
|
|
40
41
|
version: string;
|
|
42
|
+
updated_at: string;
|
|
43
|
+
kb_id: string;
|
|
41
44
|
stats: {
|
|
42
45
|
total_files: number;
|
|
43
46
|
total_chunks: number;
|
|
@@ -59,7 +62,7 @@ export interface ChunkStats {
|
|
|
59
62
|
* Main function to build knowledge graph
|
|
60
63
|
* Equivalent to Python: build_knowledge_graph
|
|
61
64
|
*/
|
|
62
|
-
export declare function buildKnowledgeGraph(chunks: ChunkData[], connections: Connection[], chunkStats: ChunkStats, jiebaInitialized: boolean, logger?: PluginLogger): KnowledgeGraph;
|
|
65
|
+
export declare function buildKnowledgeGraph(chunks: ChunkData[], connections: Connection[], chunkStats: ChunkStats, jiebaInitialized: boolean, logger?: PluginLogger, kbId?: string): KnowledgeGraph;
|
|
63
66
|
/**
|
|
64
67
|
* Incremental update: match new chunks against existing chunks
|
|
65
68
|
* Equivalent to Python: _incremental_connections
|
package/dist/graph-builder.js
CHANGED
|
@@ -7,12 +7,14 @@ import * as nodejieba from "nodejieba";
|
|
|
7
7
|
* Builds file-level knowledge graphs from chunk connections with TF-IDF and importance scoring.
|
|
8
8
|
*/
|
|
9
9
|
/**
|
|
10
|
-
* Extract file key from chunk
|
|
10
|
+
* Extract file key from a chunk.
|
|
11
|
+
* Prefers the explicit fileKey field; falls back to path-based extraction for backward compatibility.
|
|
11
12
|
*/
|
|
12
|
-
function
|
|
13
|
-
|
|
13
|
+
function getFileKey(chunk) {
|
|
14
|
+
if (chunk.fileKey) return chunk.fileKey;
|
|
15
|
+
const parts = chunk.path.replace(/-->/g, "/").split("/");
|
|
14
16
|
if (parts.length >= 2) return parts[1];
|
|
15
|
-
return path;
|
|
17
|
+
return chunk.path;
|
|
16
18
|
}
|
|
17
19
|
/**
|
|
18
20
|
* Extract label from chunk path (last segment)
|
|
@@ -107,7 +109,7 @@ function computeFileImportance(fileKey, fileChunks, allChunks, chunkStats, decay
|
|
|
107
109
|
function getAllChunkCountsByFile(chunks) {
|
|
108
110
|
const countsByFile = /* @__PURE__ */ new Map();
|
|
109
111
|
for (const chunk of chunks) {
|
|
110
|
-
const fileKey =
|
|
112
|
+
const fileKey = getFileKey(chunk);
|
|
111
113
|
countsByFile.set(fileKey, (countsByFile.get(fileKey) || 0) + 1);
|
|
112
114
|
}
|
|
113
115
|
return Array.from(countsByFile.values());
|
|
@@ -122,8 +124,8 @@ function aggregateFileLevelEdges(connections, chunkById, topN = 5) {
|
|
|
122
124
|
const sourceChunk = chunkById.get(conn.source);
|
|
123
125
|
const targetChunk = chunkById.get(conn.target);
|
|
124
126
|
if (!sourceChunk || !targetChunk) continue;
|
|
125
|
-
const sourceFile =
|
|
126
|
-
const targetFile =
|
|
127
|
+
const sourceFile = getFileKey(sourceChunk);
|
|
128
|
+
const targetFile = getFileKey(targetChunk);
|
|
127
129
|
if (sourceFile === targetFile) continue;
|
|
128
130
|
const pairKey = sourceFile < targetFile ? `${sourceFile}::${targetFile}` : `${targetFile}::${sourceFile}`;
|
|
129
131
|
if (!filePairs.has(pairKey)) filePairs.set(pairKey, []);
|
|
@@ -159,13 +161,13 @@ function aggregateFileLevelEdges(connections, chunkById, topN = 5) {
|
|
|
159
161
|
* Main function to build knowledge graph
|
|
160
162
|
* Equivalent to Python: build_knowledge_graph
|
|
161
163
|
*/
|
|
162
|
-
function buildKnowledgeGraph(chunks, connections, chunkStats, jiebaInitialized, logger) {
|
|
164
|
+
function buildKnowledgeGraph(chunks, connections, chunkStats, jiebaInitialized, logger, kbId) {
|
|
163
165
|
logger?.info(`Building knowledge graph from ${chunks.length} chunks and ${connections.length} connections`);
|
|
164
166
|
const chunkById = /* @__PURE__ */ new Map();
|
|
165
167
|
for (const chunk of chunks) chunkById.set(chunk.chunk_id, chunk);
|
|
166
168
|
const chunksByFile = /* @__PURE__ */ new Map();
|
|
167
169
|
for (const chunk of chunks) {
|
|
168
|
-
const fileKey =
|
|
170
|
+
const fileKey = getFileKey(chunk);
|
|
169
171
|
if (!chunksByFile.has(fileKey)) chunksByFile.set(fileKey, []);
|
|
170
172
|
chunksByFile.get(fileKey).push(chunk);
|
|
171
173
|
}
|
|
@@ -188,13 +190,16 @@ function buildKnowledgeGraph(chunks, connections, chunkStats, jiebaInitialized,
|
|
|
188
190
|
types: typeCount,
|
|
189
191
|
top_keywords: topKeywords,
|
|
190
192
|
top_summary: topSummary,
|
|
191
|
-
importance
|
|
193
|
+
importance,
|
|
194
|
+
created_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
192
195
|
};
|
|
193
196
|
}
|
|
194
197
|
const fileEdges = aggregateFileLevelEdges(connections, chunkById, 5);
|
|
195
198
|
logger?.info(`Created graph with ${Object.keys(filesMetadata).length} files and ${fileEdges.length} edges`);
|
|
196
199
|
return {
|
|
197
200
|
version: "2.0",
|
|
201
|
+
updated_at: (/* @__PURE__ */ new Date()).toISOString(),
|
|
202
|
+
kb_id: kbId || "",
|
|
198
203
|
stats: {
|
|
199
204
|
total_files: Object.keys(filesMetadata).length,
|
|
200
205
|
total_chunks: chunks.length,
|
package/dist/index.js
CHANGED
|
@@ -6,7 +6,7 @@ import { KnowledgeGraphService } from "./kg-service.js";
|
|
|
6
6
|
const plugin = {
|
|
7
7
|
id: "knowhere-claw",
|
|
8
8
|
name: "Knowhere",
|
|
9
|
-
description: "Knowhere document ingestion and
|
|
9
|
+
description: "Knowhere document ingestion, job management, and knowledge graph tools for OpenClaw.",
|
|
10
10
|
configSchema: knowherePluginConfigSchema,
|
|
11
11
|
register(api) {
|
|
12
12
|
const config = resolveKnowhereConfig(api);
|
|
@@ -41,12 +41,6 @@ const plugin = {
|
|
|
41
41
|
"knowhere_list_jobs",
|
|
42
42
|
"knowhere_get_job_status",
|
|
43
43
|
"knowhere_import_completed_job",
|
|
44
|
-
"knowhere_grep",
|
|
45
|
-
"knowhere_read_result_file",
|
|
46
|
-
"knowhere_preview_document",
|
|
47
|
-
"knowhere_list_documents",
|
|
48
|
-
"knowhere_remove_document",
|
|
49
|
-
"knowhere_clear_scope",
|
|
50
44
|
"knowhere_set_api_key",
|
|
51
45
|
"knowhere_kg_list",
|
|
52
46
|
"knowhere_kg_query"
|
package/dist/kg-service.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { resolveStoredKnowhereResultRoot } from "./parser.js";
|
|
1
2
|
import { buildConnections, init_connect_builder } from "./connect-builder.js";
|
|
2
3
|
import { buildKnowledgeGraph } from "./graph-builder.js";
|
|
3
4
|
import path from "node:path";
|
|
@@ -143,7 +144,8 @@ var KnowledgeGraphService = class {
|
|
|
143
144
|
const kbPath = await this.ensureKbDirectory(params.kbId);
|
|
144
145
|
const docDir = path.join(kbPath, params.docId);
|
|
145
146
|
await fs.ensureDir(docDir);
|
|
146
|
-
await
|
|
147
|
+
const sourceResultRoot = await resolveStoredKnowhereResultRoot(params.sourcePath);
|
|
148
|
+
await fs.copy(sourceResultRoot, docDir, { overwrite: true });
|
|
147
149
|
const keywordsPath = path.join(docDir, "keywords.json");
|
|
148
150
|
await fs.writeJSON(keywordsPath, params.keywords, { spaces: 2 });
|
|
149
151
|
const metadataPath = path.join(docDir, "metadata.json");
|
|
@@ -185,7 +187,10 @@ var KnowledgeGraphService = class {
|
|
|
185
187
|
const chunksPath = path.join(kbPath, docDir, "chunks.json");
|
|
186
188
|
if (await fs.pathExists(chunksPath)) {
|
|
187
189
|
const chunksData = await fs.readJSON(chunksPath);
|
|
188
|
-
if (chunksData.chunks && Array.isArray(chunksData.chunks)) allChunks.push(...chunksData.chunks)
|
|
190
|
+
if (chunksData.chunks && Array.isArray(chunksData.chunks)) allChunks.push(...chunksData.chunks.map((c) => ({
|
|
191
|
+
...c,
|
|
192
|
+
fileKey: docDir
|
|
193
|
+
})));
|
|
189
194
|
}
|
|
190
195
|
}
|
|
191
196
|
if (allChunks.length === 0) {
|
|
@@ -198,7 +203,7 @@ var KnowledgeGraphService = class {
|
|
|
198
203
|
const chunkStatsPath = path.join(kbPath, "chunk_stats.json");
|
|
199
204
|
let chunkStats = {};
|
|
200
205
|
if (await fs.pathExists(chunkStatsPath)) chunkStats = await fs.readJSON(chunkStatsPath);
|
|
201
|
-
const knowledgeGraph = buildKnowledgeGraph(allChunks, connections, chunkStats, this.jiebaInitialized, this.logger);
|
|
206
|
+
const knowledgeGraph = buildKnowledgeGraph(allChunks, connections, chunkStats, this.jiebaInitialized, this.logger, kbId);
|
|
202
207
|
const graphFile = path.join(kbPath, "knowledge_graph.json");
|
|
203
208
|
await fs.writeJSON(graphFile, knowledgeGraph, { spaces: 2 });
|
|
204
209
|
this.logger.info(`Knowledge graph saved to ${graphFile}`);
|
package/dist/parser.d.ts
CHANGED
|
@@ -1,16 +1,12 @@
|
|
|
1
|
-
import type { KnowhereDownloadedResult, KnowhereManifest,
|
|
1
|
+
import type { KnowhereDownloadedResult, KnowhereManifest, KnowhereStatistics } from "./types";
|
|
2
2
|
type KnowhereStoredResultSummary = {
|
|
3
3
|
manifest: KnowhereManifest;
|
|
4
4
|
chunkCount: number;
|
|
5
5
|
statistics: KnowhereStatistics;
|
|
6
6
|
};
|
|
7
|
-
export declare const STORED_BROWSE_INDEX_VERSION = 2;
|
|
8
7
|
export declare function resolveResultEntryPath(rootDir: string, entryPath: string): string;
|
|
9
|
-
export declare function buildStoredPathPrefixes(storedPath: string): string[];
|
|
10
|
-
export declare function isStoredBrowseIndex(value: unknown): value is StoredBrowseIndex;
|
|
11
|
-
export declare function buildStoredBrowseIndex(resultDir: string, manifest: KnowhereManifest, chunks: StoredChunk[]): Promise<StoredBrowseIndex>;
|
|
12
8
|
export declare function extractKnowhereResultArchive(downloadedResult: KnowhereDownloadedResult, targetDir: string): Promise<void>;
|
|
13
|
-
export declare function
|
|
14
|
-
export declare function
|
|
15
|
-
export declare function
|
|
9
|
+
export declare function resolveStoredKnowhereResultRoot(documentDir: string): Promise<string>;
|
|
10
|
+
export declare function resolveStoredKnowhereArtifactPath(documentDir: string, entryPath: string): Promise<string>;
|
|
11
|
+
export declare function readStoredKnowhereResultSummary(documentDir: string): Promise<KnowhereStoredResultSummary>;
|
|
16
12
|
export {};
|
package/dist/parser.js
CHANGED
|
@@ -3,11 +3,9 @@ import fs from "node:fs/promises";
|
|
|
3
3
|
import path from "node:path";
|
|
4
4
|
import { createHash } from "node:crypto";
|
|
5
5
|
import { strFromU8, unzipSync } from "fflate";
|
|
6
|
+
//#region src/parser.ts
|
|
6
7
|
const CHUNKS_FILE_NAME = "chunks.json";
|
|
7
|
-
const
|
|
8
|
-
const HIERARCHY_FILE_NAME = "hierarchy.json";
|
|
9
|
-
const HIERARCHY_VIEW_FILE_NAME = "hierarchy_view.html";
|
|
10
|
-
const KB_CSV_FILE_NAME = "kb.csv";
|
|
8
|
+
const LEGACY_RESULT_DIRECTORY_NAME = "result";
|
|
11
9
|
const MANIFEST_FILE_NAME = "manifest.json";
|
|
12
10
|
function readZipText(entries, fileName) {
|
|
13
11
|
const entry = entries[fileName];
|
|
@@ -16,6 +14,15 @@ function readZipText(entries, fileName) {
|
|
|
16
14
|
async function ensureDir(targetPath) {
|
|
17
15
|
await fs.mkdir(targetPath, { recursive: true });
|
|
18
16
|
}
|
|
17
|
+
async function pathExists(targetPath) {
|
|
18
|
+
try {
|
|
19
|
+
await fs.access(targetPath);
|
|
20
|
+
return true;
|
|
21
|
+
} catch (error) {
|
|
22
|
+
if (isNodeError(error) && error.code === "ENOENT") return false;
|
|
23
|
+
throw error;
|
|
24
|
+
}
|
|
25
|
+
}
|
|
19
26
|
async function readTextFile(targetPath) {
|
|
20
27
|
try {
|
|
21
28
|
return await fs.readFile(targetPath, "utf-8");
|
|
@@ -39,36 +46,6 @@ function resolveResultEntryPath(rootDir, entryPath) {
|
|
|
39
46
|
function normalizeRelativePath(value) {
|
|
40
47
|
return value.replace(/\\/g, "/").replace(/^\/+/, "");
|
|
41
48
|
}
|
|
42
|
-
function normalizeStoredPath(value) {
|
|
43
|
-
if (typeof value !== "string") return null;
|
|
44
|
-
return value.trim() || null;
|
|
45
|
-
}
|
|
46
|
-
function readChunkNumber(rawChunk, metadata, key) {
|
|
47
|
-
const metadataValue = metadata[key];
|
|
48
|
-
if (typeof metadataValue === "number" && Number.isFinite(metadataValue)) return metadataValue;
|
|
49
|
-
const rawValue = rawChunk[key];
|
|
50
|
-
if (typeof rawValue === "number" && Number.isFinite(rawValue)) return rawValue;
|
|
51
|
-
return null;
|
|
52
|
-
}
|
|
53
|
-
function readChunkStringArray(rawChunk, metadata, key) {
|
|
54
|
-
const metadataValue = metadata[key];
|
|
55
|
-
if (Array.isArray(metadataValue)) return metadataValue.filter((entry) => typeof entry === "string");
|
|
56
|
-
const rawValue = rawChunk[key];
|
|
57
|
-
if (Array.isArray(rawValue)) return rawValue.filter((entry) => typeof entry === "string");
|
|
58
|
-
return [];
|
|
59
|
-
}
|
|
60
|
-
function readChunkArray(rawChunk, metadata, key) {
|
|
61
|
-
const metadataValue = metadata[key];
|
|
62
|
-
if (Array.isArray(metadataValue)) return metadataValue;
|
|
63
|
-
const rawValue = rawChunk[key];
|
|
64
|
-
if (Array.isArray(rawValue)) return rawValue;
|
|
65
|
-
return [];
|
|
66
|
-
}
|
|
67
|
-
function extractAssetFilePath(rawChunk, metadata) {
|
|
68
|
-
const candidates = [rawChunk.file_path, metadata.file_path];
|
|
69
|
-
for (const candidate of candidates) if (typeof candidate === "string" && candidate.trim()) return normalizeRelativePath(candidate.trim());
|
|
70
|
-
return null;
|
|
71
|
-
}
|
|
72
49
|
function parseRawChunks(value) {
|
|
73
50
|
if (Array.isArray(value)) return value.filter((entry) => isRecord(entry));
|
|
74
51
|
if (isRecord(value) && Array.isArray(value.chunks)) return value.chunks.filter((entry) => isRecord(entry));
|
|
@@ -77,24 +54,6 @@ function parseRawChunks(value) {
|
|
|
77
54
|
function parseManifest(value) {
|
|
78
55
|
return isRecord(value) ? value : {};
|
|
79
56
|
}
|
|
80
|
-
function buildChunk(rawChunk) {
|
|
81
|
-
const metadata = isRecord(rawChunk.metadata) ? rawChunk.metadata : {};
|
|
82
|
-
const type = rawChunk.type === "image" || rawChunk.type === "table" || rawChunk.type === "text" ? rawChunk.type : "text";
|
|
83
|
-
return {
|
|
84
|
-
chunkId: typeof rawChunk.chunk_id === "string" ? rawChunk.chunk_id : "",
|
|
85
|
-
type,
|
|
86
|
-
path: normalizeStoredPath(rawChunk.path),
|
|
87
|
-
summary: typeof metadata.summary === "string" ? metadata.summary : typeof rawChunk.summary === "string" ? rawChunk.summary : "",
|
|
88
|
-
content: typeof rawChunk.content === "string" ? rawChunk.content : "",
|
|
89
|
-
tokens: readChunkNumber(rawChunk, metadata, "tokens"),
|
|
90
|
-
keywords: readChunkStringArray(rawChunk, metadata, "keywords"),
|
|
91
|
-
relationships: readChunkArray(rawChunk, metadata, "relationships"),
|
|
92
|
-
metadata,
|
|
93
|
-
assetFilePath: extractAssetFilePath(rawChunk, metadata),
|
|
94
|
-
originalName: typeof metadata.original_name === "string" ? metadata.original_name : typeof rawChunk.original_name === "string" ? rawChunk.original_name : null,
|
|
95
|
-
tableType: typeof metadata.table_type === "string" ? metadata.table_type : typeof rawChunk.table_type === "string" ? rawChunk.table_type : null
|
|
96
|
-
};
|
|
97
|
-
}
|
|
98
57
|
function normalizeStatistics(manifest, rawChunks) {
|
|
99
58
|
if (manifest.statistics) return manifest.statistics;
|
|
100
59
|
return {
|
|
@@ -110,185 +69,6 @@ function validateKnowhereResultChecksum(zipBuffer, manifest) {
|
|
|
110
69
|
if (typeof checksum !== "string" || !checksum) return;
|
|
111
70
|
if (createHash("sha256").update(zipBuffer).digest("hex") !== checksum) throw new Error("Knowhere result ZIP checksum mismatch.");
|
|
112
71
|
}
|
|
113
|
-
function tokenizeStoredPath(storedPath) {
|
|
114
|
-
const slashSegments = storedPath.split("/").map((segment) => segment.trim()).filter(Boolean);
|
|
115
|
-
const tokens = [];
|
|
116
|
-
for (const slashSegment of slashSegments) {
|
|
117
|
-
const arrowSegments = slashSegment.split("-->").map((segment) => segment.trim()).filter(Boolean);
|
|
118
|
-
if (arrowSegments.length === 0) continue;
|
|
119
|
-
tokens.push({
|
|
120
|
-
delimiter: tokens.length === 0 ? null : "/",
|
|
121
|
-
segment: arrowSegments[0] || ""
|
|
122
|
-
});
|
|
123
|
-
for (const arrowSegment of arrowSegments.slice(1)) tokens.push({
|
|
124
|
-
delimiter: "-->",
|
|
125
|
-
segment: arrowSegment
|
|
126
|
-
});
|
|
127
|
-
}
|
|
128
|
-
return tokens;
|
|
129
|
-
}
|
|
130
|
-
function buildStoredPathPrefixes(storedPath) {
|
|
131
|
-
const tokens = tokenizeStoredPath(storedPath);
|
|
132
|
-
const prefixes = [];
|
|
133
|
-
let currentPath = "";
|
|
134
|
-
for (const token of tokens) {
|
|
135
|
-
currentPath = token.delimiter ? `${currentPath}${token.delimiter}${token.segment}` : token.segment;
|
|
136
|
-
prefixes.push(currentPath);
|
|
137
|
-
}
|
|
138
|
-
return prefixes;
|
|
139
|
-
}
|
|
140
|
-
function ensurePathAccumulator(accumulators, pathValue, parentPath, depth) {
|
|
141
|
-
const existing = accumulators.get(pathValue);
|
|
142
|
-
if (existing) {
|
|
143
|
-
if (parentPath && !existing.parentPath) existing.parentPath = parentPath;
|
|
144
|
-
return existing;
|
|
145
|
-
}
|
|
146
|
-
const next = {
|
|
147
|
-
childPaths: /* @__PURE__ */ new Set(),
|
|
148
|
-
chunkCount: 0,
|
|
149
|
-
chunkIds: [],
|
|
150
|
-
depth,
|
|
151
|
-
directChunkCount: 0,
|
|
152
|
-
imageChunkCount: 0,
|
|
153
|
-
parentPath,
|
|
154
|
-
path: pathValue,
|
|
155
|
-
tableChunkCount: 0,
|
|
156
|
-
textChunkCount: 0
|
|
157
|
-
};
|
|
158
|
-
accumulators.set(pathValue, next);
|
|
159
|
-
return next;
|
|
160
|
-
}
|
|
161
|
-
function incrementPathCounters(accumulator, chunkType) {
|
|
162
|
-
accumulator.chunkCount += 1;
|
|
163
|
-
if (chunkType === "image") {
|
|
164
|
-
accumulator.imageChunkCount += 1;
|
|
165
|
-
return;
|
|
166
|
-
}
|
|
167
|
-
if (chunkType === "table") {
|
|
168
|
-
accumulator.tableChunkCount += 1;
|
|
169
|
-
return;
|
|
170
|
-
}
|
|
171
|
-
accumulator.textChunkCount += 1;
|
|
172
|
-
}
|
|
173
|
-
function buildPathRecords(chunks) {
|
|
174
|
-
const accumulators = /* @__PURE__ */ new Map();
|
|
175
|
-
for (const chunk of chunks) {
|
|
176
|
-
if (!chunk.path) continue;
|
|
177
|
-
const prefixes = buildStoredPathPrefixes(chunk.path);
|
|
178
|
-
for (const [index, prefix] of prefixes.entries()) {
|
|
179
|
-
const parentPath = index > 0 ? prefixes[index - 1] || null : null;
|
|
180
|
-
const accumulator = ensurePathAccumulator(accumulators, prefix, parentPath, index + 1);
|
|
181
|
-
incrementPathCounters(accumulator, chunk.type);
|
|
182
|
-
if (parentPath) ensurePathAccumulator(accumulators, parentPath, index > 1 ? prefixes[index - 2] || null : null, index).childPaths.add(prefix);
|
|
183
|
-
if (index === prefixes.length - 1) {
|
|
184
|
-
accumulator.directChunkCount += 1;
|
|
185
|
-
if (chunk.chunkId) accumulator.chunkIds.push(chunk.chunkId);
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
}
|
|
189
|
-
return [...accumulators.values()].sort((left, right) => left.depth - right.depth || left.path.localeCompare(right.path)).map((entry) => ({
|
|
190
|
-
path: entry.path,
|
|
191
|
-
parentPath: entry.parentPath,
|
|
192
|
-
depth: entry.depth,
|
|
193
|
-
childPaths: [...entry.childPaths].sort((left, right) => left.localeCompare(right)),
|
|
194
|
-
chunkIds: [...entry.chunkIds],
|
|
195
|
-
directChunkCount: entry.directChunkCount,
|
|
196
|
-
chunkCount: entry.chunkCount,
|
|
197
|
-
textChunkCount: entry.textChunkCount,
|
|
198
|
-
imageChunkCount: entry.imageChunkCount,
|
|
199
|
-
tableChunkCount: entry.tableChunkCount
|
|
200
|
-
}));
|
|
201
|
-
}
|
|
202
|
-
function readManifestAssetEntries(manifest, key) {
|
|
203
|
-
const rawEntries = (isRecord(manifest.files) ? manifest.files : {})[key];
|
|
204
|
-
if (!Array.isArray(rawEntries)) return [];
|
|
205
|
-
return rawEntries.filter((entry) => isRecord(entry));
|
|
206
|
-
}
|
|
207
|
-
function buildResultFileChunkLookup(manifest, chunks) {
|
|
208
|
-
const entries = /* @__PURE__ */ new Map();
|
|
209
|
-
for (const key of ["images", "tables"]) {
|
|
210
|
-
const assetEntries = readManifestAssetEntries(manifest, key);
|
|
211
|
-
for (const entry of assetEntries) {
|
|
212
|
-
const filePath = typeof entry.file_path === "string" && entry.file_path.trim() ? normalizeRelativePath(entry.file_path.trim()) : null;
|
|
213
|
-
if (!filePath) continue;
|
|
214
|
-
entries.set(filePath, {
|
|
215
|
-
chunkId: typeof entry.id === "string" ? entry.id : null,
|
|
216
|
-
format: typeof entry.format === "string" ? entry.format : null
|
|
217
|
-
});
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
for (const chunk of chunks) {
|
|
221
|
-
if (!chunk.assetFilePath || entries.has(chunk.assetFilePath)) continue;
|
|
222
|
-
entries.set(chunk.assetFilePath, {
|
|
223
|
-
chunkId: chunk.chunkId || null,
|
|
224
|
-
format: null
|
|
225
|
-
});
|
|
226
|
-
}
|
|
227
|
-
return entries;
|
|
228
|
-
}
|
|
229
|
-
function inferResultFileKind(relativePath) {
|
|
230
|
-
if (relativePath === MANIFEST_FILE_NAME) return "manifest";
|
|
231
|
-
if (relativePath === CHUNKS_FILE_NAME) return "chunks";
|
|
232
|
-
if (relativePath === FULL_MARKDOWN_FILE_NAME) return "fullMarkdown";
|
|
233
|
-
if (relativePath === KB_CSV_FILE_NAME) return "kbCsv";
|
|
234
|
-
if (relativePath === HIERARCHY_FILE_NAME) return "hierarchy";
|
|
235
|
-
if (relativePath === HIERARCHY_VIEW_FILE_NAME) return "hierarchyView";
|
|
236
|
-
if (relativePath.startsWith("images/")) return "image";
|
|
237
|
-
if (relativePath.startsWith("tables/")) return "table";
|
|
238
|
-
return "other";
|
|
239
|
-
}
|
|
240
|
-
function inferResultFileFormat(relativePath) {
|
|
241
|
-
return path.posix.extname(relativePath).replace(/^\./, "").trim() || null;
|
|
242
|
-
}
|
|
243
|
-
function isStringArray(value) {
|
|
244
|
-
return Array.isArray(value) && value.every((entry) => typeof entry === "string");
|
|
245
|
-
}
|
|
246
|
-
function isStoredBrowseIndex(value) {
|
|
247
|
-
if (!isRecord(value)) return false;
|
|
248
|
-
if (value.version !== 2) return false;
|
|
249
|
-
if (!isStringArray(value.chunkOrder)) return false;
|
|
250
|
-
if (!Array.isArray(value.paths) || !Array.isArray(value.resultFiles)) return false;
|
|
251
|
-
if (!value.paths.every((entry) => isRecord(entry) && typeof entry.path === "string" && (entry.parentPath === null || typeof entry.parentPath === "string") && typeof entry.depth === "number" && Number.isFinite(entry.depth) && isStringArray(entry.childPaths) && isStringArray(entry.chunkIds) && typeof entry.directChunkCount === "number" && typeof entry.chunkCount === "number" && typeof entry.textChunkCount === "number" && typeof entry.imageChunkCount === "number" && typeof entry.tableChunkCount === "number")) return false;
|
|
252
|
-
return value.resultFiles.every((entry) => isRecord(entry) && typeof entry.relativePath === "string" && typeof entry.kind === "string" && (entry.chunkId === null || typeof entry.chunkId === "string") && (entry.format === null || typeof entry.format === "string") && (entry.sizeBytes === null || typeof entry.sizeBytes === "number" && Number.isFinite(entry.sizeBytes)));
|
|
253
|
-
}
|
|
254
|
-
async function listResultFiles(rootDir, currentDir = rootDir) {
|
|
255
|
-
const entries = await fs.readdir(currentDir, { withFileTypes: true });
|
|
256
|
-
const files = [];
|
|
257
|
-
for (const entry of entries) {
|
|
258
|
-
const absolutePath = path.join(currentDir, entry.name);
|
|
259
|
-
if (entry.isDirectory()) {
|
|
260
|
-
files.push(...await listResultFiles(rootDir, absolutePath));
|
|
261
|
-
continue;
|
|
262
|
-
}
|
|
263
|
-
if (!entry.isFile()) continue;
|
|
264
|
-
files.push(normalizeRelativePath(path.relative(rootDir, absolutePath)));
|
|
265
|
-
}
|
|
266
|
-
return files.sort((left, right) => left.localeCompare(right));
|
|
267
|
-
}
|
|
268
|
-
async function buildResultFileRecords(resultDir, manifest, chunks) {
|
|
269
|
-
const lookup = buildResultFileChunkLookup(manifest, chunks);
|
|
270
|
-
const relativePaths = await listResultFiles(resultDir);
|
|
271
|
-
return Promise.all(relativePaths.map(async (relativePath) => {
|
|
272
|
-
const absolutePath = resolveResultEntryPath(resultDir, relativePath);
|
|
273
|
-
const stats = await fs.stat(absolutePath);
|
|
274
|
-
const manifestEntry = lookup.get(relativePath);
|
|
275
|
-
return {
|
|
276
|
-
relativePath,
|
|
277
|
-
kind: inferResultFileKind(relativePath),
|
|
278
|
-
chunkId: manifestEntry?.chunkId ?? null,
|
|
279
|
-
format: manifestEntry?.format ?? inferResultFileFormat(relativePath),
|
|
280
|
-
sizeBytes: stats.isFile() ? stats.size : null
|
|
281
|
-
};
|
|
282
|
-
}));
|
|
283
|
-
}
|
|
284
|
-
async function buildStoredBrowseIndex(resultDir, manifest, chunks) {
|
|
285
|
-
return {
|
|
286
|
-
version: 2,
|
|
287
|
-
paths: buildPathRecords(chunks),
|
|
288
|
-
chunkOrder: chunks.map((chunk) => chunk.chunkId).filter((chunkId) => chunkId.length > 0),
|
|
289
|
-
resultFiles: await buildResultFileRecords(resultDir, manifest, chunks)
|
|
290
|
-
};
|
|
291
|
-
}
|
|
292
72
|
async function extractKnowhereResultArchive(downloadedResult, targetDir) {
|
|
293
73
|
const zipBuffer = Buffer.isBuffer(downloadedResult.zipBytes) ? downloadedResult.zipBytes : Buffer.from(downloadedResult.zipBytes);
|
|
294
74
|
const entries = unzipSync(new Uint8Array(zipBuffer));
|
|
@@ -302,22 +82,24 @@ async function extractKnowhereResultArchive(downloadedResult, targetDir) {
|
|
|
302
82
|
await fs.writeFile(outputPath, entryBytes);
|
|
303
83
|
}
|
|
304
84
|
}
|
|
305
|
-
async function
|
|
306
|
-
|
|
307
|
-
const
|
|
85
|
+
async function resolveStoredKnowhereResultRoot(documentDir) {
|
|
86
|
+
if (await pathExists(path.join(documentDir, MANIFEST_FILE_NAME))) return documentDir;
|
|
87
|
+
const legacyResultDir = path.join(documentDir, LEGACY_RESULT_DIRECTORY_NAME);
|
|
88
|
+
if (await pathExists(path.join(legacyResultDir, MANIFEST_FILE_NAME))) return legacyResultDir;
|
|
89
|
+
return documentDir;
|
|
90
|
+
}
|
|
91
|
+
async function resolveStoredKnowhereArtifactPath(documentDir, entryPath) {
|
|
92
|
+
return resolveResultEntryPath(await resolveStoredKnowhereResultRoot(documentDir), entryPath);
|
|
93
|
+
}
|
|
94
|
+
async function readStoredKnowhereResultSummary(documentDir) {
|
|
95
|
+
const resultRoot = await resolveStoredKnowhereResultRoot(documentDir);
|
|
96
|
+
const manifest = parseManifest(await readJsonFile(path.join(resultRoot, MANIFEST_FILE_NAME)));
|
|
97
|
+
const rawChunks = parseRawChunks(await readJsonFile(path.join(resultRoot, CHUNKS_FILE_NAME)));
|
|
308
98
|
return {
|
|
309
99
|
manifest,
|
|
310
100
|
chunkCount: rawChunks.length,
|
|
311
101
|
statistics: normalizeStatistics(manifest, rawChunks)
|
|
312
102
|
};
|
|
313
103
|
}
|
|
314
|
-
async function readStoredKnowhereResultContent(resultDir) {
|
|
315
|
-
return {
|
|
316
|
-
manifest: parseManifest(await readJsonFile(path.join(resultDir, MANIFEST_FILE_NAME))),
|
|
317
|
-
chunks: parseRawChunks(await readJsonFile(path.join(resultDir, CHUNKS_FILE_NAME))).map((rawChunk) => buildChunk(rawChunk)),
|
|
318
|
-
fullMarkdown: await readTextFile(path.join(resultDir, FULL_MARKDOWN_FILE_NAME)) || "",
|
|
319
|
-
hierarchy: await readJsonFile(path.join(resultDir, HIERARCHY_FILE_NAME))
|
|
320
|
-
};
|
|
321
|
-
}
|
|
322
104
|
//#endregion
|
|
323
|
-
export {
|
|
105
|
+
export { extractKnowhereResultArchive, readStoredKnowhereResultSummary, resolveStoredKnowhereArtifactPath, resolveStoredKnowhereResultRoot };
|
package/dist/store.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import type { ChannelRouteRecord, KnowhereScope, PluginLogger, SaveStoredDocumentPayload, ScopeMode,
|
|
1
|
+
import type { ChannelRouteRecord, KnowhereScope, PluginLogger, SaveStoredDocumentPayload, ScopeMode, StoredDocumentMetadata, StoredDocumentRecord } from "./types";
|
|
2
|
+
type StoredDocumentScopePaths = Pick<KnowhereScope, "documentsDir" | "metadataDir">;
|
|
2
3
|
export declare class KnowhereStore {
|
|
3
4
|
private readonly rootDir;
|
|
4
5
|
private readonly scopeMode;
|
|
5
6
|
private readonly logger;
|
|
6
7
|
private readonly indexCache;
|
|
7
|
-
private readonly documentPayloadCache;
|
|
8
8
|
private readonly scopeAccessChains;
|
|
9
9
|
private readonly scopeKeyAliases;
|
|
10
10
|
private readonly sessionScopeKeysBySessionId;
|
|
@@ -48,14 +48,8 @@ export declare class KnowhereStore {
|
|
|
48
48
|
sessionKey?: string;
|
|
49
49
|
sessionId?: string;
|
|
50
50
|
}): KnowhereScope;
|
|
51
|
+
readDocumentMetadata(scope: StoredDocumentScopePaths, docId: string): Promise<StoredDocumentMetadata | null>;
|
|
51
52
|
listDocuments(scope: KnowhereScope): Promise<StoredDocumentRecord[]>;
|
|
52
|
-
loadDocumentPayload(scope: KnowhereScope, docId: string): Promise<StoredDocumentPayload | null>;
|
|
53
|
-
getResultFileAbsolutePath(scope: KnowhereScope, docId: string, relativePath: string): string;
|
|
54
|
-
readResultFile(scope: KnowhereScope, docId: string, relativePath: string): Promise<{
|
|
55
|
-
document: StoredDocumentRecord;
|
|
56
|
-
relativePath: string;
|
|
57
|
-
text: string | null;
|
|
58
|
-
} | null>;
|
|
59
53
|
saveDownloadedDocument(scope: KnowhereScope, payload: SaveStoredDocumentPayload, options?: {
|
|
60
54
|
overwrite?: boolean;
|
|
61
55
|
}): Promise<StoredDocumentRecord>;
|
|
@@ -65,11 +59,6 @@ export declare class KnowhereStore {
|
|
|
65
59
|
private persistIndex;
|
|
66
60
|
private runWithScopeAccessLock;
|
|
67
61
|
private removeDocumentArtifacts;
|
|
68
|
-
private buildDocumentPayloadCacheKey;
|
|
69
|
-
private touchDocumentPayloadCache;
|
|
70
|
-
private deleteDocumentPayloadCache;
|
|
71
|
-
private deleteScopeDocumentPayloadCaches;
|
|
72
|
-
private loadOrBuildBrowseIndex;
|
|
73
62
|
private buildRouteKey;
|
|
74
63
|
private ensureRoutesLoaded;
|
|
75
64
|
private persistRoutes;
|
|
@@ -77,3 +66,4 @@ export declare class KnowhereStore {
|
|
|
77
66
|
private resolveKnownScopeKey;
|
|
78
67
|
private rebuildIndex;
|
|
79
68
|
}
|
|
69
|
+
export {};
|