@ontos-ai/knowhere-claw 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/client.js +1 -1
- package/dist/config.d.ts +8 -0
- package/dist/config.js +56 -8
- package/dist/connect-builder.d.ts +2 -0
- package/dist/connect-builder.js +9 -10
- package/dist/graph-builder.d.ts +4 -1
- package/dist/graph-builder.js +21 -34
- package/dist/index.js +3 -9
- package/dist/kg-service.d.ts +0 -2
- package/dist/kg-service.js +12 -45
- package/dist/parser.d.ts +4 -8
- package/dist/parser.js +25 -243
- package/dist/store.d.ts +4 -14
- package/dist/store.js +21 -106
- package/dist/text.js +1 -13
- package/dist/tools.js +413 -848
- package/dist/types.d.ts +1 -58
- package/openclaw.plugin.json +71 -1
- package/package.json +2 -3
- package/skills/knowhere_memory/SKILL.md +80 -98
- package/skills/knowhere/SKILL.md +0 -285
- /package/dist/__tests__/{read-result-file-tool.test.d.ts → storage-layout.test.d.ts} +0 -0
package/dist/parser.js
CHANGED
|
@@ -3,11 +3,9 @@ import fs from "node:fs/promises";
|
|
|
3
3
|
import path from "node:path";
|
|
4
4
|
import { createHash } from "node:crypto";
|
|
5
5
|
import { strFromU8, unzipSync } from "fflate";
|
|
6
|
+
//#region src/parser.ts
|
|
6
7
|
const CHUNKS_FILE_NAME = "chunks.json";
|
|
7
|
-
const
|
|
8
|
-
const HIERARCHY_FILE_NAME = "hierarchy.json";
|
|
9
|
-
const HIERARCHY_VIEW_FILE_NAME = "hierarchy_view.html";
|
|
10
|
-
const KB_CSV_FILE_NAME = "kb.csv";
|
|
8
|
+
const LEGACY_RESULT_DIRECTORY_NAME = "result";
|
|
11
9
|
const MANIFEST_FILE_NAME = "manifest.json";
|
|
12
10
|
function readZipText(entries, fileName) {
|
|
13
11
|
const entry = entries[fileName];
|
|
@@ -16,6 +14,15 @@ function readZipText(entries, fileName) {
|
|
|
16
14
|
async function ensureDir(targetPath) {
|
|
17
15
|
await fs.mkdir(targetPath, { recursive: true });
|
|
18
16
|
}
|
|
17
|
+
async function pathExists(targetPath) {
|
|
18
|
+
try {
|
|
19
|
+
await fs.access(targetPath);
|
|
20
|
+
return true;
|
|
21
|
+
} catch (error) {
|
|
22
|
+
if (isNodeError(error) && error.code === "ENOENT") return false;
|
|
23
|
+
throw error;
|
|
24
|
+
}
|
|
25
|
+
}
|
|
19
26
|
async function readTextFile(targetPath) {
|
|
20
27
|
try {
|
|
21
28
|
return await fs.readFile(targetPath, "utf-8");
|
|
@@ -39,36 +46,6 @@ function resolveResultEntryPath(rootDir, entryPath) {
|
|
|
39
46
|
function normalizeRelativePath(value) {
|
|
40
47
|
return value.replace(/\\/g, "/").replace(/^\/+/, "");
|
|
41
48
|
}
|
|
42
|
-
function normalizeStoredPath(value) {
|
|
43
|
-
if (typeof value !== "string") return null;
|
|
44
|
-
return value.trim() || null;
|
|
45
|
-
}
|
|
46
|
-
function readChunkNumber(rawChunk, metadata, key) {
|
|
47
|
-
const metadataValue = metadata[key];
|
|
48
|
-
if (typeof metadataValue === "number" && Number.isFinite(metadataValue)) return metadataValue;
|
|
49
|
-
const rawValue = rawChunk[key];
|
|
50
|
-
if (typeof rawValue === "number" && Number.isFinite(rawValue)) return rawValue;
|
|
51
|
-
return null;
|
|
52
|
-
}
|
|
53
|
-
function readChunkStringArray(rawChunk, metadata, key) {
|
|
54
|
-
const metadataValue = metadata[key];
|
|
55
|
-
if (Array.isArray(metadataValue)) return metadataValue.filter((entry) => typeof entry === "string");
|
|
56
|
-
const rawValue = rawChunk[key];
|
|
57
|
-
if (Array.isArray(rawValue)) return rawValue.filter((entry) => typeof entry === "string");
|
|
58
|
-
return [];
|
|
59
|
-
}
|
|
60
|
-
function readChunkArray(rawChunk, metadata, key) {
|
|
61
|
-
const metadataValue = metadata[key];
|
|
62
|
-
if (Array.isArray(metadataValue)) return metadataValue;
|
|
63
|
-
const rawValue = rawChunk[key];
|
|
64
|
-
if (Array.isArray(rawValue)) return rawValue;
|
|
65
|
-
return [];
|
|
66
|
-
}
|
|
67
|
-
function extractAssetFilePath(rawChunk, metadata) {
|
|
68
|
-
const candidates = [rawChunk.file_path, metadata.file_path];
|
|
69
|
-
for (const candidate of candidates) if (typeof candidate === "string" && candidate.trim()) return normalizeRelativePath(candidate.trim());
|
|
70
|
-
return null;
|
|
71
|
-
}
|
|
72
49
|
function parseRawChunks(value) {
|
|
73
50
|
if (Array.isArray(value)) return value.filter((entry) => isRecord(entry));
|
|
74
51
|
if (isRecord(value) && Array.isArray(value.chunks)) return value.chunks.filter((entry) => isRecord(entry));
|
|
@@ -77,24 +54,6 @@ function parseRawChunks(value) {
|
|
|
77
54
|
function parseManifest(value) {
|
|
78
55
|
return isRecord(value) ? value : {};
|
|
79
56
|
}
|
|
80
|
-
function buildChunk(rawChunk) {
|
|
81
|
-
const metadata = isRecord(rawChunk.metadata) ? rawChunk.metadata : {};
|
|
82
|
-
const type = rawChunk.type === "image" || rawChunk.type === "table" || rawChunk.type === "text" ? rawChunk.type : "text";
|
|
83
|
-
return {
|
|
84
|
-
chunkId: typeof rawChunk.chunk_id === "string" ? rawChunk.chunk_id : "",
|
|
85
|
-
type,
|
|
86
|
-
path: normalizeStoredPath(rawChunk.path),
|
|
87
|
-
summary: typeof metadata.summary === "string" ? metadata.summary : typeof rawChunk.summary === "string" ? rawChunk.summary : "",
|
|
88
|
-
content: typeof rawChunk.content === "string" ? rawChunk.content : "",
|
|
89
|
-
tokens: readChunkNumber(rawChunk, metadata, "tokens"),
|
|
90
|
-
keywords: readChunkStringArray(rawChunk, metadata, "keywords"),
|
|
91
|
-
relationships: readChunkArray(rawChunk, metadata, "relationships"),
|
|
92
|
-
metadata,
|
|
93
|
-
assetFilePath: extractAssetFilePath(rawChunk, metadata),
|
|
94
|
-
originalName: typeof metadata.original_name === "string" ? metadata.original_name : typeof rawChunk.original_name === "string" ? rawChunk.original_name : null,
|
|
95
|
-
tableType: typeof metadata.table_type === "string" ? metadata.table_type : typeof rawChunk.table_type === "string" ? rawChunk.table_type : null
|
|
96
|
-
};
|
|
97
|
-
}
|
|
98
57
|
function normalizeStatistics(manifest, rawChunks) {
|
|
99
58
|
if (manifest.statistics) return manifest.statistics;
|
|
100
59
|
return {
|
|
@@ -110,185 +69,6 @@ function validateKnowhereResultChecksum(zipBuffer, manifest) {
|
|
|
110
69
|
if (typeof checksum !== "string" || !checksum) return;
|
|
111
70
|
if (createHash("sha256").update(zipBuffer).digest("hex") !== checksum) throw new Error("Knowhere result ZIP checksum mismatch.");
|
|
112
71
|
}
|
|
113
|
-
function tokenizeStoredPath(storedPath) {
|
|
114
|
-
const slashSegments = storedPath.split("/").map((segment) => segment.trim()).filter(Boolean);
|
|
115
|
-
const tokens = [];
|
|
116
|
-
for (const slashSegment of slashSegments) {
|
|
117
|
-
const arrowSegments = slashSegment.split("-->").map((segment) => segment.trim()).filter(Boolean);
|
|
118
|
-
if (arrowSegments.length === 0) continue;
|
|
119
|
-
tokens.push({
|
|
120
|
-
delimiter: tokens.length === 0 ? null : "/",
|
|
121
|
-
segment: arrowSegments[0] || ""
|
|
122
|
-
});
|
|
123
|
-
for (const arrowSegment of arrowSegments.slice(1)) tokens.push({
|
|
124
|
-
delimiter: "-->",
|
|
125
|
-
segment: arrowSegment
|
|
126
|
-
});
|
|
127
|
-
}
|
|
128
|
-
return tokens;
|
|
129
|
-
}
|
|
130
|
-
function buildStoredPathPrefixes(storedPath) {
|
|
131
|
-
const tokens = tokenizeStoredPath(storedPath);
|
|
132
|
-
const prefixes = [];
|
|
133
|
-
let currentPath = "";
|
|
134
|
-
for (const token of tokens) {
|
|
135
|
-
currentPath = token.delimiter ? `${currentPath}${token.delimiter}${token.segment}` : token.segment;
|
|
136
|
-
prefixes.push(currentPath);
|
|
137
|
-
}
|
|
138
|
-
return prefixes;
|
|
139
|
-
}
|
|
140
|
-
function ensurePathAccumulator(accumulators, pathValue, parentPath, depth) {
|
|
141
|
-
const existing = accumulators.get(pathValue);
|
|
142
|
-
if (existing) {
|
|
143
|
-
if (parentPath && !existing.parentPath) existing.parentPath = parentPath;
|
|
144
|
-
return existing;
|
|
145
|
-
}
|
|
146
|
-
const next = {
|
|
147
|
-
childPaths: /* @__PURE__ */ new Set(),
|
|
148
|
-
chunkCount: 0,
|
|
149
|
-
chunkIds: [],
|
|
150
|
-
depth,
|
|
151
|
-
directChunkCount: 0,
|
|
152
|
-
imageChunkCount: 0,
|
|
153
|
-
parentPath,
|
|
154
|
-
path: pathValue,
|
|
155
|
-
tableChunkCount: 0,
|
|
156
|
-
textChunkCount: 0
|
|
157
|
-
};
|
|
158
|
-
accumulators.set(pathValue, next);
|
|
159
|
-
return next;
|
|
160
|
-
}
|
|
161
|
-
function incrementPathCounters(accumulator, chunkType) {
|
|
162
|
-
accumulator.chunkCount += 1;
|
|
163
|
-
if (chunkType === "image") {
|
|
164
|
-
accumulator.imageChunkCount += 1;
|
|
165
|
-
return;
|
|
166
|
-
}
|
|
167
|
-
if (chunkType === "table") {
|
|
168
|
-
accumulator.tableChunkCount += 1;
|
|
169
|
-
return;
|
|
170
|
-
}
|
|
171
|
-
accumulator.textChunkCount += 1;
|
|
172
|
-
}
|
|
173
|
-
function buildPathRecords(chunks) {
|
|
174
|
-
const accumulators = /* @__PURE__ */ new Map();
|
|
175
|
-
for (const chunk of chunks) {
|
|
176
|
-
if (!chunk.path) continue;
|
|
177
|
-
const prefixes = buildStoredPathPrefixes(chunk.path);
|
|
178
|
-
for (const [index, prefix] of prefixes.entries()) {
|
|
179
|
-
const parentPath = index > 0 ? prefixes[index - 1] || null : null;
|
|
180
|
-
const accumulator = ensurePathAccumulator(accumulators, prefix, parentPath, index + 1);
|
|
181
|
-
incrementPathCounters(accumulator, chunk.type);
|
|
182
|
-
if (parentPath) ensurePathAccumulator(accumulators, parentPath, index > 1 ? prefixes[index - 2] || null : null, index).childPaths.add(prefix);
|
|
183
|
-
if (index === prefixes.length - 1) {
|
|
184
|
-
accumulator.directChunkCount += 1;
|
|
185
|
-
if (chunk.chunkId) accumulator.chunkIds.push(chunk.chunkId);
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
}
|
|
189
|
-
return [...accumulators.values()].sort((left, right) => left.depth - right.depth || left.path.localeCompare(right.path)).map((entry) => ({
|
|
190
|
-
path: entry.path,
|
|
191
|
-
parentPath: entry.parentPath,
|
|
192
|
-
depth: entry.depth,
|
|
193
|
-
childPaths: [...entry.childPaths].sort((left, right) => left.localeCompare(right)),
|
|
194
|
-
chunkIds: [...entry.chunkIds],
|
|
195
|
-
directChunkCount: entry.directChunkCount,
|
|
196
|
-
chunkCount: entry.chunkCount,
|
|
197
|
-
textChunkCount: entry.textChunkCount,
|
|
198
|
-
imageChunkCount: entry.imageChunkCount,
|
|
199
|
-
tableChunkCount: entry.tableChunkCount
|
|
200
|
-
}));
|
|
201
|
-
}
|
|
202
|
-
function readManifestAssetEntries(manifest, key) {
|
|
203
|
-
const rawEntries = (isRecord(manifest.files) ? manifest.files : {})[key];
|
|
204
|
-
if (!Array.isArray(rawEntries)) return [];
|
|
205
|
-
return rawEntries.filter((entry) => isRecord(entry));
|
|
206
|
-
}
|
|
207
|
-
function buildResultFileChunkLookup(manifest, chunks) {
|
|
208
|
-
const entries = /* @__PURE__ */ new Map();
|
|
209
|
-
for (const key of ["images", "tables"]) {
|
|
210
|
-
const assetEntries = readManifestAssetEntries(manifest, key);
|
|
211
|
-
for (const entry of assetEntries) {
|
|
212
|
-
const filePath = typeof entry.file_path === "string" && entry.file_path.trim() ? normalizeRelativePath(entry.file_path.trim()) : null;
|
|
213
|
-
if (!filePath) continue;
|
|
214
|
-
entries.set(filePath, {
|
|
215
|
-
chunkId: typeof entry.id === "string" ? entry.id : null,
|
|
216
|
-
format: typeof entry.format === "string" ? entry.format : null
|
|
217
|
-
});
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
for (const chunk of chunks) {
|
|
221
|
-
if (!chunk.assetFilePath || entries.has(chunk.assetFilePath)) continue;
|
|
222
|
-
entries.set(chunk.assetFilePath, {
|
|
223
|
-
chunkId: chunk.chunkId || null,
|
|
224
|
-
format: null
|
|
225
|
-
});
|
|
226
|
-
}
|
|
227
|
-
return entries;
|
|
228
|
-
}
|
|
229
|
-
function inferResultFileKind(relativePath) {
|
|
230
|
-
if (relativePath === MANIFEST_FILE_NAME) return "manifest";
|
|
231
|
-
if (relativePath === CHUNKS_FILE_NAME) return "chunks";
|
|
232
|
-
if (relativePath === FULL_MARKDOWN_FILE_NAME) return "fullMarkdown";
|
|
233
|
-
if (relativePath === KB_CSV_FILE_NAME) return "kbCsv";
|
|
234
|
-
if (relativePath === HIERARCHY_FILE_NAME) return "hierarchy";
|
|
235
|
-
if (relativePath === HIERARCHY_VIEW_FILE_NAME) return "hierarchyView";
|
|
236
|
-
if (relativePath.startsWith("images/")) return "image";
|
|
237
|
-
if (relativePath.startsWith("tables/")) return "table";
|
|
238
|
-
return "other";
|
|
239
|
-
}
|
|
240
|
-
function inferResultFileFormat(relativePath) {
|
|
241
|
-
return path.posix.extname(relativePath).replace(/^\./, "").trim() || null;
|
|
242
|
-
}
|
|
243
|
-
function isStringArray(value) {
|
|
244
|
-
return Array.isArray(value) && value.every((entry) => typeof entry === "string");
|
|
245
|
-
}
|
|
246
|
-
function isStoredBrowseIndex(value) {
|
|
247
|
-
if (!isRecord(value)) return false;
|
|
248
|
-
if (value.version !== 2) return false;
|
|
249
|
-
if (!isStringArray(value.chunkOrder)) return false;
|
|
250
|
-
if (!Array.isArray(value.paths) || !Array.isArray(value.resultFiles)) return false;
|
|
251
|
-
if (!value.paths.every((entry) => isRecord(entry) && typeof entry.path === "string" && (entry.parentPath === null || typeof entry.parentPath === "string") && typeof entry.depth === "number" && Number.isFinite(entry.depth) && isStringArray(entry.childPaths) && isStringArray(entry.chunkIds) && typeof entry.directChunkCount === "number" && typeof entry.chunkCount === "number" && typeof entry.textChunkCount === "number" && typeof entry.imageChunkCount === "number" && typeof entry.tableChunkCount === "number")) return false;
|
|
252
|
-
return value.resultFiles.every((entry) => isRecord(entry) && typeof entry.relativePath === "string" && typeof entry.kind === "string" && (entry.chunkId === null || typeof entry.chunkId === "string") && (entry.format === null || typeof entry.format === "string") && (entry.sizeBytes === null || typeof entry.sizeBytes === "number" && Number.isFinite(entry.sizeBytes)));
|
|
253
|
-
}
|
|
254
|
-
async function listResultFiles(rootDir, currentDir = rootDir) {
|
|
255
|
-
const entries = await fs.readdir(currentDir, { withFileTypes: true });
|
|
256
|
-
const files = [];
|
|
257
|
-
for (const entry of entries) {
|
|
258
|
-
const absolutePath = path.join(currentDir, entry.name);
|
|
259
|
-
if (entry.isDirectory()) {
|
|
260
|
-
files.push(...await listResultFiles(rootDir, absolutePath));
|
|
261
|
-
continue;
|
|
262
|
-
}
|
|
263
|
-
if (!entry.isFile()) continue;
|
|
264
|
-
files.push(normalizeRelativePath(path.relative(rootDir, absolutePath)));
|
|
265
|
-
}
|
|
266
|
-
return files.sort((left, right) => left.localeCompare(right));
|
|
267
|
-
}
|
|
268
|
-
async function buildResultFileRecords(resultDir, manifest, chunks) {
|
|
269
|
-
const lookup = buildResultFileChunkLookup(manifest, chunks);
|
|
270
|
-
const relativePaths = await listResultFiles(resultDir);
|
|
271
|
-
return Promise.all(relativePaths.map(async (relativePath) => {
|
|
272
|
-
const absolutePath = resolveResultEntryPath(resultDir, relativePath);
|
|
273
|
-
const stats = await fs.stat(absolutePath);
|
|
274
|
-
const manifestEntry = lookup.get(relativePath);
|
|
275
|
-
return {
|
|
276
|
-
relativePath,
|
|
277
|
-
kind: inferResultFileKind(relativePath),
|
|
278
|
-
chunkId: manifestEntry?.chunkId ?? null,
|
|
279
|
-
format: manifestEntry?.format ?? inferResultFileFormat(relativePath),
|
|
280
|
-
sizeBytes: stats.isFile() ? stats.size : null
|
|
281
|
-
};
|
|
282
|
-
}));
|
|
283
|
-
}
|
|
284
|
-
async function buildStoredBrowseIndex(resultDir, manifest, chunks) {
|
|
285
|
-
return {
|
|
286
|
-
version: 2,
|
|
287
|
-
paths: buildPathRecords(chunks),
|
|
288
|
-
chunkOrder: chunks.map((chunk) => chunk.chunkId).filter((chunkId) => chunkId.length > 0),
|
|
289
|
-
resultFiles: await buildResultFileRecords(resultDir, manifest, chunks)
|
|
290
|
-
};
|
|
291
|
-
}
|
|
292
72
|
async function extractKnowhereResultArchive(downloadedResult, targetDir) {
|
|
293
73
|
const zipBuffer = Buffer.isBuffer(downloadedResult.zipBytes) ? downloadedResult.zipBytes : Buffer.from(downloadedResult.zipBytes);
|
|
294
74
|
const entries = unzipSync(new Uint8Array(zipBuffer));
|
|
@@ -302,22 +82,24 @@ async function extractKnowhereResultArchive(downloadedResult, targetDir) {
|
|
|
302
82
|
await fs.writeFile(outputPath, entryBytes);
|
|
303
83
|
}
|
|
304
84
|
}
|
|
305
|
-
async function
|
|
306
|
-
|
|
307
|
-
const
|
|
85
|
+
async function resolveStoredKnowhereResultRoot(documentDir) {
|
|
86
|
+
if (await pathExists(path.join(documentDir, MANIFEST_FILE_NAME))) return documentDir;
|
|
87
|
+
const legacyResultDir = path.join(documentDir, LEGACY_RESULT_DIRECTORY_NAME);
|
|
88
|
+
if (await pathExists(path.join(legacyResultDir, MANIFEST_FILE_NAME))) return legacyResultDir;
|
|
89
|
+
return documentDir;
|
|
90
|
+
}
|
|
91
|
+
async function resolveStoredKnowhereArtifactPath(documentDir, entryPath) {
|
|
92
|
+
return resolveResultEntryPath(await resolveStoredKnowhereResultRoot(documentDir), entryPath);
|
|
93
|
+
}
|
|
94
|
+
async function readStoredKnowhereResultSummary(documentDir) {
|
|
95
|
+
const resultRoot = await resolveStoredKnowhereResultRoot(documentDir);
|
|
96
|
+
const manifest = parseManifest(await readJsonFile(path.join(resultRoot, MANIFEST_FILE_NAME)));
|
|
97
|
+
const rawChunks = parseRawChunks(await readJsonFile(path.join(resultRoot, CHUNKS_FILE_NAME)));
|
|
308
98
|
return {
|
|
309
99
|
manifest,
|
|
310
100
|
chunkCount: rawChunks.length,
|
|
311
101
|
statistics: normalizeStatistics(manifest, rawChunks)
|
|
312
102
|
};
|
|
313
103
|
}
|
|
314
|
-
async function readStoredKnowhereResultContent(resultDir) {
|
|
315
|
-
return {
|
|
316
|
-
manifest: parseManifest(await readJsonFile(path.join(resultDir, MANIFEST_FILE_NAME))),
|
|
317
|
-
chunks: parseRawChunks(await readJsonFile(path.join(resultDir, CHUNKS_FILE_NAME))).map((rawChunk) => buildChunk(rawChunk)),
|
|
318
|
-
fullMarkdown: await readTextFile(path.join(resultDir, FULL_MARKDOWN_FILE_NAME)) || "",
|
|
319
|
-
hierarchy: await readJsonFile(path.join(resultDir, HIERARCHY_FILE_NAME))
|
|
320
|
-
};
|
|
321
|
-
}
|
|
322
104
|
//#endregion
|
|
323
|
-
export {
|
|
105
|
+
export { extractKnowhereResultArchive, readStoredKnowhereResultSummary, resolveStoredKnowhereArtifactPath, resolveStoredKnowhereResultRoot };
|
package/dist/store.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import type { ChannelRouteRecord, KnowhereScope, PluginLogger, SaveStoredDocumentPayload, ScopeMode,
|
|
1
|
+
import type { ChannelRouteRecord, KnowhereScope, PluginLogger, SaveStoredDocumentPayload, ScopeMode, StoredDocumentMetadata, StoredDocumentRecord } from "./types";
|
|
2
|
+
type StoredDocumentScopePaths = Pick<KnowhereScope, "documentsDir" | "metadataDir">;
|
|
2
3
|
export declare class KnowhereStore {
|
|
3
4
|
private readonly rootDir;
|
|
4
5
|
private readonly scopeMode;
|
|
5
6
|
private readonly logger;
|
|
6
7
|
private readonly indexCache;
|
|
7
|
-
private readonly documentPayloadCache;
|
|
8
8
|
private readonly scopeAccessChains;
|
|
9
9
|
private readonly scopeKeyAliases;
|
|
10
10
|
private readonly sessionScopeKeysBySessionId;
|
|
@@ -48,14 +48,8 @@ export declare class KnowhereStore {
|
|
|
48
48
|
sessionKey?: string;
|
|
49
49
|
sessionId?: string;
|
|
50
50
|
}): KnowhereScope;
|
|
51
|
+
readDocumentMetadata(scope: StoredDocumentScopePaths, docId: string): Promise<StoredDocumentMetadata | null>;
|
|
51
52
|
listDocuments(scope: KnowhereScope): Promise<StoredDocumentRecord[]>;
|
|
52
|
-
loadDocumentPayload(scope: KnowhereScope, docId: string): Promise<StoredDocumentPayload | null>;
|
|
53
|
-
getResultFileAbsolutePath(scope: KnowhereScope, docId: string, relativePath: string): string;
|
|
54
|
-
readResultFile(scope: KnowhereScope, docId: string, relativePath: string): Promise<{
|
|
55
|
-
document: StoredDocumentRecord;
|
|
56
|
-
relativePath: string;
|
|
57
|
-
text: string | null;
|
|
58
|
-
} | null>;
|
|
59
53
|
saveDownloadedDocument(scope: KnowhereScope, payload: SaveStoredDocumentPayload, options?: {
|
|
60
54
|
overwrite?: boolean;
|
|
61
55
|
}): Promise<StoredDocumentRecord>;
|
|
@@ -65,11 +59,6 @@ export declare class KnowhereStore {
|
|
|
65
59
|
private persistIndex;
|
|
66
60
|
private runWithScopeAccessLock;
|
|
67
61
|
private removeDocumentArtifacts;
|
|
68
|
-
private buildDocumentPayloadCacheKey;
|
|
69
|
-
private touchDocumentPayloadCache;
|
|
70
|
-
private deleteDocumentPayloadCache;
|
|
71
|
-
private deleteScopeDocumentPayloadCaches;
|
|
72
|
-
private loadOrBuildBrowseIndex;
|
|
73
62
|
private buildRouteKey;
|
|
74
63
|
private ensureRoutesLoaded;
|
|
75
64
|
private persistRoutes;
|
|
@@ -77,3 +66,4 @@ export declare class KnowhereStore {
|
|
|
77
66
|
private resolveKnownScopeKey;
|
|
78
67
|
private rebuildIndex;
|
|
79
68
|
}
|
|
69
|
+
export {};
|
package/dist/store.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { isNodeError } from "./types.js";
|
|
2
|
-
import {
|
|
2
|
+
import { extractKnowhereResultArchive, readStoredKnowhereResultSummary } from "./parser.js";
|
|
3
3
|
import { hashString, normalizeWhitespace, sanitizeStringArray, slugify } from "./text.js";
|
|
4
4
|
import { deriveMessageContextScopeKey, findConversationSegmentValue, parseConversationSessionKey } from "./session.js";
|
|
5
5
|
import fs from "node:fs/promises";
|
|
@@ -7,10 +7,8 @@ import path from "node:path";
|
|
|
7
7
|
import { randomUUID } from "node:crypto";
|
|
8
8
|
//#region src/store.ts
|
|
9
9
|
const INDEX_VERSION = 1;
|
|
10
|
-
const
|
|
11
|
-
const DOCUMENT_PAYLOAD_CACHE_LIMIT = 16;
|
|
10
|
+
const METADATA_DIRECTORY_NAME = "metadata";
|
|
12
11
|
const METADATA_FILE_NAME = "metadata.json";
|
|
13
|
-
const RESULT_DIRECTORY_NAME = "result";
|
|
14
12
|
const ROUTES_FILE_NAME = "routes.json";
|
|
15
13
|
async function pathExists(targetPath) {
|
|
16
14
|
try {
|
|
@@ -119,10 +117,9 @@ function createEmptyIndex(scope) {
|
|
|
119
117
|
function buildStoredDocumentPaths(scope, docId) {
|
|
120
118
|
const documentDir = path.join(scope.documentsDir, docId);
|
|
121
119
|
return {
|
|
122
|
-
browseIndexPath: path.join(documentDir, BROWSE_INDEX_FILE_NAME),
|
|
123
120
|
documentDir,
|
|
124
|
-
metadataPath: path.join(
|
|
125
|
-
|
|
121
|
+
metadataPath: path.join(scope.metadataDir, `${docId}.json`),
|
|
122
|
+
legacyMetadataPath: path.join(documentDir, METADATA_FILE_NAME)
|
|
126
123
|
};
|
|
127
124
|
}
|
|
128
125
|
async function readStoredDocumentMetadata(metadataPath) {
|
|
@@ -142,7 +139,6 @@ var KnowhereStore = class {
|
|
|
142
139
|
scopeMode;
|
|
143
140
|
logger;
|
|
144
141
|
indexCache;
|
|
145
|
-
documentPayloadCache;
|
|
146
142
|
scopeAccessChains;
|
|
147
143
|
scopeKeyAliases;
|
|
148
144
|
sessionScopeKeysBySessionId;
|
|
@@ -154,7 +150,6 @@ var KnowhereStore = class {
|
|
|
154
150
|
this.scopeMode = params.scopeMode;
|
|
155
151
|
this.logger = params.logger;
|
|
156
152
|
this.indexCache = /* @__PURE__ */ new Map();
|
|
157
|
-
this.documentPayloadCache = /* @__PURE__ */ new Map();
|
|
158
153
|
this.scopeAccessChains = /* @__PURE__ */ new Map();
|
|
159
154
|
this.scopeKeyAliases = /* @__PURE__ */ new Map();
|
|
160
155
|
this.sessionScopeKeysBySessionId = /* @__PURE__ */ new Map();
|
|
@@ -277,80 +272,32 @@ var KnowhereStore = class {
|
|
|
277
272
|
key: rawKey,
|
|
278
273
|
label: rawKey === "global" ? "global" : `${mode}:${rawKey}`,
|
|
279
274
|
rootDir: scopeRoot,
|
|
275
|
+
metadataDir: path.join(scopeRoot, METADATA_DIRECTORY_NAME),
|
|
280
276
|
documentsDir: path.join(scopeRoot, "documents"),
|
|
281
277
|
indexPath: path.join(scopeRoot, "index.json")
|
|
282
278
|
};
|
|
283
279
|
}
|
|
280
|
+
async readDocumentMetadata(scope, docId) {
|
|
281
|
+
const paths = buildStoredDocumentPaths(scope, docId);
|
|
282
|
+
const preferredMetadata = await readStoredDocumentMetadata(paths.metadataPath);
|
|
283
|
+
if (preferredMetadata) return preferredMetadata;
|
|
284
|
+
return readStoredDocumentMetadata(paths.legacyMetadataPath);
|
|
285
|
+
}
|
|
284
286
|
async listDocuments(scope) {
|
|
285
287
|
return this.runWithScopeAccessLock(scope, async () => {
|
|
286
288
|
return [...(await this.getIndex(scope, true)).documents].sort((left, right) => String(right.updatedAt || right.ingestedAt || "").localeCompare(String(left.updatedAt || left.ingestedAt || "")));
|
|
287
289
|
});
|
|
288
290
|
}
|
|
289
|
-
async loadDocumentPayload(scope, docId) {
|
|
290
|
-
return this.runWithScopeAccessLock(scope, async () => {
|
|
291
|
-
const cacheKey = this.buildDocumentPayloadCacheKey(scope, docId);
|
|
292
|
-
const cachedPayload = this.documentPayloadCache.get(cacheKey);
|
|
293
|
-
if (cachedPayload) {
|
|
294
|
-
this.touchDocumentPayloadCache(cacheKey, cachedPayload);
|
|
295
|
-
return cachedPayload;
|
|
296
|
-
}
|
|
297
|
-
const paths = buildStoredDocumentPaths(scope, docId);
|
|
298
|
-
const metadata = await readStoredDocumentMetadata(paths.metadataPath);
|
|
299
|
-
if (!metadata) return null;
|
|
300
|
-
const resultContent = await readStoredKnowhereResultContent(paths.resultDir);
|
|
301
|
-
const browseIndex = await this.loadOrBuildBrowseIndex(paths, resultContent.manifest, resultContent.chunks);
|
|
302
|
-
const payload = {
|
|
303
|
-
version: metadata.version,
|
|
304
|
-
document: metadata.document,
|
|
305
|
-
manifest: resultContent.manifest,
|
|
306
|
-
jobResult: metadata.jobResult,
|
|
307
|
-
fullMarkdown: resultContent.fullMarkdown,
|
|
308
|
-
hierarchy: resultContent.hierarchy,
|
|
309
|
-
browseIndex,
|
|
310
|
-
rawZipSha1: metadata.rawZipSha1,
|
|
311
|
-
chunks: resultContent.chunks
|
|
312
|
-
};
|
|
313
|
-
this.touchDocumentPayloadCache(cacheKey, payload);
|
|
314
|
-
return payload;
|
|
315
|
-
});
|
|
316
|
-
}
|
|
317
|
-
getResultFileAbsolutePath(scope, docId, relativePath) {
|
|
318
|
-
return resolveResultEntryPath(buildStoredDocumentPaths(scope, docId).resultDir, relativePath);
|
|
319
|
-
}
|
|
320
|
-
async readResultFile(scope, docId, relativePath) {
|
|
321
|
-
return this.runWithScopeAccessLock(scope, async () => {
|
|
322
|
-
const paths = buildStoredDocumentPaths(scope, docId);
|
|
323
|
-
const metadata = await readStoredDocumentMetadata(paths.metadataPath);
|
|
324
|
-
if (!metadata) return null;
|
|
325
|
-
const filePath = resolveResultEntryPath(paths.resultDir, relativePath);
|
|
326
|
-
try {
|
|
327
|
-
const text = await fs.readFile(filePath, "utf-8");
|
|
328
|
-
return {
|
|
329
|
-
document: metadata.document,
|
|
330
|
-
relativePath: path.posix.normalize(relativePath.replace(/\\/g, "/")),
|
|
331
|
-
text
|
|
332
|
-
};
|
|
333
|
-
} catch (error) {
|
|
334
|
-
if (isNodeError(error) && error.code === "ENOENT") return {
|
|
335
|
-
document: metadata.document,
|
|
336
|
-
relativePath: path.posix.normalize(relativePath.replace(/\\/g, "/")),
|
|
337
|
-
text: null
|
|
338
|
-
};
|
|
339
|
-
throw error;
|
|
340
|
-
}
|
|
341
|
-
});
|
|
342
|
-
}
|
|
343
291
|
async saveDownloadedDocument(scope, payload, options = {}) {
|
|
344
292
|
return this.runWithScopeAccessLock(scope, async () => {
|
|
345
293
|
const index = await this.getIndex(scope, true);
|
|
346
294
|
const existingIds = new Set(index.documents.map((document) => document.id));
|
|
347
295
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
348
296
|
const tempPaths = buildStoredDocumentPaths(scope, `.tmp-${randomUUID()}`);
|
|
297
|
+
let finalPaths = null;
|
|
349
298
|
try {
|
|
350
|
-
await extractKnowhereResultArchive(payload.downloadedResult, tempPaths.
|
|
351
|
-
const resultSummary = await readStoredKnowhereResultSummary(tempPaths.
|
|
352
|
-
const resultContent = await readStoredKnowhereResultContent(tempPaths.resultDir);
|
|
353
|
-
const browseIndex = await buildStoredBrowseIndex(tempPaths.resultDir, resultContent.manifest, resultContent.chunks);
|
|
299
|
+
await extractKnowhereResultArchive(payload.downloadedResult, tempPaths.documentDir);
|
|
300
|
+
const resultSummary = await readStoredKnowhereResultSummary(tempPaths.documentDir);
|
|
354
301
|
const documentIdSeed = [
|
|
355
302
|
payload.sourceType,
|
|
356
303
|
payload.source,
|
|
@@ -364,6 +311,7 @@ var KnowhereStore = class {
|
|
|
364
311
|
if (payload.docId && existingDocument && options.overwrite !== true) throw new Error(`Document ${documentIdCandidate} already exists in scope ${scope.label}.`);
|
|
365
312
|
const docId = existingDocument && options.overwrite === true ? existingDocument.id : buildUniqueDocumentId(documentIdCandidate, documentIdSeed, existingIds);
|
|
366
313
|
const paths = buildStoredDocumentPaths(scope, docId);
|
|
314
|
+
finalPaths = paths;
|
|
367
315
|
const originalFileName = deriveOriginalFileName(payload, resultSummary.manifest);
|
|
368
316
|
const documentRecord = {
|
|
369
317
|
id: docId,
|
|
@@ -392,16 +340,17 @@ var KnowhereStore = class {
|
|
|
392
340
|
rawZipSha1: payload.downloadedResult.rawZipSha1
|
|
393
341
|
};
|
|
394
342
|
await writeJsonAtomic(tempPaths.metadataPath, metadata);
|
|
395
|
-
await writeJsonAtomic(tempPaths.browseIndexPath, browseIndex);
|
|
396
343
|
await this.removeDocumentArtifacts(paths);
|
|
397
344
|
await ensureDir(scope.documentsDir);
|
|
345
|
+
await ensureDir(scope.metadataDir);
|
|
398
346
|
await fs.rename(tempPaths.documentDir, paths.documentDir);
|
|
347
|
+
await fs.rename(tempPaths.metadataPath, paths.metadataPath);
|
|
399
348
|
index.documents = index.documents.filter((document) => document.id !== docId).concat(documentRecord);
|
|
400
|
-
this.deleteDocumentPayloadCache(scope, docId);
|
|
401
349
|
await this.persistIndex(scope, index);
|
|
402
350
|
return documentRecord;
|
|
403
351
|
} catch (error) {
|
|
404
352
|
await this.removeDocumentArtifacts(tempPaths).catch(() => void 0);
|
|
353
|
+
if (finalPaths) await this.removeDocumentArtifacts(finalPaths).catch(() => void 0);
|
|
405
354
|
throw error;
|
|
406
355
|
}
|
|
407
356
|
});
|
|
@@ -413,7 +362,6 @@ var KnowhereStore = class {
|
|
|
413
362
|
if (!existingDocument) return null;
|
|
414
363
|
await this.removeDocumentArtifacts(buildStoredDocumentPaths(scope, docId));
|
|
415
364
|
index.documents = index.documents.filter((document) => document.id !== docId);
|
|
416
|
-
this.deleteDocumentPayloadCache(scope, docId);
|
|
417
365
|
await this.persistIndex(scope, index);
|
|
418
366
|
return existingDocument;
|
|
419
367
|
});
|
|
@@ -426,7 +374,6 @@ var KnowhereStore = class {
|
|
|
426
374
|
force: true
|
|
427
375
|
});
|
|
428
376
|
this.indexCache.delete(scope.rootDir);
|
|
429
|
-
this.deleteScopeDocumentPayloadCaches(scope);
|
|
430
377
|
this.logger.info(`knowhere: cleared scope ${scope.label} (${removedDocuments.length} document${removedDocuments.length === 1 ? "" : "s"})`);
|
|
431
378
|
return removedDocuments;
|
|
432
379
|
});
|
|
@@ -473,39 +420,8 @@ var KnowhereStore = class {
|
|
|
473
420
|
recursive: true,
|
|
474
421
|
force: true
|
|
475
422
|
});
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
return `${scope.rootDir}:${docId}`;
|
|
479
|
-
}
|
|
480
|
-
touchDocumentPayloadCache(cacheKey, payload) {
|
|
481
|
-
this.documentPayloadCache.delete(cacheKey);
|
|
482
|
-
this.documentPayloadCache.set(cacheKey, payload);
|
|
483
|
-
while (this.documentPayloadCache.size > DOCUMENT_PAYLOAD_CACHE_LIMIT) {
|
|
484
|
-
const oldestKey = this.documentPayloadCache.keys().next().value;
|
|
485
|
-
if (!oldestKey) break;
|
|
486
|
-
this.documentPayloadCache.delete(oldestKey);
|
|
487
|
-
}
|
|
488
|
-
}
|
|
489
|
-
deleteDocumentPayloadCache(scope, docId) {
|
|
490
|
-
this.documentPayloadCache.delete(this.buildDocumentPayloadCacheKey(scope, docId));
|
|
491
|
-
}
|
|
492
|
-
deleteScopeDocumentPayloadCaches(scope) {
|
|
493
|
-
const cacheKeyPrefix = `${scope.rootDir}:`;
|
|
494
|
-
for (const cacheKey of this.documentPayloadCache.keys()) if (cacheKey.startsWith(cacheKeyPrefix)) this.documentPayloadCache.delete(cacheKey);
|
|
495
|
-
}
|
|
496
|
-
async loadOrBuildBrowseIndex(paths, manifest, chunks) {
|
|
497
|
-
if (await pathExists(paths.browseIndexPath)) {
|
|
498
|
-
try {
|
|
499
|
-
const existingBrowseIndex = await readJson(paths.browseIndexPath, null);
|
|
500
|
-
if (isStoredBrowseIndex(existingBrowseIndex)) return existingBrowseIndex;
|
|
501
|
-
} catch (error) {
|
|
502
|
-
this.logger.warn(`knowhere: failed to read browse index ${paths.browseIndexPath}; rebuilding. ${error instanceof Error ? error.message : String(error)}`);
|
|
503
|
-
}
|
|
504
|
-
this.logger.info(`knowhere: rebuilding browse index for ${paths.documentDir} (expected version 2)`);
|
|
505
|
-
}
|
|
506
|
-
const browseIndex = await buildStoredBrowseIndex(paths.resultDir, manifest, chunks);
|
|
507
|
-
await writeJsonAtomic(paths.browseIndexPath, browseIndex);
|
|
508
|
-
return browseIndex;
|
|
423
|
+
if (await pathExists(paths.metadataPath)) await fs.rm(paths.metadataPath, { force: true });
|
|
424
|
+
if (await pathExists(paths.legacyMetadataPath)) await fs.rm(paths.legacyMetadataPath, { force: true });
|
|
509
425
|
}
|
|
510
426
|
buildRouteKey(channelId, conversationId) {
|
|
511
427
|
const normalizedChannel = normalizeWhitespace(channelId)?.toLowerCase();
|
|
@@ -557,8 +473,7 @@ var KnowhereStore = class {
|
|
|
557
473
|
const documentEntries = await fs.readdir(scope.documentsDir, { withFileTypes: true });
|
|
558
474
|
for (const documentEntry of documentEntries) {
|
|
559
475
|
if (!documentEntry.isDirectory()) continue;
|
|
560
|
-
const
|
|
561
|
-
const metadata = await readStoredDocumentMetadata(path.join(documentPath, METADATA_FILE_NAME));
|
|
476
|
+
const metadata = await this.readDocumentMetadata(scope, documentEntry.name);
|
|
562
477
|
if (!metadata?.document) continue;
|
|
563
478
|
rebuiltIndex.documents.push(metadata.document);
|
|
564
479
|
}
|
package/dist/text.js
CHANGED
|
@@ -18,17 +18,5 @@ function sanitizeStringArray(value) {
|
|
|
18
18
|
}
|
|
19
19
|
return [];
|
|
20
20
|
}
|
|
21
|
-
function stripHtmlTags(text) {
|
|
22
|
-
return text.replace(/<[^>]*>/g, "");
|
|
23
|
-
}
|
|
24
|
-
function stripLatex(text) {
|
|
25
|
-
return text.replace(/\$([^$]*)\$/g, "$1").replace(/\\text\{([^}]*)}/g, "$1").replace(/\\(?:text(?:bf|it|tt|sf|sc|rm)|math(?:rm|bf|it|sf|tt|cal|bb|frak))\{([^}]*)}/g, "$1").replace(/\\(?:emph|underline|overline)\{([^}]*)}/g, "$1").replace(/\\([%$&#_])/g, "$1").replace(/\\(?:right|Right)arrow/g, "→").replace(/\\(?:left|Left)arrow/g, "←").replace(/\\leftrightarrow/g, "↔").replace(/\\times/g, "×").replace(/\\cdot/g, "·").replace(/\\pm/g, "±").replace(/\\leq/g, "≤").replace(/\\geq/g, "≥").replace(/\\neq/g, "≠").replace(/\\approx/g, "≈").replace(/\\(sup|inf|max|min|log|ln|sin|cos|tan|exp|lim)\b/g, "$1").replace(/\\([{}])/g, "$1").replace(/\\\\/g, " ").replace(/\\[a-zA-Z]+/g, "");
|
|
26
|
-
}
|
|
27
|
-
function normalizeUnicode(text) {
|
|
28
|
-
return text.replace(/[\u2018\u2019\u201A]/g, "'").replace(/[\u201C\u201D\u201E]/g, "\"").replace(/[\u2013\u2014]/g, "-").replace(/[\u00A0\u2009\u200A\u200B\u2007\u202F]/g, " ").replace(/\u2026/g, "...").replace(/\u2022/g, "-");
|
|
29
|
-
}
|
|
30
|
-
function normalizeForGrep(text) {
|
|
31
|
-
return stripHtmlTags(stripLatex(normalizeUnicode(text))).replace(/\s+/g, " ").trim();
|
|
32
|
-
}
|
|
33
21
|
//#endregion
|
|
34
|
-
export { hashString,
|
|
22
|
+
export { hashString, normalizeWhitespace, sanitizeStringArray, slugify };
|