@ontos-ai/knowhere-claw 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/parser.js CHANGED
@@ -3,11 +3,9 @@ import fs from "node:fs/promises";
3
3
  import path from "node:path";
4
4
  import { createHash } from "node:crypto";
5
5
  import { strFromU8, unzipSync } from "fflate";
6
+ //#region src/parser.ts
6
7
  const CHUNKS_FILE_NAME = "chunks.json";
7
- const FULL_MARKDOWN_FILE_NAME = "full.md";
8
- const HIERARCHY_FILE_NAME = "hierarchy.json";
9
- const HIERARCHY_VIEW_FILE_NAME = "hierarchy_view.html";
10
- const KB_CSV_FILE_NAME = "kb.csv";
8
+ const LEGACY_RESULT_DIRECTORY_NAME = "result";
11
9
  const MANIFEST_FILE_NAME = "manifest.json";
12
10
  function readZipText(entries, fileName) {
13
11
  const entry = entries[fileName];
@@ -16,6 +14,15 @@ function readZipText(entries, fileName) {
16
14
  async function ensureDir(targetPath) {
17
15
  await fs.mkdir(targetPath, { recursive: true });
18
16
  }
17
+ async function pathExists(targetPath) {
18
+ try {
19
+ await fs.access(targetPath);
20
+ return true;
21
+ } catch (error) {
22
+ if (isNodeError(error) && error.code === "ENOENT") return false;
23
+ throw error;
24
+ }
25
+ }
19
26
  async function readTextFile(targetPath) {
20
27
  try {
21
28
  return await fs.readFile(targetPath, "utf-8");
@@ -39,36 +46,6 @@ function resolveResultEntryPath(rootDir, entryPath) {
39
46
  function normalizeRelativePath(value) {
40
47
  return value.replace(/\\/g, "/").replace(/^\/+/, "");
41
48
  }
42
- function normalizeStoredPath(value) {
43
- if (typeof value !== "string") return null;
44
- return value.trim() || null;
45
- }
46
- function readChunkNumber(rawChunk, metadata, key) {
47
- const metadataValue = metadata[key];
48
- if (typeof metadataValue === "number" && Number.isFinite(metadataValue)) return metadataValue;
49
- const rawValue = rawChunk[key];
50
- if (typeof rawValue === "number" && Number.isFinite(rawValue)) return rawValue;
51
- return null;
52
- }
53
- function readChunkStringArray(rawChunk, metadata, key) {
54
- const metadataValue = metadata[key];
55
- if (Array.isArray(metadataValue)) return metadataValue.filter((entry) => typeof entry === "string");
56
- const rawValue = rawChunk[key];
57
- if (Array.isArray(rawValue)) return rawValue.filter((entry) => typeof entry === "string");
58
- return [];
59
- }
60
- function readChunkArray(rawChunk, metadata, key) {
61
- const metadataValue = metadata[key];
62
- if (Array.isArray(metadataValue)) return metadataValue;
63
- const rawValue = rawChunk[key];
64
- if (Array.isArray(rawValue)) return rawValue;
65
- return [];
66
- }
67
- function extractAssetFilePath(rawChunk, metadata) {
68
- const candidates = [rawChunk.file_path, metadata.file_path];
69
- for (const candidate of candidates) if (typeof candidate === "string" && candidate.trim()) return normalizeRelativePath(candidate.trim());
70
- return null;
71
- }
72
49
  function parseRawChunks(value) {
73
50
  if (Array.isArray(value)) return value.filter((entry) => isRecord(entry));
74
51
  if (isRecord(value) && Array.isArray(value.chunks)) return value.chunks.filter((entry) => isRecord(entry));
@@ -77,24 +54,6 @@ function parseRawChunks(value) {
77
54
  function parseManifest(value) {
78
55
  return isRecord(value) ? value : {};
79
56
  }
80
- function buildChunk(rawChunk) {
81
- const metadata = isRecord(rawChunk.metadata) ? rawChunk.metadata : {};
82
- const type = rawChunk.type === "image" || rawChunk.type === "table" || rawChunk.type === "text" ? rawChunk.type : "text";
83
- return {
84
- chunkId: typeof rawChunk.chunk_id === "string" ? rawChunk.chunk_id : "",
85
- type,
86
- path: normalizeStoredPath(rawChunk.path),
87
- summary: typeof metadata.summary === "string" ? metadata.summary : typeof rawChunk.summary === "string" ? rawChunk.summary : "",
88
- content: typeof rawChunk.content === "string" ? rawChunk.content : "",
89
- tokens: readChunkNumber(rawChunk, metadata, "tokens"),
90
- keywords: readChunkStringArray(rawChunk, metadata, "keywords"),
91
- relationships: readChunkArray(rawChunk, metadata, "relationships"),
92
- metadata,
93
- assetFilePath: extractAssetFilePath(rawChunk, metadata),
94
- originalName: typeof metadata.original_name === "string" ? metadata.original_name : typeof rawChunk.original_name === "string" ? rawChunk.original_name : null,
95
- tableType: typeof metadata.table_type === "string" ? metadata.table_type : typeof rawChunk.table_type === "string" ? rawChunk.table_type : null
96
- };
97
- }
98
57
  function normalizeStatistics(manifest, rawChunks) {
99
58
  if (manifest.statistics) return manifest.statistics;
100
59
  return {
@@ -110,185 +69,6 @@ function validateKnowhereResultChecksum(zipBuffer, manifest) {
110
69
  if (typeof checksum !== "string" || !checksum) return;
111
70
  if (createHash("sha256").update(zipBuffer).digest("hex") !== checksum) throw new Error("Knowhere result ZIP checksum mismatch.");
112
71
  }
113
- function tokenizeStoredPath(storedPath) {
114
- const slashSegments = storedPath.split("/").map((segment) => segment.trim()).filter(Boolean);
115
- const tokens = [];
116
- for (const slashSegment of slashSegments) {
117
- const arrowSegments = slashSegment.split("-->").map((segment) => segment.trim()).filter(Boolean);
118
- if (arrowSegments.length === 0) continue;
119
- tokens.push({
120
- delimiter: tokens.length === 0 ? null : "/",
121
- segment: arrowSegments[0] || ""
122
- });
123
- for (const arrowSegment of arrowSegments.slice(1)) tokens.push({
124
- delimiter: "-->",
125
- segment: arrowSegment
126
- });
127
- }
128
- return tokens;
129
- }
130
- function buildStoredPathPrefixes(storedPath) {
131
- const tokens = tokenizeStoredPath(storedPath);
132
- const prefixes = [];
133
- let currentPath = "";
134
- for (const token of tokens) {
135
- currentPath = token.delimiter ? `${currentPath}${token.delimiter}${token.segment}` : token.segment;
136
- prefixes.push(currentPath);
137
- }
138
- return prefixes;
139
- }
140
- function ensurePathAccumulator(accumulators, pathValue, parentPath, depth) {
141
- const existing = accumulators.get(pathValue);
142
- if (existing) {
143
- if (parentPath && !existing.parentPath) existing.parentPath = parentPath;
144
- return existing;
145
- }
146
- const next = {
147
- childPaths: /* @__PURE__ */ new Set(),
148
- chunkCount: 0,
149
- chunkIds: [],
150
- depth,
151
- directChunkCount: 0,
152
- imageChunkCount: 0,
153
- parentPath,
154
- path: pathValue,
155
- tableChunkCount: 0,
156
- textChunkCount: 0
157
- };
158
- accumulators.set(pathValue, next);
159
- return next;
160
- }
161
- function incrementPathCounters(accumulator, chunkType) {
162
- accumulator.chunkCount += 1;
163
- if (chunkType === "image") {
164
- accumulator.imageChunkCount += 1;
165
- return;
166
- }
167
- if (chunkType === "table") {
168
- accumulator.tableChunkCount += 1;
169
- return;
170
- }
171
- accumulator.textChunkCount += 1;
172
- }
173
- function buildPathRecords(chunks) {
174
- const accumulators = /* @__PURE__ */ new Map();
175
- for (const chunk of chunks) {
176
- if (!chunk.path) continue;
177
- const prefixes = buildStoredPathPrefixes(chunk.path);
178
- for (const [index, prefix] of prefixes.entries()) {
179
- const parentPath = index > 0 ? prefixes[index - 1] || null : null;
180
- const accumulator = ensurePathAccumulator(accumulators, prefix, parentPath, index + 1);
181
- incrementPathCounters(accumulator, chunk.type);
182
- if (parentPath) ensurePathAccumulator(accumulators, parentPath, index > 1 ? prefixes[index - 2] || null : null, index).childPaths.add(prefix);
183
- if (index === prefixes.length - 1) {
184
- accumulator.directChunkCount += 1;
185
- if (chunk.chunkId) accumulator.chunkIds.push(chunk.chunkId);
186
- }
187
- }
188
- }
189
- return [...accumulators.values()].sort((left, right) => left.depth - right.depth || left.path.localeCompare(right.path)).map((entry) => ({
190
- path: entry.path,
191
- parentPath: entry.parentPath,
192
- depth: entry.depth,
193
- childPaths: [...entry.childPaths].sort((left, right) => left.localeCompare(right)),
194
- chunkIds: [...entry.chunkIds],
195
- directChunkCount: entry.directChunkCount,
196
- chunkCount: entry.chunkCount,
197
- textChunkCount: entry.textChunkCount,
198
- imageChunkCount: entry.imageChunkCount,
199
- tableChunkCount: entry.tableChunkCount
200
- }));
201
- }
202
- function readManifestAssetEntries(manifest, key) {
203
- const rawEntries = (isRecord(manifest.files) ? manifest.files : {})[key];
204
- if (!Array.isArray(rawEntries)) return [];
205
- return rawEntries.filter((entry) => isRecord(entry));
206
- }
207
- function buildResultFileChunkLookup(manifest, chunks) {
208
- const entries = /* @__PURE__ */ new Map();
209
- for (const key of ["images", "tables"]) {
210
- const assetEntries = readManifestAssetEntries(manifest, key);
211
- for (const entry of assetEntries) {
212
- const filePath = typeof entry.file_path === "string" && entry.file_path.trim() ? normalizeRelativePath(entry.file_path.trim()) : null;
213
- if (!filePath) continue;
214
- entries.set(filePath, {
215
- chunkId: typeof entry.id === "string" ? entry.id : null,
216
- format: typeof entry.format === "string" ? entry.format : null
217
- });
218
- }
219
- }
220
- for (const chunk of chunks) {
221
- if (!chunk.assetFilePath || entries.has(chunk.assetFilePath)) continue;
222
- entries.set(chunk.assetFilePath, {
223
- chunkId: chunk.chunkId || null,
224
- format: null
225
- });
226
- }
227
- return entries;
228
- }
229
- function inferResultFileKind(relativePath) {
230
- if (relativePath === MANIFEST_FILE_NAME) return "manifest";
231
- if (relativePath === CHUNKS_FILE_NAME) return "chunks";
232
- if (relativePath === FULL_MARKDOWN_FILE_NAME) return "fullMarkdown";
233
- if (relativePath === KB_CSV_FILE_NAME) return "kbCsv";
234
- if (relativePath === HIERARCHY_FILE_NAME) return "hierarchy";
235
- if (relativePath === HIERARCHY_VIEW_FILE_NAME) return "hierarchyView";
236
- if (relativePath.startsWith("images/")) return "image";
237
- if (relativePath.startsWith("tables/")) return "table";
238
- return "other";
239
- }
240
- function inferResultFileFormat(relativePath) {
241
- return path.posix.extname(relativePath).replace(/^\./, "").trim() || null;
242
- }
243
- function isStringArray(value) {
244
- return Array.isArray(value) && value.every((entry) => typeof entry === "string");
245
- }
246
- function isStoredBrowseIndex(value) {
247
- if (!isRecord(value)) return false;
248
- if (value.version !== 2) return false;
249
- if (!isStringArray(value.chunkOrder)) return false;
250
- if (!Array.isArray(value.paths) || !Array.isArray(value.resultFiles)) return false;
251
- if (!value.paths.every((entry) => isRecord(entry) && typeof entry.path === "string" && (entry.parentPath === null || typeof entry.parentPath === "string") && typeof entry.depth === "number" && Number.isFinite(entry.depth) && isStringArray(entry.childPaths) && isStringArray(entry.chunkIds) && typeof entry.directChunkCount === "number" && typeof entry.chunkCount === "number" && typeof entry.textChunkCount === "number" && typeof entry.imageChunkCount === "number" && typeof entry.tableChunkCount === "number")) return false;
252
- return value.resultFiles.every((entry) => isRecord(entry) && typeof entry.relativePath === "string" && typeof entry.kind === "string" && (entry.chunkId === null || typeof entry.chunkId === "string") && (entry.format === null || typeof entry.format === "string") && (entry.sizeBytes === null || typeof entry.sizeBytes === "number" && Number.isFinite(entry.sizeBytes)));
253
- }
254
- async function listResultFiles(rootDir, currentDir = rootDir) {
255
- const entries = await fs.readdir(currentDir, { withFileTypes: true });
256
- const files = [];
257
- for (const entry of entries) {
258
- const absolutePath = path.join(currentDir, entry.name);
259
- if (entry.isDirectory()) {
260
- files.push(...await listResultFiles(rootDir, absolutePath));
261
- continue;
262
- }
263
- if (!entry.isFile()) continue;
264
- files.push(normalizeRelativePath(path.relative(rootDir, absolutePath)));
265
- }
266
- return files.sort((left, right) => left.localeCompare(right));
267
- }
268
- async function buildResultFileRecords(resultDir, manifest, chunks) {
269
- const lookup = buildResultFileChunkLookup(manifest, chunks);
270
- const relativePaths = await listResultFiles(resultDir);
271
- return Promise.all(relativePaths.map(async (relativePath) => {
272
- const absolutePath = resolveResultEntryPath(resultDir, relativePath);
273
- const stats = await fs.stat(absolutePath);
274
- const manifestEntry = lookup.get(relativePath);
275
- return {
276
- relativePath,
277
- kind: inferResultFileKind(relativePath),
278
- chunkId: manifestEntry?.chunkId ?? null,
279
- format: manifestEntry?.format ?? inferResultFileFormat(relativePath),
280
- sizeBytes: stats.isFile() ? stats.size : null
281
- };
282
- }));
283
- }
284
- async function buildStoredBrowseIndex(resultDir, manifest, chunks) {
285
- return {
286
- version: 2,
287
- paths: buildPathRecords(chunks),
288
- chunkOrder: chunks.map((chunk) => chunk.chunkId).filter((chunkId) => chunkId.length > 0),
289
- resultFiles: await buildResultFileRecords(resultDir, manifest, chunks)
290
- };
291
- }
292
72
  async function extractKnowhereResultArchive(downloadedResult, targetDir) {
293
73
  const zipBuffer = Buffer.isBuffer(downloadedResult.zipBytes) ? downloadedResult.zipBytes : Buffer.from(downloadedResult.zipBytes);
294
74
  const entries = unzipSync(new Uint8Array(zipBuffer));
@@ -302,22 +82,24 @@ async function extractKnowhereResultArchive(downloadedResult, targetDir) {
302
82
  await fs.writeFile(outputPath, entryBytes);
303
83
  }
304
84
  }
305
- async function readStoredKnowhereResultSummary(resultDir) {
306
- const manifest = parseManifest(await readJsonFile(path.join(resultDir, MANIFEST_FILE_NAME)));
307
- const rawChunks = parseRawChunks(await readJsonFile(path.join(resultDir, CHUNKS_FILE_NAME)));
85
+ async function resolveStoredKnowhereResultRoot(documentDir) {
86
+ if (await pathExists(path.join(documentDir, MANIFEST_FILE_NAME))) return documentDir;
87
+ const legacyResultDir = path.join(documentDir, LEGACY_RESULT_DIRECTORY_NAME);
88
+ if (await pathExists(path.join(legacyResultDir, MANIFEST_FILE_NAME))) return legacyResultDir;
89
+ return documentDir;
90
+ }
91
+ async function resolveStoredKnowhereArtifactPath(documentDir, entryPath) {
92
+ return resolveResultEntryPath(await resolveStoredKnowhereResultRoot(documentDir), entryPath);
93
+ }
94
+ async function readStoredKnowhereResultSummary(documentDir) {
95
+ const resultRoot = await resolveStoredKnowhereResultRoot(documentDir);
96
+ const manifest = parseManifest(await readJsonFile(path.join(resultRoot, MANIFEST_FILE_NAME)));
97
+ const rawChunks = parseRawChunks(await readJsonFile(path.join(resultRoot, CHUNKS_FILE_NAME)));
308
98
  return {
309
99
  manifest,
310
100
  chunkCount: rawChunks.length,
311
101
  statistics: normalizeStatistics(manifest, rawChunks)
312
102
  };
313
103
  }
314
- async function readStoredKnowhereResultContent(resultDir) {
315
- return {
316
- manifest: parseManifest(await readJsonFile(path.join(resultDir, MANIFEST_FILE_NAME))),
317
- chunks: parseRawChunks(await readJsonFile(path.join(resultDir, CHUNKS_FILE_NAME))).map((rawChunk) => buildChunk(rawChunk)),
318
- fullMarkdown: await readTextFile(path.join(resultDir, FULL_MARKDOWN_FILE_NAME)) || "",
319
- hierarchy: await readJsonFile(path.join(resultDir, HIERARCHY_FILE_NAME))
320
- };
321
- }
322
104
  //#endregion
323
- export { buildStoredBrowseIndex, extractKnowhereResultArchive, isStoredBrowseIndex, readStoredKnowhereResultContent, readStoredKnowhereResultSummary, resolveResultEntryPath };
105
+ export { extractKnowhereResultArchive, readStoredKnowhereResultSummary, resolveStoredKnowhereArtifactPath, resolveStoredKnowhereResultRoot };
package/dist/store.d.ts CHANGED
@@ -1,10 +1,10 @@
1
- import type { ChannelRouteRecord, KnowhereScope, PluginLogger, SaveStoredDocumentPayload, ScopeMode, StoredDocumentPayload, StoredDocumentRecord } from "./types";
1
+ import type { ChannelRouteRecord, KnowhereScope, PluginLogger, SaveStoredDocumentPayload, ScopeMode, StoredDocumentMetadata, StoredDocumentRecord } from "./types";
2
+ type StoredDocumentScopePaths = Pick<KnowhereScope, "documentsDir" | "metadataDir">;
2
3
  export declare class KnowhereStore {
3
4
  private readonly rootDir;
4
5
  private readonly scopeMode;
5
6
  private readonly logger;
6
7
  private readonly indexCache;
7
- private readonly documentPayloadCache;
8
8
  private readonly scopeAccessChains;
9
9
  private readonly scopeKeyAliases;
10
10
  private readonly sessionScopeKeysBySessionId;
@@ -48,14 +48,8 @@ export declare class KnowhereStore {
48
48
  sessionKey?: string;
49
49
  sessionId?: string;
50
50
  }): KnowhereScope;
51
+ readDocumentMetadata(scope: StoredDocumentScopePaths, docId: string): Promise<StoredDocumentMetadata | null>;
51
52
  listDocuments(scope: KnowhereScope): Promise<StoredDocumentRecord[]>;
52
- loadDocumentPayload(scope: KnowhereScope, docId: string): Promise<StoredDocumentPayload | null>;
53
- getResultFileAbsolutePath(scope: KnowhereScope, docId: string, relativePath: string): string;
54
- readResultFile(scope: KnowhereScope, docId: string, relativePath: string): Promise<{
55
- document: StoredDocumentRecord;
56
- relativePath: string;
57
- text: string | null;
58
- } | null>;
59
53
  saveDownloadedDocument(scope: KnowhereScope, payload: SaveStoredDocumentPayload, options?: {
60
54
  overwrite?: boolean;
61
55
  }): Promise<StoredDocumentRecord>;
@@ -65,11 +59,6 @@ export declare class KnowhereStore {
65
59
  private persistIndex;
66
60
  private runWithScopeAccessLock;
67
61
  private removeDocumentArtifacts;
68
- private buildDocumentPayloadCacheKey;
69
- private touchDocumentPayloadCache;
70
- private deleteDocumentPayloadCache;
71
- private deleteScopeDocumentPayloadCaches;
72
- private loadOrBuildBrowseIndex;
73
62
  private buildRouteKey;
74
63
  private ensureRoutesLoaded;
75
64
  private persistRoutes;
@@ -77,3 +66,4 @@ export declare class KnowhereStore {
77
66
  private resolveKnownScopeKey;
78
67
  private rebuildIndex;
79
68
  }
69
+ export {};
package/dist/store.js CHANGED
@@ -1,5 +1,5 @@
1
1
  import { isNodeError } from "./types.js";
2
- import { buildStoredBrowseIndex, extractKnowhereResultArchive, isStoredBrowseIndex, readStoredKnowhereResultContent, readStoredKnowhereResultSummary, resolveResultEntryPath } from "./parser.js";
2
+ import { extractKnowhereResultArchive, readStoredKnowhereResultSummary } from "./parser.js";
3
3
  import { hashString, normalizeWhitespace, sanitizeStringArray, slugify } from "./text.js";
4
4
  import { deriveMessageContextScopeKey, findConversationSegmentValue, parseConversationSessionKey } from "./session.js";
5
5
  import fs from "node:fs/promises";
@@ -7,10 +7,8 @@ import path from "node:path";
7
7
  import { randomUUID } from "node:crypto";
8
8
  //#region src/store.ts
9
9
  const INDEX_VERSION = 1;
10
- const BROWSE_INDEX_FILE_NAME = "browse-index.json";
11
- const DOCUMENT_PAYLOAD_CACHE_LIMIT = 16;
10
+ const METADATA_DIRECTORY_NAME = "metadata";
12
11
  const METADATA_FILE_NAME = "metadata.json";
13
- const RESULT_DIRECTORY_NAME = "result";
14
12
  const ROUTES_FILE_NAME = "routes.json";
15
13
  async function pathExists(targetPath) {
16
14
  try {
@@ -119,10 +117,9 @@ function createEmptyIndex(scope) {
119
117
  function buildStoredDocumentPaths(scope, docId) {
120
118
  const documentDir = path.join(scope.documentsDir, docId);
121
119
  return {
122
- browseIndexPath: path.join(documentDir, BROWSE_INDEX_FILE_NAME),
123
120
  documentDir,
124
- metadataPath: path.join(documentDir, METADATA_FILE_NAME),
125
- resultDir: path.join(documentDir, RESULT_DIRECTORY_NAME)
121
+ metadataPath: path.join(scope.metadataDir, `${docId}.json`),
122
+ legacyMetadataPath: path.join(documentDir, METADATA_FILE_NAME)
126
123
  };
127
124
  }
128
125
  async function readStoredDocumentMetadata(metadataPath) {
@@ -142,7 +139,6 @@ var KnowhereStore = class {
142
139
  scopeMode;
143
140
  logger;
144
141
  indexCache;
145
- documentPayloadCache;
146
142
  scopeAccessChains;
147
143
  scopeKeyAliases;
148
144
  sessionScopeKeysBySessionId;
@@ -154,7 +150,6 @@ var KnowhereStore = class {
154
150
  this.scopeMode = params.scopeMode;
155
151
  this.logger = params.logger;
156
152
  this.indexCache = /* @__PURE__ */ new Map();
157
- this.documentPayloadCache = /* @__PURE__ */ new Map();
158
153
  this.scopeAccessChains = /* @__PURE__ */ new Map();
159
154
  this.scopeKeyAliases = /* @__PURE__ */ new Map();
160
155
  this.sessionScopeKeysBySessionId = /* @__PURE__ */ new Map();
@@ -277,80 +272,32 @@ var KnowhereStore = class {
277
272
  key: rawKey,
278
273
  label: rawKey === "global" ? "global" : `${mode}:${rawKey}`,
279
274
  rootDir: scopeRoot,
275
+ metadataDir: path.join(scopeRoot, METADATA_DIRECTORY_NAME),
280
276
  documentsDir: path.join(scopeRoot, "documents"),
281
277
  indexPath: path.join(scopeRoot, "index.json")
282
278
  };
283
279
  }
280
+ async readDocumentMetadata(scope, docId) {
281
+ const paths = buildStoredDocumentPaths(scope, docId);
282
+ const preferredMetadata = await readStoredDocumentMetadata(paths.metadataPath);
283
+ if (preferredMetadata) return preferredMetadata;
284
+ return readStoredDocumentMetadata(paths.legacyMetadataPath);
285
+ }
284
286
  async listDocuments(scope) {
285
287
  return this.runWithScopeAccessLock(scope, async () => {
286
288
  return [...(await this.getIndex(scope, true)).documents].sort((left, right) => String(right.updatedAt || right.ingestedAt || "").localeCompare(String(left.updatedAt || left.ingestedAt || "")));
287
289
  });
288
290
  }
289
- async loadDocumentPayload(scope, docId) {
290
- return this.runWithScopeAccessLock(scope, async () => {
291
- const cacheKey = this.buildDocumentPayloadCacheKey(scope, docId);
292
- const cachedPayload = this.documentPayloadCache.get(cacheKey);
293
- if (cachedPayload) {
294
- this.touchDocumentPayloadCache(cacheKey, cachedPayload);
295
- return cachedPayload;
296
- }
297
- const paths = buildStoredDocumentPaths(scope, docId);
298
- const metadata = await readStoredDocumentMetadata(paths.metadataPath);
299
- if (!metadata) return null;
300
- const resultContent = await readStoredKnowhereResultContent(paths.resultDir);
301
- const browseIndex = await this.loadOrBuildBrowseIndex(paths, resultContent.manifest, resultContent.chunks);
302
- const payload = {
303
- version: metadata.version,
304
- document: metadata.document,
305
- manifest: resultContent.manifest,
306
- jobResult: metadata.jobResult,
307
- fullMarkdown: resultContent.fullMarkdown,
308
- hierarchy: resultContent.hierarchy,
309
- browseIndex,
310
- rawZipSha1: metadata.rawZipSha1,
311
- chunks: resultContent.chunks
312
- };
313
- this.touchDocumentPayloadCache(cacheKey, payload);
314
- return payload;
315
- });
316
- }
317
- getResultFileAbsolutePath(scope, docId, relativePath) {
318
- return resolveResultEntryPath(buildStoredDocumentPaths(scope, docId).resultDir, relativePath);
319
- }
320
- async readResultFile(scope, docId, relativePath) {
321
- return this.runWithScopeAccessLock(scope, async () => {
322
- const paths = buildStoredDocumentPaths(scope, docId);
323
- const metadata = await readStoredDocumentMetadata(paths.metadataPath);
324
- if (!metadata) return null;
325
- const filePath = resolveResultEntryPath(paths.resultDir, relativePath);
326
- try {
327
- const text = await fs.readFile(filePath, "utf-8");
328
- return {
329
- document: metadata.document,
330
- relativePath: path.posix.normalize(relativePath.replace(/\\/g, "/")),
331
- text
332
- };
333
- } catch (error) {
334
- if (isNodeError(error) && error.code === "ENOENT") return {
335
- document: metadata.document,
336
- relativePath: path.posix.normalize(relativePath.replace(/\\/g, "/")),
337
- text: null
338
- };
339
- throw error;
340
- }
341
- });
342
- }
343
291
  async saveDownloadedDocument(scope, payload, options = {}) {
344
292
  return this.runWithScopeAccessLock(scope, async () => {
345
293
  const index = await this.getIndex(scope, true);
346
294
  const existingIds = new Set(index.documents.map((document) => document.id));
347
295
  const now = (/* @__PURE__ */ new Date()).toISOString();
348
296
  const tempPaths = buildStoredDocumentPaths(scope, `.tmp-${randomUUID()}`);
297
+ let finalPaths = null;
349
298
  try {
350
- await extractKnowhereResultArchive(payload.downloadedResult, tempPaths.resultDir);
351
- const resultSummary = await readStoredKnowhereResultSummary(tempPaths.resultDir);
352
- const resultContent = await readStoredKnowhereResultContent(tempPaths.resultDir);
353
- const browseIndex = await buildStoredBrowseIndex(tempPaths.resultDir, resultContent.manifest, resultContent.chunks);
299
+ await extractKnowhereResultArchive(payload.downloadedResult, tempPaths.documentDir);
300
+ const resultSummary = await readStoredKnowhereResultSummary(tempPaths.documentDir);
354
301
  const documentIdSeed = [
355
302
  payload.sourceType,
356
303
  payload.source,
@@ -364,6 +311,7 @@ var KnowhereStore = class {
364
311
  if (payload.docId && existingDocument && options.overwrite !== true) throw new Error(`Document ${documentIdCandidate} already exists in scope ${scope.label}.`);
365
312
  const docId = existingDocument && options.overwrite === true ? existingDocument.id : buildUniqueDocumentId(documentIdCandidate, documentIdSeed, existingIds);
366
313
  const paths = buildStoredDocumentPaths(scope, docId);
314
+ finalPaths = paths;
367
315
  const originalFileName = deriveOriginalFileName(payload, resultSummary.manifest);
368
316
  const documentRecord = {
369
317
  id: docId,
@@ -392,16 +340,17 @@ var KnowhereStore = class {
392
340
  rawZipSha1: payload.downloadedResult.rawZipSha1
393
341
  };
394
342
  await writeJsonAtomic(tempPaths.metadataPath, metadata);
395
- await writeJsonAtomic(tempPaths.browseIndexPath, browseIndex);
396
343
  await this.removeDocumentArtifacts(paths);
397
344
  await ensureDir(scope.documentsDir);
345
+ await ensureDir(scope.metadataDir);
398
346
  await fs.rename(tempPaths.documentDir, paths.documentDir);
347
+ await fs.rename(tempPaths.metadataPath, paths.metadataPath);
399
348
  index.documents = index.documents.filter((document) => document.id !== docId).concat(documentRecord);
400
- this.deleteDocumentPayloadCache(scope, docId);
401
349
  await this.persistIndex(scope, index);
402
350
  return documentRecord;
403
351
  } catch (error) {
404
352
  await this.removeDocumentArtifacts(tempPaths).catch(() => void 0);
353
+ if (finalPaths) await this.removeDocumentArtifacts(finalPaths).catch(() => void 0);
405
354
  throw error;
406
355
  }
407
356
  });
@@ -413,7 +362,6 @@ var KnowhereStore = class {
413
362
  if (!existingDocument) return null;
414
363
  await this.removeDocumentArtifacts(buildStoredDocumentPaths(scope, docId));
415
364
  index.documents = index.documents.filter((document) => document.id !== docId);
416
- this.deleteDocumentPayloadCache(scope, docId);
417
365
  await this.persistIndex(scope, index);
418
366
  return existingDocument;
419
367
  });
@@ -426,7 +374,6 @@ var KnowhereStore = class {
426
374
  force: true
427
375
  });
428
376
  this.indexCache.delete(scope.rootDir);
429
- this.deleteScopeDocumentPayloadCaches(scope);
430
377
  this.logger.info(`knowhere: cleared scope ${scope.label} (${removedDocuments.length} document${removedDocuments.length === 1 ? "" : "s"})`);
431
378
  return removedDocuments;
432
379
  });
@@ -473,39 +420,8 @@ var KnowhereStore = class {
473
420
  recursive: true,
474
421
  force: true
475
422
  });
476
- }
477
- buildDocumentPayloadCacheKey(scope, docId) {
478
- return `${scope.rootDir}:${docId}`;
479
- }
480
- touchDocumentPayloadCache(cacheKey, payload) {
481
- this.documentPayloadCache.delete(cacheKey);
482
- this.documentPayloadCache.set(cacheKey, payload);
483
- while (this.documentPayloadCache.size > DOCUMENT_PAYLOAD_CACHE_LIMIT) {
484
- const oldestKey = this.documentPayloadCache.keys().next().value;
485
- if (!oldestKey) break;
486
- this.documentPayloadCache.delete(oldestKey);
487
- }
488
- }
489
- deleteDocumentPayloadCache(scope, docId) {
490
- this.documentPayloadCache.delete(this.buildDocumentPayloadCacheKey(scope, docId));
491
- }
492
- deleteScopeDocumentPayloadCaches(scope) {
493
- const cacheKeyPrefix = `${scope.rootDir}:`;
494
- for (const cacheKey of this.documentPayloadCache.keys()) if (cacheKey.startsWith(cacheKeyPrefix)) this.documentPayloadCache.delete(cacheKey);
495
- }
496
- async loadOrBuildBrowseIndex(paths, manifest, chunks) {
497
- if (await pathExists(paths.browseIndexPath)) {
498
- try {
499
- const existingBrowseIndex = await readJson(paths.browseIndexPath, null);
500
- if (isStoredBrowseIndex(existingBrowseIndex)) return existingBrowseIndex;
501
- } catch (error) {
502
- this.logger.warn(`knowhere: failed to read browse index ${paths.browseIndexPath}; rebuilding. ${error instanceof Error ? error.message : String(error)}`);
503
- }
504
- this.logger.info(`knowhere: rebuilding browse index for ${paths.documentDir} (expected version 2)`);
505
- }
506
- const browseIndex = await buildStoredBrowseIndex(paths.resultDir, manifest, chunks);
507
- await writeJsonAtomic(paths.browseIndexPath, browseIndex);
508
- return browseIndex;
423
+ if (await pathExists(paths.metadataPath)) await fs.rm(paths.metadataPath, { force: true });
424
+ if (await pathExists(paths.legacyMetadataPath)) await fs.rm(paths.legacyMetadataPath, { force: true });
509
425
  }
510
426
  buildRouteKey(channelId, conversationId) {
511
427
  const normalizedChannel = normalizeWhitespace(channelId)?.toLowerCase();
@@ -557,8 +473,7 @@ var KnowhereStore = class {
557
473
  const documentEntries = await fs.readdir(scope.documentsDir, { withFileTypes: true });
558
474
  for (const documentEntry of documentEntries) {
559
475
  if (!documentEntry.isDirectory()) continue;
560
- const documentPath = path.join(scope.documentsDir, documentEntry.name);
561
- const metadata = await readStoredDocumentMetadata(path.join(documentPath, METADATA_FILE_NAME));
476
+ const metadata = await this.readDocumentMetadata(scope, documentEntry.name);
562
477
  if (!metadata?.document) continue;
563
478
  rebuiltIndex.documents.push(metadata.document);
564
479
  }
package/dist/text.js CHANGED
@@ -18,17 +18,5 @@ function sanitizeStringArray(value) {
18
18
  }
19
19
  return [];
20
20
  }
21
- function stripHtmlTags(text) {
22
- return text.replace(/<[^>]*>/g, "");
23
- }
24
- function stripLatex(text) {
25
- return text.replace(/\$([^$]*)\$/g, "$1").replace(/\\text\{([^}]*)}/g, "$1").replace(/\\(?:text(?:bf|it|tt|sf|sc|rm)|math(?:rm|bf|it|sf|tt|cal|bb|frak))\{([^}]*)}/g, "$1").replace(/\\(?:emph|underline|overline)\{([^}]*)}/g, "$1").replace(/\\([%$&#_])/g, "$1").replace(/\\(?:right|Right)arrow/g, "→").replace(/\\(?:left|Left)arrow/g, "←").replace(/\\leftrightarrow/g, "↔").replace(/\\times/g, "×").replace(/\\cdot/g, "·").replace(/\\pm/g, "±").replace(/\\leq/g, "≤").replace(/\\geq/g, "≥").replace(/\\neq/g, "≠").replace(/\\approx/g, "≈").replace(/\\(sup|inf|max|min|log|ln|sin|cos|tan|exp|lim)\b/g, "$1").replace(/\\([{}])/g, "$1").replace(/\\\\/g, " ").replace(/\\[a-zA-Z]+/g, "");
26
- }
27
- function normalizeUnicode(text) {
28
- return text.replace(/[\u2018\u2019\u201A]/g, "'").replace(/[\u201C\u201D\u201E]/g, "\"").replace(/[\u2013\u2014]/g, "-").replace(/[\u00A0\u2009\u200A\u200B\u2007\u202F]/g, " ").replace(/\u2026/g, "...").replace(/\u2022/g, "-");
29
- }
30
- function normalizeForGrep(text) {
31
- return stripHtmlTags(stripLatex(normalizeUnicode(text))).replace(/\s+/g, " ").trim();
32
- }
33
21
  //#endregion
34
- export { hashString, normalizeForGrep, normalizeWhitespace, sanitizeStringArray, slugify };
22
+ export { hashString, normalizeWhitespace, sanitizeStringArray, slugify };