@ontos-ai/knowhere-claw 0.2.3 → 0.2.4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/store.js CHANGED
@@ -1,5 +1,5 @@
1
1
  import { isNodeError } from "./types.js";
2
- import { buildStoredBrowseIndex, extractKnowhereResultArchive, isStoredBrowseIndex, readStoredKnowhereResultContent, readStoredKnowhereResultSummary, resolveResultEntryPath } from "./parser.js";
2
+ import { extractKnowhereResultArchive, readStoredKnowhereResultSummary } from "./parser.js";
3
3
  import { hashString, normalizeWhitespace, sanitizeStringArray, slugify } from "./text.js";
4
4
  import { deriveMessageContextScopeKey, findConversationSegmentValue, parseConversationSessionKey } from "./session.js";
5
5
  import fs from "node:fs/promises";
@@ -7,10 +7,8 @@ import path from "node:path";
7
7
  import { randomUUID } from "node:crypto";
8
8
  //#region src/store.ts
9
9
  const INDEX_VERSION = 1;
10
- const BROWSE_INDEX_FILE_NAME = "browse-index.json";
11
- const DOCUMENT_PAYLOAD_CACHE_LIMIT = 16;
10
+ const METADATA_DIRECTORY_NAME = "metadata";
12
11
  const METADATA_FILE_NAME = "metadata.json";
13
- const RESULT_DIRECTORY_NAME = "result";
14
12
  const ROUTES_FILE_NAME = "routes.json";
15
13
  async function pathExists(targetPath) {
16
14
  try {
@@ -119,10 +117,9 @@ function createEmptyIndex(scope) {
119
117
  function buildStoredDocumentPaths(scope, docId) {
120
118
  const documentDir = path.join(scope.documentsDir, docId);
121
119
  return {
122
- browseIndexPath: path.join(documentDir, BROWSE_INDEX_FILE_NAME),
123
120
  documentDir,
124
- metadataPath: path.join(documentDir, METADATA_FILE_NAME),
125
- resultDir: path.join(documentDir, RESULT_DIRECTORY_NAME)
121
+ metadataPath: path.join(scope.metadataDir, `${docId}.json`),
122
+ legacyMetadataPath: path.join(documentDir, METADATA_FILE_NAME)
126
123
  };
127
124
  }
128
125
  async function readStoredDocumentMetadata(metadataPath) {
@@ -142,7 +139,6 @@ var KnowhereStore = class {
142
139
  scopeMode;
143
140
  logger;
144
141
  indexCache;
145
- documentPayloadCache;
146
142
  scopeAccessChains;
147
143
  scopeKeyAliases;
148
144
  sessionScopeKeysBySessionId;
@@ -154,7 +150,6 @@ var KnowhereStore = class {
154
150
  this.scopeMode = params.scopeMode;
155
151
  this.logger = params.logger;
156
152
  this.indexCache = /* @__PURE__ */ new Map();
157
- this.documentPayloadCache = /* @__PURE__ */ new Map();
158
153
  this.scopeAccessChains = /* @__PURE__ */ new Map();
159
154
  this.scopeKeyAliases = /* @__PURE__ */ new Map();
160
155
  this.sessionScopeKeysBySessionId = /* @__PURE__ */ new Map();
@@ -277,80 +272,32 @@ var KnowhereStore = class {
277
272
  key: rawKey,
278
273
  label: rawKey === "global" ? "global" : `${mode}:${rawKey}`,
279
274
  rootDir: scopeRoot,
275
+ metadataDir: path.join(scopeRoot, METADATA_DIRECTORY_NAME),
280
276
  documentsDir: path.join(scopeRoot, "documents"),
281
277
  indexPath: path.join(scopeRoot, "index.json")
282
278
  };
283
279
  }
280
+ async readDocumentMetadata(scope, docId) {
281
+ const paths = buildStoredDocumentPaths(scope, docId);
282
+ const preferredMetadata = await readStoredDocumentMetadata(paths.metadataPath);
283
+ if (preferredMetadata) return preferredMetadata;
284
+ return readStoredDocumentMetadata(paths.legacyMetadataPath);
285
+ }
284
286
  async listDocuments(scope) {
285
287
  return this.runWithScopeAccessLock(scope, async () => {
286
288
  return [...(await this.getIndex(scope, true)).documents].sort((left, right) => String(right.updatedAt || right.ingestedAt || "").localeCompare(String(left.updatedAt || left.ingestedAt || "")));
287
289
  });
288
290
  }
289
- async loadDocumentPayload(scope, docId) {
290
- return this.runWithScopeAccessLock(scope, async () => {
291
- const cacheKey = this.buildDocumentPayloadCacheKey(scope, docId);
292
- const cachedPayload = this.documentPayloadCache.get(cacheKey);
293
- if (cachedPayload) {
294
- this.touchDocumentPayloadCache(cacheKey, cachedPayload);
295
- return cachedPayload;
296
- }
297
- const paths = buildStoredDocumentPaths(scope, docId);
298
- const metadata = await readStoredDocumentMetadata(paths.metadataPath);
299
- if (!metadata) return null;
300
- const resultContent = await readStoredKnowhereResultContent(paths.resultDir);
301
- const browseIndex = await this.loadOrBuildBrowseIndex(paths, resultContent.manifest, resultContent.chunks);
302
- const payload = {
303
- version: metadata.version,
304
- document: metadata.document,
305
- manifest: resultContent.manifest,
306
- jobResult: metadata.jobResult,
307
- fullMarkdown: resultContent.fullMarkdown,
308
- hierarchy: resultContent.hierarchy,
309
- browseIndex,
310
- rawZipSha1: metadata.rawZipSha1,
311
- chunks: resultContent.chunks
312
- };
313
- this.touchDocumentPayloadCache(cacheKey, payload);
314
- return payload;
315
- });
316
- }
317
- getResultFileAbsolutePath(scope, docId, relativePath) {
318
- return resolveResultEntryPath(buildStoredDocumentPaths(scope, docId).resultDir, relativePath);
319
- }
320
- async readResultFile(scope, docId, relativePath) {
321
- return this.runWithScopeAccessLock(scope, async () => {
322
- const paths = buildStoredDocumentPaths(scope, docId);
323
- const metadata = await readStoredDocumentMetadata(paths.metadataPath);
324
- if (!metadata) return null;
325
- const filePath = resolveResultEntryPath(paths.resultDir, relativePath);
326
- try {
327
- const text = await fs.readFile(filePath, "utf-8");
328
- return {
329
- document: metadata.document,
330
- relativePath: path.posix.normalize(relativePath.replace(/\\/g, "/")),
331
- text
332
- };
333
- } catch (error) {
334
- if (isNodeError(error) && error.code === "ENOENT") return {
335
- document: metadata.document,
336
- relativePath: path.posix.normalize(relativePath.replace(/\\/g, "/")),
337
- text: null
338
- };
339
- throw error;
340
- }
341
- });
342
- }
343
291
  async saveDownloadedDocument(scope, payload, options = {}) {
344
292
  return this.runWithScopeAccessLock(scope, async () => {
345
293
  const index = await this.getIndex(scope, true);
346
294
  const existingIds = new Set(index.documents.map((document) => document.id));
347
295
  const now = (/* @__PURE__ */ new Date()).toISOString();
348
296
  const tempPaths = buildStoredDocumentPaths(scope, `.tmp-${randomUUID()}`);
297
+ let finalPaths = null;
349
298
  try {
350
- await extractKnowhereResultArchive(payload.downloadedResult, tempPaths.resultDir);
351
- const resultSummary = await readStoredKnowhereResultSummary(tempPaths.resultDir);
352
- const resultContent = await readStoredKnowhereResultContent(tempPaths.resultDir);
353
- const browseIndex = await buildStoredBrowseIndex(tempPaths.resultDir, resultContent.manifest, resultContent.chunks);
299
+ await extractKnowhereResultArchive(payload.downloadedResult, tempPaths.documentDir);
300
+ const resultSummary = await readStoredKnowhereResultSummary(tempPaths.documentDir);
354
301
  const documentIdSeed = [
355
302
  payload.sourceType,
356
303
  payload.source,
@@ -364,6 +311,7 @@ var KnowhereStore = class {
364
311
  if (payload.docId && existingDocument && options.overwrite !== true) throw new Error(`Document ${documentIdCandidate} already exists in scope ${scope.label}.`);
365
312
  const docId = existingDocument && options.overwrite === true ? existingDocument.id : buildUniqueDocumentId(documentIdCandidate, documentIdSeed, existingIds);
366
313
  const paths = buildStoredDocumentPaths(scope, docId);
314
+ finalPaths = paths;
367
315
  const originalFileName = deriveOriginalFileName(payload, resultSummary.manifest);
368
316
  const documentRecord = {
369
317
  id: docId,
@@ -392,16 +340,17 @@ var KnowhereStore = class {
392
340
  rawZipSha1: payload.downloadedResult.rawZipSha1
393
341
  };
394
342
  await writeJsonAtomic(tempPaths.metadataPath, metadata);
395
- await writeJsonAtomic(tempPaths.browseIndexPath, browseIndex);
396
343
  await this.removeDocumentArtifacts(paths);
397
344
  await ensureDir(scope.documentsDir);
345
+ await ensureDir(scope.metadataDir);
398
346
  await fs.rename(tempPaths.documentDir, paths.documentDir);
347
+ await fs.rename(tempPaths.metadataPath, paths.metadataPath);
399
348
  index.documents = index.documents.filter((document) => document.id !== docId).concat(documentRecord);
400
- this.deleteDocumentPayloadCache(scope, docId);
401
349
  await this.persistIndex(scope, index);
402
350
  return documentRecord;
403
351
  } catch (error) {
404
352
  await this.removeDocumentArtifacts(tempPaths).catch(() => void 0);
353
+ if (finalPaths) await this.removeDocumentArtifacts(finalPaths).catch(() => void 0);
405
354
  throw error;
406
355
  }
407
356
  });
@@ -413,7 +362,6 @@ var KnowhereStore = class {
413
362
  if (!existingDocument) return null;
414
363
  await this.removeDocumentArtifacts(buildStoredDocumentPaths(scope, docId));
415
364
  index.documents = index.documents.filter((document) => document.id !== docId);
416
- this.deleteDocumentPayloadCache(scope, docId);
417
365
  await this.persistIndex(scope, index);
418
366
  return existingDocument;
419
367
  });
@@ -426,7 +374,6 @@ var KnowhereStore = class {
426
374
  force: true
427
375
  });
428
376
  this.indexCache.delete(scope.rootDir);
429
- this.deleteScopeDocumentPayloadCaches(scope);
430
377
  this.logger.info(`knowhere: cleared scope ${scope.label} (${removedDocuments.length} document${removedDocuments.length === 1 ? "" : "s"})`);
431
378
  return removedDocuments;
432
379
  });
@@ -473,39 +420,8 @@ var KnowhereStore = class {
473
420
  recursive: true,
474
421
  force: true
475
422
  });
476
- }
477
- buildDocumentPayloadCacheKey(scope, docId) {
478
- return `${scope.rootDir}:${docId}`;
479
- }
480
- touchDocumentPayloadCache(cacheKey, payload) {
481
- this.documentPayloadCache.delete(cacheKey);
482
- this.documentPayloadCache.set(cacheKey, payload);
483
- while (this.documentPayloadCache.size > DOCUMENT_PAYLOAD_CACHE_LIMIT) {
484
- const oldestKey = this.documentPayloadCache.keys().next().value;
485
- if (!oldestKey) break;
486
- this.documentPayloadCache.delete(oldestKey);
487
- }
488
- }
489
- deleteDocumentPayloadCache(scope, docId) {
490
- this.documentPayloadCache.delete(this.buildDocumentPayloadCacheKey(scope, docId));
491
- }
492
- deleteScopeDocumentPayloadCaches(scope) {
493
- const cacheKeyPrefix = `${scope.rootDir}:`;
494
- for (const cacheKey of this.documentPayloadCache.keys()) if (cacheKey.startsWith(cacheKeyPrefix)) this.documentPayloadCache.delete(cacheKey);
495
- }
496
- async loadOrBuildBrowseIndex(paths, manifest, chunks) {
497
- if (await pathExists(paths.browseIndexPath)) {
498
- try {
499
- const existingBrowseIndex = await readJson(paths.browseIndexPath, null);
500
- if (isStoredBrowseIndex(existingBrowseIndex)) return existingBrowseIndex;
501
- } catch (error) {
502
- this.logger.warn(`knowhere: failed to read browse index ${paths.browseIndexPath}; rebuilding. ${error instanceof Error ? error.message : String(error)}`);
503
- }
504
- this.logger.info(`knowhere: rebuilding browse index for ${paths.documentDir} (expected version 2)`);
505
- }
506
- const browseIndex = await buildStoredBrowseIndex(paths.resultDir, manifest, chunks);
507
- await writeJsonAtomic(paths.browseIndexPath, browseIndex);
508
- return browseIndex;
423
+ if (await pathExists(paths.metadataPath)) await fs.rm(paths.metadataPath, { force: true });
424
+ if (await pathExists(paths.legacyMetadataPath)) await fs.rm(paths.legacyMetadataPath, { force: true });
509
425
  }
510
426
  buildRouteKey(channelId, conversationId) {
511
427
  const normalizedChannel = normalizeWhitespace(channelId)?.toLowerCase();
@@ -557,8 +473,7 @@ var KnowhereStore = class {
557
473
  const documentEntries = await fs.readdir(scope.documentsDir, { withFileTypes: true });
558
474
  for (const documentEntry of documentEntries) {
559
475
  if (!documentEntry.isDirectory()) continue;
560
- const documentPath = path.join(scope.documentsDir, documentEntry.name);
561
- const metadata = await readStoredDocumentMetadata(path.join(documentPath, METADATA_FILE_NAME));
476
+ const metadata = await this.readDocumentMetadata(scope, documentEntry.name);
562
477
  if (!metadata?.document) continue;
563
478
  rebuiltIndex.documents.push(metadata.document);
564
479
  }
package/dist/text.js CHANGED
@@ -18,17 +18,5 @@ function sanitizeStringArray(value) {
18
18
  }
19
19
  return [];
20
20
  }
21
- function stripHtmlTags(text) {
22
- return text.replace(/<[^>]*>/g, "");
23
- }
24
- function stripLatex(text) {
25
- return text.replace(/\$([^$]*)\$/g, "$1").replace(/\\text\{([^}]*)}/g, "$1").replace(/\\(?:text(?:bf|it|tt|sf|sc|rm)|math(?:rm|bf|it|sf|tt|cal|bb|frak))\{([^}]*)}/g, "$1").replace(/\\(?:emph|underline|overline)\{([^}]*)}/g, "$1").replace(/\\([%$&#_])/g, "$1").replace(/\\(?:right|Right)arrow/g, "→").replace(/\\(?:left|Left)arrow/g, "←").replace(/\\leftrightarrow/g, "↔").replace(/\\times/g, "×").replace(/\\cdot/g, "·").replace(/\\pm/g, "±").replace(/\\leq/g, "≤").replace(/\\geq/g, "≥").replace(/\\neq/g, "≠").replace(/\\approx/g, "≈").replace(/\\(sup|inf|max|min|log|ln|sin|cos|tan|exp|lim)\b/g, "$1").replace(/\\([{}])/g, "$1").replace(/\\\\/g, " ").replace(/\\[a-zA-Z]+/g, "");
26
- }
27
- function normalizeUnicode(text) {
28
- return text.replace(/[\u2018\u2019\u201A]/g, "'").replace(/[\u201C\u201D\u201E]/g, "\"").replace(/[\u2013\u2014]/g, "-").replace(/[\u00A0\u2009\u200A\u200B\u2007\u202F]/g, " ").replace(/\u2026/g, "...").replace(/\u2022/g, "-");
29
- }
30
- function normalizeForGrep(text) {
31
- return stripHtmlTags(stripLatex(normalizeUnicode(text))).replace(/\s+/g, " ").trim();
32
- }
33
21
  //#endregion
34
- export { hashString, normalizeForGrep, normalizeWhitespace, sanitizeStringArray, slugify };
22
+ export { hashString, normalizeWhitespace, sanitizeStringArray, slugify };