@guidekit/knowledge 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/index.cjs +668 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +121 -0
- package/dist/index.d.ts +121 -0
- package/dist/index.js +657 -0
- package/dist/index.js.map +1 -0
- package/package.json +33 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,657 @@
|
|
|
1
|
+
import { KnowledgeError, ErrorCodes } from '@guidekit/core';
|
|
2
|
+
|
|
3
|
+
// src/knowledge-store.ts
|
|
4
|
+
|
|
5
|
+
// src/chunker.ts
|
|
6
|
+
// Matches an ATX-style Markdown heading: 1-6 leading '#' followed by whitespace.
var HEADING_RE = /^#{1,6}\s+/;
|
|
7
|
+
function isHeadingLine(line) {
  // True when the line is an ATX Markdown heading: 1-6 '#' then whitespace.
  return /^#{1,6}\s+/.test(line);
}
|
|
10
|
+
function extractHeading(line) {
  // Strip the leading '#' run plus its whitespace, then tidy up the title text.
  const withoutMarker = line.replace(/^#{1,6}\s+/, "");
  return withoutMarker.trim();
}
|
|
13
|
+
function normalize(text) {
  // Collapse runs of 3+ newlines down to a single blank line, then trim edges.
  const collapsed = text.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
|
|
16
|
+
function makeChunk(doc, content, index, startOffset, headingContext) {
  // Normalize inline (same rule as normalize()): collapse 3+ newlines, trim.
  const body = content.replace(/\n{3,}/g, "\n\n").trim();
  // Whitespace-only sections produce no chunk at all.
  if (body.length === 0) return null;
  const chunk = {
    id: `${doc.id}:${index}`,
    documentId: doc.id,
    content: body,
    index,
    startOffset,
    // End offset is measured against the RAW content length, not the trimmed body.
    endOffset: startOffset + content.length
  };
  // Only attach headingContext when one was actually supplied.
  if (headingContext !== void 0) chunk.headingContext = headingContext;
  return chunk;
}
|
|
29
|
+
// Split a document into chunks at Markdown heading boundaries. Each chunk
// starts at a heading line (or at the document start for leading content) and
// runs until the next heading; makeChunk drops whitespace-only sections.
function chunkByHeading(doc) {
  const lines = doc.content.split("\n");
  const chunks = [];
  let current = "";      // text accumulated for the in-progress chunk
  let currentStart = 0;  // offset of the first character of `current` in doc.content
  let currentHeading;    // heading text attached to the in-progress chunk (if any)
  let offset = 0;        // running offset into doc.content
  let idx = 0;           // index of the next chunk to emit
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    // Re-attach the "\n" removed by split() so the offsets stay exact;
    // the final line has no trailing newline.
    const lineWithNewline = i < lines.length - 1 ? line + "\n" : line;
    if (isHeadingLine(line)) {
      // A new heading closes the previous chunk (when it has content).
      if (current.length > 0) {
        const chunk = makeChunk(doc, current, idx, currentStart, currentHeading);
        if (chunk) {
          chunks.push(chunk);
          idx++;
        }
      }
      // The next chunk begins with the heading line itself.
      currentHeading = extractHeading(line);
      currentStart = offset;
      current = lineWithNewline;
    } else {
      current += lineWithNewline;
    }
    offset += lineWithNewline.length;
  }
  // Flush whatever remains after the last heading.
  if (current.length > 0) {
    const chunk = makeChunk(doc, current, idx, currentStart, currentHeading);
    if (chunk) chunks.push(chunk);
  }
  return chunks;
}
|
|
62
|
+
// Split a document into one chunk per paragraph (blank-line separated block),
// carrying the most recently seen heading forward as each chunk's context.
function chunkByParagraph(doc) {
  const paragraphs = doc.content.split("\n\n");
  const out = [];
  let cursor = 0;     // running offset into doc.content
  let nextIndex = 0;  // index assigned to the next emitted chunk
  let activeHeading;  // last heading seen in this or any earlier paragraph
  paragraphs.forEach((paragraph, i) => {
    const start = cursor;
    // Advance past the paragraph plus its two-character "\n\n" separator
    // (the final paragraph has no separator).
    cursor += paragraph.length + (i < paragraphs.length - 1 ? 2 : 0);
    for (const line of paragraph.split("\n")) {
      if (isHeadingLine(line)) activeHeading = extractHeading(line);
    }
    const chunk = makeChunk(doc, paragraph, nextIndex, start, activeHeading);
    if (chunk !== null) {
      out.push(chunk);
      nextIndex++;
    }
  });
  return out;
}
|
|
86
|
+
// Split a document into fixed-size character windows. Consecutive windows
// overlap by `overlap` characters; each window's heading context is the last
// heading line that falls inside that window (if any).
function chunkByFixed(doc, chunkSize, overlap) {
  const text = doc.content;
  const out = [];
  // Guard against a non-positive stride (overlap >= chunkSize): fall back to
  // stepping a full chunk so the loop always terminates.
  const step = chunkSize - overlap > 0 ? chunkSize - overlap : chunkSize;
  let index = 0;
  for (let start = 0; start < text.length; start += step) {
    const piece = text.slice(start, Math.min(start + chunkSize, text.length));
    let heading;
    for (const line of piece.split("\n")) {
      if (isHeadingLine(line)) heading = extractHeading(line);
    }
    const chunk = makeChunk(doc, piece, index, start, heading);
    if (chunk) {
      out.push(chunk);
      index++;
    }
  }
  return out;
}
|
|
111
|
+
// Chunk a document with the requested strategy ("heading" | "paragraph" |
// "fixed"); unknown or missing strategies fall back to heading-based chunking.
function chunkDocument(doc, options) {
  const strategy = options?.strategy ?? "heading";
  if (strategy === "paragraph") {
    return chunkByParagraph(doc);
  }
  if (strategy === "fixed") {
    // Defaults: 512-char windows with 64 chars of overlap.
    return chunkByFixed(doc, options?.chunkSize ?? 512, options?.overlap ?? 64);
  }
  return chunkByHeading(doc);
}
|
|
124
|
+
|
|
125
|
+
// src/tokenizer.ts
|
|
126
|
+
// English stopwords removed from both indexed chunks and queries. Grouped by
// rough part of speech for readability; rows are flattened into one Set.
// (The trailing group covers negated-contraction stems left by \W+ splitting,
// e.g. "don't" -> "don" + "t".)
var STOPWORDS = /* @__PURE__ */ new Set(
  [
    "a an the and or but not no nor so yet",
    "is are was were be been being am have has had having do does did doing",
    "will would could should shall may might must can",
    "i me my myself we our ours ourselves you your yours yourself yourselves",
    "he him his himself she her hers herself it its itself",
    "they them their theirs themselves",
    "what which who whom this that these those",
    "if then else when where why how whether",
    "in on at to for from by with about against between through during",
    "before after above below up down out off over under again further",
    "of into as until while among within without",
    "than too very just also now here there",
    "all any both each few more most other some such only own same",
    "much many enough every once twice",
    "already always never often still",
    "because since although though however therefore",
    "either neither nor rather per via",
    "don doesn didn won wouldn couldn shouldn isn aren wasn weren hasn haven hadn"
  ].flatMap((row) => row.split(" "))
);
|
|
300
|
+
function tokenize(text) {
  // Lowercase, then pull out runs of word characters; yields [] when there
  // are no matches (match() returns null on an empty/symbol-only string).
  return text.toLowerCase().match(/\w+/g) ?? [];
}
|
|
303
|
+
function removeStopwords(tokens) {
  // Keep only the tokens that are not in the shared STOPWORDS set.
  const kept = [];
  for (const token of tokens) {
    if (!STOPWORDS.has(token)) kept.push(token);
  }
  return kept;
}
|
|
306
|
+
|
|
307
|
+
// src/bm25.ts
|
|
308
|
+
// In-memory Okapi BM25 index over document chunks.
var BM25Index = class {
  /** BM25 term-frequency saturation parameter (default 1.2). */
  k1;
  /** BM25 length-normalization parameter, 0..1 (default 0.75). */
  b;
  /** Inverted index: term -> (chunkId -> frequency) */
  invertedIndex = /* @__PURE__ */ new Map();
  /** Document length in tokens per chunk */
  docLengths = /* @__PURE__ */ new Map();
  /** Stored chunks */
  chunks = /* @__PURE__ */ new Map();
  /** Track which chunks belong to which document */
  docToChunks = /* @__PURE__ */ new Map();
  /** Running total of all document lengths for avgdl computation */
  totalDocLength = 0;
  constructor(options) {
    this.k1 = options?.k1 ?? 1.2;
    this.b = options?.b ?? 0.75;
  }
  /**
   * Add chunks from a document to the index.
   * Chunks whose id is already indexed are skipped.
   */
  addDocument(chunks) {
    for (const chunk of chunks) {
      if (this.chunks.has(chunk.id)) continue;
      const tokens = removeStopwords(tokenize(chunk.content));
      this.chunks.set(chunk.id, chunk);
      this.docLengths.set(chunk.id, tokens.length);
      this.totalDocLength += tokens.length;
      let chunkSet = this.docToChunks.get(chunk.documentId);
      if (!chunkSet) {
        chunkSet = /* @__PURE__ */ new Set();
        this.docToChunks.set(chunk.documentId, chunkSet);
      }
      chunkSet.add(chunk.id);
      const freqs = /* @__PURE__ */ new Map();
      for (const token of tokens) {
        freqs.set(token, (freqs.get(token) ?? 0) + 1);
      }
      for (const [term, freq] of freqs) {
        let postings = this.invertedIndex.get(term);
        if (!postings) {
          postings = /* @__PURE__ */ new Map();
          this.invertedIndex.set(term, postings);
        }
        postings.set(chunk.id, freq);
      }
    }
  }
  /** Remove all chunks belonging to a document. No-op for unknown ids. */
  removeDocument(documentId) {
    const chunkIds = this.docToChunks.get(documentId);
    if (!chunkIds) return;
    for (const chunkId of chunkIds) {
      const docLen = this.docLengths.get(chunkId) ?? 0;
      this.totalDocLength -= docLen;
      this.docLengths.delete(chunkId);
      this.chunks.delete(chunkId);
    }
    // Single pass over the inverted index (previously one full pass per
    // removed chunk), and prune terms whose postings become empty so the
    // index no longer accumulates dead Maps over add/remove cycles.
    for (const [term, postings] of this.invertedIndex) {
      for (const chunkId of chunkIds) {
        postings.delete(chunkId);
      }
      if (postings.size === 0) {
        this.invertedIndex.delete(term);
      }
    }
    this.docToChunks.delete(documentId);
  }
  /**
   * Search the index. Returns chunks sorted by relevance (descending).
   * Scores each chunk with Okapi BM25 summed over the query terms.
   * @returns {Array<{chunk, score}>} at most `topK` results.
   */
  search(query, topK = 10) {
    const queryTerms = removeStopwords(tokenize(query));
    if (queryTerms.length === 0 || this.size === 0) return [];
    const N = this.size;
    const avgdl = this.totalDocLength / N;
    const scores = /* @__PURE__ */ new Map();
    for (const term of queryTerms) {
      const postings = this.invertedIndex.get(term);
      if (!postings) continue;
      const n = postings.size;
      // Robertson/Sparck-Jones IDF with the "+1" variant to keep it positive.
      const idf = Math.log((N - n + 0.5) / (n + 0.5) + 1);
      for (const [chunkId, freq] of postings) {
        const dl = this.docLengths.get(chunkId) ?? 0;
        // Term-frequency component with saturation (k1) and length norm (b).
        const tf = freq * (this.k1 + 1) / (freq + this.k1 * (1 - this.b + this.b * (dl / avgdl)));
        const prev = scores.get(chunkId) ?? 0;
        scores.set(chunkId, prev + idf * tf);
      }
    }
    const results = [];
    for (const [chunkId, score] of scores) {
      const chunk = this.chunks.get(chunkId);
      results.push({ chunk, score });
    }
    results.sort((a, b) => b.score - a.score);
    return results.slice(0, topK);
  }
  /** Number of chunks in the index. */
  get size() {
    return this.chunks.size;
  }
  /** Clear the entire index. */
  clear() {
    this.invertedIndex.clear();
    this.docLengths.clear();
    this.chunks.clear();
    this.docToChunks.clear();
    this.totalDocLength = 0;
  }
};
|
|
408
|
+
|
|
409
|
+
// src/tfidf.ts
|
|
410
|
+
// In-memory TF-IDF index over document chunks (log-scaled tf, log(N/df) idf).
var TFIDFIndex = class {
  /** Inverted index: term -> (chunkId -> frequency) */
  invertedIndex = /* @__PURE__ */ new Map();
  /** Stored chunks */
  chunks = /* @__PURE__ */ new Map();
  /** Track which chunks belong to which document */
  docToChunks = /* @__PURE__ */ new Map();
  /**
   * Add chunks from a document to the index.
   * Chunks whose id is already indexed are skipped.
   */
  addDocument(chunks) {
    for (const chunk of chunks) {
      if (this.chunks.has(chunk.id)) continue;
      const tokens = removeStopwords(tokenize(chunk.content));
      this.chunks.set(chunk.id, chunk);
      let chunkSet = this.docToChunks.get(chunk.documentId);
      if (!chunkSet) {
        chunkSet = /* @__PURE__ */ new Set();
        this.docToChunks.set(chunk.documentId, chunkSet);
      }
      chunkSet.add(chunk.id);
      const freqs = /* @__PURE__ */ new Map();
      for (const token of tokens) {
        freqs.set(token, (freqs.get(token) ?? 0) + 1);
      }
      for (const [term, freq] of freqs) {
        let postings = this.invertedIndex.get(term);
        if (!postings) {
          postings = /* @__PURE__ */ new Map();
          this.invertedIndex.set(term, postings);
        }
        postings.set(chunk.id, freq);
      }
    }
  }
  /** Remove all chunks belonging to a document. No-op for unknown ids. */
  removeDocument(documentId) {
    const chunkIds = this.docToChunks.get(documentId);
    if (!chunkIds) return;
    for (const chunkId of chunkIds) {
      this.chunks.delete(chunkId);
    }
    // Single pass over the inverted index (previously one full pass per
    // removed chunk), and prune terms whose postings become empty so the
    // index no longer accumulates dead Maps over add/remove cycles.
    for (const [term, postings] of this.invertedIndex) {
      for (const chunkId of chunkIds) {
        postings.delete(chunkId);
      }
      if (postings.size === 0) {
        this.invertedIndex.delete(term);
      }
    }
    this.docToChunks.delete(documentId);
  }
  /**
   * Search the index. Returns chunks sorted by relevance (descending).
   * Score = sum over query terms of (1 + log(tf)) * log(N / df).
   * @returns {Array<{chunk, score}>} at most `topK` results.
   */
  search(query, topK = 10) {
    const queryTerms = removeStopwords(tokenize(query));
    if (queryTerms.length === 0 || this.size === 0) return [];
    const N = this.size;
    const scores = /* @__PURE__ */ new Map();
    for (const term of queryTerms) {
      const postings = this.invertedIndex.get(term);
      if (!postings) continue;
      const df = postings.size;
      // Note: a term present in every chunk gets idf 0 (still listed, score 0).
      const idf = Math.log(N / df);
      for (const [chunkId, freq] of postings) {
        const tf = 1 + Math.log(freq);
        const prev = scores.get(chunkId) ?? 0;
        scores.set(chunkId, prev + tf * idf);
      }
    }
    const results = [];
    for (const [chunkId, score] of scores) {
      const chunk = this.chunks.get(chunkId);
      results.push({ chunk, score });
    }
    results.sort((a, b) => b.score - a.score);
    return results.slice(0, topK);
  }
  /** Number of chunks in the index. */
  get size() {
    return this.chunks.size;
  }
  /** Clear the entire index. */
  clear() {
    this.invertedIndex.clear();
    this.chunks.clear();
    this.docToChunks.clear();
  }
};
|
|
491
|
+
|
|
492
|
+
// src/attribution.ts
|
|
493
|
+
// Build a source-attribution record for a scored chunk; the quoted excerpt is
// capped at 200 characters with a "..." marker when truncated.
function buildAttribution(chunk, score, title) {
  let excerpt = chunk.content;
  if (excerpt.length > 200) {
    excerpt = excerpt.slice(0, 200) + "...";
  }
  return {
    documentId: chunk.documentId,
    chunkId: chunk.id,
    title,
    relevanceScore: score,
    excerpt
  };
}
|
|
504
|
+
// Render a markdown "Sources" footer: the top ten results by score, each with
// title, two-decimal relevance, and an excerpt capped at 100 characters.
function formatAttributions(results) {
  if (results.length === 0) return "";
  const top = results
    .slice()
    .sort((x, y) => y.score - x.score)
    .slice(0, 10);
  const entries = top.map((result, position) => {
    const { title, excerpt } = result.source;
    const quoted = excerpt.length > 100 ? `${excerpt.slice(0, 100)}...` : excerpt;
    return `[${position + 1}] *${title}* (relevance: ${result.score.toFixed(2)}) \u2014 "${quoted}"`;
  });
  return `**Sources:**\n${entries.join("\n")}`;
}
|
|
514
|
+
|
|
515
|
+
// src/knowledge-store.ts
|
|
516
|
+
// In-memory knowledge base: chunks documents and keeps parallel BM25 and
// TF-IDF indexes so either engine can serve a search.
var KnowledgeStore = class {
  /** Resolved options with all defaults applied. */
  options;
  /** documentId -> stored document (with its computed chunks attached). */
  documents = /* @__PURE__ */ new Map();
  bm25 = new BM25Index();
  tfidf = new TFIDFIndex();
  /** Total chunk count across all stored documents (quota accounting). */
  totalChunks = 0;
  /**
   * @param {object} [options] - engine ("bm25" | "tfidf"), maxDocuments,
   *   maxTotalChunks, topK, chunker options, persistConsent.
   */
  constructor(options) {
    this.options = {
      engine: options?.engine ?? "bm25",
      maxDocuments: options?.maxDocuments ?? 100,
      maxTotalChunks: options?.maxTotalChunks ?? 5e3,
      topK: options?.topK ?? 5,
      chunker: options?.chunker
    };
    if (options?.persistConsent) {
      console.warn(
        "[GuideKit] KnowledgeStore persistence via IndexedDB is not yet implemented. Data is in-memory only."
      );
    }
  }
  /**
   * Add a document. Chunks it and indexes all chunks.
   * Re-adding an id that is already stored replaces the previous version.
   * (Previously the old chunks stayed in the indexes — the per-chunk id
   * dedupe skipped the new ones — while totalChunks was still incremented,
   * corrupting the quota accounting.)
   * @throws {KnowledgeError} KNOWLEDGE_STORE_QUOTA when a limit would be exceeded.
   */
  addDocument(doc) {
    // Replace-in-place so quota checks and index contents stay consistent.
    if (this.documents.has(doc.id)) {
      this.removeDocument(doc.id);
    }
    if (this.documents.size >= this.options.maxDocuments) {
      throw new KnowledgeError({
        code: ErrorCodes.KNOWLEDGE_STORE_QUOTA,
        message: `Maximum document limit (${this.options.maxDocuments}) reached`,
        suggestion: "Remove unused documents before adding new ones."
      });
    }
    const chunks = chunkDocument(doc, this.options.chunker);
    if (this.totalChunks + chunks.length > this.options.maxTotalChunks) {
      throw new KnowledgeError({
        code: ErrorCodes.KNOWLEDGE_STORE_QUOTA,
        message: `Adding ${chunks.length} chunks would exceed the total chunk limit (${this.options.maxTotalChunks})`,
        suggestion: "Remove documents or increase maxTotalChunks."
      });
    }
    const storedDoc = { ...doc, chunks };
    this.documents.set(doc.id, storedDoc);
    this.bm25.addDocument(chunks);
    this.tfidf.addDocument(chunks);
    this.totalChunks += chunks.length;
  }
  /** Remove a document and its chunks from both indexes. No-op if absent. */
  removeDocument(id) {
    const doc = this.documents.get(id);
    if (!doc) return;
    const chunkCount = doc.chunks?.length ?? 0;
    this.bm25.removeDocument(id);
    this.tfidf.removeDocument(id);
    this.documents.delete(id);
    this.totalChunks -= chunkCount;
  }
  /** Update a document (remove + re-add). */
  updateDocument(id, doc) {
    this.removeDocument(id);
    this.addDocument(doc);
  }
  /**
   * Search the knowledge base.
   * Fetches all scored chunks from the chosen engine, applies the optional
   * documentIds / minScore filters, truncates to topK, and attaches a
   * source attribution (using the owning document's title when available).
   * @returns {Array<{chunk, score, source}>}
   */
  search(query, options) {
    const engine = options?.engine ?? this.options.engine;
    const topK = options?.topK ?? this.options.topK;
    const index = engine === "tfidf" ? this.tfidf : this.bm25;
    // Ask for every chunk so post-filtering cannot starve the topK cut.
    let scored = index.search(query, this.totalChunks || 1);
    if (options?.documentIds && options.documentIds.length > 0) {
      const allowed = new Set(options.documentIds);
      scored = scored.filter((s) => allowed.has(s.chunk.documentId));
    }
    if (options?.minScore !== void 0) {
      scored = scored.filter((s) => s.score >= options.minScore);
    }
    scored = scored.slice(0, topK);
    return scored.map((s) => {
      const doc = this.documents.get(s.chunk.documentId);
      const title = doc?.title ?? s.chunk.documentId;
      return {
        chunk: s.chunk,
        score: s.score,
        source: buildAttribution(s.chunk, s.score, title)
      };
    });
  }
  /** Get a document by ID. */
  getDocument(id) {
    return this.documents.get(id);
  }
  /** Get all document IDs. */
  getDocumentIds() {
    return [...this.documents.keys()];
  }
  /** Clear all documents and indexes. */
  clear() {
    this.documents.clear();
    this.bm25.clear();
    this.tfidf.clear();
    this.totalChunks = 0;
  }
  /** Get store statistics. */
  getStats() {
    return {
      documentCount: this.documents.size,
      chunkCount: this.totalChunks
    };
  }
};
|
|
621
|
+
|
|
622
|
+
// src/context-provider.ts
|
|
623
|
+
// Build a query -> markdown-context function backed by a KnowledgeStore.
// Fits as many result chunks as the (approximate) token budget allows, then
// appends a source-attribution footer. Returns "" when nothing fits.
function createKnowledgeContextProvider(store, options) {
  const tokenBudget = options?.tokenBudget ?? 500;
  const searchOptions = options?.searchOptions;
  const header = options?.header ?? "Relevant Knowledge";
  // Rough heuristic: ~4 characters per token.
  const maxChars = tokenBudget * 4;
  return (query) => {
    const results = store.search(query, searchOptions);
    if (results.length === 0) return "";
    const sectionHeader = `## ${header}\n\n`;
    const attributionFooter = `\n\n${formatAttributions(results)}`;
    // Whatever the header + footer don't consume is available for chunk bodies.
    let remaining = maxChars - (sectionHeader.length + attributionFooter.length);
    const included = [];
    for (const { chunk } of results) {
      // Two extra characters pay for the "\n\n" joiner between chunks.
      const cost = chunk.content.length + (included.length > 0 ? 2 : 0);
      if (cost > remaining) break;
      included.push(chunk.content);
      remaining -= cost;
    }
    if (included.length === 0) return "";
    return sectionHeader + included.join("\n\n") + attributionFooter;
  };
}
|
|
651
|
+
|
|
652
|
+
// src/index.ts
|
|
653
|
+
// Package version string (also exported from the public API below).
var KNOWLEDGE_VERSION = "0.1.0";
|
|
654
|
+
|
|
655
|
+
export { BM25Index, KNOWLEDGE_VERSION, KnowledgeStore, TFIDFIndex, buildAttribution, chunkDocument, createKnowledgeContextProvider, formatAttributions, removeStopwords, tokenize };
|
|
656
|
+
//# sourceMappingURL=index.js.map
|
|
657
|
+
//# sourceMappingURL=index.js.map
|