@operor/knowledge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,357 @@
1
+ import { randomUUID, createHash } from 'node:crypto';
2
+ import type { KBDocument, KBChunk, KnowledgeStore } from './types.js';
3
+ import type { EmbeddingService } from './EmbeddingService.js';
4
+ import type { TextChunker } from './TextChunker.js';
5
+ import type { SQLiteKnowledgeStore } from './SQLiteKnowledgeStore.js';
6
+ import { normalizeQuery } from './QueryNormalizer.js';
7
+
8
+ export interface ContentReformatter {
9
+ complete(messages: { role: 'system' | 'user'; content: string }[]): Promise<{ text: string }>;
10
+ }
11
+
12
+ export interface IngestInput {
13
+ sourceType: KBDocument['sourceType'];
14
+ content: string;
15
+ title?: string;
16
+ sourceUrl?: string;
17
+ fileName?: string;
18
+ metadata?: Record<string, any>;
19
+ isMarkdown?: boolean;
20
+ /** Opt-in to LLM Q&A extraction (expensive). Default: false (chunking path). */
21
+ extractQA?: boolean;
22
+ /** Document priority: 1=official, 2=supplementary, 3=archived. Auto-assigned if omitted. */
23
+ priority?: number;
24
+ }
25
+
26
+ export interface IngestFaqOptions {
27
+ sourceUrl?: string;
28
+ [key: string]: any;
29
+ }
30
+
31
+ export interface IngestFaqResult extends KBDocument {
32
+ existingMatch?: { id: string; question: string; answer: string; score: number };
33
+ }
34
+
35
+ export interface RebuildResult {
36
+ documentsRebuilt: number;
37
+ chunksRebuilt: number;
38
+ oldDimensions: number;
39
+ newDimensions: number;
40
+ }
41
+
42
+ export class IngestionPipeline {
43
+ private store: KnowledgeStore;
44
+ private embedder: EmbeddingService;
45
+ private chunker: TextChunker;
46
+ private llmProvider?: ContentReformatter;
47
+
48
+ constructor(store: KnowledgeStore, embedder: EmbeddingService, chunker: TextChunker, llmProvider?: ContentReformatter) {
49
+ this.store = store;
50
+ this.embedder = embedder;
51
+ this.chunker = chunker;
52
+ this.llmProvider = llmProvider;
53
+ }
54
+
55
+ private cleanContent(text: string): string {
56
+ return text
57
+ .replace(/!\[.*?\]\(.*?\)/g, '') // strip image markdown
58
+ .replace(/[ \t]+/g, ' ') // collapse horizontal whitespace
59
+ .replace(/(\n\s*){3,}/g, '\n\n') // collapse 3+ newlines to 2
60
+ .split('\n').filter((line, i, arr) => i === 0 || line !== arr[i - 1]).join('\n') // dedup consecutive identical lines
61
+ .trim()
62
+ .slice(0, 15000);
63
+ }
64
+
65
+ private async extractQAPairs(content: string, title?: string): Promise<Array<{ question: string; answer: string }>> {
66
+ const cleaned = this.cleanContent(content);
67
+ const titleHint = title ? `\nPage title: "${title}"` : '';
68
+ const response = await this.llmProvider!.complete([
69
+ {
70
+ role: 'system',
71
+ content: `You extract self-contained Q&A pairs from web page content. Each answer must include ALL relevant details (names, numbers, prices, dates) so it can be understood without the original page. Output ONLY a JSON array of {"question":"...","answer":"..."} objects. No markdown fences.`,
72
+ },
73
+ {
74
+ role: 'user',
75
+ content: `Extract Q&A pairs from this content.${titleHint}\n\n${cleaned}`,
76
+ },
77
+ ]);
78
+
79
+ try {
80
+ // Strip markdown fences if present
81
+ const text = response.text.replace(/^```(?:json)?\s*/m, '').replace(/\s*```\s*$/m, '').trim();
82
+ const parsed = JSON.parse(text);
83
+ if (Array.isArray(parsed)) return parsed.filter((p: any) => p.question && p.answer);
84
+ } catch {
85
+ // Try to find JSON array in response
86
+ const match = response.text.match(/\[[\s\S]*\]/);
87
+ if (match) {
88
+ try {
89
+ const parsed = JSON.parse(match[0]);
90
+ if (Array.isArray(parsed)) return parsed.filter((p: any) => p.question && p.answer);
91
+ } catch { /* fall through */ }
92
+ }
93
+ }
94
+ return [];
95
+ }
96
+
97
+ private computeHash(content: string): string {
98
+ return createHash('sha256').update(content).digest('hex');
99
+ }
100
+
101
+ async ingest(input: IngestInput): Promise<KBDocument> {
102
+ if (!input.content || input.content.trim().length === 0) {
103
+ throw new Error(`No content to ingest for "${input.title || input.sourceUrl || 'unknown'}"`);
104
+ }
105
+
106
+ const sqliteStore = this.store as SQLiteKnowledgeStore;
107
+ const cleaned = this.cleanContent(input.content);
108
+ const contentHash = this.computeHash(cleaned);
109
+
110
+ // Auto-assign priority by source type if not specified
111
+ const priority = input.priority ?? (input.sourceType === 'faq' ? 1 : 2);
112
+
113
+ // Dedup: check by source URL first
114
+ if (input.sourceUrl && sqliteStore.findBySourceUrl) {
115
+ const existing = await sqliteStore.findBySourceUrl(input.sourceUrl);
116
+ if (existing) {
117
+ // Update existing document instead of duplicating
118
+ await sqliteStore.updateDocument(existing.id, {
119
+ content: input.content,
120
+ title: input.title,
121
+ contentHash,
122
+ priority,
123
+ metadata: input.metadata,
124
+ });
125
+ // Delete old chunks and re-chunk
126
+ await this.store.deleteDocument(existing.id);
127
+ // deleteDocument removes the document and its chunks entirely, so fall through
128
+ // and recreate it below as a fresh document with the updated content
129
+ }
130
+ }
131
+
132
+ // Dedup: check by content hash
133
+ if (sqliteStore.findByContentHash) {
134
+ const existing = await sqliteStore.findByContentHash(contentHash);
135
+ if (existing) {
136
+ console.log(`[KB] Duplicate content detected (hash match), skipping: "${input.title || input.sourceUrl || 'unknown'}"`);
137
+ return existing;
138
+ }
139
+ }
140
+
141
+ // LLM Q&A extraction: only when explicitly opted in
142
+ if (input.extractQA && this.llmProvider) {
143
+ const pairs = await this.extractQAPairs(input.content, input.title);
144
+ if (pairs.length > 0) {
145
+ for (const pair of pairs) {
146
+ await this.ingestFaq(pair.question, pair.answer, { sourceUrl: input.sourceUrl });
147
+ }
148
+ const now = Date.now();
149
+ const parentDoc: KBDocument = {
150
+ id: randomUUID(),
151
+ sourceType: input.sourceType,
152
+ sourceUrl: input.sourceUrl,
153
+ fileName: input.fileName,
154
+ title: input.title,
155
+ content: `Extracted ${pairs.length} Q&A pairs`,
156
+ metadata: { ...input.metadata, faqCount: pairs.length },
157
+ createdAt: now,
158
+ updatedAt: now,
159
+ priority,
160
+ contentHash,
161
+ };
162
+ await this.store.addDocument(parentDoc);
163
+ return parentDoc;
164
+ }
165
+ }
166
+
167
+ // Default path: chunk content → embed → store
168
+ const now = Date.now();
169
+ const doc: KBDocument = {
170
+ id: randomUUID(),
171
+ sourceType: input.sourceType,
172
+ sourceUrl: input.sourceUrl,
173
+ fileName: input.fileName,
174
+ title: input.title,
175
+ content: input.content,
176
+ metadata: input.metadata,
177
+ createdAt: now,
178
+ updatedAt: now,
179
+ priority,
180
+ contentHash,
181
+ };
182
+
183
+ await this.store.addDocument(doc);
184
+
185
+ // Use MarkdownTextSplitter for URL content or explicit markdown
186
+ const useMarkdown = input.isMarkdown || input.sourceType === 'url';
187
+ const texts = useMarkdown
188
+ ? await this.chunker.chunkMarkdown(input.content)
189
+ : await this.chunker.chunk(input.content);
190
+ const embeddings = await this.embedder.embedMany(texts);
191
+
192
+ const chunks: KBChunk[] = texts.map((text, i) => ({
193
+ id: randomUUID(),
194
+ documentId: doc.id,
195
+ content: text,
196
+ chunkIndex: i,
197
+ embedding: embeddings[i],
198
+ metadata: input.metadata,
199
+ }));
200
+
201
+ await this.store.addChunks(chunks);
202
+
203
+ if (this.store.getChunkCount) {
204
+ const storedCount = this.store.getChunkCount(doc.id);
205
+ if (storedCount === 0) {
206
+ console.warn(`[KB] WARNING: Document "${input.title || doc.id}" was saved but NO vector embeddings were stored.`);
207
+ }
208
+ }
209
+
210
+ return doc;
211
+ }
212
+
213
+ async ingestFaq(question: string, answer: string, metadata?: Record<string, any> & { forceReplace?: boolean }): Promise<IngestFaqResult> {
214
+ const embedding = await this.embedder.embed(normalizeQuery(question));
215
+ const sqliteStore = this.store as SQLiteKnowledgeStore;
216
+
217
+ // FAQ dedup: check for similar existing FAQ
218
+ if (sqliteStore.findSimilarFaq && !metadata?.forceReplace) {
219
+ const match = await sqliteStore.findSimilarFaq(embedding, 0.90);
220
+ if (match) {
221
+ const existingQ = match.chunk.metadata?.question || match.document.title;
222
+ const existingA = match.chunk.metadata?.answer;
223
+ // Return the new doc with existingMatch info so caller can decide
224
+ const now = Date.now();
225
+ const content = `Q: ${question}\nA: ${answer}`;
226
+ const doc: IngestFaqResult = {
227
+ id: randomUUID(),
228
+ sourceType: 'faq',
229
+ sourceUrl: metadata?.sourceUrl,
230
+ title: question,
231
+ content,
232
+ metadata: { ...metadata, question, answer },
233
+ priority: 1,
234
+ createdAt: now,
235
+ updatedAt: now,
236
+ existingMatch: { id: match.document.id, question: existingQ, answer: existingA, score: match.score },
237
+ };
238
+ return doc;
239
+ }
240
+ }
241
+
242
+ // If forceReplace, delete the existing FAQ first (caller provides the ID via metadata)
243
+ if (metadata?.forceReplace && metadata?.replaceId) {
244
+ await this.store.deleteDocument(metadata.replaceId);
245
+ }
246
+
247
+ const now = Date.now();
248
+ const content = `Q: ${question}\nA: ${answer}`;
249
+ const doc: IngestFaqResult = {
250
+ id: randomUUID(),
251
+ sourceType: 'faq',
252
+ sourceUrl: metadata?.sourceUrl,
253
+ title: question,
254
+ content,
255
+ metadata: { ...metadata, question, answer },
256
+ priority: 1,
257
+ createdAt: now,
258
+ updatedAt: now,
259
+ };
260
+
261
+ await this.store.addDocument(doc);
262
+
263
+ const chunk: KBChunk = {
264
+ id: randomUUID(),
265
+ documentId: doc.id,
266
+ content,
267
+ chunkIndex: 0,
268
+ embedding,
269
+ metadata: { question, answer },
270
+ };
271
+
272
+ await this.store.addChunks([chunk]);
273
+ return doc;
274
+ }
275
+
276
+ /**
277
+ * Rebuild all vector embeddings using the current embedding provider.
278
+ * Preserves all document content, chunks, and FTS data — only replaces vectors.
279
+ *
280
+ * Requires the store to be a SQLiteKnowledgeStore (uses rebuild-specific methods).
281
+ */
282
+ async rebuild(onProgress?: (current: number, total: number, docTitle: string) => void): Promise<RebuildResult> {
283
+ const sqliteStore = this.store as SQLiteKnowledgeStore;
284
+ if (!sqliteStore.getAllChunks || !sqliteStore.rebuildVecTable || !sqliteStore.batchInsertEmbeddings) {
285
+ throw new Error('Rebuild requires a SQLiteKnowledgeStore with rebuild methods.');
286
+ }
287
+
288
+ const oldDimensions = sqliteStore.getDimensions();
289
+ const newDimensions = this.embedder.dimensions;
290
+
291
+ // Get all documents (for sourceType lookup) and all chunks
292
+ const documents = await this.store.listDocuments();
293
+ const docMap = new Map(documents.map(d => [d.id, d]));
294
+ const allChunks = sqliteStore.getAllChunks();
295
+
296
+ if (allChunks.length === 0) {
297
+ return { documentsRebuilt: 0, chunksRebuilt: 0, oldDimensions, newDimensions };
298
+ }
299
+
300
+ // Drop and recreate vec_chunks with new dimensions
301
+ sqliteStore.rebuildVecTable(newDimensions);
302
+
303
+ // Group chunks by document for progress reporting
304
+ const chunksByDoc = new Map<string, typeof allChunks>();
305
+ for (const chunk of allChunks) {
306
+ const list = chunksByDoc.get(chunk.documentId) || [];
307
+ list.push(chunk);
308
+ chunksByDoc.set(chunk.documentId, list);
309
+ }
310
+
311
+ let processedDocs = 0;
312
+ const totalDocs = chunksByDoc.size;
313
+ let totalChunksRebuilt = 0;
314
+
315
+ for (const [docId, chunks] of chunksByDoc) {
316
+ const doc = docMap.get(docId);
317
+ const docTitle = doc?.title || docId.slice(0, 8);
318
+
319
+ onProgress?.(processedDocs, totalDocs, docTitle);
320
+
321
+ // Determine what text to embed per chunk
322
+ const textsToEmbed: string[] = [];
323
+ for (const chunk of chunks) {
324
+ if (doc?.sourceType === 'faq') {
325
+ // FAQ: embed the normalized question for consistent matching
326
+ const meta = chunk.metadata ? JSON.parse(chunk.metadata) : null;
327
+ const question = meta?.question || doc.title || chunk.content;
328
+ textsToEmbed.push(normalizeQuery(question));
329
+ } else {
330
+ textsToEmbed.push(chunk.content);
331
+ }
332
+ }
333
+
334
+ // Embed all chunks for this document in one batch
335
+ const embeddings = await this.embedder.embedMany(textsToEmbed);
336
+
337
+ // Insert new embeddings
338
+ const items = chunks.map((chunk, i) => ({
339
+ chunkId: chunk.id,
340
+ embedding: embeddings[i],
341
+ }));
342
+ sqliteStore.batchInsertEmbeddings(items);
343
+
344
+ totalChunksRebuilt += chunks.length;
345
+ processedDocs++;
346
+ }
347
+
348
+ onProgress?.(totalDocs, totalDocs, 'done');
349
+
350
+ return {
351
+ documentsRebuilt: totalDocs,
352
+ chunksRebuilt: totalChunksRebuilt,
353
+ oldDimensions,
354
+ newDimensions,
355
+ };
356
+ }
357
+ }
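
For orientation, here is a minimal usage sketch of the pipeline above. The import path and the store/embedder/chunker wiring are assumptions for illustration; only the IngestionPipeline constructor and the ingest, ingestFaq, and rebuild calls mirror the code in this diff.

import type { KnowledgeStore } from './types.js';
import type { EmbeddingService } from './EmbeddingService.js';
import type { TextChunker } from './TextChunker.js';
// Import path assumed from the package's other relative imports.
import { IngestionPipeline } from './IngestionPipeline.js';

async function ingestExample(store: KnowledgeStore, embedder: EmbeddingService, chunker: TextChunker) {
  const pipeline = new IngestionPipeline(store, embedder, chunker);

  // Default path: content is cleaned, hashed for dedup, chunked, embedded, and stored.
  const doc = await pipeline.ingest({
    sourceType: 'url',
    sourceUrl: 'https://example.com/pricing',
    title: 'Pricing',
    content: '# Pricing\n\nThe basic plan costs $10/month.',
    priority: 1, // official source
  });

  // FAQ path: the question is embedded after normalizeQuery(); a near-duplicate
  // (similarity >= 0.90) is reported via existingMatch instead of being stored twice.
  const faq = await pipeline.ingestFaq('What are your opening hours?', 'We are open 9am to 5pm, Monday to Friday.');
  if (faq.existingMatch) {
    console.log('Similar FAQ already stored:', faq.existingMatch.question);
  }

  // After switching embedding providers, rebuild() replaces every stored vector in place.
  // Requires the SQLite-backed store with the rebuild methods; it throws otherwise.
  const result = await pipeline.rebuild((done, total, title) => console.log(`${done}/${total} ${title}`));
  console.log(`Rebuilt ${result.chunksRebuilt} chunks at ${result.newDimensions} dimensions`);

  return doc;
}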
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Query normalization for improved KB retrieval.
3
+ * Expands chat abbreviations and normalizes whitespace before embedding.
4
+ */
5
+
6
+ const ABBREVIATIONS: [RegExp, string][] = [
7
+ [/\bu\b/gi, 'you'],
8
+ [/\bur\b/gi, 'your'],
9
+ [/\br\b/gi, 'are'],
10
+ [/\bpls\b/gi, 'please'],
11
+ [/\bplz\b/gi, 'please'],
12
+ [/\bthx\b/gi, 'thanks'],
13
+ [/\bthnx\b/gi, 'thanks'],
14
+ [/\bty\b/gi, 'thank you'],
15
+ [/\bwat\b/gi, 'what'],
16
+ [/\bbc\b/gi, 'because'],
17
+ [/\bcuz\b/gi, 'because'],
18
+ [/\bgonna\b/gi, 'going to'],
19
+ [/\bwanna\b/gi, 'want to'],
20
+ [/\bgotta\b/gi, 'got to'],
21
+ [/\blemme\b/gi, 'let me'],
22
+ [/\bgimme\b/gi, 'give me'],
23
+ [/\bdunno\b/gi, 'do not know'],
24
+ [/\bhrs\b/gi, 'hours'],
25
+ [/\bmins\b/gi, 'minutes'],
26
+ [/\bmsg\b/gi, 'message'],
27
+ [/\bmsgs\b/gi, 'messages'],
28
+ [/\binfo\b/gi, 'information'],
29
+ [/\btmr\b/gi, 'tomorrow'],
30
+ [/\btmrw\b/gi, 'tomorrow'],
31
+ [/\bw\/o\b/gi, 'without'],
32
+ [/\bw\/(?=\s|$)/gi, 'with'],
33
+ [/\bidk\b/gi, 'I do not know'],
34
+ [/\bimo\b/gi, 'in my opinion'],
35
+ [/\bbtw\b/gi, 'by the way'],
36
+ [/\basap\b/gi, 'as soon as possible'],
37
+ // Digit substitutions — word-boundary aware
38
+ [/\b4\b/g, 'for'],
39
+ [/\b2\b/g, 'to'],
40
+ ];
41
+
42
+ /**
43
+ * Normalize a user query for better embedding similarity.
44
+ * - Expands chat abbreviations with word-boundary awareness
45
+ * - Lowercases
46
+ * - Collapses whitespace
47
+ */
48
+ export function normalizeQuery(query: string): string {
49
+ let normalized = query.toLowerCase();
50
+
51
+ for (const [pattern, replacement] of ABBREVIATIONS) {
52
+ normalized = normalized.replace(pattern, replacement);
53
+ }
54
+
55
+ // Collapse whitespace
56
+ normalized = normalized.replace(/\s+/g, ' ').trim();
57
+
58
+ return normalized;
59
+ }
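
As a quick illustration, tracing the substitution table above on a typical chat-style query (the expected output follows directly from the rules):

import { normalizeQuery } from './QueryNormalizer.js';

// 'pls' -> 'please', 'ur' -> 'your', 'hrs' -> 'hours', '4' -> 'for', 'tmrw' -> 'tomorrow';
// the input is lowercased and extra whitespace is collapsed before embedding.
console.log(normalizeQuery('Pls tell me ur hrs   4 tmrw'));
// "please tell me your hours for tomorrow"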
@@ -0,0 +1,73 @@
1
+ import type { LanguageModelV1 } from 'ai';
2
+ import { generateText } from 'ai';
3
+
4
+ const SYSTEM_PROMPT =
5
+ 'You are a query normalizer. Rewrite the following informal/casual query into a clear, well-formed question. Only output the rewritten question, nothing else.';
6
+
7
+ const MAX_CACHE_SIZE = 1000;
8
+
9
+ export interface QueryRewriterOptions {
10
+ model: LanguageModelV1;
11
+ maxCacheSize?: number;
12
+ }
13
+
14
+ export interface RewriteResult {
15
+ original: string;
16
+ rewritten: string;
17
+ cached: boolean;
18
+ tokenUsage?: { prompt: number; completion: number };
19
+ }
20
+
21
+ export class QueryRewriter {
22
+ private model: LanguageModelV1;
23
+ private cache: Map<string, string>;
24
+ private maxCacheSize: number;
25
+
26
+ constructor(options: QueryRewriterOptions) {
27
+ this.model = options.model;
28
+ this.cache = new Map();
29
+ this.maxCacheSize = options.maxCacheSize ?? MAX_CACHE_SIZE;
30
+ }
31
+
32
+ async rewrite(query: string): Promise<RewriteResult> {
33
+ const cacheKey = query.toLowerCase().trim();
34
+
35
+ // Check cache first
36
+ const cached = this.cache.get(cacheKey);
37
+ if (cached) {
38
+ return { original: query, rewritten: cached, cached: true };
39
+ }
40
+
41
+ const { text, usage } = await generateText({
42
+ model: this.model,
43
+ system: SYSTEM_PROMPT,
44
+ prompt: query,
45
+ });
46
+
47
+ const rewritten = text.trim();
48
+
49
+ // Bound the cache: evict the oldest inserted entry at capacity (insertion order; hits do not refresh recency)
50
+ if (this.cache.size >= this.maxCacheSize) {
51
+ const oldest = this.cache.keys().next().value!;
52
+ this.cache.delete(oldest);
53
+ }
54
+ this.cache.set(cacheKey, rewritten);
55
+
56
+ return {
57
+ original: query,
58
+ rewritten,
59
+ cached: false,
60
+ tokenUsage: usage
61
+ ? { prompt: usage.promptTokens, completion: usage.completionTokens }
62
+ : undefined,
63
+ };
64
+ }
65
+
66
+ get cacheSize(): number {
67
+ return this.cache.size;
68
+ }
69
+
70
+ clearCache(): void {
71
+ this.cache.clear();
72
+ }
73
+ }
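
A short usage sketch, assuming an AI SDK provider such as @ai-sdk/openai supplies the LanguageModelV1 instance. The provider, model name, and file path are assumptions; the rewriter itself accepts any LanguageModelV1.

import { openai } from '@ai-sdk/openai'; // any LanguageModelV1-compatible provider works
import { QueryRewriter } from './QueryRewriter.js'; // file path assumed

async function rewriteExample() {
  const rewriter = new QueryRewriter({ model: openai('gpt-4o-mini'), maxCacheSize: 500 });

  const first = await rewriter.rewrite('wat time u open tmrw??');
  console.log(first.rewritten, first.cached); // e.g. "What time do you open tomorrow?" false

  // The same query again (lowercased/trimmed cache key) is served from the in-memory cache.
  const second = await rewriter.rewrite('wat time u open tmrw??');
  console.log(second.cached); // true
}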
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Reciprocal Rank Fusion (RRF) for combining multiple ranked result sets.
3
+ * Standard technique for hybrid search (vector + keyword).
4
+ */
5
+
6
+ /**
7
+ * Fuse multiple ranked result sets using Reciprocal Rank Fusion.
8
+ *
9
+ * @param resultSets - Array of Maps where key = item ID, value = rank (0-based)
10
+ * @param k - Smoothing constant (default 60, industry standard)
11
+ * @returns Map of item ID → fused RRF score, sorted descending by score
12
+ */
13
+ export function reciprocalRankFusion(
14
+ resultSets: Map<string, number>[],
15
+ k: number = 60,
16
+ ): Map<string, number> {
17
+ const scores = new Map<string, number>();
18
+
19
+ for (const rankMap of resultSets) {
20
+ for (const [id, rank] of rankMap) {
21
+ const prev = scores.get(id) ?? 0;
22
+ scores.set(id, prev + 1 / (k + rank));
23
+ }
24
+ }
25
+
26
+ // Sort by score descending
27
+ const sorted = new Map(
28
+ [...scores.entries()].sort((a, b) => b[1] - a[1]),
29
+ );
30
+
31
+ return sorted;
32
+ }
33
+
34
+ /**
35
+ * Weighted Score Fusion: combine vector and keyword scores using weighted average.
36
+ * BM25 scores are min-max normalized to 0-1 before combining.
37
+ *
38
+ * @returns Map of item ID → fused score, sorted descending
39
+ */
40
+ export function weightedScoreFusion(
41
+ vectorResults: { id: string; score: number }[],
42
+ keywordResults: { id: string; score: number }[],
43
+ vectorWeight: number = 0.7,
44
+ keywordWeight: number = 0.3,
45
+ ): Map<string, number> {
46
+ // Min-max normalize BM25 scores to 0-1
47
+ const bm25Scores = new Map<string, number>();
48
+ if (keywordResults.length > 0) {
49
+ const scores = keywordResults.map(r => r.score);
50
+ const min = Math.min(...scores);
51
+ const max = Math.max(...scores);
52
+ const range = max - min || 1;
53
+ for (const r of keywordResults) {
54
+ bm25Scores.set(r.id, (r.score - min) / range);
55
+ }
56
+ }
57
+
58
+ const vecScores = new Map<string, number>();
59
+ for (const r of vectorResults) vecScores.set(r.id, r.score);
60
+
61
+ // Combine all IDs
62
+ const allIds = new Set([...vecScores.keys(), ...bm25Scores.keys()]);
63
+ const fused = new Map<string, number>();
64
+
65
+ for (const id of allIds) {
66
+ const vs = vecScores.get(id) ?? 0;
67
+ const ks = bm25Scores.get(id) ?? 0;
68
+ fused.set(id, vectorWeight * vs + keywordWeight * ks);
69
+ }
70
+
71
+ return new Map([...fused.entries()].sort((a, b) => b[1] - a[1]));
72
+ }
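
To show how the two helpers might be wired into a hybrid search path, a small sketch follows. The result shapes, ids, and file name are assumptions; only the two exported functions come from the code above.

import { reciprocalRankFusion, weightedScoreFusion } from './fusion.js'; // file name assumed

// Vector and keyword (BM25) hits for the same query.
const vectorResults = [
  { id: 'chunk-a', score: 0.91 },
  { id: 'chunk-b', score: 0.78 },
  { id: 'chunk-c', score: 0.55 },
];
const keywordResults = [
  { id: 'chunk-b', score: 12.4 },
  { id: 'chunk-d', score: 9.1 },
];

// RRF only needs ranks: build 0-based rank maps from each ordered result list.
const toRanks = (rs: { id: string }[]) =>
  new Map(rs.map((r, i) => [r.id, i] as [string, number]));
const rrf = reciprocalRankFusion([toRanks(vectorResults), toRanks(keywordResults)]);
// chunk-b appears in both lists, so it accumulates 1/(60+1) + 1/(60+0) and ranks first.

// Weighted fusion uses the raw scores instead; BM25 scores are min-max normalized internally.
const weighted = weightedScoreFusion(vectorResults, keywordResults, 0.7, 0.3);
console.log([...rrf.keys()][0], [...weighted.keys()][0]); // 'chunk-b' 'chunk-b'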