@operor/knowledge 0.1.0
This diff shows the contents of a publicly released package version as published to its public registry, and is provided for informational purposes only. Since 0.1.0 is the first published version, every file below is an addition.
- package/README.md +457 -0
- package/dist/index.d.ts +437 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1442 -0
- package/dist/index.js.map +1 -0
- package/package.json +42 -0
- package/src/EmbeddingService.ts +92 -0
- package/src/IngestionPipeline.ts +357 -0
- package/src/QueryNormalizer.ts +59 -0
- package/src/QueryRewriter.ts +73 -0
- package/src/RankFusion.ts +72 -0
- package/src/RetrievalPipeline.ts +388 -0
- package/src/SQLiteKnowledgeStore.ts +379 -0
- package/src/TextChunker.ts +34 -0
- package/src/__tests__/cli-integration.test.ts +134 -0
- package/src/__tests__/content-fetcher.test.ts +156 -0
- package/src/__tests__/knowledge.test.ts +493 -0
- package/src/__tests__/retrieval-layers.test.ts +672 -0
- package/src/index.ts +41 -0
- package/src/ingestors/FileIngestor.ts +85 -0
- package/src/ingestors/SiteCrawler.ts +153 -0
- package/src/ingestors/UrlIngestor.ts +106 -0
- package/src/ingestors/WatiFaqSync.ts +75 -0
- package/src/ingestors/content-fetcher.ts +142 -0
- package/src/types.ts +62 -0
- package/tsconfig.json +9 -0
- package/tsdown.config.ts +10 -0
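The two test files below exercise most of these modules end to end. For orientation, here is a minimal wiring sketch inferred from those tests. It assumes the package root re-exports these classes (the tests import them via relative paths, so the public entry points shown here are an assumption); the constructor arguments mirror the values used in the tests.

import {
  EmbeddingService,
  IngestionPipeline,
  RetrievalPipeline,
  SQLiteKnowledgeStore,
  TextChunker,
} from '@operor/knowledge';

// SQLite-backed store; initialize() sets up the schema.
const store = new SQLiteKnowledgeStore('./kb.db');
await store.initialize();

const embedder = new EmbeddingService({ provider: 'openai', apiKey: process.env.OPENAI_API_KEY! });
const chunker = new TextChunker(); // test-asserted defaults: chunkSize 3200, chunkOverlap 200
const ingestion = new IngestionPipeline(store, embedder, chunker);
const retrieval = new RetrievalPipeline(store, embedder, 0.85); // 0.85 = FAQ fast-path threshold used in the tests

await ingestion.ingestFaq('What is Operor?', 'Operor is a framework.');
const { isFaqMatch, context } = await retrieval.retrieve('What is Operor?');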
package/src/__tests__/content-fetcher.test.ts

@@ -0,0 +1,156 @@

import { describe, it, expect, vi, beforeEach } from 'vitest';
import { fetchContent, extractFromHtml, extractLinks, resetCrawl4aiHealthCache } from '../ingestors/content-fetcher.js';

// Mock global fetch
const mockFetch = vi.fn();
vi.stubGlobal('fetch', mockFetch);

beforeEach(() => {
  mockFetch.mockReset();
  resetCrawl4aiHealthCache();
});

describe('content-fetcher', () => {
  describe('fetchContent without Crawl4AI', () => {
    it('uses readability extraction from static HTML', async () => {
      const html = `
        <html><head><title>Test Page</title></head>
        <body><article><h1>Hello World</h1><p>Some content here.</p></article></body>
        </html>
      `;
      mockFetch.mockResolvedValueOnce({
        ok: true,
        text: async () => html,
      });

      const result = await fetchContent('https://example.com/page');

      expect(result.isMarkdown).toBe(false);
      expect(result.content).toContain('Some content here');
      expect(mockFetch).toHaveBeenCalledWith('https://example.com/page', expect.objectContaining({
        headers: { 'User-Agent': 'Operor-KB/1.0' },
      }));
    });
  });

  describe('fetchContent with Crawl4AI', () => {
    it('uses Crawl4AI when health check passes and returns markdown', async () => {
      // Health check
      mockFetch.mockResolvedValueOnce({ ok: true });
      // Crawl4AI POST response
      mockFetch.mockResolvedValueOnce({
        ok: true,
        json: async () => ({
          results: [{
            markdown: {
              fit_markdown: '# Dynamic Content\n\nRendered by JavaScript.',
              raw_markdown: '# Dynamic Content\n\nRendered by JavaScript.',
            },
          }],
        }),
      });

      const result = await fetchContent('https://example.com/spa', { crawl4aiUrl: 'http://localhost:11235' });

      expect(result.isMarkdown).toBe(true);
      expect(result.content).toContain('Rendered by JavaScript');
      expect(result.title).toBe('Dynamic Content');
    });

    it('falls back to Readability when Crawl4AI health check fails', async () => {
      // Health check fails
      mockFetch.mockRejectedValueOnce(new Error('Connection refused'));
      // Readability fallback fetch
      const html = `
        <html><head><title>Fallback Page</title></head>
        <body><article><p>Fallback content.</p></article></body>
        </html>
      `;
      mockFetch.mockResolvedValueOnce({
        ok: true,
        text: async () => html,
      });

      const result = await fetchContent('https://example.com/page', { crawl4aiUrl: 'http://localhost:11235' });

      expect(result.isMarkdown).toBe(false);
      expect(result.content).toContain('Fallback content');
    });

    it('falls back to Readability when Crawl4AI returns empty results', async () => {
      // Health check passes
      mockFetch.mockResolvedValueOnce({ ok: true });
      // Crawl4AI returns empty
      mockFetch.mockResolvedValueOnce({
        ok: true,
        json: async () => ({ results: [{ markdown: { fit_markdown: '', raw_markdown: '' } }] }),
      });
      // Readability fallback
      const html = `
        <html><head><title>Fallback</title></head>
        <body><article><p>Recovery content.</p></article></body>
        </html>
      `;
      mockFetch.mockResolvedValueOnce({
        ok: true,
        text: async () => html,
      });

      const result = await fetchContent('https://example.com/empty', { crawl4aiUrl: 'http://localhost:11235' });

      expect(result).toHaveProperty('content');
      expect(result.content).toContain('Recovery content');
    });
  });

  describe('extractFromHtml', () => {
    it('extracts title and content from HTML', () => {
      const html = `
        <html><head><title>My Page</title></head>
        <body><article><p>Article content goes here.</p></article></body>
        </html>
      `;
      const result = extractFromHtml(html, 'https://example.com');
      expect(result.content).toContain('Article content goes here');
    });
  });

  describe('extractLinks', () => {
    it('extracts same-domain links', () => {
      const html = `
        <html><body>
        <a href="/about">About</a>
        <a href="https://example.com/contact">Contact</a>
        <a href="https://other.com/external">External</a>
        </body></html>
      `;
      const links = extractLinks(html, 'https://example.com');
      expect(links).toContain('https://example.com/about');
      expect(links).toContain('https://example.com/contact');
      expect(links).not.toContain('https://other.com/external');
    });

    it('deduplicates links', () => {
      const html = `
        <html><body>
        <a href="/page">Link 1</a>
        <a href="/page">Link 2</a>
        </body></html>
      `;
      const links = extractLinks(html, 'https://example.com');
      expect(links.filter(l => l === 'https://example.com/page')).toHaveLength(1);
    });

    it('strips fragment identifiers', () => {
      const html = `
        <html><body>
        <a href="/page#section1">Section 1</a>
        <a href="/page#section2">Section 2</a>
        </body></html>
      `;
      const links = extractLinks(html, 'https://example.com');
      expect(links).toContain('https://example.com/page');
      expect(links).toHaveLength(1);
    });
  });
});
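Taken together, the three Crawl4AI cases above pin down a fallback order: health check, then Crawl4AI crawl, then a plain static fetch — with the static path also taken when Crawl4AI yields empty markdown. A sketch of that control flow follows; it is not the package's code. The health-check URL, the '/crawl' endpoint path, and the request body are assumptions (the tests only constrain the call order and the response fields), and the strip-tags fallback is a simplified stand-in for the package's Readability-style extraction.

// Control flow implied by the tests above — a sketch, not the implementation.
async function fetchContentSketch(
  url: string,
  opts?: { crawl4aiUrl?: string },
): Promise<{ content: string; isMarkdown: boolean; title?: string }> {
  if (opts?.crawl4aiUrl) {
    try {
      // Health check first; its result is cached, which is why the tests
      // call resetCrawl4aiHealthCache() in beforeEach.
      const health = await fetch(opts.crawl4aiUrl);
      if (health.ok) {
        // Endpoint path and request body are assumptions.
        const res = await fetch(`${opts.crawl4aiUrl}/crawl`, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ urls: [url] }),
        });
        if (res.ok) {
          const body = await res.json();
          const md = body.results?.[0]?.markdown;
          const content = md?.fit_markdown || md?.raw_markdown;
          if (content) {
            // The tests expect title 'Dynamic Content' from
            // '# Dynamic Content\n\n…', i.e. the first markdown heading.
            const title = content.match(/^#\s+(.+)$/m)?.[1];
            return { content, isMarkdown: true, title };
          }
        }
      }
    } catch {
      // Health check or crawl failed: fall through to the static path.
    }
  }
  // Static fetch with the User-Agent the first test asserts.
  const res = await fetch(url, { headers: { 'User-Agent': 'Operor-KB/1.0' } });
  const html = await res.text();
  const title = html.match(/<title>(.*?)<\/title>/i)?.[1];
  // Simplified stand-in for Readability extraction.
  const content = html.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
  return { content, isMarkdown: false, title };
}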
package/src/__tests__/knowledge.test.ts

@@ -0,0 +1,493 @@

import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
import { EmbeddingService } from '../EmbeddingService.js';
import { SQLiteKnowledgeStore } from '../SQLiteKnowledgeStore.js';
import { TextChunker } from '../TextChunker.js';
import { IngestionPipeline } from '../IngestionPipeline.js';
import { RetrievalPipeline } from '../RetrievalPipeline.js';
import { unlinkSync } from 'node:fs';

// Mock AI SDK
vi.mock('ai', () => ({
  embed: vi.fn(async ({ value }: { value: string }) => ({
    embedding: mockEmbed(value),
  })),
  embedMany: vi.fn(async ({ values }: { values: string[] }) => ({
    embeddings: values.map(mockEmbed),
  })),
}));

vi.mock('@ai-sdk/openai', () => ({
  createOpenAI: vi.fn(() => ({
    embedding: vi.fn(() => ({})),
  })),
}));

vi.mock('@ai-sdk/google', () => ({
  createGoogleGenerativeAI: vi.fn(() => ({
    textEmbeddingModel: vi.fn(() => ({})),
  })),
}));

vi.mock('@ai-sdk/mistral', () => ({
  mistral: {
    embedding: vi.fn(() => ({})),
  },
}));

vi.mock('@ai-sdk/cohere', () => ({
  cohere: {
    embedding: vi.fn(() => ({})),
  },
}));

function mockEmbed(text: string): number[] {
  const hash = text.split('').reduce((acc, c) => acc + c.charCodeAt(0), 0);
  return Array.from({ length: 1536 }, (_, i) => Math.sin(hash + i) * 0.1);
}

describe('EmbeddingService', () => {
  it('should embed single text', async () => {
    const service = new EmbeddingService({ provider: 'openai', apiKey: 'test' });
    const embedding = await service.embed('hello world');
    expect(embedding).toHaveLength(1536);
  });

  it('should embed multiple texts', async () => {
    const service = new EmbeddingService({ provider: 'openai', apiKey: 'test' });
    const embeddings = await service.embedMany(['hello', 'world']);
    expect(embeddings).toHaveLength(2);
    expect(embeddings[0]).toHaveLength(1536);
  });

  it('should return dimensions', () => {
    const service = new EmbeddingService({ provider: 'openai', apiKey: 'test', dimensions: 512 });
    expect(service.dimensions).toBe(512);
  });

  it('should return default dimensions per provider', () => {
    expect(new EmbeddingService({ provider: 'openai', apiKey: 'test' }).dimensions).toBe(1536);
    expect(new EmbeddingService({ provider: 'google', apiKey: 'test' }).dimensions).toBe(768);
    expect(new EmbeddingService({ provider: 'mistral', apiKey: 'test' }).dimensions).toBe(1024);
    expect(new EmbeddingService({ provider: 'cohere', apiKey: 'test' }).dimensions).toBe(1024);
    expect(new EmbeddingService({ provider: 'ollama' }).dimensions).toBe(768);
  });

  it('should expose provider name', () => {
    const service = new EmbeddingService({ provider: 'google', apiKey: 'test' });
    expect(service.provider).toBe('google');
  });
});

describe('SQLiteKnowledgeStore', () => {
  let store: SQLiteKnowledgeStore;
  const dbPath = './test-kb.db';

  beforeEach(async () => {
    store = new SQLiteKnowledgeStore(dbPath);
    await store.initialize();
  });

  afterEach(async () => {
    await store.close();
    try {
      unlinkSync(dbPath);
      unlinkSync(`${dbPath}-shm`);
      unlinkSync(`${dbPath}-wal`);
    } catch {}
  });

  it('should add and retrieve document', async () => {
    const doc = {
      id: 'doc1',
      sourceType: 'url' as const,
      sourceUrl: 'https://example.com',
      title: 'Test Doc',
      content: 'Test content',
      createdAt: Date.now(),
      updatedAt: Date.now(),
    };

    await store.addDocument(doc);
    const retrieved = await store.getDocument('doc1');
    expect(retrieved).toMatchObject(doc);
  });

  it('should list documents', async () => {
    const doc1 = {
      id: 'doc1',
      sourceType: 'url' as const,
      content: 'Content 1',
      createdAt: Date.now(),
      updatedAt: Date.now(),
    };
    const doc2 = {
      id: 'doc2',
      sourceType: 'file' as const,
      content: 'Content 2',
      createdAt: Date.now(),
      updatedAt: Date.now(),
    };

    await store.addDocument(doc1);
    await store.addDocument(doc2);

    const docs = await store.listDocuments();
    expect(docs).toHaveLength(2);
  });

  it('should delete document and chunks', async () => {
    const doc = {
      id: 'doc1',
      sourceType: 'url' as const,
      content: 'Test',
      createdAt: Date.now(),
      updatedAt: Date.now(),
    };

    await store.addDocument(doc);
    await store.addChunks([
      {
        id: 'chunk1',
        documentId: 'doc1',
        content: 'Test chunk',
        chunkIndex: 0,
        embedding: mockEmbed('test'),
      },
    ]);

    await store.deleteDocument('doc1');
    const retrieved = await store.getDocument('doc1');
    expect(retrieved).toBeNull();
  });

  it('should search by embedding', async () => {
    const doc = {
      id: 'doc1',
      sourceType: 'url' as const,
      content: 'Test content',
      createdAt: Date.now(),
      updatedAt: Date.now(),
    };

    await store.addDocument(doc);
    await store.addChunks([
      {
        id: 'chunk1',
        documentId: 'doc1',
        content: 'hello world',
        chunkIndex: 0,
        embedding: mockEmbed('hello world'),
      },
    ]);

    const results = await store.searchByEmbedding(mockEmbed('hello world'), { limit: 5 });
    expect(results.length).toBeGreaterThan(0);
    expect(results[0].chunk.content).toBe('hello world');
  });
});

describe('TextChunker', () => {
  it('should chunk plain text', async () => {
    const chunker = new TextChunker({ chunkSize: 50, chunkOverlap: 10 });
    const text = 'a'.repeat(200);
    const chunks = await chunker.chunk(text);
    expect(chunks.length).toBeGreaterThan(1);
  });

  it('should chunk markdown', async () => {
    const chunker = new TextChunker({ chunkSize: 100, chunkOverlap: 20 });
    const markdown = '# Title\n\n' + 'Content. '.repeat(50);
    const chunks = await chunker.chunkMarkdown(markdown);
    expect(chunks.length).toBeGreaterThan(0);
  });
});

describe('IngestionPipeline', () => {
  let store: SQLiteKnowledgeStore;
  let embedder: EmbeddingService;
  let chunker: TextChunker;
  let pipeline: IngestionPipeline;
  const dbPath = './test-ingest.db';

  beforeEach(async () => {
    store = new SQLiteKnowledgeStore(dbPath);
    await store.initialize();
    embedder = new EmbeddingService({ provider: 'openai', apiKey: 'test' });
    chunker = new TextChunker({ chunkSize: 100, chunkOverlap: 20 });
    pipeline = new IngestionPipeline(store, embedder, chunker);
  });

  afterEach(async () => {
    await store.close();
    try {
      unlinkSync(dbPath);
      unlinkSync(`${dbPath}-shm`);
      unlinkSync(`${dbPath}-wal`);
    } catch {}
  });

  it('should ingest document with chunks', async () => {
    const doc = await pipeline.ingest({
      sourceType: 'url',
      content: 'This is a test document. '.repeat(20),
      title: 'Test',
    });

    expect(doc.id).toBeDefined();
    const retrieved = await store.getDocument(doc.id);
    expect(retrieved).toBeDefined();
  });

  it('should ingest FAQ', async () => {
    const doc = await pipeline.ingestFaq('What is Operor?', 'Operor is a framework.');
    expect(doc.sourceType).toBe('faq');
    expect(doc.title).toBe('What is Operor?');
  });
});

describe('RetrievalPipeline', () => {
  let store: SQLiteKnowledgeStore;
  let embedder: EmbeddingService;
  let chunker: TextChunker;
  let ingestion: IngestionPipeline;
  let retrieval: RetrievalPipeline;
  const dbPath = './test-retrieval.db';

  beforeEach(async () => {
    store = new SQLiteKnowledgeStore(dbPath);
    await store.initialize();
    embedder = new EmbeddingService({ provider: 'openai', apiKey: 'test' });
    chunker = new TextChunker({ chunkSize: 100, chunkOverlap: 20 });
    ingestion = new IngestionPipeline(store, embedder, chunker);
    retrieval = new RetrievalPipeline(store, embedder, 0.85);
  });

  afterEach(async () => {
    await store.close();
    try {
      unlinkSync(dbPath);
      unlinkSync(`${dbPath}-shm`);
      unlinkSync(`${dbPath}-wal`);
    } catch {}
  });

  it('should retrieve FAQ with fast-path', async () => {
    await ingestion.ingestFaq('What is the return policy?', 'You can return within 30 days.');

    const result = await retrieval.retrieve('What is the return policy?');
    expect(result.isFaqMatch).toBe(true);
    expect(result.context).toContain('return policy');
  });

  it('should retrieve general KB content', async () => {
    await ingestion.ingest({
      sourceType: 'url',
      content: 'Operor is a framework for building AI agents.',
      title: 'About Operor',
    });

    const result = await retrieval.retrieve('What is Operor?');
    expect(result.results.length).toBeGreaterThan(0);
    expect(result.context).toContain('Knowledge Base Context');
  });

  it('should format context correctly', async () => {
    await ingestion.ingest({
      sourceType: 'url',
      sourceUrl: 'https://example.com',
      content: 'Test content',
      title: 'Test',
    });

    const result = await retrieval.retrieve('test');
    expect(result.context).toContain('## Knowledge Base Context');
    expect(result.context).toContain('### Source');
  });
});

describe('Content Deduplication', () => {
  let store: SQLiteKnowledgeStore;
  let embedder: EmbeddingService;
  let chunker: TextChunker;
  let pipeline: IngestionPipeline;
  const dbPath = './test-dedup.db';

  beforeEach(async () => {
    store = new SQLiteKnowledgeStore(dbPath);
    await store.initialize();
    embedder = new EmbeddingService({ provider: 'openai', apiKey: 'test' });
    chunker = new TextChunker({ chunkSize: 100, chunkOverlap: 20 });
    pipeline = new IngestionPipeline(store, embedder, chunker);
  });

  afterEach(async () => {
    await store.close();
    try {
      unlinkSync(dbPath);
      unlinkSync(`${dbPath}-shm`);
      unlinkSync(`${dbPath}-wal`);
    } catch {}
  });

  it('should update existing doc when same URL is ingested twice', async () => {
    await pipeline.ingest({
      sourceType: 'url',
      sourceUrl: 'https://example.com/page',
      content: 'Original content',
      title: 'Page',
    });

    await pipeline.ingest({
      sourceType: 'url',
      sourceUrl: 'https://example.com/page',
      content: 'Updated content',
      title: 'Page v2',
    });

    const docs = await store.listDocuments();
    expect(docs).toHaveLength(1);
    const doc = await store.getDocument(docs[0].id);
    expect(doc!.content).toBe('Updated content');
  });

  it('should skip when content hash matches existing doc', async () => {
    await pipeline.ingest({
      sourceType: 'url',
      content: 'Identical content here',
      title: 'First',
    });

    await pipeline.ingest({
      sourceType: 'file',
      content: 'Identical content here',
      title: 'Second',
    });

    const docs = await store.listDocuments();
    expect(docs).toHaveLength(1);
  });

  it('should store contentHash on ingested documents', async () => {
    await pipeline.ingest({
      sourceType: 'url',
      content: 'Some content',
      title: 'Hash Test',
    });

    const docs = await store.listDocuments();
    const doc = await store.getDocument(docs[0].id);
    expect(doc!.contentHash).toBeDefined();
    expect(doc!.contentHash!.length).toBe(64); // SHA-256 hex
  });
});

describe('FAQ Deduplication', () => {
  let store: SQLiteKnowledgeStore;
  let embedder: EmbeddingService;
  let chunker: TextChunker;
  let pipeline: IngestionPipeline;
  const dbPath = './test-faq-dedup.db';

  beforeEach(async () => {
    store = new SQLiteKnowledgeStore(dbPath);
    await store.initialize();
    embedder = new EmbeddingService({ provider: 'openai', apiKey: 'test' });
    chunker = new TextChunker({ chunkSize: 100, chunkOverlap: 20 });
    pipeline = new IngestionPipeline(store, embedder, chunker);
  });

  afterEach(async () => {
    await store.close();
    try {
      unlinkSync(dbPath);
      unlinkSync(`${dbPath}-shm`);
      unlinkSync(`${dbPath}-wal`);
    } catch {}
  });

  it('should return existingMatch when similar FAQ exists', async () => {
    await pipeline.ingestFaq('What is the return policy?', 'You can return within 30 days.');

    // Exact same question should trigger dedup
    const result = await pipeline.ingestFaq('What is the return policy?', 'New answer.');
    expect(result.existingMatch).toBeDefined();
    expect(result.existingMatch!.answer).toBe('You can return within 30 days.');
  });

  it('should replace existing FAQ with forceReplace', async () => {
    const original = await pipeline.ingestFaq('What are your hours?', 'We are open 9-5.');

    await pipeline.ingestFaq('What are your hours?', 'We are open 24/7.', {
      forceReplace: true,
      replaceId: original.id,
    });

    const docs = await store.listDocuments();
    const faqs = docs.filter(d => d.sourceType === 'faq');
    expect(faqs).toHaveLength(1);
  });
});

describe('Priority Auto-Assignment', () => {
  let store: SQLiteKnowledgeStore;
  let embedder: EmbeddingService;
  let chunker: TextChunker;
  let pipeline: IngestionPipeline;
  const dbPath = './test-priority.db';

  beforeEach(async () => {
    store = new SQLiteKnowledgeStore(dbPath);
    await store.initialize();
    embedder = new EmbeddingService({ provider: 'openai', apiKey: 'test' });
    chunker = new TextChunker({ chunkSize: 100, chunkOverlap: 20 });
    pipeline = new IngestionPipeline(store, embedder, chunker);
  });

  afterEach(async () => {
    await store.close();
    try {
      unlinkSync(dbPath);
      unlinkSync(`${dbPath}-shm`);
      unlinkSync(`${dbPath}-wal`);
    } catch {}
  });

  it('should assign priority 1 to FAQ, priority 2 to URL and file', async () => {
    const faq = await pipeline.ingestFaq('Q?', 'A.');
    const url = await pipeline.ingest({ sourceType: 'url', content: 'URL content', title: 'URL' });
    const file = await pipeline.ingest({ sourceType: 'file', content: 'File content', title: 'File' });

    const faqDoc = await store.getDocument(faq.id);
    const urlDoc = await store.getDocument(url.id);
    const fileDoc = await store.getDocument(file.id);

    expect(faqDoc!.priority).toBe(1);
    expect(urlDoc!.priority).toBe(2);
    expect(fileDoc!.priority).toBe(2);
  });

  it('should allow manual priority override', async () => {
    const doc = await pipeline.ingest({
      sourceType: 'url',
      content: 'Important content',
      title: 'Important',
      priority: 1,
    });

    const retrieved = await store.getDocument(doc.id);
    expect(retrieved!.priority).toBe(1);
  });
});

describe('TextChunker Defaults', () => {
  it('should default to chunkSize 3200 and chunkOverlap 200', async () => {
    const chunker = new TextChunker();
    // Text shorter than 3200 should be a single chunk
    const shortChunks = await chunker.chunk('a'.repeat(3000));
    expect(shortChunks).toHaveLength(1);

    // Text longer than 3200 should be split
    const longChunks = await chunker.chunk('a'.repeat(6000));
    expect(longChunks.length).toBeGreaterThan(1);
  });
});
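A note on the mockEmbed helper these tests share: it maps equal strings to equal 1536-dimensional vectors, which is what lets the vector-search and FAQ fast-path assertions (threshold 0.85) run offline with exact matches. A standalone check of that property follows; the cosine function here is illustrative, not the package's.

// Same helper as in knowledge.test.ts: a deterministic pseudo-embedding.
// Caveat: the character-code sum means permutations of the same characters
// collide ('abc' and 'cba' get identical vectors) — acceptable in tests.
function mockEmbed(text: string): number[] {
  const hash = text.split('').reduce((acc, c) => acc + c.charCodeAt(0), 0);
  return Array.from({ length: 1536 }, (_, i) => Math.sin(hash + i) * 0.1);
}

function cosine(a: number[], b: number[]): number {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

console.log(cosine(mockEmbed('hello world'), mockEmbed('hello world'))); // ≈ 1: identical input, identical vector
console.log(cosine(mockEmbed('hello world'), mockEmbed('return policy'))); // < 1: different character-code sums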