@operor/knowledge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts ADDED
@@ -0,0 +1,41 @@
+ // Types
+ export type {
+   KBDocument,
+   KBChunk,
+   KBSearchResult,
+   KBSearchOptions,
+   KBStats,
+   KnowledgeStore,
+ } from './types.js';
+
+ // Core
+ export { EmbeddingService } from './EmbeddingService.js';
+ export type { EmbeddingServiceConfig } from './EmbeddingService.js';
+ export { SQLiteKnowledgeStore } from './SQLiteKnowledgeStore.js';
+ export { TextChunker } from './TextChunker.js';
+ export type { ChunkOptions } from './TextChunker.js';
+
+ // Pipelines
+ export { IngestionPipeline } from './IngestionPipeline.js';
+ export type { IngestInput, IngestFaqResult, IngestFaqOptions, RebuildResult, ContentReformatter } from './IngestionPipeline.js';
+ export { RetrievalPipeline, splitCompoundQuery } from './RetrievalPipeline.js';
+ export type { RetrievalResult, RetrievalPipelineOptions } from './RetrievalPipeline.js';
+
+ // Query processing
+ export { normalizeQuery } from './QueryNormalizer.js';
+ export { reciprocalRankFusion, weightedScoreFusion } from './RankFusion.js';
+ export { QueryRewriter } from './QueryRewriter.js';
+ export type { QueryRewriterOptions, RewriteResult } from './QueryRewriter.js';
+
+ // Ingestors
+ export { UrlIngestor } from './ingestors/UrlIngestor.js';
+ export type { CrawlOptions, UrlIngestorOptions } from './ingestors/UrlIngestor.js';
+ export { FileIngestor } from './ingestors/FileIngestor.js';
+ export { SiteCrawler } from './ingestors/SiteCrawler.js';
+ export type { SiteCrawlOptions, SiteCrawlerOptions } from './ingestors/SiteCrawler.js';
+ export { WatiFaqSync } from './ingestors/WatiFaqSync.js';
+ export type { FaqPair, WatiFaqSyncOptions } from './ingestors/WatiFaqSync.js';
+
+ // Content fetching
+ export { fetchHtml, fetchContent, extractFromHtml, extractLinks, resetCrawl4aiHealthCache } from './ingestors/content-fetcher.js';
+ export type { FetchContentOptions, FetchContentResult } from './ingestors/content-fetcher.js';
package/src/ingestors/FileIngestor.ts ADDED
@@ -0,0 +1,85 @@
+ import { readFile } from 'node:fs/promises';
+ import { extname } from 'node:path';
+ import type { IngestionPipeline } from '../IngestionPipeline.js';
+ import type { KBDocument } from '../types.js';
+
+ export class FileIngestor {
+   private pipeline: IngestionPipeline;
+
+   constructor(pipeline: IngestionPipeline) {
+     this.pipeline = pipeline;
+   }
+
+   async ingestFile(filePath: string, title?: string, options?: { priority?: number }): Promise<KBDocument> {
+     const ext = extname(filePath).toLowerCase();
+     const content = await this.extractContent(filePath, ext);
+     const fileName = filePath.split('/').pop() || filePath;
+
+     return this.pipeline.ingest({
+       sourceType: 'file',
+       fileName,
+       title: title || fileName,
+       content,
+       priority: options?.priority,
+     });
+   }
+
+   private async extractContent(filePath: string, ext: string): Promise<string> {
+     switch (ext) {
+       case '.pdf':
+         return this.extractPdf(filePath);
+       case '.docx':
+         return this.extractDocx(filePath);
+       case '.xlsx':
+       case '.xls':
+         return this.extractXlsx(filePath);
+       case '.csv':
+       case '.txt':
+       case '.md':
+         return readFile(filePath, 'utf-8');
+       case '.html':
+       case '.htm':
+         return this.extractHtml(filePath);
+       default:
+         return readFile(filePath, 'utf-8');
+     }
+   }
+
+   private async extractPdf(filePath: string): Promise<string> {
+     const { getDocumentProxy, extractText } = await import('unpdf');
+     const buffer = await readFile(filePath);
+     const pdf = await getDocumentProxy(new Uint8Array(buffer));
+     const { text } = await extractText(pdf, { mergePages: true });
+     return text;
+   }
+
+   private async extractDocx(filePath: string): Promise<string> {
+     const mammoth = await import('mammoth');
+     const buffer = await readFile(filePath);
+     const result = await mammoth.extractRawText({ buffer });
+     return result.value;
+   }
+
+   private async extractXlsx(filePath: string): Promise<string> {
+     const XLSX = await import('xlsx');
+     const buffer = await readFile(filePath);
+     const workbook = XLSX.read(buffer, { type: 'buffer' });
+     const lines: string[] = [];
+     for (const sheetName of workbook.SheetNames) {
+       const sheet = workbook.Sheets[sheetName];
+       const csv = XLSX.utils.sheet_to_csv(sheet);
+       lines.push(`## ${sheetName}\n${csv}`);
+     }
+     return lines.join('\n\n');
+   }
+
+   private async extractHtml(filePath: string): Promise<string> {
+     const { parseHTML } = await import('linkedom');
+     const { Readability } = await import('@mozilla/readability');
+     const html = await readFile(filePath, 'utf-8');
+     const { document } = parseHTML(html);
+     const reader = new Readability(document as any);
+     const article = reader.parse();
+     return article?.textContent?.trim() || html;
+   }
+ }
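FileIngestor dispatches on the file extension and lazily loads the matching parser (unpdf, mammoth, xlsx, linkedom + Readability) via dynamic `import()`. A minimal usage sketch, assuming an already-constructed `IngestionPipeline` (its implementation is not part of this diff) and a hypothetical file path:

```ts
import { FileIngestor } from '@operor/knowledge';
import type { IngestionPipeline } from '@operor/knowledge';

declare const pipeline: IngestionPipeline; // assumed to be created elsewhere

const ingestor = new FileIngestor(pipeline);
// .pdf, .docx, .xlsx/.xls and .html/.htm are converted to text first;
// .csv, .txt, .md and unknown extensions are read verbatim as UTF-8.
const doc = await ingestor.ingestFile('./handbook.docx', 'Employee handbook', { priority: 1 });
console.log(doc.id, doc.title);
```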
package/src/ingestors/SiteCrawler.ts ADDED
@@ -0,0 +1,153 @@
+ import { fetchHtml, fetchContent, extractLinks } from './content-fetcher.js';
+ import type { FetchContentOptions } from './content-fetcher.js';
+ import type { IngestionPipeline } from '../IngestionPipeline.js';
+ import type { KBDocument } from '../types.js';
+
+ export interface SiteCrawlOptions {
+   maxDepth?: number;
+   maxPages?: number;
+   useSitemap?: boolean;
+   delayMs?: number;
+   onProgress?: (crawled: number, discovered: number, url: string) => void;
+ }
+
+ export interface SiteCrawlerOptions {
+   crawl4aiUrl?: string;
+ }
+
+ export class SiteCrawler {
+   private pipeline: IngestionPipeline;
+   private fetchOptions: FetchContentOptions;
+
+   constructor(pipeline: IngestionPipeline, options?: SiteCrawlerOptions) {
+     this.pipeline = pipeline;
+     this.fetchOptions = { crawl4aiUrl: options?.crawl4aiUrl };
+   }
+
+   async crawlSite(startUrl: string, options: SiteCrawlOptions = {}): Promise<KBDocument[]> {
+     const {
+       maxDepth = 2,
+       maxPages = 50,
+       useSitemap = true,
+       delayMs = 500,
+       onProgress,
+     } = options;
+
+     const docs: KBDocument[] = [];
+     const visited = new Set<string>();
+
+     if (useSitemap) {
+       const sitemapUrls = await this.tryFetchSitemap(startUrl);
+       if (sitemapUrls.length > 0) {
+         const urlsToIngest = sitemapUrls.slice(0, maxPages);
+         for (const url of urlsToIngest) {
+           if (visited.has(url)) continue;
+           visited.add(url);
+           try {
+             onProgress?.(docs.length + 1, urlsToIngest.length, url);
+             const doc = await this.ingestPage(url);
+             docs.push(doc);
+             if (delayMs > 0) await this.delay(delayMs);
+           } catch {
+             // Skip failed URLs
+           }
+           if (docs.length >= maxPages) break;
+         }
+         return docs;
+       }
+     }
+
+     const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }];
+
+     while (queue.length > 0 && docs.length < maxPages) {
+       const item = queue.shift()!;
+       if (visited.has(item.url) || item.depth > maxDepth) continue;
+       visited.add(item.url);
+
+       try {
+         onProgress?.(docs.length + 1, docs.length + queue.length + 1, item.url);
+         const html = await fetchHtml(item.url);
+         const { title, content, isMarkdown } = await fetchContent(item.url, this.fetchOptions);
+
+         const doc = await this.pipeline.ingest({
+           sourceType: 'url',
+           sourceUrl: item.url,
+           title,
+           content,
+           isMarkdown,
+         });
+         docs.push(doc);
+
+         if (delayMs > 0) await this.delay(delayMs);
+
+         if (item.depth < maxDepth) {
+           for (const link of extractLinks(html, item.url)) {
+             if (!visited.has(link)) {
+               queue.push({ url: link, depth: item.depth + 1 });
+             }
+           }
+         }
+       } catch {
+         // Skip failed URLs
+       }
+     }
+
+     return docs;
+   }
+
+   private async tryFetchSitemap(baseUrl: string): Promise<string[]> {
+     try {
+       const url = new URL(baseUrl);
+       const sitemapUrl = `${url.protocol}//${url.hostname}/sitemap.xml`;
+       const xml = await fetchHtml(sitemapUrl);
+       return await this.parseSitemapUrls(xml);
+     } catch {
+       return [];
+     }
+   }
+
+   private async parseSitemapUrls(xml: string): Promise<string[]> {
+     const urls: string[] = [];
+     const locRegex = /<loc>(.*?)<\/loc>/g;
+     let match;
+     while ((match = locRegex.exec(xml)) !== null) {
+       urls.push(match[1]);
+     }
+
+     if (xml.includes('<sitemapindex') || urls.every(u => u.endsWith('.xml'))) {
+       const pageUrls: string[] = [];
+       for (const childSitemapUrl of urls) {
+         try {
+           const childXml = await fetchHtml(childSitemapUrl);
+           const childUrls: string[] = [];
+           const childRegex = /<loc>(.*?)<\/loc>/g;
+           let childMatch;
+           while ((childMatch = childRegex.exec(childXml)) !== null) {
+             childUrls.push(childMatch[1]);
+           }
+           pageUrls.push(...childUrls.filter(u => !u.endsWith('.xml')));
+         } catch {
+           // Skip unreachable child sitemaps
+         }
+       }
+       return pageUrls;
+     }
+
+     return urls.filter(u => !u.endsWith('.xml'));
+   }
+
+   private async ingestPage(url: string): Promise<KBDocument> {
+     const { title, content, isMarkdown } = await fetchContent(url, this.fetchOptions);
+     return this.pipeline.ingest({
+       sourceType: 'url',
+       sourceUrl: url,
+       title,
+       content,
+       isMarkdown,
+     });
+   }
+
+   private delay(ms: number): Promise<void> {
+     return new Promise((resolve) => setTimeout(resolve, ms));
+   }
+ }
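SiteCrawler first tries the site's `/sitemap.xml` (following sitemap indexes) and otherwise falls back to a breadth-first link crawl bounded by `maxDepth` and `maxPages`. A usage sketch, assuming an existing `IngestionPipeline`; the Crawl4AI endpoint and start URL are illustrative values, not taken from this diff:

```ts
import { SiteCrawler } from '@operor/knowledge';
import type { IngestionPipeline } from '@operor/knowledge';

declare const pipeline: IngestionPipeline; // assumed to be created elsewhere

const crawler = new SiteCrawler(pipeline, { crawl4aiUrl: 'http://localhost:11235' });
const docs = await crawler.crawlSite('https://example.com', {
  maxPages: 30,
  maxDepth: 2,
  useSitemap: true, // try /sitemap.xml before falling back to BFS link crawling
  delayMs: 500,     // politeness delay between page fetches
  onProgress: (crawled, discovered, url) => console.log(`${crawled}/${discovered} ${url}`),
});
console.log(`Ingested ${docs.length} pages`);
```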
package/src/ingestors/UrlIngestor.ts ADDED
@@ -0,0 +1,106 @@
+ import { fetchHtml, fetchContent, extractLinks } from './content-fetcher.js';
+ import type { FetchContentOptions } from './content-fetcher.js';
+ import type { IngestionPipeline } from '../IngestionPipeline.js';
+ import type { KBDocument } from '../types.js';
+
+ export interface CrawlOptions {
+   maxPages?: number;
+   maxDepth?: number;
+ }
+
+ export interface UrlIngestorOptions {
+   crawl4aiUrl?: string;
+ }
+
+ export class UrlIngestor {
+   private pipeline: IngestionPipeline;
+   private fetchOptions: FetchContentOptions;
+
+   constructor(pipeline: IngestionPipeline, options?: UrlIngestorOptions) {
+     this.pipeline = pipeline;
+     this.fetchOptions = { crawl4aiUrl: options?.crawl4aiUrl };
+   }
+
+   async ingestUrl(url: string, options?: { priority?: number; extractQA?: boolean }): Promise<KBDocument> {
+     const { title, content, isMarkdown } = await fetchContent(url, this.fetchOptions);
+
+     return this.pipeline.ingest({
+       sourceType: 'url',
+       sourceUrl: url,
+       title,
+       content,
+       isMarkdown,
+       priority: options?.priority,
+       extractQA: options?.extractQA,
+     });
+   }
+
+   async ingestSitemap(sitemapUrl: string, options?: CrawlOptions): Promise<KBDocument[]> {
+     const maxPages = options?.maxPages || 50;
+     const xml = await fetchHtml(sitemapUrl);
+     const urls = this.parseSitemapUrls(xml).slice(0, maxPages);
+
+     const docs: KBDocument[] = [];
+     for (const url of urls) {
+       try {
+         const doc = await this.ingestUrl(url);
+         docs.push(doc);
+       } catch {
+         // Skip failed URLs
+       }
+     }
+     return docs;
+   }
+
+   async crawl(startUrl: string, options?: CrawlOptions): Promise<KBDocument[]> {
+     const maxPages = options?.maxPages || 20;
+     const maxDepth = options?.maxDepth || 2;
+     const visited = new Set<string>();
+     const docs: KBDocument[] = [];
+
+     const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }];
+
+     while (queue.length > 0 && docs.length < maxPages) {
+       const item = queue.shift()!;
+       if (visited.has(item.url) || item.depth > maxDepth) continue;
+       visited.add(item.url);
+
+       try {
+         const html = await fetchHtml(item.url);
+         const { title, content, isMarkdown } = await fetchContent(item.url, this.fetchOptions);
+
+         const doc = await this.pipeline.ingest({
+           sourceType: 'url',
+           sourceUrl: item.url,
+           title,
+           content,
+           isMarkdown,
+         });
+         docs.push(doc);
+
+         if (item.depth < maxDepth) {
+           const links = extractLinks(html, item.url);
+           for (const link of links) {
+             if (!visited.has(link)) {
+               queue.push({ url: link, depth: item.depth + 1 });
+             }
+           }
+         }
+       } catch {
+         // Skip failed URLs
+       }
+     }
+
+     return docs;
+   }
+
+   private parseSitemapUrls(xml: string): string[] {
+     const urls: string[] = [];
+     const locRegex = /<loc>(.*?)<\/loc>/g;
+     let match;
+     while ((match = locRegex.exec(xml)) !== null) {
+       urls.push(match[1]);
+     }
+     return urls.filter(u => !u.endsWith('.xml'));
+   }
+ }
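UrlIngestor covers the single-page cases: one URL, every URL listed in a sitemap, or a small bounded crawl. A sketch under the same assumption of a pre-built `IngestionPipeline`; the URLs are placeholders:

```ts
import { UrlIngestor } from '@operor/knowledge';
import type { IngestionPipeline } from '@operor/knowledge';

declare const pipeline: IngestionPipeline; // assumed to be created elsewhere

const urls = new UrlIngestor(pipeline);

// One page, flagged as official content, with Q&A extraction requested.
const page = await urls.ingestUrl('https://example.com/faq', { priority: 1, extractQA: true });

// Every page listed in a sitemap, capped at maxPages (default 50).
const fromSitemap = await urls.ingestSitemap('https://example.com/sitemap.xml', { maxPages: 25 });
console.log(page.id, fromSitemap.length);
```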
package/src/ingestors/WatiFaqSync.ts ADDED
@@ -0,0 +1,75 @@
+ import type { IngestionPipeline } from '../IngestionPipeline.js';
+ import type { KBDocument } from '../types.js';
+
+ export interface FaqPair {
+   question: string;
+   answer: string;
+ }
+
+ export interface WatiFaqSyncOptions {
+   minAnswerLength?: number;
+   maxPairs?: number;
+ }
+
+ export class WatiFaqSync {
+   private pipeline: IngestionPipeline;
+   private llmExtract?: (conversation: string) => Promise<FaqPair[]>;
+
+   constructor(
+     pipeline: IngestionPipeline,
+     llmExtract?: (conversation: string) => Promise<FaqPair[]>,
+   ) {
+     this.pipeline = pipeline;
+     this.llmExtract = llmExtract;
+   }
+
+   async syncFromConversations(
+     conversations: string[],
+     options?: WatiFaqSyncOptions,
+   ): Promise<KBDocument[]> {
+     const minLen = options?.minAnswerLength || 20;
+     const maxPairs = options?.maxPairs || 100;
+
+     if (!this.llmExtract) {
+       throw new Error('LLM extract function required for FAQ extraction');
+     }
+
+     const allPairs: FaqPair[] = [];
+     for (const convo of conversations) {
+       const pairs = await this.llmExtract(convo);
+       allPairs.push(...pairs);
+       if (allPairs.length >= maxPairs) break;
+     }
+
+     // Quality filtering
+     const filtered = allPairs
+       .filter((p) => p.answer.length >= minLen && p.question.trim().length > 0)
+       .slice(0, maxPairs);
+
+     const docs: KBDocument[] = [];
+     for (const pair of filtered) {
+       const doc = await this.pipeline.ingestFaq(pair.question, pair.answer, {
+         source: 'wati-sync',
+       });
+       docs.push(doc);
+     }
+
+     return docs;
+   }
+
+   async syncFromPairs(pairs: FaqPair[], options?: WatiFaqSyncOptions): Promise<KBDocument[]> {
+     const minLen = options?.minAnswerLength || 20;
+     const filtered = pairs.filter(
+       (p) => p.answer.length >= minLen && p.question.trim().length > 0,
+     );
+
+     const docs: KBDocument[] = [];
+     for (const pair of filtered) {
+       const doc = await this.pipeline.ingestFaq(pair.question, pair.answer, {
+         source: 'wati-sync',
+       });
+       docs.push(doc);
+     }
+     return docs;
+   }
+ }
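WatiFaqSync filters FAQ pairs by minimum answer length and ingests each one through `pipeline.ingestFaq`. A sketch using pre-extracted pairs; the pipeline is assumed to exist and the sample pair is invented for illustration. To mine pairs from raw conversations instead, pass an `llmExtract` callback as the second constructor argument:

```ts
import { WatiFaqSync } from '@operor/knowledge';
import type { IngestionPipeline, FaqPair } from '@operor/knowledge';

declare const pipeline: IngestionPipeline; // assumed to be created elsewhere

const sync = new WatiFaqSync(pipeline);
const pairs: FaqPair[] = [
  { question: 'What are your opening hours?', answer: 'We are open Monday to Friday, 9am to 6pm.' },
];
// Pairs with answers shorter than minAnswerLength or empty questions are dropped.
const docs = await sync.syncFromPairs(pairs, { minAnswerLength: 20 });
console.log(docs.length);
```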
package/src/ingestors/content-fetcher.ts ADDED
@@ -0,0 +1,142 @@
+ import { Readability } from '@mozilla/readability';
+ import { parseHTML } from 'linkedom';
+
+ export interface FetchContentOptions {
+   crawl4aiUrl?: string;
+ }
+
+ export interface FetchContentResult {
+   title: string;
+   content: string;
+   isMarkdown: boolean;
+ }
+
+ // Cache Crawl4AI health check result for 5 minutes
+ let crawl4aiHealthy: boolean | null = null;
+ let crawl4aiHealthCheckedAt = 0;
+ const HEALTH_CACHE_MS = 5 * 60 * 1000;
+
+ /** Reset the Crawl4AI health check cache. Used in tests. */
+ export function resetCrawl4aiHealthCache(): void {
+   crawl4aiHealthy = null;
+   crawl4aiHealthCheckedAt = 0;
+ }
+
+ /**
+  * Fetch raw HTML from a URL. Used for link extraction, sitemaps, etc.
+  */
+ export async function fetchHtml(url: string): Promise<string> {
+   const response = await fetch(url, {
+     headers: { 'User-Agent': 'Operor-KB/1.0' },
+   });
+   if (!response.ok) throw new Error(`Failed to fetch ${url}: ${response.status}`);
+   return response.text();
+ }
+
+ /**
+  * Smart content fetch: tries Crawl4AI first (if configured), falls back to Readability.
+  */
+ export async function fetchContent(url: string, options?: FetchContentOptions): Promise<FetchContentResult> {
+   if (options?.crawl4aiUrl) {
+     try {
+       if (await isCrawl4aiHealthy(options.crawl4aiUrl)) {
+         return await fetchViaCrawl4AI(url, options.crawl4aiUrl);
+       }
+     } catch {
+       // Fall through to Readability
+     }
+   }
+
+   const html = await fetchHtml(url);
+   const { title, content } = extractFromHtml(html, url);
+   return { title, content, isMarkdown: false };
+ }
+
+ /**
+  * Extract readable content from pre-fetched HTML using @mozilla/readability.
+  */
+ export function extractFromHtml(html: string, url: string): { title: string; content: string } {
+   const { document } = parseHTML(html);
+   const reader = new Readability(document as any, { url });
+   const article = reader.parse();
+   return {
+     title: article?.title || '',
+     content: article?.textContent?.trim() || '',
+   };
+ }
+
+ /**
+  * Extract same-domain links from HTML.
+  */
+ export function extractLinks(html: string, baseUrl: string): string[] {
+   const { document } = parseHTML(html);
+   const links: string[] = [];
+   const base = new URL(baseUrl);
+
+   for (const a of document.querySelectorAll('a[href]')) {
+     try {
+       const href = (a as any).getAttribute('href');
+       if (!href) continue;
+       const resolved = new URL(href, baseUrl);
+       if (resolved.hostname === base.hostname && resolved.protocol.startsWith('http')) {
+         links.push(resolved.href.split('#')[0]);
+       }
+     } catch {
+       // Skip invalid URLs
+     }
+   }
+
+   return [...new Set(links)];
+ }
+
+ async function isCrawl4aiHealthy(baseUrl: string): Promise<boolean> {
+   if (crawl4aiHealthy !== null && Date.now() - crawl4aiHealthCheckedAt < HEALTH_CACHE_MS) {
+     return crawl4aiHealthy;
+   }
+   try {
+     const res = await fetch(`${baseUrl}/health`, { signal: AbortSignal.timeout(2000) });
+     crawl4aiHealthy = res.ok;
+   } catch {
+     crawl4aiHealthy = false;
+   }
+   crawl4aiHealthCheckedAt = Date.now();
+   return crawl4aiHealthy;
+ }
+
+ async function fetchViaCrawl4AI(url: string, baseUrl: string): Promise<FetchContentResult> {
+   const res = await fetch(`${baseUrl}/crawl`, {
+     method: 'POST',
+     headers: { 'Content-Type': 'application/json' },
+     body: JSON.stringify({
+       urls: [url],
+       browser_config: { type: 'BrowserConfig', params: { headless: true } },
+       crawler_config: {
+         type: 'CrawlerRunConfig',
+         params: {
+           cache_mode: 'bypass',
+           markdown_generator: {
+             type: 'DefaultMarkdownGenerator',
+             params: {
+               content_filter: { type: 'PruningContentFilter', params: { threshold: 0.48 } },
+             },
+           },
+         },
+       },
+     }),
+     signal: AbortSignal.timeout(30000),
+   });
+
+   if (!res.ok) throw new Error(`Crawl4AI error: ${res.status}`);
+   const data = await res.json() as any;
+   const result = data.results?.[0];
+   if (!result) throw new Error('Crawl4AI returned no results');
+
+   const markdown = result.markdown?.fit_markdown || result.markdown?.raw_markdown || '';
+   if (!markdown) throw new Error('Crawl4AI returned empty markdown');
+
+   // Extract title from first heading
+   const titleMatch = markdown.match(/^#\s+(.+)$/m);
+   const title = titleMatch?.[1] || '';
+
+   return { title, content: markdown, isMarkdown: true };
+ }
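The content fetcher prefers a configured Crawl4AI service (health-checked, with the result cached for 5 minutes) and falls back to `fetch` plus Readability when it is absent or unhealthy. These helpers are exported directly, so they can be used without a pipeline; the localhost endpoint and URLs below are assumptions for illustration:

```ts
import { fetchContent, fetchHtml, extractLinks } from '@operor/knowledge';

// With crawl4aiUrl set, markdown comes from Crawl4AI when it is reachable;
// otherwise the plain-text Readability path is used and isMarkdown is false.
const { title, content, isMarkdown } = await fetchContent('https://example.com/docs', {
  crawl4aiUrl: 'http://localhost:11235',
});
console.log(title, isMarkdown ? 'markdown' : 'plain text', content.length);

// Same-domain link discovery from raw HTML.
const html = await fetchHtml('https://example.com');
console.log(extractLinks(html, 'https://example.com'));
```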
package/src/types.ts ADDED
@@ -0,0 +1,62 @@
+ export interface KBDocument {
+   id: string;
+   sourceType: 'url' | 'file' | 'faq' | 'annotation';
+   sourceUrl?: string;
+   fileName?: string;
+   title?: string;
+   content: string;
+   metadata?: Record<string, any>;
+   createdAt: number;
+   updatedAt: number;
+   priority?: number; // 1=official, 2=supplementary, 3=archived (default: 2)
+   contentHash?: string; // SHA-256 for deduplication
+ }
+
+ export interface KBChunk {
+   id: string;
+   documentId: string;
+   content: string;
+   chunkIndex: number;
+   embedding?: number[];
+   metadata?: Record<string, any>;
+ }
+
+ export interface KBSearchResult {
+   chunk: KBChunk;
+   document: KBDocument;
+   score: number;
+   distance: number;
+ }
+
+ export interface KBSearchOptions {
+   limit?: number;
+   scoreThreshold?: number;
+   sourceTypes?: KBDocument['sourceType'][];
+   metadata?: Record<string, any>;
+ }
+
+ export interface KBStats {
+   documentCount: number;
+   chunkCount: number;
+   embeddingDimensions: number;
+   dbSizeBytes: number;
+ }
+
+ export interface KnowledgeStore {
+   initialize(): Promise<void>;
+   close(): Promise<void>;
+
+   addDocument(doc: KBDocument): Promise<void>;
+   getDocument(id: string): Promise<KBDocument | null>;
+   listDocuments(): Promise<KBDocument[]>;
+   deleteDocument(id: string): Promise<void>;
+
+   addChunks(chunks: KBChunk[]): Promise<void>;
+   search(query: string, embedding: number[], options?: KBSearchOptions): Promise<KBSearchResult[]>;
+   searchByEmbedding(embedding: number[], options?: KBSearchOptions): Promise<KBSearchResult[]>;
+   searchByKeyword?(query: string, options?: KBSearchOptions): Promise<KBSearchResult[]>;
+   getStats(): Promise<KBStats>;
+
+   /** Returns the number of chunks stored for a given document. Optional — only SQLiteKnowledgeStore implements this. */
+   getChunkCount?(documentId: string): number;
+ }
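The KnowledgeStore interface is what the pipelines program against, so an alternative backend only needs to satisfy it. A hedged sketch of a filtered vector search against any implementation; `store` and `queryEmbedding` are stand-ins for objects created elsewhere (e.g. SQLiteKnowledgeStore and EmbeddingService, whose sources are not in this diff):

```ts
import type { KnowledgeStore, KBSearchOptions, KBSearchResult } from '@operor/knowledge';

declare const store: KnowledgeStore;       // assumed, e.g. an initialized SQLiteKnowledgeStore
declare const queryEmbedding: number[];    // assumed, e.g. produced by EmbeddingService

const options: KBSearchOptions = {
  limit: 5,
  scoreThreshold: 0.4,         // drop weak matches
  sourceTypes: ['faq', 'url'], // restrict to FAQ and web documents
};

const results: KBSearchResult[] = await store.searchByEmbedding(queryEmbedding, options);
for (const r of results) {
  console.log(r.score.toFixed(3), r.document.title, r.chunk.content.slice(0, 80));
}
```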
package/tsconfig.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "extends": "../../tsconfig.json",
+   "compilerOptions": {
+     "outDir": "./dist",
+     "rootDir": "./src"
+   },
+   "include": ["src/**/*"],
+   "exclude": ["node_modules", "dist"]
+ }
package/tsdown.config.ts ADDED
@@ -0,0 +1,10 @@
+ import { defineConfig } from 'tsdown';
+
+ export default defineConfig({
+   entry: ['src/index.ts'],
+   format: ['esm'],
+   dts: true,
+   clean: true,
+   sourcemap: true,
+   outExtensions: () => ({ js: '.js', dts: '.d.ts' }),
+ });