@chatbot-packages/rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,242 @@
1
+ /**
2
+ * RAG System Types
3
+ */
4
+ interface Document { // A full source document: identifiers, path, title, text content, optional metadata and timestamps.
5
+ id: string;
6
+ sourceId: string; // NOTE(review): presumably the same sourceId that can be passed to RAGService.index() — confirm.
7
+ path: string;
8
+ title: string;
9
+ content: string;
10
+ metadata?: Record<string, unknown>; // Optional free-form data; values are typed unknown, so consumers must narrow before use.
11
+ createdAt: Date;
12
+ updatedAt: Date;
13
+ }
14
+ interface DocumentChunk { // A piece of document text with its metadata and, optionally, an embedding vector.
15
+ id: string;
16
+ documentId: string; // NOTE(review): presumably the id of the owning Document — confirm.
17
+ text: string;
18
+ embedding?: number[]; // Optional: the type allows a chunk that has no embedding attached.
19
+ metadata: ChunkMetadata; // Required here, unlike the optional metadata on Document.
20
+ createdAt: Date;
21
+ }
22
+ interface ChunkMetadata { // Structural/positional metadata attached to a chunk; only sectionPath is required.
23
+ /** Section path (e.g., "Chapter 1 > Section 2 > Subsection A") */
24
+ sectionPath: string;
25
+ /** H1 heading */
26
+ headingH1?: string;
27
+ /** H2 heading */
28
+ headingH2?: string;
29
+ /** H3 heading */
30
+ headingH3?: string;
31
+ /** Start position in original document (unit, e.g. characters vs. bytes, is not specified here; TODO confirm) */
32
+ startOffset?: number;
33
+ /** End position in original document (same unspecified unit as startOffset; inclusive/exclusive not stated; TODO confirm) */
34
+ endOffset?: number;
35
+ /** Source file path */
36
+ sourcePath?: string;
37
+ /** Page number (if applicable) */
38
+ pageNumber?: number;
39
+ /** Custom metadata: index signature permitting any additional string key, with values typed unknown */
40
+ [key: string]: unknown;
41
+ }
42
+ type ExtractorType = 'chm' | 'html' | 'markdown' | 'text'; // Closed set of format tags; used as the type of ExtractedDocument.format.
43
+ interface ExtractorOptions { // Settings for content extraction; only sourcePath is required.
44
+ /** Source file or directory path */
45
+ sourcePath: string;
46
+ /** Output directory for extracted content (optional; behavior when omitted is not visible in these declarations) */
47
+ outputDir?: string;
48
+ /** Whether to preserve directory structure (optional; default is not visible in these declarations) */
49
+ preserveStructure?: boolean;
50
+ }
51
+ interface ExtractedDocument { // One extracted document: path, title, content, a format tag and optional metadata.
52
+ path: string;
53
+ title: string;
54
+ content: string;
55
+ format: ExtractorType; // One of 'chm' | 'html' | 'markdown' | 'text'.
56
+ metadata?: Record<string, unknown>; // Optional free-form data; values are typed unknown.
57
+ }
58
+ interface ChunkingOptions { // Chunking parameters; every field is optional and defaults are not visible in these declarations.
59
+ /** Target chunk size in tokens */
60
+ chunkSize?: number;
61
+ /** Overlap between chunks in tokens */
62
+ chunkOverlap?: number;
63
+ /** Minimum chunk size in tokens */
64
+ minChunkSize?: number;
65
+ /** Maximum chunk size in tokens */
66
+ maxChunkSize?: number;
67
+ /** Whether to respect heading boundaries */
68
+ respectHeadings?: boolean;
69
+ /** Heading levels to split on (e.g., [1, 2]); presumably matches the H1-H3 headings in ChunkMetadata, TODO confirm */
70
+ splitOnHeadings?: number[];
71
+ }
72
+ interface ChunkResult { // Chunking output: the chunks themselves plus summary statistics.
73
+ chunks: DocumentChunk[];
74
+ stats: { // NOTE(review): size units are not stated here; presumably tokens, as in ChunkingOptions — confirm.
75
+ totalChunks: number;
76
+ avgChunkSize: number;
77
+ minChunkSize: number;
78
+ maxChunkSize: number;
79
+ };
80
+ }
81
+ type EmbeddingProvider = 'local' | 'openai' | 'huggingface'; // Closed set of provider tags; used as the type of EmbeddingOptions.provider.
82
+ interface EmbeddingOptions { // Embedding configuration; only provider is required.
83
+ /** Embedding provider */
84
+ provider: EmbeddingProvider;
85
+ /** Model name (for local: 'bge-large-en-v1.5', for OpenAI: 'text-embedding-3-small') */
86
+ model?: string;
87
+ /** API key (for cloud providers) */
88
+ apiKey?: string;
89
+ /** Batch size for embedding generation */
90
+ batchSize?: number;
91
+ /** Embedding dimensions (for providers that support it) */
92
+ dimensions?: number;
93
+ }
94
+ interface EmbeddingResult { // Result of embedding one text.
95
+ embedding: number[]; // The embedding vector.
96
+ tokens?: number; // NOTE(review): presumably the token count for the embedded text — confirm; may be absent.
97
+ }
98
+ interface EmbeddingBackend { // Contract for an embedding backend: two Promise-returning embed calls plus two synchronous getters.
99
+ /** Embed a single text */
100
+ embed(text: string): Promise<EmbeddingResult>;
101
+ /** Embed multiple texts (NOTE(review): results presumably align by index with the input array; confirm) */
102
+ embedBatch(texts: string[]): Promise<EmbeddingResult[]>;
103
+ /** Get embedding dimensions (synchronous) */
104
+ getDimensions(): number;
105
+ /** Get model name (synchronous) */
106
+ getModel(): string;
107
+ }
108
+ type VectorStoreType = 'sqlite' | 'postgres' | 'memory'; // Closed set of store tags; used as the type of VectorStoreOptions.type.
109
+ interface VectorStoreOptions { // Vector store configuration; type and dimensions are required.
110
+ /** Store type */
111
+ type: VectorStoreType;
112
+ /** Database path (for SQLite) or connection string (for PostgreSQL) */
113
+ connectionString?: string;
114
+ /** Table/collection name */
115
+ tableName?: string;
116
+ /** Embedding dimensions (required, unlike the optional EmbeddingOptions.dimensions) */
117
+ dimensions: number;
118
+ }
119
+ interface SearchResult { // One search hit: the matched chunk, its score and which search type produced it.
120
+ chunk: DocumentChunk;
121
+ score: number; // NOTE(review): scale and direction (higher = better?) are not specified here — confirm per search type.
122
+ /** Search type that found this result */
123
+ searchType: 'dense' | 'sparse' | 'hybrid';
124
+ }
125
+ interface VectorStore { // Storage/search contract for chunks; every method returns a Promise.
126
+ /** Initialize the store (create tables, indexes) */
127
+ initialize(): Promise<void>;
128
+ /** Insert chunks with embeddings */
129
+ insert(chunks: DocumentChunk[]): Promise<void>;
130
+ /** Dense (vector) search: takes a query embedding, a result limit (topK) and an optional filter object */
131
+ denseSearch(embedding: number[], topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
132
+ /** Sparse (BM25/text) search: takes a query string, a result limit (topK) and an optional filter object */
133
+ sparseSearch(query: string, topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
134
+ /** Delete chunks by document ID; resolves to a number (presumably the count of deleted chunks; confirm) */
135
+ deleteByDocumentId(documentId: string): Promise<number>;
136
+ /** Get chunk by ID; the result may be null (presumably when no chunk matches; confirm) */
137
+ getById(id: string): Promise<DocumentChunk | null>;
138
+ /** Close connection */
139
+ close(): Promise<void>;
140
+ }
141
+ interface RetrievalOptions { // Retrieval parameters; every field is optional and defaults are not visible in these declarations.
142
+ /** Number of results to return */
143
+ topK?: number;
144
+ /** Dense search weight (0-1) */
145
+ denseWeight?: number;
146
+ /** Sparse search weight (0-1) */
147
+ sparseWeight?: number;
148
+ /** RRF k parameter (RRF presumably means Reciprocal Rank Fusion; TODO confirm) */
149
+ rrfK?: number;
150
+ /** Whether to use reranking */
151
+ useReranking?: boolean;
152
+ /** Number of candidates to rerank */
153
+ rerankTopK?: number;
154
+ /** Metadata filter (same Record<string, unknown> shape as the filter argument of the VectorStore search methods) */
155
+ filter?: Record<string, unknown>;
156
+ }
157
+ interface RetrievalResult { // Retrieval output: the search results plus statistics about the run.
158
+ results: SearchResult[];
159
+ stats: {
160
+ denseCount: number;
161
+ sparseCount: number;
162
+ rerankingApplied: boolean;
163
+ totalTime: number; // NOTE(review): unit not stated here; presumably ms, as documented on RAGResponse.metadata — confirm.
164
+ };
165
+ }
166
+ interface RAGOptions { // Top-level configuration: documents, vectorStore, embeddings and llm are required; the rest is optional.
167
+ /** Path to documents (file or directory) */
168
+ documents: string;
169
+ /** Vector store configuration */
170
+ vectorStore: VectorStoreOptions;
171
+ /** Embedding configuration */
172
+ embeddings: EmbeddingOptions;
173
+ /** LLM configuration (only provider is required within it) */
174
+ llm: {
175
+ provider: 'openai' | 'anthropic' | 'cerebras' | 'groq';
176
+ apiKey?: string;
177
+ model?: string;
178
+ baseUrl?: string;
179
+ };
180
+ /** Chunking options */
181
+ chunking?: ChunkingOptions;
182
+ /** Retrieval options */
183
+ retrieval?: RetrievalOptions;
184
+ /** System prompt for the LLM */
185
+ systemPrompt?: string;
186
+ }
187
+ interface Citation { // A reference from an answer back to a chunk; index, chunkId and sectionPath are required.
188
+ /** Citation index (1-based) */
189
+ index: number;
190
+ /** Chunk ID (presumably a DocumentChunk.id; TODO confirm) */
191
+ chunkId: string;
192
+ /** Section path */
193
+ sectionPath: string;
194
+ /** Heading (if available) */
195
+ heading?: string;
196
+ /** Relevant text snippet */
197
+ snippet?: string;
198
+ }
199
+ interface RAGResponse { // Result of RAGService.ask(): the question, the answer, its citations, the context chunks and timing metadata.
200
+ /** The question that was asked */
201
+ question: string;
202
+ /** The generated answer */
203
+ answer: string;
204
+ /** Citations from source documents */
205
+ citations: Citation[];
206
+ /** Retrieved context chunks */
207
+ context: DocumentChunk[];
208
+ /** Response metadata (all fields required) */
209
+ metadata: {
210
+ /** Total processing time in ms */
211
+ totalTime: number;
212
+ /** Retrieval time in ms */
213
+ retrievalTime: number;
214
+ /** Generation time in ms */
215
+ generationTime: number;
216
+ /** Whether response was cached */
217
+ cached: boolean;
218
+ /** Model used */
219
+ model: string;
220
+ };
221
+ }
222
+ interface RAGService { // Service contract for indexing documents and answering questions; every method returns a Promise.
223
+ /** Index documents from a path; resolves to counts of documents indexed and chunks created */
224
+ index(path: string, options?: {
225
+ sourceId?: string;
226
+ }): Promise<{
227
+ documentsIndexed: number;
228
+ chunksCreated: number;
229
+ }>;
230
+ /** Ask a question; optional RetrievalOptions may be passed per call (how they combine with RAGOptions.retrieval is not visible here) */
231
+ ask(question: string, options?: RetrievalOptions): Promise<RAGResponse>;
232
+ /** Get indexed document count */
233
+ getDocumentCount(): Promise<number>;
234
+ /** Get chunk count */
235
+ getChunkCount(): Promise<number>;
236
+ /** Clear all indexed data */
237
+ clear(): Promise<void>;
238
+ /** Close connections */
239
+ close(): Promise<void>;
240
+ }
241
+
242
+ export type { ChunkMetadata as C, Document as D, EmbeddingBackend as E, RAGService as R, SearchResult as S, VectorStore as V, RAGOptions as a, RetrievalOptions as b, RAGResponse as c, ChunkResult as d, ChunkingOptions as e, Citation as f, DocumentChunk as g, EmbeddingOptions as h, EmbeddingResult as i, ExtractedDocument as j, RetrievalResult as k, VectorStoreOptions as l, ExtractorOptions as m, ExtractorType as n }; // Type-only exports under single-letter aliases; importers must map them back (e.g. `V as VectorStore`). EmbeddingProvider and VectorStoreType are not exported here.
@@ -0,0 +1,109 @@
1
+ import { V as VectorStore, g as DocumentChunk, S as SearchResult, l as VectorStoreOptions } from '../types-CjnplPJD.js';
2
+ import { PoolConfig } from 'pg';
3
+
4
+ /**
5
+ * SQLite Vector Store
6
+ *
7
+ * Simple vector store using SQLite with better-sqlite3.
8
+ * Stores embeddings as JSON arrays and performs similarity search in JavaScript.
9
+ * Best for development, small datasets, or when PostgreSQL isn't available.
10
+ */
11
+
12
+ interface SQLiteVectorStoreOptions { // Constructor options for SQLiteVectorStore; only dimensions is required.
13
+ /** Database file path (use ':memory:' for in-memory); optional, default not visible in these declarations */
14
+ path?: string;
15
+ /** Table name (default: 'chunks') */
16
+ tableName?: string;
17
+ /** Embedding dimensions */
18
+ dimensions: number;
19
+ }
20
+ declare class SQLiteVectorStore implements VectorStore { // Ambient declaration of the SQLite-backed VectorStore; implementation bodies are not part of this file.
21
+ private db; // Private members are declared without types in this declaration output.
22
+ private tableName;
23
+ private dimensions;
24
+ constructor(options: SQLiteVectorStoreOptions);
25
+ initialize(): Promise<void>;
26
+ insert(chunks: DocumentChunk[]): Promise<void>;
27
+ denseSearch(embedding: number[], topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
28
+ sparseSearch(query: string, topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
29
+ deleteByDocumentId(documentId: string): Promise<number>;
30
+ getById(id: string): Promise<DocumentChunk | null>;
31
+ close(): Promise<void>;
32
+ /**
33
+ * Calculate cosine similarity between two vectors
34
+ */
35
+ private cosineSimilarity;
36
+ /**
37
+ * Convert database row to DocumentChunk
38
+ */
39
+ private rowToChunk;
40
+ /**
41
+ * Escape FTS query to prevent syntax errors
41.5 (review note: FTS presumably refers to SQLite full-text search backing sparseSearch; TODO confirm)
42
+ */
43
+ private escapeFTSQuery;
44
+ }
45
+
46
+ /**
47
+ * PostgreSQL Vector Store with pgvector
48
+ *
49
+ * Production-ready vector store using PostgreSQL with pgvector extension.
50
+ * Supports HNSW indexing for fast similarity search.
51
+ */
52
+
53
+ interface PostgresVectorStoreOptions { // Constructor options for PostgresVectorStore; only dimensions is required.
54
+ /** Connection string (optional; poolConfig below is the alternative way to configure the connection) */
55
+ connectionString?: string;
56
+ poolConfig?: PoolConfig; // `pg` pool configuration; NOTE(review): precedence when both this and connectionString are set is not visible here — confirm.
57
+ /** Table name (default: 'chunks') */
58
+ tableName?: string;
59
+ /** Embedding dimensions */
60
+ dimensions: number;
61
+ /** Schema name (default: 'public') */
62
+ schema?: string;
63
+ }
64
+ declare class PostgresVectorStore implements VectorStore { // Ambient declaration of the PostgreSQL-backed VectorStore; implementation bodies are not part of this file.
65
+ private pool; // Private members are declared without types in this declaration output.
66
+ private tableName;
67
+ private schema;
68
+ private dimensions;
69
+ private fullTableName; // NOTE(review): presumably the schema-qualified table name — confirm in the implementation.
70
+ constructor(options: PostgresVectorStoreOptions);
71
+ initialize(): Promise<void>;
72
+ insert(chunks: DocumentChunk[]): Promise<void>;
73
+ denseSearch(embedding: number[], topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
74
+ sparseSearch(query: string, topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
75
+ deleteByDocumentId(documentId: string): Promise<number>;
76
+ getById(id: string): Promise<DocumentChunk | null>;
77
+ close(): Promise<void>;
78
+ /**
79
+ * Get chunk and document counts
79.5 (not part of the VectorStore interface; specific to this class)
80
+ */
81
+ getStats(): Promise<{
82
+ chunks: number;
83
+ documents: number;
84
+ }>;
85
+ /**
86
+ * Insert or update a document
86.5 (not part of the VectorStore interface; the parameter carries no content or timestamp fields)
87
+ */
88
+ upsertDocument(doc: {
89
+ id: string;
90
+ sourceId: string;
91
+ path: string;
92
+ title: string;
93
+ metadata?: Record<string, unknown>;
94
+ }): Promise<void>;
95
+ private rowToChunk;
96
+ }
97
+
98
+ /**
99
+ * Vector Store Implementations
100
+ *
101
+ * Store and search document chunks using vector similarity.
102
+ */
103
+
104
+ /**
105
+ * Create a vector store based on options
105.5 Returns a value typed as the VectorStore interface, not a concrete class.
105.6 NOTE(review): the options type allows 'sqlite' | 'postgres' | 'memory', but only SQLite and Postgres
105.7 classes are declared in this file; how 'memory' is handled is not visible here (confirm).
106
+ */
107
+ declare function createVectorStore(options: VectorStoreOptions): VectorStore;
108
+
109
+ export { PostgresVectorStore, type PostgresVectorStoreOptions, SQLiteVectorStore, type SQLiteVectorStoreOptions, createVectorStore };