@chatbot-packages/rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunking/index.d.ts +51 -0
- package/dist/chunking/index.js +248 -0
- package/dist/chunking/index.js.map +1 -0
- package/dist/embeddings/index.d.ts +103 -0
- package/dist/embeddings/index.js +195 -0
- package/dist/embeddings/index.js.map +1 -0
- package/dist/extractors/index.d.ts +95 -0
- package/dist/extractors/index.js +343 -0
- package/dist/extractors/index.js.map +1 -0
- package/dist/index.d.ts +78 -0
- package/dist/index.js +1576 -0
- package/dist/index.js.map +1 -0
- package/dist/retrieval/index.d.ts +65 -0
- package/dist/retrieval/index.js +144 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-CjnplPJD.d.ts +242 -0
- package/dist/vectorstore/index.d.ts +109 -0
- package/dist/vectorstore/index.js +422 -0
- package/dist/vectorstore/index.js.map +1 -0
- package/package.json +83 -0
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RAG System Types
|
|
3
|
+
*/
|
|
4
|
+
/**
 * A document record: identifiers, path, title, text content, optional
 * free-form metadata, and creation/update timestamps.
 */
interface Document {
|
|
5
|
+
id: string;
|
|
6
|
+
/** NOTE(review): presumably identifies the source the document was indexed from — confirm. */
sourceId: string;
|
|
7
|
+
path: string;
|
|
8
|
+
title: string;
|
|
9
|
+
content: string;
|
|
10
|
+
/** Optional free-form metadata; values are typed `unknown` and must be narrowed before use. */
metadata?: Record<string, unknown>;
|
|
11
|
+
createdAt: Date;
|
|
12
|
+
updatedAt: Date;
|
|
13
|
+
}
|
|
14
|
+
/**
 * A piece of a document's text together with its chunk-level metadata
 * and, optionally, an embedding vector.
 */
interface DocumentChunk {
|
|
15
|
+
id: string;
|
|
16
|
+
/** NOTE(review): presumably the `id` of the owning document — confirm. */
documentId: string;
|
|
17
|
+
text: string;
|
|
18
|
+
/** Optional embedding vector for `text`; its length is not constrained by this type. */
embedding?: number[];
|
|
19
|
+
metadata: ChunkMetadata;
|
|
20
|
+
createdAt: Date;
|
|
21
|
+
}
|
|
22
|
+
/**
 * Metadata attached to a chunk. Only `sectionPath` is required; the index
 * signature at the end additionally allows arbitrary string keys.
 */
interface ChunkMetadata {
|
|
23
|
+
/** Section path (e.g., "Chapter 1 > Section 2 > Subsection A") */
|
|
24
|
+
sectionPath: string;
|
|
25
|
+
/** H1 heading */
|
|
26
|
+
headingH1?: string;
|
|
27
|
+
/** H2 heading */
|
|
28
|
+
headingH2?: string;
|
|
29
|
+
/** H3 heading */
|
|
30
|
+
headingH3?: string;
|
|
31
|
+
/** Start position in original document (unit is not specified by this declaration) */
|
|
32
|
+
startOffset?: number;
|
|
33
|
+
/** End position in original document (unit is not specified by this declaration) */
|
|
34
|
+
endOffset?: number;
|
|
35
|
+
/** Source file path */
|
|
36
|
+
sourcePath?: string;
|
|
37
|
+
/** Page number (if applicable) */
|
|
38
|
+
pageNumber?: number;
|
|
39
|
+
/** Custom metadata: any other string key is accepted, with values typed `unknown` */
|
|
40
|
+
[key: string]: unknown;
|
|
41
|
+
}
|
|
42
|
+
/** Format identifiers for extraction; used as the type of `ExtractedDocument.format`. */
type ExtractorType = 'chm' | 'html' | 'markdown' | 'text';
|
|
43
|
+
/** Options for an extractor. Only `sourcePath` is required. */
interface ExtractorOptions {
|
|
44
|
+
/** Source file or directory path */
|
|
45
|
+
sourcePath: string;
|
|
46
|
+
/** Output directory for extracted content */
|
|
47
|
+
outputDir?: string;
|
|
48
|
+
/** Whether to preserve directory structure */
|
|
49
|
+
preserveStructure?: boolean;
|
|
50
|
+
}
|
|
51
|
+
/**
 * A document as produced by extraction: path, title, text content, the
 * format it was tagged with, and optional free-form metadata.
 */
interface ExtractedDocument {
|
|
52
|
+
path: string;
|
|
53
|
+
title: string;
|
|
54
|
+
content: string;
|
|
55
|
+
/** One of the `ExtractorType` literals. */
format: ExtractorType;
|
|
56
|
+
/** Optional free-form metadata; values are typed `unknown`. */
metadata?: Record<string, unknown>;
|
|
57
|
+
}
|
|
58
|
+
/**
 * Options controlling how documents are split into chunks. Every field is
 * optional; the defaults applied when a field is omitted are not visible here.
 */
interface ChunkingOptions {
|
|
59
|
+
/** Target chunk size in tokens */
|
|
60
|
+
chunkSize?: number;
|
|
61
|
+
/** Overlap between chunks in tokens */
|
|
62
|
+
chunkOverlap?: number;
|
|
63
|
+
/** Minimum chunk size in tokens */
|
|
64
|
+
minChunkSize?: number;
|
|
65
|
+
/** Maximum chunk size in tokens */
|
|
66
|
+
maxChunkSize?: number;
|
|
67
|
+
/** Whether to respect heading boundaries */
|
|
68
|
+
respectHeadings?: boolean;
|
|
69
|
+
/** Heading levels to split on (e.g., [1, 2]) */
|
|
70
|
+
splitOnHeadings?: number[];
|
|
71
|
+
}
|
|
72
|
+
/** Result of a chunking run: the produced chunks plus summary statistics. */
interface ChunkResult {
|
|
73
|
+
chunks: DocumentChunk[];
|
|
74
|
+
/** Summary statistics. NOTE(review): sizes are presumably in tokens, matching `ChunkingOptions` — confirm. */
stats: {
|
|
75
|
+
totalChunks: number;
|
|
76
|
+
avgChunkSize: number;
|
|
77
|
+
minChunkSize: number;
|
|
78
|
+
maxChunkSize: number;
|
|
79
|
+
};
|
|
80
|
+
}
|
|
81
|
+
/** Embedding provider identifiers; used as the type of `EmbeddingOptions.provider`. */
type EmbeddingProvider = 'local' | 'openai' | 'huggingface';
|
|
82
|
+
/** Configuration for embedding generation. Only `provider` is required. */
interface EmbeddingOptions {
|
|
83
|
+
/** Embedding provider */
|
|
84
|
+
provider: EmbeddingProvider;
|
|
85
|
+
/** Model name (for local: 'bge-large-en-v1.5', for OpenAI: 'text-embedding-3-small') */
|
|
86
|
+
model?: string;
|
|
87
|
+
/** API key (for cloud providers) */
|
|
88
|
+
apiKey?: string;
|
|
89
|
+
/** Batch size for embedding generation */
|
|
90
|
+
batchSize?: number;
|
|
91
|
+
/** Embedding dimensions (for providers that support it) */
|
|
92
|
+
dimensions?: number;
|
|
93
|
+
}
|
|
94
|
+
/** Output of embedding one text: the vector and, optionally, a token count. */
interface EmbeddingResult {
|
|
95
|
+
embedding: number[];
|
|
96
|
+
/** NOTE(review): presumably the number of tokens consumed for this text — confirm. */
tokens?: number;
|
|
97
|
+
}
|
|
98
|
+
/**
 * Contract for an embedding backend: asynchronous single and batch
 * embedding, plus synchronous accessors for dimensions and model name.
 */
interface EmbeddingBackend {
|
|
99
|
+
/** Embed a single text */
|
|
100
|
+
embed(text: string): Promise<EmbeddingResult>;
|
|
101
|
+
/** Embed multiple texts; resolves to an array of results (ordering relative to `texts` is not stated here) */
|
|
102
|
+
embedBatch(texts: string[]): Promise<EmbeddingResult[]>;
|
|
103
|
+
/** Get embedding dimensions */
|
|
104
|
+
getDimensions(): number;
|
|
105
|
+
/** Get model name */
|
|
106
|
+
getModel(): string;
|
|
107
|
+
}
|
|
108
|
+
/** Vector store backend identifiers; used as the type of `VectorStoreOptions.type`. */
type VectorStoreType = 'sqlite' | 'postgres' | 'memory';
|
|
109
|
+
/** Configuration for a vector store. `type` and `dimensions` are required. */
interface VectorStoreOptions {
|
|
110
|
+
/** Store type */
|
|
111
|
+
type: VectorStoreType;
|
|
112
|
+
/** Database path (for SQLite) or connection string (for PostgreSQL) */
|
|
113
|
+
connectionString?: string;
|
|
114
|
+
/** Table/collection name */
|
|
115
|
+
tableName?: string;
|
|
116
|
+
/** Embedding dimensions */
|
|
117
|
+
dimensions: number;
|
|
118
|
+
}
|
|
119
|
+
/** A single search hit: the matched chunk, its score, and which search produced it. */
interface SearchResult {
|
|
120
|
+
chunk: DocumentChunk;
|
|
121
|
+
/** NOTE(review): score scale and direction are not specified by this declaration — confirm per store/search type. */
score: number;
|
|
122
|
+
/** Search type that found this result */
|
|
123
|
+
searchType: 'dense' | 'sparse' | 'hybrid';
|
|
124
|
+
}
|
|
125
|
+
/**
 * Contract implemented by vector store backends: lifecycle
 * (`initialize`/`close`), insertion, dense and sparse search, and
 * lookup/deletion by ID. All methods are asynchronous.
 */
interface VectorStore {
|
|
126
|
+
/** Initialize the store (create tables, indexes) */
|
|
127
|
+
initialize(): Promise<void>;
|
|
128
|
+
/** Insert chunks with embeddings */
|
|
129
|
+
insert(chunks: DocumentChunk[]): Promise<void>;
|
|
130
|
+
/** Dense (vector) search for up to `topK` results, with an optional metadata filter */
|
|
131
|
+
denseSearch(embedding: number[], topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
|
|
132
|
+
/** Sparse (BM25/text) search for up to `topK` results, with an optional metadata filter */
|
|
133
|
+
sparseSearch(query: string, topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
|
|
134
|
+
/** Delete chunks by document ID; resolves to a number (presumably the count deleted — confirm) */
|
|
135
|
+
deleteByDocumentId(documentId: string): Promise<number>;
|
|
136
|
+
/** Get chunk by ID; resolves to `null` or a chunk (presumably `null` when no chunk matches — confirm) */
|
|
137
|
+
getById(id: string): Promise<DocumentChunk | null>;
|
|
138
|
+
/** Close connection */
|
|
139
|
+
close(): Promise<void>;
|
|
140
|
+
}
|
|
141
|
+
/**
 * Per-query retrieval settings. Every field is optional; the defaults
 * applied when a field is omitted are not visible in this declaration.
 */
interface RetrievalOptions {
|
|
142
|
+
/** Number of results to return */
|
|
143
|
+
topK?: number;
|
|
144
|
+
/** Dense search weight (0-1) */
|
|
145
|
+
denseWeight?: number;
|
|
146
|
+
/** Sparse search weight (0-1) */
|
|
147
|
+
sparseWeight?: number;
|
|
148
|
+
/** RRF k parameter (RRF presumably stands for Reciprocal Rank Fusion — confirm against the retrieval module) */
|
|
149
|
+
rrfK?: number;
|
|
150
|
+
/** Whether to use reranking */
|
|
151
|
+
useReranking?: boolean;
|
|
152
|
+
/** Number of candidates to rerank */
|
|
153
|
+
rerankTopK?: number;
|
|
154
|
+
/** Metadata filter */
|
|
155
|
+
filter?: Record<string, unknown>;
|
|
156
|
+
}
|
|
157
|
+
/** Outcome of a retrieval call: the search results plus run statistics. */
interface RetrievalResult {
|
|
158
|
+
results: SearchResult[];
|
|
159
|
+
/** Run statistics. NOTE(review): the unit of `totalTime` is not stated here (presumably ms) — confirm. */
stats: {
|
|
160
|
+
denseCount: number;
|
|
161
|
+
sparseCount: number;
|
|
162
|
+
rerankingApplied: boolean;
|
|
163
|
+
totalTime: number;
|
|
164
|
+
};
|
|
165
|
+
}
|
|
166
|
+
/**
 * Top-level configuration for the RAG system. `documents`, `vectorStore`,
 * `embeddings`, and `llm` are required; the remaining fields are optional.
 */
interface RAGOptions {
|
|
167
|
+
/** Path to documents (file or directory) */
|
|
168
|
+
documents: string;
|
|
169
|
+
/** Vector store configuration */
|
|
170
|
+
vectorStore: VectorStoreOptions;
|
|
171
|
+
/** Embedding configuration */
|
|
172
|
+
embeddings: EmbeddingOptions;
|
|
173
|
+
/** LLM configuration: `provider` is required; `apiKey`, `model`, and `baseUrl` are optional */
|
|
174
|
+
llm: {
|
|
175
|
+
provider: 'openai' | 'anthropic' | 'cerebras' | 'groq';
|
|
176
|
+
apiKey?: string;
|
|
177
|
+
model?: string;
|
|
178
|
+
baseUrl?: string;
|
|
179
|
+
};
|
|
180
|
+
/** Chunking options */
|
|
181
|
+
chunking?: ChunkingOptions;
|
|
182
|
+
/** Retrieval options */
|
|
183
|
+
retrieval?: RetrievalOptions;
|
|
184
|
+
/** System prompt for the LLM */
|
|
185
|
+
systemPrompt?: string;
|
|
186
|
+
}
|
|
187
|
+
/** A numbered reference from an answer to the chunk that supports it. */
interface Citation {
|
|
188
|
+
/** Citation index (1-based) */
|
|
189
|
+
index: number;
|
|
190
|
+
/** Chunk ID */
|
|
191
|
+
chunkId: string;
|
|
192
|
+
/** Section path */
|
|
193
|
+
sectionPath: string;
|
|
194
|
+
/** Heading (if available) */
|
|
195
|
+
heading?: string;
|
|
196
|
+
/** Relevant text snippet */
|
|
197
|
+
snippet?: string;
|
|
198
|
+
}
|
|
199
|
+
/**
 * Full response to a question: the question and answer text, citations,
 * the retrieved context chunks, and timing/model metadata.
 */
interface RAGResponse {
|
|
200
|
+
/** The question that was asked */
|
|
201
|
+
question: string;
|
|
202
|
+
/** The generated answer */
|
|
203
|
+
answer: string;
|
|
204
|
+
/** Citations from source documents */
|
|
205
|
+
citations: Citation[];
|
|
206
|
+
/** Retrieved context chunks */
|
|
207
|
+
context: DocumentChunk[];
|
|
208
|
+
/** Response metadata */
|
|
209
|
+
metadata: {
|
|
210
|
+
/** Total processing time in ms */
|
|
211
|
+
totalTime: number;
|
|
212
|
+
/** Retrieval time in ms */
|
|
213
|
+
retrievalTime: number;
|
|
214
|
+
/** Generation time in ms */
|
|
215
|
+
generationTime: number;
|
|
216
|
+
/** Whether response was cached */
|
|
217
|
+
cached: boolean;
|
|
218
|
+
/** Model used */
|
|
219
|
+
model: string;
|
|
220
|
+
};
|
|
221
|
+
}
|
|
222
|
+
/**
 * Service-level contract of the RAG system: indexing, question answering,
 * count queries, clearing, and shutdown. All methods are asynchronous.
 */
interface RAGService {
|
|
223
|
+
/** Index documents from a path; resolves to the number of documents indexed and chunks created */
|
|
224
|
+
index(path: string, options?: {
|
|
225
|
+
sourceId?: string;
|
|
226
|
+
}): Promise<{
|
|
227
|
+
documentsIndexed: number;
|
|
228
|
+
chunksCreated: number;
|
|
229
|
+
}>;
|
|
230
|
+
/** Ask a question, optionally passing per-call retrieval options */
|
|
231
|
+
ask(question: string, options?: RetrievalOptions): Promise<RAGResponse>;
|
|
232
|
+
/** Get indexed document count */
|
|
233
|
+
getDocumentCount(): Promise<number>;
|
|
234
|
+
/** Get chunk count */
|
|
235
|
+
getChunkCount(): Promise<number>;
|
|
236
|
+
/** Clear all indexed data */
|
|
237
|
+
clear(): Promise<void>;
|
|
238
|
+
/** Close connections */
|
|
239
|
+
close(): Promise<void>;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
// Type-only exports under short aliases (C, D, E, ...). Consumers import them
// back under the full names, e.g. `import { V as VectorStore } from '../types-CjnplPJD.js'`.
export type { ChunkMetadata as C, Document as D, EmbeddingBackend as E, RAGService as R, SearchResult as S, VectorStore as V, RAGOptions as a, RetrievalOptions as b, RAGResponse as c, ChunkResult as d, ChunkingOptions as e, Citation as f, DocumentChunk as g, EmbeddingOptions as h, EmbeddingResult as i, ExtractedDocument as j, RetrievalResult as k, VectorStoreOptions as l, ExtractorOptions as m, ExtractorType as n };
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import { V as VectorStore, g as DocumentChunk, S as SearchResult, l as VectorStoreOptions } from '../types-CjnplPJD.js';
|
|
2
|
+
import { PoolConfig } from 'pg';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* SQLite Vector Store
|
|
6
|
+
*
|
|
7
|
+
* Simple vector store using SQLite with better-sqlite3.
|
|
8
|
+
* Stores embeddings as JSON arrays and performs similarity search in JavaScript.
|
|
9
|
+
* Best for development, small datasets, or when PostgreSQL isn't available.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/** Constructor options for `SQLiteVectorStore`. Only `dimensions` is required. */
interface SQLiteVectorStoreOptions {
|
|
13
|
+
/** Database file path (use ':memory:' for in-memory) */
|
|
14
|
+
path?: string;
|
|
15
|
+
/** Table name (default: 'chunks') */
|
|
16
|
+
tableName?: string;
|
|
17
|
+
/** Embedding dimensions */
|
|
18
|
+
dimensions: number;
|
|
19
|
+
}
|
|
20
|
+
/**
 * SQLite-backed implementation of the `VectorStore` interface.
 * Private members are declared without types here, so their shapes cannot
 * be read from this declaration file.
 */
declare class SQLiteVectorStore implements VectorStore {
|
|
21
|
+
private db;
|
|
22
|
+
private tableName;
|
|
23
|
+
private dimensions;
|
|
24
|
+
/** Construct a store from `SQLiteVectorStoreOptions`. */
constructor(options: SQLiteVectorStoreOptions);
|
|
25
|
+
// The public methods below mirror the `VectorStore` interface signatures.
initialize(): Promise<void>;
|
|
26
|
+
insert(chunks: DocumentChunk[]): Promise<void>;
|
|
27
|
+
denseSearch(embedding: number[], topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
|
|
28
|
+
sparseSearch(query: string, topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
|
|
29
|
+
deleteByDocumentId(documentId: string): Promise<number>;
|
|
30
|
+
getById(id: string): Promise<DocumentChunk | null>;
|
|
31
|
+
close(): Promise<void>;
|
|
32
|
+
/**
|
|
33
|
+
* Calculate cosine similarity between two vectors
|
|
34
|
+
*/
|
|
35
|
+
private cosineSimilarity;
|
|
36
|
+
/**
|
|
37
|
+
* Convert database row to DocumentChunk
|
|
38
|
+
*/
|
|
39
|
+
private rowToChunk;
|
|
40
|
+
/**
|
|
41
|
+
* Escape FTS query to prevent syntax errors
|
|
42
|
+
*/
|
|
43
|
+
private escapeFTSQuery;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* PostgreSQL Vector Store with pgvector
|
|
48
|
+
*
|
|
49
|
+
* Production-ready vector store using PostgreSQL with pgvector extension.
|
|
50
|
+
* Supports HNSW indexing for fast similarity search.
|
|
51
|
+
*/
|
|
52
|
+
|
|
53
|
+
/** Constructor options for `PostgresVectorStore`. Only `dimensions` is required. */
interface PostgresVectorStoreOptions {
|
|
54
|
+
/** Connection string or pool config */
|
|
55
|
+
connectionString?: string;
|
|
56
|
+
/** `PoolConfig` from the `pg` package. NOTE(review): precedence when both this and `connectionString` are set is not visible here — confirm. */
poolConfig?: PoolConfig;
|
|
57
|
+
/** Table name (default: 'chunks') */
|
|
58
|
+
tableName?: string;
|
|
59
|
+
/** Embedding dimensions */
|
|
60
|
+
dimensions: number;
|
|
61
|
+
/** Schema name (default: 'public') */
|
|
62
|
+
schema?: string;
|
|
63
|
+
}
|
|
64
|
+
/**
 * PostgreSQL-backed implementation of the `VectorStore` interface. Beyond
 * the interface it also exposes `getStats` and `upsertDocument`.
 * Private members are declared without types here, so their shapes cannot
 * be read from this declaration file.
 */
declare class PostgresVectorStore implements VectorStore {
|
|
65
|
+
private pool;
|
|
66
|
+
private tableName;
|
|
67
|
+
private schema;
|
|
68
|
+
private dimensions;
|
|
69
|
+
private fullTableName;
|
|
70
|
+
/** Construct a store from `PostgresVectorStoreOptions`. */
constructor(options: PostgresVectorStoreOptions);
|
|
71
|
+
// The public methods below, through `close`, mirror the `VectorStore` interface signatures.
initialize(): Promise<void>;
|
|
72
|
+
insert(chunks: DocumentChunk[]): Promise<void>;
|
|
73
|
+
denseSearch(embedding: number[], topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
|
|
74
|
+
sparseSearch(query: string, topK: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
|
|
75
|
+
deleteByDocumentId(documentId: string): Promise<number>;
|
|
76
|
+
getById(id: string): Promise<DocumentChunk | null>;
|
|
77
|
+
close(): Promise<void>;
|
|
78
|
+
/**
|
|
79
|
+
* Get chunk and document counts
|
|
80
|
+
*/
|
|
81
|
+
getStats(): Promise<{
|
|
82
|
+
chunks: number;
|
|
83
|
+
documents: number;
|
|
84
|
+
}>;
|
|
85
|
+
/**
|
|
86
|
+
* Insert or update a document
|
|
87
|
+
*/
|
|
88
|
+
upsertDocument(doc: {
|
|
89
|
+
id: string;
|
|
90
|
+
sourceId: string;
|
|
91
|
+
path: string;
|
|
92
|
+
title: string;
|
|
93
|
+
metadata?: Record<string, unknown>;
|
|
94
|
+
}): Promise<void>;
|
|
95
|
+
private rowToChunk;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/**
|
|
99
|
+
* Vector Store Implementations
|
|
100
|
+
*
|
|
101
|
+
* Store and search document chunks using vector similarity.
|
|
102
|
+
*/
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Create a vector store based on options.
* Takes `VectorStoreOptions` (whose `type` is 'sqlite' | 'postgres' | 'memory')
* and returns a `VectorStore` synchronously.
* NOTE(review): which implementation each `type` value maps to is not visible
* in this declaration file — confirm against the implementation.
|
|
106
|
+
*/
|
|
107
|
+
declare function createVectorStore(options: VectorStoreOptions): VectorStore;
|
|
108
|
+
|
|
109
|
+
// Public surface of the vectorstore entry point: both store classes, the
// factory function, and (type-only) the two option interfaces.
export { PostgresVectorStore, type PostgresVectorStoreOptions, SQLiteVectorStore, type SQLiteVectorStoreOptions, createVectorStore };
|