@elsium-ai/rag 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/README.md +731 -12
  2. package/package.json +3 -3
package/README.md CHANGED
@@ -13,26 +13,745 @@ npm install @elsium-ai/rag @elsium-ai/core
13
13
 
14
14
  ## What's Inside
15
15
 
16
- - **Document Loading** — Load documents from various sources
17
- - **Chunking** — Split documents with configurable strategies (fixed-size, recursive, semantic)
18
- - **Embeddings** — Generate embeddings with pluggable providers
19
- - **Vector Search** — In-memory vector store with cosine similarity search
20
- - **RAG Pipeline** — End-to-end retrieval-augmented generation
16
+ | Category | Exports |
17
+ |---|---|
18
+ | **Types** | `Document`, `DocumentMetadata`, `Chunk`, `ChunkMetadata`, `EmbeddingVector`, `EmbeddedChunk`, `RetrievalResult`, `QueryOptions`, `LoaderType`, `ChunkingStrategy`, `ChunkingConfig`, `EmbeddingConfig`, `VectorStoreConfig`, `RetrievalConfig` |
19
+ | **Loaders** | `textLoader`, `markdownLoader`, `htmlLoader`, `jsonLoader`, `csvLoader`, `getLoader`, `DocumentLoader` |
20
+ | **Chunkers** | `fixedSizeChunker`, `recursiveChunker`, `sentenceChunker`, `getChunker`, `Chunker` |
21
+ | **Embeddings** | `createOpenAIEmbeddings`, `createMockEmbeddings`, `getEmbeddingProvider`, `EmbeddingProvider` |
22
+ | **Vector Store** | `createInMemoryStore`, `cosineSimilarity`, `mmrRerank`, `VectorStore` |
23
+ | **Pipeline** | `rag`, `RAGPipeline`, `RAGPipelineConfig`, `IngestResult` |
21
24
 
22
- ## Usage
25
+ ---
26
+
27
+ ## Types
28
+
29
+ ### `Document`
30
+
31
+ Represents a loaded document ready for chunking.
32
+
33
+ ```typescript
34
+ interface Document {
35
+ id: string
36
+ content: string
37
+ metadata: DocumentMetadata
38
+ }
39
+ ```
40
+
41
+ ### `DocumentMetadata`
42
+
43
+ Metadata attached to a document. Includes required fields plus arbitrary extra properties.
44
+
45
+ ```typescript
46
+ interface DocumentMetadata {
47
+ source: string
48
+ type: string
49
+ title?: string
50
+ language?: string
51
+ createdAt?: string
52
+ [key: string]: unknown
53
+ }
54
+ ```
55
+
56
+ ### `Chunk`
57
+
58
+ A segment of a document produced by a chunker.
59
+
60
+ ```typescript
61
+ interface Chunk {
62
+ id: string
63
+ content: string
64
+ documentId: string
65
+ index: number
66
+ metadata: ChunkMetadata
67
+ }
68
+ ```
69
+
70
+ ### `ChunkMetadata`
71
+
72
+ Positional and token metadata for a chunk. Supports arbitrary extra properties.
73
+
74
+ ```typescript
75
+ interface ChunkMetadata {
76
+ startChar: number
77
+ endChar: number
78
+ tokenEstimate: number
79
+ [key: string]: unknown
80
+ }
81
+ ```
82
+
83
+ ### `EmbeddingVector`
84
+
85
+ A numeric embedding vector with its dimensionality.
86
+
87
+ ```typescript
88
+ interface EmbeddingVector {
89
+ values: number[]
90
+ dimensions: number
91
+ }
92
+ ```
93
+
94
+ ### `EmbeddedChunk`
95
+
96
+ A chunk that has been embedded. Extends `Chunk` with an `embedding` field.
97
+
98
+ ```typescript
99
+ interface EmbeddedChunk extends Chunk {
100
+ embedding: EmbeddingVector
101
+ }
102
+ ```
103
+
104
+ ### `RetrievalResult`
105
+
106
+ A single result returned from a vector store query, pairing a chunk with its relevance `score` and its `distance` from the query embedding.
107
+
108
+ ```typescript
109
+ interface RetrievalResult {
110
+ chunk: Chunk
111
+ score: number
112
+ distance: number
113
+ }
114
+ ```
115
+
116
+ ### `QueryOptions`
117
+
118
+ Options passed to a vector store query or pipeline query.
119
+
120
+ ```typescript
121
+ interface QueryOptions {
122
+ topK?: number
123
+ minScore?: number
124
+ filter?: Record<string, unknown>
125
+ }
126
+ ```
127
+
128
+ ### `LoaderType`
129
+
130
+ Union of supported document loader types.
131
+
132
+ ```typescript
133
+ type LoaderType = 'text' | 'markdown' | 'html' | 'json' | 'csv'
134
+ ```
135
+
136
+ ### `ChunkingStrategy`
137
+
138
+ Union of supported chunking strategies.
139
+
140
+ ```typescript
141
+ type ChunkingStrategy = 'fixed-size' | 'recursive' | 'sentence'
142
+ ```
143
+
144
+ ### `ChunkingConfig`
145
+
146
+ Configuration object for creating a chunker via `getChunker`.
147
+
148
+ ```typescript
149
+ interface ChunkingConfig {
150
+ strategy: ChunkingStrategy
151
+ maxChunkSize?: number
152
+ overlap?: number
153
+ separator?: string
154
+ }
155
+ ```
156
+
157
+ ### `EmbeddingConfig`
158
+
159
+ Configuration object for creating an embedding provider.
160
+
161
+ ```typescript
162
+ interface EmbeddingConfig {
163
+ provider: string
164
+ model?: string
165
+ apiKey?: string
166
+ baseUrl?: string
167
+ dimensions?: number
168
+ batchSize?: number
169
+ }
170
+ ```
171
+
172
+ ### `VectorStoreConfig`
173
+
174
+ Configuration object for specifying a vector store backend.
175
+
176
+ ```typescript
177
+ interface VectorStoreConfig {
178
+ provider: string
179
+ connectionString?: string
180
+ tableName?: string
181
+ dimensions?: number
182
+ }
183
+ ```
184
+
185
+ ### `RetrievalConfig`
186
+
187
+ Configuration for retrieval behavior within a RAG pipeline.
188
+
189
+ ```typescript
190
+ interface RetrievalConfig {
191
+ topK?: number
192
+ minScore?: number
193
+ strategy?: 'similarity' | 'mmr'
194
+ mmrLambda?: number
195
+ }
196
+ ```
197
+
198
+ ---
199
+
200
+ ## Loaders
201
+
202
+ ### `DocumentLoader`
203
+
204
+ Interface implemented by all document loaders.
205
+
206
+ ```typescript
207
+ interface DocumentLoader {
208
+ load(source: string, content: string): Document
209
+ }
210
+ ```
211
+
212
+ ### `textLoader`
213
+
214
+ Creates a loader for plain text content.
215
+
216
+ ```typescript
217
+ function textLoader(): DocumentLoader
218
+ ```
219
+
220
+ **Returns:** A `DocumentLoader` that produces documents with `type: 'text'`.
221
+
222
+ ```typescript
223
+ import { textLoader } from '@elsium-ai/rag'
224
+
225
+ const loader = textLoader()
226
+ const doc = loader.load('notes.txt', 'Hello, world!')
227
+
228
+ console.log(doc.metadata.type) // "text"
229
+ ```
230
+
231
+ ### `markdownLoader`
232
+
233
+ Creates a loader for Markdown content. Automatically extracts the first `# heading` as the document title.
234
+
235
+ ```typescript
236
+ function markdownLoader(): DocumentLoader
237
+ ```
238
+
239
+ **Returns:** A `DocumentLoader` that produces documents with `type: 'markdown'` and an optional `title` in metadata.
240
+
241
+ ```typescript
242
+ import { markdownLoader } from '@elsium-ai/rag'
243
+
244
+ const loader = markdownLoader()
245
+ const doc = loader.load('readme.md', '# My Project\n\nSome content here.')
246
+
247
+ console.log(doc.metadata.title) // "My Project"
248
+ ```
249
+
250
+ ### `htmlLoader`
251
+
252
+ Creates a loader for HTML content. Strips tags, scripts, and styles from the content, and extracts the `<title>` element as the document title.
253
+
254
+ ```typescript
255
+ function htmlLoader(): DocumentLoader
256
+ ```
257
+
258
+ **Returns:** A `DocumentLoader` that produces documents with `type: 'html'`, the HTML body converted to plain text, and an optional `title` in metadata.
259
+
260
+ ```typescript
261
+ import { htmlLoader } from '@elsium-ai/rag'
262
+
263
+ const loader = htmlLoader()
264
+ const doc = loader.load('page.html', '<html><title>Hello</title><body><p>World</p></body></html>')
265
+
266
+ console.log(doc.metadata.title) // "Hello"
267
+ console.log(doc.content) // "World"
268
+ ```
269
+
270
+ ### `jsonLoader`
271
+
272
+ Creates a loader for JSON content. Extracts text from a configurable content field and can pull additional metadata fields.
273
+
274
+ ```typescript
275
+ function jsonLoader(options?: {
276
+ contentField?: string
277
+ metadataFields?: string[]
278
+ }): DocumentLoader
279
+ ```
280
+
281
+ | Parameter | Type | Default | Description |
282
+ |---|---|---|---|
283
+ | `options.contentField` | `string` | `'content'` | The JSON key to read text content from. |
284
+ | `options.metadataFields` | `string[]` | `[]` | Additional top-level JSON keys to copy into document metadata. |
285
+
286
+ **Returns:** A `DocumentLoader` that produces documents with `type: 'json'`. If the parsed JSON is an array, each item's content field is joined with double newlines.
287
+
288
+ ```typescript
289
+ import { jsonLoader } from '@elsium-ai/rag'
290
+
291
+ const loader = jsonLoader({ contentField: 'text', metadataFields: ['author'] })
292
+ const doc = loader.load('data.json', JSON.stringify({ text: 'Hello', author: 'Alice' }))
293
+
294
+ console.log(doc.content) // "Hello"
295
+ console.log(doc.metadata.author) // "Alice"
296
+ ```
297
+
298
+ ### `csvLoader`
299
+
300
+ Creates a loader for CSV content. Parses headers and rows, optionally selecting specific columns.
301
+
302
+ ```typescript
303
+ function csvLoader(options?: {
304
+ separator?: string
305
+ contentColumns?: string[]
306
+ }): DocumentLoader
307
+ ```
308
+
309
+ | Parameter | Type | Default | Description |
310
+ |---|---|---|---|
311
+ | `options.separator` | `string` | `','` | Column delimiter character. |
312
+ | `options.contentColumns` | `string[]` | all columns | Subset of columns to include in the document text. |
313
+
314
+ **Returns:** A `DocumentLoader` that produces documents with `type: 'csv'`. Metadata includes `rowCount` and `columns`.
315
+
316
+ ```typescript
317
+ import { csvLoader } from '@elsium-ai/rag'
318
+
319
+ const loader = csvLoader({ contentColumns: ['name', 'bio'] })
320
+ const doc = loader.load('people.csv', 'name,age,bio\nAlice,30,Engineer\nBob,25,Designer')
321
+
322
+ console.log(doc.metadata.rowCount) // 2
323
+ console.log(doc.metadata.columns) // ["name", "age", "bio"]
324
+ ```
325
+
326
+ ### `getLoader`
327
+
328
+ Factory function that returns a `DocumentLoader` for the given `LoaderType`.
329
+
330
+ ```typescript
331
+ function getLoader(type: LoaderType): DocumentLoader
332
+ ```
333
+
334
+ | Parameter | Type | Description |
335
+ |---|---|---|
336
+ | `type` | `LoaderType` | One of `'text'`, `'markdown'`, `'html'`, `'json'`, `'csv'`. |
337
+
338
+ **Returns:** The corresponding `DocumentLoader` with default options.
339
+
340
+ ```typescript
341
+ import { getLoader } from '@elsium-ai/rag'
342
+
343
+ const loader = getLoader('markdown')
344
+ const doc = loader.load('file.md', '# Title\n\nBody text.')
345
+ ```
346
+
347
+ ---
348
+
349
+ ## Chunkers
350
+
351
+ ### `Chunker`
352
+
353
+ Interface implemented by all chunking strategies.
354
+
355
+ ```typescript
356
+ interface Chunker {
357
+ chunk(document: Document): Chunk[]
358
+ }
359
+ ```
360
+
361
+ ### `fixedSizeChunker`
362
+
363
+ Creates a chunker that splits documents into fixed-size character windows with optional overlap.
364
+
365
+ ```typescript
366
+ function fixedSizeChunker(options?: {
367
+ maxChunkSize?: number
368
+ overlap?: number
369
+ }): Chunker
370
+ ```
371
+
372
+ | Parameter | Type | Default | Description |
373
+ |---|---|---|---|
374
+ | `options.maxChunkSize` | `number` | `512` | Maximum number of characters per chunk. |
375
+ | `options.overlap` | `number` | `0` | Number of overlapping characters between consecutive chunks. Must be less than `maxChunkSize`. |
376
+
377
+ **Returns:** A `Chunker` that produces fixed-size chunks.
378
+
379
+ ```typescript
380
+ import { fixedSizeChunker } from '@elsium-ai/rag'
381
+
382
+ const chunker = fixedSizeChunker({ maxChunkSize: 256, overlap: 32 })
383
+ // Assuming `doc` is a Document:
384
+ const chunks = chunker.chunk(doc)
385
+ ```
386
+
387
+ ### `recursiveChunker`
388
+
389
+ Creates a chunker that recursively splits text using a hierarchy of separators (paragraph breaks, line breaks, sentence endings, spaces), falling back to fixed-size splitting when no separator fits.
390
+
391
+ ```typescript
392
+ function recursiveChunker(options?: {
393
+ maxChunkSize?: number
394
+ overlap?: number
395
+ separators?: string[]
396
+ }): Chunker
397
+ ```
398
+
399
+ | Parameter | Type | Default | Description |
400
+ |---|---|---|---|
401
+ | `options.maxChunkSize` | `number` | `512` | Maximum number of characters per chunk. |
402
+ | `options.overlap` | `number` | `0` | Number of overlapping characters between consecutive chunks. Must be less than `maxChunkSize`. |
403
+ | `options.separators` | `string[]` | `['\n\n', '\n', '. ', ' ', '']` | Ordered list of separators to try. |
404
+
405
+ **Returns:** A `Chunker` that recursively splits documents using natural boundaries.
406
+
407
+ ```typescript
408
+ import { recursiveChunker } from '@elsium-ai/rag'
409
+
410
+ const chunker = recursiveChunker({ maxChunkSize: 512, overlap: 50 })
411
+ const chunks = chunker.chunk(doc)
412
+ ```
413
+
414
+ ### `sentenceChunker`
415
+
416
+ Creates a chunker that splits text on sentence boundaries (`.`, `!`, `?` followed by whitespace) and groups sentences into chunks up to `maxChunkSize` characters with optional sentence-level overlap.
417
+
418
+ ```typescript
419
+ function sentenceChunker(options?: {
420
+ maxChunkSize?: number
421
+ overlap?: number
422
+ }): Chunker
423
+ ```
424
+
425
+ | Parameter | Type | Default | Description |
426
+ |---|---|---|---|
427
+ | `options.maxChunkSize` | `number` | `512` | Maximum number of characters per chunk. |
428
+ | `options.overlap` | `number` | `1` | Number of sentences to overlap between consecutive chunks. |
429
+
430
+ **Returns:** A `Chunker` that groups sentences into chunks. Each chunk's metadata includes `sentenceCount`.
431
+
432
+ ```typescript
433
+ import { sentenceChunker } from '@elsium-ai/rag'
434
+
435
+ const chunker = sentenceChunker({ maxChunkSize: 300, overlap: 2 })
436
+ const chunks = chunker.chunk(doc)
437
+
438
+ console.log(chunks[0].metadata.sentenceCount) // number of sentences in the first chunk
439
+ ```
440
+
441
+ ### `getChunker`
442
+
443
+ Factory function that returns a `Chunker` for the given `ChunkingConfig`.
444
+
445
+ ```typescript
446
+ function getChunker(config: ChunkingConfig): Chunker
447
+ ```
448
+
449
+ | Parameter | Type | Description |
450
+ |---|---|---|
451
+ | `config` | `ChunkingConfig` | Chunking configuration: a required `strategy` plus optional `maxChunkSize`, `overlap`, and `separator`. |
452
+
453
+ **Returns:** The corresponding `Chunker` instance.
454
+
455
+ ```typescript
456
+ import { getChunker } from '@elsium-ai/rag'
457
+
458
+ const chunker = getChunker({ strategy: 'recursive', maxChunkSize: 512, overlap: 50 })
459
+ const chunks = chunker.chunk(doc)
460
+ ```
461
+
462
+ ---
463
+
464
+ ## Embeddings
465
+
466
+ ### `EmbeddingProvider`
467
+
468
+ Interface implemented by all embedding providers.
469
+
470
+ ```typescript
471
+ interface EmbeddingProvider {
472
+ readonly name: string
473
+ readonly dimensions: number
474
+
475
+ embed(text: string): Promise<EmbeddingVector>
476
+ embedBatch(texts: string[]): Promise<EmbeddingVector[]>
477
+ }
478
+ ```
479
+
480
+ ### `createOpenAIEmbeddings`
481
+
482
+ Creates an embedding provider backed by the OpenAI embeddings API.
483
+
484
+ ```typescript
485
+ function createOpenAIEmbeddings(config: EmbeddingConfig): EmbeddingProvider
486
+ ```
487
+
488
+ | Parameter | Type | Default | Description |
489
+ |---|---|---|---|
490
+ | `config.apiKey` | `string` | **(required)** | OpenAI API key. Throws if not provided. |
491
+ | `config.model` | `string` | `'text-embedding-3-small'` | Model name to use. |
492
+ | `config.baseUrl` | `string` | `'https://api.openai.com'` | API base URL (useful for proxies or compatible APIs). |
493
+ | `config.dimensions` | `number` | `1536` | Desired embedding dimensions. |
494
+ | `config.batchSize` | `number` | `100` | Maximum number of texts per API call when using `embedBatch`. |
495
+
496
+ **Returns:** An `EmbeddingProvider` with `name: 'openai'`.
497
+
498
+ ```typescript
499
+ import { createOpenAIEmbeddings } from '@elsium-ai/rag'
500
+
501
+ const embeddings = createOpenAIEmbeddings({
502
+ provider: 'openai',
503
+ apiKey: process.env.OPENAI_API_KEY,
504
+ model: 'text-embedding-3-small',
505
+ dimensions: 1536,
506
+ })
507
+
508
+ const vector = await embeddings.embed('Hello, world!')
509
+ console.log(vector.dimensions) // 1536
510
+ ```
511
+
512
+ ### `createMockEmbeddings`
513
+
514
+ Creates a deterministic mock embedding provider for testing. Produces normalized vectors derived from character codes.
515
+
516
+ ```typescript
517
+ function createMockEmbeddings(dims?: number): EmbeddingProvider
518
+ ```
519
+
520
+ | Parameter | Type | Default | Description |
521
+ |---|---|---|---|
522
+ | `dims` | `number` | `128` | Number of dimensions for generated embeddings. |
523
+
524
+ **Returns:** An `EmbeddingProvider` with `name: 'mock'`.
525
+
526
+ ```typescript
527
+ import { createMockEmbeddings } from '@elsium-ai/rag'
528
+
529
+ const embeddings = createMockEmbeddings(64)
530
+ const vector = await embeddings.embed('test input')
531
+
532
+ console.log(vector.dimensions) // 64
533
+ ```
534
+
535
+ ### `getEmbeddingProvider`
536
+
537
+ Factory function that returns an `EmbeddingProvider` for the given `EmbeddingConfig`. Supports `'openai'` and `'mock'` providers.
538
+
539
+ ```typescript
540
+ function getEmbeddingProvider(config: EmbeddingConfig): EmbeddingProvider
541
+ ```
542
+
543
+ | Parameter | Type | Description |
544
+ |---|---|---|
545
+ | `config` | `EmbeddingConfig` | Configuration specifying `provider` and provider-specific options. |
546
+
547
+ **Returns:** The corresponding `EmbeddingProvider`. Throws an error for unknown providers.
548
+
549
+ ```typescript
550
+ import { getEmbeddingProvider } from '@elsium-ai/rag'
551
+
552
+ const embeddings = getEmbeddingProvider({ provider: 'mock', dimensions: 256 })
553
+ const vector = await embeddings.embed('sample text')
554
+ ```
555
+
556
+ ---
557
+
558
+ ## Vector Store
559
+
560
+ ### `VectorStore`
561
+
562
+ Interface implemented by all vector store backends.
563
+
564
+ ```typescript
565
+ interface VectorStore {
566
+ readonly name: string
567
+
568
+ upsert(chunks: EmbeddedChunk[]): Promise<void>
569
+ query(embedding: EmbeddingVector, options?: QueryOptions): Promise<RetrievalResult[]>
570
+ delete(ids: string[]): Promise<void>
571
+ clear(): Promise<void>
572
+ count(): Promise<number>
573
+ }
574
+ ```
575
+
576
+ ### `createInMemoryStore`
577
+
578
+ Creates an in-memory vector store backed by a `Map`. Supports cosine-similarity search and automatically evicts entries once the number of stored chunks exceeds `maxChunks`.
579
+
580
+ ```typescript
581
+ function createInMemoryStore(options?: {
582
+ maxChunks?: number
583
+ }): VectorStore
584
+ ```
585
+
586
+ | Parameter | Type | Default | Description |
587
+ |---|---|---|---|
588
+ | `options.maxChunks` | `number` | `100_000` | Maximum number of embedded chunks to store. When exceeded, the oldest entries are evicted. |
589
+
590
+ **Returns:** A `VectorStore` with `name: 'in-memory'`.
591
+
592
+ ```typescript
593
+ import { createInMemoryStore, createMockEmbeddings } from '@elsium-ai/rag'
594
+
595
+ const store = createInMemoryStore({ maxChunks: 10_000 })
596
+ const embeddings = createMockEmbeddings()
597
+
598
+ // Upsert embedded chunks
599
+ await store.upsert(embeddedChunks)
600
+
601
+ // Query
602
+ const queryVector = await embeddings.embed('search query')
603
+ const results = await store.query(queryVector, { topK: 5, minScore: 0.5 })
604
+
605
+ console.log(await store.count()) // number of stored chunks
606
+ ```
607
+
608
+ ### `cosineSimilarity`
609
+
610
+ Computes the cosine similarity between two numeric vectors. Returns `0` if the vectors have different lengths or either has zero magnitude.
611
+
612
+ ```typescript
613
+ function cosineSimilarity(a: number[], b: number[]): number
614
+ ```
615
+
616
+ | Parameter | Type | Description |
617
+ |---|---|---|
618
+ | `a` | `number[]` | First vector. |
619
+ | `b` | `number[]` | Second vector. |
620
+
621
+ **Returns:** A number between `-1` and `1` representing cosine similarity.
622
+
623
+ ```typescript
624
+ import { cosineSimilarity } from '@elsium-ai/rag'
625
+
626
+ const similarity = cosineSimilarity([1, 0, 0], [1, 0, 0])
627
+ console.log(similarity) // 1
628
+
629
+ const orthogonal = cosineSimilarity([1, 0], [0, 1])
630
+ console.log(orthogonal) // 0
631
+ ```
632
+
633
+ ### `mmrRerank`
634
+
635
+ Reranks retrieval results using Maximal Marginal Relevance (MMR) to balance relevance and diversity. Requires that each result includes its `embedding` field.
636
+
637
+ ```typescript
638
+ function mmrRerank(
639
+ queryEmbedding: EmbeddingVector,
640
+ results: Array<RetrievalResult & { embedding: EmbeddingVector }>,
641
+ options?: { topK?: number; lambda?: number },
642
+ ): RetrievalResult[]
643
+ ```
644
+
645
+ | Parameter | Type | Default | Description |
646
+ |---|---|---|---|
647
+ | `queryEmbedding` | `EmbeddingVector` | -- | The embedding of the query text. |
648
+ | `results` | `Array<RetrievalResult & { embedding: EmbeddingVector }>` | -- | Candidate results, each with its embedding attached. |
649
+ | `options.topK` | `number` | `5` | Number of results to return. |
650
+ | `options.lambda` | `number` | `0.7` | Trade-off between relevance (1.0) and diversity (0.0). |
651
+
652
+ **Returns:** An array of `RetrievalResult` objects reranked by MMR.
653
+
654
+ ```typescript
655
+ import { mmrRerank } from '@elsium-ai/rag'
656
+
657
+ const reranked = mmrRerank(queryEmbedding, candidateResults, {
658
+ topK: 3,
659
+ lambda: 0.5,
660
+ })
661
+ ```
662
+
663
+ ---
664
+
665
+ ## Pipeline
666
+
667
+ ### `RAGPipelineConfig`
668
+
669
+ Configuration object for creating a RAG pipeline via `rag()`.
23
670
 
24
671
  ```typescript
25
- import { createRAGPipeline, createVectorStore, createChunker } from '@elsium-ai/rag'
672
+ interface RAGPipelineConfig {
673
+ loader?: LoaderType
674
+ chunking?: ChunkingConfig
675
+ embeddings: EmbeddingConfig
676
+ store?: VectorStoreConfig
677
+ retrieval?: RetrievalConfig
678
+ }
679
+ ```
26
680
 
27
- const store = createVectorStore()
28
- const chunker = createChunker({ strategy: 'recursive', chunkSize: 512 })
681
+ ### `RAGPipeline`
29
682
 
30
- const chunks = chunker.chunk(document)
31
- await store.add(chunks)
683
+ Interface representing a fully configured RAG pipeline with ingest and query capabilities.
32
684
 
33
- const results = await store.search('What is ElsiumAI?', { topK: 5 })
685
+ ```typescript
686
+ interface RAGPipeline {
687
+ ingest(source: string, content: string): Promise<IngestResult>
688
+ ingestDocument(document: Document): Promise<IngestResult>
689
+ query(text: string, options?: QueryOptions): Promise<RetrievalResult[]>
690
+ clear(): Promise<void>
691
+ count(): Promise<number>
692
+ readonly embeddingProvider: EmbeddingProvider
693
+ readonly vectorStore: VectorStore
694
+ }
34
695
  ```
35
696
 
697
+ ### `IngestResult`
698
+
699
+ Summary returned after ingesting a document into the pipeline.
700
+
701
+ ```typescript
702
+ interface IngestResult {
703
+ documentId: string
704
+ chunkCount: number
705
+ totalTokens: number
706
+ }
707
+ ```
708
+
709
+ ### `rag`
710
+
711
+ Creates a complete RAG pipeline that handles loading, chunking, embedding, storing, and querying documents in a single unified API.
712
+
713
+ ```typescript
714
+ function rag(config: RAGPipelineConfig): RAGPipeline
715
+ ```
716
+
717
+ | Parameter | Type | Default | Description |
718
+ |---|---|---|---|
719
+ | `config.loader` | `LoaderType` | `'text'` | Document loader type. |
720
+ | `config.chunking` | `ChunkingConfig` | `{ strategy: 'recursive', maxChunkSize: 512, overlap: 50 }` | Chunking configuration. |
721
+ | `config.embeddings` | `EmbeddingConfig` | **(required)** | Embedding provider configuration. |
722
+ | `config.store` | `VectorStoreConfig` | in-memory store | Vector store configuration. External stores are not yet supported. |
723
+ | `config.retrieval` | `RetrievalConfig` | `{ topK: 5, minScore: 0, strategy: 'similarity' }` | Retrieval configuration. |
724
+
725
+ **Returns:** A `RAGPipeline` instance.
726
+
727
+ ```typescript
728
+ import { rag } from '@elsium-ai/rag'
729
+
730
+ const pipeline = rag({
731
+ loader: 'markdown',
732
+ chunking: { strategy: 'recursive', maxChunkSize: 512, overlap: 50 },
733
+ embeddings: { provider: 'openai', apiKey: process.env.OPENAI_API_KEY },
734
+ retrieval: { topK: 5, minScore: 0.5 },
735
+ })
736
+
737
+ // Ingest a document
738
+ const result = await pipeline.ingest('docs/guide.md', markdownContent)
739
+ console.log(result.chunkCount) // number of chunks created
740
+ console.log(result.totalTokens) // estimated total tokens
741
+
742
+ // Query the pipeline
743
+ const hits = await pipeline.query('How do I configure the pipeline?')
744
+ for (const hit of hits) {
745
+ console.log(hit.score, hit.chunk.content)
746
+ }
747
+
748
+ // Pipeline also exposes its internals
749
+ console.log(pipeline.embeddingProvider.name) // "openai"
750
+ console.log(await pipeline.count()) // total chunks stored
751
+ ```
752
+
753
+ ---
754
+
36
755
  ## Part of ElsiumAI
37
756
 
38
757
  This package is the RAG layer of the [ElsiumAI](https://github.com/elsium-ai/elsium-ai) framework. See the [full documentation](https://github.com/elsium-ai/elsium-ai) for guides and examples.
package/package.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "@elsium-ai/rag",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "description": "RAG pipeline, document processing, embeddings, and vector stores for ElsiumAI",
5
5
  "license": "MIT",
6
6
  "author": "Eric Utrera <ebutrera9103@gmail.com>",
7
7
  "repository": {
8
8
  "type": "git",
9
- "url": "https://github.com/elsium-ai/elsium-ai",
9
+ "url": "git+https://github.com/elsium-ai/elsium-ai.git",
10
10
  "directory": "packages/rag"
11
11
  },
12
12
  "type": "module",
@@ -26,7 +26,7 @@
26
26
  "dev": "bun --watch src/index.ts"
27
27
  },
28
28
  "dependencies": {
29
- "@elsium-ai/core": "^0.2.1"
29
+ "@elsium-ai/core": "^0.2.3"
30
30
  },
31
31
  "devDependencies": {
32
32
  "typescript": "^5.7.0"