rag-lite-ts 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/README.md +606 -93
  2. package/dist/cli/indexer.js +192 -4
  3. package/dist/cli/search.js +50 -11
  4. package/dist/cli.js +183 -26
  5. package/dist/core/abstract-embedder.d.ts +125 -0
  6. package/dist/core/abstract-embedder.js +264 -0
  7. package/dist/core/actionable-error-messages.d.ts +60 -0
  8. package/dist/core/actionable-error-messages.js +397 -0
  9. package/dist/core/batch-processing-optimizer.d.ts +155 -0
  10. package/dist/core/batch-processing-optimizer.js +541 -0
  11. package/dist/core/chunker.d.ts +2 -0
  12. package/dist/core/cli-database-utils.d.ts +53 -0
  13. package/dist/core/cli-database-utils.js +239 -0
  14. package/dist/core/config.js +10 -3
  15. package/dist/core/content-errors.d.ts +111 -0
  16. package/dist/core/content-errors.js +362 -0
  17. package/dist/core/content-manager.d.ts +343 -0
  18. package/dist/core/content-manager.js +1504 -0
  19. package/dist/core/content-performance-optimizer.d.ts +150 -0
  20. package/dist/core/content-performance-optimizer.js +516 -0
  21. package/dist/core/content-resolver.d.ts +104 -0
  22. package/dist/core/content-resolver.js +285 -0
  23. package/dist/core/cross-modal-search.d.ts +164 -0
  24. package/dist/core/cross-modal-search.js +342 -0
  25. package/dist/core/database-connection-manager.d.ts +109 -0
  26. package/dist/core/database-connection-manager.js +304 -0
  27. package/dist/core/db.d.ts +141 -2
  28. package/dist/core/db.js +631 -89
  29. package/dist/core/embedder-factory.d.ts +176 -0
  30. package/dist/core/embedder-factory.js +338 -0
  31. package/dist/core/index.d.ts +3 -1
  32. package/dist/core/index.js +4 -1
  33. package/dist/core/ingestion.d.ts +85 -15
  34. package/dist/core/ingestion.js +510 -45
  35. package/dist/core/lazy-dependency-loader.d.ts +152 -0
  36. package/dist/core/lazy-dependency-loader.js +453 -0
  37. package/dist/core/mode-detection-service.d.ts +150 -0
  38. package/dist/core/mode-detection-service.js +565 -0
  39. package/dist/core/mode-model-validator.d.ts +92 -0
  40. package/dist/core/mode-model-validator.js +203 -0
  41. package/dist/core/model-registry.d.ts +120 -0
  42. package/dist/core/model-registry.js +415 -0
  43. package/dist/core/model-validator.d.ts +217 -0
  44. package/dist/core/model-validator.js +782 -0
  45. package/dist/core/polymorphic-search-factory.d.ts +154 -0
  46. package/dist/core/polymorphic-search-factory.js +344 -0
  47. package/dist/core/raglite-paths.d.ts +121 -0
  48. package/dist/core/raglite-paths.js +145 -0
  49. package/dist/core/reranking-config.d.ts +42 -0
  50. package/dist/core/reranking-config.js +156 -0
  51. package/dist/core/reranking-factory.d.ts +92 -0
  52. package/dist/core/reranking-factory.js +591 -0
  53. package/dist/core/reranking-strategies.d.ts +325 -0
  54. package/dist/core/reranking-strategies.js +720 -0
  55. package/dist/core/resource-cleanup.d.ts +163 -0
  56. package/dist/core/resource-cleanup.js +371 -0
  57. package/dist/core/resource-manager.d.ts +212 -0
  58. package/dist/core/resource-manager.js +564 -0
  59. package/dist/core/search.d.ts +28 -1
  60. package/dist/core/search.js +83 -5
  61. package/dist/core/streaming-operations.d.ts +145 -0
  62. package/dist/core/streaming-operations.js +409 -0
  63. package/dist/core/types.d.ts +3 -0
  64. package/dist/core/universal-embedder.d.ts +177 -0
  65. package/dist/core/universal-embedder.js +139 -0
  66. package/dist/core/validation-messages.d.ts +99 -0
  67. package/dist/core/validation-messages.js +334 -0
  68. package/dist/core/vector-index.js +7 -8
  69. package/dist/factories/index.d.ts +1 -1
  70. package/dist/factories/text-factory.d.ts +128 -34
  71. package/dist/factories/text-factory.js +346 -97
  72. package/dist/file-processor.d.ts +88 -2
  73. package/dist/file-processor.js +720 -17
  74. package/dist/index.d.ts +9 -0
  75. package/dist/index.js +11 -0
  76. package/dist/ingestion.d.ts +16 -0
  77. package/dist/ingestion.js +21 -0
  78. package/dist/mcp-server.d.ts +35 -3
  79. package/dist/mcp-server.js +1107 -31
  80. package/dist/multimodal/clip-embedder.d.ts +314 -0
  81. package/dist/multimodal/clip-embedder.js +945 -0
  82. package/dist/multimodal/index.d.ts +6 -0
  83. package/dist/multimodal/index.js +6 -0
  84. package/dist/run-error-recovery-tests.d.ts +7 -0
  85. package/dist/run-error-recovery-tests.js +101 -0
  86. package/dist/search.d.ts +26 -0
  87. package/dist/search.js +54 -1
  88. package/dist/test-utils.d.ts +8 -26
  89. package/dist/text/chunker.d.ts +1 -0
  90. package/dist/text/embedder.js +15 -8
  91. package/dist/text/index.d.ts +1 -0
  92. package/dist/text/index.js +1 -0
  93. package/dist/text/reranker.d.ts +1 -2
  94. package/dist/text/reranker.js +17 -47
  95. package/dist/text/sentence-transformer-embedder.d.ts +96 -0
  96. package/dist/text/sentence-transformer-embedder.js +340 -0
  97. package/dist/types.d.ts +39 -0
  98. package/dist/utils/vector-math.d.ts +31 -0
  99. package/dist/utils/vector-math.js +70 -0
  100. package/package.json +15 -3
  101. package/dist/api-errors.d.ts.map +0 -1
  102. package/dist/api-errors.js.map +0 -1
  103. package/dist/cli/indexer.d.ts.map +0 -1
  104. package/dist/cli/indexer.js.map +0 -1
  105. package/dist/cli/search.d.ts.map +0 -1
  106. package/dist/cli/search.js.map +0 -1
  107. package/dist/cli.d.ts.map +0 -1
  108. package/dist/cli.js.map +0 -1
  109. package/dist/config.d.ts.map +0 -1
  110. package/dist/config.js.map +0 -1
  111. package/dist/core/adapters.d.ts.map +0 -1
  112. package/dist/core/adapters.js.map +0 -1
  113. package/dist/core/chunker.d.ts.map +0 -1
  114. package/dist/core/chunker.js.map +0 -1
  115. package/dist/core/config.d.ts.map +0 -1
  116. package/dist/core/config.js.map +0 -1
  117. package/dist/core/db.d.ts.map +0 -1
  118. package/dist/core/db.js.map +0 -1
  119. package/dist/core/error-handler.d.ts.map +0 -1
  120. package/dist/core/error-handler.js.map +0 -1
  121. package/dist/core/index.d.ts.map +0 -1
  122. package/dist/core/index.js.map +0 -1
  123. package/dist/core/ingestion.d.ts.map +0 -1
  124. package/dist/core/ingestion.js.map +0 -1
  125. package/dist/core/interfaces.d.ts.map +0 -1
  126. package/dist/core/interfaces.js.map +0 -1
  127. package/dist/core/path-manager.d.ts.map +0 -1
  128. package/dist/core/path-manager.js.map +0 -1
  129. package/dist/core/search-example.d.ts +0 -25
  130. package/dist/core/search-example.d.ts.map +0 -1
  131. package/dist/core/search-example.js +0 -138
  132. package/dist/core/search-example.js.map +0 -1
  133. package/dist/core/search-pipeline-example.d.ts +0 -21
  134. package/dist/core/search-pipeline-example.d.ts.map +0 -1
  135. package/dist/core/search-pipeline-example.js +0 -188
  136. package/dist/core/search-pipeline-example.js.map +0 -1
  137. package/dist/core/search-pipeline.d.ts.map +0 -1
  138. package/dist/core/search-pipeline.js.map +0 -1
  139. package/dist/core/search.d.ts.map +0 -1
  140. package/dist/core/search.js.map +0 -1
  141. package/dist/core/types.d.ts.map +0 -1
  142. package/dist/core/types.js.map +0 -1
  143. package/dist/core/vector-index.d.ts.map +0 -1
  144. package/dist/core/vector-index.js.map +0 -1
  145. package/dist/dom-polyfills.d.ts.map +0 -1
  146. package/dist/dom-polyfills.js.map +0 -1
  147. package/dist/examples/clean-api-examples.d.ts +0 -44
  148. package/dist/examples/clean-api-examples.d.ts.map +0 -1
  149. package/dist/examples/clean-api-examples.js +0 -206
  150. package/dist/examples/clean-api-examples.js.map +0 -1
  151. package/dist/factories/index.d.ts.map +0 -1
  152. package/dist/factories/index.js.map +0 -1
  153. package/dist/factories/text-factory.d.ts.map +0 -1
  154. package/dist/factories/text-factory.js.map +0 -1
  155. package/dist/file-processor.d.ts.map +0 -1
  156. package/dist/file-processor.js.map +0 -1
  157. package/dist/index-manager.d.ts.map +0 -1
  158. package/dist/index-manager.js.map +0 -1
  159. package/dist/index.d.ts.map +0 -1
  160. package/dist/index.js.map +0 -1
  161. package/dist/indexer.d.ts.map +0 -1
  162. package/dist/indexer.js.map +0 -1
  163. package/dist/ingestion.d.ts.map +0 -1
  164. package/dist/ingestion.js.map +0 -1
  165. package/dist/mcp-server.d.ts.map +0 -1
  166. package/dist/mcp-server.js.map +0 -1
  167. package/dist/preprocess.d.ts.map +0 -1
  168. package/dist/preprocess.js.map +0 -1
  169. package/dist/preprocessors/index.d.ts.map +0 -1
  170. package/dist/preprocessors/index.js.map +0 -1
  171. package/dist/preprocessors/mdx.d.ts.map +0 -1
  172. package/dist/preprocessors/mdx.js.map +0 -1
  173. package/dist/preprocessors/mermaid.d.ts.map +0 -1
  174. package/dist/preprocessors/mermaid.js.map +0 -1
  175. package/dist/preprocessors/registry.d.ts.map +0 -1
  176. package/dist/preprocessors/registry.js.map +0 -1
  177. package/dist/search-standalone.d.ts.map +0 -1
  178. package/dist/search-standalone.js.map +0 -1
  179. package/dist/search.d.ts.map +0 -1
  180. package/dist/search.js.map +0 -1
  181. package/dist/test-utils.d.ts.map +0 -1
  182. package/dist/test-utils.js.map +0 -1
  183. package/dist/text/chunker.d.ts.map +0 -1
  184. package/dist/text/chunker.js.map +0 -1
  185. package/dist/text/embedder.d.ts.map +0 -1
  186. package/dist/text/embedder.js.map +0 -1
  187. package/dist/text/index.d.ts.map +0 -1
  188. package/dist/text/index.js.map +0 -1
  189. package/dist/text/preprocessors/index.d.ts.map +0 -1
  190. package/dist/text/preprocessors/index.js.map +0 -1
  191. package/dist/text/preprocessors/mdx.d.ts.map +0 -1
  192. package/dist/text/preprocessors/mdx.js.map +0 -1
  193. package/dist/text/preprocessors/mermaid.d.ts.map +0 -1
  194. package/dist/text/preprocessors/mermaid.js.map +0 -1
  195. package/dist/text/preprocessors/registry.d.ts.map +0 -1
  196. package/dist/text/preprocessors/registry.js.map +0 -1
  197. package/dist/text/reranker.d.ts.map +0 -1
  198. package/dist/text/reranker.js.map +0 -1
  199. package/dist/text/tokenizer.d.ts.map +0 -1
  200. package/dist/text/tokenizer.js.map +0 -1
  201. package/dist/types.d.ts.map +0 -1
  202. package/dist/types.js.map +0 -1
@@ -7,6 +7,7 @@ import { type ChunkConfig } from './chunker.js';
7
7
  import { IndexManager } from '../index-manager.js';
8
8
  import { type DatabaseConnection } from './db.js';
9
9
  import type { EmbedFunction } from './interfaces.js';
10
+ import { ContentManager, type MemoryContentMetadata } from './content-manager.js';
10
11
  /**
11
12
  * Options for the ingestion pipeline
12
13
  */
@@ -17,6 +18,10 @@ export interface IngestionOptions {
17
18
  chunkConfig?: ChunkConfig;
18
19
  /** Whether to force rebuild the index */
19
20
  forceRebuild?: boolean;
21
+ /** Mode for the ingestion pipeline (text or multimodal) */
22
+ mode?: 'text' | 'multimodal';
23
+ /** Content type for the ingested content */
24
+ contentType?: string;
20
25
  }
21
26
  /**
22
27
  * Result of the ingestion process
@@ -34,6 +39,8 @@ export interface IngestionResult {
34
39
  embeddingErrors: number;
35
40
  /** Processing time in milliseconds */
36
41
  processingTimeMs: number;
42
+ /** Content IDs of successfully ingested documents */
43
+ contentIds: string[];
37
44
  }
38
45
  /**
39
46
  * Main ingestion pipeline class
@@ -46,8 +53,10 @@ export declare class IngestionPipeline {
46
53
  private db;
47
54
  private defaultChunkConfig?;
48
55
  private pathManager;
56
+ private contentManager;
49
57
  /**
50
58
  * Creates a new IngestionPipeline with explicit dependency injection
59
+ * Enhanced with ContentManager integration for unified content system
51
60
  *
52
61
  * DEPENDENCY INJECTION PATTERN:
53
62
  * This constructor requires all dependencies to be explicitly provided, enabling:
@@ -55,6 +64,7 @@ export declare class IngestionPipeline {
55
64
  * - Support for different embedding models and content types
56
65
  * - Testability through mock injection
57
66
  * - Future extensibility for multimodal content processing
67
+ * - Unified content management for both filesystem and memory-based ingestion
58
68
  *
59
69
  * @param embedFn - Function to embed document chunks into vectors
60
70
  * - Signature: (query: string, contentType?: string) => Promise<EmbeddingResult>
@@ -74,32 +84,33 @@ export declare class IngestionPipeline {
74
84
  * - Supports different content types through metadata fields
75
85
  * - Example: await openDatabase('./db.sqlite')
76
86
  *
87
+ * @param contentManager - Optional ContentManager for unified content system
88
+ * - Handles content storage routing and deduplication
89
+ * - If not provided, creates default instance with standard configuration
90
+ * - Example: new ContentManager(db, { contentDir: '.raglite/content' })
91
+ *
77
92
  * USAGE EXAMPLES:
78
93
  * ```typescript
79
- * // Text-only ingestion pipeline
94
+ * // Text-only ingestion pipeline with unified content system
80
95
  * const textEmbedFn = await createTextEmbedder();
81
96
  * const indexManager = new IndexManager('./index.bin');
82
97
  * const db = await openDatabase('./db.sqlite');
98
+ * const contentManager = new ContentManager(db);
99
+ * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db, undefined, contentManager);
100
+ *
101
+ * // Simple usage (ContentManager created automatically)
83
102
  * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db);
84
103
  *
85
- * // Custom embedding implementation
104
+ * // Custom embedding implementation with memory ingestion
86
105
  * const customEmbedFn = async (text) => ({
87
106
  * embedding_id: generateId(),
88
107
  * vector: await myCustomModel.embed(text)
89
108
  * });
90
109
  * const ingestion = new IngestionPipeline(customEmbedFn, indexManager, db);
91
- *
92
- * // Multimodal ingestion (future)
93
- * const multimodalEmbedFn = async (content, contentType) => {
94
- * if (contentType === 'image') {
95
- * return { embedding_id: generateId(), vector: await clipModel.embedImage(content) };
96
- * }
97
- * return { embedding_id: generateId(), vector: await clipModel.embedText(content) };
98
- * };
99
- * const ingestion = new IngestionPipeline(multimodalEmbedFn, indexManager, db);
110
+ * await ingestion.ingestFromMemory(buffer, { displayName: 'file.txt' });
100
111
  * ```
101
112
  */
102
- constructor(embedFn: EmbedFunction, indexManager: IndexManager, db: DatabaseConnection, defaultChunkConfig?: ChunkConfig | undefined);
113
+ constructor(embedFn: EmbedFunction, indexManager: IndexManager, db: DatabaseConnection, defaultChunkConfig?: ChunkConfig | undefined, contentManager?: ContentManager);
103
114
  /**
104
115
  * Ingest documents from a directory
105
116
  * @param directoryPath - Path to directory containing documents
@@ -114,27 +125,86 @@ export declare class IngestionPipeline {
114
125
  * @returns Promise resolving to ingestion results
115
126
  */
116
127
  ingestFile(filePath: string, options?: IngestionOptions): Promise<IngestionResult>;
128
+ /**
129
+ * Ingest content from memory buffer
130
+ * Enables MCP integration and real-time content processing
131
+ * @param content - Buffer containing the content to ingest
132
+ * @param metadata - Memory content metadata including display name and content type
133
+ * @param options - Optional ingestion configuration
134
+ * @returns Promise resolving to content ID for the ingested content
135
+ */
136
+ ingestFromMemory(content: Buffer, metadata: MemoryContentMetadata, options?: IngestionOptions): Promise<string>;
117
137
  /**
118
138
  * Ingest documents from a path (file or directory)
119
139
  * Implements the complete pipeline: file processing → chunking → embedding → storage
140
+ * Enhanced to handle mixed content types (text and images) in multimodal mode
120
141
  */
121
142
  ingestPath(path: string, options?: IngestionOptions): Promise<IngestionResult>;
122
143
  /**
123
- * Chunk all documents and organize results
144
+ * Analyze content types in the document collection
145
+ * @private
146
+ */
147
+ private analyzeContentTypes;
148
+ /**
149
+ * Chunk all documents and organize results with content-type awareness
150
+ * Enhanced to handle different content types appropriately
151
+ */
152
+ private chunkDocumentsWithContentTypes;
153
+ /**
154
+ * Chunk all documents and organize results (legacy method for backward compatibility)
155
+ * @deprecated Use chunkDocumentsWithContentTypes for multimodal support
124
156
  */
125
157
  private chunkDocuments;
126
158
  /**
127
- * Generate embeddings for all chunks with error handling
159
+ * Generate embeddings for all chunks with content-type support
160
+ * Enhanced to handle different content types and pass metadata to embedding function
161
+ */
162
+ private generateEmbeddingsWithContentTypes;
163
+ /**
164
+ * Generate embeddings for all chunks with error handling (legacy method for backward compatibility)
165
+ * @deprecated Use generateEmbeddingsWithContentTypes for multimodal support
128
166
  */
129
167
  private generateEmbeddings;
130
168
  /**
131
- * Store documents and chunks in database
169
+ * Store documents and chunks in database with content-type support
170
+ * Enhanced to handle content type metadata and multimodal content
171
+ * @returns Array of content IDs for successfully stored documents
172
+ */
173
+ private storeDocumentsAndChunksWithContentTypes;
174
+ /**
175
+ * Store documents and chunks in database (legacy method for backward compatibility)
176
+ * @deprecated Use storeDocumentsAndChunksWithContentTypes for multimodal support
132
177
  */
133
178
  private storeDocumentsAndChunks;
134
179
  /**
135
180
  * Update vector index with new embeddings
136
181
  */
137
182
  private updateVectorIndex;
183
+ /**
184
+ * Converts MIME type to simple content type for embedding function
185
+ * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
186
+ * @returns Simple content type ('text', 'image', etc.)
187
+ */
188
+ private getContentTypeForEmbedding;
189
+ /**
190
+ * Save the vector index to disk
191
+ */
192
+ saveIndex(): Promise<void>;
193
+ /**
194
+ * Process image content from memory using the existing image processing pipeline
195
+ * @private
196
+ */
197
+ private processImageFromMemory;
198
+ /**
199
+ * Process PDF content from memory using the existing PDF processing pipeline
200
+ * @private
201
+ */
202
+ private processPDFFromMemory;
203
+ /**
204
+ * Process DOCX content from memory using the existing DOCX processing pipeline
205
+ * @private
206
+ */
207
+ private processDOCXFromMemory;
138
208
  /**
139
209
  * Clean up resources - explicit cleanup method
140
210
  */