rag-lite-ts 1.0.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/README.md +651 -109
  2. package/dist/cli/indexer.js +262 -46
  3. package/dist/cli/search.js +54 -32
  4. package/dist/cli.js +185 -28
  5. package/dist/config.d.ts +34 -73
  6. package/dist/config.js +50 -255
  7. package/dist/core/abstract-embedder.d.ts +125 -0
  8. package/dist/core/abstract-embedder.js +264 -0
  9. package/dist/core/actionable-error-messages.d.ts +60 -0
  10. package/dist/core/actionable-error-messages.js +397 -0
  11. package/dist/core/adapters.d.ts +93 -0
  12. package/dist/core/adapters.js +139 -0
  13. package/dist/core/batch-processing-optimizer.d.ts +155 -0
  14. package/dist/core/batch-processing-optimizer.js +541 -0
  15. package/dist/core/chunker.d.ts +119 -0
  16. package/dist/core/chunker.js +73 -0
  17. package/dist/core/cli-database-utils.d.ts +53 -0
  18. package/dist/core/cli-database-utils.js +239 -0
  19. package/dist/core/config.d.ts +102 -0
  20. package/dist/core/config.js +247 -0
  21. package/dist/core/content-errors.d.ts +111 -0
  22. package/dist/core/content-errors.js +362 -0
  23. package/dist/core/content-manager.d.ts +343 -0
  24. package/dist/core/content-manager.js +1504 -0
  25. package/dist/core/content-performance-optimizer.d.ts +150 -0
  26. package/dist/core/content-performance-optimizer.js +516 -0
  27. package/dist/core/content-resolver.d.ts +104 -0
  28. package/dist/core/content-resolver.js +285 -0
  29. package/dist/core/cross-modal-search.d.ts +164 -0
  30. package/dist/core/cross-modal-search.js +342 -0
  31. package/dist/core/database-connection-manager.d.ts +109 -0
  32. package/dist/core/database-connection-manager.js +304 -0
  33. package/dist/core/db.d.ts +245 -0
  34. package/dist/core/db.js +952 -0
  35. package/dist/core/embedder-factory.d.ts +176 -0
  36. package/dist/core/embedder-factory.js +338 -0
  37. package/dist/{error-handler.d.ts → core/error-handler.d.ts} +23 -2
  38. package/dist/{error-handler.js → core/error-handler.js} +51 -8
  39. package/dist/core/index.d.ts +59 -0
  40. package/dist/core/index.js +69 -0
  41. package/dist/core/ingestion.d.ts +213 -0
  42. package/dist/core/ingestion.js +812 -0
  43. package/dist/core/interfaces.d.ts +408 -0
  44. package/dist/core/interfaces.js +106 -0
  45. package/dist/core/lazy-dependency-loader.d.ts +152 -0
  46. package/dist/core/lazy-dependency-loader.js +453 -0
  47. package/dist/core/mode-detection-service.d.ts +150 -0
  48. package/dist/core/mode-detection-service.js +565 -0
  49. package/dist/core/mode-model-validator.d.ts +92 -0
  50. package/dist/core/mode-model-validator.js +203 -0
  51. package/dist/core/model-registry.d.ts +120 -0
  52. package/dist/core/model-registry.js +415 -0
  53. package/dist/core/model-validator.d.ts +217 -0
  54. package/dist/core/model-validator.js +782 -0
  55. package/dist/{path-manager.d.ts → core/path-manager.d.ts} +5 -0
  56. package/dist/{path-manager.js → core/path-manager.js} +5 -0
  57. package/dist/core/polymorphic-search-factory.d.ts +154 -0
  58. package/dist/core/polymorphic-search-factory.js +344 -0
  59. package/dist/core/raglite-paths.d.ts +121 -0
  60. package/dist/core/raglite-paths.js +145 -0
  61. package/dist/core/reranking-config.d.ts +42 -0
  62. package/dist/core/reranking-config.js +156 -0
  63. package/dist/core/reranking-factory.d.ts +92 -0
  64. package/dist/core/reranking-factory.js +591 -0
  65. package/dist/core/reranking-strategies.d.ts +325 -0
  66. package/dist/core/reranking-strategies.js +720 -0
  67. package/dist/core/resource-cleanup.d.ts +163 -0
  68. package/dist/core/resource-cleanup.js +371 -0
  69. package/dist/core/resource-manager.d.ts +212 -0
  70. package/dist/core/resource-manager.js +564 -0
  71. package/dist/core/search-pipeline.d.ts +111 -0
  72. package/dist/core/search-pipeline.js +287 -0
  73. package/dist/core/search.d.ts +131 -0
  74. package/dist/core/search.js +296 -0
  75. package/dist/core/streaming-operations.d.ts +145 -0
  76. package/dist/core/streaming-operations.js +409 -0
  77. package/dist/core/types.d.ts +66 -0
  78. package/dist/core/types.js +6 -0
  79. package/dist/core/universal-embedder.d.ts +177 -0
  80. package/dist/core/universal-embedder.js +139 -0
  81. package/dist/core/validation-messages.d.ts +99 -0
  82. package/dist/core/validation-messages.js +334 -0
  83. package/dist/{vector-index.d.ts → core/vector-index.d.ts} +4 -0
  84. package/dist/{vector-index.js → core/vector-index.js} +21 -3
  85. package/dist/dom-polyfills.d.ts +6 -0
  86. package/dist/dom-polyfills.js +40 -0
  87. package/dist/factories/index.d.ts +43 -0
  88. package/dist/factories/index.js +44 -0
  89. package/dist/factories/text-factory.d.ts +560 -0
  90. package/dist/factories/text-factory.js +968 -0
  91. package/dist/file-processor.d.ts +90 -4
  92. package/dist/file-processor.js +723 -20
  93. package/dist/index-manager.d.ts +3 -2
  94. package/dist/index-manager.js +13 -11
  95. package/dist/index.d.ts +72 -8
  96. package/dist/index.js +102 -16
  97. package/dist/indexer.js +1 -1
  98. package/dist/ingestion.d.ts +44 -154
  99. package/dist/ingestion.js +75 -671
  100. package/dist/mcp-server.d.ts +35 -3
  101. package/dist/mcp-server.js +1186 -79
  102. package/dist/multimodal/clip-embedder.d.ts +314 -0
  103. package/dist/multimodal/clip-embedder.js +945 -0
  104. package/dist/multimodal/index.d.ts +6 -0
  105. package/dist/multimodal/index.js +6 -0
  106. package/dist/preprocess.js +1 -1
  107. package/dist/run-error-recovery-tests.d.ts +7 -0
  108. package/dist/run-error-recovery-tests.js +101 -0
  109. package/dist/search-standalone.js +1 -1
  110. package/dist/search.d.ts +51 -69
  111. package/dist/search.js +117 -412
  112. package/dist/test-utils.d.ts +8 -26
  113. package/dist/text/chunker.d.ts +33 -0
  114. package/dist/{chunker.js → text/chunker.js} +98 -75
  115. package/dist/{embedder.d.ts → text/embedder.d.ts} +22 -1
  116. package/dist/{embedder.js → text/embedder.js} +84 -10
  117. package/dist/text/index.d.ts +8 -0
  118. package/dist/text/index.js +9 -0
  119. package/dist/text/preprocessors/index.d.ts +17 -0
  120. package/dist/text/preprocessors/index.js +38 -0
  121. package/dist/text/preprocessors/mdx.d.ts +25 -0
  122. package/dist/text/preprocessors/mdx.js +101 -0
  123. package/dist/text/preprocessors/mermaid.d.ts +68 -0
  124. package/dist/text/preprocessors/mermaid.js +330 -0
  125. package/dist/text/preprocessors/registry.d.ts +56 -0
  126. package/dist/text/preprocessors/registry.js +180 -0
  127. package/dist/text/reranker.d.ts +59 -0
  128. package/dist/{reranker.js → text/reranker.js} +138 -53
  129. package/dist/text/sentence-transformer-embedder.d.ts +96 -0
  130. package/dist/text/sentence-transformer-embedder.js +340 -0
  131. package/dist/{tokenizer.d.ts → text/tokenizer.d.ts} +1 -0
  132. package/dist/{tokenizer.js → text/tokenizer.js} +7 -2
  133. package/dist/types.d.ts +40 -1
  134. package/dist/utils/vector-math.d.ts +31 -0
  135. package/dist/utils/vector-math.js +70 -0
  136. package/package.json +16 -4
  137. package/dist/api-errors.d.ts.map +0 -1
  138. package/dist/api-errors.js.map +0 -1
  139. package/dist/chunker.d.ts +0 -47
  140. package/dist/chunker.d.ts.map +0 -1
  141. package/dist/chunker.js.map +0 -1
  142. package/dist/cli/indexer.d.ts.map +0 -1
  143. package/dist/cli/indexer.js.map +0 -1
  144. package/dist/cli/search.d.ts.map +0 -1
  145. package/dist/cli/search.js.map +0 -1
  146. package/dist/cli.d.ts.map +0 -1
  147. package/dist/cli.js.map +0 -1
  148. package/dist/config.d.ts.map +0 -1
  149. package/dist/config.js.map +0 -1
  150. package/dist/db.d.ts +0 -90
  151. package/dist/db.d.ts.map +0 -1
  152. package/dist/db.js +0 -340
  153. package/dist/db.js.map +0 -1
  154. package/dist/embedder.d.ts.map +0 -1
  155. package/dist/embedder.js.map +0 -1
  156. package/dist/error-handler.d.ts.map +0 -1
  157. package/dist/error-handler.js.map +0 -1
  158. package/dist/file-processor.d.ts.map +0 -1
  159. package/dist/file-processor.js.map +0 -1
  160. package/dist/index-manager.d.ts.map +0 -1
  161. package/dist/index-manager.js.map +0 -1
  162. package/dist/index.d.ts.map +0 -1
  163. package/dist/index.js.map +0 -1
  164. package/dist/indexer.d.ts.map +0 -1
  165. package/dist/indexer.js.map +0 -1
  166. package/dist/ingestion.d.ts.map +0 -1
  167. package/dist/ingestion.js.map +0 -1
  168. package/dist/mcp-server.d.ts.map +0 -1
  169. package/dist/mcp-server.js.map +0 -1
  170. package/dist/path-manager.d.ts.map +0 -1
  171. package/dist/path-manager.js.map +0 -1
  172. package/dist/preprocess.d.ts.map +0 -1
  173. package/dist/preprocess.js.map +0 -1
  174. package/dist/preprocessors/index.d.ts.map +0 -1
  175. package/dist/preprocessors/index.js.map +0 -1
  176. package/dist/preprocessors/mdx.d.ts.map +0 -1
  177. package/dist/preprocessors/mdx.js.map +0 -1
  178. package/dist/preprocessors/mermaid.d.ts.map +0 -1
  179. package/dist/preprocessors/mermaid.js.map +0 -1
  180. package/dist/preprocessors/registry.d.ts.map +0 -1
  181. package/dist/preprocessors/registry.js.map +0 -1
  182. package/dist/reranker.d.ts +0 -40
  183. package/dist/reranker.d.ts.map +0 -1
  184. package/dist/reranker.js.map +0 -1
  185. package/dist/resource-manager-demo.d.ts +0 -7
  186. package/dist/resource-manager-demo.d.ts.map +0 -1
  187. package/dist/resource-manager-demo.js +0 -52
  188. package/dist/resource-manager-demo.js.map +0 -1
  189. package/dist/resource-manager.d.ts +0 -129
  190. package/dist/resource-manager.d.ts.map +0 -1
  191. package/dist/resource-manager.js +0 -389
  192. package/dist/resource-manager.js.map +0 -1
  193. package/dist/search-standalone.d.ts.map +0 -1
  194. package/dist/search-standalone.js.map +0 -1
  195. package/dist/search.d.ts.map +0 -1
  196. package/dist/search.js.map +0 -1
  197. package/dist/test-utils.d.ts.map +0 -1
  198. package/dist/test-utils.js.map +0 -1
  199. package/dist/tokenizer.d.ts.map +0 -1
  200. package/dist/tokenizer.js.map +0 -1
  201. package/dist/types.d.ts.map +0 -1
  202. package/dist/types.js.map +0 -1
  203. package/dist/vector-index.d.ts.map +0 -1
  204. package/dist/vector-index.js.map +0 -1
@@ -0,0 +1,59 @@
1
+ /**
2
+ * CORE MODULE — Shared between text-only (rag-lite-ts) and future multimodal (rag-lite-mm)
3
+ * Model-agnostic. No transformer or modality-specific logic.
4
+ *
5
+ * This module provides the clean re-export surface for the core layer, enabling
6
+ * dependency injection patterns for different implementations (text-only, multimodal, etc.).
7
+ *
8
+ * DEPENDENCY INJECTION ARCHITECTURE:
9
+ *
10
+ * The core layer uses explicit dependency injection to maintain clean separation between
11
+ * model-agnostic logic and implementation-specific components:
12
+ *
13
+ * 1. Core Classes (SearchEngine, IngestionPipeline):
14
+ * - Accept injected functions (EmbedFunction, RerankFunction) in constructors
15
+ * - Coordinate model-agnostic operations (database, vector index, search pipeline)
16
+ * - No knowledge of specific embedding models or transformers
17
+ *
18
+ * 2. Dependency Injection Interfaces:
19
+ * - EmbedFunction: (query: string, contentType?: string) => Promise<EmbeddingResult>
20
+ * - RerankFunction: (query: string, results: SearchResult[], contentType?: string) => Promise<SearchResult[]>
21
+ * - Support different content types (text, image, etc.) and embedding dimensions
22
+ *
23
+ * 3. Usage Patterns:
24
+ *
25
+ * // Direct dependency injection (advanced users)
26
+ * const embedFn = await createTextEmbedder();
27
+ * const rerankFn = await createTextReranker();
28
+ * const indexManager = new IndexManager('./index.bin');
29
+ * const db = await openDatabase('./db.sqlite');
30
+ * const search = new SearchEngine(embedFn, indexManager, db, rerankFn);
31
+ *
32
+ * // Factory pattern (recommended for common use cases)
33
+ * const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
34
+ *
35
+ * 4. Extension Points:
36
+ * - New implementations (multimodal, custom models) implement the same interfaces
37
+ * - Core classes remain unchanged when adding new modalities
38
+ * - Plugin architecture enabled through interface-based design
39
+ *
40
+ * 5. Benefits:
41
+ * - Clean separation of concerns
42
+ * - Testability through mock injection
43
+ * - Future extensibility without core changes
44
+ * - Support for different embedding dimensions and content types
45
+ */
46
+ export { type ContentDocument, type ContentChunk, type Document, type Chunk, type EmbeddingResult, type SearchResult, type SearchOptions, } from './types.js';
47
+ export { type EmbedFunction, type RerankFunction, type EmbeddingQueryInterface, type RerankingInterface, type SearchEngineConfig, type ContentTypeStrategy, type ModelAgnosticInterface, type ExtendedEmbeddingInterface, type ExtendedRerankingInterface, type SearchPipelineInterface, type SearchDependencyFactory, InterfaceValidator } from './interfaces.js';
48
+ export * from './adapters.js';
49
+ export * from './config.js';
50
+ export { type DatabaseConnection, type ContentMetadata, openDatabase, initializeSchema, insertDocument, insertChunk, upsertDocument, getChunksByEmbeddingIds, getModelVersion, setModelVersion, getStoredModelInfo, setStoredModelInfo, insertContentMetadata, getContentMetadata, getContentMetadataByHash, getContentMetadataByStorageType, deleteContentMetadata, getStorageStats, updateStorageStats } from './db.js';
51
+ export { type VectorIndexOptions, VectorIndex } from './vector-index.js';
52
+ export { type ChunkConfig, type GenericDocument, type GenericChunk, type ChunkingStrategy, ChunkingStrategyRegistry, DEFAULT_CHUNK_CONFIG, chunkingRegistry, chunkGenericDocument, registerTextChunkingStrategy } from './chunker.js';
53
+ export * from './search.js';
54
+ export * from './ingestion.js';
55
+ export * from './path-manager.js';
56
+ export { ContentManager, type MemoryContentMetadata, type ContentIngestionResult, type ContentManagerConfig } from './content-manager.js';
57
+ export { ContentResolver, type ContentRequest, type ContentResult } from './content-resolver.js';
58
+ export * from './error-handler.js';
59
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,69 @@
1
+ /**
2
+ * CORE MODULE — Shared between text-only (rag-lite-ts) and future multimodal (rag-lite-mm)
3
+ * Model-agnostic. No transformer or modality-specific logic.
4
+ *
5
+ * This module provides the clean re-export surface for the core layer, enabling
6
+ * dependency injection patterns for different implementations (text-only, multimodal, etc.).
7
+ *
8
+ * DEPENDENCY INJECTION ARCHITECTURE:
9
+ *
10
+ * The core layer uses explicit dependency injection to maintain clean separation between
11
+ * model-agnostic logic and implementation-specific components:
12
+ *
13
+ * 1. Core Classes (SearchEngine, IngestionPipeline):
14
+ * - Accept injected functions (EmbedFunction, RerankFunction) in constructors
15
+ * - Coordinate model-agnostic operations (database, vector index, search pipeline)
16
+ * - No knowledge of specific embedding models or transformers
17
+ *
18
+ * 2. Dependency Injection Interfaces:
19
+ * - EmbedFunction: (query: string, contentType?: string) => Promise<EmbeddingResult>
20
+ * - RerankFunction: (query: string, results: SearchResult[], contentType?: string) => Promise<SearchResult[]>
21
+ * - Support different content types (text, image, etc.) and embedding dimensions
22
+ *
23
+ * 3. Usage Patterns:
24
+ *
25
+ * // Direct dependency injection (advanced users)
26
+ * const embedFn = await createTextEmbedder();
27
+ * const rerankFn = await createTextReranker();
28
+ * const indexManager = new IndexManager('./index.bin');
29
+ * const db = await openDatabase('./db.sqlite');
30
+ * const search = new SearchEngine(embedFn, indexManager, db, rerankFn);
31
+ *
32
+ * // Factory pattern (recommended for common use cases)
33
+ * const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
34
+ *
35
+ * 4. Extension Points:
36
+ * - New implementations (multimodal, custom models) implement the same interfaces
37
+ * - Core classes remain unchanged when adding new modalities
38
+ * - Plugin architecture enabled through interface-based design
39
+ *
40
+ * 5. Benefits:
41
+ * - Clean separation of concerns
42
+ * - Testability through mock injection
43
+ * - Future extensibility without core changes
44
+ * - Support for different embedding dimensions and content types
45
+ */
46
+ // Dependency injection interfaces and utilities
47
+ export { InterfaceValidator } from './interfaces.js';
48
+ // Adapter utilities for converting implementations to dependency injection
49
+ export * from './adapters.js';
50
+ // Core configuration management - model-agnostic settings
51
+ export * from './config.js';
52
+ // Database operations - supports different content types through metadata
53
+ export { openDatabase, initializeSchema, insertDocument, insertChunk, upsertDocument, getChunksByEmbeddingIds, getModelVersion, setModelVersion, getStoredModelInfo, setStoredModelInfo, insertContentMetadata, getContentMetadata, getContentMetadataByHash, getContentMetadataByStorageType, deleteContentMetadata, getStorageStats, updateStorageStats } from './db.js';
54
+ // Vector index operations - works with any embedding dimensions
55
+ export { VectorIndex } from './vector-index.js';
56
+ // Generic chunking interfaces and strategies - supports text, image metadata, etc.
57
+ export { ChunkingStrategyRegistry, DEFAULT_CHUNK_CONFIG, chunkingRegistry, chunkGenericDocument, registerTextChunkingStrategy } from './chunker.js';
58
+ // Core search engine - uses dependency injection for embedding and reranking
59
+ export * from './search.js';
60
+ // Core ingestion pipeline - uses dependency injection for embedding
61
+ export * from './ingestion.js';
62
+ // Path management utilities - content-type agnostic
63
+ export * from './path-manager.js';
64
+ // Unified content system - handles both filesystem and memory content
65
+ export { ContentManager } from './content-manager.js';
66
+ export { ContentResolver } from './content-resolver.js';
67
+ // Error handling framework - supports implementation-specific error contexts
68
+ export * from './error-handler.js';
69
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,213 @@
1
+ /**
2
+ * CORE MODULE — Shared between text-only (rag-lite-ts) and future multimodal (rag-lite-mm)
3
+ * Model-agnostic. No transformer or modality-specific logic.
4
+ */
5
+ import { type FileProcessorOptions } from '../file-processor.js';
6
+ import { type ChunkConfig } from './chunker.js';
7
+ import { IndexManager } from '../index-manager.js';
8
+ import { type DatabaseConnection } from './db.js';
9
+ import type { EmbedFunction } from './interfaces.js';
10
+ import { ContentManager, type MemoryContentMetadata } from './content-manager.js';
11
+ /**
12
+ * Options accepted by the IngestionPipeline ingest methods (every field is optional)
13
+ */
14
+ export interface IngestionOptions {
15
+ /** File processing options (FileProcessorOptions from ../file-processor.js) */
16
+ fileOptions?: FileProcessorOptions;
17
+ /** Chunking configuration (ChunkConfig from ./chunker.js) */
18
+ chunkConfig?: ChunkConfig;
19
+ /** Whether to force a rebuild of the index */
20
+ forceRebuild?: boolean;
21
+ /** Ingestion mode: 'text' or 'multimodal' */
22
+ mode?: 'text' | 'multimodal';
23
+ /** Content type for the ingested content */
24
+ contentType?: string;
25
+ }
26
+ /**
27
+ * Summary that ingestDirectory, ingestFile and ingestPath resolve to
28
+ */
29
+ export interface IngestionResult {
30
+ /** Total number of documents processed */
31
+ documentsProcessed: number;
32
+ /** Total number of chunks created */
33
+ chunksCreated: number;
34
+ /** Total number of embeddings generated */
35
+ embeddingsGenerated: number;
36
+ /** Number of documents that failed processing */
37
+ documentErrors: number;
38
+ /** Number of chunks that failed embedding */
39
+ embeddingErrors: number;
40
+ /** Processing time in milliseconds */
41
+ processingTimeMs: number;
42
+ /** Content IDs of successfully ingested documents */
43
+ contentIds: string[];
44
+ }
45
+ /**
46
+ * Main ingestion pipeline class
47
+ * Coordinates the entire process from file discovery to vector storage
48
+ * Uses explicit dependency injection for clean architecture
49
+ */
50
+ export declare class IngestionPipeline {
51
+ private embedFn;
52
+ private indexManager;
53
+ private db;
54
+ private defaultChunkConfig?;
55
+ private pathManager;
56
+ private contentManager;
57
+ /**
58
+ * Creates a new IngestionPipeline with explicit dependency injection
59
+ * Enhanced with ContentManager integration for unified content system
60
+ *
61
+ * DEPENDENCY INJECTION PATTERN:
62
+ * This constructor requires all dependencies to be explicitly provided, enabling:
63
+ * - Clean separation between core ingestion logic and implementation-specific components
64
+ * - Support for different embedding models and content types
65
+ * - Testability through mock injection
66
+ * - Future extensibility for multimodal content processing
67
+ * - Unified content management for both filesystem and memory-based ingestion
68
+ *
69
+ * @param embedFn - Function to embed document chunks into vectors
70
+ * - Signature: (query: string, contentType?: string) => Promise<EmbeddingResult>
71
+ * - Must handle chunk text and return consistent embedding format
72
+ * - Examples:
73
+ * - Text: const embedFn = (text) => textEmbedder.embedSingle(text)
74
+ * - Multimodal: const embedFn = (content, type) => type === 'image' ? clipEmbedder.embedImage(content) : clipEmbedder.embedText(content)
75
+ * - Custom: const embedFn = (text) => customModel.embed(text)
76
+ *
77
+ * @param indexManager - Vector index manager for storing embeddings
78
+ * - Handles vector storage and indexing operations
79
+ * - Must support the embedding dimensions produced by embedFn
80
+ * - Example: new IndexManager('./index.bin')
81
+ *
82
+ * @param db - Database connection for metadata storage
83
+ * - Stores document and chunk metadata with content type support
84
+ * - Supports different content types through metadata fields
85
+ * - Example: await openDatabase('./db.sqlite')
86
+ * @param defaultChunkConfig - Optional default ChunkConfig (4th constructor argument; may be passed as undefined)
87
+ * @param contentManager - Optional ContentManager for unified content system
88
+ * - Handles content storage routing and deduplication
89
+ * - If not provided, creates default instance with standard configuration
90
+ * - Example: new ContentManager(db, { contentDir: '.raglite/content' })
91
+ *
92
+ * USAGE EXAMPLES:
93
+ * ```typescript
94
+ * // Text-only ingestion pipeline with unified content system
95
+ * const textEmbedFn = await createTextEmbedder();
96
+ * const indexManager = new IndexManager('./index.bin');
97
+ * const db = await openDatabase('./db.sqlite');
98
+ * const contentManager = new ContentManager(db);
99
+ * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db, undefined, contentManager);
100
+ *
101
+ * // Simple usage (ContentManager created automatically)
102
+ * const simpleIngestion = new IngestionPipeline(textEmbedFn, indexManager, db);
103
+ *
104
+ * // Custom embedding implementation with memory ingestion
105
+ * const customEmbedFn = async (text) => ({
106
+ * embedding_id: generateId(),
107
+ * vector: await myCustomModel.embed(text)
108
+ * });
109
+ * const customIngestion = new IngestionPipeline(customEmbedFn, indexManager, db);
110
+ * await customIngestion.ingestFromMemory(buffer, { displayName: 'file.txt' });
111
+ * ```
112
+ */
113
+ constructor(embedFn: EmbedFunction, indexManager: IndexManager, db: DatabaseConnection, defaultChunkConfig?: ChunkConfig | undefined, contentManager?: ContentManager);
114
+ /**
115
+ * Ingest documents from a directory
116
+ * @param directoryPath - Path to directory containing documents
117
+ * @param options - Optional ingestion configuration
118
+ * @returns Promise resolving to ingestion results
119
+ */
120
+ ingestDirectory(directoryPath: string, options?: IngestionOptions): Promise<IngestionResult>;
121
+ /**
122
+ * Ingest a single file
123
+ * @param filePath - Path to the file to ingest
124
+ * @param options - Optional ingestion configuration
125
+ * @returns Promise resolving to ingestion results
126
+ */
127
+ ingestFile(filePath: string, options?: IngestionOptions): Promise<IngestionResult>;
128
+ /**
129
+ * Ingest content from memory buffer
130
+ * Enables MCP integration and real-time content processing
131
+ * @param content - Buffer containing the content to ingest
132
+ * @param metadata - Memory content metadata including display name and content type
133
+ * @param options - Optional ingestion configuration
134
+ * @returns Promise resolving to the content ID (a string) of the ingested content
135
+ */
136
+ ingestFromMemory(content: Buffer, metadata: MemoryContentMetadata, options?: IngestionOptions): Promise<string>;
137
+ /**
138
+ * Ingest documents from a path (file or directory); resolves to an IngestionResult
139
+ * Implements the complete pipeline: file processing → chunking → embedding → storage
140
+ * Enhanced to handle mixed content types (text and images) in multimodal mode
141
+ */
142
+ ingestPath(path: string, options?: IngestionOptions): Promise<IngestionResult>;
143
+ /**
144
+ * Analyze content types in the document collection
145
+ * @private
146
+ */
147
+ private analyzeContentTypes;
148
+ /**
149
+ * Chunk all documents and organize results with content-type awareness
150
+ * Enhanced to handle different content types appropriately
151
+ */
152
+ private chunkDocumentsWithContentTypes;
153
+ /**
154
+ * Chunk all documents and organize results (legacy method for backward compatibility)
155
+ * @deprecated Use chunkDocumentsWithContentTypes for multimodal support
156
+ */
157
+ private chunkDocuments;
158
+ /**
159
+ * Generate embeddings for all chunks with content-type support
160
+ * Enhanced to handle different content types and pass metadata to embedding function
161
+ */
162
+ private generateEmbeddingsWithContentTypes;
163
+ /**
164
+ * Generate embeddings for all chunks with error handling (legacy method for backward compatibility)
165
+ * @deprecated Use generateEmbeddingsWithContentTypes for multimodal support
166
+ */
167
+ private generateEmbeddings;
168
+ /**
169
+ * Store documents and chunks in database with content-type support
170
+ * Enhanced to handle content type metadata and multimodal content
171
+ * @returns Array of content IDs for successfully stored documents
172
+ */
173
+ private storeDocumentsAndChunksWithContentTypes;
174
+ /**
175
+ * Store documents and chunks in database (legacy method for backward compatibility)
176
+ * @deprecated Use storeDocumentsAndChunksWithContentTypes for multimodal support
177
+ */
178
+ private storeDocumentsAndChunks;
179
+ /**
180
+ * Update vector index with new embeddings
181
+ */
182
+ private updateVectorIndex;
183
+ /**
184
+ * Converts MIME type to simple content type for embedding function
185
+ * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
186
+ * @returns Simple content type ('text', 'image', etc.)
187
+ */
188
+ private getContentTypeForEmbedding;
189
+ /**
190
+ * Save the vector index to disk
191
+ */
192
+ saveIndex(): Promise<void>;
193
+ /**
194
+ * Process image content from memory using the existing image processing pipeline
195
+ * @private
196
+ */
197
+ private processImageFromMemory;
198
+ /**
199
+ * Process PDF content from memory using the existing PDF processing pipeline
200
+ * @private
201
+ */
202
+ private processPDFFromMemory;
203
+ /**
204
+ * Process DOCX content from memory using the existing DOCX processing pipeline
205
+ * @private
206
+ */
207
+ private processDOCXFromMemory;
208
+ /**
209
+ * Clean up resources - explicit cleanup method
210
+ */
211
+ cleanup(): Promise<void>;
212
+ }
213
+ //# sourceMappingURL=ingestion.d.ts.map