rag-lite-ts 1.0.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +651 -109
- package/dist/cli/indexer.js +262 -46
- package/dist/cli/search.js +54 -32
- package/dist/cli.js +185 -28
- package/dist/config.d.ts +34 -73
- package/dist/config.js +50 -255
- package/dist/core/abstract-embedder.d.ts +125 -0
- package/dist/core/abstract-embedder.js +264 -0
- package/dist/core/actionable-error-messages.d.ts +60 -0
- package/dist/core/actionable-error-messages.js +397 -0
- package/dist/core/adapters.d.ts +93 -0
- package/dist/core/adapters.js +139 -0
- package/dist/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/core/batch-processing-optimizer.js +541 -0
- package/dist/core/chunker.d.ts +119 -0
- package/dist/core/chunker.js +73 -0
- package/dist/core/cli-database-utils.d.ts +53 -0
- package/dist/core/cli-database-utils.js +239 -0
- package/dist/core/config.d.ts +102 -0
- package/dist/core/config.js +247 -0
- package/dist/core/content-errors.d.ts +111 -0
- package/dist/core/content-errors.js +362 -0
- package/dist/core/content-manager.d.ts +343 -0
- package/dist/core/content-manager.js +1504 -0
- package/dist/core/content-performance-optimizer.d.ts +150 -0
- package/dist/core/content-performance-optimizer.js +516 -0
- package/dist/core/content-resolver.d.ts +104 -0
- package/dist/core/content-resolver.js +285 -0
- package/dist/core/cross-modal-search.d.ts +164 -0
- package/dist/core/cross-modal-search.js +342 -0
- package/dist/core/database-connection-manager.d.ts +109 -0
- package/dist/core/database-connection-manager.js +304 -0
- package/dist/core/db.d.ts +245 -0
- package/dist/core/db.js +952 -0
- package/dist/core/embedder-factory.d.ts +176 -0
- package/dist/core/embedder-factory.js +338 -0
- package/dist/{error-handler.d.ts → core/error-handler.d.ts} +23 -2
- package/dist/{error-handler.js → core/error-handler.js} +51 -8
- package/dist/core/index.d.ts +59 -0
- package/dist/core/index.js +69 -0
- package/dist/core/ingestion.d.ts +213 -0
- package/dist/core/ingestion.js +812 -0
- package/dist/core/interfaces.d.ts +408 -0
- package/dist/core/interfaces.js +106 -0
- package/dist/core/lazy-dependency-loader.d.ts +152 -0
- package/dist/core/lazy-dependency-loader.js +453 -0
- package/dist/core/mode-detection-service.d.ts +150 -0
- package/dist/core/mode-detection-service.js +565 -0
- package/dist/core/mode-model-validator.d.ts +92 -0
- package/dist/core/mode-model-validator.js +203 -0
- package/dist/core/model-registry.d.ts +120 -0
- package/dist/core/model-registry.js +415 -0
- package/dist/core/model-validator.d.ts +217 -0
- package/dist/core/model-validator.js +782 -0
- package/dist/{path-manager.d.ts → core/path-manager.d.ts} +5 -0
- package/dist/{path-manager.js → core/path-manager.js} +5 -0
- package/dist/core/polymorphic-search-factory.d.ts +154 -0
- package/dist/core/polymorphic-search-factory.js +344 -0
- package/dist/core/raglite-paths.d.ts +121 -0
- package/dist/core/raglite-paths.js +145 -0
- package/dist/core/reranking-config.d.ts +42 -0
- package/dist/core/reranking-config.js +156 -0
- package/dist/core/reranking-factory.d.ts +92 -0
- package/dist/core/reranking-factory.js +591 -0
- package/dist/core/reranking-strategies.d.ts +325 -0
- package/dist/core/reranking-strategies.js +720 -0
- package/dist/core/resource-cleanup.d.ts +163 -0
- package/dist/core/resource-cleanup.js +371 -0
- package/dist/core/resource-manager.d.ts +212 -0
- package/dist/core/resource-manager.js +564 -0
- package/dist/core/search-pipeline.d.ts +111 -0
- package/dist/core/search-pipeline.js +287 -0
- package/dist/core/search.d.ts +131 -0
- package/dist/core/search.js +296 -0
- package/dist/core/streaming-operations.d.ts +145 -0
- package/dist/core/streaming-operations.js +409 -0
- package/dist/core/types.d.ts +66 -0
- package/dist/core/types.js +6 -0
- package/dist/core/universal-embedder.d.ts +177 -0
- package/dist/core/universal-embedder.js +139 -0
- package/dist/core/validation-messages.d.ts +99 -0
- package/dist/core/validation-messages.js +334 -0
- package/dist/{vector-index.d.ts → core/vector-index.d.ts} +4 -0
- package/dist/{vector-index.js → core/vector-index.js} +21 -3
- package/dist/dom-polyfills.d.ts +6 -0
- package/dist/dom-polyfills.js +40 -0
- package/dist/factories/index.d.ts +43 -0
- package/dist/factories/index.js +44 -0
- package/dist/factories/text-factory.d.ts +560 -0
- package/dist/factories/text-factory.js +968 -0
- package/dist/file-processor.d.ts +90 -4
- package/dist/file-processor.js +723 -20
- package/dist/index-manager.d.ts +3 -2
- package/dist/index-manager.js +13 -11
- package/dist/index.d.ts +72 -8
- package/dist/index.js +102 -16
- package/dist/indexer.js +1 -1
- package/dist/ingestion.d.ts +44 -154
- package/dist/ingestion.js +75 -671
- package/dist/mcp-server.d.ts +35 -3
- package/dist/mcp-server.js +1186 -79
- package/dist/multimodal/clip-embedder.d.ts +314 -0
- package/dist/multimodal/clip-embedder.js +945 -0
- package/dist/multimodal/index.d.ts +6 -0
- package/dist/multimodal/index.js +6 -0
- package/dist/preprocess.js +1 -1
- package/dist/run-error-recovery-tests.d.ts +7 -0
- package/dist/run-error-recovery-tests.js +101 -0
- package/dist/search-standalone.js +1 -1
- package/dist/search.d.ts +51 -69
- package/dist/search.js +117 -412
- package/dist/test-utils.d.ts +8 -26
- package/dist/text/chunker.d.ts +33 -0
- package/dist/{chunker.js → text/chunker.js} +98 -75
- package/dist/{embedder.d.ts → text/embedder.d.ts} +22 -1
- package/dist/{embedder.js → text/embedder.js} +84 -10
- package/dist/text/index.d.ts +8 -0
- package/dist/text/index.js +9 -0
- package/dist/text/preprocessors/index.d.ts +17 -0
- package/dist/text/preprocessors/index.js +38 -0
- package/dist/text/preprocessors/mdx.d.ts +25 -0
- package/dist/text/preprocessors/mdx.js +101 -0
- package/dist/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/text/preprocessors/mermaid.js +330 -0
- package/dist/text/preprocessors/registry.d.ts +56 -0
- package/dist/text/preprocessors/registry.js +180 -0
- package/dist/text/reranker.d.ts +59 -0
- package/dist/{reranker.js → text/reranker.js} +138 -53
- package/dist/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/text/sentence-transformer-embedder.js +340 -0
- package/dist/{tokenizer.d.ts → text/tokenizer.d.ts} +1 -0
- package/dist/{tokenizer.js → text/tokenizer.js} +7 -2
- package/dist/types.d.ts +40 -1
- package/dist/utils/vector-math.d.ts +31 -0
- package/dist/utils/vector-math.js +70 -0
- package/package.json +16 -4
- package/dist/api-errors.d.ts.map +0 -1
- package/dist/api-errors.js.map +0 -1
- package/dist/chunker.d.ts +0 -47
- package/dist/chunker.d.ts.map +0 -1
- package/dist/chunker.js.map +0 -1
- package/dist/cli/indexer.d.ts.map +0 -1
- package/dist/cli/indexer.js.map +0 -1
- package/dist/cli/search.d.ts.map +0 -1
- package/dist/cli/search.js.map +0 -1
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/db.d.ts +0 -90
- package/dist/db.d.ts.map +0 -1
- package/dist/db.js +0 -340
- package/dist/db.js.map +0 -1
- package/dist/embedder.d.ts.map +0 -1
- package/dist/embedder.js.map +0 -1
- package/dist/error-handler.d.ts.map +0 -1
- package/dist/error-handler.js.map +0 -1
- package/dist/file-processor.d.ts.map +0 -1
- package/dist/file-processor.js.map +0 -1
- package/dist/index-manager.d.ts.map +0 -1
- package/dist/index-manager.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/indexer.d.ts.map +0 -1
- package/dist/indexer.js.map +0 -1
- package/dist/ingestion.d.ts.map +0 -1
- package/dist/ingestion.js.map +0 -1
- package/dist/mcp-server.d.ts.map +0 -1
- package/dist/mcp-server.js.map +0 -1
- package/dist/path-manager.d.ts.map +0 -1
- package/dist/path-manager.js.map +0 -1
- package/dist/preprocess.d.ts.map +0 -1
- package/dist/preprocess.js.map +0 -1
- package/dist/preprocessors/index.d.ts.map +0 -1
- package/dist/preprocessors/index.js.map +0 -1
- package/dist/preprocessors/mdx.d.ts.map +0 -1
- package/dist/preprocessors/mdx.js.map +0 -1
- package/dist/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/preprocessors/mermaid.js.map +0 -1
- package/dist/preprocessors/registry.d.ts.map +0 -1
- package/dist/preprocessors/registry.js.map +0 -1
- package/dist/reranker.d.ts +0 -40
- package/dist/reranker.d.ts.map +0 -1
- package/dist/reranker.js.map +0 -1
- package/dist/resource-manager-demo.d.ts +0 -7
- package/dist/resource-manager-demo.d.ts.map +0 -1
- package/dist/resource-manager-demo.js +0 -52
- package/dist/resource-manager-demo.js.map +0 -1
- package/dist/resource-manager.d.ts +0 -129
- package/dist/resource-manager.d.ts.map +0 -1
- package/dist/resource-manager.js +0 -389
- package/dist/resource-manager.js.map +0 -1
- package/dist/search-standalone.d.ts.map +0 -1
- package/dist/search-standalone.js.map +0 -1
- package/dist/search.d.ts.map +0 -1
- package/dist/search.js.map +0 -1
- package/dist/test-utils.d.ts.map +0 -1
- package/dist/test-utils.js.map +0 -1
- package/dist/tokenizer.d.ts.map +0 -1
- package/dist/tokenizer.js.map +0 -1
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js.map +0 -1
- package/dist/vector-index.d.ts.map +0 -1
- package/dist/vector-index.js.map +0 -1
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CORE MODULE — Shared between text-only (rag-lite-ts) and future multimodal (rag-lite-mm)
|
|
3
|
+
* Model-agnostic. No transformer or modality-specific logic.
|
|
4
|
+
*
|
|
5
|
+
* This module provides the clean re-export surface for the core layer, enabling
|
|
6
|
+
* dependency injection patterns for different implementations (text-only, multimodal, etc.).
|
|
7
|
+
*
|
|
8
|
+
* DEPENDENCY INJECTION ARCHITECTURE:
|
|
9
|
+
*
|
|
10
|
+
* The core layer uses explicit dependency injection to maintain clean separation between
|
|
11
|
+
* model-agnostic logic and implementation-specific components:
|
|
12
|
+
*
|
|
13
|
+
* 1. Core Classes (SearchEngine, IngestionPipeline):
|
|
14
|
+
* - Accept injected functions (EmbedFunction, RerankFunction) in constructors
|
|
15
|
+
* - Coordinate model-agnostic operations (database, vector index, search pipeline)
|
|
16
|
+
* - No knowledge of specific embedding models or transformers
|
|
17
|
+
*
|
|
18
|
+
* 2. Dependency Injection Interfaces:
|
|
19
|
+
* - EmbedFunction: (query: string, contentType?: string) => Promise<EmbeddingResult>
|
|
20
|
+
* - RerankFunction: (query: string, results: SearchResult[], contentType?: string) => Promise<SearchResult[]>
|
|
21
|
+
* - Support different content types (text, image, etc.) and embedding dimensions
|
|
22
|
+
*
|
|
23
|
+
* 3. Usage Patterns:
|
|
24
|
+
*
|
|
25
|
+
* // Direct dependency injection (advanced users)
|
|
26
|
+
* const embedFn = await createTextEmbedder();
|
|
27
|
+
* const rerankFn = await createTextReranker();
|
|
28
|
+
* const indexManager = new IndexManager('./index.bin');
|
|
29
|
+
* const db = await openDatabase('./db.sqlite');
|
|
30
|
+
* const search = new SearchEngine(embedFn, indexManager, db, rerankFn);
|
|
31
|
+
*
|
|
32
|
+
* // Factory pattern (recommended for common use cases)
|
|
33
|
+
* const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
|
|
34
|
+
*
|
|
35
|
+
* 4. Extension Points:
|
|
36
|
+
* - New implementations (multimodal, custom models) implement the same interfaces
|
|
37
|
+
* - Core classes remain unchanged when adding new modalities
|
|
38
|
+
* - Plugin architecture enabled through interface-based design
|
|
39
|
+
*
|
|
40
|
+
* 5. Benefits:
|
|
41
|
+
* - Clean separation of concerns
|
|
42
|
+
* - Testability through mock injection
|
|
43
|
+
* - Future extensibility without core changes
|
|
44
|
+
* - Support for different embedding dimensions and content types
|
|
45
|
+
*/
|
|
46
|
+
export { type ContentDocument, type ContentChunk, type Document, type Chunk, type EmbeddingResult, type SearchResult, type SearchOptions, } from './types.js';
|
|
47
|
+
export { type EmbedFunction, type RerankFunction, type EmbeddingQueryInterface, type RerankingInterface, type SearchEngineConfig, type ContentTypeStrategy, type ModelAgnosticInterface, type ExtendedEmbeddingInterface, type ExtendedRerankingInterface, type SearchPipelineInterface, type SearchDependencyFactory, InterfaceValidator } from './interfaces.js';
|
|
48
|
+
export * from './adapters.js';
|
|
49
|
+
export * from './config.js';
|
|
50
|
+
export { type DatabaseConnection, type ContentMetadata, openDatabase, initializeSchema, insertDocument, insertChunk, upsertDocument, getChunksByEmbeddingIds, getModelVersion, setModelVersion, getStoredModelInfo, setStoredModelInfo, insertContentMetadata, getContentMetadata, getContentMetadataByHash, getContentMetadataByStorageType, deleteContentMetadata, getStorageStats, updateStorageStats } from './db.js';
|
|
51
|
+
export { type VectorIndexOptions, VectorIndex } from './vector-index.js';
|
|
52
|
+
export { type ChunkConfig, type GenericDocument, type GenericChunk, type ChunkingStrategy, ChunkingStrategyRegistry, DEFAULT_CHUNK_CONFIG, chunkingRegistry, chunkGenericDocument, registerTextChunkingStrategy } from './chunker.js';
|
|
53
|
+
export * from './search.js';
|
|
54
|
+
export * from './ingestion.js';
|
|
55
|
+
export * from './path-manager.js';
|
|
56
|
+
export { ContentManager, type MemoryContentMetadata, type ContentIngestionResult, type ContentManagerConfig } from './content-manager.js';
|
|
57
|
+
export { ContentResolver, type ContentRequest, type ContentResult } from './content-resolver.js';
|
|
58
|
+
export * from './error-handler.js';
|
|
59
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CORE MODULE — Shared between text-only (rag-lite-ts) and future multimodal (rag-lite-mm)
|
|
3
|
+
* Model-agnostic. No transformer or modality-specific logic.
|
|
4
|
+
*
|
|
5
|
+
* This module provides the clean re-export surface for the core layer, enabling
|
|
6
|
+
* dependency injection patterns for different implementations (text-only, multimodal, etc.).
|
|
7
|
+
*
|
|
8
|
+
* DEPENDENCY INJECTION ARCHITECTURE:
|
|
9
|
+
*
|
|
10
|
+
* The core layer uses explicit dependency injection to maintain clean separation between
|
|
11
|
+
* model-agnostic logic and implementation-specific components:
|
|
12
|
+
*
|
|
13
|
+
* 1. Core Classes (SearchEngine, IngestionPipeline):
|
|
14
|
+
* - Accept injected functions (EmbedFunction, RerankFunction) in constructors
|
|
15
|
+
* - Coordinate model-agnostic operations (database, vector index, search pipeline)
|
|
16
|
+
* - No knowledge of specific embedding models or transformers
|
|
17
|
+
*
|
|
18
|
+
* 2. Dependency Injection Interfaces:
|
|
19
|
+
* - EmbedFunction: (query: string, contentType?: string) => Promise<EmbeddingResult>
|
|
20
|
+
* - RerankFunction: (query: string, results: SearchResult[], contentType?: string) => Promise<SearchResult[]>
|
|
21
|
+
* - Support different content types (text, image, etc.) and embedding dimensions
|
|
22
|
+
*
|
|
23
|
+
* 3. Usage Patterns:
|
|
24
|
+
*
|
|
25
|
+
* // Direct dependency injection (advanced users)
|
|
26
|
+
* const embedFn = await createTextEmbedder();
|
|
27
|
+
* const rerankFn = await createTextReranker();
|
|
28
|
+
* const indexManager = new IndexManager('./index.bin');
|
|
29
|
+
* const db = await openDatabase('./db.sqlite');
|
|
30
|
+
* const search = new SearchEngine(embedFn, indexManager, db, rerankFn);
|
|
31
|
+
*
|
|
32
|
+
* // Factory pattern (recommended for common use cases)
|
|
33
|
+
* const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
|
|
34
|
+
*
|
|
35
|
+
* 4. Extension Points:
|
|
36
|
+
* - New implementations (multimodal, custom models) implement the same interfaces
|
|
37
|
+
* - Core classes remain unchanged when adding new modalities
|
|
38
|
+
* - Plugin architecture enabled through interface-based design
|
|
39
|
+
*
|
|
40
|
+
* 5. Benefits:
|
|
41
|
+
* - Clean separation of concerns
|
|
42
|
+
* - Testability through mock injection
|
|
43
|
+
* - Future extensibility without core changes
|
|
44
|
+
* - Support for different embedding dimensions and content types
|
|
45
|
+
*/
|
|
46
|
+
// Dependency injection interfaces and utilities
|
|
47
|
+
export { InterfaceValidator } from './interfaces.js';
|
|
48
|
+
// Adapter utilities for converting implementations to dependency injection
|
|
49
|
+
export * from './adapters.js';
|
|
50
|
+
// Core configuration management - model-agnostic settings
|
|
51
|
+
export * from './config.js';
|
|
52
|
+
// Database operations - supports different content types through metadata
|
|
53
|
+
export { openDatabase, initializeSchema, insertDocument, insertChunk, upsertDocument, getChunksByEmbeddingIds, getModelVersion, setModelVersion, getStoredModelInfo, setStoredModelInfo, insertContentMetadata, getContentMetadata, getContentMetadataByHash, getContentMetadataByStorageType, deleteContentMetadata, getStorageStats, updateStorageStats } from './db.js';
|
|
54
|
+
// Vector index operations - works with any embedding dimensions
|
|
55
|
+
export { VectorIndex } from './vector-index.js';
|
|
56
|
+
// Generic chunking interfaces and strategies - supports text, image metadata, etc.
|
|
57
|
+
export { ChunkingStrategyRegistry, DEFAULT_CHUNK_CONFIG, chunkingRegistry, chunkGenericDocument, registerTextChunkingStrategy } from './chunker.js';
|
|
58
|
+
// Core search engine - uses dependency injection for embedding and reranking
|
|
59
|
+
export * from './search.js';
|
|
60
|
+
// Core ingestion pipeline - uses dependency injection for embedding
|
|
61
|
+
export * from './ingestion.js';
|
|
62
|
+
// Path management utilities - content-type agnostic
|
|
63
|
+
export * from './path-manager.js';
|
|
64
|
+
// Unified content system - handles both filesystem and memory content
|
|
65
|
+
export { ContentManager } from './content-manager.js';
|
|
66
|
+
export { ContentResolver } from './content-resolver.js';
|
|
67
|
+
// Error handling framework - supports implementation-specific error contexts
|
|
68
|
+
export * from './error-handler.js';
|
|
69
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CORE MODULE — Shared between text-only (rag-lite-ts) and future multimodal (rag-lite-mm)
|
|
3
|
+
* Model-agnostic. No transformer or modality-specific logic.
|
|
4
|
+
*/
|
|
5
|
+
import { type FileProcessorOptions } from '../file-processor.js';
|
|
6
|
+
import { type ChunkConfig } from './chunker.js';
|
|
7
|
+
import { IndexManager } from '../index-manager.js';
|
|
8
|
+
import { type DatabaseConnection } from './db.js';
|
|
9
|
+
import type { EmbedFunction } from './interfaces.js';
|
|
10
|
+
import { ContentManager, type MemoryContentMetadata } from './content-manager.js';
|
|
11
|
+
/**
 * Options for the ingestion pipeline.
 *
 * Accepted as the optional `options` argument of the pipeline's
 * `ingestDirectory`, `ingestFile`, `ingestFromMemory` and `ingestPath`
 * methods. Every field is optional.
 */
export interface IngestionOptions {
    /** File processing options */
    fileOptions?: FileProcessorOptions;
    /** Chunking configuration */
    chunkConfig?: ChunkConfig;
    /** Whether to force rebuild the index */
    forceRebuild?: boolean;
    /** Mode for the ingestion pipeline (text or multimodal) */
    mode?: 'text' | 'multimodal';
    /** Content type for the ingested content */
    contentType?: string;
}
|
|
26
|
+
/**
 * Result of the ingestion process.
 *
 * Returned (wrapped in a Promise) by the pipeline's `ingestDirectory`,
 * `ingestFile` and `ingestPath` methods.
 */
export interface IngestionResult {
    /** Total documents processed */
    documentsProcessed: number;
    /** Total chunks created */
    chunksCreated: number;
    /** Total embeddings generated */
    embeddingsGenerated: number;
    /** Number of documents that failed processing */
    documentErrors: number;
    /** Number of chunks that failed embedding */
    embeddingErrors: number;
    /** Processing time in milliseconds */
    processingTimeMs: number;
    /** Content IDs of successfully ingested documents */
    contentIds: string[];
}
|
|
45
|
+
/**
 * Main ingestion pipeline class
 * Coordinates the entire process from file discovery to vector storage
 * Uses explicit dependency injection for clean architecture
 */
export declare class IngestionPipeline {
    private embedFn;
    private indexManager;
    private db;
    private defaultChunkConfig?;
    private pathManager;
    private contentManager;
    /**
     * Creates a new IngestionPipeline with explicit dependency injection
     * Enhanced with ContentManager integration for unified content system
     *
     * DEPENDENCY INJECTION PATTERN:
     * This constructor requires all dependencies to be explicitly provided, enabling:
     * - Clean separation between core ingestion logic and implementation-specific components
     * - Support for different embedding models and content types
     * - Testability through mock injection
     * - Future extensibility for multimodal content processing
     * - Unified content management for both filesystem and memory-based ingestion
     *
     * @param embedFn - Function to embed document chunks into vectors
     *   - Signature: (query: string, contentType?: string) => Promise<EmbeddingResult>
     *   - Must handle chunk text and return consistent embedding format
     *   - Examples:
     *     - Text: const embedFn = (text) => textEmbedder.embedSingle(text)
     *     - Multimodal: const embedFn = (content, type) => type === 'image' ? clipEmbedder.embedImage(content) : clipEmbedder.embedText(content)
     *     - Custom: const embedFn = (text) => customModel.embed(text)
     *
     * @param indexManager - Vector index manager for storing embeddings
     *   - Handles vector storage and indexing operations
     *   - Must support the embedding dimensions produced by embedFn
     *   - Example: new IndexManager('./index.bin')
     *
     * @param db - Database connection for metadata storage
     *   - Stores document and chunk metadata with content type support
     *   - Supports different content types through metadata fields
     *   - Example: await openDatabase('./db.sqlite')
     *
     * @param defaultChunkConfig - Optional default chunking configuration
     *   - May be passed as `undefined` to skip it while still supplying contentManager
     *   - NOTE(review): presumably the fallback used when `IngestionOptions.chunkConfig`
     *     is not given; confirm against the implementation
     *
     * @param contentManager - Optional ContentManager for unified content system
     *   - Handles content storage routing and deduplication
     *   - If not provided, creates default instance with standard configuration
     *   - Example: new ContentManager(db, { contentDir: '.raglite/content' })
     *
     * USAGE EXAMPLES:
     * ```typescript
     * // Text-only ingestion pipeline with unified content system
     * const textEmbedFn = await createTextEmbedder();
     * const indexManager = new IndexManager('./index.bin');
     * const db = await openDatabase('./db.sqlite');
     * const contentManager = new ContentManager(db);
     * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db, undefined, contentManager);
     *
     * // Simple usage (ContentManager created automatically)
     * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db);
     *
     * // Custom embedding implementation with memory ingestion
     * const customEmbedFn = async (text) => ({
     *   embedding_id: generateId(),
     *   vector: await myCustomModel.embed(text)
     * });
     * const ingestion = new IngestionPipeline(customEmbedFn, indexManager, db);
     * await ingestion.ingestFromMemory(buffer, { displayName: 'file.txt' });
     * ```
     */
    constructor(embedFn: EmbedFunction, indexManager: IndexManager, db: DatabaseConnection, defaultChunkConfig?: ChunkConfig | undefined, contentManager?: ContentManager);
    /**
     * Ingest documents from a directory
     * @param directoryPath - Path to directory containing documents
     * @param options - Optional ingestion configuration
     * @returns Promise resolving to ingestion results
     */
    ingestDirectory(directoryPath: string, options?: IngestionOptions): Promise<IngestionResult>;
    /**
     * Ingest a single file
     * @param filePath - Path to the file to ingest
     * @param options - Optional ingestion configuration
     * @returns Promise resolving to ingestion results
     */
    ingestFile(filePath: string, options?: IngestionOptions): Promise<IngestionResult>;
    /**
     * Ingest content from memory buffer
     * Enables MCP integration and real-time content processing
     * @param content - Buffer containing the content to ingest
     * @param metadata - Memory content metadata including display name and content type
     * @param options - Optional ingestion configuration
     * @returns Promise resolving to content ID for the ingested content
     */
    ingestFromMemory(content: Buffer, metadata: MemoryContentMetadata, options?: IngestionOptions): Promise<string>;
    /**
     * Ingest documents from a path (file or directory)
     * Implements the complete pipeline: file processing → chunking → embedding → storage
     * Enhanced to handle mixed content types (text and images) in multimodal mode
     * @param path - Path to a file or a directory to ingest
     * @param options - Optional ingestion configuration
     * @returns Promise resolving to ingestion results
     */
    ingestPath(path: string, options?: IngestionOptions): Promise<IngestionResult>;
    /**
     * Analyze content types in the document collection
     * @private
     */
    private analyzeContentTypes;
    /**
     * Chunk all documents and organize results with content-type awareness
     * Enhanced to handle different content types appropriately
     */
    private chunkDocumentsWithContentTypes;
    /**
     * Chunk all documents and organize results (legacy method for backward compatibility)
     * @deprecated Use chunkDocumentsWithContentTypes for multimodal support
     */
    private chunkDocuments;
    /**
     * Generate embeddings for all chunks with content-type support
     * Enhanced to handle different content types and pass metadata to embedding function
     */
    private generateEmbeddingsWithContentTypes;
    /**
     * Generate embeddings for all chunks with error handling (legacy method for backward compatibility)
     * @deprecated Use generateEmbeddingsWithContentTypes for multimodal support
     */
    private generateEmbeddings;
    /**
     * Store documents and chunks in database with content-type support
     * Enhanced to handle content type metadata and multimodal content
     * @returns Array of content IDs for successfully stored documents
     */
    private storeDocumentsAndChunksWithContentTypes;
    /**
     * Store documents and chunks in database (legacy method for backward compatibility)
     * @deprecated Use storeDocumentsAndChunksWithContentTypes for multimodal support
     */
    private storeDocumentsAndChunks;
    /**
     * Update vector index with new embeddings
     */
    private updateVectorIndex;
    /**
     * Converts MIME type to simple content type for embedding function
     * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
     * @returns Simple content type ('text', 'image', etc.)
     */
    private getContentTypeForEmbedding;
    /**
     * Save the vector index to disk
     */
    saveIndex(): Promise<void>;
    /**
     * Process image content from memory using the existing image processing pipeline
     * @private
     */
    private processImageFromMemory;
    /**
     * Process PDF content from memory using the existing PDF processing pipeline
     * @private
     */
    private processPDFFromMemory;
    /**
     * Process DOCX content from memory using the existing DOCX processing pipeline
     * @private
     */
    private processDOCXFromMemory;
    /**
     * Clean up resources - explicit cleanup method
     */
    cleanup(): Promise<void>;
}
|
|
213
|
+
//# sourceMappingURL=ingestion.d.ts.map
|