rag-lite-ts 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{cli → cjs/cli}/indexer.js +1 -1
- package/dist/{cli → cjs/cli}/search.js +5 -10
- package/dist/{core → cjs/core}/binary-index-format.d.ts +28 -2
- package/dist/cjs/core/binary-index-format.js +291 -0
- package/dist/{core → cjs/core}/ingestion.d.ts +5 -1
- package/dist/{core → cjs/core}/ingestion.js +76 -9
- package/dist/{core → cjs/core}/model-validator.js +1 -1
- package/dist/{core → cjs/core}/reranking-strategies.js +4 -5
- package/dist/{core → cjs/core}/search.js +2 -1
- package/dist/{core → cjs/core}/types.d.ts +1 -1
- package/dist/{core → cjs/core}/vector-index.d.ts +4 -0
- package/dist/{core → cjs/core}/vector-index.js +10 -2
- package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +2 -0
- package/dist/{file-processor.js → cjs/file-processor.js} +20 -0
- package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +17 -1
- package/dist/{index-manager.js → cjs/index-manager.js} +148 -7
- package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +71 -66
- package/dist/esm/api-errors.d.ts +90 -0
- package/dist/esm/api-errors.js +320 -0
- package/dist/esm/cli/indexer.d.ts +11 -0
- package/dist/esm/cli/indexer.js +471 -0
- package/dist/esm/cli/search.d.ts +7 -0
- package/dist/esm/cli/search.js +332 -0
- package/dist/esm/cli.d.ts +3 -0
- package/dist/esm/cli.js +529 -0
- package/dist/esm/config.d.ts +51 -0
- package/dist/esm/config.js +79 -0
- package/dist/esm/core/abstract-embedder.d.ts +125 -0
- package/dist/esm/core/abstract-embedder.js +264 -0
- package/dist/esm/core/actionable-error-messages.d.ts +60 -0
- package/dist/esm/core/actionable-error-messages.js +397 -0
- package/dist/esm/core/adapters.d.ts +93 -0
- package/dist/esm/core/adapters.js +139 -0
- package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/esm/core/batch-processing-optimizer.js +536 -0
- package/dist/esm/core/binary-index-format.d.ts +78 -0
- package/dist/esm/core/binary-index-format.js +291 -0
- package/dist/esm/core/chunker.d.ts +119 -0
- package/dist/esm/core/chunker.js +73 -0
- package/dist/esm/core/cli-database-utils.d.ts +53 -0
- package/dist/esm/core/cli-database-utils.js +239 -0
- package/dist/esm/core/config.d.ts +102 -0
- package/dist/esm/core/config.js +247 -0
- package/dist/esm/core/content-errors.d.ts +111 -0
- package/dist/esm/core/content-errors.js +362 -0
- package/dist/esm/core/content-manager.d.ts +335 -0
- package/dist/esm/core/content-manager.js +1476 -0
- package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
- package/dist/esm/core/content-performance-optimizer.js +516 -0
- package/dist/esm/core/content-resolver.d.ts +104 -0
- package/dist/esm/core/content-resolver.js +285 -0
- package/dist/esm/core/cross-modal-search.d.ts +164 -0
- package/dist/esm/core/cross-modal-search.js +342 -0
- package/dist/esm/core/database-connection-manager.d.ts +109 -0
- package/dist/esm/core/database-connection-manager.js +310 -0
- package/dist/esm/core/db.d.ts +213 -0
- package/dist/esm/core/db.js +895 -0
- package/dist/esm/core/embedder-factory.d.ts +154 -0
- package/dist/esm/core/embedder-factory.js +311 -0
- package/dist/esm/core/error-handler.d.ts +112 -0
- package/dist/esm/core/error-handler.js +239 -0
- package/dist/esm/core/index.d.ts +59 -0
- package/dist/esm/core/index.js +69 -0
- package/dist/esm/core/ingestion.d.ts +202 -0
- package/dist/esm/core/ingestion.js +901 -0
- package/dist/esm/core/interfaces.d.ts +408 -0
- package/dist/esm/core/interfaces.js +106 -0
- package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
- package/dist/esm/core/lazy-dependency-loader.js +435 -0
- package/dist/esm/core/mode-detection-service.d.ts +150 -0
- package/dist/esm/core/mode-detection-service.js +565 -0
- package/dist/esm/core/mode-model-validator.d.ts +92 -0
- package/dist/esm/core/mode-model-validator.js +203 -0
- package/dist/esm/core/model-registry.d.ts +116 -0
- package/dist/esm/core/model-registry.js +411 -0
- package/dist/esm/core/model-validator.d.ts +217 -0
- package/dist/esm/core/model-validator.js +782 -0
- package/dist/esm/core/path-manager.d.ts +47 -0
- package/dist/esm/core/path-manager.js +71 -0
- package/dist/esm/core/raglite-paths.d.ts +121 -0
- package/dist/esm/core/raglite-paths.js +145 -0
- package/dist/esm/core/reranking-config.d.ts +42 -0
- package/dist/esm/core/reranking-config.js +147 -0
- package/dist/esm/core/reranking-factory.d.ts +92 -0
- package/dist/esm/core/reranking-factory.js +410 -0
- package/dist/esm/core/reranking-strategies.d.ts +310 -0
- package/dist/esm/core/reranking-strategies.js +650 -0
- package/dist/esm/core/resource-cleanup.d.ts +163 -0
- package/dist/esm/core/resource-cleanup.js +371 -0
- package/dist/esm/core/resource-manager.d.ts +212 -0
- package/dist/esm/core/resource-manager.js +564 -0
- package/dist/esm/core/search-pipeline.d.ts +111 -0
- package/dist/esm/core/search-pipeline.js +287 -0
- package/dist/esm/core/search.d.ts +141 -0
- package/dist/esm/core/search.js +320 -0
- package/dist/esm/core/streaming-operations.d.ts +145 -0
- package/dist/esm/core/streaming-operations.js +409 -0
- package/dist/esm/core/types.d.ts +66 -0
- package/dist/esm/core/types.js +6 -0
- package/dist/esm/core/universal-embedder.d.ts +177 -0
- package/dist/esm/core/universal-embedder.js +139 -0
- package/dist/esm/core/validation-messages.d.ts +99 -0
- package/dist/esm/core/validation-messages.js +334 -0
- package/dist/esm/core/vector-index.d.ts +72 -0
- package/dist/esm/core/vector-index.js +333 -0
- package/dist/esm/dom-polyfills.d.ts +6 -0
- package/dist/esm/dom-polyfills.js +37 -0
- package/dist/esm/factories/index.d.ts +27 -0
- package/dist/esm/factories/index.js +29 -0
- package/dist/esm/factories/ingestion-factory.d.ts +200 -0
- package/dist/esm/factories/ingestion-factory.js +477 -0
- package/dist/esm/factories/search-factory.d.ts +154 -0
- package/dist/esm/factories/search-factory.js +344 -0
- package/dist/esm/file-processor.d.ts +147 -0
- package/dist/esm/file-processor.js +963 -0
- package/dist/esm/index-manager.d.ts +116 -0
- package/dist/esm/index-manager.js +598 -0
- package/dist/esm/index.d.ts +75 -0
- package/dist/esm/index.js +110 -0
- package/dist/esm/indexer.d.ts +7 -0
- package/dist/esm/indexer.js +54 -0
- package/dist/esm/ingestion.d.ts +63 -0
- package/dist/esm/ingestion.js +124 -0
- package/dist/esm/mcp-server.d.ts +46 -0
- package/dist/esm/mcp-server.js +1820 -0
- package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
- package/dist/esm/multimodal/clip-embedder.js +996 -0
- package/dist/esm/multimodal/index.d.ts +6 -0
- package/dist/esm/multimodal/index.js +6 -0
- package/dist/esm/preprocess.d.ts +19 -0
- package/dist/esm/preprocess.js +203 -0
- package/dist/esm/preprocessors/index.d.ts +17 -0
- package/dist/esm/preprocessors/index.js +38 -0
- package/dist/esm/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/preprocessors/mdx.js +101 -0
- package/dist/esm/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/preprocessors/mermaid.js +329 -0
- package/dist/esm/preprocessors/registry.d.ts +56 -0
- package/dist/esm/preprocessors/registry.js +179 -0
- package/dist/esm/run-error-recovery-tests.d.ts +7 -0
- package/dist/esm/run-error-recovery-tests.js +101 -0
- package/dist/esm/search-standalone.d.ts +7 -0
- package/dist/esm/search-standalone.js +117 -0
- package/dist/esm/search.d.ts +99 -0
- package/dist/esm/search.js +177 -0
- package/dist/esm/test-utils.d.ts +18 -0
- package/dist/esm/test-utils.js +27 -0
- package/dist/esm/text/chunker.d.ts +33 -0
- package/dist/esm/text/chunker.js +279 -0
- package/dist/esm/text/embedder.d.ts +111 -0
- package/dist/esm/text/embedder.js +386 -0
- package/dist/esm/text/index.d.ts +8 -0
- package/dist/esm/text/index.js +9 -0
- package/dist/esm/text/preprocessors/index.d.ts +17 -0
- package/dist/esm/text/preprocessors/index.js +38 -0
- package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/text/preprocessors/mdx.js +101 -0
- package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/text/preprocessors/mermaid.js +330 -0
- package/dist/esm/text/preprocessors/registry.d.ts +56 -0
- package/dist/esm/text/preprocessors/registry.js +180 -0
- package/dist/esm/text/reranker.d.ts +49 -0
- package/dist/esm/text/reranker.js +274 -0
- package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/esm/text/sentence-transformer-embedder.js +340 -0
- package/dist/esm/text/tokenizer.d.ts +22 -0
- package/dist/esm/text/tokenizer.js +64 -0
- package/dist/esm/types.d.ts +83 -0
- package/dist/esm/types.js +3 -0
- package/dist/esm/utils/vector-math.d.ts +31 -0
- package/dist/esm/utils/vector-math.js +70 -0
- package/package.json +30 -12
- package/dist/core/binary-index-format.js +0 -122
- /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
- /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
- /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
- /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
- /package/dist/{cli.js → cjs/cli.js} +0 -0
- /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
- /package/dist/{config.js → cjs/config.js} +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
- /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
- /package/dist/{core → cjs/core}/adapters.js +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
- /package/dist/{core → cjs/core}/chunker.js +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
- /package/dist/{core → cjs/core}/config.d.ts +0 -0
- /package/dist/{core → cjs/core}/config.js +0 -0
- /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-errors.js +0 -0
- /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-manager.js +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-resolver.js +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
- /package/dist/{core → cjs/core}/db.d.ts +0 -0
- /package/dist/{core → cjs/core}/db.js +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
- /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
- /package/dist/{core → cjs/core}/error-handler.js +0 -0
- /package/dist/{core → cjs/core}/index.d.ts +0 -0
- /package/dist/{core → cjs/core}/index.js +0 -0
- /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
- /package/dist/{core → cjs/core}/interfaces.js +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
- /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
- /package/dist/{core → cjs/core}/model-registry.js +0 -0
- /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.js +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
- /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-config.js +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
- /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-manager.js +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.js +0 -0
- /package/dist/{core → cjs/core}/search.d.ts +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
- /package/dist/{core → cjs/core}/types.js +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
- /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/validation-messages.js +0 -0
- /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
- /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
- /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/index.js +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.js +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.js +0 -0
- /package/dist/{index.d.ts → cjs/index.d.ts} +0 -0
- /package/dist/{index.js → cjs/index.js} +0 -0
- /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
- /package/dist/{indexer.js → cjs/indexer.js} +0 -0
- /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
- /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
- /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
- /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
- /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
- /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
- /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
- /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
- /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
- /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
- /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
- /package/dist/{search.js → cjs/search.js} +0 -0
- /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
- /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
- /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
- /package/dist/{text → cjs/text}/chunker.js +0 -0
- /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/embedder.js +0 -0
- /package/dist/{text → cjs/text}/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
- /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
- /package/dist/{text → cjs/text}/reranker.js +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
- /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
- /package/dist/{text → cjs/text}/tokenizer.js +0 -0
- /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
- /package/dist/{types.js → cjs/types.js} +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
|
@@ -198,7 +198,7 @@ export async function runIngest(path, options = {}) {
|
|
|
198
198
|
showProgress: true,
|
|
199
199
|
maxWaitMs: 15000 // Longer timeout for ingestion
|
|
200
200
|
});
|
|
201
|
-
const result = await pipeline.ingestPath(resolvedPath);
|
|
201
|
+
const result = await pipeline.ingestPath(resolvedPath, { mode: factoryOptions.mode });
|
|
202
202
|
// Display final results
|
|
203
203
|
console.log('\n' + '='.repeat(50));
|
|
204
204
|
console.log('INGESTION SUMMARY');
|
|
@@ -137,6 +137,11 @@ export async function runSearch(query, options = {}) {
|
|
|
137
137
|
if (options['top-k'] !== undefined) {
|
|
138
138
|
searchOptions.top_k = options['top-k'];
|
|
139
139
|
}
|
|
140
|
+
// Set content type filter for search-level filtering
|
|
141
|
+
const contentTypeFilter = options['content-type'];
|
|
142
|
+
if (contentTypeFilter && contentTypeFilter !== 'all') {
|
|
143
|
+
searchOptions.contentType = contentTypeFilter;
|
|
144
|
+
}
|
|
140
145
|
// Phase 2: Disable reranking for image-to-image searches to preserve visual similarity
|
|
141
146
|
let rerankingForciblyDisabled = false;
|
|
142
147
|
if (isImage && embedder) {
|
|
@@ -174,16 +179,6 @@ export async function runSearch(query, options = {}) {
|
|
|
174
179
|
results = await searchEngine.search(query, searchOptions);
|
|
175
180
|
}
|
|
176
181
|
const searchTime = Date.now() - startTime;
|
|
177
|
-
// Apply content type filter if specified
|
|
178
|
-
const contentTypeFilter = options['content-type'];
|
|
179
|
-
if (contentTypeFilter && contentTypeFilter !== 'all') {
|
|
180
|
-
const originalCount = results.length;
|
|
181
|
-
results = results.filter(r => r.contentType === contentTypeFilter);
|
|
182
|
-
if (results.length < originalCount) {
|
|
183
|
-
console.log(`Filtered to ${results.length} ${contentTypeFilter} result${results.length === 1 ? '' : 's'} (from ${originalCount} total)`);
|
|
184
|
-
console.log('');
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
182
|
// Display results
|
|
188
183
|
if (results.length === 0) {
|
|
189
184
|
console.log('No results found.');
|
|
@@ -25,10 +25,19 @@ export interface BinaryIndexData {
|
|
|
25
25
|
id: number;
|
|
26
26
|
vector: Float32Array;
|
|
27
27
|
}>;
|
|
28
|
+
hasContentTypeGroups?: boolean;
|
|
29
|
+
textVectors?: Array<{
|
|
30
|
+
id: number;
|
|
31
|
+
vector: Float32Array;
|
|
32
|
+
}>;
|
|
33
|
+
imageVectors?: Array<{
|
|
34
|
+
id: number;
|
|
35
|
+
vector: Float32Array;
|
|
36
|
+
}>;
|
|
28
37
|
}
|
|
29
38
|
export declare class BinaryIndexFormat {
|
|
30
39
|
/**
|
|
31
|
-
* Save index data to binary format
|
|
40
|
+
* Save index data to binary format (original format for backward compatibility)
|
|
32
41
|
*
|
|
33
42
|
* File structure:
|
|
34
43
|
* - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed, currentSize
|
|
@@ -39,7 +48,24 @@ export declare class BinaryIndexFormat {
|
|
|
39
48
|
*/
|
|
40
49
|
static save(indexPath: string, data: BinaryIndexData): Promise<void>;
|
|
41
50
|
/**
|
|
42
|
-
*
|
|
51
|
+
* Save index data to grouped binary format
|
|
52
|
+
*
|
|
53
|
+
* File structure:
|
|
54
|
+
* - Extended Header (44 bytes):
|
|
55
|
+
* - Original 6 fields (24 bytes)
|
|
56
|
+
* - hasGroups flag (4 bytes)
|
|
57
|
+
* - textOffset (4 bytes)
|
|
58
|
+
* - textCount (4 bytes)
|
|
59
|
+
* - imageOffset (4 bytes)
|
|
60
|
+
* - imageCount (4 bytes)
|
|
61
|
+
* - Data section: [text vectors...][image vectors...]
|
|
62
|
+
*
|
|
63
|
+
* @param indexPath Path to save the binary index file
|
|
64
|
+
* @param data Index data to serialize
|
|
65
|
+
*/
|
|
66
|
+
static saveGrouped(indexPath: string, data: BinaryIndexData): Promise<void>;
|
|
67
|
+
/**
|
|
68
|
+
* Load index data from binary format (supports both original and grouped formats)
|
|
43
69
|
*
|
|
44
70
|
* Uses zero-copy Float32Array views for efficient loading.
|
|
45
71
|
* Copies the views to ensure data persistence after buffer lifecycle.
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Binary Index Format Module
|
|
3
|
+
*
|
|
4
|
+
* Provides efficient binary serialization for HNSW vector indices.
|
|
5
|
+
*
|
|
6
|
+
* Format Specification:
|
|
7
|
+
* - Header: 24 bytes (6 × uint32)
|
|
8
|
+
* - Vectors: N × (4 + D × 4) bytes
|
|
9
|
+
* - Little-endian encoding for cross-platform compatibility
|
|
10
|
+
* - 4-byte alignment for Float32Array zero-copy views
|
|
11
|
+
*
|
|
12
|
+
* Performance:
|
|
13
|
+
* - 3.66x smaller than JSON format
|
|
14
|
+
* - 3.5x faster loading
|
|
15
|
+
* - Zero-copy Float32Array views
|
|
16
|
+
*/
|
|
17
|
+
import { readFileSync, writeFileSync } from 'fs';
|
|
18
|
+
export class BinaryIndexFormat {
|
|
19
|
+
/**
|
|
20
|
+
* Save index data to binary format (original format for backward compatibility)
|
|
21
|
+
*
|
|
22
|
+
* File structure:
|
|
23
|
+
* - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed, currentSize
|
|
24
|
+
* - Vectors: For each vector: id (4 bytes) + vector data (dimensions × 4 bytes)
|
|
25
|
+
*
|
|
26
|
+
* @param indexPath Path to save the binary index file
|
|
27
|
+
* @param data Index data to serialize
|
|
28
|
+
*/
|
|
29
|
+
static async save(indexPath, data) {
|
|
30
|
+
// Calculate total size
|
|
31
|
+
const headerSize = 24; // 6 uint32 fields
|
|
32
|
+
const vectorSize = 4 + (data.dimensions * 4); // id + vector
|
|
33
|
+
const totalSize = headerSize + (data.currentSize * vectorSize);
|
|
34
|
+
const buffer = new ArrayBuffer(totalSize);
|
|
35
|
+
const view = new DataView(buffer);
|
|
36
|
+
let offset = 0;
|
|
37
|
+
// Write header (24 bytes, all little-endian)
|
|
38
|
+
view.setUint32(offset, data.dimensions, true);
|
|
39
|
+
offset += 4;
|
|
40
|
+
view.setUint32(offset, data.maxElements, true);
|
|
41
|
+
offset += 4;
|
|
42
|
+
view.setUint32(offset, data.M, true);
|
|
43
|
+
offset += 4;
|
|
44
|
+
view.setUint32(offset, data.efConstruction, true);
|
|
45
|
+
offset += 4;
|
|
46
|
+
view.setUint32(offset, data.seed, true);
|
|
47
|
+
offset += 4;
|
|
48
|
+
view.setUint32(offset, data.currentSize, true);
|
|
49
|
+
offset += 4;
|
|
50
|
+
// Write vectors
|
|
51
|
+
for (const item of data.vectors) {
|
|
52
|
+
// Ensure 4-byte alignment (should always be true with our format)
|
|
53
|
+
if (offset % 4 !== 0) {
|
|
54
|
+
throw new Error(`Offset ${offset} is not 4-byte aligned`);
|
|
55
|
+
}
|
|
56
|
+
// Write vector ID
|
|
57
|
+
view.setUint32(offset, item.id, true);
|
|
58
|
+
offset += 4;
|
|
59
|
+
// Write vector data
|
|
60
|
+
for (let i = 0; i < item.vector.length; i++) {
|
|
61
|
+
view.setFloat32(offset, item.vector[i], true);
|
|
62
|
+
offset += 4;
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
// Write to file
|
|
66
|
+
writeFileSync(indexPath, Buffer.from(buffer));
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Save index data to grouped binary format
|
|
70
|
+
*
|
|
71
|
+
* File structure:
|
|
72
|
+
* - Extended Header (44 bytes):
|
|
73
|
+
* - Original 6 fields (24 bytes)
|
|
74
|
+
* - hasGroups flag (4 bytes)
|
|
75
|
+
* - textOffset (4 bytes)
|
|
76
|
+
* - textCount (4 bytes)
|
|
77
|
+
* - imageOffset (4 bytes)
|
|
78
|
+
* - imageCount (4 bytes)
|
|
79
|
+
* - Data section: [text vectors...][image vectors...]
|
|
80
|
+
*
|
|
81
|
+
* @param indexPath Path to save the binary index file
|
|
82
|
+
* @param data Index data to serialize
|
|
83
|
+
*/
|
|
84
|
+
static async saveGrouped(indexPath, data) {
|
|
85
|
+
if (!data.hasContentTypeGroups || !data.textVectors || !data.imageVectors) {
|
|
86
|
+
// Fallback to original format
|
|
87
|
+
return this.save(indexPath, data);
|
|
88
|
+
}
|
|
89
|
+
const headerSize = 44; // Extended header: 24 + 20 bytes (hasGroups + textOffset + textCount + imageOffset + imageCount)
|
|
90
|
+
const vectorSize = 4 + (data.dimensions * 4); // id + vector
|
|
91
|
+
// Calculate offsets and total size
|
|
92
|
+
const textOffset = headerSize;
|
|
93
|
+
const imageOffset = textOffset + (data.textVectors.length * vectorSize);
|
|
94
|
+
const totalSize = imageOffset + (data.imageVectors.length * vectorSize);
|
|
95
|
+
const buffer = new ArrayBuffer(totalSize);
|
|
96
|
+
const view = new DataView(buffer);
|
|
97
|
+
let offset = 0;
|
|
98
|
+
// Write extended header (44 bytes, all little-endian)
|
|
99
|
+
if (offset + 40 > buffer.byteLength) {
|
|
100
|
+
throw new Error(`Header write would exceed buffer bounds: offset=${offset}, headerSize=40, bufferSize=${buffer.byteLength}`);
|
|
101
|
+
}
|
|
102
|
+
view.setUint32(offset, data.dimensions, true);
|
|
103
|
+
offset += 4;
|
|
104
|
+
view.setUint32(offset, data.maxElements, true);
|
|
105
|
+
offset += 4;
|
|
106
|
+
view.setUint32(offset, data.M, true);
|
|
107
|
+
offset += 4;
|
|
108
|
+
view.setUint32(offset, data.efConstruction, true);
|
|
109
|
+
offset += 4;
|
|
110
|
+
view.setUint32(offset, data.seed, true);
|
|
111
|
+
offset += 4;
|
|
112
|
+
view.setUint32(offset, data.currentSize, true);
|
|
113
|
+
offset += 4;
|
|
114
|
+
// Extended fields
|
|
115
|
+
view.setUint32(offset, 1, true);
|
|
116
|
+
offset += 4; // hasGroups = 1
|
|
117
|
+
view.setUint32(offset, textOffset, true);
|
|
118
|
+
offset += 4;
|
|
119
|
+
view.setUint32(offset, data.textVectors.length, true);
|
|
120
|
+
offset += 4;
|
|
121
|
+
view.setUint32(offset, imageOffset, true);
|
|
122
|
+
offset += 4;
|
|
123
|
+
view.setUint32(offset, data.imageVectors.length, true);
|
|
124
|
+
offset += 4;
|
|
125
|
+
// Write text vectors
|
|
126
|
+
for (const item of data.textVectors) {
|
|
127
|
+
// Ensure 4-byte alignment
|
|
128
|
+
if (offset % 4 !== 0) {
|
|
129
|
+
throw new Error(`Offset ${offset} is not 4-byte aligned`);
|
|
130
|
+
}
|
|
131
|
+
// Check bounds before writing
|
|
132
|
+
if (offset + 4 > buffer.byteLength) {
|
|
133
|
+
throw new Error(`ID write would exceed buffer bounds: offset=${offset}, bufferSize=${buffer.byteLength}`);
|
|
134
|
+
}
|
|
135
|
+
// Write vector ID
|
|
136
|
+
view.setUint32(offset, item.id, true);
|
|
137
|
+
offset += 4;
|
|
138
|
+
// Check bounds for vector data
|
|
139
|
+
const vectorDataSize = item.vector.length * 4;
|
|
140
|
+
if (offset + vectorDataSize > buffer.byteLength) {
|
|
141
|
+
throw new Error(`Vector data write would exceed buffer bounds: offset=${offset}, dataSize=${vectorDataSize}, bufferSize=${buffer.byteLength}`);
|
|
142
|
+
}
|
|
143
|
+
// Write vector data
|
|
144
|
+
for (let i = 0; i < item.vector.length; i++) {
|
|
145
|
+
view.setFloat32(offset, item.vector[i], true);
|
|
146
|
+
offset += 4;
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
// Write image vectors
|
|
150
|
+
for (const item of data.imageVectors) {
|
|
151
|
+
// Ensure 4-byte alignment
|
|
152
|
+
if (offset % 4 !== 0) {
|
|
153
|
+
throw new Error(`Offset ${offset} is not 4-byte aligned`);
|
|
154
|
+
}
|
|
155
|
+
// Check bounds before writing
|
|
156
|
+
if (offset + 4 > buffer.byteLength) {
|
|
157
|
+
throw new Error(`ID write would exceed buffer bounds: offset=${offset}, bufferSize=${buffer.byteLength}`);
|
|
158
|
+
}
|
|
159
|
+
// Write vector ID
|
|
160
|
+
view.setUint32(offset, item.id, true);
|
|
161
|
+
offset += 4;
|
|
162
|
+
// Check bounds for vector data
|
|
163
|
+
const vectorDataSize = item.vector.length * 4;
|
|
164
|
+
if (offset + vectorDataSize > buffer.byteLength) {
|
|
165
|
+
throw new Error(`Vector data write would exceed buffer bounds: offset=${offset}, dataSize=${vectorDataSize}, bufferSize=${buffer.byteLength}`);
|
|
166
|
+
}
|
|
167
|
+
// Write vector data
|
|
168
|
+
for (let i = 0; i < item.vector.length; i++) {
|
|
169
|
+
view.setFloat32(offset, item.vector[i], true);
|
|
170
|
+
offset += 4;
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
// Write to file
|
|
174
|
+
writeFileSync(indexPath, Buffer.from(buffer));
|
|
175
|
+
}
|
|
176
|
+
/**
|
|
177
|
+
* Load index data from binary format (supports both original and grouped formats)
|
|
178
|
+
*
|
|
179
|
+
* Uses zero-copy Float32Array views for efficient loading.
|
|
180
|
+
* Copies the views to ensure data persistence after buffer lifecycle.
|
|
181
|
+
*
|
|
182
|
+
* @param indexPath Path to the binary index file
|
|
183
|
+
* @returns Deserialized index data
|
|
184
|
+
*/
|
|
185
|
+
static async load(indexPath) {
|
|
186
|
+
const buffer = readFileSync(indexPath);
|
|
187
|
+
const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
|
|
188
|
+
let offset = 0;
|
|
189
|
+
// Read basic header (24 bytes, all little-endian)
|
|
190
|
+
const dimensions = view.getUint32(offset, true);
|
|
191
|
+
offset += 4;
|
|
192
|
+
const maxElements = view.getUint32(offset, true);
|
|
193
|
+
offset += 4;
|
|
194
|
+
const M = view.getUint32(offset, true);
|
|
195
|
+
offset += 4;
|
|
196
|
+
const efConstruction = view.getUint32(offset, true);
|
|
197
|
+
offset += 4;
|
|
198
|
+
const seed = view.getUint32(offset, true);
|
|
199
|
+
offset += 4;
|
|
200
|
+
const currentSize = view.getUint32(offset, true);
|
|
201
|
+
offset += 4;
|
|
202
|
+
// Check if this is the extended grouped format (40+ bytes header)
|
|
203
|
+
const hasGroups = buffer.byteLength >= 40 ? view.getUint32(offset, true) : 0;
|
|
204
|
+
if (hasGroups === 1 && buffer.byteLength >= 40) {
|
|
205
|
+
// Load grouped format
|
|
206
|
+
const textOffset = view.getUint32(offset + 4, true);
|
|
207
|
+
const textCount = view.getUint32(offset + 8, true);
|
|
208
|
+
const imageOffset = view.getUint32(offset + 12, true);
|
|
209
|
+
const imageCount = view.getUint32(offset + 16, true);
|
|
210
|
+
// Load text vectors
|
|
211
|
+
const textVectors = [];
|
|
212
|
+
offset = textOffset;
|
|
213
|
+
for (let i = 0; i < textCount; i++) {
|
|
214
|
+
// Ensure 4-byte alignment
|
|
215
|
+
if (offset % 4 !== 0) {
|
|
216
|
+
throw new Error(`Offset ${offset} is not 4-byte aligned`);
|
|
217
|
+
}
|
|
218
|
+
// Read vector ID
|
|
219
|
+
const id = view.getUint32(offset, true);
|
|
220
|
+
offset += 4;
|
|
221
|
+
// Zero-copy Float32Array view
|
|
222
|
+
const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
|
|
223
|
+
// Copy to avoid buffer lifecycle issues
|
|
224
|
+
const vector = new Float32Array(vectorView);
|
|
225
|
+
offset += dimensions * 4;
|
|
226
|
+
textVectors.push({ id, vector });
|
|
227
|
+
}
|
|
228
|
+
// Load image vectors
|
|
229
|
+
const imageVectors = [];
|
|
230
|
+
offset = imageOffset;
|
|
231
|
+
for (let i = 0; i < imageCount; i++) {
|
|
232
|
+
// Ensure 4-byte alignment
|
|
233
|
+
if (offset % 4 !== 0) {
|
|
234
|
+
throw new Error(`Offset ${offset} is not 4-byte aligned`);
|
|
235
|
+
}
|
|
236
|
+
// Read vector ID
|
|
237
|
+
const id = view.getUint32(offset, true);
|
|
238
|
+
offset += 4;
|
|
239
|
+
// Zero-copy Float32Array view
|
|
240
|
+
const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
|
|
241
|
+
// Copy to avoid buffer lifecycle issues
|
|
242
|
+
const vector = new Float32Array(vectorView);
|
|
243
|
+
offset += dimensions * 4;
|
|
244
|
+
imageVectors.push({ id, vector });
|
|
245
|
+
}
|
|
246
|
+
// Combine all vectors for backward compatibility
|
|
247
|
+
const allVectors = [...textVectors, ...imageVectors];
|
|
248
|
+
return {
|
|
249
|
+
dimensions,
|
|
250
|
+
maxElements,
|
|
251
|
+
M,
|
|
252
|
+
efConstruction,
|
|
253
|
+
seed,
|
|
254
|
+
currentSize,
|
|
255
|
+
vectors: allVectors,
|
|
256
|
+
hasContentTypeGroups: true,
|
|
257
|
+
textVectors,
|
|
258
|
+
imageVectors
|
|
259
|
+
};
|
|
260
|
+
}
|
|
261
|
+
else {
|
|
262
|
+
// Load original format
|
|
263
|
+
const vectors = [];
|
|
264
|
+
for (let i = 0; i < currentSize; i++) {
|
|
265
|
+
// Ensure 4-byte alignment (should always be true with our format)
|
|
266
|
+
if (offset % 4 !== 0) {
|
|
267
|
+
throw new Error(`Offset ${offset} is not 4-byte aligned`);
|
|
268
|
+
}
|
|
269
|
+
// Read vector ID
|
|
270
|
+
const id = view.getUint32(offset, true);
|
|
271
|
+
offset += 4;
|
|
272
|
+
// Zero-copy Float32Array view (fast!)
|
|
273
|
+
const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
|
|
274
|
+
// Copy to avoid buffer lifecycle issues
|
|
275
|
+
const vector = new Float32Array(vectorView);
|
|
276
|
+
offset += dimensions * 4;
|
|
277
|
+
vectors.push({ id, vector });
|
|
278
|
+
}
|
|
279
|
+
return {
|
|
280
|
+
dimensions,
|
|
281
|
+
maxElements,
|
|
282
|
+
M,
|
|
283
|
+
efConstruction,
|
|
284
|
+
seed,
|
|
285
|
+
currentSize,
|
|
286
|
+
vectors
|
|
287
|
+
};
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
//# sourceMappingURL=binary-index-format.js.map
|
|
@@ -162,9 +162,13 @@ export declare class IngestionPipeline {
|
|
|
162
162
|
*/
|
|
163
163
|
private storeDocumentsAndChunksWithContentTypes;
|
|
164
164
|
/**
|
|
165
|
-
* Update vector index with new embeddings
|
|
165
|
+
* Update vector index with new embeddings (supports grouped content type storage)
|
|
166
166
|
*/
|
|
167
167
|
private updateVectorIndex;
|
|
168
|
+
/**
|
|
169
|
+
* Filter documents based on ingestion mode to avoid processing incompatible content types
|
|
170
|
+
*/
|
|
171
|
+
private filterDocumentsByMode;
|
|
168
172
|
/**
|
|
169
173
|
* Converts MIME type to simple content type for embedding function
|
|
170
174
|
* @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
|
|
@@ -287,21 +287,30 @@ export class IngestionPipeline {
|
|
|
287
287
|
try {
|
|
288
288
|
// Phase 1: File Discovery and Processing with Content-Type Detection
|
|
289
289
|
console.log('\n--- Phase 1: File Discovery and Processing ---');
|
|
290
|
-
const
|
|
291
|
-
|
|
290
|
+
const mode = options.mode || 'text';
|
|
291
|
+
const fileOptions = {
|
|
292
|
+
recursive: true,
|
|
293
|
+
maxFileSize: 10 * 1024 * 1024, // 10MB
|
|
294
|
+
...options.fileOptions,
|
|
295
|
+
mode
|
|
296
|
+
};
|
|
297
|
+
const fileResult = await discoverAndProcessFiles(path, fileOptions, this.pathManager);
|
|
298
|
+
// Additional filtering as fallback (should be minimal with mode-aware discovery)
|
|
299
|
+
const filteredResult = this.filterDocumentsByMode(fileResult, mode);
|
|
300
|
+
if (filteredResult.documents.length === 0) {
|
|
292
301
|
console.log('No documents found to process');
|
|
293
302
|
return {
|
|
294
303
|
documentsProcessed: 0,
|
|
295
304
|
chunksCreated: 0,
|
|
296
305
|
embeddingsGenerated: 0,
|
|
297
|
-
documentErrors:
|
|
306
|
+
documentErrors: filteredResult.processingResult.errors.length,
|
|
298
307
|
embeddingErrors: 0,
|
|
299
308
|
processingTimeMs: Date.now() - startTime,
|
|
300
309
|
contentIds: []
|
|
301
310
|
};
|
|
302
311
|
}
|
|
303
312
|
// Content-type detection and routing
|
|
304
|
-
const contentTypeStats = this.analyzeContentTypes(
|
|
313
|
+
const contentTypeStats = this.analyzeContentTypes(filteredResult.documents);
|
|
305
314
|
console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
|
|
306
315
|
// Phase 2: Document Chunking with Content-Type Awareness
|
|
307
316
|
console.log('\n--- Phase 2: Document Chunking ---');
|
|
@@ -309,7 +318,7 @@ export class IngestionPipeline {
|
|
|
309
318
|
chunkSize: config.chunk_size,
|
|
310
319
|
chunkOverlap: config.chunk_overlap
|
|
311
320
|
};
|
|
312
|
-
const chunkingResult = await this.chunkDocumentsWithContentTypes(
|
|
321
|
+
const chunkingResult = await this.chunkDocumentsWithContentTypes(filteredResult.documents, effectiveChunkConfig, options.mode);
|
|
313
322
|
if (chunkingResult.totalChunks === 0) {
|
|
314
323
|
console.log('No chunks created from documents');
|
|
315
324
|
return {
|
|
@@ -334,10 +343,10 @@ export class IngestionPipeline {
|
|
|
334
343
|
const endTime = Date.now();
|
|
335
344
|
const processingTimeMs = endTime - startTime;
|
|
336
345
|
const result = {
|
|
337
|
-
documentsProcessed:
|
|
346
|
+
documentsProcessed: filteredResult.documents.length,
|
|
338
347
|
chunksCreated: chunkingResult.totalChunks,
|
|
339
348
|
embeddingsGenerated: embeddingResult.embeddings.length,
|
|
340
|
-
documentErrors:
|
|
349
|
+
documentErrors: filteredResult.processingResult.errors.length,
|
|
341
350
|
embeddingErrors: embeddingResult.errors,
|
|
342
351
|
processingTimeMs,
|
|
343
352
|
contentIds
|
|
@@ -595,16 +604,35 @@ export class IngestionPipeline {
|
|
|
595
604
|
return contentIds;
|
|
596
605
|
}
|
|
597
606
|
/**
|
|
598
|
-
* Update vector index with new embeddings
|
|
607
|
+
* Update vector index with new embeddings (supports grouped content type storage)
|
|
599
608
|
*/
|
|
600
609
|
async updateVectorIndex(embeddings) {
|
|
610
|
+
console.log('updateVectorIndex called with', embeddings.length, 'embeddings');
|
|
601
611
|
if (embeddings.length === 0) {
|
|
602
612
|
console.log('No embeddings to add to vector index');
|
|
603
613
|
return;
|
|
604
614
|
}
|
|
605
615
|
console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
|
|
606
616
|
try {
|
|
607
|
-
|
|
617
|
+
// Group embeddings by content type for optimized storage
|
|
618
|
+
const groupedEmbeddings = embeddings.reduce((groups, embedding) => {
|
|
619
|
+
const contentType = embedding.contentType || 'text';
|
|
620
|
+
if (!groups[contentType]) {
|
|
621
|
+
groups[contentType] = [];
|
|
622
|
+
}
|
|
623
|
+
groups[contentType].push(embedding);
|
|
624
|
+
return groups;
|
|
625
|
+
}, {});
|
|
626
|
+
const textEmbeddings = groupedEmbeddings.text || [];
|
|
627
|
+
const imageEmbeddings = groupedEmbeddings.image || [];
|
|
628
|
+
console.log(`Grouped: ${textEmbeddings.length} text, ${imageEmbeddings.length} image vectors`);
|
|
629
|
+
// Use grouped storage method if available, fallback to regular method
|
|
630
|
+
if (this.indexManager.addGroupedEmbeddings) {
|
|
631
|
+
await this.indexManager.addGroupedEmbeddings(textEmbeddings, imageEmbeddings);
|
|
632
|
+
}
|
|
633
|
+
else {
|
|
634
|
+
await this.indexManager.addVectors(embeddings);
|
|
635
|
+
}
|
|
608
636
|
console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
|
|
609
637
|
}
|
|
610
638
|
catch (error) {
|
|
@@ -612,6 +640,45 @@ export class IngestionPipeline {
|
|
|
612
640
|
throw error;
|
|
613
641
|
}
|
|
614
642
|
}
|
|
643
|
+
/**
|
|
644
|
+
* Filter documents based on ingestion mode to avoid processing incompatible content types
|
|
645
|
+
*/
|
|
646
|
+
filterDocumentsByMode(fileResult, mode) {
|
|
647
|
+
if (mode === 'multimodal') {
|
|
648
|
+
// In multimodal mode, keep all documents
|
|
649
|
+
return fileResult;
|
|
650
|
+
}
|
|
651
|
+
// In text mode, filter out image documents
|
|
652
|
+
const filteredDocuments = fileResult.documents.filter(doc => {
|
|
653
|
+
const contentType = doc.metadata?.contentType || 'text';
|
|
654
|
+
const isCompatible = contentType === 'text' ||
|
|
655
|
+
contentType.startsWith('text/') ||
|
|
656
|
+
contentType === 'application/pdf' ||
|
|
657
|
+
contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
|
|
658
|
+
if (!isCompatible) {
|
|
659
|
+
console.log(`⚠️ Skipping ${doc.source} (${contentType}) - not compatible with text mode`);
|
|
660
|
+
}
|
|
661
|
+
return isCompatible;
|
|
662
|
+
});
|
|
663
|
+
// Update processing result to reflect filtering
|
|
664
|
+
const filteredProcessingResult = {
|
|
665
|
+
...fileResult.processingResult,
|
|
666
|
+
skippedFiles: [
|
|
667
|
+
...(fileResult.processingResult.skippedFiles || []),
|
|
668
|
+
...fileResult.documents
|
|
669
|
+
.filter(doc => !filteredDocuments.includes(doc))
|
|
670
|
+
.map(doc => ({
|
|
671
|
+
path: doc.source,
|
|
672
|
+
reason: `Content type not compatible with ${mode} mode`
|
|
673
|
+
}))
|
|
674
|
+
]
|
|
675
|
+
};
|
|
676
|
+
return {
|
|
677
|
+
documents: filteredDocuments,
|
|
678
|
+
discoveryResult: fileResult.discoveryResult,
|
|
679
|
+
processingResult: filteredProcessingResult
|
|
680
|
+
};
|
|
681
|
+
}
|
|
615
682
|
/**
|
|
616
683
|
* Converts MIME type to simple content type for embedding function
|
|
617
684
|
* @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
|
|
@@ -105,7 +105,7 @@ export class ModelValidator {
|
|
|
105
105
|
}
|
|
106
106
|
// Fallback: try to detect from package.json import
|
|
107
107
|
try {
|
|
108
|
-
const packageInfo = await import('@huggingface/transformers/package.json');
|
|
108
|
+
const packageInfo = await import('@huggingface/transformers/package.json' + '');
|
|
109
109
|
if (packageInfo.version) {
|
|
110
110
|
this.currentTransformersVersion = packageInfo.version;
|
|
111
111
|
return packageInfo.version;
|
|
@@ -194,7 +194,7 @@ export class TextDerivedRerankingStrategy {
|
|
|
194
194
|
catch (error) {
|
|
195
195
|
console.warn(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
196
196
|
// Fallback to filename-based description
|
|
197
|
-
const filename = imagePath.split('/').pop() || imagePath
|
|
197
|
+
const filename = imagePath.split('/').pop() || imagePath;
|
|
198
198
|
return `Image file: ${filename}`;
|
|
199
199
|
}
|
|
200
200
|
}
|
|
@@ -211,17 +211,16 @@ export class TextDerivedRerankingStrategy {
|
|
|
211
211
|
// Step 1: Convert images to text descriptions
|
|
212
212
|
const processedResults = await Promise.all(results.map(async (result) => {
|
|
213
213
|
if (result.contentType === 'image') {
|
|
214
|
-
// Generate text description for image
|
|
215
|
-
const description = await this.generateImageDescription(result.
|
|
214
|
+
// Generate text description for image
|
|
215
|
+
const description = await this.generateImageDescription(result.content);
|
|
216
216
|
return {
|
|
217
217
|
...result,
|
|
218
218
|
content: description,
|
|
219
|
-
contentType: 'text', // Change to 'text' so cross-encoder will process it
|
|
220
219
|
originalContent: result.content,
|
|
221
220
|
originalContentType: result.contentType,
|
|
222
221
|
metadata: {
|
|
223
222
|
...result.metadata,
|
|
224
|
-
originalImagePath: result.
|
|
223
|
+
originalImagePath: result.content,
|
|
225
224
|
generatedDescription: description
|
|
226
225
|
}
|
|
227
226
|
};
|
|
@@ -139,7 +139,8 @@ export class SearchEngine {
|
|
|
139
139
|
const searchStartTime = performance.now();
|
|
140
140
|
let searchResult;
|
|
141
141
|
try {
|
|
142
|
-
|
|
142
|
+
const contentType = options.contentType;
|
|
143
|
+
searchResult = this.indexManager.search(queryVector, topK, contentType);
|
|
143
144
|
}
|
|
144
145
|
catch (error) {
|
|
145
146
|
if (error instanceof Error && error.message.includes('No embedding ID found for hash')) {
|
|
@@ -64,5 +64,9 @@ export declare class VectorIndex {
|
|
|
64
64
|
* Resize index to accommodate more vectors
|
|
65
65
|
*/
|
|
66
66
|
resizeIndex(newMaxElements: number): void;
|
|
67
|
+
/**
|
|
68
|
+
* Get index options (for external access to configuration)
|
|
69
|
+
*/
|
|
70
|
+
getOptions(): VectorIndexOptions;
|
|
67
71
|
}
|
|
68
72
|
//# sourceMappingURL=vector-index.d.ts.map
|
|
@@ -81,7 +81,8 @@ export class VectorIndex {
|
|
|
81
81
|
originalConsoleError.apply(console, args);
|
|
82
82
|
};
|
|
83
83
|
try {
|
|
84
|
-
const
|
|
84
|
+
const hnswlibModule = await import('hnswlib-wasm/dist/hnswlib.js');
|
|
85
|
+
const { loadHnswlib } = hnswlibModule;
|
|
85
86
|
this.hnswlib = await loadHnswlib();
|
|
86
87
|
}
|
|
87
88
|
finally {
|
|
@@ -143,7 +144,8 @@ export class VectorIndex {
|
|
|
143
144
|
originalConsoleError.apply(console, args);
|
|
144
145
|
};
|
|
145
146
|
try {
|
|
146
|
-
const
|
|
147
|
+
const hnswlibModule = await import('hnswlib-wasm/dist/hnswlib.js');
|
|
148
|
+
const { loadHnswlib } = hnswlibModule;
|
|
147
149
|
this.hnswlib = await loadHnswlib();
|
|
148
150
|
}
|
|
149
151
|
finally {
|
|
@@ -321,5 +323,11 @@ export class VectorIndex {
|
|
|
321
323
|
throw new Error(`Failed to resize index: ${error}`);
|
|
322
324
|
}
|
|
323
325
|
}
|
|
326
|
+
/**
|
|
327
|
+
* Get index options (for external access to configuration)
|
|
328
|
+
*/
|
|
329
|
+
getOptions() {
|
|
330
|
+
return { ...this.options };
|
|
331
|
+
}
|
|
324
332
|
}
|
|
325
333
|
//# sourceMappingURL=vector-index.js.map
|
|
@@ -8,6 +8,8 @@ export interface FileProcessorOptions {
|
|
|
8
8
|
recursive?: boolean;
|
|
9
9
|
/** Maximum file size in bytes (default: 10MB) */
|
|
10
10
|
maxFileSize?: number;
|
|
11
|
+
/** Processing mode to filter compatible files */
|
|
12
|
+
mode?: 'text' | 'multimodal';
|
|
11
13
|
}
|
|
12
14
|
/**
|
|
13
15
|
* Default options for file processing
|