rag-lite-ts 2.1.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -5
- package/dist/{cli → cjs/cli}/indexer.js +73 -15
- package/dist/cjs/cli/ui-server.d.ts +5 -0
- package/dist/cjs/cli/ui-server.js +152 -0
- package/dist/{cli.js → cjs/cli.js} +25 -6
- package/dist/{core → cjs/core}/binary-index-format.js +6 -3
- package/dist/{core → cjs/core}/db.d.ts +56 -0
- package/dist/{core → cjs/core}/db.js +105 -0
- package/dist/{core → cjs/core}/ingestion.js +3 -0
- package/dist/cjs/core/knowledge-base-manager.d.ts +109 -0
- package/dist/cjs/core/knowledge-base-manager.js +256 -0
- package/dist/{core → cjs/core}/model-validator.js +1 -1
- package/dist/{core → cjs/core}/search-pipeline.js +1 -1
- package/dist/{core → cjs/core}/search.js +1 -1
- package/dist/cjs/core/vector-index-messages.d.ts +52 -0
- package/dist/cjs/core/vector-index-messages.js +5 -0
- package/dist/cjs/core/vector-index-worker.d.ts +6 -0
- package/dist/cjs/core/vector-index-worker.js +304 -0
- package/dist/cjs/core/vector-index.d.ts +107 -0
- package/dist/cjs/core/vector-index.js +344 -0
- package/dist/{factories → cjs/factories}/ingestion-factory.js +3 -7
- package/dist/{factories → cjs/factories}/search-factory.js +11 -0
- package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +23 -3
- package/dist/{index-manager.js → cjs/index-manager.js} +84 -15
- package/dist/{index.d.ts → cjs/index.d.ts} +2 -1
- package/dist/{index.js → cjs/index.js} +3 -1
- package/dist/esm/api-errors.d.ts +90 -0
- package/dist/esm/api-errors.js +320 -0
- package/dist/esm/cli/indexer.d.ts +11 -0
- package/dist/esm/cli/indexer.js +529 -0
- package/dist/esm/cli/search.d.ts +7 -0
- package/dist/esm/cli/search.js +332 -0
- package/dist/esm/cli/ui-server.d.ts +5 -0
- package/dist/esm/cli/ui-server.js +152 -0
- package/dist/esm/cli.d.ts +3 -0
- package/dist/esm/cli.js +548 -0
- package/dist/esm/config.d.ts +51 -0
- package/dist/esm/config.js +79 -0
- package/dist/esm/core/abstract-embedder.d.ts +125 -0
- package/dist/esm/core/abstract-embedder.js +264 -0
- package/dist/esm/core/actionable-error-messages.d.ts +60 -0
- package/dist/esm/core/actionable-error-messages.js +397 -0
- package/dist/esm/core/adapters.d.ts +93 -0
- package/dist/esm/core/adapters.js +139 -0
- package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/esm/core/batch-processing-optimizer.js +536 -0
- package/dist/esm/core/binary-index-format.d.ts +78 -0
- package/dist/esm/core/binary-index-format.js +294 -0
- package/dist/esm/core/chunker.d.ts +119 -0
- package/dist/esm/core/chunker.js +73 -0
- package/dist/esm/core/cli-database-utils.d.ts +53 -0
- package/dist/esm/core/cli-database-utils.js +239 -0
- package/dist/esm/core/config.d.ts +102 -0
- package/dist/esm/core/config.js +247 -0
- package/dist/esm/core/content-errors.d.ts +111 -0
- package/dist/esm/core/content-errors.js +362 -0
- package/dist/esm/core/content-manager.d.ts +335 -0
- package/dist/esm/core/content-manager.js +1476 -0
- package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
- package/dist/esm/core/content-performance-optimizer.js +516 -0
- package/dist/esm/core/content-resolver.d.ts +104 -0
- package/dist/esm/core/content-resolver.js +285 -0
- package/dist/esm/core/cross-modal-search.d.ts +164 -0
- package/dist/esm/core/cross-modal-search.js +342 -0
- package/dist/esm/core/database-connection-manager.d.ts +109 -0
- package/dist/esm/core/database-connection-manager.js +310 -0
- package/dist/esm/core/db.d.ts +269 -0
- package/dist/esm/core/db.js +1000 -0
- package/dist/esm/core/embedder-factory.d.ts +154 -0
- package/dist/esm/core/embedder-factory.js +311 -0
- package/dist/esm/core/error-handler.d.ts +112 -0
- package/dist/esm/core/error-handler.js +239 -0
- package/dist/esm/core/index.d.ts +59 -0
- package/dist/esm/core/index.js +69 -0
- package/dist/esm/core/ingestion.d.ts +202 -0
- package/dist/esm/core/ingestion.js +904 -0
- package/dist/esm/core/interfaces.d.ts +408 -0
- package/dist/esm/core/interfaces.js +106 -0
- package/dist/esm/core/knowledge-base-manager.d.ts +109 -0
- package/dist/esm/core/knowledge-base-manager.js +256 -0
- package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
- package/dist/esm/core/lazy-dependency-loader.js +435 -0
- package/dist/esm/core/mode-detection-service.d.ts +150 -0
- package/dist/esm/core/mode-detection-service.js +565 -0
- package/dist/esm/core/mode-model-validator.d.ts +92 -0
- package/dist/esm/core/mode-model-validator.js +203 -0
- package/dist/esm/core/model-registry.d.ts +116 -0
- package/dist/esm/core/model-registry.js +411 -0
- package/dist/esm/core/model-validator.d.ts +217 -0
- package/dist/esm/core/model-validator.js +782 -0
- package/dist/esm/core/path-manager.d.ts +47 -0
- package/dist/esm/core/path-manager.js +71 -0
- package/dist/esm/core/raglite-paths.d.ts +121 -0
- package/dist/esm/core/raglite-paths.js +145 -0
- package/dist/esm/core/reranking-config.d.ts +42 -0
- package/dist/esm/core/reranking-config.js +147 -0
- package/dist/esm/core/reranking-factory.d.ts +92 -0
- package/dist/esm/core/reranking-factory.js +410 -0
- package/dist/esm/core/reranking-strategies.d.ts +310 -0
- package/dist/esm/core/reranking-strategies.js +650 -0
- package/dist/esm/core/resource-cleanup.d.ts +163 -0
- package/dist/esm/core/resource-cleanup.js +371 -0
- package/dist/esm/core/resource-manager.d.ts +212 -0
- package/dist/esm/core/resource-manager.js +564 -0
- package/dist/esm/core/search-pipeline.d.ts +111 -0
- package/dist/esm/core/search-pipeline.js +287 -0
- package/dist/esm/core/search.d.ts +141 -0
- package/dist/esm/core/search.js +320 -0
- package/dist/esm/core/streaming-operations.d.ts +145 -0
- package/dist/esm/core/streaming-operations.js +409 -0
- package/dist/esm/core/types.d.ts +66 -0
- package/dist/esm/core/types.js +6 -0
- package/dist/esm/core/universal-embedder.d.ts +177 -0
- package/dist/esm/core/universal-embedder.js +139 -0
- package/dist/esm/core/validation-messages.d.ts +99 -0
- package/dist/esm/core/validation-messages.js +334 -0
- package/dist/esm/core/vector-index-messages.d.ts +52 -0
- package/dist/esm/core/vector-index-messages.js +5 -0
- package/dist/esm/core/vector-index-worker.d.ts +6 -0
- package/dist/esm/core/vector-index-worker.js +304 -0
- package/dist/esm/core/vector-index.d.ts +107 -0
- package/dist/esm/core/vector-index.js +344 -0
- package/dist/esm/dom-polyfills.d.ts +6 -0
- package/dist/esm/dom-polyfills.js +37 -0
- package/dist/esm/factories/index.d.ts +27 -0
- package/dist/esm/factories/index.js +29 -0
- package/dist/esm/factories/ingestion-factory.d.ts +200 -0
- package/dist/esm/factories/ingestion-factory.js +473 -0
- package/dist/esm/factories/search-factory.d.ts +154 -0
- package/dist/esm/factories/search-factory.js +355 -0
- package/dist/esm/file-processor.d.ts +147 -0
- package/dist/esm/file-processor.js +963 -0
- package/dist/esm/index-manager.d.ts +136 -0
- package/dist/esm/index-manager.js +667 -0
- package/dist/esm/index.d.ts +76 -0
- package/dist/esm/index.js +112 -0
- package/dist/esm/indexer.d.ts +7 -0
- package/dist/esm/indexer.js +54 -0
- package/dist/esm/ingestion.d.ts +63 -0
- package/dist/esm/ingestion.js +124 -0
- package/dist/esm/mcp-server.d.ts +46 -0
- package/dist/esm/mcp-server.js +1820 -0
- package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
- package/dist/esm/multimodal/clip-embedder.js +996 -0
- package/dist/esm/multimodal/index.d.ts +6 -0
- package/dist/esm/multimodal/index.js +6 -0
- package/dist/esm/preprocess.d.ts +19 -0
- package/dist/esm/preprocess.js +203 -0
- package/dist/esm/preprocessors/index.d.ts +17 -0
- package/dist/esm/preprocessors/index.js +38 -0
- package/dist/esm/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/preprocessors/mdx.js +101 -0
- package/dist/esm/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/preprocessors/mermaid.js +329 -0
- package/dist/esm/preprocessors/registry.d.ts +56 -0
- package/dist/esm/preprocessors/registry.js +179 -0
- package/dist/esm/run-error-recovery-tests.d.ts +7 -0
- package/dist/esm/run-error-recovery-tests.js +101 -0
- package/dist/esm/search-standalone.d.ts +7 -0
- package/dist/esm/search-standalone.js +117 -0
- package/dist/esm/search.d.ts +99 -0
- package/dist/esm/search.js +177 -0
- package/dist/esm/test-utils.d.ts +18 -0
- package/dist/esm/test-utils.js +27 -0
- package/dist/esm/text/chunker.d.ts +33 -0
- package/dist/esm/text/chunker.js +279 -0
- package/dist/esm/text/embedder.d.ts +111 -0
- package/dist/esm/text/embedder.js +386 -0
- package/dist/esm/text/index.d.ts +8 -0
- package/dist/esm/text/index.js +9 -0
- package/dist/esm/text/preprocessors/index.d.ts +17 -0
- package/dist/esm/text/preprocessors/index.js +38 -0
- package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/text/preprocessors/mdx.js +101 -0
- package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/text/preprocessors/mermaid.js +330 -0
- package/dist/esm/text/preprocessors/registry.d.ts +56 -0
- package/dist/esm/text/preprocessors/registry.js +180 -0
- package/dist/esm/text/reranker.d.ts +49 -0
- package/dist/esm/text/reranker.js +274 -0
- package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/esm/text/sentence-transformer-embedder.js +340 -0
- package/dist/esm/text/tokenizer.d.ts +22 -0
- package/dist/esm/text/tokenizer.js +64 -0
- package/dist/esm/types.d.ts +83 -0
- package/dist/esm/types.js +3 -0
- package/dist/esm/utils/vector-math.d.ts +31 -0
- package/dist/esm/utils/vector-math.js +70 -0
- package/package.json +39 -14
- package/dist/core/vector-index.d.ts +0 -72
- package/dist/core/vector-index.js +0 -331
- /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
- /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
- /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/search.js +0 -0
- /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
- /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
- /package/dist/{config.js → cjs/config.js} +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
- /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
- /package/dist/{core → cjs/core}/adapters.js +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/binary-index-format.d.ts +0 -0
- /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
- /package/dist/{core → cjs/core}/chunker.js +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
- /package/dist/{core → cjs/core}/config.d.ts +0 -0
- /package/dist/{core → cjs/core}/config.js +0 -0
- /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-errors.js +0 -0
- /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-manager.js +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-resolver.js +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
- /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
- /package/dist/{core → cjs/core}/error-handler.js +0 -0
- /package/dist/{core → cjs/core}/index.d.ts +0 -0
- /package/dist/{core → cjs/core}/index.js +0 -0
- /package/dist/{core → cjs/core}/ingestion.d.ts +0 -0
- /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
- /package/dist/{core → cjs/core}/interfaces.js +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
- /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
- /package/dist/{core → cjs/core}/model-registry.js +0 -0
- /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.js +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
- /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-config.js +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.js +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
- /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-manager.js +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
- /package/dist/{core → cjs/core}/search.d.ts +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
- /package/dist/{core → cjs/core}/types.d.ts +0 -0
- /package/dist/{core → cjs/core}/types.js +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
- /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/validation-messages.js +0 -0
- /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
- /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
- /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/index.js +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
- /package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +0 -0
- /package/dist/{file-processor.js → cjs/file-processor.js} +0 -0
- /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
- /package/dist/{indexer.js → cjs/indexer.js} +0 -0
- /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
- /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
- /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
- /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
- /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
- /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
- /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
- /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
- /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
- /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
- /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
- /package/dist/{search.js → cjs/search.js} +0 -0
- /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
- /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
- /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
- /package/dist/{text → cjs/text}/chunker.js +0 -0
- /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/embedder.js +0 -0
- /package/dist/{text → cjs/text}/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
- /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
- /package/dist/{text → cjs/text}/reranker.js +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
- /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
- /package/dist/{text → cjs/text}/tokenizer.js +0 -0
- /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
- /package/dist/{types.js → cjs/types.js} +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
|
@@ -0,0 +1,904 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CORE MODULE — Shared between text-only (rag-lite-ts) and future multimodal (rag-lite-mm)
|
|
3
|
+
* Model-agnostic. No transformer or modality-specific logic.
|
|
4
|
+
*/
|
|
5
|
+
import { discoverAndProcessFiles } from '../file-processor.js';
|
|
6
|
+
import { chunkDocument } from './chunker.js';
|
|
7
|
+
import { insertChunk, upsertDocument } from './db.js';
|
|
8
|
+
import { config } from './config.js';
|
|
9
|
+
import { DocumentPathManager } from './path-manager.js';
|
|
10
|
+
import { existsSync } from 'fs';
|
|
11
|
+
import { ContentManager } from './content-manager.js';
|
|
12
|
+
import { createRequire } from 'module';
|
|
13
|
+
// Create require for CommonJS modules in ES module context
|
|
14
|
+
const require = createRequire(import.meta.url);
|
|
15
|
+
/**
|
|
16
|
+
* Main ingestion pipeline class
|
|
17
|
+
* Coordinates the entire process from file discovery to vector storage
|
|
18
|
+
* Uses explicit dependency injection for clean architecture
|
|
19
|
+
*/
|
|
20
|
+
export class IngestionPipeline {
|
|
21
|
+
embedFn;
|
|
22
|
+
indexManager;
|
|
23
|
+
db;
|
|
24
|
+
defaultChunkConfig;
|
|
25
|
+
pathManager;
|
|
26
|
+
contentManager;
|
|
27
|
+
/**
|
|
28
|
+
* Creates a new IngestionPipeline with explicit dependency injection
|
|
29
|
+
* Enhanced with ContentManager integration for unified content system
|
|
30
|
+
*
|
|
31
|
+
* DEPENDENCY INJECTION PATTERN:
|
|
32
|
+
* This constructor requires all dependencies to be explicitly provided, enabling:
|
|
33
|
+
* - Clean separation between core ingestion logic and implementation-specific components
|
|
34
|
+
* - Support for different embedding models and content types
|
|
35
|
+
* - Testability through mock injection
|
|
36
|
+
* - Future extensibility for multimodal content processing
|
|
37
|
+
* - Unified content management for both filesystem and memory-based ingestion
|
|
38
|
+
*
|
|
39
|
+
* @param embedFn - Function to embed document chunks into vectors
|
|
40
|
+
* - Signature: (query: string, contentType?: string) => Promise<EmbeddingResult>
|
|
41
|
+
* - Must handle chunk text and return consistent embedding format
|
|
42
|
+
* - Examples:
|
|
43
|
+
* - Text: const embedFn = (text) => textEmbedder.embedSingle(text)
|
|
44
|
+
* - Multimodal: const embedFn = (content, type) => type === 'image' ? clipEmbedder.embedImage(content) : clipEmbedder.embedText(content)
|
|
45
|
+
* - Custom: const embedFn = (text) => customModel.embed(text)
|
|
46
|
+
*
|
|
47
|
+
* @param indexManager - Vector index manager for storing embeddings
|
|
48
|
+
* - Handles vector storage and indexing operations
|
|
49
|
+
* - Must support the embedding dimensions produced by embedFn
|
|
50
|
+
* - Example: new IndexManager('./index.bin')
|
|
51
|
+
*
|
|
52
|
+
* @param db - Database connection for metadata storage
|
|
53
|
+
* - Stores document and chunk metadata with content type support
|
|
54
|
+
* - Supports different content types through metadata fields
|
|
55
|
+
* - Example: await openDatabase('./db.sqlite')
|
|
56
|
+
*
|
|
57
|
+
* @param contentManager - Optional ContentManager for unified content system
|
|
58
|
+
* - Handles content storage routing and deduplication
|
|
59
|
+
* - If not provided, creates default instance with standard configuration
|
|
60
|
+
* - Example: new ContentManager(db, { contentDir: '.raglite/content' })
|
|
61
|
+
*
|
|
62
|
+
* USAGE EXAMPLES:
|
|
63
|
+
* ```typescript
|
|
64
|
+
* // Text-only ingestion pipeline with unified content system
|
|
65
|
+
* const textEmbedFn = createTextEmbedFunction();
|
|
66
|
+
* const indexManager = new IndexManager('./index.bin');
|
|
67
|
+
* const db = await openDatabase('./db.sqlite');
|
|
68
|
+
* const contentManager = new ContentManager(db);
|
|
69
|
+
* const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db, undefined, contentManager);
|
|
70
|
+
*
|
|
71
|
+
* // Simple usage (ContentManager created automatically)
|
|
72
|
+
* const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db);
|
|
73
|
+
*
|
|
74
|
+
* // Custom embedding implementation with memory ingestion
|
|
75
|
+
* const customEmbedFn = async (text) => ({
|
|
76
|
+
* embedding_id: generateId(),
|
|
77
|
+
* vector: await myCustomModel.embed(text)
|
|
78
|
+
* });
|
|
79
|
+
* const ingestion = new IngestionPipeline(customEmbedFn, indexManager, db);
|
|
80
|
+
* await ingestion.ingestFromMemory(buffer, { displayName: 'file.txt' });
|
|
81
|
+
* ```
|
|
82
|
+
*/
|
|
83
|
+
constructor(embedFn, indexManager, db, defaultChunkConfig, contentManager) {
|
|
84
|
+
this.embedFn = embedFn;
|
|
85
|
+
this.indexManager = indexManager;
|
|
86
|
+
this.db = db;
|
|
87
|
+
this.defaultChunkConfig = defaultChunkConfig;
|
|
88
|
+
// Validate required dependencies
|
|
89
|
+
if (!embedFn || typeof embedFn !== 'function') {
|
|
90
|
+
throw new Error('embedFn must be a valid function');
|
|
91
|
+
}
|
|
92
|
+
if (!indexManager) {
|
|
93
|
+
throw new Error('indexManager is required');
|
|
94
|
+
}
|
|
95
|
+
if (!db) {
|
|
96
|
+
throw new Error('db connection is required');
|
|
97
|
+
}
|
|
98
|
+
// Initialize path manager with default configuration
|
|
99
|
+
this.pathManager = new DocumentPathManager(config.path_storage_strategy, process.cwd());
|
|
100
|
+
// Initialize ContentManager (create default if not provided)
|
|
101
|
+
this.contentManager = contentManager || new ContentManager(this.db);
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* Ingest documents from a directory
|
|
105
|
+
* @param directoryPath - Path to directory containing documents
|
|
106
|
+
* @param options - Optional ingestion configuration
|
|
107
|
+
* @returns Promise resolving to ingestion results
|
|
108
|
+
*/
|
|
109
|
+
async ingestDirectory(directoryPath, options = {}) {
|
|
110
|
+
if (!existsSync(directoryPath)) {
|
|
111
|
+
throw new Error(`Directory not found: ${directoryPath}`);
|
|
112
|
+
}
|
|
113
|
+
return this.ingestPath(directoryPath, options);
|
|
114
|
+
}
|
|
115
|
+
/**
|
|
116
|
+
* Ingest a single file
|
|
117
|
+
* @param filePath - Path to the file to ingest
|
|
118
|
+
* @param options - Optional ingestion configuration
|
|
119
|
+
* @returns Promise resolving to ingestion results
|
|
120
|
+
*/
|
|
121
|
+
async ingestFile(filePath, options = {}) {
|
|
122
|
+
if (!existsSync(filePath)) {
|
|
123
|
+
throw new Error(`File not found: ${filePath}`);
|
|
124
|
+
}
|
|
125
|
+
return this.ingestPath(filePath, options);
|
|
126
|
+
}
|
|
127
|
+
/**
|
|
128
|
+
* Ingest content from memory buffer
|
|
129
|
+
* Enables MCP integration and real-time content processing
|
|
130
|
+
* @param content - Buffer containing the content to ingest
|
|
131
|
+
* @param metadata - Memory content metadata including display name and content type
|
|
132
|
+
* @param options - Optional ingestion configuration
|
|
133
|
+
* @returns Promise resolving to content ID for the ingested content
|
|
134
|
+
*/
|
|
135
|
+
async ingestFromMemory(content, metadata, options = {}) {
|
|
136
|
+
const startTime = Date.now();
|
|
137
|
+
console.log(`\n=== Starting memory ingestion: ${metadata.displayName} ===`);
|
|
138
|
+
try {
|
|
139
|
+
// Phase 1: Content Storage via ContentManager
|
|
140
|
+
console.log('\n--- Phase 1: Content Storage ---');
|
|
141
|
+
const contentResult = await this.contentManager.ingestFromMemory(content, metadata);
|
|
142
|
+
if (contentResult.wasDeduped) {
|
|
143
|
+
console.log(`✓ Content deduplicated: ${metadata.displayName} (ID: ${contentResult.contentId})`);
|
|
144
|
+
return contentResult.contentId;
|
|
145
|
+
}
|
|
146
|
+
console.log(`✓ Content stored: ${metadata.displayName} (ID: ${contentResult.contentId})`);
|
|
147
|
+
// Phase 2: Document Processing
|
|
148
|
+
console.log('\n--- Phase 2: Document Processing ---');
|
|
149
|
+
// Determine content type for processing
|
|
150
|
+
const detectedContentType = metadata.contentType || 'text/plain';
|
|
151
|
+
const isImageContent = detectedContentType.startsWith('image/');
|
|
152
|
+
let document;
|
|
153
|
+
if (isImageContent) {
|
|
154
|
+
// Process image content using the existing image processing pipeline
|
|
155
|
+
console.log(`Processing image content: ${metadata.displayName} (${detectedContentType})`);
|
|
156
|
+
document = await this.processImageFromMemory(content, contentResult, metadata, options);
|
|
157
|
+
}
|
|
158
|
+
else if (detectedContentType === 'application/pdf') {
|
|
159
|
+
// Process PDF content
|
|
160
|
+
console.log(`Processing PDF content: ${metadata.displayName}`);
|
|
161
|
+
document = await this.processPDFFromMemory(content, contentResult, metadata, options);
|
|
162
|
+
}
|
|
163
|
+
else if (detectedContentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
|
|
164
|
+
// Process DOCX content
|
|
165
|
+
console.log(`Processing DOCX content: ${metadata.displayName}`);
|
|
166
|
+
document = await this.processDOCXFromMemory(content, contentResult, metadata, options);
|
|
167
|
+
}
|
|
168
|
+
else {
|
|
169
|
+
// Process as text content
|
|
170
|
+
console.log(`Processing text content: ${metadata.displayName} (${detectedContentType})`);
|
|
171
|
+
document = {
|
|
172
|
+
source: metadata.displayName,
|
|
173
|
+
title: metadata.displayName,
|
|
174
|
+
content: content.toString('utf8'), // Convert buffer to string for processing
|
|
175
|
+
metadata: {
|
|
176
|
+
contentType: detectedContentType,
|
|
177
|
+
contentId: contentResult.contentId,
|
|
178
|
+
storageType: contentResult.storageType,
|
|
179
|
+
originalPath: metadata.originalPath
|
|
180
|
+
}
|
|
181
|
+
};
|
|
182
|
+
}
|
|
183
|
+
// Phase 3: Document Chunking
|
|
184
|
+
console.log('\n--- Phase 3: Document Chunking ---');
|
|
185
|
+
const effectiveChunkConfig = options.chunkConfig || this.defaultChunkConfig || {
|
|
186
|
+
chunkSize: config.chunk_size,
|
|
187
|
+
chunkOverlap: config.chunk_overlap
|
|
188
|
+
};
|
|
189
|
+
const chunks = await chunkDocument(document, effectiveChunkConfig);
|
|
190
|
+
console.log(`✓ Created ${chunks.length} chunks from memory content`);
|
|
191
|
+
if (chunks.length === 0) {
|
|
192
|
+
console.log('No chunks created from memory content');
|
|
193
|
+
return contentResult.contentId;
|
|
194
|
+
}
|
|
195
|
+
// Phase 4: Embedding Generation
|
|
196
|
+
console.log('\n--- Phase 4: Embedding Generation ---');
|
|
197
|
+
const embeddings = [];
|
|
198
|
+
let embeddingErrors = 0;
|
|
199
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
200
|
+
const chunk = chunks[i];
|
|
201
|
+
try {
|
|
202
|
+
// Convert MIME type to simple content type for embedding function
|
|
203
|
+
const contentTypeForEmbedding = this.getContentTypeForEmbedding(document.metadata?.contentType);
|
|
204
|
+
// For images, use the image path from metadata instead of text description
|
|
205
|
+
let contentForEmbedding = chunk.text;
|
|
206
|
+
if (contentTypeForEmbedding === 'image' && document.metadata) {
|
|
207
|
+
// Try to get image path from metadata (contentPath, originalPath, or source)
|
|
208
|
+
// contentPath is where the image is stored (from contentResult)
|
|
209
|
+
const imagePath = document.metadata.contentPath ||
|
|
210
|
+
document.metadata.originalPath ||
|
|
211
|
+
document.metadata.source;
|
|
212
|
+
if (imagePath) {
|
|
213
|
+
contentForEmbedding = imagePath;
|
|
214
|
+
}
|
|
215
|
+
else {
|
|
216
|
+
// Fallback: try to extract path from source if available
|
|
217
|
+
console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
|
|
221
|
+
// Enhance embedding result with content type metadata
|
|
222
|
+
if (!embedding.contentType) {
|
|
223
|
+
embedding.contentType = contentTypeForEmbedding;
|
|
224
|
+
}
|
|
225
|
+
if (!embedding.metadata) {
|
|
226
|
+
embedding.metadata = document.metadata;
|
|
227
|
+
}
|
|
228
|
+
embeddings.push(embedding);
|
|
229
|
+
}
|
|
230
|
+
catch (error) {
|
|
231
|
+
console.warn(`Failed to embed chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
|
|
232
|
+
embeddingErrors++;
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
console.log(`✓ Generated ${embeddings.length} embeddings for memory content`);
|
|
236
|
+
if (embeddings.length === 0) {
|
|
237
|
+
console.log('No embeddings generated from memory content');
|
|
238
|
+
return contentResult.contentId;
|
|
239
|
+
}
|
|
240
|
+
// Phase 5: Database Storage
|
|
241
|
+
console.log('\n--- Phase 5: Database Storage ---');
|
|
242
|
+
// Insert document with content_id reference
|
|
243
|
+
const documentContentType = this.getContentTypeForEmbedding(document.metadata?.contentType);
|
|
244
|
+
const documentId = await upsertDocument(this.db, document.source, document.title, documentContentType, document.metadata, contentResult.contentId);
|
|
245
|
+
// Insert chunks with embeddings
|
|
246
|
+
let chunksStored = 0;
|
|
247
|
+
for (let i = 0; i < chunks.length && i < embeddings.length; i++) {
|
|
248
|
+
const chunk = chunks[i];
|
|
249
|
+
const embedding = embeddings[i];
|
|
250
|
+
try {
|
|
251
|
+
await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex, documentContentType, document.metadata);
|
|
252
|
+
chunksStored++;
|
|
253
|
+
}
|
|
254
|
+
catch (error) {
|
|
255
|
+
console.error(`Failed to store chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
console.log(`✓ Stored document and ${chunksStored} chunks in database`);
|
|
259
|
+
// Phase 6: Vector Index Updates
|
|
260
|
+
console.log('\n--- Phase 6: Vector Index Updates ---');
|
|
261
|
+
await this.updateVectorIndex(embeddings);
|
|
262
|
+
const endTime = Date.now();
|
|
263
|
+
const processingTimeMs = endTime - startTime;
|
|
264
|
+
console.log('\n=== Memory Ingestion Complete ===');
|
|
265
|
+
console.log(`Content ID: ${contentResult.contentId}`);
|
|
266
|
+
console.log(`Chunks created: ${chunks.length}`);
|
|
267
|
+
console.log(`Embeddings generated: ${embeddings.length}`);
|
|
268
|
+
console.log(`Chunks stored: ${chunksStored}`);
|
|
269
|
+
console.log(`Embedding errors: ${embeddingErrors}`);
|
|
270
|
+
console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
|
|
271
|
+
return contentResult.contentId;
|
|
272
|
+
}
|
|
273
|
+
catch (error) {
|
|
274
|
+
console.error('\n=== Memory Ingestion Failed ===');
|
|
275
|
+
console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
276
|
+
throw new Error(`Memory ingestion failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
/**
|
|
280
|
+
* Ingest documents from a path (file or directory)
|
|
281
|
+
* Implements the complete pipeline: file processing → chunking → embedding → storage
|
|
282
|
+
* Enhanced to handle mixed content types (text and images) in multimodal mode
|
|
283
|
+
*/
|
|
284
|
+
async ingestPath(path, options = {}) {
|
|
285
|
+
const startTime = Date.now();
|
|
286
|
+
console.log(`\n=== Starting ingestion from: ${path} ===`);
|
|
287
|
+
try {
|
|
288
|
+
// Phase 1: File Discovery and Processing with Content-Type Detection
|
|
289
|
+
console.log('\n--- Phase 1: File Discovery and Processing ---');
|
|
290
|
+
const mode = options.mode || 'text';
|
|
291
|
+
const fileOptions = {
|
|
292
|
+
recursive: true,
|
|
293
|
+
maxFileSize: 10 * 1024 * 1024, // 10MB
|
|
294
|
+
...options.fileOptions,
|
|
295
|
+
mode
|
|
296
|
+
};
|
|
297
|
+
const fileResult = await discoverAndProcessFiles(path, fileOptions, this.pathManager);
|
|
298
|
+
// Additional filtering as fallback (should be minimal with mode-aware discovery)
|
|
299
|
+
const filteredResult = this.filterDocumentsByMode(fileResult, mode);
|
|
300
|
+
if (filteredResult.documents.length === 0) {
|
|
301
|
+
console.log('No documents found to process');
|
|
302
|
+
return {
|
|
303
|
+
documentsProcessed: 0,
|
|
304
|
+
chunksCreated: 0,
|
|
305
|
+
embeddingsGenerated: 0,
|
|
306
|
+
documentErrors: filteredResult.processingResult.errors.length,
|
|
307
|
+
embeddingErrors: 0,
|
|
308
|
+
processingTimeMs: Date.now() - startTime,
|
|
309
|
+
contentIds: []
|
|
310
|
+
};
|
|
311
|
+
}
|
|
312
|
+
// Content-type detection and routing
|
|
313
|
+
const contentTypeStats = this.analyzeContentTypes(filteredResult.documents);
|
|
314
|
+
console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
|
|
315
|
+
// Phase 2: Document Chunking with Content-Type Awareness
|
|
316
|
+
console.log('\n--- Phase 2: Document Chunking ---');
|
|
317
|
+
const effectiveChunkConfig = options.chunkConfig || this.defaultChunkConfig || {
|
|
318
|
+
chunkSize: config.chunk_size,
|
|
319
|
+
chunkOverlap: config.chunk_overlap
|
|
320
|
+
};
|
|
321
|
+
const chunkingResult = await this.chunkDocumentsWithContentTypes(filteredResult.documents, effectiveChunkConfig, options.mode);
|
|
322
|
+
if (chunkingResult.totalChunks === 0) {
|
|
323
|
+
console.log('No chunks created from documents');
|
|
324
|
+
return {
|
|
325
|
+
documentsProcessed: fileResult.documents.length,
|
|
326
|
+
chunksCreated: 0,
|
|
327
|
+
embeddingsGenerated: 0,
|
|
328
|
+
documentErrors: fileResult.processingResult.errors.length,
|
|
329
|
+
embeddingErrors: 0,
|
|
330
|
+
processingTimeMs: Date.now() - startTime,
|
|
331
|
+
contentIds: []
|
|
332
|
+
};
|
|
333
|
+
}
|
|
334
|
+
// Phase 3: Embedding Generation with Content-Type Support
|
|
335
|
+
console.log('\n--- Phase 3: Embedding Generation ---');
|
|
336
|
+
const embeddingResult = await this.generateEmbeddingsWithContentTypes(chunkingResult.allChunks);
|
|
337
|
+
// Phase 4: Database and Index Storage with Content-Type Metadata
|
|
338
|
+
console.log('\n--- Phase 4: Storage Operations ---');
|
|
339
|
+
const contentIds = await this.storeDocumentsAndChunksWithContentTypes(chunkingResult.documentChunks, embeddingResult.embeddings);
|
|
340
|
+
// Phase 5: Vector Index Updates
|
|
341
|
+
console.log('\n--- Phase 5: Vector Index Updates ---');
|
|
342
|
+
await this.updateVectorIndex(embeddingResult.embeddings);
|
|
343
|
+
// Final save to ensure all vectors are persisted
|
|
344
|
+
console.log('Performing final index save...');
|
|
345
|
+
await this.indexManager.saveIndex();
|
|
346
|
+
const endTime = Date.now();
|
|
347
|
+
const processingTimeMs = endTime - startTime;
|
|
348
|
+
const result = {
|
|
349
|
+
documentsProcessed: filteredResult.documents.length,
|
|
350
|
+
chunksCreated: chunkingResult.totalChunks,
|
|
351
|
+
embeddingsGenerated: embeddingResult.embeddings.length,
|
|
352
|
+
documentErrors: filteredResult.processingResult.errors.length,
|
|
353
|
+
embeddingErrors: embeddingResult.errors,
|
|
354
|
+
processingTimeMs,
|
|
355
|
+
contentIds
|
|
356
|
+
};
|
|
357
|
+
console.log('\n=== Ingestion Complete ===');
|
|
358
|
+
console.log(`Documents processed: ${result.documentsProcessed}`);
|
|
359
|
+
console.log(`Chunks created: ${result.chunksCreated}`);
|
|
360
|
+
console.log(`Embeddings generated: ${result.embeddingsGenerated}`);
|
|
361
|
+
console.log(`Document errors: ${result.documentErrors}`);
|
|
362
|
+
console.log(`Embedding errors: ${result.embeddingErrors}`);
|
|
363
|
+
console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
|
|
364
|
+
return result;
|
|
365
|
+
}
|
|
366
|
+
catch (error) {
|
|
367
|
+
console.error('\n=== Ingestion Failed ===');
|
|
368
|
+
console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
369
|
+
throw new Error(`Ingestion failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
/**
|
|
373
|
+
* Analyze content types in the document collection
|
|
374
|
+
* @private
|
|
375
|
+
*/
|
|
376
|
+
analyzeContentTypes(documents) {
|
|
377
|
+
const stats = { text: 0, image: 0, other: 0 };
|
|
378
|
+
for (const document of documents) {
|
|
379
|
+
const contentType = document.metadata?.contentType || 'text';
|
|
380
|
+
switch (contentType) {
|
|
381
|
+
case 'text':
|
|
382
|
+
stats.text++;
|
|
383
|
+
break;
|
|
384
|
+
case 'image':
|
|
385
|
+
stats.image++;
|
|
386
|
+
break;
|
|
387
|
+
default:
|
|
388
|
+
stats.other++;
|
|
389
|
+
break;
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
return stats;
|
|
393
|
+
}
|
|
394
|
+
/**
|
|
395
|
+
* Chunk all documents and organize results with content-type awareness
|
|
396
|
+
* Enhanced to handle different content types appropriately
|
|
397
|
+
*/
|
|
398
|
+
async chunkDocumentsWithContentTypes(documents, chunkConfig, mode) {
|
|
399
|
+
const documentChunks = [];
|
|
400
|
+
const allChunks = [];
|
|
401
|
+
let totalChunks = 0;
|
|
402
|
+
console.log(`Processing ${documents.length} document${documents.length === 1 ? '' : 's'} for chunking...`);
|
|
403
|
+
for (let i = 0; i < documents.length; i++) {
|
|
404
|
+
const document = documents[i];
|
|
405
|
+
try {
|
|
406
|
+
const contentType = document.metadata?.contentType || 'text';
|
|
407
|
+
// Handle different content types appropriately
|
|
408
|
+
let chunks;
|
|
409
|
+
if (contentType === 'image') {
|
|
410
|
+
// For images, create a single chunk with the full content (description + metadata)
|
|
411
|
+
chunks = [{
|
|
412
|
+
text: document.content,
|
|
413
|
+
chunkIndex: 0,
|
|
414
|
+
contentType: 'image',
|
|
415
|
+
metadata: document.metadata
|
|
416
|
+
}];
|
|
417
|
+
}
|
|
418
|
+
else if (mode === 'multimodal') {
|
|
419
|
+
// In multimodal mode, don't chunk text - CLIP handles truncation at 77 tokens
|
|
420
|
+
// Chunking doesn't make sense because CLIP can't handle long text anyway
|
|
421
|
+
chunks = [{
|
|
422
|
+
text: document.content,
|
|
423
|
+
chunkIndex: 0,
|
|
424
|
+
contentType: 'text',
|
|
425
|
+
metadata: document.metadata
|
|
426
|
+
}];
|
|
427
|
+
}
|
|
428
|
+
else {
|
|
429
|
+
// For text mode, use normal chunking
|
|
430
|
+
const textChunks = await chunkDocument(document, chunkConfig);
|
|
431
|
+
chunks = textChunks.map(chunk => ({
|
|
432
|
+
...chunk,
|
|
433
|
+
contentType: 'text',
|
|
434
|
+
metadata: document.metadata
|
|
435
|
+
}));
|
|
436
|
+
}
|
|
437
|
+
documentChunks.push({ document, chunks });
|
|
438
|
+
// Collect all chunks with their content type information
|
|
439
|
+
for (const chunk of chunks) {
|
|
440
|
+
allChunks.push({
|
|
441
|
+
text: chunk.text,
|
|
442
|
+
contentType: chunk.contentType,
|
|
443
|
+
metadata: chunk.metadata
|
|
444
|
+
});
|
|
445
|
+
}
|
|
446
|
+
totalChunks += chunks.length;
|
|
447
|
+
// Progress logging - more frequent for better user experience
|
|
448
|
+
if (documents.length <= 10 || (i + 1) % Math.max(1, Math.floor(documents.length / 10)) === 0 || i === documents.length - 1) {
|
|
449
|
+
const percentage = Math.round(((i + 1) / documents.length) * 100);
|
|
450
|
+
console.log(`Processed ${i + 1} of ${documents.length} documents (${percentage}%) - ${totalChunks} chunks created`);
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
catch (error) {
|
|
454
|
+
console.error(`Failed to chunk document ${document.source}:`, error instanceof Error ? error.message : String(error));
|
|
455
|
+
// Continue with other documents
|
|
456
|
+
continue;
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
console.log(`✓ Chunking complete: Created ${totalChunks} chunks from ${documentChunks.length} documents`);
|
|
460
|
+
return { documentChunks, allChunks, totalChunks };
|
|
461
|
+
}
|
|
462
|
+
/**
|
|
463
|
+
* Generate embeddings for all chunks with content-type support
|
|
464
|
+
* Enhanced to handle different content types and pass metadata to embedding function
|
|
465
|
+
*/
|
|
466
|
+
async generateEmbeddingsWithContentTypes(chunks) {
|
|
467
|
+
console.log(`Generating embeddings for ${chunks.length} chunk${chunks.length === 1 ? '' : 's'}...`);
|
|
468
|
+
console.log('This may take a few minutes depending on the number of chunks...');
|
|
469
|
+
try {
|
|
470
|
+
// Generate embeddings using injected embed function with content type support
|
|
471
|
+
const embeddings = [];
|
|
472
|
+
let errors = 0;
|
|
473
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
474
|
+
const chunk = chunks[i];
|
|
475
|
+
try {
|
|
476
|
+
// Convert MIME type to simple content type for embedding function
|
|
477
|
+
const contentTypeForEmbedding = this.getContentTypeForEmbedding(chunk.contentType);
|
|
478
|
+
// For images, use the image path from metadata instead of text description
|
|
479
|
+
let contentForEmbedding = chunk.text;
|
|
480
|
+
if (contentTypeForEmbedding === 'image' && chunk.metadata) {
|
|
481
|
+
// Try to get image path from metadata (originalPath or contentPath)
|
|
482
|
+
const imagePath = chunk.metadata.originalPath || chunk.metadata.contentPath || chunk.metadata.source;
|
|
483
|
+
if (imagePath) {
|
|
484
|
+
contentForEmbedding = imagePath;
|
|
485
|
+
}
|
|
486
|
+
else {
|
|
487
|
+
// Fallback: try to extract path from source if available
|
|
488
|
+
console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
|
|
492
|
+
// Enhance embedding result with content type metadata if not already present
|
|
493
|
+
if (!embedding.contentType) {
|
|
494
|
+
embedding.contentType = contentTypeForEmbedding;
|
|
495
|
+
}
|
|
496
|
+
if (!embedding.metadata && chunk.metadata) {
|
|
497
|
+
embedding.metadata = chunk.metadata;
|
|
498
|
+
}
|
|
499
|
+
embeddings.push(embedding);
|
|
500
|
+
}
|
|
501
|
+
catch (error) {
|
|
502
|
+
console.warn(`Failed to embed ${chunk.contentType} chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
|
|
503
|
+
errors++;
|
|
504
|
+
}
|
|
505
|
+
// Progress logging
|
|
506
|
+
if (chunks.length > 10 && (i + 1) % Math.max(1, Math.floor(chunks.length / 10)) === 0) {
|
|
507
|
+
const percentage = Math.round(((i + 1) / chunks.length) * 100);
|
|
508
|
+
console.log(`Generated ${i + 1} of ${chunks.length} embeddings (${percentage}%)`);
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
if (errors > 0) {
|
|
512
|
+
console.warn(`⚠ Warning: ${errors} chunk${errors === 1 ? '' : 's'} failed embedding and ${errors === 1 ? 'was' : 'were'} skipped`);
|
|
513
|
+
}
|
|
514
|
+
console.log(`✓ Generated ${embeddings.length} embeddings successfully`);
|
|
515
|
+
return { embeddings, errors };
|
|
516
|
+
}
|
|
517
|
+
catch (error) {
|
|
518
|
+
console.error('Critical embedding failure:', error instanceof Error ? error.message : String(error));
|
|
519
|
+
throw new Error(`Embedding generation failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
520
|
+
}
|
|
521
|
+
}
|
|
522
|
+
/**
|
|
523
|
+
* Store documents and chunks in database with content-type support
|
|
524
|
+
* Enhanced to handle content type metadata and multimodal content
|
|
525
|
+
* @returns Array of content IDs for successfully stored documents
|
|
526
|
+
*/
|
|
527
|
+
async storeDocumentsAndChunksWithContentTypes(documentChunks, embeddings) {
|
|
528
|
+
console.log(`Storing ${documentChunks.length} document${documentChunks.length === 1 ? '' : 's'} and chunks in database...`);
|
|
529
|
+
// Create a mapping of chunk text to embedding for efficient lookup
|
|
530
|
+
const embeddingMap = new Map();
|
|
531
|
+
let embeddingIndex = 0;
|
|
532
|
+
// Build mapping - this assumes embeddings are in the same order as chunks were processed
|
|
533
|
+
for (const { chunks } of documentChunks) {
|
|
534
|
+
for (const chunk of chunks) {
|
|
535
|
+
if (embeddingIndex < embeddings.length) {
|
|
536
|
+
embeddingMap.set(chunk.text, embeddings[embeddingIndex]);
|
|
537
|
+
embeddingIndex++;
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
let totalChunksStored = 0;
|
|
542
|
+
let documentsStored = 0;
|
|
543
|
+
const contentIds = [];
|
|
544
|
+
// Process each document sequentially
|
|
545
|
+
for (const { document, chunks } of documentChunks) {
|
|
546
|
+
try {
|
|
547
|
+
// Generate content ID for filesystem content using ContentManager
|
|
548
|
+
let contentId = document.metadata?.contentId;
|
|
549
|
+
if (!contentId) {
|
|
550
|
+
try {
|
|
551
|
+
// Use ContentManager to create filesystem reference and get content ID
|
|
552
|
+
const contentResult = await this.contentManager.ingestFromFilesystem(document.source);
|
|
553
|
+
contentId = contentResult.contentId;
|
|
554
|
+
// Update document metadata with content ID
|
|
555
|
+
if (!document.metadata) {
|
|
556
|
+
document.metadata = {};
|
|
557
|
+
}
|
|
558
|
+
document.metadata.contentId = contentId;
|
|
559
|
+
document.metadata.storageType = contentResult.storageType;
|
|
560
|
+
}
|
|
561
|
+
catch (contentError) {
|
|
562
|
+
console.warn(`Failed to create content reference for ${document.source}:`, contentError instanceof Error ? contentError.message : String(contentError));
|
|
563
|
+
// Continue without content ID - fallback to legacy behavior
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
// Insert or get existing document with content type support and content_id reference
|
|
567
|
+
const documentContentType = document.metadata?.contentType || 'text';
|
|
568
|
+
const documentId = await upsertDocument(this.db, document.source, document.title, documentContentType, document.metadata, contentId);
|
|
569
|
+
documentsStored++;
|
|
570
|
+
// Add content ID to results if available
|
|
571
|
+
if (contentId) {
|
|
572
|
+
contentIds.push(contentId);
|
|
573
|
+
}
|
|
574
|
+
// Insert all chunks for this document with content type support
|
|
575
|
+
let chunksStoredForDoc = 0;
|
|
576
|
+
for (const chunk of chunks) {
|
|
577
|
+
const embedding = embeddingMap.get(chunk.text);
|
|
578
|
+
if (embedding) {
|
|
579
|
+
try {
|
|
580
|
+
const chunkContentType = chunk.contentType || documentContentType;
|
|
581
|
+
const chunkMetadata = chunk.metadata || document.metadata;
|
|
582
|
+
await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex, chunkContentType, chunkMetadata);
|
|
583
|
+
chunksStoredForDoc++;
|
|
584
|
+
totalChunksStored++;
|
|
585
|
+
}
|
|
586
|
+
catch (chunkError) {
|
|
587
|
+
console.error(`Failed to store ${chunk.contentType || 'text'} chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
|
|
588
|
+
// Continue with other chunks
|
|
589
|
+
}
|
|
590
|
+
}
|
|
591
|
+
else {
|
|
592
|
+
console.warn(`No embedding found for chunk ${chunk.chunkIndex} in document ${document.source}`);
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
// Progress logging for storage
|
|
596
|
+
if (documentChunks.length <= 20 || documentsStored % Math.max(1, Math.floor(documentChunks.length / 10)) === 0 || documentsStored === documentChunks.length) {
|
|
597
|
+
const percentage = Math.round((documentsStored / documentChunks.length) * 100);
|
|
598
|
+
console.log(`Stored ${documentsStored} of ${documentChunks.length} documents (${percentage}%) - ${totalChunksStored} chunks total`);
|
|
599
|
+
}
|
|
600
|
+
}
|
|
601
|
+
catch (docError) {
|
|
602
|
+
console.error(`Failed to store document ${document.source}:`, docError instanceof Error ? docError.message : String(docError));
|
|
603
|
+
// Continue with other documents
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
console.log(`✓ Storage complete: ${documentsStored} documents, ${totalChunksStored} chunks saved to database`);
|
|
607
|
+
return contentIds;
|
|
608
|
+
}
|
|
609
|
+
/**
|
|
610
|
+
* Update vector index with new embeddings (supports grouped content type storage)
|
|
611
|
+
*/
|
|
612
|
+
async updateVectorIndex(embeddings) {
|
|
613
|
+
console.log('updateVectorIndex called with', embeddings.length, 'embeddings');
|
|
614
|
+
if (embeddings.length === 0) {
|
|
615
|
+
console.log('No embeddings to add to vector index');
|
|
616
|
+
return;
|
|
617
|
+
}
|
|
618
|
+
console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
|
|
619
|
+
try {
|
|
620
|
+
// Group embeddings by content type for optimized storage
|
|
621
|
+
const groupedEmbeddings = embeddings.reduce((groups, embedding) => {
|
|
622
|
+
const contentType = embedding.contentType || 'text';
|
|
623
|
+
if (!groups[contentType]) {
|
|
624
|
+
groups[contentType] = [];
|
|
625
|
+
}
|
|
626
|
+
groups[contentType].push(embedding);
|
|
627
|
+
return groups;
|
|
628
|
+
}, {});
|
|
629
|
+
const textEmbeddings = groupedEmbeddings.text || [];
|
|
630
|
+
const imageEmbeddings = groupedEmbeddings.image || [];
|
|
631
|
+
console.log(`Grouped: ${textEmbeddings.length} text, ${imageEmbeddings.length} image vectors`);
|
|
632
|
+
// Use grouped storage method if available, fallback to regular method
|
|
633
|
+
if (this.indexManager.addGroupedEmbeddings) {
|
|
634
|
+
await this.indexManager.addGroupedEmbeddings(textEmbeddings, imageEmbeddings);
|
|
635
|
+
}
|
|
636
|
+
else {
|
|
637
|
+
await this.indexManager.addVectors(embeddings);
|
|
638
|
+
}
|
|
639
|
+
console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
|
|
640
|
+
}
|
|
641
|
+
catch (error) {
|
|
642
|
+
console.error('Failed to update vector index:', error instanceof Error ? error.message : String(error));
|
|
643
|
+
throw error;
|
|
644
|
+
}
|
|
645
|
+
}
|
|
646
|
+
/**
|
|
647
|
+
* Filter documents based on ingestion mode to avoid processing incompatible content types
|
|
648
|
+
*/
|
|
649
|
+
filterDocumentsByMode(fileResult, mode) {
|
|
650
|
+
if (mode === 'multimodal') {
|
|
651
|
+
// In multimodal mode, keep all documents
|
|
652
|
+
return fileResult;
|
|
653
|
+
}
|
|
654
|
+
// In text mode, filter out image documents
|
|
655
|
+
const filteredDocuments = fileResult.documents.filter(doc => {
|
|
656
|
+
const contentType = doc.metadata?.contentType || 'text';
|
|
657
|
+
const isCompatible = contentType === 'text' ||
|
|
658
|
+
contentType.startsWith('text/') ||
|
|
659
|
+
contentType === 'application/pdf' ||
|
|
660
|
+
contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
|
|
661
|
+
if (!isCompatible) {
|
|
662
|
+
console.log(`⚠️ Skipping ${doc.source} (${contentType}) - not compatible with text mode`);
|
|
663
|
+
}
|
|
664
|
+
return isCompatible;
|
|
665
|
+
});
|
|
666
|
+
// Update processing result to reflect filtering
|
|
667
|
+
const filteredProcessingResult = {
|
|
668
|
+
...fileResult.processingResult,
|
|
669
|
+
skippedFiles: [
|
|
670
|
+
...(fileResult.processingResult.skippedFiles || []),
|
|
671
|
+
...fileResult.documents
|
|
672
|
+
.filter(doc => !filteredDocuments.includes(doc))
|
|
673
|
+
.map(doc => ({
|
|
674
|
+
path: doc.source,
|
|
675
|
+
reason: `Content type not compatible with ${mode} mode`
|
|
676
|
+
}))
|
|
677
|
+
]
|
|
678
|
+
};
|
|
679
|
+
return {
|
|
680
|
+
documents: filteredDocuments,
|
|
681
|
+
discoveryResult: fileResult.discoveryResult,
|
|
682
|
+
processingResult: filteredProcessingResult
|
|
683
|
+
};
|
|
684
|
+
}
|
|
685
|
+
/**
|
|
686
|
+
* Converts MIME type to simple content type for embedding function
|
|
687
|
+
* @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
|
|
688
|
+
* @returns Simple content type ('text', 'image', etc.)
|
|
689
|
+
*/
|
|
690
|
+
getContentTypeForEmbedding(contentType) {
|
|
691
|
+
if (!contentType) {
|
|
692
|
+
return 'text';
|
|
693
|
+
}
|
|
694
|
+
// Handle simple content type strings (used by chunking)
|
|
695
|
+
if (contentType === 'image') {
|
|
696
|
+
return 'image';
|
|
697
|
+
}
|
|
698
|
+
else if (contentType === 'text') {
|
|
699
|
+
return 'text';
|
|
700
|
+
}
|
|
701
|
+
// Convert MIME types to simple content types (legacy support)
|
|
702
|
+
if (contentType.startsWith('text/')) {
|
|
703
|
+
return 'text';
|
|
704
|
+
}
|
|
705
|
+
else if (contentType.startsWith('image/')) {
|
|
706
|
+
return 'image';
|
|
707
|
+
}
|
|
708
|
+
else if (contentType === 'application/pdf') {
|
|
709
|
+
return 'text'; // PDFs are processed as text
|
|
710
|
+
}
|
|
711
|
+
else if (contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
|
|
712
|
+
return 'text'; // DOCX files are processed as text
|
|
713
|
+
}
|
|
714
|
+
else {
|
|
715
|
+
return 'text'; // Default to text for unknown types
|
|
716
|
+
}
|
|
717
|
+
}
|
|
718
|
+
/**
|
|
719
|
+
* Save the vector index to disk
|
|
720
|
+
*/
|
|
721
|
+
async saveIndex() {
|
|
722
|
+
await this.indexManager.saveIndex();
|
|
723
|
+
}
|
|
724
|
+
/**
|
|
725
|
+
* Process image content from memory using the existing image processing pipeline
|
|
726
|
+
* @private
|
|
727
|
+
*/
|
|
728
|
+
async processImageFromMemory(content, contentResult, metadata, options) {
|
|
729
|
+
try {
|
|
730
|
+
// Import image processing functions
|
|
731
|
+
const { generateImageDescriptionForFile, extractImageMetadataForFile } = await import('../file-processor.js');
|
|
732
|
+
// Use the content path from the content manager (where the image is stored)
|
|
733
|
+
const imagePath = contentResult.contentPath;
|
|
734
|
+
// Extract image metadata
|
|
735
|
+
let imageMetadata = {};
|
|
736
|
+
try {
|
|
737
|
+
imageMetadata = await extractImageMetadataForFile(imagePath);
|
|
738
|
+
}
|
|
739
|
+
catch (error) {
|
|
740
|
+
console.warn(`Failed to extract image metadata for ${metadata.displayName}:`, error instanceof Error ? error.message : String(error));
|
|
741
|
+
// Continue with empty metadata
|
|
742
|
+
}
|
|
743
|
+
// Generate text description for the image
|
|
744
|
+
let descriptionResult = { description: 'Image content', model: 'none', confidence: 0 };
|
|
745
|
+
try {
|
|
746
|
+
const imageToTextOptions = {}; // Use default options for now
|
|
747
|
+
descriptionResult = await generateImageDescriptionForFile(imagePath, imageToTextOptions);
|
|
748
|
+
console.log(`✓ Generated image description: "${descriptionResult.description}"`);
|
|
749
|
+
}
|
|
750
|
+
catch (error) {
|
|
751
|
+
console.warn(`Failed to generate image description for ${metadata.displayName}:`, error instanceof Error ? error.message : String(error));
|
|
752
|
+
// Continue with fallback description
|
|
753
|
+
}
|
|
754
|
+
// Update metadata with description information
|
|
755
|
+
imageMetadata.description = descriptionResult.description;
|
|
756
|
+
imageMetadata.descriptionModel = descriptionResult.model;
|
|
757
|
+
imageMetadata.descriptionConfidence = descriptionResult.confidence;
|
|
758
|
+
// Create document with image description as content
|
|
759
|
+
const title = metadata.displayName;
|
|
760
|
+
// Create content that includes description and key metadata
|
|
761
|
+
const contentParts = [
|
|
762
|
+
`Image: ${title}`,
|
|
763
|
+
`Description: ${descriptionResult.description}`
|
|
764
|
+
];
|
|
765
|
+
if (imageMetadata.dimensions) {
|
|
766
|
+
contentParts.push(`Dimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}`);
|
|
767
|
+
}
|
|
768
|
+
if (imageMetadata.format) {
|
|
769
|
+
contentParts.push(`Format: ${imageMetadata.format}`);
|
|
770
|
+
}
|
|
771
|
+
const documentContent = contentParts.join('\n');
|
|
772
|
+
return {
|
|
773
|
+
source: metadata.displayName,
|
|
774
|
+
title,
|
|
775
|
+
content: documentContent.trim(),
|
|
776
|
+
metadata: {
|
|
777
|
+
contentType: 'image',
|
|
778
|
+
contentId: contentResult.contentId,
|
|
779
|
+
storageType: contentResult.storageType,
|
|
780
|
+
contentPath: contentResult.contentPath, // Store contentPath for embedding
|
|
781
|
+
originalPath: metadata.originalPath,
|
|
782
|
+
...imageMetadata // Spread all image metadata fields
|
|
783
|
+
}
|
|
784
|
+
};
|
|
785
|
+
}
|
|
786
|
+
catch (error) {
|
|
787
|
+
console.warn(`Failed to process image from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
|
|
788
|
+
// Fallback to basic document creation
|
|
789
|
+
return {
|
|
790
|
+
source: metadata.displayName,
|
|
791
|
+
title: metadata.displayName,
|
|
792
|
+
content: `Image: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
|
|
793
|
+
metadata: {
|
|
794
|
+
contentType: 'image',
|
|
795
|
+
contentId: contentResult.contentId,
|
|
796
|
+
storageType: contentResult.storageType,
|
|
797
|
+
contentPath: contentResult.contentPath, // Store contentPath for embedding
|
|
798
|
+
originalPath: metadata.originalPath,
|
|
799
|
+
processingError: error instanceof Error ? error.message : String(error)
|
|
800
|
+
}
|
|
801
|
+
};
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
/**
|
|
805
|
+
* Process PDF content from memory using the existing PDF processing pipeline
|
|
806
|
+
* @private
|
|
807
|
+
*/
|
|
808
|
+
async processPDFFromMemory(content, contentResult, metadata, options) {
|
|
809
|
+
try {
|
|
810
|
+
// Import PDF processing
|
|
811
|
+
const pdfParse = require('pdf-parse');
|
|
812
|
+
// Parse PDF content directly from buffer
|
|
813
|
+
const pdfData = await pdfParse(content);
|
|
814
|
+
console.log(`✓ Extracted ${pdfData.text.length} characters from PDF`);
|
|
815
|
+
return {
|
|
816
|
+
source: metadata.displayName,
|
|
817
|
+
title: metadata.displayName,
|
|
818
|
+
content: pdfData.text.trim(),
|
|
819
|
+
metadata: {
|
|
820
|
+
contentType: 'application/pdf',
|
|
821
|
+
contentId: contentResult.contentId,
|
|
822
|
+
storageType: contentResult.storageType,
|
|
823
|
+
originalPath: metadata.originalPath,
|
|
824
|
+
pages: pdfData.numpages,
|
|
825
|
+
pdfInfo: pdfData.info
|
|
826
|
+
}
|
|
827
|
+
};
|
|
828
|
+
}
|
|
829
|
+
catch (error) {
|
|
830
|
+
console.warn(`Failed to process PDF from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
|
|
831
|
+
// Fallback to basic document creation
|
|
832
|
+
return {
|
|
833
|
+
source: metadata.displayName,
|
|
834
|
+
title: metadata.displayName,
|
|
835
|
+
content: `PDF Document: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
|
|
836
|
+
metadata: {
|
|
837
|
+
contentType: 'application/pdf',
|
|
838
|
+
contentId: contentResult.contentId,
|
|
839
|
+
storageType: contentResult.storageType,
|
|
840
|
+
originalPath: metadata.originalPath,
|
|
841
|
+
processingError: error instanceof Error ? error.message : String(error)
|
|
842
|
+
}
|
|
843
|
+
};
|
|
844
|
+
}
|
|
845
|
+
}
|
|
846
|
+
/**
|
|
847
|
+
* Process DOCX content from memory using the existing DOCX processing pipeline
|
|
848
|
+
* @private
|
|
849
|
+
*/
|
|
850
|
+
async processDOCXFromMemory(content, contentResult, metadata, options) {
|
|
851
|
+
try {
|
|
852
|
+
// Import DOCX processing
|
|
853
|
+
const mammoth = await import('mammoth');
|
|
854
|
+
// Parse DOCX content directly from buffer
|
|
855
|
+
const docxResult = await mammoth.extractRawText({ buffer: content });
|
|
856
|
+
console.log(`✓ Extracted ${docxResult.value.length} characters from DOCX`);
|
|
857
|
+
return {
|
|
858
|
+
source: metadata.displayName,
|
|
859
|
+
title: metadata.displayName,
|
|
860
|
+
content: docxResult.value.trim(),
|
|
861
|
+
metadata: {
|
|
862
|
+
contentType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
863
|
+
contentId: contentResult.contentId,
|
|
864
|
+
storageType: contentResult.storageType,
|
|
865
|
+
originalPath: metadata.originalPath,
|
|
866
|
+
messages: docxResult.messages
|
|
867
|
+
}
|
|
868
|
+
};
|
|
869
|
+
}
|
|
870
|
+
catch (error) {
|
|
871
|
+
console.warn(`Failed to process DOCX from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
|
|
872
|
+
// Fallback to basic document creation
|
|
873
|
+
return {
|
|
874
|
+
source: metadata.displayName,
|
|
875
|
+
title: metadata.displayName,
|
|
876
|
+
content: `DOCX Document: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
|
|
877
|
+
metadata: {
|
|
878
|
+
contentType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
879
|
+
contentId: contentResult.contentId,
|
|
880
|
+
storageType: contentResult.storageType,
|
|
881
|
+
originalPath: metadata.originalPath,
|
|
882
|
+
processingError: error instanceof Error ? error.message : String(error)
|
|
883
|
+
}
|
|
884
|
+
};
|
|
885
|
+
}
|
|
886
|
+
}
|
|
887
|
+
/**
|
|
888
|
+
* Clean up resources - explicit cleanup method
|
|
889
|
+
*/
|
|
890
|
+
async cleanup() {
|
|
891
|
+
try {
|
|
892
|
+
// Clean up ContentManager to prevent resource leaks
|
|
893
|
+
if (this.contentManager && typeof this.contentManager.cleanup === 'function') {
|
|
894
|
+
this.contentManager.cleanup();
|
|
895
|
+
}
|
|
896
|
+
await this.db.close();
|
|
897
|
+
await this.indexManager.close();
|
|
898
|
+
}
|
|
899
|
+
catch (error) {
|
|
900
|
+
console.error('Error during IngestionPipeline cleanup:', error instanceof Error ? error.message : String(error));
|
|
901
|
+
}
|
|
902
|
+
}
|
|
903
|
+
}
|
|
904
|
+
//# sourceMappingURL=ingestion.js.map
|