rag-lite-ts 2.1.1 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{core → cjs/core}/model-validator.js +1 -1
- package/dist/{core → cjs/core}/vector-index.js +4 -2
- package/dist/esm/api-errors.d.ts +90 -0
- package/dist/esm/api-errors.js +320 -0
- package/dist/esm/cli/indexer.d.ts +11 -0
- package/dist/esm/cli/indexer.js +471 -0
- package/dist/esm/cli/search.d.ts +7 -0
- package/dist/esm/cli/search.js +332 -0
- package/dist/esm/cli.d.ts +3 -0
- package/dist/esm/cli.js +529 -0
- package/dist/esm/config.d.ts +51 -0
- package/dist/esm/config.js +79 -0
- package/dist/esm/core/abstract-embedder.d.ts +125 -0
- package/dist/esm/core/abstract-embedder.js +264 -0
- package/dist/esm/core/actionable-error-messages.d.ts +60 -0
- package/dist/esm/core/actionable-error-messages.js +397 -0
- package/dist/esm/core/adapters.d.ts +93 -0
- package/dist/esm/core/adapters.js +139 -0
- package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/esm/core/batch-processing-optimizer.js +536 -0
- package/dist/esm/core/binary-index-format.d.ts +78 -0
- package/dist/esm/core/binary-index-format.js +291 -0
- package/dist/esm/core/chunker.d.ts +119 -0
- package/dist/esm/core/chunker.js +73 -0
- package/dist/esm/core/cli-database-utils.d.ts +53 -0
- package/dist/esm/core/cli-database-utils.js +239 -0
- package/dist/esm/core/config.d.ts +102 -0
- package/dist/esm/core/config.js +247 -0
- package/dist/esm/core/content-errors.d.ts +111 -0
- package/dist/esm/core/content-errors.js +362 -0
- package/dist/esm/core/content-manager.d.ts +335 -0
- package/dist/esm/core/content-manager.js +1476 -0
- package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
- package/dist/esm/core/content-performance-optimizer.js +516 -0
- package/dist/esm/core/content-resolver.d.ts +104 -0
- package/dist/esm/core/content-resolver.js +285 -0
- package/dist/esm/core/cross-modal-search.d.ts +164 -0
- package/dist/esm/core/cross-modal-search.js +342 -0
- package/dist/esm/core/database-connection-manager.d.ts +109 -0
- package/dist/esm/core/database-connection-manager.js +310 -0
- package/dist/esm/core/db.d.ts +213 -0
- package/dist/esm/core/db.js +895 -0
- package/dist/esm/core/embedder-factory.d.ts +154 -0
- package/dist/esm/core/embedder-factory.js +311 -0
- package/dist/esm/core/error-handler.d.ts +112 -0
- package/dist/esm/core/error-handler.js +239 -0
- package/dist/esm/core/index.d.ts +59 -0
- package/dist/esm/core/index.js +69 -0
- package/dist/esm/core/ingestion.d.ts +202 -0
- package/dist/esm/core/ingestion.js +901 -0
- package/dist/esm/core/interfaces.d.ts +408 -0
- package/dist/esm/core/interfaces.js +106 -0
- package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
- package/dist/esm/core/lazy-dependency-loader.js +435 -0
- package/dist/esm/core/mode-detection-service.d.ts +150 -0
- package/dist/esm/core/mode-detection-service.js +565 -0
- package/dist/esm/core/mode-model-validator.d.ts +92 -0
- package/dist/esm/core/mode-model-validator.js +203 -0
- package/dist/esm/core/model-registry.d.ts +116 -0
- package/dist/esm/core/model-registry.js +411 -0
- package/dist/esm/core/model-validator.d.ts +217 -0
- package/dist/esm/core/model-validator.js +782 -0
- package/dist/esm/core/path-manager.d.ts +47 -0
- package/dist/esm/core/path-manager.js +71 -0
- package/dist/esm/core/raglite-paths.d.ts +121 -0
- package/dist/esm/core/raglite-paths.js +145 -0
- package/dist/esm/core/reranking-config.d.ts +42 -0
- package/dist/esm/core/reranking-config.js +147 -0
- package/dist/esm/core/reranking-factory.d.ts +92 -0
- package/dist/esm/core/reranking-factory.js +410 -0
- package/dist/esm/core/reranking-strategies.d.ts +310 -0
- package/dist/esm/core/reranking-strategies.js +650 -0
- package/dist/esm/core/resource-cleanup.d.ts +163 -0
- package/dist/esm/core/resource-cleanup.js +371 -0
- package/dist/esm/core/resource-manager.d.ts +212 -0
- package/dist/esm/core/resource-manager.js +564 -0
- package/dist/esm/core/search-pipeline.d.ts +111 -0
- package/dist/esm/core/search-pipeline.js +287 -0
- package/dist/esm/core/search.d.ts +141 -0
- package/dist/esm/core/search.js +320 -0
- package/dist/esm/core/streaming-operations.d.ts +145 -0
- package/dist/esm/core/streaming-operations.js +409 -0
- package/dist/esm/core/types.d.ts +66 -0
- package/dist/esm/core/types.js +6 -0
- package/dist/esm/core/universal-embedder.d.ts +177 -0
- package/dist/esm/core/universal-embedder.js +139 -0
- package/dist/esm/core/validation-messages.d.ts +99 -0
- package/dist/esm/core/validation-messages.js +334 -0
- package/dist/esm/core/vector-index.d.ts +72 -0
- package/dist/esm/core/vector-index.js +333 -0
- package/dist/esm/dom-polyfills.d.ts +6 -0
- package/dist/esm/dom-polyfills.js +37 -0
- package/dist/esm/factories/index.d.ts +27 -0
- package/dist/esm/factories/index.js +29 -0
- package/dist/esm/factories/ingestion-factory.d.ts +200 -0
- package/dist/esm/factories/ingestion-factory.js +477 -0
- package/dist/esm/factories/search-factory.d.ts +154 -0
- package/dist/esm/factories/search-factory.js +344 -0
- package/dist/esm/file-processor.d.ts +147 -0
- package/dist/esm/file-processor.js +963 -0
- package/dist/esm/index-manager.d.ts +116 -0
- package/dist/esm/index-manager.js +598 -0
- package/dist/esm/index.d.ts +75 -0
- package/dist/esm/index.js +110 -0
- package/dist/esm/indexer.d.ts +7 -0
- package/dist/esm/indexer.js +54 -0
- package/dist/esm/ingestion.d.ts +63 -0
- package/dist/esm/ingestion.js +124 -0
- package/dist/esm/mcp-server.d.ts +46 -0
- package/dist/esm/mcp-server.js +1820 -0
- package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
- package/dist/esm/multimodal/clip-embedder.js +996 -0
- package/dist/esm/multimodal/index.d.ts +6 -0
- package/dist/esm/multimodal/index.js +6 -0
- package/dist/esm/preprocess.d.ts +19 -0
- package/dist/esm/preprocess.js +203 -0
- package/dist/esm/preprocessors/index.d.ts +17 -0
- package/dist/esm/preprocessors/index.js +38 -0
- package/dist/esm/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/preprocessors/mdx.js +101 -0
- package/dist/esm/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/preprocessors/mermaid.js +329 -0
- package/dist/esm/preprocessors/registry.d.ts +56 -0
- package/dist/esm/preprocessors/registry.js +179 -0
- package/dist/esm/run-error-recovery-tests.d.ts +7 -0
- package/dist/esm/run-error-recovery-tests.js +101 -0
- package/dist/esm/search-standalone.d.ts +7 -0
- package/dist/esm/search-standalone.js +117 -0
- package/dist/esm/search.d.ts +99 -0
- package/dist/esm/search.js +177 -0
- package/dist/esm/test-utils.d.ts +18 -0
- package/dist/esm/test-utils.js +27 -0
- package/dist/esm/text/chunker.d.ts +33 -0
- package/dist/esm/text/chunker.js +279 -0
- package/dist/esm/text/embedder.d.ts +111 -0
- package/dist/esm/text/embedder.js +386 -0
- package/dist/esm/text/index.d.ts +8 -0
- package/dist/esm/text/index.js +9 -0
- package/dist/esm/text/preprocessors/index.d.ts +17 -0
- package/dist/esm/text/preprocessors/index.js +38 -0
- package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/text/preprocessors/mdx.js +101 -0
- package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/text/preprocessors/mermaid.js +330 -0
- package/dist/esm/text/preprocessors/registry.d.ts +56 -0
- package/dist/esm/text/preprocessors/registry.js +180 -0
- package/dist/esm/text/reranker.d.ts +49 -0
- package/dist/esm/text/reranker.js +274 -0
- package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/esm/text/sentence-transformer-embedder.js +340 -0
- package/dist/esm/text/tokenizer.d.ts +22 -0
- package/dist/esm/text/tokenizer.js +64 -0
- package/dist/esm/types.d.ts +83 -0
- package/dist/esm/types.js +3 -0
- package/dist/esm/utils/vector-math.d.ts +31 -0
- package/dist/esm/utils/vector-math.js +70 -0
- package/package.json +30 -12
- /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
- /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
- /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/indexer.js +0 -0
- /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/search.js +0 -0
- /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
- /package/dist/{cli.js → cjs/cli.js} +0 -0
- /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
- /package/dist/{config.js → cjs/config.js} +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
- /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
- /package/dist/{core → cjs/core}/adapters.js +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/binary-index-format.d.ts +0 -0
- /package/dist/{core → cjs/core}/binary-index-format.js +0 -0
- /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
- /package/dist/{core → cjs/core}/chunker.js +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
- /package/dist/{core → cjs/core}/config.d.ts +0 -0
- /package/dist/{core → cjs/core}/config.js +0 -0
- /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-errors.js +0 -0
- /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-manager.js +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-resolver.js +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
- /package/dist/{core → cjs/core}/db.d.ts +0 -0
- /package/dist/{core → cjs/core}/db.js +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
- /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
- /package/dist/{core → cjs/core}/error-handler.js +0 -0
- /package/dist/{core → cjs/core}/index.d.ts +0 -0
- /package/dist/{core → cjs/core}/index.js +0 -0
- /package/dist/{core → cjs/core}/ingestion.d.ts +0 -0
- /package/dist/{core → cjs/core}/ingestion.js +0 -0
- /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
- /package/dist/{core → cjs/core}/interfaces.js +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
- /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
- /package/dist/{core → cjs/core}/model-registry.js +0 -0
- /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.js +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
- /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-config.js +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.js +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
- /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-manager.js +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.js +0 -0
- /package/dist/{core → cjs/core}/search.d.ts +0 -0
- /package/dist/{core → cjs/core}/search.js +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
- /package/dist/{core → cjs/core}/types.d.ts +0 -0
- /package/dist/{core → cjs/core}/types.js +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
- /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/validation-messages.js +0 -0
- /package/dist/{core → cjs/core}/vector-index.d.ts +0 -0
- /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
- /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
- /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/index.js +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.js +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.js +0 -0
- /package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +0 -0
- /package/dist/{file-processor.js → cjs/file-processor.js} +0 -0
- /package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +0 -0
- /package/dist/{index-manager.js → cjs/index-manager.js} +0 -0
- /package/dist/{index.d.ts → cjs/index.d.ts} +0 -0
- /package/dist/{index.js → cjs/index.js} +0 -0
- /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
- /package/dist/{indexer.js → cjs/indexer.js} +0 -0
- /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
- /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
- /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
- /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
- /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
- /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
- /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
- /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
- /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
- /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
- /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
- /package/dist/{search.js → cjs/search.js} +0 -0
- /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
- /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
- /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
- /package/dist/{text → cjs/text}/chunker.js +0 -0
- /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/embedder.js +0 -0
- /package/dist/{text → cjs/text}/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
- /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
- /package/dist/{text → cjs/text}/reranker.js +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
- /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
- /package/dist/{text → cjs/text}/tokenizer.js +0 -0
- /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
- /package/dist/{types.js → cjs/types.js} +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
|
@@ -0,0 +1,963 @@
|
|
|
1
|
+
import { promises as fs } from 'fs';
|
|
2
|
+
import { join, extname, basename, resolve } from 'path';
|
|
3
|
+
import { ErrorCategory, ErrorSeverity, safeExecute } from './core/error-handler.js';
|
|
4
|
+
import { preprocessDocument } from './preprocess.js';
|
|
5
|
+
import { config } from './core/config.js';
|
|
6
|
+
import { DocumentPathManager } from './core/path-manager.js';
|
|
7
|
+
import { createRequire } from 'module';
|
|
8
|
+
import { JSDOM } from 'jsdom';
|
|
9
|
+
const require = createRequire(import.meta.url);
|
|
10
|
+
// Set up DOM polyfills for PDF parsing.
// A throwaway jsdom window is created at module load; its constructors are
// used as polyfill candidates for the globals below.
const dom = new JSDOM('<!DOCTYPE html><html><body></body></html>', {
    pretendToBeVisual: true,
    resources: 'usable'
});
// Polyfill global objects needed by pdf-parse.
// Resolution order for each global: an implementation that is already
// installed (never clobber a real one provided by the host), then the jsdom
// window's implementation, then a minimal stub class.
global.DOMMatrix = global.DOMMatrix || dom.window.DOMMatrix || class {
    // Stub only: fixed matrix components, no transform methods.
    a = 1;
    b = 0;
    c = 0;
    d = 1;
    e = 0;
    f = 0;
    constructor() { }
};
global.ImageData = global.ImageData || dom.window.ImageData || class {
    width;
    height;
    data;
    constructor(width, height) {
        this.width = width;
        this.height = height;
        // 4 bytes per pixel
        this.data = new Uint8ClampedArray(width * height * 4);
    }
};
global.Path2D = global.Path2D || dom.window.Path2D || class {
    // Stub only: the path methods are no-ops.
    constructor() { }
    moveTo() { }
    lineTo() { }
    closePath() { }
};
|
|
41
|
+
const pdfParse = require('pdf-parse');
|
|
42
|
+
import * as mammoth from 'mammoth';
|
|
43
|
+
/**
 * Extensions (lower-case, with leading dot) accepted as text documents.
 */
const SUPPORTED_TEXT_EXTENSIONS = ['.md', '.txt', '.mdx', '.pdf', '.docx'];
/**
 * Extensions (lower-case, with leading dot) accepted as images for
 * multimodal ingestion.
 */
const SUPPORTED_IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'];
/**
 * Every accepted extension: the text list followed by the image list.
 */
const SUPPORTED_EXTENSIONS = SUPPORTED_TEXT_EXTENSIONS.concat(SUPPORTED_IMAGE_EXTENSIONS);
|
|
55
|
+
/**
 * Options used by file discovery when the caller supplies none:
 * descend into subdirectories and cap text files at 10MB.
 */
export const DEFAULT_FILE_PROCESSOR_OPTIONS = {
    recursive: true,
    maxFileSize: 10 * (1024 * 1024) // 10MB
};
/**
 * Default settings for image-to-text processing.
 */
export const DEFAULT_IMAGE_TO_TEXT_OPTIONS = {
    model: 'Xenova/vit-gpt2-image-captioning',
    maxLength: 50,
    batchSize: 4,
    includeConfidence: false
};
|
|
71
|
+
/**
 * Tell whether a path's extension (compared case-insensitively) is one of
 * the supported text or image extensions.
 *
 * @param {string} filePath - Path whose extension is inspected.
 * @returns {boolean} True when the extension is in SUPPORTED_EXTENSIONS.
 */
function isSupportedFile(filePath) {
    const normalizedExt = extname(filePath).toLowerCase();
    return SUPPORTED_EXTENSIONS.includes(normalizedExt);
}
|
|
78
|
+
/**
 * Classify a path as image or text content from its extension alone.
 *
 * @param {string} filePath - Path whose extension is inspected (case-insensitive).
 * @returns {'image'|'text'} 'image' for a supported image extension; 'text'
 *   for everything else, including unknown extensions.
 */
function getContentType(filePath) {
    const looksLikeImage = SUPPORTED_IMAGE_EXTENSIONS.includes(extname(filePath).toLowerCase());
    return looksLikeImage ? 'image' : 'text';
}
|
|
88
|
+
/**
 * Tell whether a path carries one of the supported image extensions.
 *
 * @param {string} filePath - Path whose extension is inspected (case-insensitive).
 * @returns {boolean} True when the extension is in SUPPORTED_IMAGE_EXTENSIONS.
 */
function isImageFile(filePath) {
    return SUPPORTED_IMAGE_EXTENSIONS.includes(extname(filePath).toLowerCase());
}
|
|
95
|
+
/**
 * Validate that a path points to a plausible image file.
 *
 * Checks, in order: the path is a regular file, its size does not exceed the
 * 50MB image limit, it is not empty, and - for .jpg/.jpeg, .png, .gif, .webp
 * and .bmp - its leading bytes match the format's signature. Only the first
 * 12 bytes are read from disk, so validating a large image does not load the
 * whole file into memory.
 *
 * @param {string} filePath - Path of the file to validate.
 * @returns {Promise<{valid: boolean, error?: string}>} Resolves to
 *   `{ valid: true }` or to `{ valid: false, error }`; filesystem failures are
 *   reported through `error` rather than by rejecting.
 */
async function validateImageFile(filePath) {
    const maxImageSize = 50 * 1024 * 1024; // 50MB for images
    // The furthest byte inspected below is index 11 ("WEBP" at offsets 8-11).
    const headerLength = 12;
    // True when `bytes` holds `expected` starting at `offset`; bytes missing
    // from a short header read compare as undefined and therefore mismatch.
    const matchesAt = (bytes, offset, expected) => expected.every((value, i) => bytes[offset + i] === value);
    try {
        const stats = await fs.stat(filePath);
        // Reject directories, sockets, etc.
        if (!stats.isFile()) {
            return { valid: false, error: 'Path is not a file' };
        }
        // Check file size (images can be larger than text files)
        if (stats.size > maxImageSize) {
            return {
                valid: false,
                error: `Image file size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (50MB)`
            };
        }
        if (stats.size === 0) {
            return { valid: false, error: 'Image file is empty' };
        }
        // Read just the header; the handle is always closed, even if read() fails.
        const handle = await fs.open(filePath, 'r');
        let header;
        try {
            const { bytesRead, buffer } = await handle.read(Buffer.alloc(headerLength), 0, headerLength, 0);
            header = buffer.subarray(0, bytesRead);
        }
        finally {
            await handle.close();
        }
        const ext = extname(filePath).toLowerCase();
        // Validate file signatures (magic numbers)
        if (ext === '.jpg' || ext === '.jpeg') {
            if (!matchesAt(header, 0, [0xFF, 0xD8])) {
                return { valid: false, error: 'Invalid JPEG file format' };
            }
        }
        else if (ext === '.png') {
            if (!matchesAt(header, 0, [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A])) {
                return { valid: false, error: 'Invalid PNG file format' };
            }
        }
        else if (ext === '.gif') {
            if (!matchesAt(header, 0, [0x47, 0x49, 0x46])) { // "GIF"
                return { valid: false, error: 'Invalid GIF file format' };
            }
        }
        else if (ext === '.webp') {
            // WebP: "RIFF" at start and "WEBP" at offset 8
            if (!matchesAt(header, 0, [0x52, 0x49, 0x46, 0x46])) {
                return { valid: false, error: 'Invalid WebP file format (missing RIFF header)' };
            }
            if (!matchesAt(header, 8, [0x57, 0x45, 0x42, 0x50])) {
                return { valid: false, error: 'Invalid WebP file format (missing WEBP signature)' };
            }
        }
        else if (ext === '.bmp') {
            if (!matchesAt(header, 0, [0x42, 0x4D])) { // "BM"
                return { valid: false, error: 'Invalid BMP file format' };
            }
        }
        return { valid: true };
    }
    catch (error) {
        return {
            valid: false,
            error: `Failed to validate image file: ${error instanceof Error ? error.message : String(error)}`
        };
    }
}
|
|
165
|
+
/**
 * Walk a directory and collect the files eligible for ingestion.
 *
 * For every directory entry:
 *  - subdirectories are descended into only when `options.recursive` is truthy;
 *  - regular files with an unsupported extension are ignored without being
 *    reported; entries that are neither directories nor regular files are
 *    ignored as well;
 *  - supported files are accepted, or recorded in `skipped` with a reason
 *    (image in text mode, over the size limit, failed image validation, or
 *    an error while inspecting the file).
 *
 * @param {string} dirPath - Directory to scan.
 * @param {object} options - Discovery options.
 * @param {boolean} [options.recursive] - Descend into subdirectories.
 * @param {string} [options.mode] - Defaults to 'text'; in text mode image
 *   files are skipped.
 * @param {number} [options.maxFileSize] - Size limit in bytes for text files
 *   (falls back to 10MB when falsy); images always use a 50MB limit.
 * @returns {Promise<{files: string[], skipped: Array<{path: string, reason: string}>}>}
 *   Does not reject for filesystem errors: a directory that cannot be read
 *   is itself reported in `skipped`.
 */
async function discoverFilesRecursive(dirPath, options) {
    const result = {
        files: [],
        skipped: []
    };
    try {
        const entries = await fs.readdir(dirPath, { withFileTypes: true });
        for (const entry of entries) {
            const fullPath = join(dirPath, entry.name);
            if (entry.isDirectory()) {
                if (options.recursive) {
                    // Recursively process subdirectory
                    const subResult = await discoverFilesRecursive(fullPath, options);
                    // Append one item at a time: push(...array) passes every
                    // element as a call argument and throws a RangeError
                    // once a subtree holds more entries than the engine allows.
                    for (const file of subResult.files) {
                        result.files.push(file);
                    }
                    for (const skippedEntry of subResult.skipped) {
                        result.skipped.push(skippedEntry);
                    }
                }
                continue;
            }
            if (!entry.isFile() || !isSupportedFile(fullPath)) {
                continue;
            }
            try {
                // Check file size based on content type
                const stats = await fs.stat(fullPath);
                const contentType = getContentType(fullPath);
                // Filter by mode: skip incompatible content types
                const mode = options.mode || 'text';
                if (mode === 'text' && contentType === 'image') {
                    result.skipped.push({
                        path: fullPath,
                        reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
                    });
                    continue;
                }
                // Different size limits for different content types
                const maxSize = contentType === 'image'
                    ? 50 * 1024 * 1024 // 50MB for images
                    : (options.maxFileSize || 10 * 1024 * 1024); // 10MB for text files
                if (stats.size > maxSize) {
                    result.skipped.push({
                        path: fullPath,
                        reason: `File size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (${Math.round(maxSize / 1024 / 1024)}MB) for ${contentType} files`
                    });
                    continue;
                }
                // Additional validation for image files
                if (contentType === 'image') {
                    const validation = await validateImageFile(fullPath);
                    if (!validation.valid) {
                        result.skipped.push({
                            path: fullPath,
                            reason: validation.error || 'Invalid image file'
                        });
                        continue;
                    }
                }
                result.files.push(fullPath);
            }
            catch (error) {
                // A single unreadable file must not abort the rest of the scan.
                result.skipped.push({
                    path: fullPath,
                    reason: `Failed to validate file: ${error instanceof Error ? error.message : String(error)}`
                });
            }
        }
    }
    catch (error) {
        result.skipped.push({
            path: dirPath,
            reason: `Failed to read directory: ${error instanceof Error ? error.message : String(error)}`
        });
    }
    return result;
}
|
|
242
|
+
/**
 * Discover files for ingestion starting from a file or a directory path.
 *
 * A directory is handed to discoverFilesRecursive. A single file is accepted
 * only when its extension is supported, its content type is compatible with
 * `options.mode` (default 'text'), it is within the size limit for its
 * content type, and - for images - it passes validateImageFile.
 *
 * @param {string} path - File or directory path; resolved to an absolute path.
 * @param {object} [options=DEFAULT_FILE_PROCESSOR_OPTIONS] - Discovery options
 *   (`recursive`, `mode`, `maxFileSize`).
 * @returns {Promise<{files: string[], skipped: Array<{path: string, reason: string}>}>}
 *   Does not reject when the path cannot be accessed; that failure is
 *   reported as a skipped entry.
 */
export async function discoverFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIONS) {
    const resolvedPath = resolve(path);
    // Every rejection of the requested path has the same result shape.
    const rejectWith = (reason) => ({
        files: [],
        skipped: [{ path: resolvedPath, reason }]
    });
    try {
        const stats = await fs.stat(resolvedPath);
        if (stats.isDirectory()) {
            // Directory processing
            return await discoverFilesRecursive(resolvedPath, options);
        }
        if (!stats.isFile()) {
            return rejectWith('Path is neither a file nor a directory');
        }
        // Single file processing from here on
        if (!isSupportedFile(resolvedPath)) {
            return rejectWith(`Unsupported file extension. Supported text: ${SUPPORTED_TEXT_EXTENSIONS.join(', ')}, images: ${SUPPORTED_IMAGE_EXTENSIONS.join(', ')}`);
        }
        const contentType = getContentType(resolvedPath);
        const isImage = contentType === 'image';
        // Filter by mode: images are only ingested outside text mode
        const mode = options.mode || 'text';
        if (mode === 'text' && isImage) {
            return rejectWith(`Image files not supported in text mode. Use --mode multimodal for image processing.`);
        }
        // Size limit depends on the content type
        let maxSize;
        if (isImage) {
            maxSize = 50 * 1024 * 1024; // 50MB for images
        }
        else {
            maxSize = options.maxFileSize || 10 * 1024 * 1024; // 10MB for text files
        }
        if (stats.size > maxSize) {
            return rejectWith(`File size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (${Math.round(maxSize / 1024 / 1024)}MB) for ${contentType} files`);
        }
        // Additional validation for image files
        if (isImage) {
            const validation = await validateImageFile(resolvedPath);
            if (!validation.valid) {
                return rejectWith(validation.error || 'Invalid image file');
            }
        }
        return {
            files: [resolvedPath],
            skipped: []
        };
    }
    catch (error) {
        return rejectWith(`Failed to access path: ${error instanceof Error ? error.message : String(error)}`);
    }
}
|
|
328
|
+
/**
 * Read a PDF from disk and return the text reported by pdf-parse.
 *
 * @param {string} filePath - Path of the PDF file.
 * @returns {Promise<string>} The `text` field of the pdf-parse result.
 */
async function extractPdfContent(filePath) {
    const pdfBytes = await fs.readFile(filePath);
    const parsed = await pdfParse(pdfBytes);
    return parsed.text;
}
|
|
336
|
+
/**
 * Read a DOCX file from disk and return its raw text as extracted by mammoth.
 *
 * @param {string} filePath - Path of the DOCX file.
 * @returns {Promise<string>} The `value` field of mammoth's extractRawText result.
 */
async function extractDocxContent(filePath) {
    const docxBytes = await fs.readFile(filePath);
    const extraction = await mammoth.extractRawText({ buffer: docxBytes });
    return extraction.value;
}
|
|
344
|
+
/**
 * Derive a document title.
 *
 * The first line that, once trimmed, starts with "# " (a markdown H1)
 * supplies the title. When no such line exists, the title is the file name
 * with its extension removed.
 *
 * @param {string} content - Document text to scan for an H1 header.
 * @param {string} filePath - Path used for the file-name fallback.
 * @returns {string} The header text, or the extension-less file name.
 */
function extractTitle(content, filePath) {
    for (const rawLine of content.split('\n')) {
        const candidate = rawLine.trim();
        if (!candidate.startsWith('# ')) {
            continue;
        }
        const heading = candidate.slice(2).trim();
        if (heading) {
            return heading;
        }
    }
    // No usable H1: fall back to the file name without its extension
    const fileName = basename(filePath);
    const suffix = extname(fileName);
    if (!suffix) {
        return fileName;
    }
    return fileName.slice(0, -suffix.length);
}
|
|
365
|
+
/**
 * Cache for image-to-text pipeline to avoid reloading
 */
let imageToTextPipeline = null;
let imageToTextPipelinePromise = null;
// Model name the cached pipeline / in-flight load belongs to.
// null means "nothing recorded"; a cache populated without a recorded name is
// treated as matching any request.
let imageToTextPipelineModel = null;
/**
 * Load (or reuse) the image-to-text pipeline for a model.
 *
 * Concurrent calls for the same model share one in-flight load. A call that
 * asks for a different model than the cached one starts a fresh load and
 * replaces the cache, instead of silently receiving the pipeline of whichever
 * model was loaded first.
 *
 * @param {string} [modelName='Xenova/vit-gpt2-image-captioning'] - Model
 *   identifier passed to the transformers `pipeline` factory.
 * @returns {Promise<*>} The loaded pipeline.
 * @throws {Error} "Failed to initialize image-to-text pipeline: ..." when the
 *   library or the model cannot be loaded; the cache is cleared so that a
 *   later call can retry.
 */
async function initializeImageToTextPipeline(modelName = 'Xenova/vit-gpt2-image-captioning') {
    const cacheMatches = imageToTextPipelineModel === null || imageToTextPipelineModel === modelName;
    if (cacheMatches) {
        // Return cached pipeline if available
        if (imageToTextPipeline) {
            return imageToTextPipeline;
        }
        // If pipeline is currently loading, wait for it
        if (imageToTextPipelinePromise) {
            return imageToTextPipelinePromise;
        }
    }
    // Start loading pipeline (nothing cached, or a different model was requested)
    imageToTextPipeline = null;
    imageToTextPipelineModel = modelName;
    const loadPromise = (async () => {
        try {
            const { pipeline } = await import('@huggingface/transformers');
            console.log(`Loading image-to-text model: ${modelName}`);
            const loaded = await pipeline('image-to-text', modelName);
            // Publish only if no newer load has replaced this one meanwhile.
            if (imageToTextPipelinePromise === loadPromise) {
                imageToTextPipeline = loaded;
            }
            console.log(`Successfully loaded image-to-text model: ${modelName}`);
            return loaded;
        }
        catch (error) {
            console.error(`Failed to load image-to-text model ${modelName}:`, error);
            // Reset on error so it can be retried, unless a newer load owns the cache.
            if (imageToTextPipelinePromise === loadPromise) {
                imageToTextPipelinePromise = null;
                imageToTextPipelineModel = null;
            }
            throw new Error(`Failed to initialize image-to-text pipeline: ${error instanceof Error ? error.message : String(error)}`, { cause: error });
        }
    })();
    imageToTextPipelinePromise = loadPromise;
    return loadPromise;
}
|
|
399
|
+
/**
 * Read the pixel dimensions of a PNG from the start of its file buffer.
 *
 * @param {Buffer} buffer - File bytes.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is shorter than 24 bytes, lacks the PNG signature, or any read fails.
 */
function parsePngDimensions(buffer) {
    try {
        if (buffer.length < 24) {
            return null;
        }
        // PNG signature: 89 50 4E 47 0D 0A 1A 0A
        const PNG_MAGIC = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        const signatureMatches = PNG_MAGIC.every((byte, index) => buffer[index] === byte);
        if (!signatureMatches) {
            return null;
        }
        // The IHDR chunk follows the signature; width and height are the
        // big-endian 32-bit values at byte offsets 16 and 20.
        return {
            width: buffer.readUInt32BE(16),
            height: buffer.readUInt32BE(20)
        };
    }
    catch {
        return null;
    }
}
|
|
422
|
+
/**
 * Read the pixel dimensions of a JPEG by walking its marker segments until a
 * Start-Of-Frame (SOF) segment is found.
 *
 * All SOF variants are recognised (0xC0-0xCF except 0xC4 DHT, 0xC8 JPG and
 * 0xCC DAC), not just baseline and progressive. 0xFF fill bytes and markers
 * that carry no length field (0x00 stuffing, TEM, RSTn, SOI, EOI) are stepped
 * over instead of being read as a segment with a length.
 *
 * @param {Buffer} buffer - File bytes.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is not a JPEG, no SOF segment is found, or a read fails.
 */
function parseJpegDimensions(buffer) {
    try {
        if (buffer.length < 4) {
            return null;
        }
        // JPEG files start with the SOI marker FF D8.
        if (buffer[0] !== 0xFF || buffer[1] !== 0xD8) {
            return null;
        }
        let offset = 2;
        // A SOF read touches bytes up to offset + 8, hence the bound.
        while (offset < buffer.length - 8) {
            if (buffer[offset] !== 0xFF) {
                offset++;
                continue;
            }
            const marker = buffer[offset + 1];
            // Any number of 0xFF fill bytes may precede a marker code.
            if (marker === 0xFF) {
                offset++;
                continue;
            }
            const isStartOfFrame = marker >= 0xC0 && marker <= 0xCF &&
                marker !== 0xC4 && marker !== 0xC8 && marker !== 0xCC;
            if (isStartOfFrame) {
                // Layout: FF Cn | length(2) | precision(1) | height(2) | width(2)
                const height = buffer.readUInt16BE(offset + 5);
                const width = buffer.readUInt16BE(offset + 7);
                return { width, height };
            }
            // These markers have no length field; skip just the two marker bytes.
            const isStandalone = marker === 0x00 || marker === 0x01 ||
                (marker >= 0xD0 && marker <= 0xD9);
            if (isStandalone) {
                offset += 2;
                continue;
            }
            // Skip the marker plus its payload (the length includes its own two bytes).
            offset += 2 + buffer.readUInt16BE(offset + 2);
        }
        return null;
    }
    catch {
        return null;
    }
}
|
|
457
|
+
/**
 * Read the logical screen size of a GIF from the start of its file buffer.
 *
 * @param {Buffer} buffer - File bytes.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is shorter than 10 bytes, does not start with "GIF", or a read fails.
 */
function parseGifDimensions(buffer) {
    try {
        // The first three bytes must spell "GIF" (0x47 0x49 0x46).
        const hasGifHeader = buffer.length >= 10 &&
            buffer[0] === 0x47 &&
            buffer[1] === 0x49 &&
            buffer[2] === 0x46;
        if (!hasGifHeader) {
            return null;
        }
        // Width and height are little-endian 16-bit values at offsets 6 and 8.
        return {
            width: buffer.readUInt16LE(6),
            height: buffer.readUInt16LE(8)
        };
    }
    catch {
        return null;
    }
}
|
|
479
|
+
/**
 * Read the pixel dimensions of a WebP image from the start of its file buffer.
 *
 * Handles the three first-chunk layouts of the WebP container:
 *  - "VP8 " (lossy): 14-bit width/height at byte offsets 26 and 28.
 *  - "VP8L" (lossless): 14-bit (width - 1) and (height - 1) packed at offset 21.
 *  - "VP8X" (extended): 24-bit (canvas width - 1) at offset 24 and
 *    (canvas height - 1) at offset 27.
 *
 * @param {Buffer} buffer - File bytes.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is shorter than 30 bytes, is not a RIFF/WEBP file, starts with
 *   an unrecognised chunk, or a read fails.
 */
function parseWebpDimensions(buffer) {
    try {
        if (buffer.length < 30) {
            return null;
        }
        if (buffer.readUInt32BE(0) !== 0x52494646) {
            return null; // "RIFF"
        }
        if (buffer.readUInt32BE(8) !== 0x57454250) {
            return null; // "WEBP"
        }
        const firstChunk = buffer.readUInt32BE(12);
        switch (firstChunk) {
            case 0x56503820: { // "VP8 "
                // The upper two bits of each field are a scaling hint, not size.
                const width = buffer.readUInt16LE(26) & 0x3FFF;
                const height = buffer.readUInt16LE(28) & 0x3FFF;
                return { width, height };
            }
            case 0x5650384C: { // "VP8L"
                const bits = buffer.readUInt32LE(21);
                const width = (bits & 0x3FFF) + 1;
                const height = ((bits >> 14) & 0x3FFF) + 1;
                return { width, height };
            }
            case 0x56503858: { // "VP8X"
                const width = buffer.readUIntLE(24, 3) + 1;
                const height = buffer.readUIntLE(27, 3) + 1;
                return { width, height };
            }
            default:
                return null;
        }
    }
    catch {
        return null;
    }
}
|
|
510
|
+
/**
 * Read the pixel dimensions of a BMP from the start of its file buffer.
 *
 * @param {Buffer} buffer - File bytes.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is shorter than 26 bytes, does not start with "BM", or a read fails.
 */
function parseBmpDimensions(buffer) {
    try {
        // Files must begin with the two bytes "BM" (0x42 0x4D).
        const looksLikeBmp = buffer.length >= 26 &&
            buffer[0] === 0x42 &&
            buffer[1] === 0x4D;
        if (!looksLikeBmp) {
            return null;
        }
        // Width and height are little-endian signed 32-bit values at 18 and 22.
        const width = buffer.readInt32LE(18);
        const storedHeight = buffer.readInt32LE(22);
        // The stored height can be negative, so report its magnitude.
        return { width, height: Math.abs(storedHeight) };
    }
    catch {
        return null;
    }
}
|
|
529
|
+
/**
 * Pick the dimension parser that matches an image format name.
 *
 * @param {Buffer} buffer - File bytes handed to the selected parser.
 * @param {string} format - Format name (case-insensitive): png, jpg, jpeg,
 *   gif, webp or bmp.
 * @returns {{width: number, height: number} | null} The parser's result, or
 *   null for any other format name.
 */
function extractImageDimensions(buffer, format) {
    const kind = format.toLowerCase();
    if (kind === 'png') {
        return parsePngDimensions(buffer);
    }
    if (kind === 'jpg' || kind === 'jpeg') {
        return parseJpegDimensions(buffer);
    }
    if (kind === 'gif') {
        return parseGifDimensions(buffer);
    }
    if (kind === 'webp') {
        return parseWebpDimensions(buffer);
    }
    if (kind === 'bmp') {
        return parseBmpDimensions(buffer);
    }
    return null;
}
|
|
549
|
+
/**
 * Collect basic metadata for an image file: path, dimensions, size, format
 * and creation time.
 *
 * The format is the lower-cased file extension without the dot. Dimensions are
 * parsed from the file bytes; when they cannot be determined `{ width: 0,
 * height: 0 }` is reported instead.
 *
 * @param {string} imagePath - Path of the image file.
 * @returns {Promise<object>} Object with `originalPath`, `dimensions`,
 *   `fileSize`, `format` and `createdAt`.
 * @throws {Error} Wrapping any failure raised while reading or parsing the file.
 */
async function extractImageMetadata(imagePath) {
    try {
        const fileStats = await fs.stat(imagePath);
        const format = extname(imagePath).toLowerCase().substring(1);
        // The whole file is read so the format parsers can inspect its header.
        const fileBytes = await fs.readFile(imagePath);
        const parsedDimensions = extractImageDimensions(fileBytes, format);
        return {
            originalPath: imagePath,
            dimensions: parsedDimensions || { width: 0, height: 0 }, // Use 0 if dimensions can't be extracted
            fileSize: fileStats.size,
            format,
            createdAt: fileStats.birthtime || fileStats.mtime
        };
    }
    catch (error) {
        throw new Error(`Failed to extract metadata for image ${imagePath}: ${error instanceof Error ? error.message : String(error)}`);
    }
}
|
|
573
|
+
/**
 * Produce a text caption for one image using the image-to-text pipeline.
 *
 * @param {string} imagePath - Path of the image, loaded through `RawImage.fromURL`.
 * @param {object} [options=DEFAULT_IMAGE_TO_TEXT_OPTIONS] - Reads `model`,
 *   `maxLength` (50 when falsy) and `includeConfidence`.
 * @returns {Promise<{description: string, confidence: (number|undefined), model: string}>}
 *   The trimmed caption, its score (only when `includeConfidence` is truthy)
 *   and the model name.
 * @throws {Error} Wrapping any failure, including an empty caption.
 */
async function generateImageDescription(imagePath, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
    try {
        const captioner = await initializeImageToTextPipeline(options.model);
        // RawImage.fromURL also accepts local file paths.
        const { RawImage } = await import('@huggingface/transformers');
        const image = await RawImage.fromURL(imagePath);
        const output = await captioner(image, {
            max_length: options.maxLength || 50,
            num_beams: 4,
            early_stopping: true
        });
        // The pipeline may answer with a list of candidates or a single object.
        const firstResult = Array.isArray(output) ? output[0] : output;
        const description = firstResult?.generated_text;
        const confidence = firstResult?.score;
        if (!description) {
            throw new Error('No description generated for image');
        }
        return {
            description: description.trim(),
            confidence: options.includeConfidence ? confidence : undefined,
            model: options.model || DEFAULT_IMAGE_TO_TEXT_OPTIONS.model
        };
    }
    catch (error) {
        throw new Error(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : String(error)}`);
    }
}
|
|
606
|
+
/**
 * Generate captions for many images, a batch at a time.
 *
 * Images inside a batch are processed in parallel; batches run one after
 * another. A failure for one image is recorded in its entry and never rejects
 * the whole call.
 *
 * The batch size comes from `options.batchSize`, then
 * `DEFAULT_IMAGE_TO_TEXT_OPTIONS.batchSize`, then 1. It is floored and clamped
 * to at least 1, so negative or fractional values cannot stall the loop or
 * produce empty batches.
 *
 * @param {string[]} imagePaths - Paths of the images to describe.
 * @param {object} [options=DEFAULT_IMAGE_TO_TEXT_OPTIONS] - Passed through to
 *   `generateImageDescription`; `batchSize` is read here.
 * @returns {Promise<Array<{path: string, result?: object, error?: string}>>}
 *   One entry per input path, in input order.
 */
async function generateImageDescriptionsBatch(imagePaths, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
    const results = [];
    // `||` skips NaN and 0, mirroring the "missing value" fallback chain.
    const configuredSize = Number(options.batchSize) ||
        Number(DEFAULT_IMAGE_TO_TEXT_OPTIONS.batchSize) ||
        1;
    const batchSize = Math.max(1, Math.floor(configuredSize));
    const totalBatches = Math.ceil(imagePaths.length / batchSize);
    for (let start = 0; start < imagePaths.length; start += batchSize) {
        const batch = imagePaths.slice(start, start + batchSize);
        console.log(`Processing image batch ${Math.floor(start / batchSize) + 1}/${totalBatches} (${batch.length} images)`);
        // Each image settles to either a result entry or an error entry.
        const batchResults = await Promise.all(batch.map(async (imagePath) => {
            try {
                const result = await generateImageDescription(imagePath, options);
                return { path: imagePath, result };
            }
            catch (error) {
                return {
                    path: imagePath,
                    error: error instanceof Error ? error.message : String(error)
                };
            }
        }));
        results.push(...batchResults);
    }
    return results;
}
|
|
634
|
+
/**
 * Turn one image file into a document, degrading gracefully on failure.
 *
 * Three outcomes are possible:
 *  1. Metadata and caption both succeed: the content carries title,
 *     description, dimensions and format.
 *  2. That attempt fails: metadata is extracted again and a document without
 *     description is returned, with `processingError` in its metadata.
 *  3. Metadata extraction fails too: a minimal document (title and path) is
 *     returned with both `processingError` and `metadataError`.
 *
 * @param {string} filePath - Path of the image file.
 * @param {object} pathManager - Provides `toStoragePath(filePath)` for `source`.
 * @param {object} [options=DEFAULT_IMAGE_TO_TEXT_OPTIONS] - Captioning options.
 * @returns {Promise<{source: *, title: string, content: string, metadata: object}>}
 */
async function processImageFile(filePath, pathManager, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
    const messageOf = (failure) => (failure instanceof Error ? failure.message : String(failure));
    try {
        const imageMetadata = await extractImageMetadata(filePath);
        const descriptionResult = await generateImageDescription(filePath, options);
        // Attach the caption details to the metadata record.
        imageMetadata.description = descriptionResult.description;
        imageMetadata.descriptionModel = descriptionResult.model;
        imageMetadata.descriptionConfidence = descriptionResult.confidence;
        // Images have no text body, so the title always comes from the filename.
        const title = extractTitle('', filePath);
        const content = `Image: ${title}\nDescription: ${descriptionResult.description}\nDimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}\nFormat: ${imageMetadata.format}`;
        return {
            source: pathManager.toStoragePath(filePath),
            title,
            content: content.trim(),
            metadata: {
                contentType: 'image',
                ...imageMetadata
            }
        };
    }
    catch (error) {
        const processingError = messageOf(error);
        console.warn(`Failed to fully process image ${filePath}, attempting basic metadata extraction: ${processingError}`);
        try {
            // Second chance: a document built from file metadata only.
            const imageMetadata = await extractImageMetadata(filePath);
            const title = extractTitle('', filePath);
            const content = `Image: ${title}\nDimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}\nFormat: ${imageMetadata.format}`;
            return {
                source: pathManager.toStoragePath(filePath),
                title,
                content: content.trim(),
                metadata: {
                    contentType: 'image',
                    ...imageMetadata,
                    processingError
                }
            };
        }
        catch (metadataError) {
            // Last resort: only the filename and path are known.
            const metadataMessage = messageOf(metadataError);
            console.warn(`Failed to extract any metadata for image ${filePath}, using minimal fallback: ${metadataMessage}`);
            const title = extractTitle('', filePath);
            const content = `Image: ${title}\nPath: ${filePath}`;
            return {
                source: pathManager.toStoragePath(filePath),
                title,
                content: content.trim(),
                metadata: {
                    contentType: 'image',
                    originalPath: filePath,
                    processingError,
                    metadataError: metadataMessage
                }
            };
        }
    }
}
|
|
699
|
+
/**
 * Convert a single file into a document.
 *
 * Image files are delegated to `processImageFile`. For everything else the
 * text is extracted (PDF and DOCX through their extractors, any other
 * extension read as UTF-8), run through `preprocessDocument`, and rejected if
 * it is blank before or after preprocessing. The work runs inside
 * `safeExecute`; a falsy outcome from it is reported as a failure.
 *
 * @param {string} filePath - Path of the file to process.
 * @param {object} pathManager - Provides `toStoragePath(filePath)` for `source`.
 * @param {object} [imageToTextOptions] - Captioning options for image files.
 * @returns {Promise<{source: *, title: string, content: string, metadata: object}>}
 * @throws {Error} When `safeExecute` yields no document.
 */
async function processFile(filePath, pathManager, imageToTextOptions) {
    const buildDocument = async () => {
        if (getContentType(filePath) === 'image') {
            return await processImageFile(filePath, pathManager, imageToTextOptions);
        }
        // Text path: pick the extractor from the file extension.
        const extension = extname(filePath).toLowerCase();
        let content;
        if (extension === '.pdf') {
            content = await extractPdfContent(filePath);
        }
        else if (extension === '.docx') {
            content = await extractDocxContent(filePath);
        }
        else {
            // .md, .txt, .mdx and any other extension are read as plain UTF-8.
            content = await fs.readFile(filePath, 'utf-8');
        }
        if (!content.trim()) {
            throw new Error('File is empty or contains only whitespace');
        }
        content = preprocessDocument(content, filePath, config.preprocessing);
        if (!content.trim()) {
            throw new Error('File contains no content after preprocessing');
        }
        return {
            source: pathManager.toStoragePath(filePath),
            title: extractTitle(content, filePath),
            content: content.trim(),
            metadata: {
                contentType: 'text'
            }
        };
    };
    const document = await safeExecute(buildDocument, `File Processing: ${filePath}`, {
        category: ErrorCategory.FILE_SYSTEM,
        severity: ErrorSeverity.ERROR
    });
    if (document) {
        return document;
    }
    throw new Error(`Failed to process file: ${filePath}`);
}
|
|
755
|
+
/**
 * Convert a list of files into documents, collecting failures instead of
 * throwing.
 *
 * Text files are processed one by one. Image files are captioned through the
 * batch generator and then paired with their metadata; an image whose caption
 * failed still yields a document carrying `processingError`. If the batch call
 * itself throws, every image is processed individually via `processFile`.
 * Paths whose content type is neither 'image' nor 'text' are not processed and
 * not reported.
 *
 * @param {string[]} filePaths - Files to process.
 * @param {object} pathManager - Provides `toStoragePath(filePath)` for `source`.
 * @param {object} [imageToTextOptions] - Captioning options for image files.
 * @returns {Promise<{documents: object[], errors: Array<{path: string, error: string}>}>}
 */
export async function processFiles(filePaths, pathManager, imageToTextOptions) {
    const documents = [];
    const errors = [];
    const messageOf = (failure) => (failure instanceof Error ? failure.message : String(failure));
    const imageFiles = filePaths.filter((candidate) => getContentType(candidate) === 'image');
    const textFiles = filePaths.filter((candidate) => getContentType(candidate) === 'text');
    // Text files: strictly sequential.
    for (const filePath of textFiles) {
        try {
            const document = await processFile(filePath, pathManager, imageToTextOptions);
            documents.push(document);
        }
        catch (error) {
            errors.push({ path: filePath, error: messageOf(error) });
        }
    }
    if (imageFiles.length === 0) {
        return { documents, errors };
    }
    console.log(`Processing ${imageFiles.length} image files with optimized batch processing`);
    try {
        const batchResults = await generateImageDescriptionsBatch(imageFiles, imageToTextOptions);
        for (const batchResult of batchResults) {
            const imagePath = batchResult.path;
            try {
                const imageMetadata = await extractImageMetadata(imagePath);
                const described = batchResult.result;
                if (described) {
                    // Caption succeeded: fold it into the metadata and content.
                    imageMetadata.description = described.description;
                    imageMetadata.descriptionModel = described.model;
                    imageMetadata.descriptionConfidence = described.confidence;
                    const title = extractTitle('', imagePath);
                    const content = `Image: ${title}\nDescription: ${described.description}\nDimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}\nFormat: ${imageMetadata.format}`;
                    documents.push({
                        source: pathManager.toStoragePath(imagePath),
                        title,
                        content: content.trim(),
                        metadata: {
                            contentType: 'image',
                            ...imageMetadata
                        }
                    });
                    continue;
                }
                // Caption failed: keep the image with metadata-only content.
                const title = extractTitle('', imagePath);
                const content = `Image: ${title}\nDimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}\nFormat: ${imageMetadata.format}`;
                documents.push({
                    source: pathManager.toStoragePath(imagePath),
                    title,
                    content: content.trim(),
                    metadata: {
                        contentType: 'image',
                        ...imageMetadata,
                        processingError: batchResult.error
                    }
                });
            }
            catch (error) {
                errors.push({ path: imagePath, error: messageOf(error) });
            }
        }
    }
    catch (error) {
        // The batch call as a whole failed: handle each image on its own.
        console.warn(`Batch processing failed, falling back to individual processing: ${messageOf(error)}`);
        for (const filePath of imageFiles) {
            try {
                const document = await processFile(filePath, pathManager, imageToTextOptions);
                documents.push(document);
            }
            catch (fileError) {
                errors.push({ path: filePath, error: messageOf(fileError) });
            }
        }
    }
    return { documents, errors };
}
|
|
851
|
+
/**
 * Run the full pipeline for a path: discover supported files, then process
 * them into documents, logging a summary of each stage.
 *
 * @param {string} path - File or directory to scan.
 * @param {object} [options=DEFAULT_FILE_PROCESSOR_OPTIONS] - Passed to `discoverFiles`.
 * @param {object} [pathManager] - Storage-path mapper; when omitted a
 *   `DocumentPathManager` is created from `config.path_storage_strategy` and
 *   the resolved `path`.
 * @param {object} [imageToTextOptions] - Captioning options for image files.
 * @returns {Promise<{documents: object[], discoveryResult: object, processingResult: object}>}
 */
export async function discoverAndProcessFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIONS, pathManager, imageToTextOptions) {
    console.log(`Discovering files in: ${path}`);
    const discoveryResult = await discoverFiles(path, options);
    const { files, skipped } = discoveryResult;
    if (skipped.length > 0) {
        console.log(`Skipped ${skipped.length} files:`);
        skipped.forEach((entry) => {
            console.error(` - ${entry.path}: ${entry.reason}`);
        });
    }
    console.log(`Found ${files.length} supported files`);
    // The per-type breakdown is only printed when images are present.
    const imageFiles = files.filter((file) => getContentType(file) === 'image');
    const textFiles = files.filter((file) => getContentType(file) === 'text');
    if (imageFiles.length > 0) {
        console.log(` - ${textFiles.length} text files`);
        console.log(` - ${imageFiles.length} image files`);
        const modelMessage = imageToTextOptions?.model
            ? `Using image-to-text model: ${imageToTextOptions.model}`
            : `Using default image-to-text model: ${DEFAULT_IMAGE_TO_TEXT_OPTIONS.model}`;
        console.log(modelMessage);
    }
    const effectivePathManager = pathManager || new DocumentPathManager(config.path_storage_strategy, resolve(path));
    const processingResult = await processFiles(files, effectivePathManager, imageToTextOptions);
    const failures = processingResult.errors;
    if (failures.length > 0) {
        console.log(`Failed to process ${failures.length} files:`);
        failures.forEach((failure) => {
            console.error(` - ${failure.path}: ${failure.error}`);
        });
    }
    console.log(`Successfully processed ${processingResult.documents.length} documents`);
    return {
        documents: processingResult.documents,
        discoveryResult,
        processingResult
    };
}
|
|
898
|
+
/**
 * Release the cached image-to-text pipeline.
 * Call this when shutting down the application to free memory.
 *
 * The cache is cleared before `dispose()` runs, so the module forgets the
 * pipeline even when disposal throws and callers arriving during disposal
 * never receive a pipeline that is being torn down. Disposal errors are
 * logged, not rethrown. Does nothing when no pipeline is cached.
 *
 * @returns {Promise<void>}
 */
export async function cleanupImageProcessingResources() {
    if (!imageToTextPipeline) {
        return;
    }
    const pipelineToDispose = imageToTextPipeline;
    // Reset first: the state must not survive a failing dispose().
    imageToTextPipeline = null;
    imageToTextPipelinePromise = null;
    try {
        // Not every pipeline object exposes dispose().
        if (typeof pipelineToDispose.dispose === 'function') {
            await pipelineToDispose.dispose();
        }
        console.log('Image-to-text pipeline cleaned up');
    }
    catch (error) {
        console.warn('Error cleaning up image-to-text pipeline:', error);
    }
}
|
|
919
|
+
/**
 * Legacy alias kept for backward compatibility; forwards to
 * `cleanupImageProcessingResources()`.
 *
 * @deprecated Use cleanupImageProcessingResources() instead
 * @returns {Promise<void>}
 */
export async function cleanupImageToTextPipeline() {
    const outcome = await cleanupImageProcessingResources();
    return outcome;
}
|
|
926
|
+
/**
 * Public entry point for captioning a single image.
 *
 * @param {string} imagePath - Path of the image.
 * @param {object} [options] - Overrides merged on top of
 *   `DEFAULT_IMAGE_TO_TEXT_OPTIONS`.
 * @returns {Promise<object>} The result of `generateImageDescription`.
 */
export async function generateImageDescriptionForFile(imagePath, options) {
    const mergedOptions = { ...DEFAULT_IMAGE_TO_TEXT_OPTIONS, ...options };
    return generateImageDescription(imagePath, mergedOptions);
}
|
|
932
|
+
/**
 * Public entry point for captioning several images.
 *
 * @param {string[]} imagePaths - Paths of the images.
 * @param {object} [options] - Overrides merged on top of
 *   `DEFAULT_IMAGE_TO_TEXT_OPTIONS`.
 * @returns {Promise<object[]>} The result of `generateImageDescriptionsBatch`.
 */
export async function generateImageDescriptionsForFiles(imagePaths, options) {
    const mergedOptions = { ...DEFAULT_IMAGE_TO_TEXT_OPTIONS, ...options };
    return generateImageDescriptionsBatch(imagePaths, mergedOptions);
}
|
|
938
|
+
/**
 * Public entry point for reading the metadata of a single image file.
 *
 * @param {string} imagePath - Path of the image.
 * @returns {Promise<object>} The result of `extractImageMetadata`.
 */
export async function extractImageMetadataForFile(imagePath) {
    const metadata = await extractImageMetadata(imagePath);
    return metadata;
}
|
|
944
|
+
/**
 * Public entry point for reading the metadata of several image files, one
 * after another. A failure for one file is recorded in its entry and does not
 * stop the remaining files.
 *
 * @param {string[]} imagePaths - Paths of the images.
 * @returns {Promise<Array<{path: string, metadata?: object, error?: string}>>}
 *   One entry per input path, in input order.
 */
export async function extractImageMetadataForFiles(imagePaths) {
    const outcomes = [];
    for (const imagePath of imagePaths) {
        const entry = { path: imagePath };
        try {
            entry.metadata = await extractImageMetadata(imagePath);
        }
        catch (error) {
            entry.error = error instanceof Error ? error.message : String(error);
        }
        outcomes.push(entry);
    }
    return outcomes;
}
|
|
963
|
+
//# sourceMappingURL=file-processor.js.map
|