rag-lite-ts 2.1.1 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{core → cjs/core}/model-validator.js +1 -1
- package/dist/{core → cjs/core}/vector-index.js +4 -2
- package/dist/esm/api-errors.d.ts +90 -0
- package/dist/esm/api-errors.js +320 -0
- package/dist/esm/cli/indexer.d.ts +11 -0
- package/dist/esm/cli/indexer.js +471 -0
- package/dist/esm/cli/search.d.ts +7 -0
- package/dist/esm/cli/search.js +332 -0
- package/dist/esm/cli.d.ts +3 -0
- package/dist/esm/cli.js +529 -0
- package/dist/esm/config.d.ts +51 -0
- package/dist/esm/config.js +79 -0
- package/dist/esm/core/abstract-embedder.d.ts +125 -0
- package/dist/esm/core/abstract-embedder.js +264 -0
- package/dist/esm/core/actionable-error-messages.d.ts +60 -0
- package/dist/esm/core/actionable-error-messages.js +397 -0
- package/dist/esm/core/adapters.d.ts +93 -0
- package/dist/esm/core/adapters.js +139 -0
- package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/esm/core/batch-processing-optimizer.js +536 -0
- package/dist/esm/core/binary-index-format.d.ts +78 -0
- package/dist/esm/core/binary-index-format.js +291 -0
- package/dist/esm/core/chunker.d.ts +119 -0
- package/dist/esm/core/chunker.js +73 -0
- package/dist/esm/core/cli-database-utils.d.ts +53 -0
- package/dist/esm/core/cli-database-utils.js +239 -0
- package/dist/esm/core/config.d.ts +102 -0
- package/dist/esm/core/config.js +247 -0
- package/dist/esm/core/content-errors.d.ts +111 -0
- package/dist/esm/core/content-errors.js +362 -0
- package/dist/esm/core/content-manager.d.ts +335 -0
- package/dist/esm/core/content-manager.js +1476 -0
- package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
- package/dist/esm/core/content-performance-optimizer.js +516 -0
- package/dist/esm/core/content-resolver.d.ts +104 -0
- package/dist/esm/core/content-resolver.js +285 -0
- package/dist/esm/core/cross-modal-search.d.ts +164 -0
- package/dist/esm/core/cross-modal-search.js +342 -0
- package/dist/esm/core/database-connection-manager.d.ts +109 -0
- package/dist/esm/core/database-connection-manager.js +310 -0
- package/dist/esm/core/db.d.ts +213 -0
- package/dist/esm/core/db.js +895 -0
- package/dist/esm/core/embedder-factory.d.ts +154 -0
- package/dist/esm/core/embedder-factory.js +311 -0
- package/dist/esm/core/error-handler.d.ts +112 -0
- package/dist/esm/core/error-handler.js +239 -0
- package/dist/esm/core/index.d.ts +59 -0
- package/dist/esm/core/index.js +69 -0
- package/dist/esm/core/ingestion.d.ts +202 -0
- package/dist/esm/core/ingestion.js +901 -0
- package/dist/esm/core/interfaces.d.ts +408 -0
- package/dist/esm/core/interfaces.js +106 -0
- package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
- package/dist/esm/core/lazy-dependency-loader.js +435 -0
- package/dist/esm/core/mode-detection-service.d.ts +150 -0
- package/dist/esm/core/mode-detection-service.js +565 -0
- package/dist/esm/core/mode-model-validator.d.ts +92 -0
- package/dist/esm/core/mode-model-validator.js +203 -0
- package/dist/esm/core/model-registry.d.ts +116 -0
- package/dist/esm/core/model-registry.js +411 -0
- package/dist/esm/core/model-validator.d.ts +217 -0
- package/dist/esm/core/model-validator.js +782 -0
- package/dist/esm/core/path-manager.d.ts +47 -0
- package/dist/esm/core/path-manager.js +71 -0
- package/dist/esm/core/raglite-paths.d.ts +121 -0
- package/dist/esm/core/raglite-paths.js +145 -0
- package/dist/esm/core/reranking-config.d.ts +42 -0
- package/dist/esm/core/reranking-config.js +147 -0
- package/dist/esm/core/reranking-factory.d.ts +92 -0
- package/dist/esm/core/reranking-factory.js +410 -0
- package/dist/esm/core/reranking-strategies.d.ts +310 -0
- package/dist/esm/core/reranking-strategies.js +650 -0
- package/dist/esm/core/resource-cleanup.d.ts +163 -0
- package/dist/esm/core/resource-cleanup.js +371 -0
- package/dist/esm/core/resource-manager.d.ts +212 -0
- package/dist/esm/core/resource-manager.js +564 -0
- package/dist/esm/core/search-pipeline.d.ts +111 -0
- package/dist/esm/core/search-pipeline.js +287 -0
- package/dist/esm/core/search.d.ts +141 -0
- package/dist/esm/core/search.js +320 -0
- package/dist/esm/core/streaming-operations.d.ts +145 -0
- package/dist/esm/core/streaming-operations.js +409 -0
- package/dist/esm/core/types.d.ts +66 -0
- package/dist/esm/core/types.js +6 -0
- package/dist/esm/core/universal-embedder.d.ts +177 -0
- package/dist/esm/core/universal-embedder.js +139 -0
- package/dist/esm/core/validation-messages.d.ts +99 -0
- package/dist/esm/core/validation-messages.js +334 -0
- package/dist/esm/core/vector-index.d.ts +72 -0
- package/dist/esm/core/vector-index.js +333 -0
- package/dist/esm/dom-polyfills.d.ts +6 -0
- package/dist/esm/dom-polyfills.js +37 -0
- package/dist/esm/factories/index.d.ts +27 -0
- package/dist/esm/factories/index.js +29 -0
- package/dist/esm/factories/ingestion-factory.d.ts +200 -0
- package/dist/esm/factories/ingestion-factory.js +477 -0
- package/dist/esm/factories/search-factory.d.ts +154 -0
- package/dist/esm/factories/search-factory.js +344 -0
- package/dist/esm/file-processor.d.ts +147 -0
- package/dist/esm/file-processor.js +963 -0
- package/dist/esm/index-manager.d.ts +116 -0
- package/dist/esm/index-manager.js +598 -0
- package/dist/esm/index.d.ts +75 -0
- package/dist/esm/index.js +110 -0
- package/dist/esm/indexer.d.ts +7 -0
- package/dist/esm/indexer.js +54 -0
- package/dist/esm/ingestion.d.ts +63 -0
- package/dist/esm/ingestion.js +124 -0
- package/dist/esm/mcp-server.d.ts +46 -0
- package/dist/esm/mcp-server.js +1820 -0
- package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
- package/dist/esm/multimodal/clip-embedder.js +996 -0
- package/dist/esm/multimodal/index.d.ts +6 -0
- package/dist/esm/multimodal/index.js +6 -0
- package/dist/esm/preprocess.d.ts +19 -0
- package/dist/esm/preprocess.js +203 -0
- package/dist/esm/preprocessors/index.d.ts +17 -0
- package/dist/esm/preprocessors/index.js +38 -0
- package/dist/esm/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/preprocessors/mdx.js +101 -0
- package/dist/esm/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/preprocessors/mermaid.js +329 -0
- package/dist/esm/preprocessors/registry.d.ts +56 -0
- package/dist/esm/preprocessors/registry.js +179 -0
- package/dist/esm/run-error-recovery-tests.d.ts +7 -0
- package/dist/esm/run-error-recovery-tests.js +101 -0
- package/dist/esm/search-standalone.d.ts +7 -0
- package/dist/esm/search-standalone.js +117 -0
- package/dist/esm/search.d.ts +99 -0
- package/dist/esm/search.js +177 -0
- package/dist/esm/test-utils.d.ts +18 -0
- package/dist/esm/test-utils.js +27 -0
- package/dist/esm/text/chunker.d.ts +33 -0
- package/dist/esm/text/chunker.js +279 -0
- package/dist/esm/text/embedder.d.ts +111 -0
- package/dist/esm/text/embedder.js +386 -0
- package/dist/esm/text/index.d.ts +8 -0
- package/dist/esm/text/index.js +9 -0
- package/dist/esm/text/preprocessors/index.d.ts +17 -0
- package/dist/esm/text/preprocessors/index.js +38 -0
- package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/text/preprocessors/mdx.js +101 -0
- package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/text/preprocessors/mermaid.js +330 -0
- package/dist/esm/text/preprocessors/registry.d.ts +56 -0
- package/dist/esm/text/preprocessors/registry.js +180 -0
- package/dist/esm/text/reranker.d.ts +49 -0
- package/dist/esm/text/reranker.js +274 -0
- package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/esm/text/sentence-transformer-embedder.js +340 -0
- package/dist/esm/text/tokenizer.d.ts +22 -0
- package/dist/esm/text/tokenizer.js +64 -0
- package/dist/esm/types.d.ts +83 -0
- package/dist/esm/types.js +3 -0
- package/dist/esm/utils/vector-math.d.ts +31 -0
- package/dist/esm/utils/vector-math.js +70 -0
- package/package.json +30 -12
- /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
- /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
- /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/indexer.js +0 -0
- /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/search.js +0 -0
- /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
- /package/dist/{cli.js → cjs/cli.js} +0 -0
- /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
- /package/dist/{config.js → cjs/config.js} +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
- /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
- /package/dist/{core → cjs/core}/adapters.js +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/binary-index-format.d.ts +0 -0
- /package/dist/{core → cjs/core}/binary-index-format.js +0 -0
- /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
- /package/dist/{core → cjs/core}/chunker.js +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
- /package/dist/{core → cjs/core}/config.d.ts +0 -0
- /package/dist/{core → cjs/core}/config.js +0 -0
- /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-errors.js +0 -0
- /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-manager.js +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-resolver.js +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
- /package/dist/{core → cjs/core}/db.d.ts +0 -0
- /package/dist/{core → cjs/core}/db.js +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
- /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
- /package/dist/{core → cjs/core}/error-handler.js +0 -0
- /package/dist/{core → cjs/core}/index.d.ts +0 -0
- /package/dist/{core → cjs/core}/index.js +0 -0
- /package/dist/{core → cjs/core}/ingestion.d.ts +0 -0
- /package/dist/{core → cjs/core}/ingestion.js +0 -0
- /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
- /package/dist/{core → cjs/core}/interfaces.js +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
- /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
- /package/dist/{core → cjs/core}/model-registry.js +0 -0
- /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.js +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
- /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-config.js +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.js +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
- /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-manager.js +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.js +0 -0
- /package/dist/{core → cjs/core}/search.d.ts +0 -0
- /package/dist/{core → cjs/core}/search.js +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
- /package/dist/{core → cjs/core}/types.d.ts +0 -0
- /package/dist/{core → cjs/core}/types.js +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
- /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/validation-messages.js +0 -0
- /package/dist/{core → cjs/core}/vector-index.d.ts +0 -0
- /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
- /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
- /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/index.js +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.js +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.js +0 -0
- /package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +0 -0
- /package/dist/{file-processor.js → cjs/file-processor.js} +0 -0
- /package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +0 -0
- /package/dist/{index-manager.js → cjs/index-manager.js} +0 -0
- /package/dist/{index.d.ts → cjs/index.d.ts} +0 -0
- /package/dist/{index.js → cjs/index.js} +0 -0
- /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
- /package/dist/{indexer.js → cjs/indexer.js} +0 -0
- /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
- /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
- /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
- /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
- /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
- /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
- /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
- /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
- /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
- /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
- /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
- /package/dist/{search.js → cjs/search.js} +0 -0
- /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
- /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
- /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
- /package/dist/{text → cjs/text}/chunker.js +0 -0
- /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/embedder.js +0 -0
- /package/dist/{text → cjs/text}/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
- /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
- /package/dist/{text → cjs/text}/reranker.js +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
- /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
- /package/dist/{text → cjs/text}/tokenizer.js +0 -0
- /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
- /package/dist/{types.js → cjs/types.js} +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
import '../dom-polyfills.js';
|
|
2
|
+
import { createHash } from 'crypto';
|
|
3
|
+
import { config } from '../core/config.js';
|
|
4
|
+
import { handleError, ErrorCategory, ErrorSeverity, safeExecute } from '../core/error-handler.js';
|
|
5
|
+
import { createModelLoadingError, createInvalidContentError, createMissingDependencyError } from '../core/actionable-error-messages.js';
|
|
6
|
+
/**
 * Allow-list of embedding model identifiers.
 *
 * The EmbeddingEngine constructor rejects any model name that is not listed here.
 */
const SUPPORTED_MODELS = [
    'sentence-transformers/all-MiniLM-L6-v2',
    'Xenova/all-mpnet-base-v2'
];
|
|
13
|
+
/**
|
|
14
|
+
* Embedding engine using transformers.js for generating embeddings
|
|
15
|
+
*/
|
|
16
|
+
export class EmbeddingEngine {
|
|
17
|
+
model = null;
|
|
18
|
+
modelVersion = null;
|
|
19
|
+
modelName;
|
|
20
|
+
batchSize;
|
|
21
|
+
constructor(modelName, batchSize) {
|
|
22
|
+
this.modelName = modelName || config.embedding_model;
|
|
23
|
+
this.batchSize = batchSize || config.batch_size;
|
|
24
|
+
// Validate that the model is supported
|
|
25
|
+
if (!SUPPORTED_MODELS.includes(this.modelName)) {
|
|
26
|
+
throw createModelLoadingError(this.modelName, `Model not in supported list. Supported models: ${SUPPORTED_MODELS.join(', ')}`, { operationContext: 'EmbeddingEngine constructor' });
|
|
27
|
+
}
|
|
28
|
+
console.log(`🤖 EmbeddingEngine initialized with model: ${this.modelName}, batchSize: ${this.batchSize}`);
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Load the embedding model
|
|
32
|
+
* @throws {Error} If model loading fails
|
|
33
|
+
*/
|
|
34
|
+
async loadModel() {
|
|
35
|
+
await safeExecute(async () => {
|
|
36
|
+
console.log(`Loading embedding model: ${this.modelName}`);
|
|
37
|
+
// Ensure DOM polyfills are set up before importing transformers
|
|
38
|
+
if (typeof globalThis.self === 'undefined') {
|
|
39
|
+
globalThis.self = globalThis;
|
|
40
|
+
}
|
|
41
|
+
if (typeof global.self === 'undefined') {
|
|
42
|
+
global.self = global;
|
|
43
|
+
}
|
|
44
|
+
// Additional polyfills that might be needed
|
|
45
|
+
if (typeof globalThis.window === 'undefined') {
|
|
46
|
+
globalThis.window = {};
|
|
47
|
+
}
|
|
48
|
+
if (typeof globalThis.document === 'undefined') {
|
|
49
|
+
globalThis.document = {};
|
|
50
|
+
}
|
|
51
|
+
console.log('Embedder polyfills set up. self is now:', typeof self !== 'undefined' ? 'defined' : 'undefined');
|
|
52
|
+
// Initialize the feature extraction pipeline using dynamic import
|
|
53
|
+
// Let transformers.js handle model caching automatically
|
|
54
|
+
try {
|
|
55
|
+
const { pipeline } = await import('@huggingface/transformers');
|
|
56
|
+
this.model = await pipeline('feature-extraction', this.modelName, {
|
|
57
|
+
cache_dir: config.model_cache_path,
|
|
58
|
+
local_files_only: false,
|
|
59
|
+
dtype: 'fp32' // Explicitly specify dtype to suppress warning
|
|
60
|
+
});
|
|
61
|
+
}
|
|
62
|
+
catch (error) {
|
|
63
|
+
// Enhanced error handling for model download failures
|
|
64
|
+
if (error instanceof Error && (error.message.includes('network') ||
|
|
65
|
+
error.message.includes('download') ||
|
|
66
|
+
error.message.includes('fetch') ||
|
|
67
|
+
error.message.includes('ENOTFOUND') ||
|
|
68
|
+
error.message.includes('ECONNREFUSED') ||
|
|
69
|
+
error.message.includes('timeout'))) {
|
|
70
|
+
throw new Error(`Failed to download model '${this.modelName}'. ` +
|
|
71
|
+
`Check your internet connection or see models/README.md for offline setup instructions.`);
|
|
72
|
+
}
|
|
73
|
+
throw error;
|
|
74
|
+
}
|
|
75
|
+
// Generate model version hash
|
|
76
|
+
this.modelVersion = this.generateModelVersion();
|
|
77
|
+
console.log(`Model loaded successfully. Version: ${this.modelVersion}`);
|
|
78
|
+
}, 'Model Loading', {
|
|
79
|
+
category: ErrorCategory.MODEL,
|
|
80
|
+
severity: ErrorSeverity.FATAL,
|
|
81
|
+
exitCode: 6
|
|
82
|
+
});
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Generate embeddings for a batch of texts
|
|
86
|
+
* @param texts - Array of text strings to embed
|
|
87
|
+
* @returns Promise resolving to array of embedding results
|
|
88
|
+
*/
|
|
89
|
+
async embedBatch(texts) {
|
|
90
|
+
if (!this.model) {
|
|
91
|
+
throw createMissingDependencyError('model', 'object', {
|
|
92
|
+
operationContext: 'embedBatch',
|
|
93
|
+
includeTroubleshooting: true
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
if (texts.length === 0) {
|
|
97
|
+
return [];
|
|
98
|
+
}
|
|
99
|
+
// Split into smaller batches based on configured batch size
|
|
100
|
+
const results = [];
|
|
101
|
+
for (let i = 0; i < texts.length; i += this.batchSize) {
|
|
102
|
+
const batch = texts.slice(i, i + this.batchSize);
|
|
103
|
+
const batchResults = await this.processBatchWithErrorHandling(batch, i);
|
|
104
|
+
results.push(...batchResults);
|
|
105
|
+
}
|
|
106
|
+
return results;
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Process a single batch with error handling for individual chunks
|
|
110
|
+
* @param batch - Array of text strings in this batch
|
|
111
|
+
* @param startIndex - Starting index for this batch in the original array
|
|
112
|
+
* @returns Promise resolving to array of embedding results
|
|
113
|
+
*/
|
|
114
|
+
async processBatchWithErrorHandling(batch, startIndex) {
|
|
115
|
+
return await safeExecute(async () => {
|
|
116
|
+
// Try to process the entire batch first
|
|
117
|
+
const embeddings = await this.model(batch, {
|
|
118
|
+
pooling: 'mean',
|
|
119
|
+
normalize: true
|
|
120
|
+
});
|
|
121
|
+
// Convert to EmbeddingResult format
|
|
122
|
+
const results = [];
|
|
123
|
+
const embeddingData = embeddings.tolist();
|
|
124
|
+
for (let i = 0; i < batch.length; i++) {
|
|
125
|
+
const embedding_id = this.generateEmbeddingId(batch[i], startIndex + i);
|
|
126
|
+
const vector = new Float32Array(embeddingData[i]);
|
|
127
|
+
results.push({
|
|
128
|
+
embedding_id,
|
|
129
|
+
vector,
|
|
130
|
+
contentType: 'text'
|
|
131
|
+
});
|
|
132
|
+
}
|
|
133
|
+
return results;
|
|
134
|
+
}, `Batch Embedding (${batch.length} chunks)`, {
|
|
135
|
+
category: ErrorCategory.EMBEDDING,
|
|
136
|
+
severity: ErrorSeverity.ERROR,
|
|
137
|
+
skipError: true,
|
|
138
|
+
fallbackValue: []
|
|
139
|
+
}) || await this.fallbackToIndividualProcessing(batch, startIndex);
|
|
140
|
+
}
|
|
141
|
+
/**
|
|
142
|
+
* Fallback to individual chunk processing when batch fails
|
|
143
|
+
*/
|
|
144
|
+
async fallbackToIndividualProcessing(batch, startIndex) {
|
|
145
|
+
handleError(`Batch processing failed for ${batch.length} chunks, falling back to individual processing`, 'Embedding Batch Processing', {
|
|
146
|
+
category: ErrorCategory.EMBEDDING,
|
|
147
|
+
severity: ErrorSeverity.WARNING,
|
|
148
|
+
skipError: true
|
|
149
|
+
});
|
|
150
|
+
const results = [];
|
|
151
|
+
for (let i = 0; i < batch.length; i++) {
|
|
152
|
+
const singleResult = await safeExecute(() => this.processSingleChunk(batch[i], startIndex + i), `Individual Chunk Embedding (${startIndex + i})`, {
|
|
153
|
+
category: ErrorCategory.EMBEDDING,
|
|
154
|
+
severity: ErrorSeverity.WARNING,
|
|
155
|
+
skipError: true
|
|
156
|
+
});
|
|
157
|
+
if (singleResult) {
|
|
158
|
+
results.push(singleResult);
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
return results;
|
|
162
|
+
}
|
|
163
|
+
/**
|
|
164
|
+
* Process a single chunk with error handling
|
|
165
|
+
* @param text - Text to embed
|
|
166
|
+
* @param index - Index of this chunk
|
|
167
|
+
* @returns Promise resolving to embedding result or null if failed
|
|
168
|
+
*/
|
|
169
|
+
async processSingleChunk(text, index) {
|
|
170
|
+
try {
|
|
171
|
+
const embeddings = await this.model([text], {
|
|
172
|
+
pooling: 'mean',
|
|
173
|
+
normalize: true
|
|
174
|
+
});
|
|
175
|
+
const embeddingData = embeddings.tolist();
|
|
176
|
+
const embedding_id = this.generateEmbeddingId(text, index);
|
|
177
|
+
const vector = new Float32Array(embeddingData[0]);
|
|
178
|
+
return {
|
|
179
|
+
embedding_id,
|
|
180
|
+
vector,
|
|
181
|
+
contentType: 'text'
|
|
182
|
+
};
|
|
183
|
+
}
|
|
184
|
+
catch (error) {
|
|
185
|
+
// Return null to indicate failure
|
|
186
|
+
throw error;
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
/**
|
|
190
|
+
* Generate embedding for a single text
|
|
191
|
+
* @param text - Text string to embed
|
|
192
|
+
* @returns Promise resolving to embedding result
|
|
193
|
+
*/
|
|
194
|
+
async embedSingle(text) {
|
|
195
|
+
const results = await this.embedBatch([text]);
|
|
196
|
+
if (results.length === 0) {
|
|
197
|
+
throw createInvalidContentError('text', 'empty', {
|
|
198
|
+
operationContext: 'embedText'
|
|
199
|
+
});
|
|
200
|
+
}
|
|
201
|
+
return results[0];
|
|
202
|
+
}
|
|
203
|
+
/**
|
|
204
|
+
* Generate embeddings for document chunks with progress logging
|
|
205
|
+
* Optimized for large document ingestion with batch processing
|
|
206
|
+
* @param chunks - Array of text chunks from documents
|
|
207
|
+
* @returns Promise resolving to array of embedding results
|
|
208
|
+
*/
|
|
209
|
+
async embedDocumentBatch(chunks) {
|
|
210
|
+
if (!this.model) {
|
|
211
|
+
throw new Error('Model not loaded. Call loadModel() first.');
|
|
212
|
+
}
|
|
213
|
+
if (chunks.length === 0) {
|
|
214
|
+
return [];
|
|
215
|
+
}
|
|
216
|
+
console.log(`Processing ${chunks.length} chunk${chunks.length === 1 ? '' : 's'} in batches of ${this.batchSize}...`);
|
|
217
|
+
const results = [];
|
|
218
|
+
const totalBatches = Math.ceil(chunks.length / this.batchSize);
|
|
219
|
+
let processedChunks = 0;
|
|
220
|
+
let skippedChunks = 0;
|
|
221
|
+
for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
|
|
222
|
+
const startIdx = batchIndex * this.batchSize;
|
|
223
|
+
const endIdx = Math.min(startIdx + this.batchSize, chunks.length);
|
|
224
|
+
const batch = chunks.slice(startIdx, endIdx);
|
|
225
|
+
try {
|
|
226
|
+
const batchResults = await this.processBatchWithErrorHandling(batch, startIdx);
|
|
227
|
+
results.push(...batchResults);
|
|
228
|
+
processedChunks += batchResults.length;
|
|
229
|
+
skippedChunks += (batch.length - batchResults.length);
|
|
230
|
+
// Progress logging - more frequent updates for better user experience
|
|
231
|
+
const progressInterval = Math.max(1, Math.floor(totalBatches / 20)); // Show progress every 5%
|
|
232
|
+
if ((batchIndex + 1) % progressInterval === 0 || batchIndex === totalBatches - 1) {
|
|
233
|
+
const percentage = Math.round(((batchIndex + 1) / totalBatches) * 100);
|
|
234
|
+
console.log(`Processed ${processedChunks} of ${chunks.length} chunks (${percentage}%)${skippedChunks > 0 ? ` - ${skippedChunks} skipped` : ''}`);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
catch (error) {
|
|
238
|
+
console.error(`Failed to process batch ${batchIndex + 1}/${totalBatches}:`, error instanceof Error ? error.message : String(error));
|
|
239
|
+
skippedChunks += batch.length;
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
if (skippedChunks > 0) {
|
|
243
|
+
console.log(`✓ Embedding complete: ${processedChunks} successful, ${skippedChunks} skipped due to errors`);
|
|
244
|
+
}
|
|
245
|
+
else {
|
|
246
|
+
console.log(`✓ Embedding complete: ${processedChunks} chunks processed successfully`);
|
|
247
|
+
}
|
|
248
|
+
return results;
|
|
249
|
+
}
|
|
250
|
+
/**
|
|
251
|
+
* Get the current model version identifier
|
|
252
|
+
* @returns Model version string
|
|
253
|
+
*/
|
|
254
|
+
getModelVersion() {
|
|
255
|
+
if (!this.modelVersion) {
|
|
256
|
+
throw new Error('Model not loaded. Call loadModel() first.');
|
|
257
|
+
}
|
|
258
|
+
return this.modelVersion;
|
|
259
|
+
}
|
|
260
|
+
/**
|
|
261
|
+
* Check if the model is loaded
|
|
262
|
+
* @returns True if model is loaded
|
|
263
|
+
*/
|
|
264
|
+
isLoaded() {
|
|
265
|
+
return this.model !== null;
|
|
266
|
+
}
|
|
267
|
+
/**
|
|
268
|
+
* Get the model name
|
|
269
|
+
* @returns Model name string
|
|
270
|
+
*/
|
|
271
|
+
getModelName() {
|
|
272
|
+
return this.modelName;
|
|
273
|
+
}
|
|
274
|
+
/**
|
|
275
|
+
* Get the batch size
|
|
276
|
+
* @returns Batch size number
|
|
277
|
+
*/
|
|
278
|
+
getBatchSize() {
|
|
279
|
+
return this.batchSize;
|
|
280
|
+
}
|
|
281
|
+
/**
|
|
282
|
+
* Generate a deterministic model version identifier
|
|
283
|
+
* Uses model name and configuration for consistent versioning
|
|
284
|
+
* @returns Model version string
|
|
285
|
+
*/
|
|
286
|
+
generateModelVersion() {
|
|
287
|
+
// Create a deterministic hash based on model name and configuration
|
|
288
|
+
// This ensures the same model configuration always produces the same version
|
|
289
|
+
const configData = JSON.stringify({
|
|
290
|
+
model: this.modelName,
|
|
291
|
+
// Add other relevant config that affects embeddings
|
|
292
|
+
quantized: false,
|
|
293
|
+
revision: 'main'
|
|
294
|
+
});
|
|
295
|
+
const hash = createHash('sha256').update(configData).digest('hex').substring(0, 16);
|
|
296
|
+
return `${this.modelName.replace('/', '_')}_${hash}`;
|
|
297
|
+
}
|
|
298
|
+
/**
|
|
299
|
+
* Generate a deterministic embedding ID for a text chunk
|
|
300
|
+
* @param text - The text content
|
|
301
|
+
* @param index - Index in the batch
|
|
302
|
+
* @returns Deterministic embedding ID
|
|
303
|
+
*/
|
|
304
|
+
generateEmbeddingId(text, index) {
|
|
305
|
+
// Create deterministic ID based on content hash only
|
|
306
|
+
// This ensures the same text always gets the same ID regardless of processing order
|
|
307
|
+
const contentHash = createHash('sha256').update(text.trim()).digest('hex');
|
|
308
|
+
return contentHash.substring(0, 32);
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
/**
|
|
312
|
+
* Singleton instance for the embedding engine
|
|
313
|
+
* Ensures model is loaded only once across the application
|
|
314
|
+
*/
|
|
315
|
+
let embeddingEngineInstance = null;
|
|
316
|
+
/**
|
|
317
|
+
* Get the singleton embedding engine instance
|
|
318
|
+
* @param modelName - Optional model name override
|
|
319
|
+
* @param batchSize - Optional batch size override
|
|
320
|
+
* @returns EmbeddingEngine instance
|
|
321
|
+
*/
|
|
322
|
+
export function getEmbeddingEngine(modelName, batchSize) {
|
|
323
|
+
// Always create a new instance if specific parameters are provided
|
|
324
|
+
// This ensures we don't use cached instances with wrong configuration
|
|
325
|
+
if (modelName || batchSize) {
|
|
326
|
+
embeddingEngineInstance = new EmbeddingEngine(modelName, batchSize);
|
|
327
|
+
}
|
|
328
|
+
else if (!embeddingEngineInstance) {
|
|
329
|
+
embeddingEngineInstance = new EmbeddingEngine();
|
|
330
|
+
}
|
|
331
|
+
return embeddingEngineInstance;
|
|
332
|
+
}
|
|
333
|
+
/**
|
|
334
|
+
* Initialize the embedding engine and load the model
|
|
335
|
+
* @param modelName - Optional model name override
|
|
336
|
+
* @param batchSize - Optional batch size override
|
|
337
|
+
* @returns Promise resolving to the loaded embedding engine
|
|
338
|
+
*/
|
|
339
|
+
export async function initializeEmbeddingEngine(modelName, batchSize) {
|
|
340
|
+
const engine = getEmbeddingEngine(modelName, batchSize);
|
|
341
|
+
if (!engine.isLoaded()) {
|
|
342
|
+
await engine.loadModel();
|
|
343
|
+
}
|
|
344
|
+
return engine;
|
|
345
|
+
}
|
|
346
|
+
/**
|
|
347
|
+
|
|
348
|
+
* Create an EmbedFunction implementation using the text embedding engine
|
|
349
|
+
* This function implements the core EmbedFunction interface for dependency injection
|
|
350
|
+
* @param modelName - Optional model name override
|
|
351
|
+
* @param batchSize - Optional batch size override
|
|
352
|
+
* @returns EmbedFunction that can be injected into core components
|
|
353
|
+
*/
|
|
354
|
+
export function createTextEmbedFunction(modelName, batchSize) {
|
|
355
|
+
let engine = null;
|
|
356
|
+
const embedFunction = async (query, contentType) => {
|
|
357
|
+
// Only support text content type
|
|
358
|
+
if (contentType && contentType !== 'text') {
|
|
359
|
+
throw new Error(`Text embedder only supports 'text' content type, got: ${contentType}`);
|
|
360
|
+
}
|
|
361
|
+
// Initialize engine if not already done
|
|
362
|
+
if (!engine) {
|
|
363
|
+
engine = await initializeEmbeddingEngine(modelName, batchSize);
|
|
364
|
+
}
|
|
365
|
+
// Use the existing embedSingle method
|
|
366
|
+
const result = await engine.embedSingle(query);
|
|
367
|
+
// Ensure contentType is present (should already be included from embedSingle)
|
|
368
|
+
return {
|
|
369
|
+
...result,
|
|
370
|
+
contentType: result.contentType || 'text'
|
|
371
|
+
};
|
|
372
|
+
};
|
|
373
|
+
return embedFunction;
|
|
374
|
+
}
|
|
375
|
+
// =============================================================================
|
|
376
|
+
// REMOVED: createTextEmbedder() factory object
|
|
377
|
+
// =============================================================================
|
|
378
|
+
// The createTextEmbedder() function has been removed as it was redundant.
|
|
379
|
+
// It was just a wrapper around initializeEmbeddingEngine() that provided no
|
|
380
|
+
// additional value over using the engine directly or using createEmbedder().
|
|
381
|
+
//
|
|
382
|
+
// Migration guide:
|
|
383
|
+
// - For public API: Use createEmbedder() from core/embedder-factory.ts
|
|
384
|
+
// - For dependency injection: Use createTextEmbedFunction()
|
|
385
|
+
// - For direct access: Use initializeEmbeddingEngine() or new EmbeddingEngine()
|
|
386
|
+
//# sourceMappingURL=embedder.js.map
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export { EmbeddingEngine, getEmbeddingEngine, initializeEmbeddingEngine, createTextEmbedFunction } from './embedder.js';
|
|
2
|
+
export { CrossEncoderReranker, createTextRerankFunction } from './reranker.js';
|
|
3
|
+
export { countTokens, getTokenizer, resetTokenizer } from './tokenizer.js';
|
|
4
|
+
export { chunkDocument, type Chunk, type Document } from '../core/chunker.js';
|
|
5
|
+
export { type ChunkConfig } from '../core/chunker.js';
|
|
6
|
+
export { SentenceTransformerEmbedder } from './sentence-transformer-embedder.js';
|
|
7
|
+
export * from './preprocessors/index.js';
|
|
8
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
// Text implementation layer exports
|
|
2
|
+
export { EmbeddingEngine, getEmbeddingEngine, initializeEmbeddingEngine, createTextEmbedFunction } from './embedder.js';
|
|
3
|
+
export { CrossEncoderReranker, createTextRerankFunction } from './reranker.js';
|
|
4
|
+
export { countTokens, getTokenizer, resetTokenizer } from './tokenizer.js';
|
|
5
|
+
export { chunkDocument } from '../core/chunker.js';
|
|
6
|
+
export { SentenceTransformerEmbedder } from './sentence-transformer-embedder.js';
|
|
7
|
+
// Re-export preprocessors
|
|
8
|
+
export * from './preprocessors/index.js';
|
|
9
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { PreprocessorRegistry } from './registry.js';
|
|
2
|
+
export { PreprocessorRegistry, ContentTypeDetector } from './registry.js';
|
|
3
|
+
export { MdxPreprocessor } from './mdx.js';
|
|
4
|
+
export { MermaidPreprocessor } from './mermaid.js';
|
|
5
|
+
/**
|
|
6
|
+
* Global preprocessor registry instance
|
|
7
|
+
*/
|
|
8
|
+
export declare const preprocessorRegistry: PreprocessorRegistry;
|
|
9
|
+
/**
|
|
10
|
+
* Validate that all required preprocessors are available in the registry
|
|
11
|
+
*/
|
|
12
|
+
export declare function validatePreprocessorConfiguration(requiredPreprocessors: string[]): void;
|
|
13
|
+
/**
|
|
14
|
+
* Get all available preprocessor names
|
|
15
|
+
*/
|
|
16
|
+
export declare function getAvailablePreprocessors(): string[];
|
|
17
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { PreprocessorRegistry } from './registry.js';
|
|
2
|
+
import { MdxPreprocessor } from './mdx.js';
|
|
3
|
+
import { MermaidPreprocessor } from './mermaid.js';
|
|
4
|
+
// Export all preprocessor classes
|
|
5
|
+
export { PreprocessorRegistry, ContentTypeDetector } from './registry.js';
|
|
6
|
+
export { MdxPreprocessor } from './mdx.js';
|
|
7
|
+
export { MermaidPreprocessor } from './mermaid.js';
|
|
8
|
+
/**
|
|
9
|
+
* Create and initialize the global preprocessor registry
|
|
10
|
+
*/
|
|
11
|
+
function createPreprocessorRegistry() {
|
|
12
|
+
const registry = new PreprocessorRegistry();
|
|
13
|
+
// Register built-in preprocessors
|
|
14
|
+
registry.register('mdx', new MdxPreprocessor());
|
|
15
|
+
registry.register('mermaid', new MermaidPreprocessor());
|
|
16
|
+
return registry;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Global preprocessor registry instance
|
|
20
|
+
*/
|
|
21
|
+
export const preprocessorRegistry = createPreprocessorRegistry();
|
|
22
|
+
/**
|
|
23
|
+
* Validate that all required preprocessors are available in the registry
|
|
24
|
+
*/
|
|
25
|
+
export function validatePreprocessorConfiguration(requiredPreprocessors) {
|
|
26
|
+
const validation = preprocessorRegistry.validatePreprocessors(requiredPreprocessors);
|
|
27
|
+
if (!validation.valid) {
|
|
28
|
+
const missingList = validation.missing.join(', ');
|
|
29
|
+
throw new Error(`Missing required preprocessors: ${missingList}. Available: ${preprocessorRegistry.getRegisteredNames().join(', ')}`);
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
/**
 * List the names of the preprocessors registered in the global registry.
 */
export function getAvailablePreprocessors() {
    const registeredNames = preprocessorRegistry.getRegisteredNames();
    return registeredNames;
}
|
|
38
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { Preprocessor, PreprocessorOptions } from '../../types.js';
|
|
2
|
+
/**
 * Preprocessor for JSX embedded in MDX (Markdown with JSX) content.
 * Carries over the earlier cleanMdxContent logic, with behavior chosen by the processing mode.
 */
export declare class MdxPreprocessor implements Preprocessor {
    /**
     * Report whether this preprocessor handles the given language identifier.
     * The implementation accepts only the exact identifier 'mdx'.
     */
    appliesTo(language: string): boolean;
    /**
     * Transform MDX content according to the mode given in the options.
     */
    process(content: string, options: PreprocessorOptions): string;
    /**
     * Remove JSX constructs from the content altogether (from the cleanMdxContent logic).
     */
    private stripJsx;
    /**
     * Substitute descriptive placeholders for JSX constructs.
     */
    private replaceWithPlaceholders;
}
|
|
25
|
+
//# sourceMappingURL=mdx.d.ts.map
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import { ContentTypeDetector } from './registry.js';
|
|
2
|
+
/**
|
|
3
|
+
* MDX preprocessor for handling JSX content in Markdown files
|
|
4
|
+
* Ports the existing cleanMdxContent logic with mode-aware behavior
|
|
5
|
+
*/
|
|
6
|
+
export class MdxPreprocessor {
|
|
7
|
+
/**
|
|
8
|
+
* Check if this preprocessor applies to the given language/content type
|
|
9
|
+
* Applies to .mdx files and content with JSX syntax
|
|
10
|
+
*/
|
|
11
|
+
appliesTo(language) {
|
|
12
|
+
return language === 'mdx';
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Process MDX content based on the specified mode
|
|
16
|
+
*/
|
|
17
|
+
process(content, options) {
|
|
18
|
+
// Only process if content actually contains JSX
|
|
19
|
+
if (!ContentTypeDetector.hasJsxContent(content)) {
|
|
20
|
+
return content;
|
|
21
|
+
}
|
|
22
|
+
switch (options.mode) {
|
|
23
|
+
case 'strip':
|
|
24
|
+
return this.stripJsx(content);
|
|
25
|
+
case 'keep':
|
|
26
|
+
return content; // Keep JSX as-is
|
|
27
|
+
case 'placeholder':
|
|
28
|
+
return this.replaceWithPlaceholders(content);
|
|
29
|
+
default:
|
|
30
|
+
console.log(`Unknown MDX processing mode: ${options.mode}, using placeholder`);
|
|
31
|
+
return this.replaceWithPlaceholders(content);
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Strip JSX content entirely - ported from cleanMdxContent logic
|
|
36
|
+
*/
|
|
37
|
+
stripJsx(content) {
|
|
38
|
+
let cleaned = content;
|
|
39
|
+
// Remove JSX import statements (requirement 11.1)
|
|
40
|
+
// Matches: import ... from '...' or import ... from "..."
|
|
41
|
+
cleaned = cleaned.replace(/^import\s+.*?from\s+['"][^'"]*['"];?\s*$/gm, '');
|
|
42
|
+
// Remove JSX export statements (requirement 11.2)
|
|
43
|
+
// Handle both single-line and multi-line exports
|
|
44
|
+
// Multi-line function exports: export default function() { ... }
|
|
45
|
+
cleaned = cleaned.replace(/^export\s+default\s+function[^{]*\{[\s\S]*?\n\}\s*$/gm, '');
|
|
46
|
+
// Object exports: export const metadata = { ... }
|
|
47
|
+
cleaned = cleaned.replace(/^export\s+const\s+[^=]*=\s*\{[\s\S]*?\}\s*;?\s*$/gm, '');
|
|
48
|
+
// Single line exports: export const x = ...; or export default ...
|
|
49
|
+
cleaned = cleaned.replace(/^export\s+(?:default\s+)?(?:const|let|var|function|class)\s+[^;{]*;?\s*$/gm, '');
|
|
50
|
+
// Simple exports: export default Component
|
|
51
|
+
cleaned = cleaned.replace(/^export\s+default\s+[^;{]*;?\s*$/gm, '');
|
|
52
|
+
// Remove JSX components (requirements 11.3, 11.4)
|
|
53
|
+
// Self-closing tags: <Component />
|
|
54
|
+
cleaned = cleaned.replace(/<[A-Z][a-zA-Z0-9]*[^>]*\/>/g, '');
|
|
55
|
+
// Opening and closing tags with content: <Component>content</Component>
|
|
56
|
+
// This handles nested components by replacing the outermost ones first
|
|
57
|
+
let previousLength;
|
|
58
|
+
do {
|
|
59
|
+
previousLength = cleaned.length;
|
|
60
|
+
cleaned = cleaned.replace(/<[A-Z][a-zA-Z0-9]*[^>]*>.*?<\/[A-Z][a-zA-Z0-9]*>/gs, '');
|
|
61
|
+
} while (cleaned.length !== previousLength);
|
|
62
|
+
// Clean up multiple consecutive newlines and trim
|
|
63
|
+
cleaned = cleaned.replace(/\n\s*\n\s*\n/g, '\n\n').trim();
|
|
64
|
+
// Ensure we never return empty content (requirement 6.4)
|
|
65
|
+
if (!cleaned.trim()) {
|
|
66
|
+
return '[content removed]';
|
|
67
|
+
}
|
|
68
|
+
return cleaned;
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* Replace JSX with descriptive placeholders
|
|
72
|
+
*/
|
|
73
|
+
replaceWithPlaceholders(content) {
|
|
74
|
+
let cleaned = content;
|
|
75
|
+
// Replace JSX import statements
|
|
76
|
+
cleaned = cleaned.replace(/^import\s+.*?from\s+['"][^'"]*['"];?\s*$/gm, '[import removed]');
|
|
77
|
+
// Replace JSX export statements
|
|
78
|
+
// Multi-line function exports: export default function() { ... }
|
|
79
|
+
cleaned = cleaned.replace(/^export\s+default\s+function[^{]*\{[\s\S]*?\n\}\s*$/gm, '[export removed]');
|
|
80
|
+
// Object exports: export const metadata = { ... }
|
|
81
|
+
cleaned = cleaned.replace(/^export\s+const\s+[^=]*=\s*\{[\s\S]*?\}\s*;?\s*$/gm, '[export removed]');
|
|
82
|
+
// Single line exports: export const x = ...; or export default ...
|
|
83
|
+
cleaned = cleaned.replace(/^export\s+(?:default\s+)?(?:const|let|var|function|class)\s+[^;{]*;?\s*$/gm, '[export removed]');
|
|
84
|
+
// Simple exports: export default Component
|
|
85
|
+
cleaned = cleaned.replace(/^export\s+default\s+[^;{]*;?\s*$/gm, '[export removed]');
|
|
86
|
+
// Replace JSX components with placeholder
|
|
87
|
+
// Self-closing tags: <Component />
|
|
88
|
+
cleaned = cleaned.replace(/<[A-Z][a-zA-Z0-9]*[^>]*\/>/g, '[component removed]');
|
|
89
|
+
// Opening and closing tags with content: <Component>content</Component>
|
|
90
|
+
// This handles nested components by replacing the outermost ones first
|
|
91
|
+
let previousLength;
|
|
92
|
+
do {
|
|
93
|
+
previousLength = cleaned.length;
|
|
94
|
+
cleaned = cleaned.replace(/<[A-Z][a-zA-Z0-9]*[^>]*>.*?<\/[A-Z][a-zA-Z0-9]*>/gs, '[component removed]');
|
|
95
|
+
} while (cleaned.length !== previousLength);
|
|
96
|
+
// Clean up multiple consecutive newlines and trim
|
|
97
|
+
cleaned = cleaned.replace(/\n\s*\n\s*\n/g, '\n\n').trim();
|
|
98
|
+
return cleaned;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
//# sourceMappingURL=mdx.js.map
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { Preprocessor, PreprocessorOptions } from '../../types.js';
|
|
2
|
+
/**
 * Preprocessor for Mermaid diagrams found in Markdown files.
 * Offers strip, extract, and placeholder modes.
 */
export declare class MermaidPreprocessor implements Preprocessor {
    /**
     * Report whether this preprocessor handles the given language/content type.
     * Intended for mermaid code blocks and content with Mermaid syntax.
     */
    appliesTo(language: string): boolean;
    /**
     * Transform Mermaid content according to the mode given in the options.
     */
    process(content: string, options: PreprocessorOptions): string;
    /**
     * Remove Mermaid diagrams altogether.
     */
    private stripMermaid;
    /**
     * Substitute descriptive placeholders for Mermaid diagrams.
     */
    private replaceWithPlaceholders;
    /**
     * Pull semantic information out of Mermaid diagrams: diagram edges are
     * turned into plain text, while styling and layout instructions are ignored.
     */
    private extractMermaidEdges;
    /**
     * Collect the edges and relationships of a single Mermaid diagram,
     * ignoring styling, layout, and formatting instructions.
     */
    private extractEdgesFromDiagram;
    /**
     * Scan all lines to build a mapping from node IDs to their labels.
     */
    private buildNodeLabelMap;
    /**
     * Tell whether a line is a layout instruction (such lines are ignored).
     */
    private isLayoutInstruction;
    /**
     * Tell whether a line is a styling instruction (such lines are ignored).
     */
    private isStyleInstruction;
    /**
     * Pull the meaningful label out of a node definition.
     * Examples: A[Start] -> "Start", B{Decision} -> "Decision", C((End)) -> "End", D -> "D"
     */
    private extractNodeLabel;
    /**
     * Pull edge information out of a single line.
     */
    private extractEdgeFromLine;
    /**
     * Interpret flowchart connector symbols.
     */
    private interpretConnector;
    /**
     * Interpret class diagram connector symbols.
     */
    private interpretClassConnector;
    /**
     * Interpret ER diagram connector symbols.
     */
    private interpretERConnector;
}
|
|
68
|
+
//# sourceMappingURL=mermaid.d.ts.map
|