rag-lite-ts 2.1.1 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{core → cjs/core}/model-validator.js +1 -1
- package/dist/{core → cjs/core}/vector-index.js +4 -2
- package/dist/esm/api-errors.d.ts +90 -0
- package/dist/esm/api-errors.js +320 -0
- package/dist/esm/cli/indexer.d.ts +11 -0
- package/dist/esm/cli/indexer.js +471 -0
- package/dist/esm/cli/search.d.ts +7 -0
- package/dist/esm/cli/search.js +332 -0
- package/dist/esm/cli.d.ts +3 -0
- package/dist/esm/cli.js +529 -0
- package/dist/esm/config.d.ts +51 -0
- package/dist/esm/config.js +79 -0
- package/dist/esm/core/abstract-embedder.d.ts +125 -0
- package/dist/esm/core/abstract-embedder.js +264 -0
- package/dist/esm/core/actionable-error-messages.d.ts +60 -0
- package/dist/esm/core/actionable-error-messages.js +397 -0
- package/dist/esm/core/adapters.d.ts +93 -0
- package/dist/esm/core/adapters.js +139 -0
- package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/esm/core/batch-processing-optimizer.js +536 -0
- package/dist/esm/core/binary-index-format.d.ts +78 -0
- package/dist/esm/core/binary-index-format.js +291 -0
- package/dist/esm/core/chunker.d.ts +119 -0
- package/dist/esm/core/chunker.js +73 -0
- package/dist/esm/core/cli-database-utils.d.ts +53 -0
- package/dist/esm/core/cli-database-utils.js +239 -0
- package/dist/esm/core/config.d.ts +102 -0
- package/dist/esm/core/config.js +247 -0
- package/dist/esm/core/content-errors.d.ts +111 -0
- package/dist/esm/core/content-errors.js +362 -0
- package/dist/esm/core/content-manager.d.ts +335 -0
- package/dist/esm/core/content-manager.js +1476 -0
- package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
- package/dist/esm/core/content-performance-optimizer.js +516 -0
- package/dist/esm/core/content-resolver.d.ts +104 -0
- package/dist/esm/core/content-resolver.js +285 -0
- package/dist/esm/core/cross-modal-search.d.ts +164 -0
- package/dist/esm/core/cross-modal-search.js +342 -0
- package/dist/esm/core/database-connection-manager.d.ts +109 -0
- package/dist/esm/core/database-connection-manager.js +310 -0
- package/dist/esm/core/db.d.ts +213 -0
- package/dist/esm/core/db.js +895 -0
- package/dist/esm/core/embedder-factory.d.ts +154 -0
- package/dist/esm/core/embedder-factory.js +311 -0
- package/dist/esm/core/error-handler.d.ts +112 -0
- package/dist/esm/core/error-handler.js +239 -0
- package/dist/esm/core/index.d.ts +59 -0
- package/dist/esm/core/index.js +69 -0
- package/dist/esm/core/ingestion.d.ts +202 -0
- package/dist/esm/core/ingestion.js +901 -0
- package/dist/esm/core/interfaces.d.ts +408 -0
- package/dist/esm/core/interfaces.js +106 -0
- package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
- package/dist/esm/core/lazy-dependency-loader.js +435 -0
- package/dist/esm/core/mode-detection-service.d.ts +150 -0
- package/dist/esm/core/mode-detection-service.js +565 -0
- package/dist/esm/core/mode-model-validator.d.ts +92 -0
- package/dist/esm/core/mode-model-validator.js +203 -0
- package/dist/esm/core/model-registry.d.ts +116 -0
- package/dist/esm/core/model-registry.js +411 -0
- package/dist/esm/core/model-validator.d.ts +217 -0
- package/dist/esm/core/model-validator.js +782 -0
- package/dist/esm/core/path-manager.d.ts +47 -0
- package/dist/esm/core/path-manager.js +71 -0
- package/dist/esm/core/raglite-paths.d.ts +121 -0
- package/dist/esm/core/raglite-paths.js +145 -0
- package/dist/esm/core/reranking-config.d.ts +42 -0
- package/dist/esm/core/reranking-config.js +147 -0
- package/dist/esm/core/reranking-factory.d.ts +92 -0
- package/dist/esm/core/reranking-factory.js +410 -0
- package/dist/esm/core/reranking-strategies.d.ts +310 -0
- package/dist/esm/core/reranking-strategies.js +650 -0
- package/dist/esm/core/resource-cleanup.d.ts +163 -0
- package/dist/esm/core/resource-cleanup.js +371 -0
- package/dist/esm/core/resource-manager.d.ts +212 -0
- package/dist/esm/core/resource-manager.js +564 -0
- package/dist/esm/core/search-pipeline.d.ts +111 -0
- package/dist/esm/core/search-pipeline.js +287 -0
- package/dist/esm/core/search.d.ts +141 -0
- package/dist/esm/core/search.js +320 -0
- package/dist/esm/core/streaming-operations.d.ts +145 -0
- package/dist/esm/core/streaming-operations.js +409 -0
- package/dist/esm/core/types.d.ts +66 -0
- package/dist/esm/core/types.js +6 -0
- package/dist/esm/core/universal-embedder.d.ts +177 -0
- package/dist/esm/core/universal-embedder.js +139 -0
- package/dist/esm/core/validation-messages.d.ts +99 -0
- package/dist/esm/core/validation-messages.js +334 -0
- package/dist/esm/core/vector-index.d.ts +72 -0
- package/dist/esm/core/vector-index.js +333 -0
- package/dist/esm/dom-polyfills.d.ts +6 -0
- package/dist/esm/dom-polyfills.js +37 -0
- package/dist/esm/factories/index.d.ts +27 -0
- package/dist/esm/factories/index.js +29 -0
- package/dist/esm/factories/ingestion-factory.d.ts +200 -0
- package/dist/esm/factories/ingestion-factory.js +477 -0
- package/dist/esm/factories/search-factory.d.ts +154 -0
- package/dist/esm/factories/search-factory.js +344 -0
- package/dist/esm/file-processor.d.ts +147 -0
- package/dist/esm/file-processor.js +963 -0
- package/dist/esm/index-manager.d.ts +116 -0
- package/dist/esm/index-manager.js +598 -0
- package/dist/esm/index.d.ts +75 -0
- package/dist/esm/index.js +110 -0
- package/dist/esm/indexer.d.ts +7 -0
- package/dist/esm/indexer.js +54 -0
- package/dist/esm/ingestion.d.ts +63 -0
- package/dist/esm/ingestion.js +124 -0
- package/dist/esm/mcp-server.d.ts +46 -0
- package/dist/esm/mcp-server.js +1820 -0
- package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
- package/dist/esm/multimodal/clip-embedder.js +996 -0
- package/dist/esm/multimodal/index.d.ts +6 -0
- package/dist/esm/multimodal/index.js +6 -0
- package/dist/esm/preprocess.d.ts +19 -0
- package/dist/esm/preprocess.js +203 -0
- package/dist/esm/preprocessors/index.d.ts +17 -0
- package/dist/esm/preprocessors/index.js +38 -0
- package/dist/esm/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/preprocessors/mdx.js +101 -0
- package/dist/esm/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/preprocessors/mermaid.js +329 -0
- package/dist/esm/preprocessors/registry.d.ts +56 -0
- package/dist/esm/preprocessors/registry.js +179 -0
- package/dist/esm/run-error-recovery-tests.d.ts +7 -0
- package/dist/esm/run-error-recovery-tests.js +101 -0
- package/dist/esm/search-standalone.d.ts +7 -0
- package/dist/esm/search-standalone.js +117 -0
- package/dist/esm/search.d.ts +99 -0
- package/dist/esm/search.js +177 -0
- package/dist/esm/test-utils.d.ts +18 -0
- package/dist/esm/test-utils.js +27 -0
- package/dist/esm/text/chunker.d.ts +33 -0
- package/dist/esm/text/chunker.js +279 -0
- package/dist/esm/text/embedder.d.ts +111 -0
- package/dist/esm/text/embedder.js +386 -0
- package/dist/esm/text/index.d.ts +8 -0
- package/dist/esm/text/index.js +9 -0
- package/dist/esm/text/preprocessors/index.d.ts +17 -0
- package/dist/esm/text/preprocessors/index.js +38 -0
- package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
- package/dist/esm/text/preprocessors/mdx.js +101 -0
- package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/esm/text/preprocessors/mermaid.js +330 -0
- package/dist/esm/text/preprocessors/registry.d.ts +56 -0
- package/dist/esm/text/preprocessors/registry.js +180 -0
- package/dist/esm/text/reranker.d.ts +49 -0
- package/dist/esm/text/reranker.js +274 -0
- package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/esm/text/sentence-transformer-embedder.js +340 -0
- package/dist/esm/text/tokenizer.d.ts +22 -0
- package/dist/esm/text/tokenizer.js +64 -0
- package/dist/esm/types.d.ts +83 -0
- package/dist/esm/types.js +3 -0
- package/dist/esm/utils/vector-math.d.ts +31 -0
- package/dist/esm/utils/vector-math.js +70 -0
- package/package.json +30 -12
- /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
- /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
- /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/indexer.js +0 -0
- /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
- /package/dist/{cli → cjs/cli}/search.js +0 -0
- /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
- /package/dist/{cli.js → cjs/cli.js} +0 -0
- /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
- /package/dist/{config.js → cjs/config.js} +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
- /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
- /package/dist/{core → cjs/core}/adapters.js +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/binary-index-format.d.ts +0 -0
- /package/dist/{core → cjs/core}/binary-index-format.js +0 -0
- /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
- /package/dist/{core → cjs/core}/chunker.js +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
- /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
- /package/dist/{core → cjs/core}/config.d.ts +0 -0
- /package/dist/{core → cjs/core}/config.js +0 -0
- /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-errors.js +0 -0
- /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-manager.js +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
- /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
- /package/dist/{core → cjs/core}/content-resolver.js +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
- /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
- /package/dist/{core → cjs/core}/db.d.ts +0 -0
- /package/dist/{core → cjs/core}/db.js +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
- /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
- /package/dist/{core → cjs/core}/error-handler.js +0 -0
- /package/dist/{core → cjs/core}/index.d.ts +0 -0
- /package/dist/{core → cjs/core}/index.js +0 -0
- /package/dist/{core → cjs/core}/ingestion.d.ts +0 -0
- /package/dist/{core → cjs/core}/ingestion.js +0 -0
- /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
- /package/dist/{core → cjs/core}/interfaces.js +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
- /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
- /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
- /package/dist/{core → cjs/core}/model-registry.js +0 -0
- /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/path-manager.js +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
- /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
- /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-config.js +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
- /package/dist/{core → cjs/core}/reranking-strategies.js +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
- /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
- /package/dist/{core → cjs/core}/resource-manager.js +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
- /package/dist/{core → cjs/core}/search-pipeline.js +0 -0
- /package/dist/{core → cjs/core}/search.d.ts +0 -0
- /package/dist/{core → cjs/core}/search.js +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
- /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
- /package/dist/{core → cjs/core}/types.d.ts +0 -0
- /package/dist/{core → cjs/core}/types.js +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
- /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
- /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
- /package/dist/{core → cjs/core}/validation-messages.js +0 -0
- /package/dist/{core → cjs/core}/vector-index.d.ts +0 -0
- /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
- /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
- /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/index.js +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/ingestion-factory.js +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
- /package/dist/{factories → cjs/factories}/search-factory.js +0 -0
- /package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +0 -0
- /package/dist/{file-processor.js → cjs/file-processor.js} +0 -0
- /package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +0 -0
- /package/dist/{index-manager.js → cjs/index-manager.js} +0 -0
- /package/dist/{index.d.ts → cjs/index.d.ts} +0 -0
- /package/dist/{index.js → cjs/index.js} +0 -0
- /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
- /package/dist/{indexer.js → cjs/indexer.js} +0 -0
- /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
- /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
- /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
- /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
- /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
- /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
- /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
- /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
- /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
- /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
- /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
- /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
- /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
- /package/dist/{search.js → cjs/search.js} +0 -0
- /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
- /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
- /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
- /package/dist/{text → cjs/text}/chunker.js +0 -0
- /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/embedder.js +0 -0
- /package/dist/{text → cjs/text}/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
- /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
- /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
- /package/dist/{text → cjs/text}/reranker.js +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
- /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
- /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
- /package/dist/{text → cjs/text}/tokenizer.js +0 -0
- /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
- /package/dist/{types.js → cjs/types.js} +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
- /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text-specific chunking implementation
|
|
3
|
+
* Implements the ChunkingStrategy interface for text content
|
|
4
|
+
*/
|
|
5
|
+
import '../dom-polyfills.js';
|
|
6
|
+
import { DEFAULT_CHUNK_CONFIG } from '../core/chunker.js';
|
|
7
|
+
import { countTokens } from './tokenizer.js';
|
|
8
|
+
/**
 * First chunking tier: break text into paragraphs.
 *
 * A paragraph boundary is a newline, optional whitespace, and another newline
 * (i.e. one or more blank lines). Each piece is trimmed and pieces that end
 * up empty are discarded.
 *
 * @param text - Raw text to split
 * @returns Trimmed, non-empty paragraphs in their original order
 */
function splitIntoParagraphs(text) {
    const paragraphs = [];
    for (const piece of text.split(/\n\s*\n/)) {
        const trimmed = piece.trim();
        if (trimmed.length > 0) {
            paragraphs.push(trimmed);
        }
    }
    return paragraphs;
}
|
|
19
|
+
/**
 * Second chunking tier: break text into sentences.
 *
 * The text is cut at every run of whitespace that directly follows '.', '!'
 * or '?'. The punctuation stays attached to the sentence it ends. No special
 * treatment is given to abbreviations, so "Dr. Who" yields two pieces.
 *
 * @param text - Text to split
 * @returns Trimmed, non-empty sentences in their original order
 */
function splitIntoSentences(text) {
    const sentenceBoundary = /(?<=[.!?])\s+/;
    const result = [];
    for (const raw of text.split(sentenceBoundary)) {
        const sentence = raw.trim();
        if (sentence.length > 0) {
            result.push(sentence);
        }
    }
    return result;
}
|
|
32
|
+
/**
|
|
33
|
+
* Split text into fixed-size chunks based on character count
|
|
34
|
+
* This is the fallback tier when semantic splitting fails
|
|
35
|
+
*/
|
|
36
|
+
async function splitIntoFixedSizeChunks(text, maxTokens, overlapTokens) {
|
|
37
|
+
const chunks = [];
|
|
38
|
+
const words = text.split(/\s+/);
|
|
39
|
+
let currentChunk = '';
|
|
40
|
+
let currentTokens = 0;
|
|
41
|
+
let i = 0;
|
|
42
|
+
while (i < words.length) {
|
|
43
|
+
const word = words[i];
|
|
44
|
+
const testChunk = currentChunk ? `${currentChunk} ${word}` : word;
|
|
45
|
+
const testTokens = await countTokens(testChunk);
|
|
46
|
+
if (testTokens <= maxTokens) {
|
|
47
|
+
currentChunk = testChunk;
|
|
48
|
+
currentTokens = testTokens;
|
|
49
|
+
i++;
|
|
50
|
+
}
|
|
51
|
+
else {
|
|
52
|
+
// Current chunk is full, save it
|
|
53
|
+
if (currentChunk) {
|
|
54
|
+
chunks.push(currentChunk);
|
|
55
|
+
// Create overlap for next chunk
|
|
56
|
+
if (overlapTokens > 0 && chunks.length > 0) {
|
|
57
|
+
const overlapText = await createOverlapFromWords(currentChunk, overlapTokens);
|
|
58
|
+
currentChunk = overlapText;
|
|
59
|
+
currentTokens = await countTokens(currentChunk);
|
|
60
|
+
}
|
|
61
|
+
else {
|
|
62
|
+
currentChunk = '';
|
|
63
|
+
currentTokens = 0;
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
else {
|
|
67
|
+
// Single word exceeds limit, add it anyway
|
|
68
|
+
chunks.push(word);
|
|
69
|
+
i++;
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
// Add final chunk if it has content
|
|
74
|
+
if (currentChunk.trim()) {
|
|
75
|
+
chunks.push(currentChunk.trim());
|
|
76
|
+
}
|
|
77
|
+
return chunks;
|
|
78
|
+
}
|
|
79
|
+
/**
|
|
80
|
+
* Create overlap text from words at the end of a chunk
|
|
81
|
+
*/
|
|
82
|
+
async function createOverlapFromWords(text, overlapTokens) {
|
|
83
|
+
const words = text.split(/\s+/);
|
|
84
|
+
let overlapText = '';
|
|
85
|
+
let tokens = 0;
|
|
86
|
+
// Work backwards from the end
|
|
87
|
+
for (let i = words.length - 1; i >= 0; i--) {
|
|
88
|
+
const word = words[i];
|
|
89
|
+
const testText = word + (overlapText ? ' ' + overlapText : '');
|
|
90
|
+
const testTokens = await countTokens(testText);
|
|
91
|
+
if (testTokens <= overlapTokens) {
|
|
92
|
+
overlapText = testText;
|
|
93
|
+
tokens = testTokens;
|
|
94
|
+
}
|
|
95
|
+
else {
|
|
96
|
+
break;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
return overlapText;
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Create chunks from a list of text segments, respecting token limits
|
|
103
|
+
*/
|
|
104
|
+
async function createChunksFromSegments(segments, config) {
|
|
105
|
+
const chunks = [];
|
|
106
|
+
let currentChunk = '';
|
|
107
|
+
let currentTokens = 0;
|
|
108
|
+
for (const segment of segments) {
|
|
109
|
+
const segmentTokens = await countTokens(segment);
|
|
110
|
+
// If this single segment exceeds our limit, we need to split it further
|
|
111
|
+
if (segmentTokens > config.chunkSize) {
|
|
112
|
+
// Save current chunk if it has content
|
|
113
|
+
if (currentChunk.trim()) {
|
|
114
|
+
chunks.push(currentChunk.trim());
|
|
115
|
+
currentChunk = '';
|
|
116
|
+
currentTokens = 0;
|
|
117
|
+
}
|
|
118
|
+
// Split the large segment using fixed-size chunking based on tokens
|
|
119
|
+
const subChunks = await splitIntoFixedSizeChunks(segment, config.chunkSize, config.chunkOverlap);
|
|
120
|
+
chunks.push(...subChunks);
|
|
121
|
+
continue;
|
|
122
|
+
}
|
|
123
|
+
// Check if adding this segment would exceed our token limit
|
|
124
|
+
const potentialChunk = currentChunk ? `${currentChunk}\n\n${segment}` : segment;
|
|
125
|
+
const potentialTokens = await countTokens(potentialChunk);
|
|
126
|
+
if (potentialTokens <= config.chunkSize) {
|
|
127
|
+
// Add to current chunk
|
|
128
|
+
currentChunk = potentialChunk;
|
|
129
|
+
currentTokens = potentialTokens;
|
|
130
|
+
}
|
|
131
|
+
else {
|
|
132
|
+
// Save current chunk and start a new one
|
|
133
|
+
if (currentChunk.trim()) {
|
|
134
|
+
chunks.push(currentChunk.trim());
|
|
135
|
+
}
|
|
136
|
+
// Start new chunk with overlap if possible
|
|
137
|
+
if (config.chunkOverlap > 0 && currentChunk) {
|
|
138
|
+
const overlapText = await createOverlapText(currentChunk, config.chunkOverlap);
|
|
139
|
+
currentChunk = overlapText ? `${overlapText}\n\n${segment}` : segment;
|
|
140
|
+
}
|
|
141
|
+
else {
|
|
142
|
+
currentChunk = segment;
|
|
143
|
+
}
|
|
144
|
+
currentTokens = await countTokens(currentChunk);
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
// Add final chunk if it has content
|
|
148
|
+
if (currentChunk.trim()) {
|
|
149
|
+
chunks.push(currentChunk.trim());
|
|
150
|
+
}
|
|
151
|
+
return chunks;
|
|
152
|
+
}
|
|
153
|
+
/**
|
|
154
|
+
* Create overlap text from the end of a chunk
|
|
155
|
+
*/
|
|
156
|
+
async function createOverlapText(text, overlapTokens) {
|
|
157
|
+
// Split into sentences and work backwards to get approximately the right amount of overlap
|
|
158
|
+
const sentences = splitIntoSentences(text);
|
|
159
|
+
let overlapText = '';
|
|
160
|
+
let tokens = 0;
|
|
161
|
+
for (let i = sentences.length - 1; i >= 0; i--) {
|
|
162
|
+
const sentence = sentences[i];
|
|
163
|
+
const sentenceTokens = await countTokens(sentence);
|
|
164
|
+
if (tokens + sentenceTokens <= overlapTokens) {
|
|
165
|
+
overlapText = sentence + (overlapText ? ' ' + overlapText : '');
|
|
166
|
+
tokens += sentenceTokens;
|
|
167
|
+
}
|
|
168
|
+
else {
|
|
169
|
+
break;
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
return overlapText;
|
|
173
|
+
}
|
|
174
|
+
/**
|
|
175
|
+
* Text chunking strategy implementation
|
|
176
|
+
*/
|
|
177
|
+
export class TextChunkingStrategy {
|
|
178
|
+
appliesTo(contentType) {
|
|
179
|
+
return contentType === 'text';
|
|
180
|
+
}
|
|
181
|
+
async chunk(document, config) {
|
|
182
|
+
console.log(`📝 Chunking document "${document.title}" with config: chunkSize=${config.chunkSize}, chunkOverlap=${config.chunkOverlap}`);
|
|
183
|
+
if (!document.content || document.content.trim().length === 0) {
|
|
184
|
+
return [];
|
|
185
|
+
}
|
|
186
|
+
// Tier 1: Split into paragraphs
|
|
187
|
+
const paragraphs = splitIntoParagraphs(document.content);
|
|
188
|
+
// Tier 2: For large paragraphs, split into sentences
|
|
189
|
+
const segments = [];
|
|
190
|
+
for (const paragraph of paragraphs) {
|
|
191
|
+
const paragraphTokens = await countTokens(paragraph);
|
|
192
|
+
if (paragraphTokens <= config.chunkSize) {
|
|
193
|
+
// Paragraph is small enough, use as-is
|
|
194
|
+
segments.push(paragraph);
|
|
195
|
+
}
|
|
196
|
+
else {
|
|
197
|
+
// Paragraph is too large, split into sentences
|
|
198
|
+
const sentences = splitIntoSentences(paragraph);
|
|
199
|
+
// Group sentences that fit within token limits
|
|
200
|
+
let currentGroup = '';
|
|
201
|
+
let currentTokens = 0;
|
|
202
|
+
for (const sentence of sentences) {
|
|
203
|
+
const sentenceTokens = await countTokens(sentence);
|
|
204
|
+
// If single sentence exceeds limit, it will be handled in createChunksFromSegments
|
|
205
|
+
if (sentenceTokens > config.chunkSize) {
|
|
206
|
+
// Save current group if it has content
|
|
207
|
+
if (currentGroup.trim()) {
|
|
208
|
+
segments.push(currentGroup.trim());
|
|
209
|
+
currentGroup = '';
|
|
210
|
+
currentTokens = 0;
|
|
211
|
+
}
|
|
212
|
+
// Add the large sentence as its own segment (will be split later)
|
|
213
|
+
segments.push(sentence);
|
|
214
|
+
continue;
|
|
215
|
+
}
|
|
216
|
+
const potentialGroup = currentGroup ? `${currentGroup} ${sentence}` : sentence;
|
|
217
|
+
const potentialTokens = await countTokens(potentialGroup);
|
|
218
|
+
if (potentialTokens <= config.chunkSize) {
|
|
219
|
+
currentGroup = potentialGroup;
|
|
220
|
+
currentTokens = potentialTokens;
|
|
221
|
+
}
|
|
222
|
+
else {
|
|
223
|
+
// Save current group and start new one
|
|
224
|
+
if (currentGroup.trim()) {
|
|
225
|
+
segments.push(currentGroup.trim());
|
|
226
|
+
}
|
|
227
|
+
currentGroup = sentence;
|
|
228
|
+
currentTokens = sentenceTokens;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
// Add final group if it has content
|
|
232
|
+
if (currentGroup.trim()) {
|
|
233
|
+
segments.push(currentGroup.trim());
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
// Tier 3: Create final chunks with overlap handling
|
|
238
|
+
const chunkTexts = await createChunksFromSegments(segments, config);
|
|
239
|
+
// Convert to GenericChunk objects
|
|
240
|
+
const chunks = [];
|
|
241
|
+
for (let i = 0; i < chunkTexts.length; i++) {
|
|
242
|
+
const content = chunkTexts[i];
|
|
243
|
+
chunks.push({
|
|
244
|
+
content,
|
|
245
|
+
contentType: document.contentType,
|
|
246
|
+
chunkIndex: i,
|
|
247
|
+
metadata: {
|
|
248
|
+
tokenCount: await countTokens(content),
|
|
249
|
+
...document.metadata
|
|
250
|
+
}
|
|
251
|
+
});
|
|
252
|
+
}
|
|
253
|
+
return chunks;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
/**
|
|
257
|
+
* Text document chunking function
|
|
258
|
+
* Converts between text-specific and generic interfaces
|
|
259
|
+
*/
|
|
260
|
+
export async function chunkDocument(document, config = DEFAULT_CHUNK_CONFIG) {
|
|
261
|
+
const strategy = new TextChunkingStrategy();
|
|
262
|
+
// Convert Document to GenericDocument
|
|
263
|
+
const genericDocument = {
|
|
264
|
+
source: document.source,
|
|
265
|
+
title: document.title,
|
|
266
|
+
content: document.content,
|
|
267
|
+
contentType: 'text'
|
|
268
|
+
};
|
|
269
|
+
// Use the strategy to chunk
|
|
270
|
+
const genericChunks = await strategy.chunk(genericDocument, config);
|
|
271
|
+
// Convert GenericChunk back to Chunk format
|
|
272
|
+
const chunks = genericChunks.map(chunk => ({
|
|
273
|
+
text: chunk.content,
|
|
274
|
+
chunkIndex: chunk.chunkIndex,
|
|
275
|
+
tokenCount: chunk.metadata?.tokenCount || 0
|
|
276
|
+
}));
|
|
277
|
+
return chunks;
|
|
278
|
+
}
|
|
279
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import '../dom-polyfills.js';
|
|
2
|
+
import type { EmbeddingResult, EmbedFunction } from '../core/types.js';
|
|
3
|
+
/**
 * Engine that produces embeddings via transformers.js.
 */
export declare class EmbeddingEngine {
    private model;
    private modelVersion;
    private readonly modelName;
    private readonly batchSize;
    constructor(modelName?: string, batchSize?: number);
    /**
     * Loads the embedding model.
     * @throws {Error} When the model cannot be loaded
     */
    loadModel(): Promise<void>;
    /**
     * Embeds a batch of texts.
     * @param texts - Text strings to embed
     * @returns Promise resolving to the embedding results
     */
    embedBatch(texts: string[]): Promise<EmbeddingResult[]>;
    /**
     * Processes one batch, handling errors for individual chunks.
     * @param batch - Text strings belonging to this batch
     * @param startIndex - Position of this batch's first item in the original array
     * @returns Promise resolving to the embedding results
     */
    private processBatchWithErrorHandling;
    /**
     * Per-chunk fallback used when processing a whole batch fails.
     */
    private fallbackToIndividualProcessing;
    /**
     * Processes a single chunk with error handling.
     * @param text - Text to embed
     * @param index - Index of the chunk
     * @returns Promise resolving to the embedding result, or null on failure
     */
    private processSingleChunk;
    /**
     * Embeds a single text.
     * @param text - Text string to embed
     * @returns Promise resolving to the embedding result
     */
    embedSingle(text: string): Promise<EmbeddingResult>;
    /**
     * Embeds document chunks with progress logging.
     * Intended for large document ingestion using batch processing.
     * @param chunks - Text chunks taken from documents
     * @returns Promise resolving to the embedding results
     */
    embedDocumentBatch(chunks: string[]): Promise<EmbeddingResult[]>;
    /**
     * Returns the identifier of the current model version.
     * @returns Model version string
     */
    getModelVersion(): string;
    /**
     * Reports whether the model has been loaded.
     * @returns True when the model is loaded
     */
    isLoaded(): boolean;
    /**
     * Returns the model name.
     * @returns Model name string
     */
    getModelName(): string;
    /**
     * Returns the batch size.
     * @returns Batch size number
     */
    getBatchSize(): number;
    /**
     * Produces a deterministic model version identifier
     * from the model name and configuration.
     * @returns Model version string
     */
    private generateModelVersion;
    /**
     * Produces a deterministic embedding ID for a text chunk.
     * @param text - The text content
     * @param index - Index within the batch
     * @returns Deterministic embedding ID
     */
    private generateEmbeddingId;
}
|
|
88
|
+
/**
 * Returns the singleton embedding engine instance.
 * @param modelName - Model name override (optional)
 * @param batchSize - Batch size override (optional)
 * @returns The EmbeddingEngine instance
 */
export declare function getEmbeddingEngine(modelName?: string, batchSize?: number): EmbeddingEngine;
|
|
95
|
+
/**
 * Initializes the embedding engine and loads its model.
 * @param modelName - Model name override (optional)
 * @param batchSize - Batch size override (optional)
 * @returns Promise resolving to the loaded embedding engine
 */
export declare function initializeEmbeddingEngine(modelName?: string, batchSize?: number): Promise<EmbeddingEngine>;
|
|
102
|
+
/**
 * Creates an EmbedFunction backed by the text embedding engine.
 * The returned function implements the core EmbedFunction interface
 * so it can be used for dependency injection.
 * @param modelName - Model name override (optional)
 * @param batchSize - Batch size override (optional)
 * @returns An EmbedFunction that can be injected into core components
 */
export declare function createTextEmbedFunction(modelName?: string, batchSize?: number): EmbedFunction;
|
|
111
|
+
//# sourceMappingURL=embedder.d.ts.map
|