voctar 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +102 -0
  3. package/dist/index.d.ts +6 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +29 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/src/chunking/index.d.ts +48 -0
  8. package/dist/src/chunking/index.d.ts.map +1 -0
  9. package/dist/src/chunking/index.js +123 -0
  10. package/dist/src/chunking/index.js.map +1 -0
  11. package/dist/src/chunking/strategies/fixed.d.ts +14 -0
  12. package/dist/src/chunking/strategies/fixed.d.ts.map +1 -0
  13. package/dist/src/chunking/strategies/fixed.js +111 -0
  14. package/dist/src/chunking/strategies/fixed.js.map +1 -0
  15. package/dist/src/chunking/strategies/paragraph.d.ts +6 -0
  16. package/dist/src/chunking/strategies/paragraph.d.ts.map +1 -0
  17. package/dist/src/chunking/strategies/paragraph.js +84 -0
  18. package/dist/src/chunking/strategies/paragraph.js.map +1 -0
  19. package/dist/src/chunking/strategies/recursive.d.ts +17 -0
  20. package/dist/src/chunking/strategies/recursive.d.ts.map +1 -0
  21. package/dist/src/chunking/strategies/recursive.js +192 -0
  22. package/dist/src/chunking/strategies/recursive.js.map +1 -0
  23. package/dist/src/chunking/strategies/semantic.d.ts +96 -0
  24. package/dist/src/chunking/strategies/semantic.d.ts.map +1 -0
  25. package/dist/src/chunking/strategies/semantic.js +587 -0
  26. package/dist/src/chunking/strategies/semantic.js.map +1 -0
  27. package/dist/src/chunking/strategies/sentence.d.ts +7 -0
  28. package/dist/src/chunking/strategies/sentence.d.ts.map +1 -0
  29. package/dist/src/chunking/strategies/sentence.js +116 -0
  30. package/dist/src/chunking/strategies/sentence.js.map +1 -0
  31. package/dist/src/chunking/types.d.ts +45 -0
  32. package/dist/src/chunking/types.d.ts.map +1 -0
  33. package/dist/src/chunking/types.js +4 -0
  34. package/dist/src/chunking/types.js.map +1 -0
  35. package/dist/src/chunking/utils/tokenizer.d.ts +10 -0
  36. package/dist/src/chunking/utils/tokenizer.d.ts.map +1 -0
  37. package/dist/src/chunking/utils/tokenizer.js +50 -0
  38. package/dist/src/chunking/utils/tokenizer.js.map +1 -0
  39. package/dist/src/providers/embeddings/index.d.ts +3 -0
  40. package/dist/src/providers/embeddings/index.d.ts.map +1 -0
  41. package/dist/src/providers/embeddings/index.js +7 -0
  42. package/dist/src/providers/embeddings/index.js.map +1 -0
  43. package/dist/src/providers/embeddings/openai.d.ts +21 -0
  44. package/dist/src/providers/embeddings/openai.d.ts.map +1 -0
  45. package/dist/src/providers/embeddings/openai.js +86 -0
  46. package/dist/src/providers/embeddings/openai.js.map +1 -0
  47. package/dist/src/providers/index.d.ts +3 -0
  48. package/dist/src/providers/index.d.ts.map +1 -0
  49. package/dist/src/providers/index.js +20 -0
  50. package/dist/src/providers/index.js.map +1 -0
  51. package/dist/src/providers/stores/index.d.ts +6 -0
  52. package/dist/src/providers/stores/index.d.ts.map +1 -0
  53. package/dist/src/providers/stores/index.js +11 -0
  54. package/dist/src/providers/stores/index.js.map +1 -0
  55. package/dist/src/providers/stores/memory.d.ts +18 -0
  56. package/dist/src/providers/stores/memory.d.ts.map +1 -0
  57. package/dist/src/providers/stores/memory.js +169 -0
  58. package/dist/src/providers/stores/memory.js.map +1 -0
  59. package/dist/src/providers/stores/qdrant.d.ts +28 -0
  60. package/dist/src/providers/stores/qdrant.d.ts.map +1 -0
  61. package/dist/src/providers/stores/qdrant.js +223 -0
  62. package/dist/src/providers/stores/qdrant.js.map +1 -0
  63. package/dist/src/providers/stores/sqlite.d.ts +38 -0
  64. package/dist/src/providers/stores/sqlite.d.ts.map +1 -0
  65. package/dist/src/providers/stores/sqlite.js +306 -0
  66. package/dist/src/providers/stores/sqlite.js.map +1 -0
  67. package/dist/src/types.d.ts +111 -0
  68. package/dist/src/types.d.ts.map +1 -0
  69. package/dist/src/types.js +32 -0
  70. package/dist/src/types.js.map +1 -0
  71. package/dist/src/vector.d.ts +74 -0
  72. package/dist/src/vector.d.ts.map +1 -0
  73. package/dist/src/vector.js +505 -0
  74. package/dist/src/vector.js.map +1 -0
  75. package/docs/API.md +361 -0
  76. package/docs/CHUNKING.md +280 -0
  77. package/docs/CUSTOM_PROVIDERS.md +101 -0
  78. package/docs/README.md +11 -0
  79. package/docs/STORAGE_BACKENDS.md +189 -0
  80. package/docs/assets/vectar.png +0 -0
  81. package/package.json +46 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Voctar contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,102 @@
1
+ <p align="center">
2
+ <img src="./docs/assets/vectar.png" alt="Voctar logo" width="180" />
3
+ </p>
4
+
5
+ <h1 align="center">Voctar</h1>
6
+
7
+ <p align="center">
8
+ Simple TypeScript library with RAG primitives for embeddings, chunking, storage, and semantic retrieval.
9
+ </p>
10
+
11
+ <p align="center">
12
+ <img alt="npm version" src="https://img.shields.io/npm/v/voctar?logo=npm&color=CB3837" />
13
+ <img alt="TypeScript" src="https://img.shields.io/badge/TypeScript-5.x-3178C6?logo=typescript&logoColor=white" />
14
+ <img alt="Node" src="https://img.shields.io/badge/Node-%3E%3D18-339933?logo=node.js&logoColor=white" />
15
+ <img alt="License: MIT" src="https://img.shields.io/badge/License-MIT-yellow.svg" />
16
+ </p>
17
+
18
+ ## Features
19
+
20
+ - Supports multiple vector stores: SQLite, Qdrant, in-memory, or custom store providers
21
+ - Automatic chunking for long documents with multiple strategies (`fixed`, `recursive`, `sentence`, `paragraph`, `semantic`)
22
+ - Semantic search with score thresholds and metadata filtering
23
+ - Simple primitives: `embed`, `search`, and more
24
+ - TypeScript-first
25
+
26
+ ## Quick Start
27
+
28
+ ```bash
29
+ yarn add voctar
30
+ ```
31
+
32
+ ```typescript
33
+ import { Voctar } from 'voctar';
34
+
35
+ const vector = new Voctar({
36
+ embedding: {
37
+ type: 'openai',
38
+ apiKey: '<your-api-key>',
39
+ },
40
+ store: {
41
+ type: 'sqlite',
42
+ path: 'data/vector.db',
43
+ },
44
+ });
45
+
46
+ const { documentId } = await vector.embed('documents', "Very long text...", {
47
+ metadata: { author: 'Alice' },
48
+ });
49
+
50
+ const results = await vector.search('documents', 'Some query');
51
+ ```
52
+
53
+ ## Primitives API
54
+
55
+ ### `embed(collection, text, options?)`
56
+
57
+ Embeds a document into a collection.
58
+ If the text exceeds the embedding model's limits, Voctar automatically splits it into chunks and stores a vector for each chunk.
59
+
60
+ ```typescript
61
+ const { documentId, chunkIds } = await vector.embed('documents', longText, {
62
+ documentId: 'doc-1', // optional; auto-generated if omitted
63
+ metadata: { source: 'guide' }, // optional user metadata
64
+ chunkSize: 1000, // optional
65
+ chunkStrategy: 'recursive', // fixed | recursive | sentence | paragraph | semantic
66
+ chunkOverlap: 200, // optional
67
+ autoChunk: true, // optional override
68
+ });
69
+ ```
70
+
71
+ Returns:
72
+
73
+ - `documentId`: stable parent id for the document
74
+ - `chunkIds`: stored ids (single id for unchunked docs, multiple for chunked docs)
75
+
76
+ ### `search(collection, query, options?)`
77
+
78
+ Retrieves semantically similar text from a collection.
79
+
80
+ ```typescript
81
+ const results = await vector.search('documents', 'how does chunking work', {
82
+ limit: 5, // optional; when omitted, the store provider's default applies
83
+ scoreThreshold: 0, // optional
84
+ filter: { source: 'guide' }, // optional metadata filter
85
+ includeSystem: false, // optional; include internal metadata when true
86
+ });
87
+ ```
88
+
89
+ Each result includes:
90
+
91
+ - `id`
92
+ - `text`
93
+ - `score`
94
+ - `createdAt`
95
+ - `metadata` (and optional `system` when `includeSystem: true`)
96
+
97
+ ## Documentation
98
+
99
+ - [Docs Index](./docs/README.md)
100
+ - [Storage Backends](./docs/STORAGE_BACKENDS.md)
101
+ - [Chunking](./docs/CHUNKING.md)
102
+
@@ -0,0 +1,6 @@
1
+ export { Vector, Vector as Voctar } from './src/vector';
2
+ export * from './src/providers';
3
+ export * from './src/types';
4
+ export { chunking, ChunkingService } from './src/chunking';
5
+ export * from './src/chunking/types';
6
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,MAAM,IAAI,MAAM,EAAE,MAAM,cAAc,CAAC;AACxD,cAAc,iBAAiB,CAAC;AAChC,cAAc,aAAa,CAAC;AAG5B,OAAO,EAAE,QAAQ,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAC3D,cAAc,sBAAsB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,29 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // Compiler-emitted helper: expose m[k] on o under the name k2; an already-defined this.__createBinding is reused.
3
+ if (k2 === undefined) k2 = k; // No alias given: export under the source name.
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { // Missing descriptor, accessor on a non-__esModule object, or writable/configurable data property: use a getter that reads m[k] on each access.
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) { // Fallback when Object.create is unavailable: copy the current value by plain assignment.
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) { // Compiler-emitted helper: re-export the enumerable properties of m on exports; an already-defined this.__exportStar is reused.
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p); // Skips "default" and any name exports already owns.
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.ChunkingService = exports.chunking = exports.Voctar = exports.Vector = void 0;
18
+ // Voctar exports
19
+ var vector_1 = require("./src/vector");
20
+ Object.defineProperty(exports, "Vector", { enumerable: true, get: function () { return vector_1.Vector; } });
21
+ Object.defineProperty(exports, "Voctar", { enumerable: true, get: function () { return vector_1.Vector; } });
22
+ __exportStar(require("./src/providers"), exports);
23
+ __exportStar(require("./src/types"), exports);
24
+ // Chunking exports
25
+ var chunking_1 = require("./src/chunking");
26
+ Object.defineProperty(exports, "chunking", { enumerable: true, get: function () { return chunking_1.chunking; } });
27
+ Object.defineProperty(exports, "ChunkingService", { enumerable: true, get: function () { return chunking_1.ChunkingService; } });
28
+ __exportStar(require("./src/chunking/types"), exports);
29
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;AAAA,iBAAiB;AACjB,uCAAwD;AAA/C,gGAAA,MAAM,OAAA;AAAE,gGAAA,MAAM,OAAU;AACjC,kDAAgC;AAChC,8CAA4B;AAE5B,mBAAmB;AACnB,2CAA2D;AAAlD,oGAAA,QAAQ,OAAA;AAAE,2GAAA,eAAe,OAAA;AAClC,uDAAqC"}
@@ -0,0 +1,48 @@
1
+ import type { ChunkingOptions, ChunkingStrategy, DocumentChunkResult } from './types';
2
+ export declare class ChunkingService {
3
+ private strategies;
4
+ private defaultStrategy;
5
+ constructor();
6
+ /**
7
+ * Register a chunking strategy under the name returned by its `getName()`; a strategy already registered under that name is replaced
8
+ */
9
+ registerStrategy(strategy: ChunkingStrategy): void;
10
+ /**
11
+ * Chunk a single document with `options.strategy` (or the default strategy); a UUID is generated when `documentId` is omitted, and an `Error` is thrown when the strategy name is not registered
12
+ */
13
+ chunkDocument(text: string, options?: ChunkingOptions, documentId?: string): DocumentChunkResult;
14
+ /**
15
+ * Chunk multiple documents; each document's `metadata` is merged over `options.metadata` and its `id` is passed on as the document id
16
+ */
17
+ chunkDocuments(documents: Array<{
18
+ text: string;
19
+ id?: string;
20
+ metadata?: Record<string, any>;
21
+ }>, options?: ChunkingOptions): DocumentChunkResult[];
22
+ /**
23
+ * Count the tokens in `text` using the `countTokens` helper from the chunking tokenizer module
24
+ */
25
+ estimateTokens(text: string): number;
26
+ /**
27
+ * Get a chunk size that fits an embedding model's token limit: `floor(modelTokenLimit * safetyMargin)`
28
+ * The result is expressed in tokens, not characters; the implementation defaults are 8192 and 0.8
29
+ */
30
+ getOptimalChunkSize(modelTokenLimit?: number, safetyMargin?: number): number;
31
+ /**
32
+ * Set the default chunking strategy; throws an `Error` when no strategy is registered under that name
33
+ */
34
+ setDefaultStrategy(strategy: string): void;
35
+ /**
36
+ * Get the names of all registered strategies
37
+ */
38
+ getAvailableStrategies(): string[];
39
+ }
40
+ export declare const chunking: ChunkingService;
41
+ export * from './types';
42
+ export { FixedSizeChunkingStrategy } from './strategies/fixed';
43
+ export { RecursiveChunkingStrategy } from './strategies/recursive';
44
+ export { SentenceChunkingStrategy } from './strategies/sentence';
45
+ export { ParagraphChunkingStrategy } from './strategies/paragraph';
46
+ export { SemanticChunkingStrategy } from './strategies/semantic';
47
+ export type { SemanticChunkingOptions } from './strategies/semantic';
48
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/chunking/index.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAS,eAAe,EAAE,gBAAgB,EAAE,mBAAmB,EAAE,MAAM,SAAS,CAAC;AAO7F,qBAAa,eAAe;IAC1B,OAAO,CAAC,UAAU,CAAgC;IAClD,OAAO,CAAC,eAAe,CAAuB;;IAa9C;;OAEG;IACH,gBAAgB,CAAC,QAAQ,EAAE,gBAAgB,GAAG,IAAI;IAIlD;;OAEG;IACH,aAAa,CACX,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,eAAoB,EAC7B,UAAU,CAAC,EAAE,MAAM,GAClB,mBAAmB;IAuBtB;;OAEG;IACH,cAAc,CACZ,SAAS,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,EAAE,CAAC,EAAE,MAAM,CAAC;QAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;KAAE,CAAC,EAC/E,OAAO,GAAE,eAAoB,GAC5B,mBAAmB,EAAE;IAWxB;;OAEG;IACH,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAKpC;;;OAGG;IACH,mBAAmB,CAAC,eAAe,GAAE,MAAa,EAAE,YAAY,GAAE,MAAY,GAAG,MAAM;IAMvF;;OAEG;IACH,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,IAAI;IAO1C;;OAEG;IACH,sBAAsB,IAAI,MAAM,EAAE;CAGnC;AAGD,eAAO,MAAM,QAAQ,iBAAwB,CAAC;AAG9C,cAAc,SAAS,CAAC;AACxB,OAAO,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAC/D,OAAO,EAAE,yBAAyB,EAAE,MAAM,wBAAwB,CAAC;AACnE,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACjE,OAAO,EAAE,yBAAyB,EAAE,MAAM,wBAAwB,CAAC;AACnE,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACjE,YAAY,EAAE,uBAAuB,EAAE,MAAM,uBAAuB,CAAC"}
@@ -0,0 +1,123 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // Compiler-emitted helper: expose m[k] on o under the name k2; an already-defined this.__createBinding is reused.
3
+ if (k2 === undefined) k2 = k; // No alias given: export under the source name.
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { // Missing descriptor, accessor on a non-__esModule object, or writable/configurable data property: use a getter that reads m[k] on each access.
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) { // Fallback when Object.create is unavailable: copy the current value by plain assignment.
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) { // Compiler-emitted helper: re-export the enumerable properties of m on exports; an already-defined this.__exportStar is reused.
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p); // Skips "default" and any name exports already owns.
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.SemanticChunkingStrategy = exports.ParagraphChunkingStrategy = exports.SentenceChunkingStrategy = exports.RecursiveChunkingStrategy = exports.FixedSizeChunkingStrategy = exports.chunking = exports.ChunkingService = void 0;
18
+ // Main chunking service
19
+ const uuid_1 = require("uuid");
20
+ const fixed_1 = require("./strategies/fixed");
21
+ const recursive_1 = require("./strategies/recursive");
22
+ const sentence_1 = require("./strategies/sentence");
23
+ const paragraph_1 = require("./strategies/paragraph");
24
+ const semantic_1 = require("./strategies/semantic");
25
/**
 * Registry of chunking strategies with helpers for chunking one or many
 * documents. Strategies are keyed by the name returned from `getName()`.
 */
class ChunkingService {
    constructor() {
        this.defaultStrategy = 'recursive';
        this.strategies = new Map();
        // Built-in strategies, instantiated and registered one at a time in this order.
        const builtIns = [
            fixed_1.FixedSizeChunkingStrategy,
            recursive_1.RecursiveChunkingStrategy,
            sentence_1.SentenceChunkingStrategy,
            paragraph_1.ParagraphChunkingStrategy,
            semantic_1.SemanticChunkingStrategy,
        ];
        for (const BuiltIn of builtIns) {
            this.registerStrategy(new BuiltIn());
        }
    }
    /**
     * Add a strategy to the registry, replacing any strategy that reports the
     * same name.
     */
    registerStrategy(strategy) {
        const name = strategy.getName();
        this.strategies.set(name, strategy);
    }
    /**
     * Chunk one document.
     *
     * @param text - Document text.
     * @param options - Chunking options; `options.strategy` selects the strategy,
     *   otherwise the current default is used.
     * @param documentId - Optional id; a UUID is generated when it is falsy.
     * @returns The document id, its chunks, and summary metadata.
     * @throws Error when the selected strategy name is not registered.
     */
    chunkDocument(text, options = {}, documentId) {
        const resolvedId = documentId ? documentId : (0, uuid_1.v4)();
        const strategyName = options.strategy ? options.strategy : this.defaultStrategy;
        const implementation = this.strategies.get(strategyName);
        if (!implementation) {
            throw new Error(`Unknown chunking strategy: ${strategyName}`);
        }
        const pieces = implementation.chunk(text, resolvedId, options);
        // Caller-supplied metadata is spread last, so it wins over the computed fields.
        const summary = {
            originalLength: text.length,
            totalChunks: pieces.length,
            strategy: strategyName,
            ...options.metadata,
        };
        return { documentId: resolvedId, chunks: pieces, metadata: summary };
    }
    /**
     * Chunk several documents with shared options. Each document's own
     * `metadata` is layered over `options.metadata`, and its `id` (if any) is
     * used as the document id.
     */
    chunkDocuments(documents, options = {}) {
        return documents.map(entry => this.chunkDocument(entry.text, { ...options, metadata: { ...options.metadata, ...entry.metadata } }, entry.id));
    }
    /**
     * Count the tokens in `text` via `countTokens` from the tokenizer module,
     * which is loaded when this method is called.
     */
    estimateTokens(text) {
        const tokenizer = require('./utils/tokenizer');
        return (0, tokenizer.countTokens)(text);
    }
    /**
     * Compute a chunk size, in tokens (not characters), as
     * `floor(modelTokenLimit * safetyMargin)`.
     */
    getOptimalChunkSize(modelTokenLimit = 8192, safetyMargin = 0.8) {
        return Math.floor(modelTokenLimit * safetyMargin);
    }
    /**
     * Change the strategy used when `options.strategy` is not given.
     *
     * @throws Error when no strategy is registered under `strategy`.
     */
    setDefaultStrategy(strategy) {
        if (this.strategies.has(strategy)) {
            this.defaultStrategy = strategy;
            return;
        }
        throw new Error(`Unknown strategy: ${strategy}`);
    }
    /**
     * List the names of all registered strategies.
     */
    getAvailableStrategies() {
        return [...this.strategies.keys()];
    }
}
108
+ exports.ChunkingService = ChunkingService;
109
+ // Export singleton instance
110
+ exports.chunking = new ChunkingService();
111
+ // Export types and strategies
112
+ __exportStar(require("./types"), exports);
113
+ var fixed_2 = require("./strategies/fixed");
114
+ Object.defineProperty(exports, "FixedSizeChunkingStrategy", { enumerable: true, get: function () { return fixed_2.FixedSizeChunkingStrategy; } });
115
+ var recursive_2 = require("./strategies/recursive");
116
+ Object.defineProperty(exports, "RecursiveChunkingStrategy", { enumerable: true, get: function () { return recursive_2.RecursiveChunkingStrategy; } });
117
+ var sentence_2 = require("./strategies/sentence");
118
+ Object.defineProperty(exports, "SentenceChunkingStrategy", { enumerable: true, get: function () { return sentence_2.SentenceChunkingStrategy; } });
119
+ var paragraph_2 = require("./strategies/paragraph");
120
+ Object.defineProperty(exports, "ParagraphChunkingStrategy", { enumerable: true, get: function () { return paragraph_2.ParagraphChunkingStrategy; } });
121
+ var semantic_2 = require("./strategies/semantic");
122
+ Object.defineProperty(exports, "SemanticChunkingStrategy", { enumerable: true, get: function () { return semantic_2.SemanticChunkingStrategy; } });
123
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/chunking/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;AAAA,wBAAwB;AACxB,+BAAoC;AAEpC,8CAA+D;AAC/D,sDAAmE;AACnE,oDAAiE;AACjE,sDAAmE;AACnE,oDAAiE;AAEjE,MAAa,eAAe;IAI1B;QAFQ,oBAAe,GAAW,WAAW,CAAC;QAG5C,IAAI,CAAC,UAAU,GAAG,IAAI,GAAG,EAAE,CAAC;QAE5B,+BAA+B;QAC/B,IAAI,CAAC,gBAAgB,CAAC,IAAI,iCAAyB,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,gBAAgB,CAAC,IAAI,qCAAyB,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,gBAAgB,CAAC,IAAI,mCAAwB,EAAE,CAAC,CAAC;QACtD,IAAI,CAAC,gBAAgB,CAAC,IAAI,qCAAyB,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,gBAAgB,CAAC,IAAI,mCAAwB,EAAE,CAAC,CAAC;IACxD,CAAC;IAED;;OAEG;IACH,gBAAgB,CAAC,QAA0B;QACzC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IACpD,CAAC;IAED;;OAEG;IACH,aAAa,CACX,IAAY,EACZ,UAA2B,EAAE,EAC7B,UAAmB;QAEnB,MAAM,KAAK,GAAG,UAAU,IAAI,IAAA,SAAM,GAAE,CAAC;QACrC,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,IAAI,CAAC,eAAe,CAAC;QAE1D,MAAM,gBAAgB,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACvD,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,8BAA8B,QAAQ,EAAE,CAAC,CAAC;QAC5D,CAAC;QAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;QAE5D,OAAO;YACL,UAAU,EAAE,KAAK;YACjB,MAAM;YACN,QAAQ,EAAE;gBACR,cAAc,EAAE,IAAI,CAAC,MAAM;gBAC3B,WAAW,EAAE,MAAM,CAAC,MAAM;gBAC1B,QAAQ;gBACR,GAAG,OAAO,CAAC,QAAQ;aACpB;SACF,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,cAAc,CACZ,SAA+E,EAC/E,UAA2B,EAAE;QAE7B,OAAO,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE;YACzB,MAAM,UAAU,GAAoB;gBAClC,GAAG,OAAO;gBACV,QAAQ,EAAE,EAAE,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,GAAG,CAAC,QAAQ,EAAE;aACnD,CAAC;YAEF,OAAO,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,IAAI,EAAE,UAAU,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,IAAY;QACzB,MAAM,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAAC;QACrD,OAAO,WAAW,CAAC,IAAI,CAAC,CAAC;IAC3B,CAAC;IAED;;;OAGG;IACH,mBAAmB,CAAC,kBAA0B,IAAI,EAAE,eAAuB,GAAG;QAC5E,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,GAAG,YAAY,CAAC,CAAC;QAClE,0CAA0C;QAC1C,OAAO,cAAc,CAAC;IACxB,CAAC;IAED;;OAEG;IACH,kBAAkB,CAAC,Q
AAgB;QACjC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,qBAAqB,QAAQ,EAAE,CAAC,CAAC;QACnD,CAAC;QACD,IAAI,CAAC,eAAe,GAAG,QAAQ,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,sBAAsB;QACpB,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;IAC5C,CAAC;CACF;AAvGD,0CAuGC;AAED,4BAA4B;AACf,QAAA,QAAQ,GAAG,IAAI,eAAe,EAAE,CAAC;AAE9C,8BAA8B;AAC9B,0CAAwB;AACxB,4CAA+D;AAAtD,kHAAA,yBAAyB,OAAA;AAClC,oDAAmE;AAA1D,sHAAA,yBAAyB,OAAA;AAClC,kDAAiE;AAAxD,oHAAA,wBAAwB,OAAA;AACjC,oDAAmE;AAA1D,sHAAA,yBAAyB,OAAA;AAClC,kDAAiE;AAAxD,oHAAA,wBAAwB,OAAA"}
@@ -0,0 +1,14 @@
1
+ import type { Chunk, ChunkingOptions, ChunkingStrategy } from '../types';
2
+ export declare class FixedSizeChunkingStrategy implements ChunkingStrategy {
3
+ getName(): string; // Returns 'fixed'.
4
+ chunk(text: string, documentId: string, options: ChunkingOptions): Chunk[]; // Splits text into chunks of at most min(maxChunkSize, tokenLimit) tokens.
5
+ /**
6
+ * Get the longest leading part of the text whose token count stays within the limit (located by binary search over character positions)
7
+ */
8
+ private getTextUpToTokenLimit;
9
+ /**
10
+ * Get the trailing part of a chunk, approximately 'overlapTokens' tokens long, that is repeated at the start of the next chunk
11
+ */
12
+ private getOverlapText;
13
+ }
14
+ //# sourceMappingURL=fixed.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fixed.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/fixed.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IA4D1E;;OAEG;IACH,OAAO,CAAC,qBAAqB;IA0B7B;;OAEG;IACH,OAAO,CAAC,cAAc;CAuBvB"}
@@ -0,0 +1,111 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.FixedSizeChunkingStrategy = void 0;
4
+ // Fixed-size chunking strategy
5
+ const uuid_1 = require("uuid");
6
+ const tokenizer_1 = require("../utils/tokenizer");
7
/**
 * Fixed-size chunking strategy.
 *
 * Cuts a document into consecutive windows of at most `maxChunkSize` tokens,
 * optionally repeating the tail of each window at the start of the next one.
 * Token counts come from `countTokens`; window edges are located by binary
 * search over character positions, so a window can end in the middle of a word.
 */
class FixedSizeChunkingStrategy {
    /** Name under which this strategy is registered. */
    getName() {
        return 'fixed';
    }
    /**
     * Split `text` into token-bounded chunks.
     *
     * @param text - Document text. Unless `options.preserveFormatting` is true,
     *   whitespace runs are collapsed to single spaces and the text is trimmed.
     * @param documentId - Id recorded in every chunk's metadata.
     * @param options - `tokenLimit` (default 8192), `maxChunkSize` (default 1000,
     *   capped at `tokenLimit`), `overlap` in tokens (default 200, capped at 20%
     *   of the chunk size, never negative), `preserveFormatting` (default false)
     *   and `metadata` (spread last into each chunk's metadata).
     * @returns Chunks in document order; `startChar`/`endChar` are offsets into
     *   the normalized text. An empty text yields an empty array.
     */
    chunk(text, documentId, options) {
        // The effective chunk size can never exceed the model's token limit.
        const tokenLimit = options.tokenLimit ?? 8192;
        const maxSize = Math.min(options.maxChunkSize ?? 1000, tokenLimit);
        // Math.max keeps a negative request from being treated as an overlap.
        const overlap = Math.max(0, Math.min(options.overlap ?? 200, Math.floor(maxSize * 0.2)));
        const preserveFormatting = options.preserveFormatting ?? false;
        const normalizedText = preserveFormatting
            ? text
            : text.replace(/\s+/g, ' ').trim();
        const chunks = [];
        let startChar = 0;
        let chunkIndex = 0;
        while (startChar < normalizedText.length) {
            const remainingText = normalizedText.slice(startChar);
            const chunkText = this.getTextUpToTokenLimit(remainingText, maxSize);
            if (!chunkText || chunkText.length === 0) {
                break;
            }
            const endChar = startChar + chunkText.length;
            chunks.push({
                id: (0, uuid_1.v4)(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex,
                    totalChunks: 0, // Filled in once every chunk exists
                    startChar,
                    endChar,
                    tokens: (0, tokenizer_1.countTokens)(chunkText),
                    ...options.metadata,
                },
            });
            chunkIndex++;
            // The whole text is consumed. Stopping here is what prevents the
            // trailing overlap from being emitted again and again as its own chunk.
            if (endChar >= normalizedText.length) {
                break;
            }
            // Rewind by the overlap, but always move forward: if the overlap would
            // cover the entire chunk, continue right after the chunk instead.
            const overlapText = this.getOverlapText(chunkText, overlap);
            const nextStart = endChar - overlapText.length;
            startChar = nextStart > startChar ? nextStart : endChar;
        }
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Return the longest prefix of `text` whose token count is at most
     * `maxTokens`. If no prefix qualifies, the first 80% of the text is returned
     * as a fallback.
     */
    getTextUpToTokenLimit(text, maxTokens) {
        if ((0, tokenizer_1.countTokens)(text) <= maxTokens) {
            return text;
        }
        // Binary search over the prefix length
        let start = 0;
        let end = text.length;
        let bestMatch = '';
        while (start < end) {
            const mid = Math.floor((start + end) / 2);
            const candidate = text.slice(0, mid);
            if ((0, tokenizer_1.countTokens)(candidate) <= maxTokens) {
                bestMatch = candidate;
                start = mid + 1;
            }
            else {
                end = mid;
            }
        }
        return bestMatch || text.slice(0, Math.floor(text.length * 0.8));
    }
    /**
     * Return the longest suffix of `text` whose token count is at most
     * `overlapTokens`, or '' when no overlap is requested. If no suffix
     * qualifies, the last 10% of the text is returned as a fallback.
     */
    getOverlapText(text, overlapTokens) {
        // Also covers negative and NaN values.
        if (!(overlapTokens > 0)) {
            return '';
        }
        // Binary search over the suffix start position
        let start = 0;
        let end = text.length;
        let bestMatch = '';
        while (start < end) {
            const mid = Math.floor((start + end) / 2);
            const candidate = text.slice(mid);
            if ((0, tokenizer_1.countTokens)(candidate) <= overlapTokens) {
                bestMatch = candidate;
                end = mid;
            }
            else {
                start = mid + 1;
            }
        }
        if (bestMatch) {
            return bestMatch;
        }
        // slice(-0) would return the whole string, so a zero-length fallback
        // must be handled explicitly.
        const fallbackLength = Math.floor(text.length * 0.1);
        return fallbackLength > 0 ? text.slice(-fallbackLength) : '';
    }
}
110
+ exports.FixedSizeChunkingStrategy = FixedSizeChunkingStrategy;
111
+ //# sourceMappingURL=fixed.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fixed.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/fixed.ts"],"names":[],"mappings":";;;AAAA,+BAA+B;AAC/B,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC;QAC5E,MAAM,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,IAAI,KAAK,CAAC;QAE/D,8CAA8C;QAC9C,MAAM,cAAc,GAAG,kBAAkB;YACvC,CAAC,CAAC,IAAI;YACN,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAErC,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,OAAO,SAAS,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC;YACzC,6BAA6B;YAC7B,MAAM,aAAa,GAAG,cAAc,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YACtD,MAAM,SAAS,GAAG,IAAI,CAAC,qBAAqB,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;YAErE,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACzC,MAAM;YACR,CAAC;YAED,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC,EAAE,+CAA+C;oBAC/D,SAAS;oBACT,OAAO;oBACP,MAAM,EAAE,IAAA,uBAAW,EAAC,SAAS,CAAC;oBAC9B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;YAEH,+CAA+C;YAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAC5D,SAAS,GAAG,OAAO,GAAG,WAAW,CAAC,MAAM,CAAC;YACzC,UAAU,EAAE,CAAC;YAEb,oDAAoD;YACpD,IAAI,cAAc,CAAC,MAAM,GAAG,SAAS,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC;gBAC3D,MAAM;YACR,CAAC;QACH,CAAC;QAED,oCAAoC;QACpC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACrB,KAAK,CAAC,QAAQ,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,qBAAqB,CAAC,IAAY,EAAE,SAAiB;QAC3D,IAAI,IAAA,uBAAW,EAAC,IAAI,CAAC,IAAI,SAAS,EAAE,CAAC;YACnC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,KAAK,GAAG,CAAC,CAAC;
QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC;gBACtB,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,kBAAkB;IACtF,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAY,EAAE,aAAqB;QACxD,IAAI,aAAa,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEnC,6CAA6C;QAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,aAAa,EAAE,CAAC;gBAC5B,SAAS,GAAG,SAAS,CAAC;gBACtB,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,uBAAuB;IACzF,CAAC;CACF;AAxHD,8DAwHC"}
@@ -0,0 +1,6 @@
1
+ import type { Chunk, ChunkingOptions, ChunkingStrategy } from '../types';
2
+ export declare class ParagraphChunkingStrategy implements ChunkingStrategy { // Chunking strategy that works on paragraphs separated by blank lines.
3
+ getName(): string; // Returns 'paragraph'.
4
+ chunk(text: string, documentId: string, options: ChunkingOptions): Chunk[]; // Starts a new chunk when adding a paragraph would exceed min(maxChunkSize, tokenLimit) tokens; here `overlap` counts paragraphs, not tokens.
5
+ }
6
+ //# sourceMappingURL=paragraph.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"paragraph.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/paragraph.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;CAgF3E"}
@@ -0,0 +1,84 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.ParagraphChunkingStrategy = void 0;
4
+ // Paragraph-based chunking strategy
5
+ const uuid_1 = require("uuid");
6
+ const tokenizer_1 = require("../utils/tokenizer");
7
/**
 * Chunking strategy that keeps paragraphs intact.
 *
 * The input is split on blank lines, each paragraph is trimmed, and
 * consecutive paragraphs are packed into chunks whose token count (as
 * measured by `countTokens`) stays within the configured maximum whenever a
 * single paragraph allows it.
 */
class ParagraphChunkingStrategy {
    /**
     * Name of this strategy.
     *
     * @returns {string} Always `'paragraph'`.
     */
    getName() {
        return 'paragraph';
    }
    /**
     * Split `text` into paragraph-aligned chunks.
     *
     * Options read by this method:
     * - `tokenLimit` (default 8192): hard upper bound applied to `maxChunkSize`.
     * - `maxChunkSize` (default 2000): target maximum tokens per chunk.
     * - `overlap` (default 1): number of trailing paragraphs of a chunk that
     *   are repeated at the start of the next chunk. Zero or a negative value
     *   disables overlap.
     * - `metadata`: extra fields spread into every chunk's metadata; because
     *   it is spread last, it overrides the computed fields on key collision.
     *
     * `startChar` / `endChar` are offsets into the normalised text (trimmed
     * paragraphs joined by a blank line), not into the raw input.
     *
     * @param text Text to split.
     * @param documentId Identifier copied into each chunk's metadata.
     * @param options Chunking options (see above).
     * @returns The chunks in document order; empty when `text` contains no
     *   non-blank paragraph.
     */
    chunk(text, documentId, options) {
        const separator = '\n\n';
        // Get token limit and ensure maxSize doesn't exceed it
        const tokenLimit = options.tokenLimit ?? 8192;
        const maxSize = Math.min(options.maxChunkSize ?? 2000, tokenLimit);
        const overlap = options.overlap ?? 1; // Overlap in number of paragraphs
        // Split text into paragraphs
        const paragraphs = text
            .split(/\n\s*\n/)
            .map(p => p.trim())
            .filter(p => p.length > 0);
        const chunks = [];
        let currentChunk = [];
        let currentTokens = 0;
        let startChar = 0;
        // Emit the paragraphs collected so far as one chunk and return its end offset.
        const flush = () => {
            const chunkText = currentChunk.join(separator).trim();
            const endChar = startChar + chunkText.length;
            chunks.push({
                id: (0, uuid_1.v4)(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex: chunks.length,
                    totalChunks: 0, // Filled in once all chunks are known
                    startChar,
                    endChar,
                    paragraphs: currentChunk.length,
                    ...options.metadata,
                },
            });
            return endChar;
        };
        for (const paragraph of paragraphs) {
            const paragraphTokens = (0, tokenizer_1.countTokens)(paragraph);
            // Close the current chunk when this paragraph would push it past maxSize
            if (currentChunk.length > 0 && currentTokens + paragraphTokens > maxSize) {
                const endChar = flush();
                // slice(-0) is slice(0) and would carry the WHOLE chunk forward,
                // so a non-positive overlap must be handled explicitly.
                const carried = overlap > 0 ? currentChunk.slice(-overlap) : [];
                if (carried.length > 0) {
                    const carriedText = carried.join(separator);
                    currentTokens = (0, tokenizer_1.countTokens)(carriedText) + paragraphTokens;
                    // The carried paragraphs are the tail of the chunk just emitted.
                    startChar = endChar - carriedText.length;
                }
                else {
                    currentTokens = paragraphTokens;
                    // Nothing is repeated: the next chunk starts after the separator.
                    startChar = endChar + separator.length;
                }
                currentChunk = [...carried, paragraph];
            }
            else {
                currentChunk.push(paragraph);
                currentTokens += paragraphTokens;
            }
        }
        // Add remaining content as final chunk
        if (currentChunk.length > 0) {
            flush();
        }
        // Update totalChunks
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
}
83
+ exports.ParagraphChunkingStrategy = ParagraphChunkingStrategy;
84
+ //# sourceMappingURL=paragraph.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"paragraph.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/paragraph.ts"],"names":[],"mappings":";;;AAAA,oCAAoC;AACpC,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,kCAAkC;QAExE,6BAA6B;QAC7B,MAAM,UAAU,GAAG,IAAI;aACpB,KAAK,CAAC,SAAS,CAAC;aAChB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAE7B,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,eAAe,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAE/C,gGAAgG;YAChG,IAAI,aAAa,GAAG,eAAe,GAAG,OAAO,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzE,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;gBACnD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;gBAE7C,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAA,SAAM,GAAE;oBACZ,IAAI,EAAE,SAAS;oBACf,QAAQ,EAAE;wBACR,UAAU;wBACV,UAAU;wBACV,WAAW,EAAE,CAAC,EAAE,wBAAwB;wBACxC,SAAS;wBACT,OAAO;wBACP,UAAU,EAAE,YAAY,CAAC,MAAM;wBAC/B,GAAG,OAAO,CAAC,QAAQ;qBACpB;iBACF,CAAC,CAAC;gBAEH,qCAAqC;gBACrC,MAAM,iBAAiB,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC;gBACvD,YAAY,GAAG,CAAC,GAAG,iBAAiB,EAAE,SAAS,CAAC,CAAC;gBACjD,aAAa,GAAG,IAAA,uBAAW,EAAC,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,eAAe,CAAC;gBAC9E,SAAS,GAAG,OAAO,GAAG,CAAC,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC;gBAC9D,UAAU,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAC7B,aAAa,IAAI,eAAe,CAAC;YACnC,CAAC;QACH,CAAC;QAED,uCAAuC;QACvC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,S
AAS,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YACnD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC;oBACd,SAAS;oBACT,OAAO;oBACP,UAAU,EAAE,YAAY,CAAC,MAAM;oBAC/B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;QACL,CAAC;QAED,qBAAqB;QACrB,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACrB,KAAK,CAAC,QAAQ,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AArFD,8DAqFC"}
@@ -0,0 +1,17 @@
1
+ import type { Chunk, ChunkingOptions, ChunkingStrategy } from '../types';
2
+ export declare class RecursiveChunkingStrategy implements ChunkingStrategy { // ChunkingStrategy implementation; the implementation lives in recursive.js, which is not shown in this declaration.
3
+ getName(): string; // Returns the strategy identifier (value not visible here; presumably 'recursive' - confirm in recursive.js).
4
+ chunk(text: string, documentId: string, options: ChunkingOptions): Chunk[]; // Public entry point: turns `text` into Chunk objects for `documentId` according to `options`.
5
+ private recursiveSplit; // Private helper, declared without a signature here; see recursive.js for its behaviour.
6
+ private mergeSplits; // Private helper, declared without a signature here; see recursive.js for its behaviour.
7
+ /**
8
+ * Get overlap text that is approximately 'overlapTokens' tokens
9
+ */
10
+ private getOverlapText;
11
+ private splitByCharacters; // Private helper, declared without a signature here; see recursive.js for its behaviour.
12
+ /**
13
+ * Get text up to a token limit
14
+ */
15
+ private getTextUpToTokenLimit;
16
+ }
17
+ //# sourceMappingURL=recursive.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"recursive.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/recursive.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IAoD1E,OAAO,CAAC,cAAc;IA8CtB,OAAO,CAAC,WAAW;IAsCnB;;OAEG;IACH,OAAO,CAAC,cAAc;IAsBtB,OAAO,CAAC,iBAAiB;IAyBzB;;OAEG;IACH,OAAO,CAAC,qBAAqB;CAyB9B"}