voctar 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +102 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/src/chunking/index.d.ts +48 -0
- package/dist/src/chunking/index.d.ts.map +1 -0
- package/dist/src/chunking/index.js +123 -0
- package/dist/src/chunking/index.js.map +1 -0
- package/dist/src/chunking/strategies/fixed.d.ts +14 -0
- package/dist/src/chunking/strategies/fixed.d.ts.map +1 -0
- package/dist/src/chunking/strategies/fixed.js +111 -0
- package/dist/src/chunking/strategies/fixed.js.map +1 -0
- package/dist/src/chunking/strategies/paragraph.d.ts +6 -0
- package/dist/src/chunking/strategies/paragraph.d.ts.map +1 -0
- package/dist/src/chunking/strategies/paragraph.js +84 -0
- package/dist/src/chunking/strategies/paragraph.js.map +1 -0
- package/dist/src/chunking/strategies/recursive.d.ts +17 -0
- package/dist/src/chunking/strategies/recursive.d.ts.map +1 -0
- package/dist/src/chunking/strategies/recursive.js +192 -0
- package/dist/src/chunking/strategies/recursive.js.map +1 -0
- package/dist/src/chunking/strategies/semantic.d.ts +96 -0
- package/dist/src/chunking/strategies/semantic.d.ts.map +1 -0
- package/dist/src/chunking/strategies/semantic.js +587 -0
- package/dist/src/chunking/strategies/semantic.js.map +1 -0
- package/dist/src/chunking/strategies/sentence.d.ts +7 -0
- package/dist/src/chunking/strategies/sentence.d.ts.map +1 -0
- package/dist/src/chunking/strategies/sentence.js +116 -0
- package/dist/src/chunking/strategies/sentence.js.map +1 -0
- package/dist/src/chunking/types.d.ts +45 -0
- package/dist/src/chunking/types.d.ts.map +1 -0
- package/dist/src/chunking/types.js +4 -0
- package/dist/src/chunking/types.js.map +1 -0
- package/dist/src/chunking/utils/tokenizer.d.ts +10 -0
- package/dist/src/chunking/utils/tokenizer.d.ts.map +1 -0
- package/dist/src/chunking/utils/tokenizer.js +50 -0
- package/dist/src/chunking/utils/tokenizer.js.map +1 -0
- package/dist/src/providers/embeddings/index.d.ts +3 -0
- package/dist/src/providers/embeddings/index.d.ts.map +1 -0
- package/dist/src/providers/embeddings/index.js +7 -0
- package/dist/src/providers/embeddings/index.js.map +1 -0
- package/dist/src/providers/embeddings/openai.d.ts +21 -0
- package/dist/src/providers/embeddings/openai.d.ts.map +1 -0
- package/dist/src/providers/embeddings/openai.js +86 -0
- package/dist/src/providers/embeddings/openai.js.map +1 -0
- package/dist/src/providers/index.d.ts +3 -0
- package/dist/src/providers/index.d.ts.map +1 -0
- package/dist/src/providers/index.js +20 -0
- package/dist/src/providers/index.js.map +1 -0
- package/dist/src/providers/stores/index.d.ts +6 -0
- package/dist/src/providers/stores/index.d.ts.map +1 -0
- package/dist/src/providers/stores/index.js +11 -0
- package/dist/src/providers/stores/index.js.map +1 -0
- package/dist/src/providers/stores/memory.d.ts +18 -0
- package/dist/src/providers/stores/memory.d.ts.map +1 -0
- package/dist/src/providers/stores/memory.js +169 -0
- package/dist/src/providers/stores/memory.js.map +1 -0
- package/dist/src/providers/stores/qdrant.d.ts +28 -0
- package/dist/src/providers/stores/qdrant.d.ts.map +1 -0
- package/dist/src/providers/stores/qdrant.js +223 -0
- package/dist/src/providers/stores/qdrant.js.map +1 -0
- package/dist/src/providers/stores/sqlite.d.ts +38 -0
- package/dist/src/providers/stores/sqlite.d.ts.map +1 -0
- package/dist/src/providers/stores/sqlite.js +306 -0
- package/dist/src/providers/stores/sqlite.js.map +1 -0
- package/dist/src/types.d.ts +111 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +32 -0
- package/dist/src/types.js.map +1 -0
- package/dist/src/vector.d.ts +74 -0
- package/dist/src/vector.d.ts.map +1 -0
- package/dist/src/vector.js +505 -0
- package/dist/src/vector.js.map +1 -0
- package/docs/API.md +361 -0
- package/docs/CHUNKING.md +280 -0
- package/docs/CUSTOM_PROVIDERS.md +101 -0
- package/docs/README.md +11 -0
- package/docs/STORAGE_BACKENDS.md +189 -0
- package/docs/assets/vectar.png +0 -0
- package/package.json +46 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Voctar contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="./docs/assets/vectar.png" alt="Voctar logo" width="180" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">Voctar</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
Simple TypeScript library with RAG primitives for embeddings, chunking, storage, and semantic retrieval.
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<img alt="npm version" src="https://img.shields.io/npm/v/voctar?logo=npm&color=CB3837" />
|
|
13
|
+
<img alt="TypeScript" src="https://img.shields.io/badge/TypeScript-5.x-3178C6?logo=typescript&logoColor=white" />
|
|
14
|
+
<img alt="Node" src="https://img.shields.io/badge/Node-%3E%3D18-339933?logo=node.js&logoColor=white" />
|
|
15
|
+
<img alt="License: MIT" src="https://img.shields.io/badge/License-MIT-yellow.svg" />
|
|
16
|
+
</p>
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- Supports multiple vector stores: SQLite, Qdrant, in-memory, or custom store providers
|
|
21
|
+
- Automatic chunking for long documents with multiple strategies (`fixed`, `recursive`, `sentence`, `paragraph`, `semantic`)
|
|
22
|
+
- Semantic search with score thresholds and metadata filtering
|
|
23
|
+
- Simple primitives: `embed`, `search` and more
|
|
24
|
+
- TypeScript-first.
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
yarn add voctar
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
```typescript
|
|
33
|
+
import { Voctar } from 'voctar';
|
|
34
|
+
|
|
35
|
+
const vector = new Voctar({
|
|
36
|
+
embedding: {
|
|
37
|
+
type: 'openai',
|
|
38
|
+
apiKey: '<your-api-key>',
|
|
39
|
+
},
|
|
40
|
+
store: {
|
|
41
|
+
type: 'sqlite',
|
|
42
|
+
path: 'data/vector.db',
|
|
43
|
+
},
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
const { documentId } = await vector.embed('documents', "Very long text...", {
|
|
47
|
+
metadata: { author: 'Alice' },
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
const results = await vector.search('documents', 'Some query');
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Primitives API
|
|
54
|
+
|
|
55
|
+
### `embed(collection, text, options?)`
|
|
56
|
+
|
|
57
|
+
Embeds a document into a collection.
|
|
58
|
+
If the text exceeds model limits, Voctar auto-chunks and stores chunk vectors.
|
|
59
|
+
|
|
60
|
+
```typescript
|
|
61
|
+
const { documentId, chunkIds } = await vector.embed('documents', longText, {
|
|
62
|
+
documentId: 'doc-1', // optional; auto-generated if omitted
|
|
63
|
+
metadata: { source: 'guide' }, // optional user metadata
|
|
64
|
+
chunkSize: 1000, // optional
|
|
65
|
+
chunkStrategy: 'recursive', // fixed | recursive | sentence | paragraph | semantic
|
|
66
|
+
chunkOverlap: 200, // optional
|
|
67
|
+
autoChunk: true, // optional override
|
|
68
|
+
});
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Returns:
|
|
72
|
+
|
|
73
|
+
- `documentId`: stable parent id for the document
|
|
74
|
+
- `chunkIds`: stored ids (single id for unchunked docs, multiple for chunked docs)
|
|
75
|
+
|
|
76
|
+
### `search(collection, query, options?)`
|
|
77
|
+
|
|
78
|
+
Retrieves semantically similar text from a collection.
|
|
79
|
+
|
|
80
|
+
```typescript
|
|
81
|
+
const results = await vector.search('documents', 'how does chunking work', {
|
|
82
|
+
limit: 5, // optional; the provider's default applies when omitted
|
|
83
|
+
scoreThreshold: 0, // optional
|
|
84
|
+
filter: { source: 'guide' }, // optional metadata filter
|
|
85
|
+
includeSystem: false, // optional; include internal metadata when true
|
|
86
|
+
});
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Each result includes:
|
|
90
|
+
|
|
91
|
+
- `id`
|
|
92
|
+
- `text`
|
|
93
|
+
- `score`
|
|
94
|
+
- `createdAt`
|
|
95
|
+
- `metadata` (and optional `system` when `includeSystem: true`)
|
|
96
|
+
|
|
97
|
+
## Documentation
|
|
98
|
+
|
|
99
|
+
- [Docs Index](./docs/README.md)
|
|
100
|
+
- [Storage Backends](./docs/STORAGE_BACKENDS.md)
|
|
101
|
+
- [Chunking](./docs/CHUNKING.md)
|
|
102
|
+
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,MAAM,IAAI,MAAM,EAAE,MAAM,cAAc,CAAC;AACxD,cAAc,iBAAiB,CAAC;AAChC,cAAc,aAAa,CAAC;AAG5B,OAAO,EAAE,QAAQ,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAC3D,cAAc,sBAAsB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
 * Re-export helpers emitted by the TypeScript compiler.
 *
 * `__createBinding(target, source, key, alias?)` exposes `source[key]` on
 * `target` under `alias` (defaults to `key`). When property descriptors are
 * available the binding is installed with `Object.defineProperty`; a getter
 * that reads `source[key]` on every access is used unless the source property
 * is already an accessor on an `__esModule` object or is a non-writable,
 * non-configurable data property, in which case its own descriptor is reused.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (target, source, key, alias) {
    if (alias === undefined) {
        alias = key;
    }
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    var wrapInGetter;
    if (!descriptor) {
        wrapInGetter = true;
    }
    else if ("get" in descriptor) {
        // Accessor property: only trusted as-is when it comes from an ES module object.
        wrapInGetter = !source.__esModule;
    }
    else {
        // Data property: a mutable one needs a getter so later changes stay visible.
        wrapInGetter = descriptor.writable || descriptor.configurable;
    }
    if (wrapInGetter) {
        descriptor = { enumerable: true, get: function () { return source[key]; } };
    }
    Object.defineProperty(target, alias, descriptor);
}) : (function (target, source, key, alias) {
    if (alias === undefined) {
        alias = key;
    }
    target[alias] = source[key];
}));
/**
 * `__exportStar(m, exports)` binds every enumerable property of `m` onto
 * `exports`, skipping `"default"` and any name `exports` already owns.
 */
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    var owns = Object.prototype.hasOwnProperty;
    for (var name in m) {
        if (name === "default") {
            continue;
        }
        if (owns.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.ChunkingService = exports.chunking = exports.Voctar = exports.Vector = void 0;
|
|
18
|
+
// Voctar exports
|
|
19
|
+
var vector_1 = require("./src/vector");
|
|
20
|
+
Object.defineProperty(exports, "Vector", { enumerable: true, get: function () { return vector_1.Vector; } });
|
|
21
|
+
Object.defineProperty(exports, "Voctar", { enumerable: true, get: function () { return vector_1.Vector; } });
|
|
22
|
+
__exportStar(require("./src/providers"), exports);
|
|
23
|
+
__exportStar(require("./src/types"), exports);
|
|
24
|
+
// Chunking exports
|
|
25
|
+
var chunking_1 = require("./src/chunking");
|
|
26
|
+
Object.defineProperty(exports, "chunking", { enumerable: true, get: function () { return chunking_1.chunking; } });
|
|
27
|
+
Object.defineProperty(exports, "ChunkingService", { enumerable: true, get: function () { return chunking_1.ChunkingService; } });
|
|
28
|
+
__exportStar(require("./src/chunking/types"), exports);
|
|
29
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;AAAA,iBAAiB;AACjB,uCAAwD;AAA/C,gGAAA,MAAM,OAAA;AAAE,gGAAA,MAAM,OAAU;AACjC,kDAAgC;AAChC,8CAA4B;AAE5B,mBAAmB;AACnB,2CAA2D;AAAlD,oGAAA,QAAQ,OAAA;AAAE,2GAAA,eAAe,OAAA;AAClC,uDAAqC"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import type { ChunkingOptions, ChunkingStrategy, DocumentChunkResult } from './types';
|
|
2
|
+
/**
 * Registry of named chunking strategies and the entry point for splitting
 * documents into chunks.
 */
export declare class ChunkingService {
    private strategies;
    private defaultStrategy;
    constructor();
    /**
     * Adds a strategy to the registry under the name returned by its
     * `getName()`. Registering a second strategy with the same name replaces
     * the first.
     */
    registerStrategy(strategy: ChunkingStrategy): void;
    /**
     * Splits one document using `options.strategy`, or the default strategy
     * when none is given.
     *
     * @param text - Document text to split.
     * @param options - Chunking options; passed through to the strategy.
     * @param documentId - Id of the document; a UUID is generated when omitted.
     * @returns The document id, the chunks, and metadata (`originalLength`,
     * `totalChunks`, `strategy`, merged with `options.metadata`).
     * @throws Error when the requested strategy is not registered.
     */
    chunkDocument(text: string, options?: ChunkingOptions, documentId?: string): DocumentChunkResult;
    /**
     * Splits several documents with the same options. Each document's own
     * `metadata` is merged over `options.metadata` before chunking.
     *
     * @returns One result per input document, in input order.
     */
    chunkDocuments(documents: {
        text: string;
        id?: string;
        metadata?: Record<string, any>;
    }[], options?: ChunkingOptions): DocumentChunkResult[];
    /**
     * Returns the token count of `text` as reported by the package tokenizer
     * (`countTokens` from `./utils/tokenizer`).
     */
    estimateTokens(text: string): number;
    /**
     * Computes a chunk size, in tokens, that leaves headroom below an
     * embedding model's token limit: `floor(modelTokenLimit * safetyMargin)`.
     *
     * @param modelTokenLimit - Model token limit; defaults to 8192.
     * @param safetyMargin - Fraction of the limit to use; defaults to 0.8.
     */
    getOptimalChunkSize(modelTokenLimit?: number, safetyMargin?: number): number;
    /**
     * Selects the strategy used when `options.strategy` is not set.
     *
     * @throws Error when no strategy is registered under `strategy`.
     */
    setDefaultStrategy(strategy: string): void;
    /**
     * Returns the names of all registered strategies.
     */
    getAvailableStrategies(): string[];
}
|
|
40
|
+
export declare const chunking: ChunkingService;
|
|
41
|
+
export * from './types';
|
|
42
|
+
export { FixedSizeChunkingStrategy } from './strategies/fixed';
|
|
43
|
+
export { RecursiveChunkingStrategy } from './strategies/recursive';
|
|
44
|
+
export { SentenceChunkingStrategy } from './strategies/sentence';
|
|
45
|
+
export { ParagraphChunkingStrategy } from './strategies/paragraph';
|
|
46
|
+
export { SemanticChunkingStrategy } from './strategies/semantic';
|
|
47
|
+
export type { SemanticChunkingOptions } from './strategies/semantic';
|
|
48
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/chunking/index.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAS,eAAe,EAAE,gBAAgB,EAAE,mBAAmB,EAAE,MAAM,SAAS,CAAC;AAO7F,qBAAa,eAAe;IAC1B,OAAO,CAAC,UAAU,CAAgC;IAClD,OAAO,CAAC,eAAe,CAAuB;;IAa9C;;OAEG;IACH,gBAAgB,CAAC,QAAQ,EAAE,gBAAgB,GAAG,IAAI;IAIlD;;OAEG;IACH,aAAa,CACX,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,eAAoB,EAC7B,UAAU,CAAC,EAAE,MAAM,GAClB,mBAAmB;IAuBtB;;OAEG;IACH,cAAc,CACZ,SAAS,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,EAAE,CAAC,EAAE,MAAM,CAAC;QAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;KAAE,CAAC,EAC/E,OAAO,GAAE,eAAoB,GAC5B,mBAAmB,EAAE;IAWxB;;OAEG;IACH,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAKpC;;;OAGG;IACH,mBAAmB,CAAC,eAAe,GAAE,MAAa,EAAE,YAAY,GAAE,MAAY,GAAG,MAAM;IAMvF;;OAEG;IACH,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,IAAI;IAO1C;;OAEG;IACH,sBAAsB,IAAI,MAAM,EAAE;CAGnC;AAGD,eAAO,MAAM,QAAQ,iBAAwB,CAAC;AAG9C,cAAc,SAAS,CAAC;AACxB,OAAO,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAC/D,OAAO,EAAE,yBAAyB,EAAE,MAAM,wBAAwB,CAAC;AACnE,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACjE,OAAO,EAAE,yBAAyB,EAAE,MAAM,wBAAwB,CAAC;AACnE,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACjE,YAAY,EAAE,uBAAuB,EAAE,MAAM,uBAAuB,CAAC"}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
 * Re-export helpers emitted by the TypeScript compiler.
 *
 * `__createBinding(target, source, key, alias?)` exposes `source[key]` on
 * `target` under `alias` (defaults to `key`). When property descriptors are
 * available the binding is installed with `Object.defineProperty`; a getter
 * that reads `source[key]` on every access is used unless the source property
 * is already an accessor on an `__esModule` object or is a non-writable,
 * non-configurable data property, in which case its own descriptor is reused.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (target, source, key, alias) {
    if (alias === undefined) {
        alias = key;
    }
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    var wrapInGetter;
    if (!descriptor) {
        wrapInGetter = true;
    }
    else if ("get" in descriptor) {
        // Accessor property: only trusted as-is when it comes from an ES module object.
        wrapInGetter = !source.__esModule;
    }
    else {
        // Data property: a mutable one needs a getter so later changes stay visible.
        wrapInGetter = descriptor.writable || descriptor.configurable;
    }
    if (wrapInGetter) {
        descriptor = { enumerable: true, get: function () { return source[key]; } };
    }
    Object.defineProperty(target, alias, descriptor);
}) : (function (target, source, key, alias) {
    if (alias === undefined) {
        alias = key;
    }
    target[alias] = source[key];
}));
/**
 * `__exportStar(m, exports)` binds every enumerable property of `m` onto
 * `exports`, skipping `"default"` and any name `exports` already owns.
 */
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    var owns = Object.prototype.hasOwnProperty;
    for (var name in m) {
        if (name === "default") {
            continue;
        }
        if (owns.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.SemanticChunkingStrategy = exports.ParagraphChunkingStrategy = exports.SentenceChunkingStrategy = exports.RecursiveChunkingStrategy = exports.FixedSizeChunkingStrategy = exports.chunking = exports.ChunkingService = void 0;
|
|
18
|
+
// Main chunking service
|
|
19
|
+
const uuid_1 = require("uuid");
|
|
20
|
+
const fixed_1 = require("./strategies/fixed");
|
|
21
|
+
const recursive_1 = require("./strategies/recursive");
|
|
22
|
+
const sentence_1 = require("./strategies/sentence");
|
|
23
|
+
const paragraph_1 = require("./strategies/paragraph");
|
|
24
|
+
const semantic_1 = require("./strategies/semantic");
|
|
25
|
+
/**
 * Keeps a name -> strategy registry and routes chunking requests to the
 * selected strategy.
 */
class ChunkingService {
    constructor() {
        this.defaultStrategy = 'recursive';
        this.strategies = new Map();
        // Built-in strategies; each is stored under the name its getName() reports.
        const builtIns = [
            fixed_1.FixedSizeChunkingStrategy,
            recursive_1.RecursiveChunkingStrategy,
            sentence_1.SentenceChunkingStrategy,
            paragraph_1.ParagraphChunkingStrategy,
            semantic_1.SemanticChunkingStrategy,
        ];
        for (const BuiltIn of builtIns) {
            this.registerStrategy(new BuiltIn());
        }
    }
    /**
     * Stores a strategy under its own name, replacing any strategy already
     * registered with that name.
     */
    registerStrategy(strategy) {
        const name = strategy.getName();
        this.strategies.set(name, strategy);
    }
    /**
     * Splits a single document.
     *
     * Uses `options.strategy` when set, otherwise the service default. A UUID
     * is generated when `documentId` is not supplied. Throws an Error when the
     * chosen strategy is not registered. The returned metadata starts from
     * `originalLength`, `totalChunks` and `strategy`, then has
     * `options.metadata` spread over it.
     */
    chunkDocument(text, options = {}, documentId) {
        const resolvedId = documentId || (0, uuid_1.v4)();
        const strategyName = options.strategy || this.defaultStrategy;
        const implementation = this.strategies.get(strategyName);
        if (!implementation) {
            throw new Error(`Unknown chunking strategy: ${strategyName}`);
        }
        const chunks = implementation.chunk(text, resolvedId, options);
        const metadata = {
            originalLength: text.length,
            totalChunks: chunks.length,
            strategy: strategyName,
            ...options.metadata,
        };
        return { documentId: resolvedId, chunks, metadata };
    }
    /**
     * Splits each document in turn with shared options. Per-document metadata
     * is merged over `options.metadata`, so document values win on conflict.
     */
    chunkDocuments(documents, options = {}) {
        const chunkOne = (entry) => {
            const mergedMetadata = { ...options.metadata, ...entry.metadata };
            const perDocumentOptions = { ...options, metadata: mergedMetadata };
            return this.chunkDocument(entry.text, perDocumentOptions, entry.id);
        };
        return documents.map(chunkOne);
    }
    /**
     * Counts the tokens in `text` using `countTokens` from the tokenizer
     * utility module, which is loaded on first use of this method.
     */
    estimateTokens(text) {
        const tokenizer = require('./utils/tokenizer');
        const count = tokenizer.countTokens;
        return count(text);
    }
    /**
     * Returns `floor(modelTokenLimit * safetyMargin)`: a chunk size expressed
     * in tokens (not characters) that stays below the model limit.
     */
    getOptimalChunkSize(modelTokenLimit = 8192, safetyMargin = 0.8) {
        return Math.floor(modelTokenLimit * safetyMargin);
    }
    /**
     * Changes the strategy used when a request names none. Throws an Error
     * when `strategy` is not a registered name.
     */
    setDefaultStrategy(strategy) {
        const isRegistered = this.strategies.has(strategy);
        if (!isRegistered) {
            throw new Error(`Unknown strategy: ${strategy}`);
        }
        this.defaultStrategy = strategy;
    }
    /**
     * Lists the names of all registered strategies in registration order.
     */
    getAvailableStrategies() {
        return [...this.strategies.keys()];
    }
}
|
|
108
|
+
exports.ChunkingService = ChunkingService;
|
|
109
|
+
// Export singleton instance
|
|
110
|
+
exports.chunking = new ChunkingService();
|
|
111
|
+
// Export types and strategies
|
|
112
|
+
__exportStar(require("./types"), exports);
|
|
113
|
+
var fixed_2 = require("./strategies/fixed");
|
|
114
|
+
Object.defineProperty(exports, "FixedSizeChunkingStrategy", { enumerable: true, get: function () { return fixed_2.FixedSizeChunkingStrategy; } });
|
|
115
|
+
var recursive_2 = require("./strategies/recursive");
|
|
116
|
+
Object.defineProperty(exports, "RecursiveChunkingStrategy", { enumerable: true, get: function () { return recursive_2.RecursiveChunkingStrategy; } });
|
|
117
|
+
var sentence_2 = require("./strategies/sentence");
|
|
118
|
+
Object.defineProperty(exports, "SentenceChunkingStrategy", { enumerable: true, get: function () { return sentence_2.SentenceChunkingStrategy; } });
|
|
119
|
+
var paragraph_2 = require("./strategies/paragraph");
|
|
120
|
+
Object.defineProperty(exports, "ParagraphChunkingStrategy", { enumerable: true, get: function () { return paragraph_2.ParagraphChunkingStrategy; } });
|
|
121
|
+
var semantic_2 = require("./strategies/semantic");
|
|
122
|
+
Object.defineProperty(exports, "SemanticChunkingStrategy", { enumerable: true, get: function () { return semantic_2.SemanticChunkingStrategy; } });
|
|
123
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/chunking/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;AAAA,wBAAwB;AACxB,+BAAoC;AAEpC,8CAA+D;AAC/D,sDAAmE;AACnE,oDAAiE;AACjE,sDAAmE;AACnE,oDAAiE;AAEjE,MAAa,eAAe;IAI1B;QAFQ,oBAAe,GAAW,WAAW,CAAC;QAG5C,IAAI,CAAC,UAAU,GAAG,IAAI,GAAG,EAAE,CAAC;QAE5B,+BAA+B;QAC/B,IAAI,CAAC,gBAAgB,CAAC,IAAI,iCAAyB,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,gBAAgB,CAAC,IAAI,qCAAyB,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,gBAAgB,CAAC,IAAI,mCAAwB,EAAE,CAAC,CAAC;QACtD,IAAI,CAAC,gBAAgB,CAAC,IAAI,qCAAyB,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,gBAAgB,CAAC,IAAI,mCAAwB,EAAE,CAAC,CAAC;IACxD,CAAC;IAED;;OAEG;IACH,gBAAgB,CAAC,QAA0B;QACzC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IACpD,CAAC;IAED;;OAEG;IACH,aAAa,CACX,IAAY,EACZ,UAA2B,EAAE,EAC7B,UAAmB;QAEnB,MAAM,KAAK,GAAG,UAAU,IAAI,IAAA,SAAM,GAAE,CAAC;QACrC,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,IAAI,CAAC,eAAe,CAAC;QAE1D,MAAM,gBAAgB,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACvD,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,8BAA8B,QAAQ,EAAE,CAAC,CAAC;QAC5D,CAAC;QAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;QAE5D,OAAO;YACL,UAAU,EAAE,KAAK;YACjB,MAAM;YACN,QAAQ,EAAE;gBACR,cAAc,EAAE,IAAI,CAAC,MAAM;gBAC3B,WAAW,EAAE,MAAM,CAAC,MAAM;gBAC1B,QAAQ;gBACR,GAAG,OAAO,CAAC,QAAQ;aACpB;SACF,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,cAAc,CACZ,SAA+E,EAC/E,UAA2B,EAAE;QAE7B,OAAO,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE;YACzB,MAAM,UAAU,GAAoB;gBAClC,GAAG,OAAO;gBACV,QAAQ,EAAE,EAAE,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,GAAG,CAAC,QAAQ,EAAE;aACnD,CAAC;YAEF,OAAO,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,IAAI,EAAE,UAAU,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,IAAY;QACzB,MAAM,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAAC;QACrD,OAAO,WAAW,CAAC,IAAI,CAAC,CAAC;IAC3B,CAAC;IAED;;;OAGG;IACH,mBAAmB,CAAC,kBAA0B,IAAI,EAAE,eAAuB,GAAG;QAC5E,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,GAAG,YAAY,CAAC,CAAC;QAClE,0CAA0C;QAC1C,OAAO,cAAc,CAAC;IACxB,CAAC;IAED;;OAEG;IACH,kBAAkB,CAAC,QAA
gB;QACjC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,qBAAqB,QAAQ,EAAE,CAAC,CAAC;QACnD,CAAC;QACD,IAAI,CAAC,eAAe,GAAG,QAAQ,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,sBAAsB;QACpB,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;IAC5C,CAAC;CACF;AAvGD,0CAuGC;AAED,4BAA4B;AACf,QAAA,QAAQ,GAAG,IAAI,eAAe,EAAE,CAAC;AAE9C,8BAA8B;AAC9B,0CAAwB;AACxB,4CAA+D;AAAtD,kHAAA,yBAAyB,OAAA;AAClC,oDAAmE;AAA1D,sHAAA,yBAAyB,OAAA;AAClC,kDAAiE;AAAxD,oHAAA,wBAAwB,OAAA;AACjC,oDAAmE;AAA1D,sHAAA,yBAAyB,OAAA;AAClC,kDAAiE;AAAxD,oHAAA,wBAAwB,OAAA"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { Chunk, ChunkingOptions, ChunkingStrategy } from '../types';
|
|
2
|
+
/**
 * Chunking strategy that cuts text into consecutive pieces bounded by a token
 * budget, with a token-based overlap between neighbouring pieces.
 */
export declare class FixedSizeChunkingStrategy implements ChunkingStrategy {
    /**
     * Returns the registry name of this strategy: `'fixed'`.
     */
    getName(): string;
    /**
     * Splits `text` into chunks of at most `options.maxChunkSize` tokens
     * (default 1000, capped at `options.tokenLimit`, default 8192).
     * `options.overlap` (default 200) is capped at 20% of the chunk size.
     * Unless `options.preserveFormatting` is set, whitespace runs are
     * collapsed to single spaces and the text is trimmed before splitting.
     *
     * @param text - Text to split.
     * @param documentId - Id recorded in each chunk's metadata.
     * @param options - Chunking options; `options.metadata` is merged into each chunk's metadata.
     * @returns The chunks in document order.
     */
    chunk(text: string, documentId: string, options: ChunkingOptions): Chunk[];
    /**
     * Returns the leading part of a text that fits a token limit.
     */
    private getTextUpToTokenLimit;
    /**
     * Returns the trailing part of a text that is approximately
     * `overlapTokens` tokens long.
     */
    private getOverlapText;
}
|
|
14
|
+
//# sourceMappingURL=fixed.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fixed.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/fixed.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IA4D1E;;OAEG;IACH,OAAO,CAAC,qBAAqB;IA0B7B;;OAEG;IACH,OAAO,CAAC,cAAc;CAuBvB"}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.FixedSizeChunkingStrategy = void 0;
|
|
4
|
+
// Fixed-size chunking strategy
|
|
5
|
+
const uuid_1 = require("uuid");
|
|
6
|
+
const tokenizer_1 = require("../utils/tokenizer");
|
|
7
|
+
/**
 * Splits text into consecutive chunks bounded by a token budget, repeating a
 * token-based overlap between neighbouring chunks.
 */
class FixedSizeChunkingStrategy {
    /**
     * Returns the name this strategy is registered under.
     */
    getName() {
        return 'fixed';
    }
    /**
     * Splits `text` into chunks of at most `maxChunkSize` tokens.
     *
     * @param text - Text to split.
     * @param documentId - Id recorded in each chunk's metadata.
     * @param options - `tokenLimit` (default 8192) caps `maxChunkSize`
     * (default 1000); `overlap` (default 200) is clamped to the range
     * [0, 20% of the chunk size]; `preserveFormatting` (default false) keeps
     * whitespace as-is; `metadata` is merged into each chunk's metadata.
     * @returns Chunks in document order. Each carries `chunkIndex`,
     * `totalChunks`, `startChar`/`endChar` offsets into the (normalized) text
     * and its token count. Empty text yields an empty array.
     */
    chunk(text, documentId, options) {
        // Get token limit and ensure maxSize doesn't exceed it
        const tokenLimit = options.tokenLimit ?? 8192;
        const maxSize = Math.min(options.maxChunkSize ?? 1000, tokenLimit);
        // A negative overlap is meaningless; treat it as "no overlap".
        const overlap = Math.max(0, Math.min(options.overlap ?? 200, Math.floor(maxSize * 0.2)));
        const preserveFormatting = options.preserveFormatting ?? false;
        // Normalize text if not preserving formatting
        const normalizedText = preserveFormatting
            ? text
            : text.replace(/\s+/g, ' ').trim();
        const chunks = [];
        let startChar = 0;
        let chunkIndex = 0;
        while (startChar < normalizedText.length) {
            // Get text up to token limit
            const remainingText = normalizedText.slice(startChar);
            const chunkText = this.getTextUpToTokenLimit(remainingText, maxSize);
            if (!chunkText) {
                break;
            }
            const endChar = startChar + chunkText.length;
            chunks.push({
                id: (0, uuid_1.v4)(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex,
                    totalChunks: 0, // Will be updated after all chunks are created
                    startChar,
                    endChar,
                    tokens: (0, tokenizer_1.countTokens)(chunkText),
                    ...options.metadata,
                },
            });
            chunkIndex++;
            // The chunk just emitted reaches the end of the text, so everything is
            // covered. Stepping back by the overlap here would re-emit the same tail
            // on every iteration and the loop would never finish.
            if (endChar >= normalizedText.length) {
                break;
            }
            // Calculate overlap position using token count
            const overlapText = this.getOverlapText(chunkText, overlap);
            // The next chunk must start after the current one started; if the overlap
            // would swallow the whole chunk, continue without overlap instead.
            const overlapLength = overlapText.length < chunkText.length ? overlapText.length : 0;
            startChar = endChar - overlapLength;
        }
        // Update totalChunks for all chunks
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Returns the longest prefix of `text` whose token count does not exceed
     * `maxTokens`, found by binary search over the prefix length.
     *
     * If no non-empty prefix fits, falls back to the first 80% of the text;
     * that fallback is not checked against `maxTokens`.
     */
    getTextUpToTokenLimit(text, maxTokens) {
        if ((0, tokenizer_1.countTokens)(text) <= maxTokens) {
            return text;
        }
        // Binary search for the right character position
        let start = 0;
        let end = text.length;
        let bestMatch = '';
        while (start < end) {
            const mid = Math.floor((start + end) / 2);
            const candidate = text.slice(0, mid);
            const tokens = (0, tokenizer_1.countTokens)(candidate);
            if (tokens <= maxTokens) {
                bestMatch = candidate;
                start = mid + 1;
            }
            else {
                end = mid;
            }
        }
        return bestMatch || text.slice(0, Math.floor(text.length * 0.8)); // Fallback to 80%
    }
    /**
     * Returns the longest suffix of `text` whose token count does not exceed
     * `overlapTokens`, found by binary search over the suffix start.
     *
     * Returns '' when `overlapTokens` is zero or negative. If no suffix fits,
     * falls back to the last 10% of the text, or '' when that is less than
     * one character.
     */
    getOverlapText(text, overlapTokens) {
        if (overlapTokens <= 0) {
            return '';
        }
        // Binary search for the right amount of text
        let start = 0;
        let end = text.length;
        let bestMatch = '';
        while (start < end) {
            const mid = Math.floor((start + end) / 2);
            const candidate = text.slice(mid);
            const tokens = (0, tokenizer_1.countTokens)(candidate);
            if (tokens <= overlapTokens) {
                bestMatch = candidate;
                end = mid;
            }
            else {
                start = mid + 1;
            }
        }
        if (bestMatch) {
            return bestMatch;
        }
        // Fallback to last 10%. slice(-0) would return the whole text, so a
        // zero-length fallback is returned as '' explicitly.
        const fallbackLength = Math.floor(text.length * 0.1);
        return fallbackLength > 0 ? text.slice(-fallbackLength) : '';
    }
}
|
|
110
|
+
exports.FixedSizeChunkingStrategy = FixedSizeChunkingStrategy;
|
|
111
|
+
//# sourceMappingURL=fixed.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fixed.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/fixed.ts"],"names":[],"mappings":";;;AAAA,+BAA+B;AAC/B,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC;QAC5E,MAAM,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,IAAI,KAAK,CAAC;QAE/D,8CAA8C;QAC9C,MAAM,cAAc,GAAG,kBAAkB;YACvC,CAAC,CAAC,IAAI;YACN,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAErC,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,OAAO,SAAS,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC;YACzC,6BAA6B;YAC7B,MAAM,aAAa,GAAG,cAAc,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YACtD,MAAM,SAAS,GAAG,IAAI,CAAC,qBAAqB,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;YAErE,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACzC,MAAM;YACR,CAAC;YAED,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC,EAAE,+CAA+C;oBAC/D,SAAS;oBACT,OAAO;oBACP,MAAM,EAAE,IAAA,uBAAW,EAAC,SAAS,CAAC;oBAC9B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;YAEH,+CAA+C;YAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAC5D,SAAS,GAAG,OAAO,GAAG,WAAW,CAAC,MAAM,CAAC;YACzC,UAAU,EAAE,CAAC;YAEb,oDAAoD;YACpD,IAAI,cAAc,CAAC,MAAM,GAAG,SAAS,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC;gBAC3D,MAAM;YACR,CAAC;QACH,CAAC;QAED,oCAAoC;QACpC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACrB,KAAK,CAAC,QAAQ,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,qBAAqB,CAAC,IAAY,EAAE,SAAiB;QAC3D,IAAI,IAAA,uBAAW,EAAC,IAAI,CAAC,IAAI,SAAS,EAAE,CAAC;YACnC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,KAAK,GAAG,CAAC,CAAC;QA
Cd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC;gBACtB,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,kBAAkB;IACtF,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAY,EAAE,aAAqB;QACxD,IAAI,aAAa,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEnC,6CAA6C;QAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,aAAa,EAAE,CAAC;gBAC5B,SAAS,GAAG,SAAS,CAAC;gBACtB,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,uBAAuB;IACzF,CAAC;CACF;AAxHD,8DAwHC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Chunk, ChunkingOptions, ChunkingStrategy } from '../types';
|
|
2
|
+
/**
 * Chunking strategy that groups whole paragraphs into chunks.
 *
 * Paragraphs are the blank-line-separated blocks of the input text; they are
 * accumulated into a chunk until adding the next one would exceed the token
 * budget derived from `options.maxChunkSize` and `options.tokenLimit`.
 */
export declare class ParagraphChunkingStrategy implements ChunkingStrategy {
    /**
     * @returns The identifier of this strategy, `'paragraph'`.
     */
    getName(): string;

    /**
     * Splits `text` into paragraph-aligned chunks.
     *
     * @param text - Text to split; paragraphs are separated by blank lines.
     * @param documentId - Identifier copied into every chunk's metadata.
     * @param options - Chunking options. `overlap` is interpreted as a number
     *   of paragraphs repeated at the start of the next chunk.
     * @returns The chunks in document order; empty when `text` has no
     *   non-blank paragraph.
     */
    chunk(
        text: string,
        documentId: string,
        options: ChunkingOptions
    ): Chunk[];
}
|
|
6
|
+
//# sourceMappingURL=paragraph.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"paragraph.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/paragraph.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;CAgF3E"}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ParagraphChunkingStrategy = void 0;
|
|
4
|
+
// Paragraph-based chunking strategy
|
|
5
|
+
const uuid_1 = require("uuid");
|
|
6
|
+
const tokenizer_1 = require("../utils/tokenizer");
|
|
7
|
+
/**
 * Chunking strategy that groups whole paragraphs into chunks bounded by a
 * token budget.
 *
 * Paragraphs are obtained by splitting the input on blank lines, trimming each
 * piece and dropping empty ones. Paragraphs are never split: a single paragraph
 * larger than the budget is still emitted whole.
 */
class ParagraphChunkingStrategy {
    /**
     * @returns The identifier of this strategy, `'paragraph'`.
     */
    getName() {
        return 'paragraph';
    }
    /**
     * Splits `text` into paragraph-aligned chunks.
     *
     * Options read here:
     * - `tokenLimit` (default 8192) and `maxChunkSize` (default 2000): the
     *   effective budget is the smaller of the two, measured with `countTokens`.
     * - `overlap` (default 1): number of trailing paragraphs of a chunk that are
     *   repeated at the start of the next one. Zero, negative or non-numeric
     *   values mean "no overlap".
     * - `metadata`: spread last into each chunk's metadata, so it can override
     *   the computed fields (except `totalChunks`, which is assigned afterwards).
     *
     * `startChar` / `endChar` are offsets into the normalized text (trimmed
     * paragraphs joined by a blank line), not into the raw input.
     *
     * @param text - Text to split.
     * @param documentId - Identifier copied into every chunk's metadata.
     * @param options - Chunking options (see above).
     * @returns The chunks in document order; empty when there is no non-blank paragraph.
     */
    chunk(text, documentId, options) {
        const separator = '\n\n';
        // Get token limit and ensure maxSize doesn't exceed it
        const tokenLimit = options.tokenLimit ?? 8192;
        const maxSize = Math.min(options.maxChunkSize ?? 2000, tokenLimit);
        // Overlap in number of paragraphs, normalized to a non-negative integer.
        const requestedOverlap = Math.floor(Number(options.overlap ?? 1));
        const overlap = requestedOverlap > 0 ? requestedOverlap : 0;
        // Split text into paragraphs
        const paragraphs = text
            .split(/\n\s*\n/)
            .map(p => p.trim())
            .filter(p => p.length > 0);
        const chunks = [];
        let currentChunk = [];
        let currentTokens = 0;
        let chunkIndex = 0;
        let startChar = 0;
        // Emits the accumulated paragraphs as one chunk and returns its end offset.
        const flush = () => {
            const chunkText = currentChunk.join(separator).trim();
            const endChar = startChar + chunkText.length;
            chunks.push({
                id: (0, uuid_1.v4)(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex,
                    totalChunks: 0, // Will be updated once all chunks are known
                    startChar,
                    endChar,
                    paragraphs: currentChunk.length,
                    ...options.metadata,
                },
            });
            chunkIndex++;
            return endChar;
        };
        for (const paragraph of paragraphs) {
            const paragraphTokens = (0, tokenizer_1.countTokens)(paragraph);
            // If adding this paragraph would exceed maxSize (in tokens) and we have content, create a chunk
            if (currentChunk.length > 0 && currentTokens + paragraphTokens > maxSize) {
                const endChar = flush();
                // `slice(-0)` is `slice(0)` and would copy the whole chunk, so a
                // zero overlap has to be handled explicitly.
                const carried = overlap > 0 ? currentChunk.slice(-overlap) : [];
                const carriedText = carried.join(separator);
                currentChunk = [...carried, paragraph];
                currentTokens =
                    (carried.length > 0 ? (0, tokenizer_1.countTokens)(carriedText) : 0) + paragraphTokens;
                // The next chunk starts where the carried paragraphs start, or, with
                // nothing carried, right after the separator that follows this chunk.
                startChar = carried.length > 0
                    ? endChar - carriedText.length
                    : endChar + separator.length;
            }
            else {
                currentChunk.push(paragraph);
                currentTokens += paragraphTokens;
            }
        }
        // Add remaining content as final chunk
        if (currentChunk.length > 0) {
            flush();
        }
        // Update totalChunks
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
}
|
|
83
|
+
exports.ParagraphChunkingStrategy = ParagraphChunkingStrategy;
|
|
84
|
+
//# sourceMappingURL=paragraph.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"paragraph.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/paragraph.ts"],"names":[],"mappings":";;;AAAA,oCAAoC;AACpC,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,kCAAkC;QAExE,6BAA6B;QAC7B,MAAM,UAAU,GAAG,IAAI;aACpB,KAAK,CAAC,SAAS,CAAC;aAChB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAE7B,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,eAAe,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAE/C,gGAAgG;YAChG,IAAI,aAAa,GAAG,eAAe,GAAG,OAAO,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzE,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;gBACnD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;gBAE7C,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAA,SAAM,GAAE;oBACZ,IAAI,EAAE,SAAS;oBACf,QAAQ,EAAE;wBACR,UAAU;wBACV,UAAU;wBACV,WAAW,EAAE,CAAC,EAAE,wBAAwB;wBACxC,SAAS;wBACT,OAAO;wBACP,UAAU,EAAE,YAAY,CAAC,MAAM;wBAC/B,GAAG,OAAO,CAAC,QAAQ;qBACpB;iBACF,CAAC,CAAC;gBAEH,qCAAqC;gBACrC,MAAM,iBAAiB,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC;gBACvD,YAAY,GAAG,CAAC,GAAG,iBAAiB,EAAE,SAAS,CAAC,CAAC;gBACjD,aAAa,GAAG,IAAA,uBAAW,EAAC,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,eAAe,CAAC;gBAC9E,SAAS,GAAG,OAAO,GAAG,CAAC,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC;gBAC9D,UAAU,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAC7B,aAAa,IAAI,eAAe,CAAC;YACnC,CAAC;QACH,CAAC;QAED,uCAAuC;QACvC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,SAA
S,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YACnD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC;oBACd,SAAS;oBACT,OAAO;oBACP,UAAU,EAAE,YAAY,CAAC,MAAM;oBAC/B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;QACL,CAAC;QAED,qBAAqB;QACrB,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACrB,KAAK,CAAC,QAAQ,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AArFD,8DAqFC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { Chunk, ChunkingOptions, ChunkingStrategy } from '../types';
|
|
2
|
+
/**
 * Declaration of the recursive chunking strategy.
 *
 * Only `getName` and `chunk` are part of the typed public surface; the private
 * members below are listed by name only, so their signatures and exact
 * behavior cannot be read from this declaration.
 */
export declare class RecursiveChunkingStrategy implements ChunkingStrategy {
    /**
     * @returns The name identifying this strategy.
     */
    getName(): string;

    /**
     * Splits `text` into chunks for the document identified by `documentId`.
     *
     * @param text - Text to split.
     * @param documentId - Identifier of the document the text belongs to.
     * @param options - Chunking options.
     * @returns The produced chunks.
     */
    chunk(
        text: string,
        documentId: string,
        options: ChunkingOptions
    ): Chunk[];

    // NOTE(review): presumably the recursive splitting step — confirm in recursive.ts.
    private recursiveSplit;

    // NOTE(review): presumably recombines split pieces into chunks — confirm in recursive.ts.
    private mergeSplits;

    /**
     * Get overlap text that is approximately 'overlapTokens' tokens
     */
    private getOverlapText;

    // NOTE(review): presumably a character-level fallback split — confirm in recursive.ts.
    private splitByCharacters;

    /**
     * Get text up to a token limit
     */
    private getTextUpToTokenLimit;
}
|
|
17
|
+
//# sourceMappingURL=recursive.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"recursive.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/recursive.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IAoD1E,OAAO,CAAC,cAAc;IA8CtB,OAAO,CAAC,WAAW;IAsCnB;;OAEG;IACH,OAAO,CAAC,cAAc;IAsBtB,OAAO,CAAC,iBAAiB;IAyBzB;;OAEG;IACH,OAAO,CAAC,qBAAqB;CAyB9B"}
|