@agentionai/agents 0.3.0-beta
- package/README.md +517 -0
- package/dist/agents/Agent.d.ts +29 -0
- package/dist/agents/Agent.js +28 -0
- package/dist/agents/AgentConfig.d.ts +118 -0
- package/dist/agents/AgentConfig.js +3 -0
- package/dist/agents/AgentEvent.d.ts +18 -0
- package/dist/agents/AgentEvent.js +26 -0
- package/dist/agents/BaseAgent.d.ts +82 -0
- package/dist/agents/BaseAgent.js +121 -0
- package/dist/agents/anthropic/ClaudeAgent.d.ts +46 -0
- package/dist/agents/anthropic/ClaudeAgent.js +262 -0
- package/dist/agents/errors/AgentError.d.ts +47 -0
- package/dist/agents/errors/AgentError.js +74 -0
- package/dist/agents/google/GeminiAgent.d.ts +63 -0
- package/dist/agents/google/GeminiAgent.js +395 -0
- package/dist/agents/mistral/MistralAgent.d.ts +47 -0
- package/dist/agents/mistral/MistralAgent.js +313 -0
- package/dist/agents/model-types.d.ts +30 -0
- package/dist/agents/model-types.js +8 -0
- package/dist/agents/openai/OpenAiAgent.d.ts +48 -0
- package/dist/agents/openai/OpenAiAgent.js +338 -0
- package/dist/chunkers/Chunker.d.ts +53 -0
- package/dist/chunkers/Chunker.js +174 -0
- package/dist/chunkers/RecursiveChunker.d.ts +52 -0
- package/dist/chunkers/RecursiveChunker.js +166 -0
- package/dist/chunkers/TextChunker.d.ts +27 -0
- package/dist/chunkers/TextChunker.js +50 -0
- package/dist/chunkers/TokenChunker.d.ts +60 -0
- package/dist/chunkers/TokenChunker.js +176 -0
- package/dist/chunkers/index.d.ts +6 -0
- package/dist/chunkers/index.js +14 -0
- package/dist/chunkers/types.d.ts +95 -0
- package/dist/chunkers/types.js +3 -0
- package/dist/graph/AgentGraph.d.ts +99 -0
- package/dist/graph/AgentGraph.js +115 -0
- package/dist/graph/BaseExecutor.d.ts +86 -0
- package/dist/graph/BaseExecutor.js +61 -0
- package/dist/graph/GraphMetrics.d.ts +143 -0
- package/dist/graph/GraphMetrics.js +264 -0
- package/dist/graph/MapExecutor.d.ts +39 -0
- package/dist/graph/MapExecutor.js +123 -0
- package/dist/graph/ParallelExecutor.d.ts +51 -0
- package/dist/graph/ParallelExecutor.js +103 -0
- package/dist/graph/Pipeline.d.ts +44 -0
- package/dist/graph/Pipeline.js +109 -0
- package/dist/graph/RouterExecutor.d.ts +89 -0
- package/dist/graph/RouterExecutor.js +209 -0
- package/dist/graph/SequentialExecutor.d.ts +44 -0
- package/dist/graph/SequentialExecutor.js +115 -0
- package/dist/graph/VotingSystem.d.ts +54 -0
- package/dist/graph/VotingSystem.js +106 -0
- package/dist/history/History.d.ts +107 -0
- package/dist/history/History.js +166 -0
- package/dist/history/RedisHistory.d.ts +27 -0
- package/dist/history/RedisHistory.js +55 -0
- package/dist/history/transformers.d.ts +102 -0
- package/dist/history/transformers.js +415 -0
- package/dist/history/types.d.ts +130 -0
- package/dist/history/types.js +55 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.js +48 -0
- package/dist/ingestion/IngestionPipeline.d.ts +86 -0
- package/dist/ingestion/IngestionPipeline.js +266 -0
- package/dist/ingestion/index.d.ts +3 -0
- package/dist/ingestion/index.js +7 -0
- package/dist/ingestion/types.d.ts +74 -0
- package/dist/ingestion/types.js +3 -0
- package/dist/team/Team.d.ts +46 -0
- package/dist/team/Team.js +104 -0
- package/dist/tools/Tool.d.ts +75 -0
- package/dist/tools/Tool.js +137 -0
- package/dist/vectorstore/Embeddings.d.ts +67 -0
- package/dist/vectorstore/Embeddings.js +54 -0
- package/dist/vectorstore/LanceDBVectorStore.d.ts +149 -0
- package/dist/vectorstore/LanceDBVectorStore.js +338 -0
- package/dist/vectorstore/OpenAIEmbeddings.d.ts +45 -0
- package/dist/vectorstore/OpenAIEmbeddings.js +109 -0
- package/dist/vectorstore/VectorStore.d.ts +255 -0
- package/dist/vectorstore/VectorStore.js +216 -0
- package/dist/vectorstore/index.d.ts +28 -0
- package/dist/vectorstore/index.js +35 -0
- package/dist/viz/VizConfig.d.ts +54 -0
- package/dist/viz/VizConfig.js +100 -0
- package/dist/viz/VizReporter.d.ts +127 -0
- package/dist/viz/VizReporter.js +595 -0
- package/dist/viz/index.d.ts +31 -0
- package/dist/viz/index.js +51 -0
- package/dist/viz/types.d.ts +105 -0
- package/dist/viz/types.js +7 -0
- package/package.json +109 -0
- package/readme.md +1 -0
package/dist/chunkers/RecursiveChunker.js
@@ -0,0 +1,166 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecursiveChunker = void 0;
const Chunker_1 = require("./Chunker");
/**
 * Recursive text chunker that tries to split on semantic boundaries.
 * It attempts to split by larger separators first (paragraphs), then
 * falls back to smaller ones (sentences, words) to keep semantic units together.
 *
 * @example
 * ```typescript
 * const chunker = new RecursiveChunker({
 *   chunkSize: 1000,
 *   chunkOverlap: 100,
 *   separators: ["\n\n", "\n", ". ", " "],
 * });
 *
 * const chunks = await chunker.chunk(document);
 * ```
 */
class RecursiveChunker extends Chunker_1.Chunker {
    constructor(config) {
        super(config);
        this.name = "RecursiveChunker";
        this.separators = config.separators ?? ["\n\n", "\n", ". ", " "];
    }
    /**
     * Split text recursively using the separator hierarchy.
     */
    splitText(text) {
        return this.recursiveSplit(text, 0);
    }
    /**
     * Recursively split text using separators at the given index.
     */
    recursiveSplit(text, separatorIndex) {
        const { chunkSize, chunkOverlap = 0 } = this.config;
        // Base case: text fits in one chunk
        if (text.length <= chunkSize) {
            return text.trim() ? [text] : [];
        }
        // No more separators: force split by character
        if (separatorIndex >= this.separators.length) {
            return this.forceSplit(text);
        }
        const separator = this.separators[separatorIndex];
        const parts = this.splitBySeparator(text, separator);
        // If separator didn't help, try the next one
        if (parts.length <= 1) {
            return this.recursiveSplit(text, separatorIndex + 1);
        }
        // Merge parts into chunks respecting size limit
        const chunks = [];
        let currentChunk = "";
        for (const part of parts) {
            const partWithSep = currentChunk ? separator + part : part;
            const wouldBeLength = currentChunk.length + partWithSep.length;
            if (wouldBeLength <= chunkSize) {
                // Part fits in current chunk
                currentChunk = currentChunk ? currentChunk + separator + part : part;
            }
            else {
                // Save current chunk if it has content
                if (currentChunk.trim()) {
                    chunks.push(currentChunk);
                }
                // Check if part itself is too big
                if (part.length > chunkSize) {
                    // Recursively split the oversized part
                    const subChunks = this.recursiveSplit(part, separatorIndex + 1);
                    chunks.push(...subChunks);
                    currentChunk = "";
                }
                else {
                    currentChunk = part;
                }
            }
        }
        // Don't forget the last chunk
        if (currentChunk.trim()) {
            chunks.push(currentChunk);
        }
        // Apply overlap if configured
        if (chunkOverlap > 0 && chunks.length > 1) {
            return this.applyOverlap(chunks, separator);
        }
        return chunks;
    }
    /**
     * Split text by separator, keeping the parts clean.
     */
    splitBySeparator(text, separator) {
        if (separator === ". ") {
            // Special handling for sentence boundaries - keep the period
            return text.split(/(?<=\.)\s+/).filter((p) => p.trim());
        }
        return text.split(separator).filter((p) => p.trim());
    }
    /**
     * Force split text by character count when no separator works.
     */
    forceSplit(text) {
        const { chunkSize, chunkOverlap = 0 } = this.config;
        const chunks = [];
        const step = chunkSize - chunkOverlap;
        let start = 0;
        while (start < text.length) {
            const end = Math.min(start + chunkSize, text.length);
            const chunk = text.slice(start, end);
            if (chunk.trim()) {
                chunks.push(chunk);
            }
            if (end >= text.length)
                break;
            start += step;
        }
        return chunks;
    }
    /**
     * Apply overlap between chunks by prepending context from previous chunk.
     */
    applyOverlap(chunks, separator) {
        const { chunkOverlap = 0 } = this.config;
        if (chunkOverlap === 0 || chunks.length <= 1) {
            return chunks;
        }
        const result = [chunks[0]];
        for (let i = 1; i < chunks.length; i++) {
            const prevChunk = chunks[i - 1];
            const currentChunk = chunks[i];
            // Get overlap from end of previous chunk
            const overlapText = this.getOverlapText(prevChunk, chunkOverlap, separator);
            if (overlapText) {
                result.push(overlapText + separator + currentChunk);
            }
            else {
                result.push(currentChunk);
            }
        }
        return result;
    }
    /**
     * Extract overlap text from the end of a chunk, trying to break at separator.
     */
    getOverlapText(text, overlapSize, separator) {
        if (text.length <= overlapSize) {
            return text;
        }
        // Try to find a clean break point near the overlap size
        const overlapStart = text.length - overlapSize;
        const sepIndex = text.indexOf(separator, overlapStart);
        if (sepIndex !== -1 && sepIndex < text.length - 1) {
            return text.slice(sepIndex + separator.length);
        }
        // Fall back to exact character overlap
        return text.slice(overlapStart);
    }
    /**
     * Get the configured separators.
     */
    getSeparators() {
        return [...this.separators];
    }
}
exports.RecursiveChunker = RecursiveChunker;
//# sourceMappingURL=RecursiveChunker.js.map
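A quick usage sketch for the class above (hedged: the relative import mirrors the dist layout, real consumers would go through the package entry point, and exact chunk boundaries depend on the input; the metadata fields read here are defined in chunkers/types.d.ts below):

import { RecursiveChunker } from "./RecursiveChunker";

// "\n\n" is tried first, so paragraphs that don't fit together end up
// in separate chunks before sentence-level splitting kicks in.
const chunker = new RecursiveChunker({ chunkSize: 80, chunkOverlap: 20 });

const text =
  "First paragraph about chunking strategies and why they matter.\n\n" +
  "Second paragraph describing overlap and how it preserves context.";

const chunks = await chunker.chunk(text, { sourceId: "demo-1" });
for (const c of chunks) {
  // Linking metadata is populated by the base Chunker.
  console.log(c.metadata.chunkIndex, "/", c.metadata.totalChunks, c.metadata.charCount);
}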
package/dist/chunkers/TextChunker.d.ts
@@ -0,0 +1,27 @@
import { Chunker } from "./Chunker";
import { ChunkerConfig } from "./types";
/**
 * Simple text chunker that splits by character count with optional overlap.
 *
 * @example
 * ```typescript
 * const chunker = new TextChunker({
 *   chunkSize: 1000,
 *   chunkOverlap: 200,
 * });
 *
 * const chunks = await chunker.chunk(longText, {
 *   sourceId: 'doc-123',
 *   sourcePath: '/docs/readme.md',
 * });
 * ```
 */
export declare class TextChunker extends Chunker {
    readonly name = "TextChunker";
    constructor(config: ChunkerConfig);
    /**
     * Split text by character count with overlap.
     */
    protected splitText(text: string): string[];
}
//# sourceMappingURL=TextChunker.d.ts.map
package/dist/chunkers/TextChunker.js
@@ -0,0 +1,50 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.TextChunker = void 0;
const Chunker_1 = require("./Chunker");
/**
 * Simple text chunker that splits by character count with optional overlap.
 *
 * @example
 * ```typescript
 * const chunker = new TextChunker({
 *   chunkSize: 1000,
 *   chunkOverlap: 200,
 * });
 *
 * const chunks = await chunker.chunk(longText, {
 *   sourceId: 'doc-123',
 *   sourcePath: '/docs/readme.md',
 * });
 * ```
 */
class TextChunker extends Chunker_1.Chunker {
    constructor(config) {
        super(config);
        this.name = "TextChunker";
    }
    /**
     * Split text by character count with overlap.
     */
    splitText(text) {
        const { chunkSize, chunkOverlap = 0 } = this.config;
        const chunks = [];
        if (text.length <= chunkSize) {
            return [text];
        }
        const step = chunkSize - chunkOverlap;
        let start = 0;
        while (start < text.length) {
            const end = Math.min(start + chunkSize, text.length);
            chunks.push(text.slice(start, end));
            // If we've reached the end, stop
            if (end >= text.length) {
                break;
            }
            start += step;
        }
        return chunks;
    }
}
exports.TextChunker = TextChunker;
//# sourceMappingURL=TextChunker.js.map
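The sliding window above advances by step = chunkSize - chunkOverlap, so with chunkSize 1000 and chunkOverlap 200 a 2,400-character input yields windows [0, 1000), [800, 1800), [1600, 2400). A minimal sketch of that arithmetic (assuming the base Chunker passes splitText output through as chunk content unchanged):

import { TextChunker } from "./TextChunker";

const chunker = new TextChunker({ chunkSize: 1000, chunkOverlap: 200 });
const text = "x".repeat(2400);

const chunks = await chunker.chunk(text);
// Three windows: [0,1000), [800,1800), [1600,2400)
console.log(chunks.map((c) => c.content.length)); // [1000, 1000, 800]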
package/dist/chunkers/TokenChunker.d.ts
@@ -0,0 +1,60 @@
import { Chunker } from "./Chunker";
import { Chunk, TokenChunkerConfig } from "./types";
/**
 * Load tokenx module using dynamic import.
 * This function can be mocked in tests.
 * @internal
 */
export declare function loadTokenx(): Promise<typeof import("tokenx")>;
/**
 * Reset the tokenx module cache. Used in tests.
 * @internal
 */
export declare function resetTokenxCache(): void;
/**
 * Token-aware text chunker using the tokenx library.
 * Splits text based on token count rather than character count,
 * ensuring chunks fit within LLM token limits.
 *
 * Uses tokenx for fast token estimation (~96% accuracy, ~2kB).
 *
 * @example
 * ```typescript
 * const chunker = new TokenChunker({
 *   chunkSize: 500, // 500 tokens per chunk
 *   chunkOverlap: 50, // 50 token overlap
 * });
 *
 * const chunks = await chunker.chunk(longDocument);
 * // Each chunk.metadata.tokenCount contains estimated tokens
 * ```
 */
export declare class TokenChunker extends Chunker {
    readonly name = "TokenChunker";
    constructor(config: TokenChunkerConfig);
    /**
     * Protected method to get tokenx - can be overridden in tests
     */
    protected getTokenx(): Promise<typeof import("tokenx")>;
    /**
     * Split text by token count using tokenx.
     */
    protected splitText(text: string): Promise<string[]>;
    /**
     * Apply token-based overlap between chunks.
     */
    private applyTokenOverlap;
    /**
     * Get approximately chunkOverlap tokens from the end of text.
     */
    private getTokenOverlap;
    /**
     * Override chunk to add token count to metadata.
     */
    chunk(text: string, options?: import("./types").ChunkOptions): Promise<Chunk[]>;
    /**
     * Estimate token count for a given text.
     */
    static estimateTokens(text: string): Promise<number>;
}
//# sourceMappingURL=TokenChunker.d.ts.map
package/dist/chunkers/TokenChunker.js
@@ -0,0 +1,176 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.TokenChunker = void 0;
exports.loadTokenx = loadTokenx;
exports.resetTokenxCache = resetTokenxCache;
const Chunker_1 = require("./Chunker");
// Cache the tokenx module after first import
let tokenxModule = null;
/**
 * Load tokenx module using dynamic import.
 * This function can be mocked in tests.
 * @internal
 */
async function loadTokenx() {
    if (!tokenxModule) {
        // Use dynamic import for ESM module
        tokenxModule = await Promise.resolve().then(() => __importStar(require("tokenx")));
    }
    return tokenxModule;
}
/**
 * Reset the tokenx module cache. Used in tests.
 * @internal
 */
function resetTokenxCache() {
    tokenxModule = null;
}
/**
 * Token-aware text chunker using the tokenx library.
 * Splits text based on token count rather than character count,
 * ensuring chunks fit within LLM token limits.
 *
 * Uses tokenx for fast token estimation (~96% accuracy, ~2kB).
 *
 * @example
 * ```typescript
 * const chunker = new TokenChunker({
 *   chunkSize: 500, // 500 tokens per chunk
 *   chunkOverlap: 50, // 50 token overlap
 * });
 *
 * const chunks = await chunker.chunk(longDocument);
 * // Each chunk.metadata.tokenCount contains estimated tokens
 * ```
 */
class TokenChunker extends Chunker_1.Chunker {
    constructor(config) {
        super(config);
        this.name = "TokenChunker";
    }
    /**
     * Protected method to get tokenx - can be overridden in tests
     */
    async getTokenx() {
        return loadTokenx();
    }
    /**
     * Split text by token count using tokenx.
     */
    async splitText(text) {
        const { chunkSize, chunkOverlap = 0 } = this.config;
        const tokenx = await this.getTokenx();
        const { splitByTokens } = tokenx;
        // Use tokenx's splitByTokens for token-aware splitting
        const chunks = splitByTokens(text, chunkSize);
        // Apply overlap if configured
        if (chunkOverlap > 0 && chunks.length > 1) {
            return this.applyTokenOverlap(chunks, text);
        }
        return chunks;
    }
    /**
     * Apply token-based overlap between chunks.
     */
    async applyTokenOverlap(chunks, _originalText) {
        const { chunkOverlap = 0 } = this.config;
        const result = [chunks[0]];
        for (let i = 1; i < chunks.length; i++) {
            const prevChunk = chunks[i - 1];
            const currentChunk = chunks[i];
            // Get overlap from end of previous chunk
            const overlapText = await this.getTokenOverlap(prevChunk, chunkOverlap);
            if (overlapText && overlapText.trim()) {
                result.push(overlapText + " " + currentChunk);
            }
            else {
                result.push(currentChunk);
            }
        }
        return result;
    }
    /**
     * Get approximately chunkOverlap tokens from the end of text.
     */
    async getTokenOverlap(text, overlapTokens) {
        const tokenx = await this.getTokenx();
        const { estimateTokenCount } = tokenx;
        // Estimate characters per token (roughly 4 chars per token for English)
        const estimatedChars = overlapTokens * 4;
        if (text.length <= estimatedChars) {
            return text;
        }
        // Start from estimated position and find a word boundary
        let start = text.length - estimatedChars;
        // Find the next space to start at a word boundary
        const spaceIndex = text.indexOf(" ", start);
        if (spaceIndex !== -1 && spaceIndex < text.length - 1) {
            start = spaceIndex + 1;
        }
        const overlap = text.slice(start);
        // Verify we're close to the target token count
        const actualTokens = estimateTokenCount(overlap);
        if (actualTokens > overlapTokens * 1.5) {
            // Too many tokens, trim more aggressively
            const words = overlap.split(/\s+/);
            const targetWords = Math.ceil(words.length * (overlapTokens / actualTokens));
            return words.slice(-targetWords).join(" ");
        }
        return overlap;
    }
    /**
     * Override chunk to add token count to metadata.
     */
    async chunk(text, options) {
        const chunks = await super.chunk(text, options);
        const tokenx = await this.getTokenx();
        const { estimateTokenCount } = tokenx;
        // Add token count to each chunk's metadata
        for (const chunk of chunks) {
            chunk.metadata.tokenCount = estimateTokenCount(chunk.content);
        }
        return chunks;
    }
    /**
     * Estimate token count for a given text.
     */
    static async estimateTokens(text) {
        const tokenx = await loadTokenx();
        return tokenx.estimateTokenCount(text);
    }
}
exports.TokenChunker = TokenChunker;
//# sourceMappingURL=TokenChunker.js.map
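Because tokenx is loaded lazily via dynamic import, both the static estimator and chunking itself are async. A minimal sketch (assuming tokenx is resolvable at runtime; the sample text is arbitrary):

import { TokenChunker } from "./TokenChunker";

const approx = await TokenChunker.estimateTokens("How many tokens is this?");
console.log(approx); // an estimate only (~96% accuracy per tokenx)

const chunker = new TokenChunker({ chunkSize: 500, chunkOverlap: 50 });
const longDocument = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(200);

const chunks = await chunker.chunk(longDocument);
// chunk() is overridden to stamp an estimated tokenCount on each chunk
console.log(chunks[0].metadata.tokenCount);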
package/dist/chunkers/index.d.ts
@@ -0,0 +1,6 @@
export { Chunk, ChunkMetadata, ChunkerConfig, ChunkOptions, RecursiveChunkerConfig, TokenChunkerConfig, } from "./types";
export { Chunker } from "./Chunker";
export { TextChunker } from "./TextChunker";
export { RecursiveChunker } from "./RecursiveChunker";
export { TokenChunker } from "./TokenChunker";
//# sourceMappingURL=index.d.ts.map
package/dist/chunkers/index.js
@@ -0,0 +1,14 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.TokenChunker = exports.RecursiveChunker = exports.TextChunker = exports.Chunker = void 0;
// Base class
var Chunker_1 = require("./Chunker");
Object.defineProperty(exports, "Chunker", { enumerable: true, get: function () { return Chunker_1.Chunker; } });
// Implementations
var TextChunker_1 = require("./TextChunker");
Object.defineProperty(exports, "TextChunker", { enumerable: true, get: function () { return TextChunker_1.TextChunker; } });
var RecursiveChunker_1 = require("./RecursiveChunker");
Object.defineProperty(exports, "RecursiveChunker", { enumerable: true, get: function () { return RecursiveChunker_1.RecursiveChunker; } });
var TokenChunker_1 = require("./TokenChunker");
Object.defineProperty(exports, "TokenChunker", { enumerable: true, get: function () { return TokenChunker_1.TokenChunker; } });
//# sourceMappingURL=index.js.map
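The barrel makes a single import site suffice. A hypothetical factory built on it (the deep "dist/chunkers" specifier is an assumption; the published export map lives in package/package.json, which is not shown in this hunk):

import { Chunker, TextChunker, RecursiveChunker, TokenChunker } from "@agentionai/agents/dist/chunkers";
import type { ChunkerConfig } from "@agentionai/agents/dist/chunkers";

// Hypothetical helper: pick an implementation by name.
function makeChunker(kind: "text" | "recursive" | "token", config: ChunkerConfig): Chunker {
  switch (kind) {
    case "text": return new TextChunker(config);
    case "recursive": return new RecursiveChunker(config);
    case "token": return new TokenChunker(config);
  }
}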
package/dist/chunkers/types.d.ts
@@ -0,0 +1,95 @@
/**
 * Represents a chunk of text with metadata for tracking and linking.
 */
export interface Chunk {
    /** Unique identifier for this chunk */
    id: string;
    /** The text content of the chunk */
    content: string;
    /** Metadata about the chunk */
    metadata: ChunkMetadata;
}
/**
 * Metadata associated with each chunk.
 */
export interface ChunkMetadata {
    /** Zero-based index of this chunk in the sequence */
    chunkIndex: number;
    /** Total number of chunks in the sequence */
    totalChunks: number;
    /** ID of the previous chunk, or null if first */
    previousChunkId: string | null;
    /** ID of the next chunk, or null if last */
    nextChunkId: string | null;
    /** Character offset where this chunk starts in the source text */
    startOffset: number;
    /** Character offset where this chunk ends in the source text */
    endOffset: number;
    /** Optional identifier for the source document */
    sourceId?: string;
    /** Optional path to the source file */
    sourcePath?: string;
    /** Number of characters in the chunk content */
    charCount: number;
    /** Estimated number of tokens (when available) */
    tokenCount?: number;
    /** SHA-256 hash of the content for deduplication */
    hash: string;
    /** Section title if detected (e.g., markdown headers) */
    sectionTitle?: string;
    [key: string]: unknown;
}
/**
 * Configuration for creating a chunker.
 */
export interface ChunkerConfig {
    /** Target size for each chunk (in characters or tokens depending on chunker) */
    chunkSize: number;
    /** Number of characters/tokens to overlap between chunks (default: 0) */
    chunkOverlap?: number;
    /**
     * Optional processor function applied to each chunk.
     * Can modify the chunk or return null to filter it out.
     */
    chunkProcessor?: (chunk: Chunk, index: number, all: Chunk[]) => Chunk | null | Promise<Chunk | null>;
    /**
     * Custom ID generator function.
     * @param content - The chunk content
     * @param index - The chunk index
     * @param sourceId - Optional source document ID
     * @returns A unique ID for the chunk
     */
    idGenerator?: (content: string, index: number, sourceId?: string) => string;
}
/**
 * Options passed when chunking text.
 */
export interface ChunkOptions {
    /** Identifier for the source document */
    sourceId?: string;
    /** Path to the source file */
    sourcePath?: string;
    /** Additional metadata to merge into each chunk */
    metadata?: Record<string, unknown>;
}
/**
 * Configuration specific to RecursiveChunker.
 */
export interface RecursiveChunkerConfig extends ChunkerConfig {
    /**
     * Separators to try in order, from largest to smallest semantic unit.
     * Default: ["\n\n", "\n", ". ", " "]
     */
    separators?: string[];
}
/**
 * Configuration specific to TokenChunker.
 */
export interface TokenChunkerConfig extends ChunkerConfig {
    /**
     * Chunk size is in tokens, not characters.
     * Uses tokenx for estimation (~96% accuracy).
     */
    chunkSize: number;
}
//# sourceMappingURL=types.d.ts.map
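chunkProcessor and idGenerator are the two ChunkerConfig hooks the JSDoc examples above don't exercise. A sketch of both (hedged: exact invocation order is up to the base Chunker; per the docs, returning null from the processor filters the chunk out):

import { createHash } from "node:crypto";
import { TextChunker } from "./TextChunker";

const chunker = new TextChunker({
  chunkSize: 1000,
  chunkOverlap: 100,
  // Drop near-empty chunks; returning null filters them out.
  chunkProcessor: (chunk) => (chunk.content.trim().length < 10 ? null : chunk),
  // Deterministic IDs: source, index, and a short content hash.
  idGenerator: (content, index, sourceId) =>
    `${sourceId ?? "doc"}-${index}-${createHash("sha256").update(content).digest("hex").slice(0, 8)}`,
});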