voctar 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +102 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/src/chunking/index.d.ts +48 -0
- package/dist/src/chunking/index.d.ts.map +1 -0
- package/dist/src/chunking/index.js +123 -0
- package/dist/src/chunking/index.js.map +1 -0
- package/dist/src/chunking/strategies/fixed.d.ts +14 -0
- package/dist/src/chunking/strategies/fixed.d.ts.map +1 -0
- package/dist/src/chunking/strategies/fixed.js +111 -0
- package/dist/src/chunking/strategies/fixed.js.map +1 -0
- package/dist/src/chunking/strategies/paragraph.d.ts +6 -0
- package/dist/src/chunking/strategies/paragraph.d.ts.map +1 -0
- package/dist/src/chunking/strategies/paragraph.js +84 -0
- package/dist/src/chunking/strategies/paragraph.js.map +1 -0
- package/dist/src/chunking/strategies/recursive.d.ts +17 -0
- package/dist/src/chunking/strategies/recursive.d.ts.map +1 -0
- package/dist/src/chunking/strategies/recursive.js +192 -0
- package/dist/src/chunking/strategies/recursive.js.map +1 -0
- package/dist/src/chunking/strategies/semantic.d.ts +96 -0
- package/dist/src/chunking/strategies/semantic.d.ts.map +1 -0
- package/dist/src/chunking/strategies/semantic.js +587 -0
- package/dist/src/chunking/strategies/semantic.js.map +1 -0
- package/dist/src/chunking/strategies/sentence.d.ts +7 -0
- package/dist/src/chunking/strategies/sentence.d.ts.map +1 -0
- package/dist/src/chunking/strategies/sentence.js +116 -0
- package/dist/src/chunking/strategies/sentence.js.map +1 -0
- package/dist/src/chunking/types.d.ts +45 -0
- package/dist/src/chunking/types.d.ts.map +1 -0
- package/dist/src/chunking/types.js +4 -0
- package/dist/src/chunking/types.js.map +1 -0
- package/dist/src/chunking/utils/tokenizer.d.ts +10 -0
- package/dist/src/chunking/utils/tokenizer.d.ts.map +1 -0
- package/dist/src/chunking/utils/tokenizer.js +50 -0
- package/dist/src/chunking/utils/tokenizer.js.map +1 -0
- package/dist/src/providers/embeddings/index.d.ts +3 -0
- package/dist/src/providers/embeddings/index.d.ts.map +1 -0
- package/dist/src/providers/embeddings/index.js +7 -0
- package/dist/src/providers/embeddings/index.js.map +1 -0
- package/dist/src/providers/embeddings/openai.d.ts +21 -0
- package/dist/src/providers/embeddings/openai.d.ts.map +1 -0
- package/dist/src/providers/embeddings/openai.js +86 -0
- package/dist/src/providers/embeddings/openai.js.map +1 -0
- package/dist/src/providers/index.d.ts +3 -0
- package/dist/src/providers/index.d.ts.map +1 -0
- package/dist/src/providers/index.js +20 -0
- package/dist/src/providers/index.js.map +1 -0
- package/dist/src/providers/stores/index.d.ts +6 -0
- package/dist/src/providers/stores/index.d.ts.map +1 -0
- package/dist/src/providers/stores/index.js +11 -0
- package/dist/src/providers/stores/index.js.map +1 -0
- package/dist/src/providers/stores/memory.d.ts +18 -0
- package/dist/src/providers/stores/memory.d.ts.map +1 -0
- package/dist/src/providers/stores/memory.js +169 -0
- package/dist/src/providers/stores/memory.js.map +1 -0
- package/dist/src/providers/stores/qdrant.d.ts +28 -0
- package/dist/src/providers/stores/qdrant.d.ts.map +1 -0
- package/dist/src/providers/stores/qdrant.js +223 -0
- package/dist/src/providers/stores/qdrant.js.map +1 -0
- package/dist/src/providers/stores/sqlite.d.ts +38 -0
- package/dist/src/providers/stores/sqlite.d.ts.map +1 -0
- package/dist/src/providers/stores/sqlite.js +306 -0
- package/dist/src/providers/stores/sqlite.js.map +1 -0
- package/dist/src/types.d.ts +111 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +32 -0
- package/dist/src/types.js.map +1 -0
- package/dist/src/vector.d.ts +74 -0
- package/dist/src/vector.d.ts.map +1 -0
- package/dist/src/vector.js +505 -0
- package/dist/src/vector.js.map +1 -0
- package/docs/API.md +361 -0
- package/docs/CHUNKING.md +280 -0
- package/docs/CUSTOM_PROVIDERS.md +101 -0
- package/docs/README.md +11 -0
- package/docs/STORAGE_BACKENDS.md +189 -0
- package/docs/assets/vectar.png +0 -0
- package/package.json +46 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.RecursiveChunkingStrategy = void 0;
|
|
4
|
+
// Recursive text splitting strategy - tries to split on natural boundaries
|
|
5
|
+
const uuid_1 = require("uuid");
|
|
6
|
+
const tokenizer_1 = require("../utils/tokenizer");
|
|
7
|
+
/**
 * Recursive text splitting strategy.
 *
 * Splits on the coarsest separator present in the text (paragraph, line,
 * sentence, clause, phrase, word) and only falls back to finer separators,
 * and finally to a character-level split, for pieces that still exceed the
 * token budget. All sizes are measured in tokens via `countTokens`.
 */
class RecursiveChunkingStrategy {
    /**
     * Identifier of this strategy.
     * @returns {string} Always `'recursive'`.
     */
    getName() {
        return 'recursive';
    }
    /**
     * Split `text` into chunks of at most `maxChunkSize` tokens.
     *
     * @param {string} text - Text to split.
     * @param {string} documentId - Stored on every chunk's metadata.
     * @param {object} [options] - Reads `tokenLimit` (default 8192),
     *   `maxChunkSize` (default 1000, capped at `tokenLimit`), `overlap`
     *   (default 200, capped at 20% of the max size, never negative),
     *   `separator` (string or array of strings; falsy means the default
     *   list) and `metadata` (spread last into each chunk's metadata).
     * @returns {Array<object>} Chunks shaped `{ id, text, metadata }`, where
     *   metadata carries `documentId`, `chunkIndex`, `totalChunks`,
     *   `startChar` and `endChar`.
     */
    chunk(text, documentId, options = {}) {
        const tokenLimit = options.tokenLimit ?? 8192;
        const maxSize = Math.min(options.maxChunkSize ?? 1000, tokenLimit);
        // Overlap may not exceed 20% of maxSize and may not be negative.
        const overlap = Math.max(0, Math.min(options.overlap ?? 200, Math.floor(maxSize * 0.2)));
        // Default separators in order of preference (paragraph > sentence > word > char)
        const defaultSeparators = [
            '\n\n', // Paragraph
            '\n', // Line break
            '. ', // Sentence
            '! ', // Sentence
            '? ', // Sentence
            '; ', // Clause
            ', ', // Phrase
            ' ', // Word
            '', // Character
        ];
        let separators = defaultSeparators;
        if (options.separator) {
            separators = Array.isArray(options.separator) ? options.separator : [options.separator];
        }
        const chunkTexts = this.recursiveSplit(text, maxSize, overlap, separators);
        // Every chunk is a (trimmed) substring of `text`, so its offsets are found
        // by searching instead of subtracting a token count from a character
        // offset, which mixed units and could produce negative positions.
        let previousStart = 0;
        let previousEnd = 0;
        return chunkTexts.map((chunkText, index) => {
            // With overlap a chunk starts inside the previous one, so prefer the
            // latest occurrence that starts at or before the previous chunk's end.
            let startChar = text.lastIndexOf(chunkText, previousEnd);
            if (startChar < previousStart) {
                // No such occurrence (no overlap, or whitespace was trimmed away):
                // take the next occurrence after the previous chunk's start.
                startChar = text.indexOf(chunkText, previousStart);
            }
            if (startChar === -1) {
                // Not expected; keep offsets monotonic rather than failing.
                startChar = previousEnd;
            }
            const endChar = startChar + chunkText.length;
            previousStart = startChar;
            previousEnd = endChar;
            return {
                id: (0, uuid_1.v4)(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex: index,
                    totalChunks: chunkTexts.length,
                    startChar,
                    endChar,
                    ...options.metadata,
                },
            };
        });
    }
    /**
     * Split `text` on the first separator it contains, recursing with the
     * remaining (finer) separators into any merged piece that is still too big.
     *
     * @param {string} text - Text to split.
     * @param {number} maxSize - Maximum chunk size in tokens.
     * @param {number} overlap - Overlap between neighbouring chunks in tokens.
     * @param {string[]} separators - Separators in order of preference; `''`
     *   means character-level splitting.
     * @returns {string[]} Non-empty chunk texts.
     */
    recursiveSplit(text, maxSize, overlap, separators) {
        // Base case: the text already fits.
        if ((0, tokenizer_1.countTokens)(text) <= maxSize) {
            const trimmed = text.trim();
            return trimmed ? [trimmed] : [];
        }
        for (let i = 0; i < separators.length; i++) {
            const separator = separators[i];
            if (separator === '') {
                // Character-level split as last resort
                return this.splitByCharacters(text, maxSize, overlap);
            }
            if (!text.includes(separator)) {
                continue;
            }
            const merged = this.mergeSplits(text.split(separator), separator, maxSize, overlap);
            // Slice by loop position (not indexOf) so duplicate separators in a
            // custom list cannot be retried.
            const remainingSeparators = separators.slice(i + 1);
            const finalChunks = [];
            for (const piece of merged) {
                if ((0, tokenizer_1.countTokens)(piece) > maxSize) {
                    for (const sub of this.recursiveSplit(piece, maxSize, overlap, remainingSeparators)) {
                        finalChunks.push(sub);
                    }
                }
                else {
                    const trimmed = piece.trim();
                    if (trimmed) {
                        finalChunks.push(trimmed);
                    }
                }
            }
            return finalChunks;
        }
        // No listed separator occurs in the text: fall back to character split.
        return this.splitByCharacters(text, maxSize, overlap);
    }
    /**
     * Greedily re-join `splits` (the result of `text.split(separator)`) into
     * chunks of at most `maxSize` tokens, seeding each new chunk with overlap
     * text taken from the end of the previous one.
     *
     * @param {string[]} splits - Pieces produced by splitting on `separator`.
     * @param {string} separator - Re-appended to every piece except the last.
     * @param {number} maxSize - Maximum chunk size in tokens.
     * @param {number} overlap - Overlap in tokens.
     * @returns {string[]} Merged chunks; a chunk may still exceed `maxSize`
     *   (a single oversized piece, or overlap plus piece) and is then split
     *   further by the caller.
     */
    mergeSplits(splits, separator, maxSize, overlap) {
        const chunks = [];
        let currentChunk = '';
        for (let i = 0; i < splits.length; i++) {
            const isLast = i === splits.length - 1;
            const piece = isLast ? splits[i] : splits[i] + separator;
            const combined = currentChunk + piece;
            if ((0, tokenizer_1.countTokens)(combined) <= maxSize) {
                currentChunk = combined;
                continue;
            }
            if (currentChunk) {
                chunks.push(currentChunk);
                // Start the next chunk with the tail of the one just emitted.
                currentChunk = this.getOverlapText(currentChunk, overlap) + piece;
            }
            else {
                // Single piece is larger than maxSize, add it anyway (will be split recursively)
                currentChunk = piece;
            }
        }
        if (currentChunk) {
            chunks.push(currentChunk);
        }
        return chunks;
    }
    /**
     * Get the longest suffix of `text` that is at most `overlapTokens` tokens.
     *
     * @param {string} text - Text to take the suffix from.
     * @param {number} overlapTokens - Token budget for the suffix.
     * @returns {string} The suffix. Returns `''` when no overlap is requested
     *   (budget not greater than 0) or `text` is empty. If no suffix fits the
     *   budget, falls back to the last 10% of the characters.
     */
    getOverlapText(text, overlapTokens) {
        // A zero budget must yield no overlap; the previous `'' || fallback`
        // silently turned it into a 10% overlap.
        if (!text || !(overlapTokens > 0)) {
            return '';
        }
        // Binary search for the smallest start index whose suffix fits.
        let start = 0;
        let end = text.length;
        let bestMatch = '';
        while (start < end) {
            const mid = Math.floor((start + end) / 2);
            const candidate = text.slice(mid);
            if ((0, tokenizer_1.countTokens)(candidate) <= overlapTokens) {
                bestMatch = candidate;
                end = mid;
            }
            else {
                start = mid + 1;
            }
        }
        if (bestMatch) {
            return bestMatch;
        }
        // Fallback to last 10%. Guard the zero case: slice(-0) is slice(0) and
        // would return the whole text.
        const fallbackLength = Math.floor(text.length * 0.1);
        return fallbackLength > 0 ? text.slice(-fallbackLength) : '';
    }
    /**
     * Split `text` at character positions so each chunk is at most `maxSize`
     * tokens, with consecutive chunks sharing about `overlap` tokens.
     *
     * @param {string} text - Text to split.
     * @param {number} maxSize - Maximum chunk size in tokens.
     * @param {number} overlap - Overlap in tokens.
     * @returns {string[]} Chunks in order; the last chunk ends at the end of `text`.
     */
    splitByCharacters(text, maxSize, overlap) {
        const chunks = [];
        let start = 0;
        while (start < text.length) {
            const chunkText = this.getTextUpToTokenLimit(text.slice(start), maxSize);
            if (!chunkText) {
                break;
            }
            chunks.push(chunkText);
            // Stop once the end is reached. Rewinding for overlap here would
            // re-emit the tail over and over and never terminate.
            if (start + chunkText.length >= text.length) {
                break;
            }
            const overlapText = this.getOverlapText(chunkText, overlap);
            const step = chunkText.length - overlapText.length;
            // If the overlap would swallow the whole chunk, drop it for this step
            // so the loop always moves forward.
            start += step > 0 ? step : chunkText.length;
        }
        return chunks;
    }
    /**
     * Get the longest prefix of `text` that is at most `maxTokens` tokens.
     *
     * @param {string} text - Text to take the prefix from.
     * @param {number} maxTokens - Token budget for the prefix.
     * @returns {string} The prefix (all of `text` if it fits). If no non-empty
     *   prefix fits, falls back to the first 80% of the characters and always
     *   at least one character of a non-empty input, so callers make progress.
     */
    getTextUpToTokenLimit(text, maxTokens) {
        if ((0, tokenizer_1.countTokens)(text) <= maxTokens) {
            return text;
        }
        // Binary search for the largest prefix length that fits.
        let start = 0;
        let end = text.length;
        let bestMatch = '';
        while (start < end) {
            const mid = Math.floor((start + end) / 2);
            const candidate = text.slice(0, mid);
            if ((0, tokenizer_1.countTokens)(candidate) <= maxTokens) {
                bestMatch = candidate;
                start = mid + 1;
            }
            else {
                end = mid;
            }
        }
        if (bestMatch) {
            return bestMatch;
        }
        // Fallback to 80%, but never an empty slice of a non-empty text.
        return text.slice(0, Math.max(1, Math.floor(text.length * 0.8)));
    }
}
|
|
191
|
+
exports.RecursiveChunkingStrategy = RecursiveChunkingStrategy;
|
|
192
|
+
//# sourceMappingURL=recursive.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"recursive.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/recursive.ts"],"names":[],"mappings":";;;AAAA,2EAA2E;AAC3E,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,0CAA0C;QAEvH,iFAAiF;QACjF,MAAM,iBAAiB,GAAG;YACxB,MAAM,EAAG,YAAY;YACrB,IAAI,EAAK,aAAa;YACtB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,SAAS;YAClB,IAAI,EAAK,SAAS;YAClB,GAAG,EAAM,OAAO;YAChB,EAAE,EAAO,YAAY;SACtB,CAAC;QAEF,MAAM,UAAU,GAAG,OAAO,CAAC,SAAS;YAClC,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAC9E,CAAC,CAAC,iBAAiB,CAAC;QAEtB,MAAM,MAAM,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;QAEvE,yCAAyC;QACzC,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,MAAM,MAAM,GAAY,EAAE,CAAC;QAE3B,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,EAAE;YAClC,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU,EAAE,KAAK;oBACjB,WAAW,EAAE,MAAM,CAAC,MAAM;oBAC1B,SAAS;oBACT,OAAO;oBACP,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;YAEH,wCAAwC;YACxC,SAAS,GAAG,OAAO,GAAG,OAAO,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,cAAc,CACpB,IAAY,EACZ,OAAe,EAAE,2BAA2B;IAC5C,OAAe,EAAE,2BAA2B;IAC5C,UAAoB;QAEpB,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,4DAA4D;QAC5D,MAAM,UAAU,GAAG,IAAA,uBAAW,EAAC,IAAI,CAAC,CAAC;QACrC,IAAI,UAAU,IAAI,OAAO,EAAE,CAAC;YAC1B,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC1C,CAAC;QAED,8BAA8B;QAC9B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;YACnC,IAAI,SAAS,KAAK,EAAE,EAAE
,CAAC;gBACrB,uCAAuC;gBACvC,OAAO,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;YACxD,CAAC;YAED,IAAI,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBACrC,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;gBAErE,yEAAyE;gBACzE,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;oBAC3B,MAAM,WAAW,GAAG,IAAA,uBAAW,EAAC,KAAK,CAAC,CAAC;oBACvC,IAAI,WAAW,GAAG,OAAO,EAAE,CAAC;wBAC1B,kCAAkC;wBAClC,MAAM,kBAAkB,GAAG,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;wBAC7D,MAAM,mBAAmB,GAAG,UAAU,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;wBACjE,WAAW,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,mBAAmB,CAAC,CAAC,CAAC;oBACzF,CAAC;yBAAM,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;wBACxB,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;oBACjC,CAAC;gBACH,CAAC;gBAED,OAAO,WAAW,CAAC;YACrB,CAAC;QACH,CAAC;QAED,8BAA8B;QAC9B,OAAO,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;IACxD,CAAC;IAEO,WAAW,CACjB,MAAgB,EAChB,SAAiB,EACjB,OAAe,EAAE,2BAA2B;IAC5C,OAAe,CAAC,2BAA2B;;QAE3C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,YAAY,GAAG,EAAE,CAAC;QAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YACxB,MAAM,KAAK,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC;YAChE,MAAM,QAAQ,GAAG,YAAY,GAAG,KAAK,CAAC;YACtC,MAAM,cAAc,GAAG,IAAA,uBAAW,EAAC,QAAQ,CAAC,CAAC;YAE7C,IAAI,cAAc,IAAI,OAAO,EAAE,CAAC;gBAC9B,YAAY,GAAG,QAAQ,CAAC;YAC1B,CAAC;iBAAM,CAAC;gBACN,IAAI,YAAY,EAAE,CAAC;oBACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;oBAC1B,2CAA2C;oBAC3C,8DAA8D;oBAC9D,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;oBAC/D,YAAY,GAAG,WAAW,GAAG,KAAK,CAAC;gBACrC,CAAC;qBAAM,CAAC;oBACN,iFAAiF;oBACjF,YAAY,GAAG,KAAK,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,YAAY,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAY,EAAE,aAAqB;QACxD,6CAA6C;QAC7C,IAAI,KAAK,GAAG,CAA
C,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,aAAa,EAAE,CAAC;gBAC5B,SAAS,GAAG,SAAS,CAAC;gBACtB,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,uBAAuB;IACzF,CAAC;IAEO,iBAAiB,CAAC,IAAY,EAAE,OAAe,EAAE,OAAe;QACtE,wEAAwE;QACxE,wEAAwE;QACxE,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,CAAC;YACzE,IAAI,CAAC,SAAS;gBAAE,MAAM;YAEtB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACvB,MAAM,WAAW,GAAG,SAAS,CAAC,MAAM,CAAC;YAErC,mCAAmC;YACnC,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAC5D,KAAK,IAAI,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC;YAE1C,IAAI,IAAI,CAAC,MAAM,GAAG,KAAK,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC;gBAC7C,MAAM;YACR,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,qBAAqB,CAAC,IAAY,EAAE,SAAiB;QAC3D,IAAI,IAAA,uBAAW,EAAC,IAAI,CAAC,IAAI,SAAS,EAAE,CAAC;YACnC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC;gBACtB,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,C
AAC,CAAC,kBAAkB;IACtF,CAAC;CACF;AA3ND,8DA2NC"}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import type { Chunk, ChunkingOptions, ChunkingStrategy } from '../types';
|
|
2
|
+
import type { EmbeddingProvider } from '../../types';
|
|
3
|
+
/**
 * Header data for a chunk. This is the type a `headerGenerator` callback
 * (see `SemanticChunkingOptions`) resolves to. Every field is optional.
 */
interface ChunkHeader {
    /** Optional summary string. */
    summary?: string;
    /** Optional list of fact strings. */
    facts?: Array<string>;
    /** Optional list of reference strings. */
    refs?: Array<string>;
}
|
|
8
|
+
/**
 * Options accepted by the semantic chunking strategy, in addition to the base
 * `ChunkingOptions`. Every field is optional.
 *
 * NOTE(review): this declaration shows only names and types. The per-field
 * notes marked "presumably" are inferred from names and from the doc comments
 * on `SemanticChunkingStrategy`; confirm them against the implementation.
 */
export interface SemanticChunkingOptions extends ChunkingOptions {
    /** Embedding provider instance (type imported from `../../types`). */
    embeddingProvider?: EmbeddingProvider;
    /** Presumably the soft size limit used while merging — units to be confirmed. */
    softLimit?: number;
    /** Presumably the hard size limit used while merging — units to be confirmed. */
    hardLimit?: number;
    /** Presumably a cut-off applied to a similarity score — range to be confirmed. */
    similarityThreshold?: number;
    /** Kind of input: `'conversation'` or `'text'`. */
    contentType?: 'conversation' | 'text';
    /** Presumably the share of context carried between chunks — scale to be confirmed. */
    contextOverlapPercent?: number;
    /** Presumably toggles the smart-overlap step. */
    smartOverlap?: boolean;
    /** Presumably the window size for the topic-volatility calculation. */
    volatilityWindow?: number;
    /** Presumably toggles header generation for chunks. */
    generateHeaders?: boolean;
    /** Async callback that receives a text and resolves to a `ChunkHeader`. */
    headerGenerator?: (text: string) => Promise<ChunkHeader>;
    /** Presumably toggles the noise-stripping step. */
    stripNoise?: boolean;
    /** Regular expressions; presumably the patterns treated as noise. */
    noisePatterns?: Array<RegExp>;
    /** Presumably toggles adding role markers to the text. */
    addRoleMarkers?: boolean;
}
|
|
23
|
+
/**
 * Declaration of the semantic chunking strategy. Only `getName` and `chunk`
 * are public; the private members are listed without signatures.
 */
export declare class SemanticChunkingStrategy implements ChunkingStrategy {
    /** Returns the strategy name as a string. */
    getName(): string;
    /** Splits `text` belonging to `documentId` into chunks using `options`. */
    chunk(text: string, documentId: string, options: ChunkingOptions): Array<Chunk>;
    /** Auto-detects whether the text is conversational, based on role markers. */
    private detectContentType;
    /** Pre-segments the text into atomic units (turns, paragraphs, sentences). */
    private preSegment;
    /** Segments a conversation into turns (user/agent/tool). */
    private segmentConversation;
    /** Segments plain text into paragraphs and sentences. */
    private segmentText;
    /** Semantic merging with soft/hard limits. */
    private semanticMerge;
    /** Finalizes a chunk from atoms. */
    private finalizeChunk;
    /** Calculates topic volatility (simplified; uses lexical changes). */
    private calculateVolatility;
    /** Applies smart overlap between chunks. */
    private applySmartOverlap;
    /** Adds role markers to text. */
    private addRoleMarkers;
    /** Generates a simple header (synchronous version). */
    private generateSimpleHeader;
    /** Extracts facts from text (dates, numbers, key entities). */
    private extractFacts;
    /** Strips noise from text. */
    private stripNoise;
    /** Computes lexical similarity (Jaccard similarity on words). */
    private computeLexicalSimilarity;
    /** Splits text into sentences. */
    private splitIntoSentences;
    /** Estimates the token count using the tokenizer. */
    private estimateTokens;
    /** Splits an oversized atom (turn, paragraph, or sentence). */
    private splitOversizedAtom;
    /** Splits an oversized chunk using simple recursive splitting. */
    private splitOversizedChunk;
}
|
|
95
|
+
export {};
|
|
96
|
+
//# sourceMappingURL=semantic.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semantic.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/semantic.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AACzE,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAYrD,UAAU,WAAW;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;CACjB;AASD,MAAM,WAAW,uBAAwB,SAAQ,eAAe;IAE9D,iBAAiB,CAAC,EAAE,iBAAiB,CAAC;IAGtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IAGnB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAG7B,WAAW,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;IAGtC,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAC/B,YAAY,CAAC,EAAE,OAAO,CAAC;IAGvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAG1B,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,eAAe,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,CAAC;IAGzD,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IAGzB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED,qBAAa,wBAAyB,YAAW,gBAAgB;IAC/D,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IAmI1E;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAsBzB;;OAEG;IACH,OAAO,CAAC,UAAU;IAalB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAwE3B;;OAEG;IACH,OAAO,CAAC,WAAW;IAuCnB;;OAEG;IACH,OAAO,CAAC,aAAa;IA6GrB;;OAEG;IACH,OAAO,CAAC,aAAa;IAqBrB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAY3B;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAuCzB;;OAEG;IACH,OAAO,CAAC,cAAc;IAWtB;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAW5B;;OAEG;IACH,OAAO,CAAC,YAAY;IAqBpB;;OAEG;IACH,OAAO,CAAC,UAAU;IAoBlB;;OAEG;IACH,OAAO,CAAC,wBAAwB;IAwBhC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAS1B;;OAEG;IACH,OAAO,CAAC,cAAc;IAItB;;OAEG;IACH,OAAO,CAAC,kBAAkB;IA0C1B;;OAEG;IACH,OAAO,CAAC,mBAAmB;CAoD5B"}
|