@sparkleideas/embeddings 3.0.0-alpha.17 → 3.0.0-alpha.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +308 -17
- package/package.json +19 -7
- package/src/chunking.ts +351 -0
- package/src/embedding-service.ts +477 -5
- package/src/hyperbolic.ts +458 -0
- package/src/index.ts +77 -0
- package/src/neural-integration.ts +295 -0
- package/src/normalization.ts +267 -0
- package/src/persistent-cache.ts +410 -0
- package/src/types.ts +61 -2
- package/dist/__tests__/embedding-service.test.d.ts +0 -2
- package/dist/__tests__/embedding-service.test.d.ts.map +0 -1
- package/dist/__tests__/embedding-service.test.js +0 -98
- package/dist/__tests__/embedding-service.test.js.map +0 -1
- package/dist/embedding-service.d.ts +0 -113
- package/dist/embedding-service.d.ts.map +0 -1
- package/dist/embedding-service.js +0 -543
- package/dist/embedding-service.js.map +0 -1
- package/dist/index.d.ts +0 -15
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js +0 -15
- package/dist/index.js.map +0 -1
- package/dist/types.d.ts +0 -178
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js +0 -15
- package/dist/types.js.map +0 -1
package/src/chunking.ts
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Chunking Utilities
|
|
3
|
+
*
|
|
4
|
+
* Features:
|
|
5
|
+
* - Configurable chunk size and overlap
|
|
6
|
+
* - Sentence-aware splitting
|
|
7
|
+
* - Paragraph-aware splitting
|
|
8
|
+
* - Token-based chunking (approximate)
|
|
9
|
+
* - Metadata tracking for reconstruction
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Options accepted by `chunkText`.
 *
 * Every field is optional; `chunkText` substitutes the default shown on each
 * field when it is omitted.
 */
export interface ChunkingConfig {
  /**
   * Maximum chunk size in characters (default: 512).
   * Under the `'token'` strategy the value is multiplied by 4 before it is
   * handed to the sentence chunker, i.e. it is then read as a token count.
   */
  maxChunkSize?: number;

  /**
   * Overlap between consecutive chunks in characters (default: 50).
   * Scaled by 4 under the `'token'` strategy, like `maxChunkSize`.
   */
  overlap?: number;

  /** Splitting strategy (default: `'sentence'`). */
  strategy?:
    | 'character'
    | 'sentence'
    | 'paragraph'
    | 'token';

  /**
   * Minimum chunk size (default: 100). The sentence and paragraph chunkers
   * do not close a chunk that is still shorter than this.
   */
  minChunkSize?: number;

  /**
   * Include metadata with chunks (default: true).
   * NOTE(review): no chunker visible in this file reads this flag - confirm
   * whether a caller elsewhere depends on it.
   */
  includeMetadata?: boolean;
}
|
|
27
|
+
|
|
28
|
+
/**
 * A single piece of a chunked document together with its bookkeeping data.
 */
export interface Chunk {
  /** The text carried by this chunk. */
  text: string;

  /** Zero-based ordinal of the chunk within the chunked document. */
  index: number;

  /** Offset at which the chunk starts in the text that was chunked. */
  startPos: number;

  /** Offset at which the chunk ends in the text that was chunked. */
  endPos: number;

  /** Number of characters recorded for the chunk. */
  length: number;

  /** Rough token estimate: character count divided by 4, rounded up. */
  tokenCount: number;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Result returned by `chunkText`: the chunks plus summary information about
 * the run that produced them.
 */
export interface ChunkedDocument {
  /** The chunks, in the order they were produced. */
  chunks: Chunk[];

  /** Length (in characters) of the text as it was passed in. */
  originalLength: number;

  /** Number of entries in `chunks`. */
  totalChunks: number;

  /** The effective configuration, with every default filled in. */
  config: Required<ChunkingConfig>;
}
|
|
59
|
+
|
|
60
|
+
// Sentence boundary: whitespace that follows '.', '!' or '?' and precedes an
// uppercase letter. `\p{Lu}` (with the `u` flag) covers uppercase letters of
// every script, not just ASCII A-Z, so non-English text is split as well.
const SENTENCE_ENDINGS = /(?<=[.!?])\s+(?=\p{Lu})/gu;
// Paragraph break: a line break followed by one or more blank lines. Accepts
// LF and CRLF line endings, and treats lines holding only spaces/tabs as blank.
const PARAGRAPH_BREAKS = /\r?\n(?:[ \t]*\r?\n)+/g;
|
|
63
|
+
|
|
64
|
+
/**
 * Split text into chunks with overlap.
 *
 * Fills in defaults for every omitted option, then delegates to the chunker
 * selected by `strategy`. The 'character', 'sentence' and 'token' chunkers
 * receive a whitespace-collapsed, trimmed copy of `text`; the 'paragraph'
 * chunker receives `text` untouched so that blank lines survive. An
 * unrecognised strategy falls back to sentence chunking.
 *
 * @param text - The document to split.
 * @param config - Optional chunking options.
 * @returns The chunks, the length of `text` as passed in, the chunk count and
 *   the fully-defaulted configuration.
 */
export function chunkText(
  text: string,
  config: ChunkingConfig = {}
): ChunkedDocument {
  const settings: Required<ChunkingConfig> = {
    maxChunkSize: config.maxChunkSize ?? 512,
    overlap: config.overlap ?? 50,
    strategy: config.strategy ?? 'sentence',
    minChunkSize: config.minChunkSize ?? 100,
    includeMetadata: config.includeMetadata ?? true,
  };

  // Collapse every run of whitespace to one space and strip both ends.
  const collapsed = text.replace(/\s+/g, ' ').trim();

  const runChunker = (): Chunk[] => {
    if (settings.strategy === 'character') {
      return chunkByCharacter(collapsed, settings);
    }
    if (settings.strategy === 'paragraph') {
      // Paragraph detection needs the original line breaks.
      return chunkByParagraph(text, settings);
    }
    if (settings.strategy === 'token') {
      return chunkByToken(collapsed, settings);
    }
    // 'sentence' and anything unrecognised.
    return chunkBySentence(collapsed, settings);
  };

  const chunks = runChunker();

  return {
    chunks,
    originalLength: text.length,
    totalChunks: chunks.length,
    config: settings,
  };
}
|
|
108
|
+
|
|
109
|
+
/**
 * Simple character-based chunking with overlap.
 *
 * Cuts `text` into windows of `maxChunkSize` characters; each window starts
 * `maxChunkSize - overlap` characters after the previous one. The last window
 * is the one that reaches the end of the text.
 *
 * The stride is forced to be at least one character, so a configuration with
 * `overlap >= maxChunkSize` (or a non-positive / NaN `maxChunkSize`) still
 * terminates instead of looping forever on the same position.
 *
 * @param text - Text to split.
 * @param config - Fully-defaulted chunking options; `maxChunkSize` and
 *   `overlap` are read.
 * @returns Chunks whose `startPos`/`endPos` are exact offsets into `text`.
 */
function chunkByCharacter(
  text: string,
  config: Required<ChunkingConfig>
): Chunk[] {
  const chunks: Chunk[] = [];

  // Window size: whole number, never below 1 (NaN and 0 both become 1).
  const size = Math.max(1, Math.floor(config.maxChunkSize) || 1);

  // Distance between window starts. `>= 1` is false for NaN, so a NaN or an
  // oversized overlap degrades to a one-character stride.
  const rawStride = size - config.overlap;
  const stride = rawStride >= 1 ? rawStride : 1;

  for (let start = 0; start < text.length; start += stride) {
    const end = Math.min(start + size, text.length);
    const piece = text.slice(start, end);

    chunks.push({
      text: piece,
      index: chunks.length,
      startPos: start,
      endPos: end,
      length: piece.length,
      tokenCount: Math.ceil(piece.length / 4),
    });

    // Once a window touches the end, further windows would only repeat the tail.
    if (end >= text.length) {
      break;
    }
  }

  return chunks;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Sentence-aware chunking - keeps sentences intact.
 *
 * Sentences are accumulated into a buffer. When the next sentence would push
 * the buffer past `maxChunkSize` and the buffer already holds at least
 * `minChunkSize` characters, the buffer is emitted and a new one is started
 * from the last `overlap` characters of the old buffer plus the new sentence.
 * A single sentence longer than `maxChunkSize` is never split.
 *
 * Guarantees of this implementation:
 * - `overlap <= 0` (or NaN) means no carried-over text. (`slice(-0)` would
 *   otherwise copy the entire previous chunk.)
 * - Empty chunks are never emitted.
 * - `length` and `tokenCount` describe the emitted (trimmed) `text`.
 * - `startPos`/`endPos` satisfy `endPos - startPos === length`; they are exact
 *   offsets into `text` when sentences are separated by a single whitespace
 *   character and are approximate otherwise.
 *
 * @param text - Text to split.
 * @param config - Fully-defaulted chunking options; `maxChunkSize`, `overlap`
 *   and `minChunkSize` are read.
 * @returns The chunks in document order.
 */
function chunkBySentence(
  text: string,
  config: Required<ChunkingConfig>
): Chunk[] {
  const { maxChunkSize, overlap, minChunkSize } = config;

  const sentences = text
    .split(SENTENCE_ENDINGS)
    .map(s => s.trim())
    .filter(s => s.length > 0);

  const chunks: Chunk[] = [];
  let current = '';
  // Offset in `text` at which `current` begins.
  let currentStart = 0;
  // Offset in `text` at which the sentence being processed begins.
  let textPos = 0;

  // Push the buffer as a chunk (skipping whitespace-only buffers).
  const emit = (): void => {
    const body = current.trim();
    if (body.length === 0) {
      return;
    }
    // The carried-over overlap may begin with a space that trim() removed.
    const leading = current.length - current.trimStart().length;
    const startPos = currentStart + leading;
    chunks.push({
      text: body,
      index: chunks.length,
      startPos,
      endPos: startPos + body.length,
      length: body.length,
      tokenCount: Math.ceil(body.length / 4),
    });
  };

  for (const sentence of sentences) {
    const overflows = current.length + sentence.length > maxChunkSize;

    if (overflows && current.length >= minChunkSize) {
      emit();

      // Tail of the finished buffer that is repeated at the head of the next.
      const carry = overlap > 0 ? current.slice(-overlap) : '';
      if (carry.length > 0) {
        current = carry + ' ' + sentence;
        // The carry ends right before the separator preceding this sentence.
        currentStart = textPos - 1 - carry.length;
      } else {
        current = sentence;
        currentStart = textPos;
      }
    } else {
      current += (current.length > 0 ? ' ' : '') + sentence;
    }

    // +1 accounts for the separator between sentences.
    textPos += sentence.length + 1;
  }

  // Whatever is left in the buffer forms the final chunk.
  emit();

  return chunks;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Paragraph-aware chunking.
 *
 * Paragraphs (pieces separated by `PARAGRAPH_BREAKS`, trimmed, empties
 * dropped) are packed into chunks joined by a blank line. A chunk is closed
 * when the next paragraph would push it past `maxChunkSize` and it already
 * holds at least `minChunkSize` characters. A paragraph that is by itself
 * longer than `maxChunkSize` is handed to `chunkBySentence`; its sub-chunks
 * are re-indexed and their offsets shifted by the paragraph's start offset.
 *
 * `startPos` is the offset in `text` of the first paragraph of the chunk and
 * `endPos` the offset just past its last paragraph. Both are located in
 * `text` itself, so they stay correct when paragraphs are separated by more
 * than two line breaks or surrounded by extra whitespace.
 *
 * @param text - Text to split; line breaks must be intact.
 * @param config - Fully-defaulted chunking options; `maxChunkSize` and
 *   `minChunkSize` are read here, the whole object is forwarded for oversized
 *   paragraphs.
 * @returns The chunks in document order.
 */
function chunkByParagraph(
  text: string,
  config: Required<ChunkingConfig>
): Chunk[] {
  const { maxChunkSize, minChunkSize } = config;

  const paragraphs = text
    .split(PARAGRAPH_BREAKS)
    .map(p => p.trim())
    .filter(p => p.length > 0);

  const chunks: Chunk[] = [];
  let pending = '';
  let pendingStart = 0;
  let pendingEnd = 0;
  // Offset just past the previously located paragraph.
  let cursor = 0;

  // Emit the paragraphs gathered so far, if any.
  const flush = (): void => {
    if (pending.length === 0) {
      return;
    }
    chunks.push({
      text: pending,
      index: chunks.length,
      startPos: pendingStart,
      endPos: pendingEnd,
      length: pending.length,
      tokenCount: Math.ceil(pending.length / 4),
    });
    pending = '';
  };

  for (const paragraph of paragraphs) {
    // Only whitespace lies between `cursor` and this paragraph, and the
    // paragraph starts with a non-whitespace character, so the first hit at or
    // after `cursor` is the paragraph itself.
    const found = text.indexOf(paragraph, cursor);
    const paraStart = found >= 0 ? found : cursor;
    const paraEnd = paraStart + paragraph.length;
    cursor = paraEnd;

    if (paragraph.length > maxChunkSize) {
      // Too large for any chunk: close what we have and split by sentence.
      flush();
      for (const sub of chunkBySentence(paragraph, config)) {
        chunks.push({
          ...sub,
          index: chunks.length,
          // Sub-chunk offsets are relative to the paragraph.
          startPos: paraStart + Math.max(0, sub.startPos),
          endPos: paraStart + sub.endPos,
        });
      }
      continue;
    }

    if (pending.length + paragraph.length > maxChunkSize && pending.length >= minChunkSize) {
      flush();
    }

    if (pending.length === 0) {
      pending = paragraph;
      pendingStart = paraStart;
    } else {
      pending += '\n\n' + paragraph;
    }
    pendingEnd = paraEnd;
  }

  flush();

  return chunks;
}
|
|
288
|
+
|
|
289
|
+
/**
 * Token-based chunking (approximate).
 *
 * Interprets `maxChunkSize`, `overlap` and `minChunkSize` as token counts,
 * converts each to characters at 4 characters per token, and delegates to the
 * sentence chunker with the converted limits. All other options are passed
 * through unchanged.
 *
 * @param text - Text to split.
 * @param config - Fully-defaulted chunking options, sizes expressed in tokens.
 * @returns Whatever `chunkBySentence` produces for the converted limits.
 */
function chunkByToken(
  text: string,
  config: Required<ChunkingConfig>
): Chunk[] {
  // Rough estimate: one token is about four characters.
  const CHARS_PER_TOKEN = 4;
  const toChars = (tokens: number): number => tokens * CHARS_PER_TOKEN;

  const { maxChunkSize, overlap, minChunkSize, ...passthrough } = config;

  const characterLimits: Required<ChunkingConfig> = {
    ...passthrough,
    maxChunkSize: toChars(maxChunkSize),
    overlap: toChars(overlap),
    minChunkSize: toChars(minChunkSize),
  };

  return chunkBySentence(text, characterLimits);
}
|
|
307
|
+
|
|
308
|
+
/**
 * Estimate how many tokens a piece of text occupies.
 *
 * Uses the heuristic of roughly four characters per token and rounds up, so
 * any non-empty text counts as at least one token.
 *
 * @param text - Text to measure.
 * @returns The estimated token count (0 for an empty string).
 */
export function estimateTokens(text: string): number {
  const CHARS_PER_TOKEN = 4;
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
|
|
315
|
+
|
|
316
|
+
/**
 * Reconstruct original text from chunks (approximate).
 *
 * Chunks are ordered by `index` and stitched together. For each neighbouring
 * pair the longest suffix of the earlier chunk that is also a prefix of the
 * later chunk is treated as their overlap and written only once:
 * - when an overlap is found, the remainder of the later chunk is appended
 *   directly, so text that was cut mid-word is rejoined without a stray space;
 * - when no overlap is found, the chunks are joined with a single space.
 *
 * The overlap search covers the full length of the shorter chunk, so overlaps
 * longer than 100 characters (e.g. the token strategy's default of 200) are
 * detected as well. Whitespace runs in the multi-chunk result are collapsed to
 * single spaces; a lone chunk is returned as is. The result is a best effort:
 * a coincidental suffix/prefix match is indistinguishable from a real overlap.
 *
 * @param chunks - Chunks of one document, in any order. Not mutated.
 * @returns The stitched text, or '' when `chunks` is empty.
 */
export function reconstructFromChunks(chunks: Chunk[]): string {
  if (chunks.length === 0) return '';
  if (chunks.length === 1) return chunks[0].text;

  // Copy before sorting so the caller's array keeps its order.
  const ordered = [...chunks].sort((a, b) => a.index - b.index);

  let result = ordered[0].text;

  for (let i = 1; i < ordered.length; i++) {
    const previous = ordered[i - 1].text;
    const current = ordered[i].text;

    const shared = sharedBoundaryLength(previous, current);
    result += shared > 0 ? current.slice(shared) : ' ' + current;
  }

  return result.replace(/\s+/g, ' ').trim();
}

/** Length of the longest suffix of `previous` that is also a prefix of `next`. */
function sharedBoundaryLength(previous: string, next: string): number {
  const limit = Math.min(previous.length, next.length);
  for (let len = limit; len > 0; len--) {
    if (next.startsWith(previous.slice(-len))) {
      return len;
    }
  }
  return 0;
}
|