@sparkleideas/embeddings 3.0.0-alpha.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ /**
2
+ * Tests for EmbeddingService
3
+ */
4
+ import { describe, it, expect, beforeEach } from 'vitest';
5
+ import {
6
+ createEmbeddingService,
7
+ MockEmbeddingService,
8
+ cosineSimilarity,
9
+ euclideanDistance,
10
+ dotProduct,
11
+ computeSimilarity,
12
+ } from '../index.js';
13
+
14
+ describe('EmbeddingService', () => {
15
+ describe('MockEmbeddingService', () => {
16
+ let service: MockEmbeddingService;
17
+
18
+ beforeEach(() => {
19
+ service = new MockEmbeddingService({ provider: 'mock', dimensions: 128 });
20
+ });
21
+
22
+ it('should generate embeddings with correct dimensions', async () => {
23
+ const result = await service.embed('Hello, world!');
24
+ expect(result.embedding).toHaveLength(128);
25
+ });
26
+
27
+ it('should generate deterministic embeddings for same text', async () => {
28
+ const result1 = await service.embed('test text');
29
+ const result2 = await service.embed('test text');
30
+
31
+ // Mock service should be deterministic
32
+ expect(Array.from(result1.embedding)).toEqual(Array.from(result2.embedding));
33
+ });
34
+
35
+ it('should handle batch embeddings', async () => {
36
+ const texts = ['text1', 'text2', 'text3'];
37
+ const results = await service.embedBatch(texts);
38
+
39
+ expect(results.embeddings).toHaveLength(3);
40
+ // Each embedding should have correct dimensions
41
+ results.embeddings.forEach((emb) => {
42
+ expect(emb.length).toBe(128);
43
+ });
44
+ });
45
+ });
46
+
47
+ describe('createEmbeddingService', () => {
48
+ it('should create mock service', () => {
49
+ const service = createEmbeddingService({
50
+ provider: 'mock',
51
+ dimensions: 64,
52
+ });
53
+
54
+ expect(service).toBeInstanceOf(MockEmbeddingService);
55
+ });
56
+ });
57
+ });
58
+
59
+ describe('Similarity Functions', () => {
60
+ const vec1 = new Float32Array([1, 0, 0]);
61
+ const vec2 = new Float32Array([1, 0, 0]);
62
+ const vec3 = new Float32Array([0, 1, 0]);
63
+ const vec4 = new Float32Array([-1, 0, 0]);
64
+
65
+ describe('cosineSimilarity', () => {
66
+ it('should return 1 for identical vectors', () => {
67
+ expect(cosineSimilarity(vec1, vec2)).toBeCloseTo(1);
68
+ });
69
+
70
+ it('should return 0 for orthogonal vectors', () => {
71
+ expect(cosineSimilarity(vec1, vec3)).toBeCloseTo(0);
72
+ });
73
+
74
+ it('should return -1 for opposite vectors', () => {
75
+ expect(cosineSimilarity(vec1, vec4)).toBeCloseTo(-1);
76
+ });
77
+ });
78
+
79
+ describe('euclideanDistance', () => {
80
+ it('should return 0 for identical vectors', () => {
81
+ expect(euclideanDistance(vec1, vec2)).toBeCloseTo(0);
82
+ });
83
+
84
+ it('should return sqrt(2) for unit orthogonal vectors', () => {
85
+ expect(euclideanDistance(vec1, vec3)).toBeCloseTo(Math.sqrt(2));
86
+ });
87
+
88
+ it('should return 2 for opposite unit vectors', () => {
89
+ expect(euclideanDistance(vec1, vec4)).toBeCloseTo(2);
90
+ });
91
+ });
92
+
93
+ describe('dotProduct', () => {
94
+ it('should return 1 for identical unit vectors', () => {
95
+ expect(dotProduct(vec1, vec2)).toBeCloseTo(1);
96
+ });
97
+
98
+ it('should return 0 for orthogonal vectors', () => {
99
+ expect(dotProduct(vec1, vec3)).toBeCloseTo(0);
100
+ });
101
+
102
+ it('should return -1 for opposite unit vectors', () => {
103
+ expect(dotProduct(vec1, vec4)).toBeCloseTo(-1);
104
+ });
105
+ });
106
+
107
+ describe('computeSimilarity', () => {
108
+ it('should use cosine metric by default', () => {
109
+ const result = computeSimilarity(vec1, vec2);
110
+ expect(result.metric).toBe('cosine');
111
+ expect(result.score).toBeCloseTo(1);
112
+ });
113
+
114
+ it('should support euclidean metric', () => {
115
+ const result = computeSimilarity(vec1, vec3, 'euclidean');
116
+ expect(result.metric).toBe('euclidean');
117
+ expect(result.score).toBeCloseTo(Math.sqrt(2));
118
+ });
119
+
120
+ it('should support dot product metric', () => {
121
+ const result = computeSimilarity(vec1, vec4, 'dot');
122
+ expect(result.metric).toBe('dot');
123
+ expect(result.score).toBeCloseTo(-1);
124
+ });
125
+ });
126
+ });
@@ -0,0 +1,351 @@
1
+ /**
2
+ * Document Chunking Utilities
3
+ *
4
+ * Features:
5
+ * - Configurable chunk size and overlap
6
+ * - Sentence-aware splitting
7
+ * - Paragraph-aware splitting
8
+ * - Token-based chunking (approximate)
9
+ * - Metadata tracking for reconstruction
10
+ */
11
+
12
/**
 * Chunking configuration.
 *
 * All fields are optional; `chunkText` resolves each one to the default
 * noted below before dispatching to a chunking strategy.
 */
export interface ChunkingConfig {
  /** Maximum chunk size in characters (default: 512) */
  maxChunkSize?: number;
  /** Overlap between chunks in characters (default: 50) */
  overlap?: number;
  /**
   * Strategy for splitting (default: 'sentence').
   * For 'token', the size/overlap limits are interpreted as token
   * counts and converted to characters at ~4 chars per token.
   */
  strategy?: 'character' | 'sentence' | 'paragraph' | 'token';
  /** Minimum chunk size in characters (default: 100) */
  minChunkSize?: number;
  /**
   * Include metadata with chunks.
   * NOTE(review): this flag is resolved but never consulted by any of
   * the chunkers in this module — chunks always carry metadata. Confirm
   * whether it is still needed.
   */
  includeMetadata?: boolean;
}
27
+
28
/**
 * A single chunk of a larger document, with positional metadata intended
 * to locate it in (and approximately reconstruct) the original text.
 */
export interface Chunk {
  /** Chunk text content */
  text: string;
  /** Zero-based ordinal of this chunk within the document */
  index: number;
  /**
   * Start position in original text.
   * Exact for the 'character' strategy; approximate (derived from summed
   * segment lengths) for sentence/paragraph strategies.
   */
  startPos: number;
  /** End position in original text (exclusive; approximate, as above) */
  endPos: number;
  /** Character count */
  length: number;
  /** Approximate token count (chars / 4) */
  tokenCount: number;
}
45
+
46
/**
 * Result of chunking a document: the chunks themselves plus bookkeeping
 * describing how the input was processed.
 */
export interface ChunkedDocument {
  /** Array of chunks, in document order */
  chunks: Chunk[];
  /** Original (pre-normalization) text length in characters */
  originalLength: number;
  /** Total chunks created (equals chunks.length) */
  totalChunks: number;
  /** Fully-resolved configuration used to produce the chunks */
  config: Required<ChunkingConfig>;
}
59
+
60
// Sentence boundary patterns.
// SENTENCE_ENDINGS matches the whitespace that follows '.', '!' or '?'
// and precedes a capital letter; the lookbehind/lookahead keep the
// punctuation and the capital with their own sentences when splitting.
// NOTE(review): only ASCII [A-Z] openers are recognized — abbreviations
// ("e.g. Foo") and non-Latin scripts will not split as intended; confirm
// this is acceptable for the target corpus.
const SENTENCE_ENDINGS = /(?<=[.!?])\s+(?=[A-Z])/g;
// Two or more consecutive newlines mark a paragraph break.
const PARAGRAPH_BREAKS = /\n\n+/g;
63
+
64
+ /**
65
+ * Split text into chunks with overlap
66
+ */
67
+ export function chunkText(
68
+ text: string,
69
+ config: ChunkingConfig = {}
70
+ ): ChunkedDocument {
71
+ const finalConfig: Required<ChunkingConfig> = {
72
+ maxChunkSize: config.maxChunkSize ?? 512,
73
+ overlap: config.overlap ?? 50,
74
+ strategy: config.strategy ?? 'sentence',
75
+ minChunkSize: config.minChunkSize ?? 100,
76
+ includeMetadata: config.includeMetadata ?? true,
77
+ };
78
+
79
+ // Normalize whitespace
80
+ const normalizedText = text.replace(/\s+/g, ' ').trim();
81
+
82
+ let chunks: Chunk[];
83
+
84
+ switch (finalConfig.strategy) {
85
+ case 'character':
86
+ chunks = chunkByCharacter(normalizedText, finalConfig);
87
+ break;
88
+ case 'sentence':
89
+ chunks = chunkBySentence(normalizedText, finalConfig);
90
+ break;
91
+ case 'paragraph':
92
+ chunks = chunkByParagraph(text, finalConfig); // Keep original for paragraphs
93
+ break;
94
+ case 'token':
95
+ chunks = chunkByToken(normalizedText, finalConfig);
96
+ break;
97
+ default:
98
+ chunks = chunkBySentence(normalizedText, finalConfig);
99
+ }
100
+
101
+ return {
102
+ chunks,
103
+ originalLength: text.length,
104
+ totalChunks: chunks.length,
105
+ config: finalConfig,
106
+ };
107
+ }
108
+
109
+ /**
110
+ * Simple character-based chunking with overlap
111
+ */
112
+ function chunkByCharacter(
113
+ text: string,
114
+ config: Required<ChunkingConfig>
115
+ ): Chunk[] {
116
+ const chunks: Chunk[] = [];
117
+ const { maxChunkSize, overlap } = config;
118
+
119
+ let pos = 0;
120
+ let index = 0;
121
+
122
+ while (pos < text.length) {
123
+ const endPos = Math.min(pos + maxChunkSize, text.length);
124
+ const chunkText = text.slice(pos, endPos);
125
+
126
+ chunks.push({
127
+ text: chunkText,
128
+ index,
129
+ startPos: pos,
130
+ endPos,
131
+ length: chunkText.length,
132
+ tokenCount: Math.ceil(chunkText.length / 4),
133
+ });
134
+
135
+ // Move position with overlap
136
+ pos = endPos - overlap;
137
+ if (pos >= text.length - overlap) {
138
+ break;
139
+ }
140
+ index++;
141
+ }
142
+
143
+ return chunks;
144
+ }
145
+
146
+ /**
147
+ * Sentence-aware chunking - keeps sentences intact
148
+ */
149
+ function chunkBySentence(
150
+ text: string,
151
+ config: Required<ChunkingConfig>
152
+ ): Chunk[] {
153
+ const { maxChunkSize, overlap, minChunkSize } = config;
154
+
155
+ // Split into sentences
156
+ const sentences = text.split(SENTENCE_ENDINGS).filter(s => s.trim().length > 0);
157
+
158
+ const chunks: Chunk[] = [];
159
+ let currentChunk = '';
160
+ let currentStart = 0;
161
+ let index = 0;
162
+ let textPos = 0;
163
+
164
+ for (const sentence of sentences) {
165
+ const trimmedSentence = sentence.trim();
166
+
167
+ // If adding this sentence exceeds max size, save current chunk
168
+ if (currentChunk.length + trimmedSentence.length > maxChunkSize && currentChunk.length >= minChunkSize) {
169
+ chunks.push({
170
+ text: currentChunk.trim(),
171
+ index,
172
+ startPos: currentStart,
173
+ endPos: textPos,
174
+ length: currentChunk.length,
175
+ tokenCount: Math.ceil(currentChunk.length / 4),
176
+ });
177
+
178
+ // Start new chunk with overlap (last part of previous chunk)
179
+ const overlapText = currentChunk.slice(-overlap);
180
+ currentChunk = overlapText + ' ' + trimmedSentence;
181
+ currentStart = textPos - overlap;
182
+ index++;
183
+ } else {
184
+ currentChunk += (currentChunk.length > 0 ? ' ' : '') + trimmedSentence;
185
+ }
186
+
187
+ textPos += trimmedSentence.length + 1;
188
+ }
189
+
190
+ // Add final chunk
191
+ if (currentChunk.trim().length > 0) {
192
+ chunks.push({
193
+ text: currentChunk.trim(),
194
+ index,
195
+ startPos: currentStart,
196
+ endPos: text.length,
197
+ length: currentChunk.length,
198
+ tokenCount: Math.ceil(currentChunk.length / 4),
199
+ });
200
+ }
201
+
202
+ return chunks;
203
+ }
204
+
205
/**
 * Paragraph-aware chunking.
 *
 * Accumulates whole paragraphs (separated by blank lines) into chunks of
 * up to `maxChunkSize` characters. A single paragraph larger than
 * `maxChunkSize` is itself broken up with the sentence-aware chunker.
 *
 * NOTE(review): `startPos`/`endPos` are tracked by summing trimmed
 * paragraph lengths (+2 per break), so they are approximate offsets
 * into `text`, not exact indices — confirm before relying on them.
 */
function chunkByParagraph(
  text: string,
  config: Required<ChunkingConfig>
): Chunk[] {
  const { maxChunkSize, minChunkSize } = config;

  // Split by paragraph breaks (two or more consecutive newlines).
  const paragraphs = text.split(PARAGRAPH_BREAKS).filter(p => p.trim().length > 0);

  const chunks: Chunk[] = [];
  let currentChunk = '';   // paragraphs accumulated for the chunk being built
  let currentStart = 0;    // approximate start offset of the current chunk
  let index = 0;           // ordinal for the next emitted chunk
  let textPos = 0;         // approximate running offset into the text

  for (const paragraph of paragraphs) {
    const trimmedPara = paragraph.trim();

    // If single paragraph exceeds max, fall back to sentence chunking
    if (trimmedPara.length > maxChunkSize) {
      // First flush whatever was accumulated before this oversized
      // paragraph so ordering is preserved.
      if (currentChunk.length > 0) {
        chunks.push({
          text: currentChunk.trim(),
          index,
          startPos: currentStart,
          endPos: textPos,
          length: currentChunk.length,
          tokenCount: Math.ceil(currentChunk.length / 4),
        });
        index++;
      }

      // Chunk the large paragraph by sentence
      const subChunks = chunkBySentence(trimmedPara, config);
      for (const subChunk of subChunks) {
        // Re-base sub-chunk positions from paragraph-relative to
        // document-relative offsets, and renumber sequentially.
        chunks.push({
          ...subChunk,
          index,
          startPos: textPos + subChunk.startPos,
          endPos: textPos + subChunk.endPos,
        });
        index++;
      }

      currentChunk = '';
      currentStart = textPos + trimmedPara.length;
    } else if (currentChunk.length + trimmedPara.length > maxChunkSize && currentChunk.length >= minChunkSize) {
      // Adding this paragraph would overflow the chunk: flush it and
      // start a fresh chunk containing just this paragraph.
      chunks.push({
        text: currentChunk.trim(),
        index,
        startPos: currentStart,
        endPos: textPos,
        length: currentChunk.length,
        tokenCount: Math.ceil(currentChunk.length / 4),
      });

      currentChunk = trimmedPara;
      currentStart = textPos;
      index++;
    } else {
      // Otherwise keep accumulating, re-inserting the paragraph break.
      currentChunk += (currentChunk.length > 0 ? '\n\n' : '') + trimmedPara;
    }

    textPos += trimmedPara.length + 2; // +2 for paragraph break
  }

  // Add final chunk
  if (currentChunk.trim().length > 0) {
    chunks.push({
      text: currentChunk.trim(),
      index,
      startPos: currentStart,
      endPos: text.length,
      length: currentChunk.length,
      tokenCount: Math.ceil(currentChunk.length / 4),
    });
  }

  return chunks;
}
288
+
289
+ /**
290
+ * Token-based chunking (approximate - uses chars/4 as estimate)
291
+ */
292
+ function chunkByToken(
293
+ text: string,
294
+ config: Required<ChunkingConfig>
295
+ ): Chunk[] {
296
+ // Convert token limits to character limits (rough estimate: 1 token ≈ 4 chars)
297
+ const charConfig: Required<ChunkingConfig> = {
298
+ ...config,
299
+ maxChunkSize: config.maxChunkSize * 4,
300
+ overlap: config.overlap * 4,
301
+ minChunkSize: config.minChunkSize * 4,
302
+ };
303
+
304
+ // Use sentence-aware chunking with converted limits
305
+ return chunkBySentence(text, charConfig);
306
+ }
307
+
308
+ /**
309
+ * Estimate token count for text
310
+ */
311
+ export function estimateTokens(text: string): number {
312
+ // Simple estimation: ~4 characters per token on average
313
+ return Math.ceil(text.length / 4);
314
+ }
315
+
316
+ /**
317
+ * Reconstruct original text from chunks (approximate)
318
+ */
319
+ export function reconstructFromChunks(chunks: Chunk[]): string {
320
+ if (chunks.length === 0) return '';
321
+ if (chunks.length === 1) return chunks[0].text;
322
+
323
+ // Sort by index
324
+ const sorted = [...chunks].sort((a, b) => a.index - b.index);
325
+
326
+ // Simple concatenation (overlap removal is approximate)
327
+ let result = sorted[0].text;
328
+
329
+ for (let i = 1; i < sorted.length; i++) {
330
+ const chunk = sorted[i];
331
+ const prevChunk = sorted[i - 1];
332
+
333
+ // Find overlap by looking for common suffix/prefix
334
+ const overlapSize = Math.min(100, prevChunk.text.length, chunk.text.length);
335
+ const prevSuffix = prevChunk.text.slice(-overlapSize);
336
+ const currPrefix = chunk.text.slice(0, overlapSize);
337
+
338
+ // Find longest common overlap
339
+ let overlap = 0;
340
+ for (let len = overlapSize; len > 0; len--) {
341
+ if (currPrefix.startsWith(prevSuffix.slice(-len))) {
342
+ overlap = len;
343
+ break;
344
+ }
345
+ }
346
+
347
+ result += ' ' + chunk.text.slice(overlap);
348
+ }
349
+
350
+ return result.replace(/\s+/g, ' ').trim();
351
+ }