@sparkleideas/embeddings 3.0.0-alpha.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +651 -0
- package/package.json +66 -0
- package/src/__tests__/embedding-service.test.ts +126 -0
- package/src/chunking.ts +351 -0
- package/src/embedding-service.ts +1136 -0
- package/src/hyperbolic.ts +458 -0
- package/src/index.ts +116 -0
- package/src/neural-integration.ts +295 -0
- package/src/normalization.ts +267 -0
- package/src/persistent-cache.ts +410 -0
- package/src/types.ts +282 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for EmbeddingService
|
|
3
|
+
*/
|
|
4
|
+
import { describe, it, expect, beforeEach } from 'vitest';
|
|
5
|
+
import {
|
|
6
|
+
createEmbeddingService,
|
|
7
|
+
MockEmbeddingService,
|
|
8
|
+
cosineSimilarity,
|
|
9
|
+
euclideanDistance,
|
|
10
|
+
dotProduct,
|
|
11
|
+
computeSimilarity,
|
|
12
|
+
} from '../index.js';
|
|
13
|
+
|
|
14
|
+
describe('EmbeddingService', () => {
|
|
15
|
+
describe('MockEmbeddingService', () => {
|
|
16
|
+
let service: MockEmbeddingService;
|
|
17
|
+
|
|
18
|
+
beforeEach(() => {
|
|
19
|
+
service = new MockEmbeddingService({ provider: 'mock', dimensions: 128 });
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
it('should generate embeddings with correct dimensions', async () => {
|
|
23
|
+
const result = await service.embed('Hello, world!');
|
|
24
|
+
expect(result.embedding).toHaveLength(128);
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
it('should generate deterministic embeddings for same text', async () => {
|
|
28
|
+
const result1 = await service.embed('test text');
|
|
29
|
+
const result2 = await service.embed('test text');
|
|
30
|
+
|
|
31
|
+
// Mock service should be deterministic
|
|
32
|
+
expect(Array.from(result1.embedding)).toEqual(Array.from(result2.embedding));
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
it('should handle batch embeddings', async () => {
|
|
36
|
+
const texts = ['text1', 'text2', 'text3'];
|
|
37
|
+
const results = await service.embedBatch(texts);
|
|
38
|
+
|
|
39
|
+
expect(results.embeddings).toHaveLength(3);
|
|
40
|
+
// Each embedding should have correct dimensions
|
|
41
|
+
results.embeddings.forEach((emb) => {
|
|
42
|
+
expect(emb.length).toBe(128);
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
describe('createEmbeddingService', () => {
|
|
48
|
+
it('should create mock service', () => {
|
|
49
|
+
const service = createEmbeddingService({
|
|
50
|
+
provider: 'mock',
|
|
51
|
+
dimensions: 64,
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
expect(service).toBeInstanceOf(MockEmbeddingService);
|
|
55
|
+
});
|
|
56
|
+
});
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
describe('Similarity Functions', () => {
|
|
60
|
+
const vec1 = new Float32Array([1, 0, 0]);
|
|
61
|
+
const vec2 = new Float32Array([1, 0, 0]);
|
|
62
|
+
const vec3 = new Float32Array([0, 1, 0]);
|
|
63
|
+
const vec4 = new Float32Array([-1, 0, 0]);
|
|
64
|
+
|
|
65
|
+
describe('cosineSimilarity', () => {
|
|
66
|
+
it('should return 1 for identical vectors', () => {
|
|
67
|
+
expect(cosineSimilarity(vec1, vec2)).toBeCloseTo(1);
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
it('should return 0 for orthogonal vectors', () => {
|
|
71
|
+
expect(cosineSimilarity(vec1, vec3)).toBeCloseTo(0);
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
it('should return -1 for opposite vectors', () => {
|
|
75
|
+
expect(cosineSimilarity(vec1, vec4)).toBeCloseTo(-1);
|
|
76
|
+
});
|
|
77
|
+
});
|
|
78
|
+
|
|
79
|
+
describe('euclideanDistance', () => {
|
|
80
|
+
it('should return 0 for identical vectors', () => {
|
|
81
|
+
expect(euclideanDistance(vec1, vec2)).toBeCloseTo(0);
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it('should return sqrt(2) for unit orthogonal vectors', () => {
|
|
85
|
+
expect(euclideanDistance(vec1, vec3)).toBeCloseTo(Math.sqrt(2));
|
|
86
|
+
});
|
|
87
|
+
|
|
88
|
+
it('should return 2 for opposite unit vectors', () => {
|
|
89
|
+
expect(euclideanDistance(vec1, vec4)).toBeCloseTo(2);
|
|
90
|
+
});
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
describe('dotProduct', () => {
|
|
94
|
+
it('should return 1 for identical unit vectors', () => {
|
|
95
|
+
expect(dotProduct(vec1, vec2)).toBeCloseTo(1);
|
|
96
|
+
});
|
|
97
|
+
|
|
98
|
+
it('should return 0 for orthogonal vectors', () => {
|
|
99
|
+
expect(dotProduct(vec1, vec3)).toBeCloseTo(0);
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
it('should return -1 for opposite unit vectors', () => {
|
|
103
|
+
expect(dotProduct(vec1, vec4)).toBeCloseTo(-1);
|
|
104
|
+
});
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
describe('computeSimilarity', () => {
|
|
108
|
+
it('should use cosine metric by default', () => {
|
|
109
|
+
const result = computeSimilarity(vec1, vec2);
|
|
110
|
+
expect(result.metric).toBe('cosine');
|
|
111
|
+
expect(result.score).toBeCloseTo(1);
|
|
112
|
+
});
|
|
113
|
+
|
|
114
|
+
it('should support euclidean metric', () => {
|
|
115
|
+
const result = computeSimilarity(vec1, vec3, 'euclidean');
|
|
116
|
+
expect(result.metric).toBe('euclidean');
|
|
117
|
+
expect(result.score).toBeCloseTo(Math.sqrt(2));
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
it('should support dot product metric', () => {
|
|
121
|
+
const result = computeSimilarity(vec1, vec4, 'dot');
|
|
122
|
+
expect(result.metric).toBe('dot');
|
|
123
|
+
expect(result.score).toBeCloseTo(-1);
|
|
124
|
+
});
|
|
125
|
+
});
|
|
126
|
+
});
|
package/src/chunking.ts
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Chunking Utilities
|
|
3
|
+
*
|
|
4
|
+
* Features:
|
|
5
|
+
* - Configurable chunk size and overlap
|
|
6
|
+
* - Sentence-aware splitting
|
|
7
|
+
* - Paragraph-aware splitting
|
|
8
|
+
* - Token-based chunking (approximate)
|
|
9
|
+
* - Metadata tracking for reconstruction
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Chunking configuration.
 *
 * All fields are optional; `chunkText` resolves each missing field to the
 * default noted below before dispatching to a strategy.
 */
export interface ChunkingConfig {
  /** Maximum chunk size in characters (default: 512) */
  maxChunkSize?: number;
  /** Overlap between chunks in characters (default: 50) */
  overlap?: number;
  /** Strategy for splitting (default: 'sentence') */
  strategy?: 'character' | 'sentence' | 'paragraph' | 'token';
  /** Minimum chunk size (default: 100) */
  minChunkSize?: number;
  /** Include metadata with chunks */
  includeMetadata?: boolean;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Chunk result with metadata.
 *
 * NOTE(review): for the sentence/paragraph strategies, `startPos`/`endPos`
 * are tracked approximately while re-joining trimmed fragments — treat them
 * as hints rather than exact slice indices into the original text; confirm
 * before relying on them for reconstruction.
 */
export interface Chunk {
  /** Chunk text content */
  text: string;
  /** Original index in document */
  index: number;
  /** Start position in original text */
  startPos: number;
  /** End position in original text */
  endPos: number;
  /** Character count */
  length: number;
  /** Approximate token count (chars / 4) */
  tokenCount: number;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Chunked document result returned by `chunkText`.
 */
export interface ChunkedDocument {
  /** Array of chunks */
  chunks: Chunk[];
  /** Original text length (before whitespace normalization) */
  originalLength: number;
  /** Total chunks created (equals `chunks.length`) */
  totalChunks: number;
  /** Configuration used, with all defaults resolved */
  config: Required<ChunkingConfig>;
}
|
|
59
|
+
|
|
60
|
+
// Sentence boundary patterns.
// NOTE(review): the lookbehind form requires an ES2018+ runtime/target, and a
// boundary is only detected when the next sentence begins with an ASCII
// capital letter — abbreviations ("e.g. this") and lowercase continuations
// will not split. Confirm this matches the intended corpus.
const SENTENCE_ENDINGS = /(?<=[.!?])\s+(?=[A-Z])/g;
const PARAGRAPH_BREAKS = /\n\n+/g;
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Split text into chunks with overlap
|
|
66
|
+
*/
|
|
67
|
+
export function chunkText(
|
|
68
|
+
text: string,
|
|
69
|
+
config: ChunkingConfig = {}
|
|
70
|
+
): ChunkedDocument {
|
|
71
|
+
const finalConfig: Required<ChunkingConfig> = {
|
|
72
|
+
maxChunkSize: config.maxChunkSize ?? 512,
|
|
73
|
+
overlap: config.overlap ?? 50,
|
|
74
|
+
strategy: config.strategy ?? 'sentence',
|
|
75
|
+
minChunkSize: config.minChunkSize ?? 100,
|
|
76
|
+
includeMetadata: config.includeMetadata ?? true,
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
// Normalize whitespace
|
|
80
|
+
const normalizedText = text.replace(/\s+/g, ' ').trim();
|
|
81
|
+
|
|
82
|
+
let chunks: Chunk[];
|
|
83
|
+
|
|
84
|
+
switch (finalConfig.strategy) {
|
|
85
|
+
case 'character':
|
|
86
|
+
chunks = chunkByCharacter(normalizedText, finalConfig);
|
|
87
|
+
break;
|
|
88
|
+
case 'sentence':
|
|
89
|
+
chunks = chunkBySentence(normalizedText, finalConfig);
|
|
90
|
+
break;
|
|
91
|
+
case 'paragraph':
|
|
92
|
+
chunks = chunkByParagraph(text, finalConfig); // Keep original for paragraphs
|
|
93
|
+
break;
|
|
94
|
+
case 'token':
|
|
95
|
+
chunks = chunkByToken(normalizedText, finalConfig);
|
|
96
|
+
break;
|
|
97
|
+
default:
|
|
98
|
+
chunks = chunkBySentence(normalizedText, finalConfig);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
return {
|
|
102
|
+
chunks,
|
|
103
|
+
originalLength: text.length,
|
|
104
|
+
totalChunks: chunks.length,
|
|
105
|
+
config: finalConfig,
|
|
106
|
+
};
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Simple character-based chunking with overlap
|
|
111
|
+
*/
|
|
112
|
+
function chunkByCharacter(
|
|
113
|
+
text: string,
|
|
114
|
+
config: Required<ChunkingConfig>
|
|
115
|
+
): Chunk[] {
|
|
116
|
+
const chunks: Chunk[] = [];
|
|
117
|
+
const { maxChunkSize, overlap } = config;
|
|
118
|
+
|
|
119
|
+
let pos = 0;
|
|
120
|
+
let index = 0;
|
|
121
|
+
|
|
122
|
+
while (pos < text.length) {
|
|
123
|
+
const endPos = Math.min(pos + maxChunkSize, text.length);
|
|
124
|
+
const chunkText = text.slice(pos, endPos);
|
|
125
|
+
|
|
126
|
+
chunks.push({
|
|
127
|
+
text: chunkText,
|
|
128
|
+
index,
|
|
129
|
+
startPos: pos,
|
|
130
|
+
endPos,
|
|
131
|
+
length: chunkText.length,
|
|
132
|
+
tokenCount: Math.ceil(chunkText.length / 4),
|
|
133
|
+
});
|
|
134
|
+
|
|
135
|
+
// Move position with overlap
|
|
136
|
+
pos = endPos - overlap;
|
|
137
|
+
if (pos >= text.length - overlap) {
|
|
138
|
+
break;
|
|
139
|
+
}
|
|
140
|
+
index++;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
return chunks;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Sentence-aware chunking - keeps sentences intact
|
|
148
|
+
*/
|
|
149
|
+
function chunkBySentence(
|
|
150
|
+
text: string,
|
|
151
|
+
config: Required<ChunkingConfig>
|
|
152
|
+
): Chunk[] {
|
|
153
|
+
const { maxChunkSize, overlap, minChunkSize } = config;
|
|
154
|
+
|
|
155
|
+
// Split into sentences
|
|
156
|
+
const sentences = text.split(SENTENCE_ENDINGS).filter(s => s.trim().length > 0);
|
|
157
|
+
|
|
158
|
+
const chunks: Chunk[] = [];
|
|
159
|
+
let currentChunk = '';
|
|
160
|
+
let currentStart = 0;
|
|
161
|
+
let index = 0;
|
|
162
|
+
let textPos = 0;
|
|
163
|
+
|
|
164
|
+
for (const sentence of sentences) {
|
|
165
|
+
const trimmedSentence = sentence.trim();
|
|
166
|
+
|
|
167
|
+
// If adding this sentence exceeds max size, save current chunk
|
|
168
|
+
if (currentChunk.length + trimmedSentence.length > maxChunkSize && currentChunk.length >= minChunkSize) {
|
|
169
|
+
chunks.push({
|
|
170
|
+
text: currentChunk.trim(),
|
|
171
|
+
index,
|
|
172
|
+
startPos: currentStart,
|
|
173
|
+
endPos: textPos,
|
|
174
|
+
length: currentChunk.length,
|
|
175
|
+
tokenCount: Math.ceil(currentChunk.length / 4),
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
// Start new chunk with overlap (last part of previous chunk)
|
|
179
|
+
const overlapText = currentChunk.slice(-overlap);
|
|
180
|
+
currentChunk = overlapText + ' ' + trimmedSentence;
|
|
181
|
+
currentStart = textPos - overlap;
|
|
182
|
+
index++;
|
|
183
|
+
} else {
|
|
184
|
+
currentChunk += (currentChunk.length > 0 ? ' ' : '') + trimmedSentence;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
textPos += trimmedSentence.length + 1;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// Add final chunk
|
|
191
|
+
if (currentChunk.trim().length > 0) {
|
|
192
|
+
chunks.push({
|
|
193
|
+
text: currentChunk.trim(),
|
|
194
|
+
index,
|
|
195
|
+
startPos: currentStart,
|
|
196
|
+
endPos: text.length,
|
|
197
|
+
length: currentChunk.length,
|
|
198
|
+
tokenCount: Math.ceil(currentChunk.length / 4),
|
|
199
|
+
});
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
return chunks;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
/**
 * Paragraph-aware chunking.
 *
 * Accumulates whole paragraphs (split on blank lines) into chunks up to
 * `maxChunkSize`. A single paragraph larger than `maxChunkSize` is itself
 * split with `chunkBySentence`, and those sub-chunks are re-indexed and
 * position-shifted into this document's coordinate space.
 *
 * NOTE(review): `length`/`tokenCount` are computed from the untrimmed
 * accumulator while `text` is trimmed, so they can differ slightly from
 * `text.length`. `textPos` assumes every paragraph break was exactly two
 * characters, so positions are approximate — confirm before exact slicing.
 *
 * @param text - Original (non-collapsed) text; blank lines mark paragraphs.
 * @param config - Fully-resolved chunking configuration.
 * @returns Chunks in document order.
 */
function chunkByParagraph(
  text: string,
  config: Required<ChunkingConfig>
): Chunk[] {
  const { maxChunkSize, minChunkSize } = config;

  // Split by paragraph breaks (one or more blank lines).
  const paragraphs = text.split(PARAGRAPH_BREAKS).filter(p => p.trim().length > 0);

  const chunks: Chunk[] = [];
  let currentChunk = '';   // paragraphs accumulated for the next chunk
  let currentStart = 0;    // approximate start of the accumulator in `text`
  let index = 0;           // next chunk index to assign
  let textPos = 0;         // approximate cursor into `text`

  for (const paragraph of paragraphs) {
    const trimmedPara = paragraph.trim();

    // If single paragraph exceeds max, fall back to sentence chunking.
    if (trimmedPara.length > maxChunkSize) {
      // Flush whatever was accumulated first so ordering is preserved.
      if (currentChunk.length > 0) {
        chunks.push({
          text: currentChunk.trim(),
          index,
          startPos: currentStart,
          endPos: textPos,
          length: currentChunk.length,
          tokenCount: Math.ceil(currentChunk.length / 4),
        });
        index++;
      }

      // Chunk the large paragraph by sentence, then splice the sub-chunks
      // in with document-level indices and shifted positions.
      const subChunks = chunkBySentence(trimmedPara, config);
      for (const subChunk of subChunks) {
        chunks.push({
          ...subChunk,
          index,
          startPos: textPos + subChunk.startPos,
          endPos: textPos + subChunk.endPos,
        });
        index++;
      }

      currentChunk = '';
      currentStart = textPos + trimmedPara.length;
    } else if (currentChunk.length + trimmedPara.length > maxChunkSize && currentChunk.length >= minChunkSize) {
      // Accumulator is full enough: emit it and start over with this paragraph.
      chunks.push({
        text: currentChunk.trim(),
        index,
        startPos: currentStart,
        endPos: textPos,
        length: currentChunk.length,
        tokenCount: Math.ceil(currentChunk.length / 4),
      });

      currentChunk = trimmedPara;
      currentStart = textPos;
      index++;
    } else {
      // Room left: append this paragraph, restoring a blank-line separator.
      currentChunk += (currentChunk.length > 0 ? '\n\n' : '') + trimmedPara;
    }

    textPos += trimmedPara.length + 2; // +2 for paragraph break
  }

  // Add final chunk.
  if (currentChunk.trim().length > 0) {
    chunks.push({
      text: currentChunk.trim(),
      index,
      startPos: currentStart,
      endPos: text.length,
      length: currentChunk.length,
      tokenCount: Math.ceil(currentChunk.length / 4),
    });
  }

  return chunks;
}
|
|
288
|
+
|
|
289
|
+
/**
|
|
290
|
+
* Token-based chunking (approximate - uses chars/4 as estimate)
|
|
291
|
+
*/
|
|
292
|
+
function chunkByToken(
|
|
293
|
+
text: string,
|
|
294
|
+
config: Required<ChunkingConfig>
|
|
295
|
+
): Chunk[] {
|
|
296
|
+
// Convert token limits to character limits (rough estimate: 1 token ≈ 4 chars)
|
|
297
|
+
const charConfig: Required<ChunkingConfig> = {
|
|
298
|
+
...config,
|
|
299
|
+
maxChunkSize: config.maxChunkSize * 4,
|
|
300
|
+
overlap: config.overlap * 4,
|
|
301
|
+
minChunkSize: config.minChunkSize * 4,
|
|
302
|
+
};
|
|
303
|
+
|
|
304
|
+
// Use sentence-aware chunking with converted limits
|
|
305
|
+
return chunkBySentence(text, charConfig);
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
/**
|
|
309
|
+
* Estimate token count for text
|
|
310
|
+
*/
|
|
311
|
+
export function estimateTokens(text: string): number {
|
|
312
|
+
// Simple estimation: ~4 characters per token on average
|
|
313
|
+
return Math.ceil(text.length / 4);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/**
|
|
317
|
+
* Reconstruct original text from chunks (approximate)
|
|
318
|
+
*/
|
|
319
|
+
export function reconstructFromChunks(chunks: Chunk[]): string {
|
|
320
|
+
if (chunks.length === 0) return '';
|
|
321
|
+
if (chunks.length === 1) return chunks[0].text;
|
|
322
|
+
|
|
323
|
+
// Sort by index
|
|
324
|
+
const sorted = [...chunks].sort((a, b) => a.index - b.index);
|
|
325
|
+
|
|
326
|
+
// Simple concatenation (overlap removal is approximate)
|
|
327
|
+
let result = sorted[0].text;
|
|
328
|
+
|
|
329
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
330
|
+
const chunk = sorted[i];
|
|
331
|
+
const prevChunk = sorted[i - 1];
|
|
332
|
+
|
|
333
|
+
// Find overlap by looking for common suffix/prefix
|
|
334
|
+
const overlapSize = Math.min(100, prevChunk.text.length, chunk.text.length);
|
|
335
|
+
const prevSuffix = prevChunk.text.slice(-overlapSize);
|
|
336
|
+
const currPrefix = chunk.text.slice(0, overlapSize);
|
|
337
|
+
|
|
338
|
+
// Find longest common overlap
|
|
339
|
+
let overlap = 0;
|
|
340
|
+
for (let len = overlapSize; len > 0; len--) {
|
|
341
|
+
if (currPrefix.startsWith(prevSuffix.slice(-len))) {
|
|
342
|
+
overlap = len;
|
|
343
|
+
break;
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
result += ' ' + chunk.text.slice(overlap);
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
return result.replace(/\s+/g, ' ').trim();
|
|
351
|
+
}
|