@sparkleideas/performance 3.0.0-alpha.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,569 @@
1
+ /**
2
+ * Attention Memory Efficiency Benchmark
3
+ *
4
+ * Target: 50-75% memory reduction
5
+ *
6
+ * Measures memory efficiency of different attention implementations
7
+ * and optimization strategies.
8
+ */
9
+
import { pathToFileURL } from 'node:url';

import { benchmark, BenchmarkRunner, formatTime, formatBytes } from '../../src/framework/benchmark.js';
11
+
12
+ // ============================================================================
13
+ // Memory Tracking
14
+ // ============================================================================
15
+
16
/**
 * Point-in-time process memory counters, all in bytes.
 * Field names and semantics mirror Node's `process.memoryUsage()`.
 */
interface MemorySnapshot {
  heapUsed: number; // V8 heap currently in use
  heapTotal: number; // V8 heap reserved
  external: number; // memory of C++ objects bound to JS objects
  arrayBuffers: number; // ArrayBuffer / SharedArrayBuffer allocations
  rss: number; // resident set size of the whole process
}
23
+
24
+ function takeMemorySnapshot(): MemorySnapshot {
25
+ const mem = process.memoryUsage();
26
+ return {
27
+ heapUsed: mem.heapUsed,
28
+ heapTotal: mem.heapTotal,
29
+ external: mem.external,
30
+ arrayBuffers: mem.arrayBuffers,
31
+ rss: mem.rss,
32
+ };
33
+ }
34
+
35
+ function calculateMemoryDelta(before: MemorySnapshot, after: MemorySnapshot): number {
36
+ return after.heapUsed - before.heapUsed;
37
+ }
38
+
39
+ // ============================================================================
40
+ // Attention Implementations for Memory Testing
41
+ // ============================================================================
42
+
43
+ /**
44
+ * Standard attention - stores full attention matrix
45
+ */
46
+ function standardAttention(
47
+ query: Float32Array,
48
+ key: Float32Array,
49
+ value: Float32Array,
50
+ seqLength: number,
51
+ headDim: number
52
+ ): { output: Float32Array; attentionMatrix: Float32Array } {
53
+ const scale = 1 / Math.sqrt(headDim);
54
+
55
+ // Full attention matrix - O(n^2) memory
56
+ const attentionMatrix = new Float32Array(seqLength * seqLength);
57
+
58
+ // Compute scores
59
+ for (let i = 0; i < seqLength; i++) {
60
+ for (let j = 0; j < seqLength; j++) {
61
+ let dot = 0;
62
+ for (let k = 0; k < headDim; k++) {
63
+ dot += query[i * headDim + k]! * key[j * headDim + k]!;
64
+ }
65
+ attentionMatrix[i * seqLength + j] = dot * scale;
66
+ }
67
+ }
68
+
69
+ // Softmax
70
+ for (let i = 0; i < seqLength; i++) {
71
+ let max = -Infinity;
72
+ for (let j = 0; j < seqLength; j++) {
73
+ max = Math.max(max, attentionMatrix[i * seqLength + j]!);
74
+ }
75
+
76
+ let sum = 0;
77
+ for (let j = 0; j < seqLength; j++) {
78
+ const exp = Math.exp(attentionMatrix[i * seqLength + j]! - max);
79
+ attentionMatrix[i * seqLength + j] = exp;
80
+ sum += exp;
81
+ }
82
+
83
+ for (let j = 0; j < seqLength; j++) {
84
+ attentionMatrix[i * seqLength + j]! /= sum;
85
+ }
86
+ }
87
+
88
+ // Output
89
+ const output = new Float32Array(seqLength * headDim);
90
+ for (let i = 0; i < seqLength; i++) {
91
+ for (let k = 0; k < headDim; k++) {
92
+ let sum = 0;
93
+ for (let j = 0; j < seqLength; j++) {
94
+ sum += attentionMatrix[i * seqLength + j]! * value[j * headDim + k]!;
95
+ }
96
+ output[i * headDim + k] = sum;
97
+ }
98
+ }
99
+
100
+ return { output, attentionMatrix };
101
+ }
102
+
103
+ /**
104
+ * Memory-efficient attention - no full matrix storage
105
+ */
106
+ function memoryEfficientAttention(
107
+ query: Float32Array,
108
+ key: Float32Array,
109
+ value: Float32Array,
110
+ seqLength: number,
111
+ headDim: number
112
+ ): { output: Float32Array } {
113
+ const scale = 1 / Math.sqrt(headDim);
114
+ const output = new Float32Array(seqLength * headDim);
115
+
116
+ // Process row by row - O(n) memory for scores
117
+ const rowScores = new Float32Array(seqLength);
118
+
119
+ for (let i = 0; i < seqLength; i++) {
120
+ // Compute scores for this row
121
+ let max = -Infinity;
122
+ for (let j = 0; j < seqLength; j++) {
123
+ let dot = 0;
124
+ for (let k = 0; k < headDim; k++) {
125
+ dot += query[i * headDim + k]! * key[j * headDim + k]!;
126
+ }
127
+ rowScores[j] = dot * scale;
128
+ max = Math.max(max, rowScores[j]!);
129
+ }
130
+
131
+ // Softmax
132
+ let sum = 0;
133
+ for (let j = 0; j < seqLength; j++) {
134
+ rowScores[j] = Math.exp(rowScores[j]! - max);
135
+ sum += rowScores[j]!;
136
+ }
137
+ for (let j = 0; j < seqLength; j++) {
138
+ rowScores[j]! /= sum;
139
+ }
140
+
141
+ // Compute output for this row
142
+ for (let k = 0; k < headDim; k++) {
143
+ let val = 0;
144
+ for (let j = 0; j < seqLength; j++) {
145
+ val += rowScores[j]! * value[j * headDim + k]!;
146
+ }
147
+ output[i * headDim + k] = val;
148
+ }
149
+ }
150
+
151
+ return { output };
152
+ }
153
+
154
+ /**
155
+ * Chunked attention - process in blocks
156
+ */
157
+ function chunkedAttention(
158
+ query: Float32Array,
159
+ key: Float32Array,
160
+ value: Float32Array,
161
+ seqLength: number,
162
+ headDim: number,
163
+ chunkSize: number = 64
164
+ ): { output: Float32Array } {
165
+ const scale = 1 / Math.sqrt(headDim);
166
+ const output = new Float32Array(seqLength * headDim);
167
+ const numChunks = Math.ceil(seqLength / chunkSize);
168
+
169
+ // Chunk buffer - O(chunkSize^2) memory
170
+ const chunkScores = new Float32Array(chunkSize * seqLength);
171
+ const rowMax = new Float32Array(chunkSize).fill(-Infinity);
172
+ const rowSum = new Float32Array(chunkSize).fill(0);
173
+
174
+ for (let ci = 0; ci < numChunks; ci++) {
175
+ const iStart = ci * chunkSize;
176
+ const iEnd = Math.min(iStart + chunkSize, seqLength);
177
+ const iSize = iEnd - iStart;
178
+
179
+ // Reset accumulators
180
+ rowMax.fill(-Infinity);
181
+ rowSum.fill(0);
182
+ output.fill(0, iStart * headDim, iEnd * headDim);
183
+
184
+ for (let cj = 0; cj < numChunks; cj++) {
185
+ const jStart = cj * chunkSize;
186
+ const jEnd = Math.min(jStart + chunkSize, seqLength);
187
+ const jSize = jEnd - jStart;
188
+
189
+ // Compute chunk scores
190
+ for (let i = 0; i < iSize; i++) {
191
+ for (let j = 0; j < jSize; j++) {
192
+ let dot = 0;
193
+ for (let k = 0; k < headDim; k++) {
194
+ dot += query[(iStart + i) * headDim + k]! * key[(jStart + j) * headDim + k]!;
195
+ }
196
+ chunkScores[i * seqLength + jStart + j] = dot * scale;
197
+ }
198
+ }
199
+
200
+ // Online softmax update
201
+ for (let i = 0; i < iSize; i++) {
202
+ const prevMax = rowMax[i]!;
203
+
204
+ // Find new max
205
+ for (let j = 0; j < jSize; j++) {
206
+ rowMax[i] = Math.max(rowMax[i]!, chunkScores[i * seqLength + jStart + j]!);
207
+ }
208
+
209
+ // Rescale previous
210
+ if (prevMax !== -Infinity && prevMax !== rowMax[i]) {
211
+ const rescale = Math.exp(prevMax - rowMax[i]!);
212
+ rowSum[i]! *= rescale;
213
+ for (let k = 0; k < headDim; k++) {
214
+ output[(iStart + i) * headDim + k]! *= rescale;
215
+ }
216
+ }
217
+
218
+ // Add new exponentials
219
+ for (let j = 0; j < jSize; j++) {
220
+ const exp = Math.exp(chunkScores[i * seqLength + jStart + j]! - rowMax[i]!);
221
+ chunkScores[i * seqLength + jStart + j] = exp;
222
+ rowSum[i]! += exp;
223
+ }
224
+
225
+ // Accumulate output
226
+ for (let k = 0; k < headDim; k++) {
227
+ for (let j = 0; j < jSize; j++) {
228
+ output[(iStart + i) * headDim + k]! +=
229
+ chunkScores[i * seqLength + jStart + j]! * value[(jStart + j) * headDim + k]!;
230
+ }
231
+ }
232
+ }
233
+ }
234
+
235
+ // Final normalization
236
+ for (let i = 0; i < iSize; i++) {
237
+ for (let k = 0; k < headDim; k++) {
238
+ output[(iStart + i) * headDim + k]! /= rowSum[i]!;
239
+ }
240
+ }
241
+ }
242
+
243
+ return { output };
244
+ }
245
+
246
+ // ============================================================================
247
+ // Helper Functions
248
+ // ============================================================================
249
+
250
+ function generateRandomTensor(size: number): Float32Array {
251
+ const tensor = new Float32Array(size);
252
+ for (let i = 0; i < size; i++) {
253
+ tensor[i] = Math.random() * 2 - 1;
254
+ }
255
+ return tensor;
256
+ }
257
+
258
+ // ============================================================================
259
+ // Benchmark Suite
260
+ // ============================================================================
261
+
262
/**
 * Run the full attention memory-efficiency benchmark suite and print results.
 *
 * Sections: measured heap deltas per implementation, theoretical memory
 * analysis, performance/memory tradeoff timings, multi-head (MHA/GQA/MQA)
 * memory estimates, and a summary against the 50% reduction target.
 *
 * NOTE(review): heap-delta measurements are only meaningful when Node is run
 * with --expose-gc (so `global.gc` is callable); otherwise GC noise can make
 * deltas — and the reported reduction — negative. Confirm the runner passes
 * that flag.
 */
export async function runMemoryEfficiencyBenchmarks(): Promise<void> {
  const runner = new BenchmarkRunner('Attention Memory Efficiency');

  console.log('\n--- Attention Memory Efficiency Benchmarks ---\n');

  // Test configurations
  const seqLengths = [128, 256, 512, 1024];
  const headDim = 64;

  // Memory scaling comparison
  console.log('--- Memory Scaling by Sequence Length ---\n');

  // Measured heap deltas per sequence length, collected for the summary.
  const memoryResults: Array<{
    seqLength: number;
    standard: number;
    efficient: number;
    chunked: number;
    reduction: number;
  }> = [];

  for (const seqLength of seqLengths) {
    console.log(`Sequence Length: ${seqLength}`);

    const size = seqLength * headDim;
    const query = generateRandomTensor(size);
    const key = generateRandomTensor(size);
    const value = generateRandomTensor(size);

    // Standard attention memory
    if (typeof global.gc === 'function') global.gc();
    const standardBefore = takeMemorySnapshot();
    const standardResult = standardAttention(query, key, value, seqLength, headDim);
    const standardAfter = takeMemorySnapshot();
    const standardMem = calculateMemoryDelta(standardBefore, standardAfter);
    // `void` keeps the result referenced so the allocation survives the "after" snapshot.
    void standardResult;

    // Memory-efficient attention
    if (typeof global.gc === 'function') global.gc();
    const efficientBefore = takeMemorySnapshot();
    const efficientResult = memoryEfficientAttention(query, key, value, seqLength, headDim);
    const efficientAfter = takeMemorySnapshot();
    const efficientMem = calculateMemoryDelta(efficientBefore, efficientAfter);
    void efficientResult;

    // Chunked attention
    if (typeof global.gc === 'function') global.gc();
    const chunkedBefore = takeMemorySnapshot();
    const chunkedResult = chunkedAttention(query, key, value, seqLength, headDim);
    const chunkedAfter = takeMemorySnapshot();
    const chunkedMem = calculateMemoryDelta(chunkedBefore, chunkedAfter);
    void chunkedResult;

    // Percent reduction of efficient vs standard (NaN/Infinity if standardMem is 0).
    const reduction = ((standardMem - efficientMem) / standardMem) * 100;

    memoryResults.push({
      seqLength,
      standard: standardMem,
      efficient: efficientMem,
      chunked: chunkedMem,
      reduction,
    });

    console.log(`  Standard: ${formatBytes(standardMem)}`);
    console.log(`  Efficient: ${formatBytes(efficientMem)}`);
    console.log(`  Chunked: ${formatBytes(chunkedMem)}`);
    console.log(`  Reduction: ${reduction.toFixed(1)}%`);
    console.log('');
  }

  // Theoretical memory comparison
  console.log('--- Theoretical Memory Analysis ---\n');

  for (const seqLength of seqLengths) {
    const bytesPerFloat = 4;

    // Standard: stores full n x n attention matrix
    const standardTheory = seqLength * seqLength * bytesPerFloat;

    // Efficient: stores only one row at a time
    const efficientTheory = seqLength * bytesPerFloat;

    // Chunked: stores chunk x n scores
    const chunkSize = 64;
    const chunkedTheory = chunkSize * seqLength * bytesPerFloat;

    console.log(`Seq ${seqLength}:`);
    console.log(`  Standard: ${formatBytes(standardTheory)} (n^2)`);
    console.log(`  Efficient: ${formatBytes(efficientTheory)} (n)`);
    console.log(`  Chunked: ${formatBytes(chunkedTheory)} (chunk * n)`);
    console.log(`  Theoretical reduction: ${((1 - efficientTheory / standardTheory) * 100).toFixed(1)}%`);
    console.log('');
  }

  // Performance vs Memory tradeoff
  console.log('--- Performance vs Memory Tradeoff ---\n');

  const tradeoffConfig = { seqLength: 512, headDim: 64 };
  const size = tradeoffConfig.seqLength * tradeoffConfig.headDim;
  const q = generateRandomTensor(size);
  const k = generateRandomTensor(size);
  const v = generateRandomTensor(size);

  // Standard performance
  const standardPerfResult = await runner.run(
    'standard-attention-perf',
    async () => {
      standardAttention(q, k, v, tradeoffConfig.seqLength, tradeoffConfig.headDim);
    },
    { iterations: 20 }
  );

  console.log(`Standard Performance: ${formatTime(standardPerfResult.mean)}`);

  // Efficient performance
  const efficientPerfResult = await runner.run(
    'efficient-attention-perf',
    async () => {
      memoryEfficientAttention(q, k, v, tradeoffConfig.seqLength, tradeoffConfig.headDim);
    },
    { iterations: 20 }
  );

  console.log(`Memory-Efficient Performance: ${formatTime(efficientPerfResult.mean)}`);

  // Chunked performance with different chunk sizes
  const chunkSizes = [32, 64, 128, 256];

  for (const chunkSize of chunkSizes) {
    const chunkedPerfResult = await runner.run(
      `chunked-attention-chunk${chunkSize}`,
      async () => {
        chunkedAttention(q, k, v, tradeoffConfig.seqLength, tradeoffConfig.headDim, chunkSize);
      },
      { iterations: 20 }
    );

    console.log(`Chunked (size=${chunkSize}): ${formatTime(chunkedPerfResult.mean)}`);
  }

  // Multi-head memory analysis (analytic byte counts only — nothing is allocated here).
  console.log('\n--- Multi-Head Memory Analysis ---\n');

  const numHeads = [4, 8, 16, 32];
  const mhaSeqLength = 256;

  for (const heads of numHeads) {
    const mhaSize = mhaSeqLength * headDim;

    // Standard MHA memory
    const standardMHAMem = mhaSeqLength * mhaSeqLength * 4 * heads; // attention matrices
    const qkvMem = mhaSize * 4 * 3 * heads; // QKV storage

    // GQA memory (shared KV) — assumes a fixed 4:1 query-to-KV head ratio.
    const gqaKVHeads = heads / 4;
    const gqaMem = mhaSeqLength * mhaSeqLength * 4 * heads + // attention matrices (same)
      mhaSize * 4 * heads + // Q storage
      mhaSize * 4 * 2 * gqaKVHeads; // shared KV

    // MQA memory (single KV)
    const mqaMem = mhaSeqLength * mhaSeqLength * 4 * heads + // attention matrices
      mhaSize * 4 * heads + // Q storage
      mhaSize * 4 * 2; // single KV

    console.log(`${heads} heads:`);
    console.log(`  Standard MHA: ${formatBytes(standardMHAMem + qkvMem)}`);
    console.log(`  GQA (${gqaKVHeads} KV): ${formatBytes(gqaMem)}`);
    console.log(`  MQA (1 KV): ${formatBytes(mqaMem)}`);
    console.log(`  MQA reduction: ${(((standardMHAMem + qkvMem) - mqaMem) / (standardMHAMem + qkvMem) * 100).toFixed(1)}%`);
    console.log('');
  }

  // Summary
  console.log('--- Summary ---\n');

  console.log('Memory Reduction Achieved:');
  for (const result of memoryResults) {
    // Target from the file header: >= 50% memory reduction.
    const targetMet = result.reduction >= 50;
    console.log(
      `  Seq ${result.seqLength}: ${result.reduction.toFixed(1)}% ${targetMet ? '(TARGET MET)' : ''}`
    );
  }

  console.log('\nPerformance Comparison (seq=512):');
  console.log(`  Standard: ${formatTime(standardPerfResult.mean)}`);
  console.log(`  Efficient: ${formatTime(efficientPerfResult.mean)}`);

  // Print full results
  runner.printResults();
}
451
+
452
+ // ============================================================================
453
+ // Memory Efficiency Optimization Strategies
454
+ // ============================================================================
455
+
456
/**
 * Catalog of attention memory-optimization strategies.
 *
 * Each entry pairs a prose description and expected improvement with an
 * illustrative code snippet. The `implementation` strings are documentation
 * only — they are never executed or type-checked by this module.
 */
export const memoryOptimizations = {
  /**
   * Online softmax computation — streaming max/sum so the full score
   * vector never needs to be materialized at once.
   */
  onlineSoftmax: {
    description: 'Compute softmax in streaming fashion without storing all values',
    expectedImprovement: 'O(n) instead of O(n^2) for softmax',
    implementation: `
class OnlineSoftmax {
  private max = -Infinity;
  private sum = 0;
  private count = 0;

  add(value: number): void {
    if (value > this.max) {
      this.sum *= Math.exp(this.max - value);
      this.max = value;
    }
    this.sum += Math.exp(value - this.max);
    this.count++;
  }

  normalize(value: number): number {
    return Math.exp(value - this.max) / this.sum;
  }
}
`,
  },

  /**
   * Gradient checkpointing — trade recomputation for activation storage.
   */
  gradientCheckpointing: {
    description: 'Recompute attention during backward pass instead of storing',
    expectedImprovement: 'O(1) memory for activations',
    implementation: `
function checkpointedAttention(q, k, v) {
  const output = computeAttention(q, k, v);

  function backward(gradOutput) {
    // Recompute attention weights during backward
    const attnWeights = recomputeAttention(q, k);
    return computeGradients(gradOutput, attnWeights, q, k, v);
  }

  return { output, backward };
}
`,
  },

  /**
   * Sparse attention patterns — restrict which positions attend to which.
   */
  sparseAttention: {
    description: 'Only compute attention for relevant positions',
    expectedImprovement: 'O(n * k) instead of O(n^2) where k << n',
    implementation: `
function sparseAttention(q, k, v, pattern: 'local' | 'strided' | 'block') {
  const sparseMask = generateSparsePattern(q.length, pattern);
  return computeAttentionWithMask(q, k, v, sparseMask);
}
`,
  },

  /**
   * Quantization — lower-precision storage and arithmetic.
   */
  quantization: {
    description: 'Use lower precision for attention computation',
    expectedImprovement: '2-4x memory reduction',
    implementation: `
function quantizedAttention(q, k, v) {
  // Quantize to int8
  const qInt8 = quantizeToInt8(q);
  const kInt8 = quantizeToInt8(k);

  // Compute in int8
  const scores = computeInt8Attention(qInt8, kInt8);

  // Dequantize for output
  return dequantizeAndApply(scores, v);
}
`,
  },

  /**
   * Memory pooling — pre-allocate and reuse scratch buffers.
   */
  memoryPooling: {
    description: 'Reuse memory buffers across forward passes',
    expectedImprovement: 'Eliminates allocation overhead',
    implementation: `
class AttentionMemoryPool {
  private scoreBuffer: Float32Array;
  private outputBuffer: Float32Array;

  forward(q, k, v) {
    // Reuse pre-allocated buffers
    computeScores(q, k, this.scoreBuffer);
    applySoftmax(this.scoreBuffer);
    computeOutput(this.scoreBuffer, v, this.outputBuffer);
    return this.outputBuffer;
  }
}
`,
  },
};
563
+
564
+ // Run if executed directly
565
+ if (import.meta.url === `file://${process.argv[1]}`) {
566
+ runMemoryEfficiencyBenchmarks().catch(console.error);
567
+ }
568
+
569
+ export default runMemoryEfficiencyBenchmarks;