@sparkleideas/performance 3.0.0-alpha.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +256 -0
- package/__tests__/README.md +242 -0
- package/__tests__/attention.test.ts +516 -0
- package/__tests__/benchmarks.test.ts +515 -0
- package/benchmarks/attention/memory-efficiency.bench.ts +569 -0
- package/benchmarks/attention/multi-head-attention.bench.ts +566 -0
- package/benchmarks/startup/agent-spawn.bench.ts +422 -0
- package/benchmarks/startup/cli-cold-start.bench.ts +327 -0
- package/benchmarks/startup/cli-warm-start.bench.ts +277 -0
- package/benchmarks/startup/mcp-server-init.bench.ts +380 -0
- package/docs/ATTENTION.md +277 -0
- package/package.json +29 -0
- package/src/attention-benchmarks.ts +459 -0
- package/src/attention-integration.ts +507 -0
- package/src/examples/flash-attention-demo.ts +160 -0
- package/src/examples/quick-test.ts +62 -0
- package/src/framework/benchmark.ts +583 -0
- package/src/index.ts +63 -0
- package/tmp.json +0 -0
- package/tsconfig.json +9 -0
- package/vitest.config.ts +31 -0
|
@@ -0,0 +1,569 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Attention Memory Efficiency Benchmark
|
|
3
|
+
*
|
|
4
|
+
* Target: 50-75% memory reduction
|
|
5
|
+
*
|
|
6
|
+
* Measures memory efficiency of different attention implementations
|
|
7
|
+
* and optimization strategies.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { benchmark, BenchmarkRunner, formatTime, formatBytes } from '../../src/framework/benchmark.js';
|
|
11
|
+
|
|
12
|
+
// ============================================================================
|
|
13
|
+
// Memory Tracking
|
|
14
|
+
// ============================================================================
|
|
15
|
+
|
|
16
|
+
interface MemorySnapshot {
|
|
17
|
+
heapUsed: number;
|
|
18
|
+
heapTotal: number;
|
|
19
|
+
external: number;
|
|
20
|
+
arrayBuffers: number;
|
|
21
|
+
rss: number;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
function takeMemorySnapshot(): MemorySnapshot {
|
|
25
|
+
const mem = process.memoryUsage();
|
|
26
|
+
return {
|
|
27
|
+
heapUsed: mem.heapUsed,
|
|
28
|
+
heapTotal: mem.heapTotal,
|
|
29
|
+
external: mem.external,
|
|
30
|
+
arrayBuffers: mem.arrayBuffers,
|
|
31
|
+
rss: mem.rss,
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
function calculateMemoryDelta(before: MemorySnapshot, after: MemorySnapshot): number {
|
|
36
|
+
return after.heapUsed - before.heapUsed;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// ============================================================================
|
|
40
|
+
// Attention Implementations for Memory Testing
|
|
41
|
+
// ============================================================================
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Standard attention - stores full attention matrix
|
|
45
|
+
*/
|
|
46
|
+
function standardAttention(
|
|
47
|
+
query: Float32Array,
|
|
48
|
+
key: Float32Array,
|
|
49
|
+
value: Float32Array,
|
|
50
|
+
seqLength: number,
|
|
51
|
+
headDim: number
|
|
52
|
+
): { output: Float32Array; attentionMatrix: Float32Array } {
|
|
53
|
+
const scale = 1 / Math.sqrt(headDim);
|
|
54
|
+
|
|
55
|
+
// Full attention matrix - O(n^2) memory
|
|
56
|
+
const attentionMatrix = new Float32Array(seqLength * seqLength);
|
|
57
|
+
|
|
58
|
+
// Compute scores
|
|
59
|
+
for (let i = 0; i < seqLength; i++) {
|
|
60
|
+
for (let j = 0; j < seqLength; j++) {
|
|
61
|
+
let dot = 0;
|
|
62
|
+
for (let k = 0; k < headDim; k++) {
|
|
63
|
+
dot += query[i * headDim + k]! * key[j * headDim + k]!;
|
|
64
|
+
}
|
|
65
|
+
attentionMatrix[i * seqLength + j] = dot * scale;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// Softmax
|
|
70
|
+
for (let i = 0; i < seqLength; i++) {
|
|
71
|
+
let max = -Infinity;
|
|
72
|
+
for (let j = 0; j < seqLength; j++) {
|
|
73
|
+
max = Math.max(max, attentionMatrix[i * seqLength + j]!);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
let sum = 0;
|
|
77
|
+
for (let j = 0; j < seqLength; j++) {
|
|
78
|
+
const exp = Math.exp(attentionMatrix[i * seqLength + j]! - max);
|
|
79
|
+
attentionMatrix[i * seqLength + j] = exp;
|
|
80
|
+
sum += exp;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
for (let j = 0; j < seqLength; j++) {
|
|
84
|
+
attentionMatrix[i * seqLength + j]! /= sum;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// Output
|
|
89
|
+
const output = new Float32Array(seqLength * headDim);
|
|
90
|
+
for (let i = 0; i < seqLength; i++) {
|
|
91
|
+
for (let k = 0; k < headDim; k++) {
|
|
92
|
+
let sum = 0;
|
|
93
|
+
for (let j = 0; j < seqLength; j++) {
|
|
94
|
+
sum += attentionMatrix[i * seqLength + j]! * value[j * headDim + k]!;
|
|
95
|
+
}
|
|
96
|
+
output[i * headDim + k] = sum;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
return { output, attentionMatrix };
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Memory-efficient attention - no full matrix storage
|
|
105
|
+
*/
|
|
106
|
+
function memoryEfficientAttention(
|
|
107
|
+
query: Float32Array,
|
|
108
|
+
key: Float32Array,
|
|
109
|
+
value: Float32Array,
|
|
110
|
+
seqLength: number,
|
|
111
|
+
headDim: number
|
|
112
|
+
): { output: Float32Array } {
|
|
113
|
+
const scale = 1 / Math.sqrt(headDim);
|
|
114
|
+
const output = new Float32Array(seqLength * headDim);
|
|
115
|
+
|
|
116
|
+
// Process row by row - O(n) memory for scores
|
|
117
|
+
const rowScores = new Float32Array(seqLength);
|
|
118
|
+
|
|
119
|
+
for (let i = 0; i < seqLength; i++) {
|
|
120
|
+
// Compute scores for this row
|
|
121
|
+
let max = -Infinity;
|
|
122
|
+
for (let j = 0; j < seqLength; j++) {
|
|
123
|
+
let dot = 0;
|
|
124
|
+
for (let k = 0; k < headDim; k++) {
|
|
125
|
+
dot += query[i * headDim + k]! * key[j * headDim + k]!;
|
|
126
|
+
}
|
|
127
|
+
rowScores[j] = dot * scale;
|
|
128
|
+
max = Math.max(max, rowScores[j]!);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
// Softmax
|
|
132
|
+
let sum = 0;
|
|
133
|
+
for (let j = 0; j < seqLength; j++) {
|
|
134
|
+
rowScores[j] = Math.exp(rowScores[j]! - max);
|
|
135
|
+
sum += rowScores[j]!;
|
|
136
|
+
}
|
|
137
|
+
for (let j = 0; j < seqLength; j++) {
|
|
138
|
+
rowScores[j]! /= sum;
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
// Compute output for this row
|
|
142
|
+
for (let k = 0; k < headDim; k++) {
|
|
143
|
+
let val = 0;
|
|
144
|
+
for (let j = 0; j < seqLength; j++) {
|
|
145
|
+
val += rowScores[j]! * value[j * headDim + k]!;
|
|
146
|
+
}
|
|
147
|
+
output[i * headDim + k] = val;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
return { output };
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Chunked attention - process in blocks
|
|
156
|
+
*/
|
|
157
|
+
function chunkedAttention(
|
|
158
|
+
query: Float32Array,
|
|
159
|
+
key: Float32Array,
|
|
160
|
+
value: Float32Array,
|
|
161
|
+
seqLength: number,
|
|
162
|
+
headDim: number,
|
|
163
|
+
chunkSize: number = 64
|
|
164
|
+
): { output: Float32Array } {
|
|
165
|
+
const scale = 1 / Math.sqrt(headDim);
|
|
166
|
+
const output = new Float32Array(seqLength * headDim);
|
|
167
|
+
const numChunks = Math.ceil(seqLength / chunkSize);
|
|
168
|
+
|
|
169
|
+
// Chunk buffer - O(chunkSize^2) memory
|
|
170
|
+
const chunkScores = new Float32Array(chunkSize * seqLength);
|
|
171
|
+
const rowMax = new Float32Array(chunkSize).fill(-Infinity);
|
|
172
|
+
const rowSum = new Float32Array(chunkSize).fill(0);
|
|
173
|
+
|
|
174
|
+
for (let ci = 0; ci < numChunks; ci++) {
|
|
175
|
+
const iStart = ci * chunkSize;
|
|
176
|
+
const iEnd = Math.min(iStart + chunkSize, seqLength);
|
|
177
|
+
const iSize = iEnd - iStart;
|
|
178
|
+
|
|
179
|
+
// Reset accumulators
|
|
180
|
+
rowMax.fill(-Infinity);
|
|
181
|
+
rowSum.fill(0);
|
|
182
|
+
output.fill(0, iStart * headDim, iEnd * headDim);
|
|
183
|
+
|
|
184
|
+
for (let cj = 0; cj < numChunks; cj++) {
|
|
185
|
+
const jStart = cj * chunkSize;
|
|
186
|
+
const jEnd = Math.min(jStart + chunkSize, seqLength);
|
|
187
|
+
const jSize = jEnd - jStart;
|
|
188
|
+
|
|
189
|
+
// Compute chunk scores
|
|
190
|
+
for (let i = 0; i < iSize; i++) {
|
|
191
|
+
for (let j = 0; j < jSize; j++) {
|
|
192
|
+
let dot = 0;
|
|
193
|
+
for (let k = 0; k < headDim; k++) {
|
|
194
|
+
dot += query[(iStart + i) * headDim + k]! * key[(jStart + j) * headDim + k]!;
|
|
195
|
+
}
|
|
196
|
+
chunkScores[i * seqLength + jStart + j] = dot * scale;
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// Online softmax update
|
|
201
|
+
for (let i = 0; i < iSize; i++) {
|
|
202
|
+
const prevMax = rowMax[i]!;
|
|
203
|
+
|
|
204
|
+
// Find new max
|
|
205
|
+
for (let j = 0; j < jSize; j++) {
|
|
206
|
+
rowMax[i] = Math.max(rowMax[i]!, chunkScores[i * seqLength + jStart + j]!);
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
// Rescale previous
|
|
210
|
+
if (prevMax !== -Infinity && prevMax !== rowMax[i]) {
|
|
211
|
+
const rescale = Math.exp(prevMax - rowMax[i]!);
|
|
212
|
+
rowSum[i]! *= rescale;
|
|
213
|
+
for (let k = 0; k < headDim; k++) {
|
|
214
|
+
output[(iStart + i) * headDim + k]! *= rescale;
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
// Add new exponentials
|
|
219
|
+
for (let j = 0; j < jSize; j++) {
|
|
220
|
+
const exp = Math.exp(chunkScores[i * seqLength + jStart + j]! - rowMax[i]!);
|
|
221
|
+
chunkScores[i * seqLength + jStart + j] = exp;
|
|
222
|
+
rowSum[i]! += exp;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// Accumulate output
|
|
226
|
+
for (let k = 0; k < headDim; k++) {
|
|
227
|
+
for (let j = 0; j < jSize; j++) {
|
|
228
|
+
output[(iStart + i) * headDim + k]! +=
|
|
229
|
+
chunkScores[i * seqLength + jStart + j]! * value[(jStart + j) * headDim + k]!;
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
// Final normalization
|
|
236
|
+
for (let i = 0; i < iSize; i++) {
|
|
237
|
+
for (let k = 0; k < headDim; k++) {
|
|
238
|
+
output[(iStart + i) * headDim + k]! /= rowSum[i]!;
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
return { output };
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
// ============================================================================
|
|
247
|
+
// Helper Functions
|
|
248
|
+
// ============================================================================
|
|
249
|
+
|
|
250
|
+
function generateRandomTensor(size: number): Float32Array {
|
|
251
|
+
const tensor = new Float32Array(size);
|
|
252
|
+
for (let i = 0; i < size; i++) {
|
|
253
|
+
tensor[i] = Math.random() * 2 - 1;
|
|
254
|
+
}
|
|
255
|
+
return tensor;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
// ============================================================================
|
|
259
|
+
// Benchmark Suite
|
|
260
|
+
// ============================================================================
|
|
261
|
+
|
|
262
|
+
/**
 * Runs the attention memory-efficiency benchmark suite and prints results.
 *
 * Flow:
 *  1. Measures heap deltas of standard vs memory-efficient vs chunked
 *     attention across several sequence lengths.
 *  2. Prints the theoretical O(n^2) / O(n) / O(chunk * n) memory analysis.
 *  3. Benchmarks runtime of each variant (and several chunk sizes) at
 *     seqLength = 512 via the shared BenchmarkRunner.
 *  4. Prints an analytical MHA vs GQA vs MQA memory comparison.
 *  5. Summarizes which sequence lengths met the 50% reduction target.
 *
 * NOTE(review): heap-delta measurements are only meaningful when Node is
 * started with --expose-gc (otherwise the global.gc() calls are skipped),
 * and `reduction` divides by standardMem, yielding NaN/Infinity if that
 * delta is zero or negative — confirm this is acceptable for reporting.
 */
export async function runMemoryEfficiencyBenchmarks(): Promise<void> {
  const runner = new BenchmarkRunner('Attention Memory Efficiency');

  console.log('\n--- Attention Memory Efficiency Benchmarks ---\n');

  // Test configurations
  const seqLengths = [128, 256, 512, 1024];
  const headDim = 64;

  // Memory scaling comparison
  console.log('--- Memory Scaling by Sequence Length ---\n');

  // Per-sequence-length measured heap deltas, kept for the final summary.
  const memoryResults: Array<{
    seqLength: number;
    standard: number;
    efficient: number;
    chunked: number;
    reduction: number;
  }> = [];

  for (const seqLength of seqLengths) {
    console.log(`Sequence Length: ${seqLength}`);

    const size = seqLength * headDim;
    const query = generateRandomTensor(size);
    const key = generateRandomTensor(size);
    const value = generateRandomTensor(size);

    // Standard attention memory
    // GC before each measurement so the delta reflects this call only
    // (no-op unless --expose-gc is set).
    if (typeof global.gc === 'function') global.gc();
    const standardBefore = takeMemorySnapshot();
    const standardResult = standardAttention(query, key, value, seqLength, headDim);
    const standardAfter = takeMemorySnapshot();
    const standardMem = calculateMemoryDelta(standardBefore, standardAfter);
    // NOTE(review): the void reference presumably keeps the result from
    // being collected before the snapshot above — confirm intent.
    void standardResult;

    // Memory-efficient attention
    if (typeof global.gc === 'function') global.gc();
    const efficientBefore = takeMemorySnapshot();
    const efficientResult = memoryEfficientAttention(query, key, value, seqLength, headDim);
    const efficientAfter = takeMemorySnapshot();
    const efficientMem = calculateMemoryDelta(efficientBefore, efficientAfter);
    void efficientResult;

    // Chunked attention
    if (typeof global.gc === 'function') global.gc();
    const chunkedBefore = takeMemorySnapshot();
    const chunkedResult = chunkedAttention(query, key, value, seqLength, headDim);
    const chunkedAfter = takeMemorySnapshot();
    const chunkedMem = calculateMemoryDelta(chunkedBefore, chunkedAfter);
    void chunkedResult;

    // Percentage saved by the efficient variant relative to standard.
    const reduction = ((standardMem - efficientMem) / standardMem) * 100;

    memoryResults.push({
      seqLength,
      standard: standardMem,
      efficient: efficientMem,
      chunked: chunkedMem,
      reduction,
    });

    console.log(` Standard: ${formatBytes(standardMem)}`);
    console.log(` Efficient: ${formatBytes(efficientMem)}`);
    console.log(` Chunked: ${formatBytes(chunkedMem)}`);
    console.log(` Reduction: ${reduction.toFixed(1)}%`);
    console.log('');
  }

  // Theoretical memory comparison
  console.log('--- Theoretical Memory Analysis ---\n');

  for (const seqLength of seqLengths) {
    const bytesPerFloat = 4;

    // Standard: stores full n x n attention matrix
    const standardTheory = seqLength * seqLength * bytesPerFloat;

    // Efficient: stores only one row at a time
    const efficientTheory = seqLength * bytesPerFloat;

    // Chunked: stores chunk x n scores
    const chunkSize = 64;
    const chunkedTheory = chunkSize * seqLength * bytesPerFloat;

    console.log(`Seq ${seqLength}:`);
    console.log(` Standard: ${formatBytes(standardTheory)} (n^2)`);
    console.log(` Efficient: ${formatBytes(efficientTheory)} (n)`);
    console.log(` Chunked: ${formatBytes(chunkedTheory)} (chunk * n)`);
    console.log(` Theoretical reduction: ${((1 - efficientTheory / standardTheory) * 100).toFixed(1)}%`);
    console.log('');
  }

  // Performance vs Memory tradeoff
  console.log('--- Performance vs Memory Tradeoff ---\n');

  const tradeoffConfig = { seqLength: 512, headDim: 64 };
  const size = tradeoffConfig.seqLength * tradeoffConfig.headDim;
  const q = generateRandomTensor(size);
  const k = generateRandomTensor(size);
  const v = generateRandomTensor(size);

  // Standard performance
  const standardPerfResult = await runner.run(
    'standard-attention-perf',
    async () => {
      standardAttention(q, k, v, tradeoffConfig.seqLength, tradeoffConfig.headDim);
    },
    { iterations: 20 }
  );

  console.log(`Standard Performance: ${formatTime(standardPerfResult.mean)}`);

  // Efficient performance
  const efficientPerfResult = await runner.run(
    'efficient-attention-perf',
    async () => {
      memoryEfficientAttention(q, k, v, tradeoffConfig.seqLength, tradeoffConfig.headDim);
    },
    { iterations: 20 }
  );

  console.log(`Memory-Efficient Performance: ${formatTime(efficientPerfResult.mean)}`);

  // Chunked performance with different chunk sizes
  const chunkSizes = [32, 64, 128, 256];

  for (const chunkSize of chunkSizes) {
    const chunkedPerfResult = await runner.run(
      `chunked-attention-chunk${chunkSize}`,
      async () => {
        chunkedAttention(q, k, v, tradeoffConfig.seqLength, tradeoffConfig.headDim, chunkSize);
      },
      { iterations: 20 }
    );

    console.log(`Chunked (size=${chunkSize}): ${formatTime(chunkedPerfResult.mean)}`);
  }

  // Multi-head memory analysis (purely analytical — nothing is allocated).
  console.log('\n--- Multi-Head Memory Analysis ---\n');

  const numHeads = [4, 8, 16, 32];
  const mhaSeqLength = 256;

  for (const heads of numHeads) {
    const mhaSize = mhaSeqLength * headDim;

    // Standard MHA memory
    const standardMHAMem = mhaSeqLength * mhaSeqLength * 4 * heads; // attention matrices
    const qkvMem = mhaSize * 4 * 3 * heads; // QKV storage

    // GQA memory (shared KV)
    // NOTE(review): assumes a fixed 4:1 query-to-KV-head grouping ratio.
    const gqaKVHeads = heads / 4;
    const gqaMem = mhaSeqLength * mhaSeqLength * 4 * heads + // attention matrices (same)
      mhaSize * 4 * heads + // Q storage
      mhaSize * 4 * 2 * gqaKVHeads; // shared KV

    // MQA memory (single KV)
    const mqaMem = mhaSeqLength * mhaSeqLength * 4 * heads + // attention matrices
      mhaSize * 4 * heads + // Q storage
      mhaSize * 4 * 2; // single KV

    console.log(`${heads} heads:`);
    console.log(` Standard MHA: ${formatBytes(standardMHAMem + qkvMem)}`);
    console.log(` GQA (${gqaKVHeads} KV): ${formatBytes(gqaMem)}`);
    console.log(` MQA (1 KV): ${formatBytes(mqaMem)}`);
    console.log(` MQA reduction: ${(((standardMHAMem + qkvMem) - mqaMem) / (standardMHAMem + qkvMem) * 100).toFixed(1)}%`);
    console.log('');
  }

  // Summary
  console.log('--- Summary ---\n');

  console.log('Memory Reduction Achieved:');
  for (const result of memoryResults) {
    const targetMet = result.reduction >= 50;
    console.log(
      ` Seq ${result.seqLength}: ${result.reduction.toFixed(1)}% ${targetMet ? '(TARGET MET)' : ''}`
    );
  }

  console.log('\nPerformance Comparison (seq=512):');
  console.log(` Standard: ${formatTime(standardPerfResult.mean)}`);
  console.log(` Efficient: ${formatTime(efficientPerfResult.mean)}`);

  // Print full results
  runner.printResults();
}
|
|
451
|
+
|
|
452
|
+
// ============================================================================
|
|
453
|
+
// Memory Efficiency Optimization Strategies
|
|
454
|
+
// ============================================================================
|
|
455
|
+
|
|
456
|
+
/**
 * Catalog of memory-optimization strategies for attention.
 *
 * Each entry pairs a short description and expected improvement with an
 * `implementation` field holding an illustrative code snippet as a plain
 * string — these snippets are documentation, not executable exports.
 */
export const memoryOptimizations = {
  /**
   * Online softmax computation
   */
  onlineSoftmax: {
    description: 'Compute softmax in streaming fashion without storing all values',
    expectedImprovement: 'O(n) instead of O(n^2) for softmax',
    implementation: `
class OnlineSoftmax {
  private max = -Infinity;
  private sum = 0;
  private count = 0;

  add(value: number): void {
    if (value > this.max) {
      this.sum *= Math.exp(this.max - value);
      this.max = value;
    }
    this.sum += Math.exp(value - this.max);
    this.count++;
  }

  normalize(value: number): number {
    return Math.exp(value - this.max) / this.sum;
  }
}
`,
  },

  /**
   * Gradient checkpointing
   */
  gradientCheckpointing: {
    description: 'Recompute attention during backward pass instead of storing',
    expectedImprovement: 'O(1) memory for activations',
    implementation: `
function checkpointedAttention(q, k, v) {
  const output = computeAttention(q, k, v);

  function backward(gradOutput) {
    // Recompute attention weights during backward
    const attnWeights = recomputeAttention(q, k);
    return computeGradients(gradOutput, attnWeights, q, k, v);
  }

  return { output, backward };
}
`,
  },

  /**
   * Sparse attention patterns
   */
  sparseAttention: {
    description: 'Only compute attention for relevant positions',
    expectedImprovement: 'O(n * k) instead of O(n^2) where k << n',
    implementation: `
function sparseAttention(q, k, v, pattern: 'local' | 'strided' | 'block') {
  const sparseMask = generateSparsePattern(q.length, pattern);
  return computeAttentionWithMask(q, k, v, sparseMask);
}
`,
  },

  /**
   * Quantization
   */
  quantization: {
    description: 'Use lower precision for attention computation',
    expectedImprovement: '2-4x memory reduction',
    implementation: `
function quantizedAttention(q, k, v) {
  // Quantize to int8
  const qInt8 = quantizeToInt8(q);
  const kInt8 = quantizeToInt8(k);

  // Compute in int8
  const scores = computeInt8Attention(qInt8, kInt8);

  // Dequantize for output
  return dequantizeAndApply(scores, v);
}
`,
  },

  /**
   * Memory pooling
   */
  memoryPooling: {
    description: 'Reuse memory buffers across forward passes',
    expectedImprovement: 'Eliminates allocation overhead',
    implementation: `
class AttentionMemoryPool {
  private scoreBuffer: Float32Array;
  private outputBuffer: Float32Array;

  forward(q, k, v) {
    // Reuse pre-allocated buffers
    computeScores(q, k, this.scoreBuffer);
    applySoftmax(this.scoreBuffer);
    computeOutput(this.scoreBuffer, v, this.outputBuffer);
    return this.outputBuffer;
  }
}
`,
  },
};
|
|
563
|
+
|
|
564
|
+
// Run the suite when this module is executed directly (node <file>), as
// opposed to being imported by another benchmark entry point.
// NOTE(review): this file-URL comparison can miss on Windows paths or
// symlinks — confirm if cross-platform direct execution matters here.
if (import.meta.url === `file://${process.argv[1]}`) {
  runMemoryEfficiencyBenchmarks().catch(console.error);
}

export default runMemoryEfficiencyBenchmarks;
|