webinfer 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -21
- package/dist/activation/index.d.ts +30 -0
- package/dist/core/context.d.ts +70 -0
- package/dist/core/paged-kv-cache.d.ts +33 -0
- package/dist/core/tensor.d.ts +51 -19
- package/dist/core/types.d.ts +27 -0
- package/dist/decode/index.d.ts +140 -0
- package/dist/gemm/index.d.ts +27 -0
- package/dist/index.d.ts +29 -21
- package/dist/index.js +3433 -4809
- package/dist/jit/index.d.ts +138 -0
- package/dist/kernels/activation.wgsl.d.ts +14 -0
- package/dist/kernels/batch-decode-paged.wgsl.d.ts +12 -0
- package/dist/kernels/batch-prefill-paged.wgsl.d.ts +13 -0
- package/dist/kernels/decode-attention.wgsl.d.ts +16 -0
- package/dist/kernels/gemm.wgsl.d.ts +17 -0
- package/dist/kernels/page.wgsl.d.ts +10 -0
- package/dist/kernels/prefill-attention.wgsl.d.ts +17 -0
- package/dist/kernels/rmsnorm.wgsl.d.ts +10 -0
- package/dist/kernels/rope.wgsl.d.ts +19 -0
- package/dist/kernels/sampling.wgsl.d.ts +23 -0
- package/dist/norm/index.d.ts +43 -0
- package/dist/page/index.d.ts +21 -0
- package/dist/prefill/index.d.ts +155 -0
- package/dist/rope/index.d.ts +37 -0
- package/dist/sampling/index.d.ts +53 -4
- package/package.json +1 -1
- package/dist/attention/block-sparse/format.d.ts +0 -52
- package/dist/attention/block-sparse/patterns/causal.d.ts +0 -16
- package/dist/attention/block-sparse/patterns/sliding.d.ts +0 -22
- package/dist/attention/block-sparse/patterns/tree.d.ts +0 -65
- package/dist/attention/cascaded-inference.d.ts +0 -29
- package/dist/attention/flash-attention.d.ts +0 -30
- package/dist/attention/index.d.ts +0 -118
- package/dist/attention/paged-attention.d.ts +0 -40
- package/dist/attention/paged-kv/block-manager.d.ts +0 -102
- package/dist/attention/paged-kv/index.d.ts +0 -5
- package/dist/attention/paged-kv/page-table.d.ts +0 -165
- package/dist/attention/scheduler.d.ts +0 -40
- package/dist/core/buffer-pool.d.ts +0 -18
- package/dist/core/device.d.ts +0 -23
- package/dist/core/tdr.d.ts +0 -114
- package/dist/inference/engine.d.ts +0 -69
- package/dist/inference/generate.d.ts +0 -30
- package/dist/inference/index.d.ts +0 -7
- package/dist/inference/types.d.ts +0 -161
- package/dist/jit/compiler.d.ts +0 -23
- package/dist/jit/kernel-cache.d.ts +0 -21
- package/dist/model/gguf.d.ts +0 -90
- package/dist/model/index.d.ts +0 -16
- package/dist/model/safetensors.d.ts +0 -38
- package/dist/model/types.d.ts +0 -182
- package/dist/ops/activations.d.ts +0 -43
- package/dist/ops/elementwise.d.ts +0 -38
- package/dist/ops/embedding.d.ts +0 -30
- package/dist/ops/matmul.d.ts +0 -21
- package/dist/ops/normalization.d.ts +0 -63
- package/dist/ops/reshape.d.ts +0 -39
- package/dist/ops/rope.d.ts +0 -32
- package/dist/ops/softmax.d.ts +0 -18
- package/dist/quantization/index.d.ts +0 -6
- package/dist/quantization/qmatmul.d.ts +0 -38
- package/dist/quantization/quantize.d.ts +0 -52
- package/dist/sampling/beam-search.d.ts +0 -87
- package/dist/sampling/sampler.d.ts +0 -72
- package/dist/sampling/speculative.d.ts +0 -65
- package/dist/sampling/top-k.d.ts +0 -24
- package/dist/sampling/top-p.d.ts +0 -14
- package/dist/tvm/adapter.d.ts +0 -81
- package/dist/tvm/index.d.ts +0 -8
- package/dist/tvm/ops.d.ts +0 -26
- package/dist/tvm/types.d.ts +0 -35
package/dist/attention/block-sparse/format.d.ts

```diff
@@ -1,52 +0,0 @@
-/**
- * Block-Sparse CSR Format
- * Enables a single kernel to support all attention variants
- */
-/**
- * Block-Sparse CSR representation of attention mask
- */
-export interface BlockSparseCSR {
-    blockSize: number;
-    rowPtr: Uint32Array;
-    colIdx: Uint32Array;
-    blockMask?: Uint8Array;
-    numRows: number;
-    numCols: number;
-    numBlockRows: number;
-    numBlockCols: number;
-    nnzBlocks: number;
-}
-/**
- * Attention pattern types
- */
-export type AttentionPattern = {
-    type: "dense";
-} | {
-    type: "causal";
-} | {
-    type: "sliding";
-    windowSize: number;
-} | {
-    type: "global-local";
-    globalTokens: number[];
-    localWindow: number;
-} | {
-    type: "custom";
-    mask: boolean[][];
-};
-/**
- * Build BS-CSR from attention pattern
- */
-export declare function buildBlockSparseCSR(seqLen: number, pattern: AttentionPattern, blockSize?: number): BlockSparseCSR;
-/**
- * Calculate sparsity ratio of the mask
- */
-export declare function getSparsityRatio(csr: BlockSparseCSR): number;
-/**
- * Estimate memory savings from sparsity
- */
-export declare function estimateMemorySavings(csr: BlockSparseCSR): {
-    denseBytes: number;
-    sparseBytes: number;
-    savingsRatio: number;
-};
```
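The removed declarations describe the BS-CSR mask layout. As an illustration of how the derived quantities relate to that interface, here is a sketch under an assumed f32 (4-byte) element size; it is not the removed webinfer implementation:

```ts
// Sketch: deriving sparsity and memory estimates from a BlockSparseCSR-shaped
// object. The f32 element size is an assumption for illustration.
interface BlockSparseCSR {
  blockSize: number;
  rowPtr: Uint32Array;  // length numBlockRows + 1
  colIdx: Uint32Array;  // length nnzBlocks
  numRows: number;
  numCols: number;
  numBlockRows: number;
  numBlockCols: number;
  nnzBlocks: number;
}

function getSparsityRatio(csr: BlockSparseCSR): number {
  const totalBlocks = csr.numBlockRows * csr.numBlockCols;
  return totalBlocks === 0 ? 0 : 1 - csr.nnzBlocks / totalBlocks;
}

function estimateMemorySavings(csr: BlockSparseCSR) {
  const bytesPerElement = 4; // f32 assumption
  const denseBytes = csr.numRows * csr.numCols * bytesPerElement;
  // Non-zero blocks plus the CSR index arrays themselves.
  const sparseBytes =
    csr.nnzBlocks * csr.blockSize * csr.blockSize * bytesPerElement +
    (csr.rowPtr.length + csr.colIdx.length) * Uint32Array.BYTES_PER_ELEMENT;
  return { denseBytes, sparseBytes, savingsRatio: 1 - sparseBytes / denseBytes };
}
```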
package/dist/attention/block-sparse/patterns/causal.d.ts

```diff
@@ -1,16 +0,0 @@
-/**
- * Causal (autoregressive) attention pattern
- * Used in GPT, Llama, and most decoder-only models
- */
-import { type BlockSparseCSR } from "../format.ts";
-/**
- * Build causal attention mask in BS-CSR format
- * Each query position can only attend to positions <= its own position
- */
-export declare function buildCausalMask(seqLen: number, blockSize?: number): BlockSparseCSR;
-/**
- * Get the theoretical sparsity of causal attention
- * For a sequence of length N, causal attention has N*(N+1)/2 non-zero elements
- * out of N*N total, giving ~50% sparsity for large N
- */
-export declare function getCausalSparsity(seqLen: number): number;
```
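The doc comment above fully specifies `getCausalSparsity`; a direct transcription of that formula (a sketch, though there is little room for the removed code to differ):

```ts
// N*(N+1)/2 non-zero entries out of N*N total, so sparsity tends to 50%
// as N grows, exactly as the removed doc comment states.
function getCausalSparsity(seqLen: number): number {
  if (seqLen === 0) return 0;
  const nonZero = (seqLen * (seqLen + 1)) / 2;
  return 1 - nonZero / (seqLen * seqLen);
}

console.log(getCausalSparsity(4096)); // ≈ 0.49988
```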
package/dist/attention/block-sparse/patterns/sliding.d.ts

```diff
@@ -1,22 +0,0 @@
-/**
- * Sliding window attention pattern
- * Used in Mistral and other efficient attention models
- */
-import { type BlockSparseCSR } from "../format.ts";
-/**
- * Build sliding window attention mask in BS-CSR format
- * Each query can only attend to the previous `windowSize` positions
- */
-export declare function buildSlidingWindowMask(seqLen: number, windowSize: number, blockSize?: number): BlockSparseCSR;
-/**
- * Get the theoretical sparsity of sliding window attention
- * For window size W and sequence length N:
- * - First W positions have triangular attention (like causal)
- * - Remaining N-W positions have W+1 attention each
- */
-export declare function getSlidingWindowSparsity(seqLen: number, windowSize: number): number;
-/**
- * Sliding window with causal constraint
- * This is what Mistral uses - combines sliding window with causal masking
- */
-export declare function buildCausalSlidingWindowMask(seqLen: number, windowSize: number, blockSize?: number): BlockSparseCSR;
```
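The sliding-window doc comment likewise pins down the count: W*(W+1)/2 entries for the first W (triangular) positions plus (N-W)*(W+1) for the rest. A sketch of that closed form, assuming this is the intended reading:

```ts
// First W positions attend triangularly (like causal); the remaining N-W
// positions each attend to W+1 entries (self plus the window behind them).
function getSlidingWindowSparsity(seqLen: number, windowSize: number): number {
  const N = seqLen;
  const W = Math.min(windowSize, N);
  const triangular = (W * (W + 1)) / 2; // positions 0..W-1
  const banded = (N - W) * (W + 1);     // positions W..N-1
  return 1 - (triangular + banded) / (N * N);
}

console.log(getSlidingWindowSparsity(8192, 1024)); // ≈ 0.883
```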
package/dist/attention/block-sparse/patterns/tree.d.ts

```diff
@@ -1,65 +0,0 @@
-/**
- * Tree Attention Pattern
- * Used in speculative decoding (Medusa, EAGLE) and tree-based generation
- */
-import { type BlockSparseCSR } from "../format.ts";
-/**
- * Tree node structure
- */
-export interface TreeNode {
-    /** Token position in sequence */
-    position: number;
-    /** Parent position (-1 for root) */
-    parent: number;
-    /** Depth in tree (0 for root) */
-    depth: number;
-}
-/**
- * Tree attention configuration
- */
-export interface TreeAttentionConfig {
-    /** Total sequence length including prompt */
-    seqLen: number;
-    /** Prompt length (prefix that all tokens attend to) */
-    promptLen: number;
-    /** Tree structure for speculative tokens */
-    tree: TreeNode[];
-    /** Block size for sparse format */
-    blockSize?: number;
-}
-/**
- * Build tree attention mask
- *
- * In tree attention:
- * - All tokens attend to the prompt (positions 0 to promptLen-1)
- * - Tree tokens attend to their ancestors in the tree
- * - Maintains causal property within the tree structure
- */
-export declare function buildTreeMask(config: TreeAttentionConfig): BlockSparseCSR;
-/**
- * Build a simple chain tree (linear speculation)
- * Each token depends on the previous one
- */
-export declare function buildChainTree(numSpecTokens: number): TreeNode[];
-/**
- * Build a wide tree (parallel speculation)
- * All speculative tokens depend only on the prompt
- */
-export declare function buildWideTree(numSpecTokens: number): TreeNode[];
-/**
- * Build a binary tree for speculation
- */
-export declare function buildBinaryTree(depth: number): TreeNode[];
-/**
- * Build Medusa-style tree
- * Multiple heads predict tokens at different positions
- */
-export declare function buildMedusaTree(numHeads: number, tokensPerHead: number): TreeNode[];
-/**
- * Calculate tree sparsity ratio
- */
-export declare function getTreeSparsity(config: TreeAttentionConfig): number;
-/**
- * Validate tree structure
- */
-export declare function validateTree(tree: TreeNode[]): boolean;
```
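As a sketch of the simplest of these builders, here is what a chain tree and the validity check plausibly look like given the `TreeNode` contract (parent is -1 for a root, depth is parent depth + 1, and positions are assumed to index the array). The removed implementations are not shown in this diff, so this is an assumption-labeled illustration:

```ts
interface TreeNode { position: number; parent: number; depth: number; }

// Linear speculation: token i depends on token i-1; the first token is a root.
function buildChainTree(numSpecTokens: number): TreeNode[] {
  return Array.from({ length: numSpecTokens }, (_, i) => ({
    position: i,
    parent: i - 1, // -1 for the root token
    depth: i,
  }));
}

// Checks the documented invariants: roots sit at depth 0, parents precede
// their children, and depth increases by exactly one per edge.
function validateTree(tree: TreeNode[]): boolean {
  return tree.every((node, i) => {
    if (node.parent === -1) return node.depth === 0;
    const parent = tree[node.parent];
    return node.parent < i && parent !== undefined && node.depth === parent.depth + 1;
  });
}

console.log(validateTree(buildChainTree(4))); // true
```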
package/dist/attention/cascaded-inference.d.ts

```diff
@@ -1,29 +0,0 @@
-/**
- * Cascaded Inference - TDR-safe attention for very long sequences
- * Splits attention computation across multiple passes with browser yields
- */
-import type { WebInferDevice } from "../core/device.ts";
-import { Tensor } from "../core/tensor.ts";
-export interface CascadedAttentionConfig {
-    numHeads: number;
-    headDim: number;
-    seqLen: number;
-    scale?: number;
-    causal?: boolean;
-}
-/**
- * Cascaded Attention - Safe for very long sequences
- * Uses Split-K strategy to prevent TDR (GPU timeout)
- *
- * @param device WebInfer device
- * @param q Query tensor [seqLen, numHeads, headDim]
- * @param k Key tensor [seqLen, numHeads, headDim]
- * @param v Value tensor [seqLen, numHeads, headDim]
- * @param config Attention configuration
- * @param onProgress Optional progress callback
- */
-export declare function cascadedAttention(device: WebInferDevice, q: Tensor, k: Tensor, v: Tensor, config: CascadedAttentionConfig, onProgress?: (chunk: number, total: number) => void): Promise<Tensor>;
-/**
- * CPU reference implementation for verification
- */
-export declare function cascadedAttentionCPU(q: Float32Array, k: Float32Array, v: Float32Array, seqLen: number, numHeads: number, headDim: number, causal?: boolean): Float32Array;
```
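The header describes the approach: split the attention pass into chunks and yield to the browser between them, merging partial softmax results as you go. A self-contained sketch of that split-K, online-softmax merge for a single query row; the mechanics are assumed from the description, not taken from the removed kernel:

```ts
// Online-softmax merge over key chunks: the running accumulator is rescaled
// whenever a chunk raises the running max, so the final result equals a
// single-pass softmax(scores)·V. Yields between chunks keep the thread live.
async function attendInChunks(
  scores: Float32Array,  // precomputed q·k scores for one query row
  v: Float32Array,       // [seqLen, headDim] values, row-major
  headDim: number,
  chunkSize: number,
  onProgress?: (chunk: number, total: number) => void,
): Promise<Float32Array> {
  const seqLen = scores.length;
  const out = new Float32Array(headDim);
  let runningMax = -Infinity;
  let runningSum = 0;
  const numChunks = Math.ceil(seqLen / chunkSize);
  for (let c = 0; c < numChunks; c++) {
    const start = c * chunkSize;
    const end = Math.min(start + chunkSize, seqLen);
    let chunkMax = -Infinity;
    for (let i = start; i < end; i++) chunkMax = Math.max(chunkMax, scores[i]);
    const newMax = Math.max(runningMax, chunkMax);
    const rescale = Math.exp(runningMax - newMax); // 0 on the first chunk
    for (let d = 0; d < headDim; d++) out[d] *= rescale;
    runningSum *= rescale;
    for (let i = start; i < end; i++) {
      const w = Math.exp(scores[i] - newMax);
      runningSum += w;
      for (let d = 0; d < headDim; d++) out[d] += w * v[i * headDim + d];
    }
    runningMax = newMax;
    onProgress?.(c + 1, numChunks);
    await new Promise((r) => setTimeout(r, 0)); // yield to the event loop
  }
  for (let d = 0; d < headDim; d++) out[d] /= runningSum;
  return out;
}
```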
package/dist/attention/flash-attention.d.ts

```diff
@@ -1,30 +0,0 @@
-/**
- * FlashAttention Implementation for WebGPU
- * Memory-efficient attention using online softmax and tiling
- */
-import type { WebInferDevice } from "../core/device.ts";
-import { Tensor } from "../core/tensor.ts";
-import type { AttentionPattern } from "./block-sparse/format.ts";
-export interface AttentionConfig {
-    numHeads: number;
-    headDim: number;
-    seqLen: number;
-    scale?: number;
-    pattern?: AttentionPattern;
-    blockSize?: number;
-}
-/**
- * FlashAttention forward pass
- * Computes: softmax(Q @ K^T / sqrt(d)) @ V
- *
- * @param device WebInfer device
- * @param q Query tensor [batch, seqLen, numHeads, headDim]
- * @param k Key tensor [batch, seqLen, numHeads, headDim]
- * @param v Value tensor [batch, seqLen, numHeads, headDim]
- * @param config Attention configuration
- */
-export declare function flashAttention(device: WebInferDevice, q: Tensor, k: Tensor, v: Tensor, config: AttentionConfig): Promise<Tensor>;
-/**
- * CPU reference implementation for verification
- */
-export declare function attentionCPU(q: Float32Array, k: Float32Array, v: Float32Array, seqLen: number, numHeads: number, headDim: number, causal?: boolean): Float32Array;
```
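For reference, the declared `attentionCPU` verifier computes plain softmax(Q·Kᵀ/√d)·V. A minimal sketch consistent with that signature, assuming a [seqLen, numHeads, headDim] row-major layout (the signature takes no batch dimension); the removed implementation may differ in detail:

```ts
// Naive reference attention: per head, score every (query, key) pair,
// softmax the row (optionally causal), and mix the values.
function attentionCPU(
  q: Float32Array, k: Float32Array, v: Float32Array,
  seqLen: number, numHeads: number, headDim: number, causal = false,
): Float32Array {
  const out = new Float32Array(seqLen * numHeads * headDim);
  const scale = 1 / Math.sqrt(headDim);
  const idx = (pos: number, h: number, d: number) =>
    (pos * numHeads + h) * headDim + d;
  for (let h = 0; h < numHeads; h++) {
    for (let i = 0; i < seqLen; i++) {
      const limit = causal ? i + 1 : seqLen; // causal: keys 0..i only
      const scores = new Float32Array(limit);
      let max = -Infinity;
      for (let j = 0; j < limit; j++) {
        let s = 0;
        for (let d = 0; d < headDim; d++) s += q[idx(i, h, d)] * k[idx(j, h, d)];
        scores[j] = s * scale;
        max = Math.max(max, scores[j]);
      }
      let sum = 0;
      for (let j = 0; j < limit; j++) {
        scores[j] = Math.exp(scores[j] - max); // max-subtracted for stability
        sum += scores[j];
      }
      for (let j = 0; j < limit; j++) {
        const w = scores[j] / sum;
        for (let d = 0; d < headDim; d++) out[idx(i, h, d)] += w * v[idx(j, h, d)];
      }
    }
  }
  return out;
}
```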
package/dist/attention/index.d.ts

```diff
@@ -1,118 +0,0 @@
-/**
- * Attention Module Exports
- */
-import type { WebInferDevice } from "../core/device.ts";
-import { Tensor } from "../core/tensor.ts";
-import type { BlockSparseCSR } from "./block-sparse/format.ts";
-import { PagedKVCache } from "./paged-kv/page-table.ts";
-/**
- * Tensor-like input types
- */
-export type TensorLike = Tensor | Float32Array | {
-    data: Float32Array;
-    shape: number[];
-};
-/**
- * Attention options
- */
-export interface AttentionOptions {
-    q: TensorLike;
-    k: TensorLike;
-    v: TensorLike;
-    causal?: boolean;
-    scale?: number;
-    window?: number;
-    mask?: BlockSparseCSR;
-    returnLse?: boolean;
-}
-/**
- * Attention result with optional LSE
- */
-export interface AttentionResultWithLse {
-    output: Tensor;
-    lse: Tensor;
-}
-/**
- * Simple attention function
- */
-export declare function attention(device: WebInferDevice, options: AttentionOptions): Promise<Tensor>;
-export declare function attention(device: WebInferDevice, options: AttentionOptions & {
-    returnLse: true;
-}): Promise<AttentionResultWithLse>;
-/**
- * BatchAttention configuration
- */
-export interface BatchAttentionConfig {
-    numHeads: number;
-    headDim: number;
-    numKvHeads?: number;
-    maxBatchSize?: number;
-    maxSeqLen?: number;
-}
-/**
- * Prefill input
- */
-export interface PrefillInput {
-    queries: TensorLike[];
-    keys: TensorLike[];
-    values: TensorLike[];
-    causal?: boolean;
-    window?: number;
-}
-/**
- * Decode input
- */
-export interface DecodeInput {
-    query: TensorLike;
-    kvCache: PagedKVCache;
-    seqIds: number[];
-}
-/**
- * BatchAttention - Batched attention for prefill and decode
- */
-export declare class BatchAttention {
-    private device;
-    private config;
-    constructor(device: WebInferDevice, config: BatchAttentionConfig);
-    /**
-     * Prefill: Process variable-length sequences
-     */
-    prefill(input: PrefillInput): Promise<Tensor[]>;
-    /**
-     * Decode: Single token per sequence with KV cache
-     */
-    decode(input: DecodeInput): Promise<Tensor>;
-    getConfig(): Required<BatchAttentionConfig>;
-    dispose(): void;
-}
-/**
- * AttentionKernel configuration
- */
-export interface AttentionKernelConfig {
-    numHeads: number;
-    headDim: number;
-    causal?: boolean;
-    blockSize?: number;
-}
-/**
- * AttentionKernel - Low-level compiled kernel
- */
-export declare class AttentionKernel {
-    private device;
-    private config;
-    private constructor();
-    static compile(device: WebInferDevice, config: AttentionKernelConfig): Promise<AttentionKernel>;
-    execute(input: {
-        q: Tensor;
-        k: Tensor;
-        v: Tensor;
-    }): Promise<Tensor>;
-    dispose(): void;
-}
-export { flashAttention, type AttentionConfig } from "./flash-attention.ts";
-export { buildBlockSparseCSR, getSparsityRatio, estimateMemorySavings, type BlockSparseCSR, type AttentionPattern, } from "./block-sparse/format.ts";
-export { buildCausalMask, getCausalSparsity } from "./block-sparse/patterns/causal.ts";
-export { buildSlidingWindowMask, buildCausalSlidingWindowMask, getSlidingWindowSparsity, } from "./block-sparse/patterns/sliding.ts";
-export { PagedKVCache, type PagedKVCacheConfig, type SequenceEntry, type DefragmentResult, BlockManager, ContinuousBatchScheduler, type BlockManagerConfig, type AllocationPolicy, type AllocationRequest, } from "./paged-kv/index.ts";
-export { pagedAttention, appendToPagedCache, type PagedAttentionConfig, type PagedAttentionInput, } from "./paged-attention.ts";
-export { cascadedAttention, type CascadedAttentionConfig, } from "./cascaded-inference.ts";
```
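Putting the removed 0.0.3 surface together, a hypothetical usage sketch. The import specifier assumes these symbols were re-exported from the package root, and the device placeholder is elided; the call shapes come from the declarations above but are not a documented example:

```ts
// Hypothetical use of the removed attention API. TensorLike accepts a plain
// Float32Array per the declarations above.
import { attention, BatchAttention } from "webinfer"; // root re-export assumed

declare const device: any; // stands in for a WebInferDevice (creation elided)
declare const q: Float32Array, k: Float32Array, v: Float32Array;

// One-shot attention with a causal mask.
const out = await attention(device, { q, k, v, causal: true });

// Batched prefill over variable-length sequences.
const batch = new BatchAttention(device, { numHeads: 8, headDim: 64 });
const outputs = await batch.prefill({ queries: [q], keys: [k], values: [v], causal: true });
batch.dispose();
```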
package/dist/attention/paged-attention.d.ts

```diff
@@ -1,40 +0,0 @@
-/**
- * Paged Attention Implementation for WebGPU
- * Efficient attention computation using paged KV cache (vLLM-style)
- */
-import type { WebInferDevice } from "../core/device.ts";
-import { Tensor } from "../core/tensor.ts";
-import type { PagedKVCache } from "./paged-kv/page-table.ts";
-export interface PagedAttentionConfig {
-    numHeads: number;
-    headDim: number;
-    scale?: number;
-}
-export interface PagedAttentionInput {
-    query: Tensor;
-    kvCache: PagedKVCache;
-    seqIds: number[];
-    positions: number[];
-}
-/**
- * Paged Attention forward pass for decoding
- * Computes attention against paged KV cache for single-token queries
- *
- * @param device WebInfer device
- * @param input Paged attention input (query, kv cache, sequence info)
- * @param config Attention configuration
- */
-export declare function pagedAttention(device: WebInferDevice, input: PagedAttentionInput, config: PagedAttentionConfig): Promise<Tensor>;
-/**
- * Append new KV to paged cache
- */
-export declare function appendToPagedCache(device: WebInferDevice, kvCache: PagedKVCache, seqId: number, key: Tensor, // [numHeads, headDim]
-value: Tensor): Promise<void>;
-/**
- * CPU reference implementation for verification
- */
-export declare function pagedAttentionCPU(q: Float32Array, // [batchSize, numHeads, headDim]
-keyCache: Float32Array, // [maxPages, pageSize, numHeads, headDim]
-valueCache: Float32Array, pageTable: number[][], // [batchSize][pages]
-seqLens: number[], // [batchSize]
-numHeads: number, headDim: number, pageSize: number, maxPages: number): Float32Array;
```
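The commented array shapes in `pagedAttentionCPU` imply the indexing below: a token's KV entry lives at the physical page the table maps its logical page to, plus an in-page slot. A sketch of that address computation (the helper name is illustrative, not from the package):

```ts
// Flat offset of one token's [numHeads, headDim] KV block inside a
// [maxPages, pageSize, numHeads, headDim] cache array.
function kvOffset(
  pageTable: number[][],  // [batchSize][pages], logical -> physical
  seq: number,
  position: number,
  pageSize: number,
  numHeads: number,
  headDim: number,
): number {
  const physicalPage = pageTable[seq][Math.floor(position / pageSize)];
  const slot = position % pageSize;
  return (physicalPage * pageSize + slot) * numHeads * headDim;
}

// e.g. pageSize 16: position 37 falls in logical page 2, slot 5.
console.log(kvOffset([[7, 2, 9]], 0, 37, 16, 8, 64)); // offset of page 9, slot 5
```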
package/dist/attention/paged-kv/block-manager.d.ts

```diff
@@ -1,102 +0,0 @@
-/**
- * Block Manager - High-level memory management for PagedKVCache
- */
-import type { WebInferDevice } from "../../core/device.ts";
-import { PagedKVCache, type PagedKVCacheConfig } from "./page-table.ts";
-/**
- * Block allocation policy
- */
-export type AllocationPolicy = "greedy" | "best-fit" | "first-fit";
-/**
- * Block Manager configuration
- */
-export interface BlockManagerConfig extends PagedKVCacheConfig {
-    policy?: AllocationPolicy;
-    reservedPages?: number;
-}
-/**
- * Request for KV cache allocation
- */
-export interface AllocationRequest {
-    seqId?: number;
-    numTokens: number;
-    priority?: number;
-}
-/**
- * Block Manager - Manages KV cache allocation across multiple sequences
- */
-export declare class BlockManager {
-    private cache;
-    private config;
-    private priorities;
-    constructor(device: WebInferDevice, config: BlockManagerConfig);
-    /**
-     * Check if allocation is possible
-     */
-    canAllocate(request: AllocationRequest): boolean;
-    /**
-     * Allocate or extend a sequence
-     */
-    allocate(request: AllocationRequest): number;
-    /**
-     * Free a sequence
-     */
-    free(seqId: number): void;
-    /**
-     * Evict lowest priority sequences until we have enough free pages
-     */
-    evict(neededPages: number): number[];
-    /**
-     * Get memory utilization
-     */
-    getUtilization(): number;
-    /**
-     * Get the underlying cache
-     */
-    getCache(): PagedKVCache;
-    /**
-     * Get statistics
-     */
-    getStats(): {
-        totalPages: number;
-        usedPages: number;
-        freePages: number;
-        numSequences: number;
-        memoryUsedBytes: number;
-        memoryTotalBytes: number;
-    };
-    /**
-     * Dispose resources
-     */
-    dispose(): void;
-}
-/**
- * Scheduler for continuous batching with PagedAttention
- */
-export declare class ContinuousBatchScheduler {
-    private blockManager;
-    private runningSequences;
-    private waitingQueue;
-    constructor(blockManager: BlockManager);
-    /**
-     * Add a new request to the scheduler
-     */
-    addRequest(request: AllocationRequest): void;
-    /**
-     * Complete a sequence
-     */
-    completeSequence(seqId: number): void;
-    /**
-     * Extend a running sequence
-     */
-    extendSequence(seqId: number, numNewTokens: number): boolean;
-    private scheduleWaiting;
-    /**
-     * Get running sequence count
-     */
-    getRunningCount(): number;
-    /**
-     * Get waiting request count
-     */
-    getWaitingCount(): number;
-}
```
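The `evict` contract above ("free lowest-priority sequences until we have enough free pages") suggests logic along these lines. A standalone sketch; the data structures are assumed stand-ins for the class's private state, not the removed code:

```ts
// Pick eviction victims in ascending priority order until the requested
// number of pages would be free; return [] if even evicting everything
// would not satisfy the request.
function planEviction(
  priorities: Map<number, number>,  // seqId -> priority
  pagesHeld: Map<number, number>,   // seqId -> pages currently in use
  freePages: number,
  neededPages: number,
): number[] {
  const victims: number[] = [];
  const byPriority = [...priorities.entries()].sort((a, b) => a[1] - b[1]);
  for (const [seqId] of byPriority) {
    if (freePages >= neededPages) break;
    freePages += pagesHeld.get(seqId) ?? 0;
    victims.push(seqId);
  }
  return freePages >= neededPages ? victims : [];
}
```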
package/dist/attention/paged-kv/index.d.ts

```diff
@@ -1,5 +0,0 @@
-/**
- * PagedKV Module Exports
- */
-export { type AllocationPolicy, type AllocationRequest, BlockManager, type BlockManagerConfig, ContinuousBatchScheduler, } from "./block-manager.ts";
-export { type DefragmentResult, PagedKVCache, type PagedKVCacheConfig, type SequenceEntry, } from "./page-table.ts";
```
package/dist/attention/paged-kv/page-table.d.ts

```diff
@@ -1,165 +0,0 @@
-/**
- * Paged KV Cache - Software page table for efficient memory management
- * Inspired by vLLM's PagedAttention
- */
-import type { WebInferDevice } from "../../core/device.ts";
-/**
- * Configuration for PagedKVCache
- */
-export interface PagedKVCacheConfig {
-    numLayers: number;
-    numHeads: number;
-    headDim: number;
-    pageSize: number;
-    maxPages: number;
-    dtype?: "f32" | "f16";
-}
-/**
- * Entry in the page table for a sequence
- */
-export interface SequenceEntry {
-    seqId: number;
-    pages: number[];
-    length: number;
-}
-/**
- * PagedKVCache - Manages KV cache with paging for efficient memory use
- *
- * Benefits:
- * 1. No memory fragmentation - pages are fixed size
- * 2. Efficient memory sharing - multiple sequences can share cache
- * 3. Dynamic allocation - only allocate pages as needed
- * 4. Easy defragmentation - just remap logical to physical pages
- */
-export declare class PagedKVCache {
-    private device;
-    private config;
-    private keyCache;
-    private valueCache;
-    private pageTable;
-    private freePages;
-    private nextSeqId;
-    constructor(device: WebInferDevice, config: PagedKVCacheConfig);
-    /**
-     * Allocate pages for a new sequence
-     */
-    allocateSequence(initialLength?: number): number;
-    /**
-     * Extend a sequence with new tokens
-     */
-    extendSequence(seqId: number, numNewTokens: number): void;
-    /**
-     * Free a sequence and its pages
-     */
-    freeSequence(seqId: number): void;
-    /**
-     * Get page indices for a sequence
-     */
-    getSequencePages(seqId: number): number[] | null;
-    /**
-     * Get sequence length
-     */
-    getSequenceLength(seqId: number): number;
-    /**
-     * Get the physical page index for a given sequence position
-     */
-    getPageForPosition(seqId: number, position: number): number | null;
-    /**
-     * Get offset within a page for a given position
-     */
-    getOffsetInPage(position: number): number;
-    private allocatePage;
-    private freePage;
-    /**
-     * Get cache statistics
-     */
-    getStats(): {
-        totalPages: number;
-        usedPages: number;
-        freePages: number;
-        numSequences: number;
-        memoryUsedBytes: number;
-        memoryTotalBytes: number;
-    };
-    /**
-     * Get GPU buffers for kernel binding
-     */
-    getBuffers(): {
-        keyCache: GPUBuffer;
-        valueCache: GPUBuffer;
-    };
-    /**
-     * Get configuration
-     */
-    getConfig(): PagedKVCacheConfig;
-    /**
-     * Dispose GPU resources
-     */
-    dispose(): void;
-    /**
-     * Allocate a new sequence (v2 alias)
-     */
-    alloc(initialLength?: number): number;
-    /**
-     * Append KV to sequence (v2 API)
-     */
-    append(seqId: number, kv: {
-        key: Float32Array;
-        value: Float32Array;
-        layer?: number;
-    }): void;
-    /**
-     * Batch append KV for all layers (v2 API)
-     */
-    appendBatch(seqId: number, kv: {
-        keys: Float32Array;
-        values: Float32Array;
-    }): void;
-    /**
-     * Free sequence (v2 alias)
-     */
-    free(seqId: number): void;
-    /**
-     * Get stats (v2 API)
-     */
-    stats(): {
-        usedPages: number;
-        freePages: number;
-        fragmentation: number;
-    };
-    /**
-     * Check if defrag needed (v2 alias)
-     */
-    needsDefrag(threshold?: number): boolean;
-    /**
-     * Check if defragmentation is needed
-     * Returns fragmentation ratio (0 = no fragmentation, 1 = fully fragmented)
-     */
-    getFragmentationRatio(): number;
-    /**
-     * Check if defragmentation would be beneficial
-     */
-    needsDefragmentation(threshold?: number): boolean;
-    /**
-     * Defragment the KV cache by compacting pages
-     * Returns the number of pages moved
-     */
-    defragment(): Promise<DefragmentResult>;
-    /**
-     * Move a single page from one location to another
-     */
-    private movePage;
-    /**
-     * Rebuild the free page list from scratch
-     */
-    private rebuildFreePageList;
-}
-/**
- * Result of defragmentation operation
- */
-export interface DefragmentResult {
-    pagesMoved: number;
-    durationMs: number;
-    fragmentationBefore: number;
-    fragmentationAfter: number;
-}
```
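The `getPageForPosition`/`getOffsetInPage` pair is plain page arithmetic over a sequence's `pages` list (as in `SequenceEntry` above); a free-standing sketch:

```ts
// Logical position -> physical page: divide by pageSize to find the logical
// page, then look it up in the sequence's page list.
function getPageForPosition(pages: number[], position: number, pageSize: number): number | null {
  const logical = Math.floor(position / pageSize);
  return logical < pages.length ? pages[logical] : null;
}

// Offset within the page is just the remainder.
function getOffsetInPage(position: number, pageSize: number): number {
  return position % pageSize;
}

// e.g. pageSize 16, pages [7, 2, 9]: position 37 -> physical page 9, offset 5
console.log(getPageForPosition([7, 2, 9], 37, 16), getOffsetInPage(37, 16));
```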
package/dist/attention/scheduler.d.ts

```diff
@@ -1,40 +0,0 @@
-/**
- * Attention Scheduler - Prevents TDR (GPU timeout) by splitting long sequences
- */
-import type { WebInferDevice } from "../core/device.ts";
-export interface ChunkPlan {
-    numChunks: number;
-    chunkSize: number;
-    estimatedTimeMs: number;
-}
-/**
- * Attention Scheduler for TDR prevention
- * Splits long sequences into chunks to avoid GPU timeout
- */
-export declare class AttentionScheduler {
-    private device;
-    private tdrLimit;
-    constructor(device: WebInferDevice);
-    private detectTDRLimit;
-    /**
-     * Estimate execution time for attention operation
-     * Based on empirical formula: time ∝ seqLen² × numHeads × headDim
-     */
-    estimateExecutionTime(seqLen: number, numHeads: number, headDim: number): number;
-    /**
-     * Compute chunk plan for given sequence length
-     */
-    computeChunkPlan(seqLen: number, numHeads: number, headDim: number): ChunkPlan;
-    /**
-     * Yield to main thread to prevent TDR
-     */
-    yieldToMain(): Promise<void>;
-    /**
-     * Check if sequence might cause TDR
-     */
-    mightCauseTDR(seqLen: number, numHeads: number, headDim: number): boolean;
-    /**
-     * Get recommended maximum sequence length for single-pass execution
-     */
-    getMaxSinglePassSeqLen(numHeads: number, headDim: number): number;
-}
```
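The scheduler's doc comments give the cost model (time ∝ seqLen² × numHeads × headDim); a sketch of how a `ChunkPlan` could fall out of it. The per-operation constant and the safety margin below are assumptions for illustration, not webinfer's calibrated values:

```ts
// Hypothetical cost constant; a real scheduler would calibrate this per
// device (the removed class has a private detectTDRLimit for that).
const COST_PER_MAC_MS = 5e-9;

function estimateExecutionTime(seqLen: number, numHeads: number, headDim: number): number {
  return seqLen * seqLen * numHeads * headDim * COST_PER_MAC_MS;
}

function computeChunkPlan(
  seqLen: number, numHeads: number, headDim: number,
  tdrLimitMs = 1000, // typical desktop GPU watchdog budget, assumed
) {
  const estimatedTimeMs = estimateExecutionTime(seqLen, numHeads, headDim);
  // Keep each chunk well under the watchdog limit (50% margin, assumed).
  const numChunks = Math.max(1, Math.ceil(estimatedTimeMs / (tdrLimitMs / 2)));
  return { numChunks, chunkSize: Math.ceil(seqLen / numChunks), estimatedTimeMs };
}

console.log(computeChunkPlan(32768, 32, 128)); // long sequence -> several chunks
```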