webinfer 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/README.md +65 -21
  2. package/dist/activation/index.d.ts +30 -0
  3. package/dist/core/context.d.ts +70 -0
  4. package/dist/core/paged-kv-cache.d.ts +33 -0
  5. package/dist/core/tensor.d.ts +51 -19
  6. package/dist/core/types.d.ts +27 -0
  7. package/dist/decode/index.d.ts +140 -0
  8. package/dist/gemm/index.d.ts +27 -0
  9. package/dist/index.d.ts +29 -21
  10. package/dist/index.js +3433 -4809
  11. package/dist/jit/index.d.ts +138 -0
  12. package/dist/kernels/activation.wgsl.d.ts +14 -0
  13. package/dist/kernels/batch-decode-paged.wgsl.d.ts +12 -0
  14. package/dist/kernels/batch-prefill-paged.wgsl.d.ts +13 -0
  15. package/dist/kernels/decode-attention.wgsl.d.ts +16 -0
  16. package/dist/kernels/gemm.wgsl.d.ts +17 -0
  17. package/dist/kernels/page.wgsl.d.ts +10 -0
  18. package/dist/kernels/prefill-attention.wgsl.d.ts +17 -0
  19. package/dist/kernels/rmsnorm.wgsl.d.ts +10 -0
  20. package/dist/kernels/rope.wgsl.d.ts +19 -0
  21. package/dist/kernels/sampling.wgsl.d.ts +23 -0
  22. package/dist/norm/index.d.ts +43 -0
  23. package/dist/page/index.d.ts +21 -0
  24. package/dist/prefill/index.d.ts +155 -0
  25. package/dist/rope/index.d.ts +37 -0
  26. package/dist/sampling/index.d.ts +53 -4
  27. package/package.json +1 -1
  28. package/dist/attention/block-sparse/format.d.ts +0 -52
  29. package/dist/attention/block-sparse/patterns/causal.d.ts +0 -16
  30. package/dist/attention/block-sparse/patterns/sliding.d.ts +0 -22
  31. package/dist/attention/block-sparse/patterns/tree.d.ts +0 -65
  32. package/dist/attention/cascaded-inference.d.ts +0 -29
  33. package/dist/attention/flash-attention.d.ts +0 -30
  34. package/dist/attention/index.d.ts +0 -118
  35. package/dist/attention/paged-attention.d.ts +0 -40
  36. package/dist/attention/paged-kv/block-manager.d.ts +0 -102
  37. package/dist/attention/paged-kv/index.d.ts +0 -5
  38. package/dist/attention/paged-kv/page-table.d.ts +0 -165
  39. package/dist/attention/scheduler.d.ts +0 -40
  40. package/dist/core/buffer-pool.d.ts +0 -18
  41. package/dist/core/device.d.ts +0 -23
  42. package/dist/core/tdr.d.ts +0 -114
  43. package/dist/inference/engine.d.ts +0 -69
  44. package/dist/inference/generate.d.ts +0 -30
  45. package/dist/inference/index.d.ts +0 -7
  46. package/dist/inference/types.d.ts +0 -161
  47. package/dist/jit/compiler.d.ts +0 -23
  48. package/dist/jit/kernel-cache.d.ts +0 -21
  49. package/dist/model/gguf.d.ts +0 -90
  50. package/dist/model/index.d.ts +0 -16
  51. package/dist/model/safetensors.d.ts +0 -38
  52. package/dist/model/types.d.ts +0 -182
  53. package/dist/ops/activations.d.ts +0 -43
  54. package/dist/ops/elementwise.d.ts +0 -38
  55. package/dist/ops/embedding.d.ts +0 -30
  56. package/dist/ops/matmul.d.ts +0 -21
  57. package/dist/ops/normalization.d.ts +0 -63
  58. package/dist/ops/reshape.d.ts +0 -39
  59. package/dist/ops/rope.d.ts +0 -32
  60. package/dist/ops/softmax.d.ts +0 -18
  61. package/dist/quantization/index.d.ts +0 -6
  62. package/dist/quantization/qmatmul.d.ts +0 -38
  63. package/dist/quantization/quantize.d.ts +0 -52
  64. package/dist/sampling/beam-search.d.ts +0 -87
  65. package/dist/sampling/sampler.d.ts +0 -72
  66. package/dist/sampling/speculative.d.ts +0 -65
  67. package/dist/sampling/top-k.d.ts +0 -24
  68. package/dist/sampling/top-p.d.ts +0 -14
  69. package/dist/tvm/adapter.d.ts +0 -81
  70. package/dist/tvm/index.d.ts +0 -8
  71. package/dist/tvm/ops.d.ts +0 -26
  72. package/dist/tvm/types.d.ts +0 -35
package/dist/attention/block-sparse/format.d.ts
@@ -1,52 +0,0 @@
-/**
- * Block-Sparse CSR Format
- * Enables a single kernel to support all attention variants
- */
-/**
- * Block-Sparse CSR representation of attention mask
- */
-export interface BlockSparseCSR {
-    blockSize: number;
-    rowPtr: Uint32Array;
-    colIdx: Uint32Array;
-    blockMask?: Uint8Array;
-    numRows: number;
-    numCols: number;
-    numBlockRows: number;
-    numBlockCols: number;
-    nnzBlocks: number;
-}
-/**
- * Attention pattern types
- */
-export type AttentionPattern = {
-    type: "dense";
-} | {
-    type: "causal";
-} | {
-    type: "sliding";
-    windowSize: number;
-} | {
-    type: "global-local";
-    globalTokens: number[];
-    localWindow: number;
-} | {
-    type: "custom";
-    mask: boolean[][];
-};
-/**
- * Build BS-CSR from attention pattern
- */
-export declare function buildBlockSparseCSR(seqLen: number, pattern: AttentionPattern, blockSize?: number): BlockSparseCSR;
-/**
- * Calculate sparsity ratio of the mask
- */
-export declare function getSparsityRatio(csr: BlockSparseCSR): number;
-/**
- * Estimate memory savings from sparsity
- */
-export declare function estimateMemorySavings(csr: BlockSparseCSR): {
-    denseBytes: number;
-    sparseBytes: number;
-    savingsRatio: number;
-};
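For reference, a minimal sketch of how these removed 0.0.3 helpers composed, based solely on the declarations above (the bare `webinfer` import specifier is an assumption; the published entry point may have differed):

```ts
import {
  buildBlockSparseCSR,
  getSparsityRatio,
  estimateMemorySavings,
  type AttentionPattern,
} from "webinfer"; // import path assumed

const pattern: AttentionPattern = { type: "sliding", windowSize: 256 };

// Build a block-sparse CSR mask for a 4096-token sequence with 64x64 blocks.
const csr = buildBlockSparseCSR(4096, pattern, 64);

console.log(`non-zero blocks: ${csr.nnzBlocks} / ${csr.numBlockRows * csr.numBlockCols}`);
console.log(`sparsity: ${(getSparsityRatio(csr) * 100).toFixed(1)}%`);

const { denseBytes, sparseBytes, savingsRatio } = estimateMemorySavings(csr);
console.log(`dense ${denseBytes} B vs sparse ${sparseBytes} B (savings ratio ${savingsRatio})`);
```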
package/dist/attention/block-sparse/patterns/causal.d.ts
@@ -1,16 +0,0 @@
-/**
- * Causal (autoregressive) attention pattern
- * Used in GPT, Llama, and most decoder-only models
- */
-import { type BlockSparseCSR } from "../format.ts";
-/**
- * Build causal attention mask in BS-CSR format
- * Each query position can only attend to positions <= its own position
- */
-export declare function buildCausalMask(seqLen: number, blockSize?: number): BlockSparseCSR;
-/**
- * Get the theoretical sparsity of causal attention
- * For a sequence of length N, causal attention has N*(N+1)/2 non-zero elements
- * out of N*N total, giving ~50% sparsity for large N
- */
-export declare function getCausalSparsity(seqLen: number): number;
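The sparsity figure in the removed doc comment is easy to verify directly; a sketch of that arithmetic (not the library's actual implementation):

```ts
// Causal attention keeps N*(N+1)/2 of the N*N score entries (lower triangle
// including the diagonal), so sparsity approaches 1/2 as N grows.
function causalSparsity(seqLen: number): number {
  const nonZero = (seqLen * (seqLen + 1)) / 2;
  return 1 - nonZero / (seqLen * seqLen);
}

console.log(causalSparsity(16));   // 0.46875
console.log(causalSparsity(4096)); // ~0.49988 -> "~50% sparsity for large N"
```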
package/dist/attention/block-sparse/patterns/sliding.d.ts
@@ -1,22 +0,0 @@
-/**
- * Sliding window attention pattern
- * Used in Mistral and other efficient attention models
- */
-import { type BlockSparseCSR } from "../format.ts";
-/**
- * Build sliding window attention mask in BS-CSR format
- * Each query can only attend to the previous `windowSize` positions
- */
-export declare function buildSlidingWindowMask(seqLen: number, windowSize: number, blockSize?: number): BlockSparseCSR;
-/**
- * Get the theoretical sparsity of sliding window attention
- * For window size W and sequence length N:
- * - First W positions have triangular attention (like causal)
- * - Remaining N-W positions have W+1 attention each
- */
-export declare function getSlidingWindowSparsity(seqLen: number, windowSize: number): number;
-/**
- * Sliding window with causal constraint
- * This is what Mistral uses - combines sliding window with causal masking
- */
-export declare function buildCausalSlidingWindowMask(seqLen: number, windowSize: number, blockSize?: number): BlockSparseCSR;
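The two-part count in the removed comment works out as follows; again a sketch of the stated formula, not the removed implementation:

```ts
// Non-zeros per the comment above: the first W rows are triangular
// (1 + 2 + ... + W), and each of the remaining N - W rows attends to its
// own position plus the W before it (W + 1 entries).
function slidingWindowSparsity(seqLen: number, windowSize: number): number {
  const triangular = (windowSize * (windowSize + 1)) / 2;
  const banded = (seqLen - windowSize) * (windowSize + 1);
  return 1 - (triangular + banded) / (seqLen * seqLen);
}

// Mistral-like setting: 32k tokens, 4k window -> ~0.883 sparsity,
// i.e. only ~11.7% of the dense score matrix is ever computed.
console.log(slidingWindowSparsity(32768, 4096));
```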
package/dist/attention/block-sparse/patterns/tree.d.ts
@@ -1,65 +0,0 @@
-/**
- * Tree Attention Pattern
- * Used in speculative decoding (Medusa, EAGLE) and tree-based generation
- */
-import { type BlockSparseCSR } from "../format.ts";
-/**
- * Tree node structure
- */
-export interface TreeNode {
-    /** Token position in sequence */
-    position: number;
-    /** Parent position (-1 for root) */
-    parent: number;
-    /** Depth in tree (0 for root) */
-    depth: number;
-}
-/**
- * Tree attention configuration
- */
-export interface TreeAttentionConfig {
-    /** Total sequence length including prompt */
-    seqLen: number;
-    /** Prompt length (prefix that all tokens attend to) */
-    promptLen: number;
-    /** Tree structure for speculative tokens */
-    tree: TreeNode[];
-    /** Block size for sparse format */
-    blockSize?: number;
-}
-/**
- * Build tree attention mask
- *
- * In tree attention:
- * - All tokens attend to the prompt (positions 0 to promptLen-1)
- * - Tree tokens attend to their ancestors in the tree
- * - Maintains causal property within the tree structure
- */
-export declare function buildTreeMask(config: TreeAttentionConfig): BlockSparseCSR;
-/**
- * Build a simple chain tree (linear speculation)
- * Each token depends on the previous one
- */
-export declare function buildChainTree(numSpecTokens: number): TreeNode[];
-/**
- * Build a wide tree (parallel speculation)
- * All speculative tokens depend only on the prompt
- */
-export declare function buildWideTree(numSpecTokens: number): TreeNode[];
-/**
- * Build a binary tree for speculation
- */
-export declare function buildBinaryTree(depth: number): TreeNode[];
-/**
- * Build Medusa-style tree
- * Multiple heads predict tokens at different positions
- */
-export declare function buildMedusaTree(numHeads: number, tokensPerHead: number): TreeNode[];
-/**
- * Calculate tree sparsity ratio
- */
-export declare function getTreeSparsity(config: TreeAttentionConfig): number;
-/**
- * Validate tree structure
- */
-export declare function validateTree(tree: TreeNode[]): boolean;
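A sketch of the tree shapes the removed builders describe, using the `TreeNode` layout above. The node positions are illustrative only: whether the builders used absolute or prompt-relative indexing is not recoverable from the declarations, and the import path is an assumption.

```ts
import { buildTreeMask, type TreeNode } from "webinfer"; // import path assumed

// Chain tree (what buildChainTree(3) plausibly returns): linear speculation,
// each token depends on the previous one.
const chain: TreeNode[] = [
  { position: 0, parent: -1, depth: 0 },
  { position: 1, parent: 0, depth: 1 },
  { position: 2, parent: 1, depth: 2 },
];

// Wide tree (buildWideTree(3)): parallel speculation, every speculative
// token hangs directly off the prompt, so none attend to each other.
const wide: TreeNode[] = [
  { position: 0, parent: -1, depth: 0 },
  { position: 1, parent: -1, depth: 0 },
  { position: 2, parent: -1, depth: 0 },
];

// Either tree plugs into buildTreeMask via TreeAttentionConfig.
const promptLen = 128;
const mask = buildTreeMask({ seqLen: promptLen + chain.length, promptLen, tree: chain });
```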
package/dist/attention/cascaded-inference.d.ts
@@ -1,29 +0,0 @@
-/**
- * Cascaded Inference - TDR-safe attention for very long sequences
- * Splits attention computation across multiple passes with browser yields
- */
-import type { WebInferDevice } from "../core/device.ts";
-import { Tensor } from "../core/tensor.ts";
-export interface CascadedAttentionConfig {
-    numHeads: number;
-    headDim: number;
-    seqLen: number;
-    scale?: number;
-    causal?: boolean;
-}
-/**
- * Cascaded Attention - Safe for very long sequences
- * Uses Split-K strategy to prevent TDR (GPU timeout)
- *
- * @param device WebInfer device
- * @param q Query tensor [seqLen, numHeads, headDim]
- * @param k Key tensor [seqLen, numHeads, headDim]
- * @param v Value tensor [seqLen, numHeads, headDim]
- * @param config Attention configuration
- * @param onProgress Optional progress callback
- */
-export declare function cascadedAttention(device: WebInferDevice, q: Tensor, k: Tensor, v: Tensor, config: CascadedAttentionConfig, onProgress?: (chunk: number, total: number) => void): Promise<Tensor>;
-/**
- * CPU reference implementation for verification
- */
-export declare function cascadedAttentionCPU(q: Float32Array, k: Float32Array, v: Float32Array, seqLen: number, numHeads: number, headDim: number, causal?: boolean): Float32Array;
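A usage sketch of the removed cascaded entry point, with device/tensor setup elided via `declare`; import specifiers are assumptions (in 0.0.3 these types lived under `dist/core`):

```ts
import { cascadedAttention } from "webinfer"; // path assumed
import type { WebInferDevice, Tensor } from "webinfer"; // re-exports assumed

declare const device: WebInferDevice;
declare const q: Tensor, k: Tensor, v: Tensor; // each [seqLen, numHeads, headDim]

// Long sequence split into multiple passes; the callback fires per chunk,
// with browser yields in between to stay under the GPU watchdog.
const out = await cascadedAttention(
  device, q, k, v,
  { numHeads: 32, headDim: 128, seqLen: 65536, causal: true },
  (chunk, total) => console.log(`attention pass ${chunk + 1}/${total}`),
);
```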
package/dist/attention/flash-attention.d.ts
@@ -1,30 +0,0 @@
-/**
- * FlashAttention Implementation for WebGPU
- * Memory-efficient attention using online softmax and tiling
- */
-import type { WebInferDevice } from "../core/device.ts";
-import { Tensor } from "../core/tensor.ts";
-import type { AttentionPattern } from "./block-sparse/format.ts";
-export interface AttentionConfig {
-    numHeads: number;
-    headDim: number;
-    seqLen: number;
-    scale?: number;
-    pattern?: AttentionPattern;
-    blockSize?: number;
-}
-/**
- * FlashAttention forward pass
- * Computes: softmax(Q @ K^T / sqrt(d)) @ V
- *
- * @param device WebInfer device
- * @param q Query tensor [batch, seqLen, numHeads, headDim]
- * @param k Key tensor [batch, seqLen, numHeads, headDim]
- * @param v Value tensor [batch, seqLen, numHeads, headDim]
- * @param config Attention configuration
- */
-export declare function flashAttention(device: WebInferDevice, q: Tensor, k: Tensor, v: Tensor, config: AttentionConfig): Promise<Tensor>;
-/**
- * CPU reference implementation for verification
- */
-export declare function attentionCPU(q: Float32Array, k: Float32Array, v: Float32Array, seqLen: number, numHeads: number, headDim: number, causal?: boolean): Float32Array;
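How the removed FlashAttention entry point was called, per the signature above (a sketch; import paths assumed, setup elided):

```ts
import { flashAttention } from "webinfer"; // path assumed
import type { WebInferDevice, Tensor } from "webinfer";

declare const device: WebInferDevice;
declare const q: Tensor, k: Tensor, v: Tensor; // [batch, seqLen, numHeads, headDim]

// One fused pass of softmax(Q @ K^T / sqrt(d)) @ V. Note that causal masking
// is expressed through the optional block-sparse `pattern`, not a flag,
// so the same kernel serves dense, causal, and sliding variants.
const out = await flashAttention(device, q, k, v, {
  numHeads: 8,
  headDim: 64,
  seqLen: 512,
  pattern: { type: "causal" },
});
```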
package/dist/attention/index.d.ts
@@ -1,118 +0,0 @@
-/**
- * Attention Module Exports
- */
-import type { WebInferDevice } from "../core/device.ts";
-import { Tensor } from "../core/tensor.ts";
-import type { BlockSparseCSR } from "./block-sparse/format.ts";
-import { PagedKVCache } from "./paged-kv/page-table.ts";
-/**
- * Tensor-like input types
- */
-export type TensorLike = Tensor | Float32Array | {
-    data: Float32Array;
-    shape: number[];
-};
-/**
- * Attention options
- */
-export interface AttentionOptions {
-    q: TensorLike;
-    k: TensorLike;
-    v: TensorLike;
-    causal?: boolean;
-    scale?: number;
-    window?: number;
-    mask?: BlockSparseCSR;
-    returnLse?: boolean;
-}
-/**
- * Attention result with optional LSE
- */
-export interface AttentionResultWithLse {
-    output: Tensor;
-    lse: Tensor;
-}
-/**
- * Simple attention function
- */
-export declare function attention(device: WebInferDevice, options: AttentionOptions): Promise<Tensor>;
-export declare function attention(device: WebInferDevice, options: AttentionOptions & {
-    returnLse: true;
-}): Promise<AttentionResultWithLse>;
-/**
- * BatchAttention configuration
- */
-export interface BatchAttentionConfig {
-    numHeads: number;
-    headDim: number;
-    numKvHeads?: number;
-    maxBatchSize?: number;
-    maxSeqLen?: number;
-}
-/**
- * Prefill input
- */
-export interface PrefillInput {
-    queries: TensorLike[];
-    keys: TensorLike[];
-    values: TensorLike[];
-    causal?: boolean;
-    window?: number;
-}
-/**
- * Decode input
- */
-export interface DecodeInput {
-    query: TensorLike;
-    kvCache: PagedKVCache;
-    seqIds: number[];
-}
-/**
- * BatchAttention - Batched attention for prefill and decode
- */
-export declare class BatchAttention {
-    private device;
-    private config;
-    constructor(device: WebInferDevice, config: BatchAttentionConfig);
-    /**
-     * Prefill: Process variable-length sequences
-     */
-    prefill(input: PrefillInput): Promise<Tensor[]>;
-    /**
-     * Decode: Single token per sequence with KV cache
-     */
-    decode(input: DecodeInput): Promise<Tensor>;
-    getConfig(): Required<BatchAttentionConfig>;
-    dispose(): void;
-}
-/**
- * AttentionKernel configuration
- */
-export interface AttentionKernelConfig {
-    numHeads: number;
-    headDim: number;
-    causal?: boolean;
-    blockSize?: number;
-}
-/**
- * AttentionKernel - Low-level compiled kernel
- */
-export declare class AttentionKernel {
-    private device;
-    private config;
-    private constructor();
-    static compile(device: WebInferDevice, config: AttentionKernelConfig): Promise<AttentionKernel>;
-    execute(input: {
-        q: Tensor;
-        k: Tensor;
-        v: Tensor;
-    }): Promise<Tensor>;
-    dispose(): void;
-}
-export { flashAttention, type AttentionConfig } from "./flash-attention.ts";
-export { buildBlockSparseCSR, getSparsityRatio, estimateMemorySavings, type BlockSparseCSR, type AttentionPattern, } from "./block-sparse/format.ts";
-export { buildCausalMask, getCausalSparsity } from "./block-sparse/patterns/causal.ts";
-export { buildSlidingWindowMask, buildCausalSlidingWindowMask, getSlidingWindowSparsity, } from "./block-sparse/patterns/sliding.ts";
-export { PagedKVCache, type PagedKVCacheConfig, type SequenceEntry, type DefragmentResult, BlockManager, ContinuousBatchScheduler, type BlockManagerConfig, type AllocationPolicy, type AllocationRequest, } from "./paged-kv/index.ts";
-export { pagedAttention, appendToPagedCache, type PagedAttentionConfig, type PagedAttentionInput, } from "./paged-attention.ts";
-export { cascadedAttention, type CascadedAttentionConfig, } from "./cascaded-inference.ts";
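This module was the 0.0.3 high-level surface. A sketch of the three tiers it declared, from the `attention()` overloads down to `BatchAttention` (import paths assumed, setup elided):

```ts
import { attention, BatchAttention } from "webinfer"; // paths assumed
import type { WebInferDevice, PagedKVCache } from "webinfer";

declare const device: WebInferDevice;
declare const q: Float32Array, k: Float32Array, v: Float32Array; // TensorLike accepts raw arrays
declare const kvCache: PagedKVCache;

// Tier 1: one-shot causal attention over TensorLike inputs.
const out = await attention(device, { q, k, v, causal: true });

// The returnLse: true overload additionally yields the log-sum-exp rows,
// which cascade/merge steps typically consume.
const { output, lse } = await attention(device, { q, k, v, causal: true, returnLse: true });

// Tier 2: batched serving. Prefill variable-length prompts, then decode
// one token per sequence against the paged KV cache.
const batch = new BatchAttention(device, { numHeads: 32, headDim: 128, numKvHeads: 8 });
const prefillOuts = await batch.prefill({ queries: [q], keys: [k], values: [v], causal: true });
const step = await batch.decode({ query: q, kvCache, seqIds: [0] });
batch.dispose();
```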
package/dist/attention/paged-attention.d.ts
@@ -1,40 +0,0 @@
-/**
- * Paged Attention Implementation for WebGPU
- * Efficient attention computation using paged KV cache (vLLM-style)
- */
-import type { WebInferDevice } from "../core/device.ts";
-import { Tensor } from "../core/tensor.ts";
-import type { PagedKVCache } from "./paged-kv/page-table.ts";
-export interface PagedAttentionConfig {
-    numHeads: number;
-    headDim: number;
-    scale?: number;
-}
-export interface PagedAttentionInput {
-    query: Tensor;
-    kvCache: PagedKVCache;
-    seqIds: number[];
-    positions: number[];
-}
-/**
- * Paged Attention forward pass for decoding
- * Computes attention against paged KV cache for single-token queries
- *
- * @param device WebInfer device
- * @param input Paged attention input (query, kv cache, sequence info)
- * @param config Attention configuration
- */
-export declare function pagedAttention(device: WebInferDevice, input: PagedAttentionInput, config: PagedAttentionConfig): Promise<Tensor>;
-/**
- * Append new KV to paged cache
- */
-export declare function appendToPagedCache(device: WebInferDevice, kvCache: PagedKVCache, seqId: number, key: Tensor, // [numHeads, headDim]
-value: Tensor): Promise<void>;
-/**
- * CPU reference implementation for verification
- */
-export declare function pagedAttentionCPU(q: Float32Array, // [batchSize, numHeads, headDim]
-keyCache: Float32Array, // [maxPages, pageSize, numHeads, headDim]
-valueCache: Float32Array, pageTable: number[][], // [batchSize][pages]
-seqLens: number[], // [batchSize]
-numHeads: number, headDim: number, pageSize: number, maxPages: number): Float32Array;
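One decode step per the declarations above: append the new token's KV, then attend the single-token query against the paged cache. A sketch with assumed import paths and elided setup:

```ts
import { pagedAttention, appendToPagedCache } from "webinfer"; // paths assumed
import type { WebInferDevice, Tensor, PagedKVCache } from "webinfer";

declare const device: WebInferDevice;
declare const kvCache: PagedKVCache;
declare const query: Tensor;               // single-token query
declare const key: Tensor, value: Tensor;  // [numHeads, headDim]

const seqId = 0;

// Write this step's KV into the sequence's pages...
await appendToPagedCache(device, kvCache, seqId, key, value);

// ...then compute attention against everything cached so far.
const out = await pagedAttention(device, {
  query,
  kvCache,
  seqIds: [seqId],
  positions: [kvCache.getSequenceLength(seqId) - 1], // position of the new token
}, { numHeads: 32, headDim: 128 });
```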
package/dist/attention/paged-kv/block-manager.d.ts
@@ -1,102 +0,0 @@
-/**
- * Block Manager - High-level memory management for PagedKVCache
- */
-import type { WebInferDevice } from "../../core/device.ts";
-import { PagedKVCache, type PagedKVCacheConfig } from "./page-table.ts";
-/**
- * Block allocation policy
- */
-export type AllocationPolicy = "greedy" | "best-fit" | "first-fit";
-/**
- * Block Manager configuration
- */
-export interface BlockManagerConfig extends PagedKVCacheConfig {
-    policy?: AllocationPolicy;
-    reservedPages?: number;
-}
-/**
- * Request for KV cache allocation
- */
-export interface AllocationRequest {
-    seqId?: number;
-    numTokens: number;
-    priority?: number;
-}
-/**
- * Block Manager - Manages KV cache allocation across multiple sequences
- */
-export declare class BlockManager {
-    private cache;
-    private config;
-    private priorities;
-    constructor(device: WebInferDevice, config: BlockManagerConfig);
-    /**
-     * Check if allocation is possible
-     */
-    canAllocate(request: AllocationRequest): boolean;
-    /**
-     * Allocate or extend a sequence
-     */
-    allocate(request: AllocationRequest): number;
-    /**
-     * Free a sequence
-     */
-    free(seqId: number): void;
-    /**
-     * Evict lowest priority sequences until we have enough free pages
-     */
-    evict(neededPages: number): number[];
-    /**
-     * Get memory utilization
-     */
-    getUtilization(): number;
-    /**
-     * Get the underlying cache
-     */
-    getCache(): PagedKVCache;
-    /**
-     * Get statistics
-     */
-    getStats(): {
-        totalPages: number;
-        usedPages: number;
-        freePages: number;
-        numSequences: number;
-        memoryUsedBytes: number;
-        memoryTotalBytes: number;
-    };
-    /**
-     * Dispose resources
-     */
-    dispose(): void;
-}
-/**
- * Scheduler for continuous batching with PagedAttention
- */
-export declare class ContinuousBatchScheduler {
-    private blockManager;
-    private runningSequences;
-    private waitingQueue;
-    constructor(blockManager: BlockManager);
-    /**
-     * Add a new request to the scheduler
-     */
-    addRequest(request: AllocationRequest): void;
-    /**
-     * Complete a sequence
-     */
-    completeSequence(seqId: number): void;
-    /**
-     * Extend a running sequence
-     */
-    extendSequence(seqId: number, numNewTokens: number): boolean;
-    private scheduleWaiting;
-    /**
-     * Get running sequence count
-     */
-    getRunningCount(): number;
-    /**
-     * Get waiting request count
-     */
-    getWaitingCount(): number;
-}
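A sketch of how the manager and scheduler stacked, based on the declarations above (import paths assumed; the meaning of `extendSequence`'s boolean return is an inference from the eviction API, not documented here):

```ts
import { BlockManager, ContinuousBatchScheduler } from "webinfer"; // paths assumed
import type { WebInferDevice } from "webinfer";

declare const device: WebInferDevice;

const manager = new BlockManager(device, {
  // PagedKVCacheConfig fields (BlockManagerConfig extends it):
  numLayers: 32, numHeads: 8, headDim: 128, pageSize: 16, maxPages: 4096,
  // BlockManager-specific knobs:
  policy: "best-fit",
  reservedPages: 64,
});

const scheduler = new ContinuousBatchScheduler(manager);
scheduler.addRequest({ numTokens: 512, priority: 1 }); // admitted now or queued

// Per decoded token, grow the running sequence; a false return presumably
// means the extension could not be satisfied even after eviction.
const extended = scheduler.extendSequence(0, 1);
console.log(scheduler.getRunningCount(), scheduler.getWaitingCount(), extended);
```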
package/dist/attention/paged-kv/index.d.ts
@@ -1,5 +0,0 @@
-/**
- * PagedKV Module Exports
- */
-export { type AllocationPolicy, type AllocationRequest, BlockManager, type BlockManagerConfig, ContinuousBatchScheduler, } from "./block-manager.ts";
-export { type DefragmentResult, PagedKVCache, type PagedKVCacheConfig, type SequenceEntry, } from "./page-table.ts";
package/dist/attention/paged-kv/page-table.d.ts
@@ -1,165 +0,0 @@
-/**
- * Paged KV Cache - Software page table for efficient memory management
- * Inspired by vLLM's PagedAttention
- */
-import type { WebInferDevice } from "../../core/device.ts";
-/**
- * Configuration for PagedKVCache
- */
-export interface PagedKVCacheConfig {
-    numLayers: number;
-    numHeads: number;
-    headDim: number;
-    pageSize: number;
-    maxPages: number;
-    dtype?: "f32" | "f16";
-}
-/**
- * Entry in the page table for a sequence
- */
-export interface SequenceEntry {
-    seqId: number;
-    pages: number[];
-    length: number;
-}
-/**
- * PagedKVCache - Manages KV cache with paging for efficient memory use
- *
- * Benefits:
- * 1. No memory fragmentation - pages are fixed size
- * 2. Efficient memory sharing - multiple sequences can share cache
- * 3. Dynamic allocation - only allocate pages as needed
- * 4. Easy defragmentation - just remap logical to physical pages
- */
-export declare class PagedKVCache {
-    private device;
-    private config;
-    private keyCache;
-    private valueCache;
-    private pageTable;
-    private freePages;
-    private nextSeqId;
-    constructor(device: WebInferDevice, config: PagedKVCacheConfig);
-    /**
-     * Allocate pages for a new sequence
-     */
-    allocateSequence(initialLength?: number): number;
-    /**
-     * Extend a sequence with new tokens
-     */
-    extendSequence(seqId: number, numNewTokens: number): void;
-    /**
-     * Free a sequence and its pages
-     */
-    freeSequence(seqId: number): void;
-    /**
-     * Get page indices for a sequence
-     */
-    getSequencePages(seqId: number): number[] | null;
-    /**
-     * Get sequence length
-     */
-    getSequenceLength(seqId: number): number;
-    /**
-     * Get the physical page index for a given sequence position
-     */
-    getPageForPosition(seqId: number, position: number): number | null;
-    /**
-     * Get offset within a page for a given position
-     */
-    getOffsetInPage(position: number): number;
-    private allocatePage;
-    private freePage;
-    /**
-     * Get cache statistics
-     */
-    getStats(): {
-        totalPages: number;
-        usedPages: number;
-        freePages: number;
-        numSequences: number;
-        memoryUsedBytes: number;
-        memoryTotalBytes: number;
-    };
-    /**
-     * Get GPU buffers for kernel binding
-     */
-    getBuffers(): {
-        keyCache: GPUBuffer;
-        valueCache: GPUBuffer;
-    };
-    /**
-     * Get configuration
-     */
-    getConfig(): PagedKVCacheConfig;
-    /**
-     * Dispose GPU resources
-     */
-    dispose(): void;
-    /**
-     * Allocate a new sequence (v2 alias)
-     */
-    alloc(initialLength?: number): number;
-    /**
-     * Append KV to sequence (v2 API)
-     */
-    append(seqId: number, kv: {
-        key: Float32Array;
-        value: Float32Array;
-        layer?: number;
-    }): void;
-    /**
-     * Batch append KV for all layers (v2 API)
-     */
-    appendBatch(seqId: number, kv: {
-        keys: Float32Array;
-        values: Float32Array;
-    }): void;
-    /**
-     * Free sequence (v2 alias)
-     */
-    free(seqId: number): void;
-    /**
-     * Get stats (v2 API)
-     */
-    stats(): {
-        usedPages: number;
-        freePages: number;
-        fragmentation: number;
-    };
-    /**
-     * Check if defrag needed (v2 alias)
-     */
-    needsDefrag(threshold?: number): boolean;
-    /**
-     * Check if defragmentation is needed
-     * Returns fragmentation ratio (0 = no fragmentation, 1 = fully fragmented)
-     */
-    getFragmentationRatio(): number;
-    /**
-     * Check if defragmentation would be beneficial
-     */
-    needsDefragmentation(threshold?: number): boolean;
-    /**
-     * Defragment the KV cache by compacting pages
-     * Returns the number of pages moved
-     */
-    defragment(): Promise<DefragmentResult>;
-    /**
-     * Move a single page from one location to another
-     */
-    private movePage;
-    /**
-     * Rebuild the free page list from scratch
-     */
-    private rebuildFreePageList;
-}
-/**
- * Result of defragmentation operation
- */
-export interface DefragmentResult {
-    pagesMoved: number;
-    durationMs: number;
-    fragmentationBefore: number;
-    fragmentationAfter: number;
-}
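A lifecycle sketch using the v2 aliases declared above (import path assumed, setup elided). The page arithmetic in the comment follows directly from the config fields:

```ts
import { PagedKVCache } from "webinfer"; // path assumed
import type { WebInferDevice } from "webinfer";

declare const device: WebInferDevice;
declare const keys: Float32Array, values: Float32Array; // one token's KV across all layers

const cache = new PagedKVCache(device, {
  numLayers: 32, numHeads: 8, headDim: 128,
  pageSize: 16, maxPages: 2048, dtype: "f16",
});

// Fixed-size pages: a 100-token sequence spans ceil(100 / 16) = 7 pages,
// so at most pageSize - 1 slots are ever wasted per sequence.
const seqId = cache.alloc(100);
cache.appendBatch(seqId, { keys, values });

// Compact once the free list is fragmented past a threshold; defragmentation
// only remaps logical pages to physical ones.
if (cache.needsDefrag(0.3)) {
  const r = await cache.defragment();
  console.log(`moved ${r.pagesMoved} pages in ${r.durationMs} ms`,
    `fragmentation ${r.fragmentationBefore} -> ${r.fragmentationAfter}`);
}
cache.free(seqId);
```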
package/dist/attention/scheduler.d.ts
@@ -1,40 +0,0 @@
-/**
- * Attention Scheduler - Prevents TDR (GPU timeout) by splitting long sequences
- */
-import type { WebInferDevice } from "../core/device.ts";
-export interface ChunkPlan {
-    numChunks: number;
-    chunkSize: number;
-    estimatedTimeMs: number;
-}
-/**
- * Attention Scheduler for TDR prevention
- * Splits long sequences into chunks to avoid GPU timeout
- */
-export declare class AttentionScheduler {
-    private device;
-    private tdrLimit;
-    constructor(device: WebInferDevice);
-    private detectTDRLimit;
-    /**
-     * Estimate execution time for attention operation
-     * Based on empirical formula: time ∝ seqLen² × numHeads × headDim
-     */
-    estimateExecutionTime(seqLen: number, numHeads: number, headDim: number): number;
-    /**
-     * Compute chunk plan for given sequence length
-     */
-    computeChunkPlan(seqLen: number, numHeads: number, headDim: number): ChunkPlan;
-    /**
-     * Yield to main thread to prevent TDR
-     */
-    yieldToMain(): Promise<void>;
-    /**
-     * Check if sequence might cause TDR
-     */
-    mightCauseTDR(seqLen: number, numHeads: number, headDim: number): boolean;
-    /**
-     * Get recommended maximum sequence length for single-pass execution
-     */
-    getMaxSinglePassSeqLen(numHeads: number, headDim: number): number;
-}
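A minimal sketch of the stated proportionality, time ∝ seqLen² × numHeads × headDim. The calibration constant and TDR limit below are made-up placeholders; the removed class detected the real limit per device (`detectTDRLimit`), and its constant is not recoverable from the declarations:

```ts
// Assumed calibration: ms per (seqLen² · heads · dim) unit. Illustrative only.
const COST_MS_PER_UNIT = 5e-9;

function estimateExecutionTimeMs(seqLen: number, numHeads: number, headDim: number): number {
  return seqLen * seqLen * numHeads * headDim * COST_MS_PER_UNIT;
}

// Typical Windows TDR default is ~2 s; stay well under it with a safety factor.
const TDR_LIMIT_MS = 2000;

function mightCauseTDR(seqLen: number, numHeads: number, headDim: number): boolean {
  return estimateExecutionTimeMs(seqLen, numHeads, headDim) > TDR_LIMIT_MS * 0.5;
}

// e.g. 8192² × 32 × 128 × 5e-9 ≈ 1374 ms -> over budget, so the scheduler
// would split this into chunks and yield to the main thread between them.
console.log(mightCauseTDR(8192, 32, 128)); // true
```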