webinfer 0.0.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/LICENSE +201 -0
  2. package/dist/attention/block-sparse/format.d.ts +52 -0
  3. package/dist/attention/block-sparse/patterns/causal.d.ts +16 -0
  4. package/dist/attention/block-sparse/patterns/sliding.d.ts +22 -0
  5. package/dist/attention/flash-attention.d.ts +30 -0
  6. package/dist/attention/index.d.ts +9 -0
  7. package/dist/attention/paged-kv/block-manager.d.ts +102 -0
  8. package/dist/attention/paged-kv/index.d.ts +5 -0
  9. package/dist/attention/paged-kv/page-table.d.ts +99 -0
  10. package/dist/attention/scheduler.d.ts +40 -0
  11. package/dist/core/buffer-pool.d.ts +18 -0
  12. package/dist/core/device.d.ts +23 -0
  13. package/dist/core/tensor.d.ts +25 -0
  14. package/dist/index.d.ts +22 -0
  15. package/dist/index.js +4228 -0
  16. package/dist/inference/engine.d.ts +69 -0
  17. package/dist/inference/generate.d.ts +30 -0
  18. package/dist/inference/index.d.ts +7 -0
  19. package/dist/inference/types.d.ts +161 -0
  20. package/dist/jit/compiler.d.ts +23 -0
  21. package/dist/jit/kernel-cache.d.ts +21 -0
  22. package/dist/model/gguf.d.ts +90 -0
  23. package/dist/model/index.d.ts +16 -0
  24. package/dist/model/safetensors.d.ts +38 -0
  25. package/dist/model/types.d.ts +182 -0
  26. package/dist/ops/activations.d.ts +43 -0
  27. package/dist/ops/elementwise.d.ts +38 -0
  28. package/dist/ops/embedding.d.ts +30 -0
  29. package/dist/ops/matmul.d.ts +21 -0
  30. package/dist/ops/normalization.d.ts +24 -0
  31. package/dist/ops/reshape.d.ts +39 -0
  32. package/dist/ops/rope.d.ts +32 -0
  33. package/dist/ops/softmax.d.ts +18 -0
  34. package/dist/quantization/index.d.ts +6 -0
  35. package/dist/quantization/qmatmul.d.ts +38 -0
  36. package/dist/quantization/quantize.d.ts +52 -0
  37. package/dist/sampling/index.d.ts +6 -0
  38. package/dist/sampling/sampler.d.ts +39 -0
  39. package/dist/sampling/top-k.d.ts +24 -0
  40. package/dist/sampling/top-p.d.ts +14 -0
  41. package/package.json +54 -0
@@ -0,0 +1,38 @@
1
+ /**
2
+ * Element-wise Operations
3
+ * Add, Multiply, Scale, etc. for tensor operations
4
+ */
5
+ import type { WebInferDevice } from "../core/device.ts";
6
+ import { Tensor } from "../core/tensor.ts";
7
+ /**
8
+ * Element-wise add (CPU)
9
+ */
10
+ export declare function addCPU(a: Float32Array, b: Float32Array): Float32Array;
11
+ /**
12
+ * Element-wise multiply (CPU)
13
+ */
14
+ export declare function mulCPU(a: Float32Array, b: Float32Array): Float32Array;
15
+ /**
16
+ * Scale by constant (CPU)
17
+ */
18
+ export declare function scaleCPU(a: Float32Array, scalar: number): Float32Array;
19
+ /**
20
+ * Add constant (CPU)
21
+ */
22
+ export declare function addScalarCPU(a: Float32Array, scalar: number): Float32Array;
23
+ /**
24
+ * Fused multiply-add: a * b + c (CPU)
25
+ */
26
+ export declare function fmaCPU(a: Float32Array, b: Float32Array, c: Float32Array): Float32Array;
27
+ /**
28
+ * Element-wise add (GPU)
29
+ */
30
+ export declare function add(device: WebInferDevice, a: Tensor, b: Tensor): Promise<Tensor>;
31
+ /**
32
+ * Element-wise multiply (GPU)
33
+ */
34
+ export declare function mul(device: WebInferDevice, a: Tensor, b: Tensor): Promise<Tensor>;
35
+ /**
36
+ * Scale tensor by constant (GPU)
37
+ */
38
+ export declare function scale(device: WebInferDevice, a: Tensor, scalar: number): Promise<Tensor>;
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Embedding Operations
3
+ * Token embedding lookup for LLM inference
4
+ */
5
+ import type { WebInferDevice } from "../core/device.ts";
6
+ import { Tensor } from "../core/tensor.ts";
7
+ /**
8
+ * Embedding lookup (CPU)
9
+ * @param embeddings - Embedding table [vocabSize, embeddingDim]
10
+ * @param tokens - Token indices [seqLen]
11
+ * @param embeddingDim - Dimension of each embedding
12
+ * @returns Embedded tokens [seqLen, embeddingDim]
13
+ */
14
+ export declare function embeddingCPU(embeddings: Float32Array, tokens: number[], embeddingDim: number): Float32Array;
15
+ /**
16
+ * Embedding lookup (GPU)
17
+ * @param device - WebInfer device
18
+ * @param embeddings - Embedding table tensor [vocabSize, embeddingDim]
19
+ * @param tokens - Token indices tensor [seqLen]
20
+ * @returns Embedded tokens tensor [seqLen, embeddingDim]
21
+ */
22
+ export declare function embedding(device: WebInferDevice, embeddings: Tensor, tokens: Tensor): Promise<Tensor>;
23
+ /**
24
+ * Batched embedding lookup (CPU)
25
+ * @param embeddings - Embedding table [vocabSize, embeddingDim]
26
+ * @param tokens - Token indices [batchSize, seqLen]
27
+ * @param embeddingDim - Dimension of each embedding
28
+ * @returns Embedded tokens [batchSize, seqLen, embeddingDim]
29
+ */
30
+ export declare function batchedEmbeddingCPU(embeddings: Float32Array, tokens: number[][], embeddingDim: number): Float32Array;
@@ -0,0 +1,21 @@
1
+ /**
2
+ * MatMul Operator - GPU-accelerated matrix multiplication
3
+ */
4
+ import type { WebInferDevice } from "../core/device.ts";
5
+ import { Tensor } from "../core/tensor.ts";
6
+ /**
7
+ * Matrix multiplication: C = A @ B
8
+ * @param device WebInfer device
9
+ * @param a Input tensor [M, K]
10
+ * @param b Input tensor [K, N]
11
+ * @returns Output tensor [M, N]
12
+ */
13
+ export declare function matmul(device: WebInferDevice, a: Tensor, b: Tensor): Promise<Tensor>;
14
+ /**
15
+ * CPU reference implementation for verification; note that dimensions are passed in the order M, N, K
16
+ */
17
+ export declare function matmulCPU(a: Float32Array, b: Float32Array, M: number, N: number, K: number): Float32Array;
18
+ /**
19
+ * Get compiler cache statistics
20
+ */
21
+ export declare function getMatMulCacheStats(device: WebInferDevice): import("../index.ts").CacheStats;
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Normalization Operations
3
+ * LayerNorm and RMSNorm implementations
4
+ */
5
+ import type { WebInferDevice } from "../core/device.ts";
6
+ import { Tensor } from "../core/tensor.ts";
7
+ /**
8
+ * Layer Normalization (CPU)
9
+ * Normalizes over the last dimension
10
+ */
11
+ export declare function layerNormCPU(x: Float32Array, weight: Float32Array, bias: Float32Array | null, shape: number[], eps?: number): Float32Array;
12
+ /**
13
+ * RMS Normalization (CPU)
14
+ * Used in Llama and other modern LLMs
15
+ */
16
+ export declare function rmsNormCPU(x: Float32Array, weight: Float32Array, shape: number[], eps?: number): Float32Array;
17
+ /**
18
+ * Layer Normalization (GPU)
19
+ */
20
+ export declare function layerNorm(device: WebInferDevice, x: Tensor, weight: Tensor, bias: Tensor | null, eps?: number): Promise<Tensor>;
21
+ /**
22
+ * RMS Normalization (GPU)
23
+ */
24
+ export declare function rmsNorm(device: WebInferDevice, x: Tensor, weight: Tensor, eps?: number): Promise<Tensor>;
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Reshape and Transpose Operations
3
+ * Shape manipulation for tensor operations
4
+ */
5
+ import type { WebInferDevice } from "../core/device.ts";
6
+ import { Tensor } from "../core/tensor.ts";
7
+ /**
8
+ * Transpose 2D matrix (CPU)
9
+ * [M, N] -> [N, M]
10
+ */
11
+ export declare function transpose2DCPU(x: Float32Array, rows: number, cols: number): Float32Array;
12
+ /**
13
+ * Transpose last two dimensions (CPU)
14
+ * [..., M, N] -> [..., N, M]
15
+ */
16
+ export declare function transposeCPU(x: Float32Array, shape: number[]): {
17
+ data: Float32Array;
18
+ shape: number[];
19
+ };
20
+ /**
21
+ * Reshape tensor (CPU) - just validates and returns new shape
22
+ * Data layout doesn't change, only interpretation
23
+ */
24
+ export declare function reshapeCPU(x: Float32Array, oldShape: number[], newShape: number[]): {
25
+ data: Float32Array;
26
+ shape: number[];
27
+ };
28
+ /**
29
+ * Transpose 2D matrix (GPU)
30
+ */
31
+ export declare function transpose2D(device: WebInferDevice, x: Tensor): Promise<Tensor>;
32
+ /**
33
+ * Permute dimensions (CPU)
34
+ * Generalized transpose for any dimension ordering
35
+ */
36
+ export declare function permuteCPU(x: Float32Array, shape: number[], dims: number[]): {
37
+ data: Float32Array;
38
+ shape: number[];
39
+ };
@@ -0,0 +1,32 @@
1
+ /**
2
+ * Rotary Position Embedding (RoPE)
3
+ * Used in Llama, Mistral, and most modern LLMs
4
+ */
5
+ import type { WebInferDevice } from "../core/device.ts";
6
+ import { Tensor } from "../core/tensor.ts";
7
+ /**
8
+ * RoPE configuration
9
+ */
10
+ export interface RoPEConfig {
11
+ dim: number;
12
+ maxSeqLen: number;
13
+ base?: number;
14
+ scaling?: number;
15
+ }
16
+ /**
17
+ * Precompute RoPE frequencies
18
+ * Returns cos and sin values for each position and dimension
19
+ */
20
+ export declare function computeRoPEFrequencies(config: RoPEConfig): {
21
+ cos: Float32Array;
22
+ sin: Float32Array;
23
+ };
24
+ /**
25
+ * Apply RoPE to query/key tensors (CPU implementation)
26
+ * Input shape: [seqLen, numHeads, headDim]
27
+ */
28
+ export declare function ropeCPU(x: Float32Array, positions: number[], cos: Float32Array, sin: Float32Array, seqLen: number, numHeads: number, headDim: number): Float32Array;
29
+ /**
30
+ * Apply RoPE to query/key tensor (GPU implementation)
31
+ */
32
+ export declare function rope(device: WebInferDevice, x: Tensor, positions: Tensor, config: RoPEConfig): Promise<Tensor>;
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Softmax Operation
3
+ * GPU-accelerated softmax for attention and output layers
4
+ */
5
+ import type { WebInferDevice } from "../core/device.ts";
6
+ import { Tensor } from "../core/tensor.ts";
7
+ /**
8
+ * Softmax (CPU) - operates along last dimension
9
+ */
10
+ export declare function softmaxCPU(x: Float32Array, shape: number[]): Float32Array;
11
+ /**
12
+ * Log-Softmax (CPU) - more numerically stable for cross-entropy loss
13
+ */
14
+ export declare function logSoftmaxCPU(x: Float32Array, shape: number[]): Float32Array;
15
+ /**
16
+ * Softmax (GPU) - operates along last dimension
17
+ */
18
+ export declare function softmaxGPU(device: WebInferDevice, x: Tensor): Promise<Tensor>;
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Quantization Module
3
+ * INT4 and INT8 quantization for efficient LLM inference
4
+ */
5
+ export { quantizeToInt8, quantizeToInt4, dequantizeInt8, dequantizeInt4, quantizationError, getMemorySavings, type QuantConfig, type QuantizedTensor, } from "./quantize.ts";
6
+ export { qmatmulInt8CPU, qmatmulInt4CPU, qmatmulInt8BlockCPU, estimateQMatMulFlops, estimateQMatMulBandwidth, } from "./qmatmul.ts";
@@ -0,0 +1,38 @@
1
+ /**
2
+ * Quantized Matrix Multiplication
3
+ * INT4 and INT8 matmul for efficient LLM inference
4
+ */
5
+ import type { QuantizedTensor } from "./quantize.ts";
6
+ /**
7
+ * INT8 MatMul (CPU): float32 activations × int8 weights
8
+ * A: [M, K] float32 activations
9
+ * B: QuantizedTensor [K, N] int8 weights
10
+ * Output: [M, N] float32
11
+ */
12
+ export declare function qmatmulInt8CPU(A: Float32Array, B: QuantizedTensor, M: number, K: number, N: number): Float32Array;
13
+ /**
14
+ * INT4 MatMul (CPU): float activations × int4 weights
15
+ * A: [M, K] float32 activations
16
+ * B: QuantizedTensor [K, N] int4 weights (packed)
17
+ * Output: [M, N] float32
18
+ */
19
+ export declare function qmatmulInt4CPU(A: Float32Array, B: QuantizedTensor, M: number, K: number, N: number): Float32Array;
20
+ /**
21
+ * Optimized INT8 MatMul with block accumulation
22
+ * Uses SIMD-friendly access patterns
23
+ */
24
+ export declare function qmatmulInt8BlockCPU(A: Float32Array, B: QuantizedTensor, M: number, K: number, N: number, blockSize?: number): Float32Array;
25
+ /**
26
+ * Estimate the number of floating-point operations (FLOPs) for a quantized matmul
27
+ */
28
+ export declare function estimateQMatMulFlops(M: number, K: number, N: number): number;
29
+ /**
30
+ * Estimate memory traffic (bytes moved) for a quantized matmul, broken down by activations, weights, scales and output
31
+ */
32
+ export declare function estimateQMatMulBandwidth(M: number, K: number, N: number, bits: 4 | 8, groupSize: number): {
33
+ activationBytes: number;
34
+ weightBytes: number;
35
+ scaleBytes: number;
36
+ outputBytes: number;
37
+ totalBytes: number;
38
+ };
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Quantization Operations
3
+ * INT4 and INT8 quantization for memory-efficient LLM inference
4
+ */
5
+ /**
6
+ * Quantization configuration
7
+ */
8
+ export interface QuantConfig {
9
+ bits: 4 | 8;
10
+ groupSize: number;
11
+ symmetric: boolean;
12
+ }
13
+ /**
14
+ * Quantized tensor representation
15
+ */
16
+ export interface QuantizedTensor {
17
+ data: Uint8Array;
18
+ scales: Float32Array;
19
+ zeros: Float32Array | null;
20
+ shape: number[];
21
+ config: QuantConfig;
22
+ }
23
+ /**
24
+ * Quantize Float32 tensor to INT8 (CPU)
25
+ */
26
+ export declare function quantizeToInt8(x: Float32Array, groupSize?: number, symmetric?: boolean): QuantizedTensor;
27
+ /**
28
+ * Quantize Float32 tensor to INT4 (CPU)
29
+ * Packs two INT4 values per byte
30
+ */
31
+ export declare function quantizeToInt4(x: Float32Array, groupSize?: number, symmetric?: boolean): QuantizedTensor;
32
+ /**
33
+ * Dequantize INT8 tensor back to Float32 (CPU)
34
+ */
35
+ export declare function dequantizeInt8(qt: QuantizedTensor): Float32Array;
36
+ /**
37
+ * Dequantize INT4 tensor back to Float32 (CPU)
38
+ */
39
+ export declare function dequantizeInt4(qt: QuantizedTensor): Float32Array;
40
+ /**
41
+ * Compute quantization error (MSE)
42
+ */
43
+ export declare function quantizationError(original: Float32Array, reconstructed: Float32Array): number;
44
+ /**
45
+ * Get memory savings from quantization
46
+ */
47
+ export declare function getMemorySavings(originalBytes: number, qt: QuantizedTensor): {
48
+ originalBytes: number;
49
+ quantizedBytes: number;
50
+ savings: number;
51
+ ratio: number;
52
+ };
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Sampling Module Exports
3
+ */
4
+ export { topK, topKCPU, topKFilter } from "./top-k.ts";
5
+ export { topPFilter, topPFilterCPU } from "./top-p.ts";
6
+ export { sample, sampleCPU, sampleGreedy, sampleFromProbs, softmax, applyRepetitionPenalty, type SamplingConfig, } from "./sampler.ts";
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Token Sampler - Combines filtering and sampling strategies
3
+ */
4
+ import type { WebInferDevice } from "../core/device.ts";
5
+ import { Tensor } from "../core/tensor.ts";
6
+ /**
7
+ * Sampling configuration
8
+ */
9
+ export interface SamplingConfig {
10
+ temperature?: number;
11
+ topK?: number;
12
+ topP?: number;
13
+ repetitionPenalty?: number;
14
+ seed?: number;
15
+ }
16
+ /**
17
+ * Apply softmax to logits
18
+ */
19
+ export declare function softmax(logits: Float32Array): Float32Array;
20
+ /**
21
+ * Apply repetition penalty to logits
22
+ */
23
+ export declare function applyRepetitionPenalty(logits: Float32Array, previousTokens: number[], penalty: number): Float32Array;
24
+ /**
25
+ * Sample a token from probability distribution
26
+ */
27
+ export declare function sampleFromProbs(probs: Float32Array, random?: () => number): number;
28
+ /**
29
+ * Greedy sampling (argmax)
30
+ */
31
+ export declare function sampleGreedy(logits: Float32Array): number;
32
+ /**
33
+ * Full sampling pipeline
34
+ */
35
+ export declare function sample(device: WebInferDevice, logits: Tensor, config?: SamplingConfig, previousTokens?: number[]): Promise<number>;
36
+ /**
37
+ * CPU-only sampling (no GPU tensors)
38
+ */
39
+ export declare function sampleCPU(logits: Float32Array, config?: SamplingConfig, previousTokens?: number[]): number;
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Top-K Sampling - GPU-accelerated top-k selection
3
+ */
4
+ import type { WebInferDevice } from "../core/device.ts";
5
+ import { Tensor } from "../core/tensor.ts";
6
+ /**
7
+ * GPU-accelerated Top-K selection
8
+ * Returns the top K values and their indices from logits
9
+ */
10
+ export declare function topK(device: WebInferDevice, logits: Tensor, k: number): Promise<{
11
+ values: Tensor;
12
+ indices: Tensor;
13
+ }>;
14
+ /**
15
+ * CPU reference implementation
16
+ */
17
+ export declare function topKCPU(logits: Float32Array, k: number, vocabSize: number): {
18
+ values: Float32Array;
19
+ indices: Uint32Array;
20
+ };
21
+ /**
22
+ * Apply top-k filtering to logits (set non-top-k to -inf)
23
+ */
24
+ export declare function topKFilter(device: WebInferDevice, logits: Tensor, k: number): Promise<Tensor>;
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Top-P (Nucleus) Sampling
3
+ */
4
+ import type { WebInferDevice } from "../core/device.ts";
5
+ import { Tensor } from "../core/tensor.ts";
6
+ /**
7
+ * Apply top-p (nucleus) filtering to logits
8
+ * Keeps the smallest set of tokens whose cumulative probability >= p
9
+ */
10
+ export declare function topPFilter(device: WebInferDevice, logits: Tensor, p: number, temperature?: number): Promise<Tensor>;
11
+ /**
12
+ * CPU implementation of top-p filtering
13
+ */
14
+ export declare function topPFilterCPU(logits: Float32Array, p: number, temperature?: number): Float32Array;
package/package.json ADDED
@@ -0,0 +1,54 @@
1
+ {
2
+ "name": "webinfer",
3
+ "version": "0.0.1",
4
+ "description": "High-performance LLM inference kernels for WebGPU",
5
+ "license": "Apache-2.0",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/guan404ming/webinfer"
9
+ },
10
+ "keywords": [
11
+ "webgpu",
12
+ "llm",
13
+ "inference",
14
+ "gpu",
15
+ "machine-learning",
16
+ "transformer",
17
+ "attention",
18
+ "matmul"
19
+ ],
20
+ "type": "module",
21
+ "main": "./dist/index.js",
22
+ "module": "./dist/index.js",
23
+ "types": "./dist/index.d.ts",
24
+ "exports": {
25
+ ".": {
26
+ "types": "./dist/index.d.ts",
27
+ "import": "./dist/index.js"
28
+ }
29
+ },
30
+ "files": [
31
+ "dist",
32
+ "README.md",
33
+ "LICENSE"
34
+ ],
35
+ "scripts": {
36
+ "dev": "bun --hot benchmarks/server.ts",
37
+ "bench": "open http://localhost:3000 && bun --hot benchmarks/server.ts",
38
+ "build": "bun run build:types && bun run build:js",
39
+ "build:types": "tsc -p tsconfig.build.json",
40
+ "build:js": "bun build ./src/index.ts --outdir ./dist --target browser --format esm",
41
+ "test": "bun test",
42
+ "typecheck": "tsc --noEmit",
43
+ "prepublishOnly": "bun run build"
44
+ },
45
+ "devDependencies": {
46
+ "@types/bun": "latest",
47
+ "@webgpu/types": "^0.1.49",
48
+ "onnxruntime-web": "^1.20.1",
49
+ "typescript": "^5.3.0"
50
+ },
51
+ "peerDependencies": {
52
+ "@webgpu/types": "^0.1.0"
53
+ }
54
+ }