webinfer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/dist/attention/block-sparse/format.d.ts +52 -0
- package/dist/attention/block-sparse/patterns/causal.d.ts +16 -0
- package/dist/attention/block-sparse/patterns/sliding.d.ts +22 -0
- package/dist/attention/flash-attention.d.ts +30 -0
- package/dist/attention/index.d.ts +9 -0
- package/dist/attention/paged-kv/block-manager.d.ts +102 -0
- package/dist/attention/paged-kv/index.d.ts +5 -0
- package/dist/attention/paged-kv/page-table.d.ts +99 -0
- package/dist/attention/scheduler.d.ts +40 -0
- package/dist/core/buffer-pool.d.ts +18 -0
- package/dist/core/device.d.ts +23 -0
- package/dist/core/tensor.d.ts +25 -0
- package/dist/index.d.ts +22 -0
- package/dist/index.js +4228 -0
- package/dist/inference/engine.d.ts +69 -0
- package/dist/inference/generate.d.ts +30 -0
- package/dist/inference/index.d.ts +7 -0
- package/dist/inference/types.d.ts +161 -0
- package/dist/jit/compiler.d.ts +23 -0
- package/dist/jit/kernel-cache.d.ts +21 -0
- package/dist/model/gguf.d.ts +90 -0
- package/dist/model/index.d.ts +16 -0
- package/dist/model/safetensors.d.ts +38 -0
- package/dist/model/types.d.ts +182 -0
- package/dist/ops/activations.d.ts +43 -0
- package/dist/ops/elementwise.d.ts +38 -0
- package/dist/ops/embedding.d.ts +30 -0
- package/dist/ops/matmul.d.ts +21 -0
- package/dist/ops/normalization.d.ts +24 -0
- package/dist/ops/reshape.d.ts +39 -0
- package/dist/ops/rope.d.ts +32 -0
- package/dist/ops/softmax.d.ts +18 -0
- package/dist/quantization/index.d.ts +6 -0
- package/dist/quantization/qmatmul.d.ts +38 -0
- package/dist/quantization/quantize.d.ts +52 -0
- package/dist/sampling/index.d.ts +6 -0
- package/dist/sampling/sampler.d.ts +39 -0
- package/dist/sampling/top-k.d.ts +24 -0
- package/dist/sampling/top-p.d.ts +14 -0
- package/package.json +54 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Element-wise Operations
|
|
3
|
+
* Add, Multiply, Scale, etc. for tensor operations
|
|
4
|
+
*/
|
|
5
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
+
import { Tensor } from "../core/tensor.ts";
|
|
7
|
+
/**
|
|
8
|
+
* Element-wise add (CPU)
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Element-wise add (CPU).
 *
 * @param a - Left operand.
 * @param b - Right operand; presumably the same length as `a` (mismatch handling is not visible in this declaration — TODO confirm).
 * @returns The element-wise sum as a Float32Array.
 */
export declare function addCPU(a: Float32Array, b: Float32Array): Float32Array;
|
|
11
|
+
/**
|
|
12
|
+
* Element-wise multiply (CPU)
|
|
13
|
+
*/
|
|
14
|
+
/**
 * Element-wise multiply (CPU).
 *
 * @param a - Left operand.
 * @param b - Right operand; presumably the same length as `a` (mismatch handling is not visible in this declaration — TODO confirm).
 * @returns The element-wise product as a Float32Array.
 */
export declare function mulCPU(a: Float32Array, b: Float32Array): Float32Array;
|
|
15
|
+
/**
|
|
16
|
+
* Scale by constant (CPU)
|
|
17
|
+
*/
|
|
18
|
+
/**
 * Scale by constant (CPU).
 *
 * @param a - Input values.
 * @param scalar - Constant to scale by.
 * @returns The scaled values as a Float32Array (whether `a` is reused or copied is not visible here).
 */
export declare function scaleCPU(a: Float32Array, scalar: number): Float32Array;
|
|
19
|
+
/**
|
|
20
|
+
* Add constant (CPU)
|
|
21
|
+
*/
|
|
22
|
+
/**
 * Add constant (CPU).
 *
 * @param a - Input values.
 * @param scalar - Constant to add.
 * @returns The shifted values as a Float32Array (whether `a` is reused or copied is not visible here).
 */
export declare function addScalarCPU(a: Float32Array, scalar: number): Float32Array;
|
|
23
|
+
/**
|
|
24
|
+
* Fused multiply-add: a * b + c (CPU)
|
|
25
|
+
*/
|
|
26
|
+
/**
 * Fused multiply-add: a * b + c (CPU).
 *
 * @param a - First factor.
 * @param b - Second factor.
 * @param c - Addend.
 * @returns The result of `a * b + c` as a Float32Array; the three inputs presumably share one length (TODO confirm).
 */
export declare function fmaCPU(a: Float32Array, b: Float32Array, c: Float32Array): Float32Array;
|
|
27
|
+
/**
|
|
28
|
+
* Element-wise add (GPU)
|
|
29
|
+
*/
|
|
30
|
+
/**
 * Element-wise add (GPU).
 *
 * @param device - WebInfer device.
 * @param a - Left operand tensor.
 * @param b - Right operand tensor; shape-compatibility rules are not visible in this declaration (TODO confirm).
 * @returns Promise resolving to the result tensor.
 */
export declare function add(device: WebInferDevice, a: Tensor, b: Tensor): Promise<Tensor>;
|
|
31
|
+
/**
|
|
32
|
+
* Element-wise multiply (GPU)
|
|
33
|
+
*/
|
|
34
|
+
/**
 * Element-wise multiply (GPU).
 *
 * @param device - WebInfer device.
 * @param a - Left operand tensor.
 * @param b - Right operand tensor; shape-compatibility rules are not visible in this declaration (TODO confirm).
 * @returns Promise resolving to the result tensor.
 */
export declare function mul(device: WebInferDevice, a: Tensor, b: Tensor): Promise<Tensor>;
|
|
35
|
+
/**
|
|
36
|
+
* Scale tensor by constant (GPU)
|
|
37
|
+
*/
|
|
38
|
+
/**
 * Scale tensor by constant (GPU).
 *
 * @param device - WebInfer device.
 * @param a - Input tensor.
 * @param scalar - Constant to scale by.
 * @returns Promise resolving to the scaled tensor.
 */
export declare function scale(device: WebInferDevice, a: Tensor, scalar: number): Promise<Tensor>;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding Operations
|
|
3
|
+
* Token embedding lookup for LLM inference
|
|
4
|
+
*/
|
|
5
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
+
import { Tensor } from "../core/tensor.ts";
|
|
7
|
+
/**
|
|
8
|
+
* Embedding lookup (CPU)
|
|
9
|
+
* @param embeddings - Embedding table [vocabSize, embeddingDim]
|
|
10
|
+
* @param tokens - Token indices [seqLen]
|
|
11
|
+
* @param embeddingDim - Dimension of each embedding
|
|
12
|
+
* @returns Embedded tokens [seqLen, embeddingDim]
|
|
13
|
+
*/
|
|
14
|
+
/**
 * Embedding lookup (CPU).
 *
 * @param embeddings - Embedding table [vocabSize, embeddingDim], passed as one flat Float32Array (presumably row-major — TODO confirm).
 * @param tokens - Token indices [seqLen]; presumably each must lie in [0, vocabSize) — bounds handling is not visible here.
 * @param embeddingDim - Dimension of each embedding.
 * @returns Embedded tokens [seqLen, embeddingDim].
 */
export declare function embeddingCPU(embeddings: Float32Array, tokens: number[], embeddingDim: number): Float32Array;
|
|
15
|
+
/**
|
|
16
|
+
* Embedding lookup (GPU)
|
|
17
|
+
* @param device - WebInfer device
|
|
18
|
+
* @param embeddings - Embedding table tensor [vocabSize, embeddingDim]
|
|
19
|
+
* @param tokens - Token indices tensor [seqLen]
|
|
20
|
+
* @returns Embedded tokens tensor [seqLen, embeddingDim]
|
|
21
|
+
*/
|
|
22
|
+
/**
 * Embedding lookup (GPU).
 *
 * @param device - WebInfer device.
 * @param embeddings - Embedding table tensor [vocabSize, embeddingDim].
 * @param tokens - Token indices tensor [seqLen]; the expected element type is not visible in this declaration (TODO confirm).
 * @returns Promise resolving to the embedded tokens tensor [seqLen, embeddingDim].
 */
export declare function embedding(device: WebInferDevice, embeddings: Tensor, tokens: Tensor): Promise<Tensor>;
|
|
23
|
+
/**
|
|
24
|
+
* Batched embedding lookup (CPU)
|
|
25
|
+
* @param embeddings - Embedding table [vocabSize, embeddingDim]
|
|
26
|
+
* @param tokens - Token indices [batchSize, seqLen]
|
|
27
|
+
* @param embeddingDim - Dimension of each embedding
|
|
28
|
+
* @returns Embedded tokens [batchSize, seqLen, embeddingDim]
|
|
29
|
+
*/
|
|
30
|
+
/**
 * Batched embedding lookup (CPU).
 *
 * @param embeddings - Embedding table [vocabSize, embeddingDim], passed as one flat Float32Array.
 * @param tokens - Token indices [batchSize, seqLen] as nested arrays; presumably every row has the same length (TODO confirm).
 * @param embeddingDim - Dimension of each embedding.
 * @returns Embedded tokens [batchSize, seqLen, embeddingDim], flattened into one Float32Array.
 */
export declare function batchedEmbeddingCPU(embeddings: Float32Array, tokens: number[][], embeddingDim: number): Float32Array;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MatMul Operator - GPU-accelerated matrix multiplication
|
|
3
|
+
*/
|
|
4
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
5
|
+
import { Tensor } from "../core/tensor.ts";
|
|
6
|
+
/**
|
|
7
|
+
* Matrix multiplication: C = A @ B
|
|
8
|
+
* @param device WebInfer device
|
|
9
|
+
* @param a Input tensor [M, K]
|
|
10
|
+
* @param b Input tensor [K, N]
|
|
11
|
+
* @returns Output tensor [M, N]
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Matrix multiplication: C = A @ B (GPU).
 *
 * @param device - WebInfer device.
 * @param a - Input tensor [M, K].
 * @param b - Input tensor [K, N]; handling of a mismatched inner dimension is not visible here (TODO confirm).
 * @returns Promise resolving to the output tensor [M, N].
 */
export declare function matmul(device: WebInferDevice, a: Tensor, b: Tensor): Promise<Tensor>;
|
|
14
|
+
/**
|
|
15
|
+
* CPU reference implementation for verification
|
|
16
|
+
*/
|
|
17
|
+
/**
 * CPU reference implementation of matrix multiplication, for verification.
 *
 * NOTE(review): the dimension arguments are ordered (M, N, K) here, whereas the
 * quantized CPU matmuls (`qmatmulInt8CPU` etc.) take (M, K, N) — easy to mix up.
 *
 * @param a - Left matrix as a flat array; presumably [M, K] like the GPU `matmul` (TODO confirm).
 * @param b - Right matrix as a flat array; presumably [K, N] (TODO confirm).
 * @param M - Presumably the row count of the output.
 * @param N - Presumably the column count of the output.
 * @param K - Presumably the shared inner dimension.
 * @returns The product as a flat Float32Array.
 */
export declare function matmulCPU(a: Float32Array, b: Float32Array, M: number, N: number, K: number): Float32Array;
|
|
18
|
+
/**
|
|
19
|
+
* Get compiler cache statistics
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Get compiler cache statistics.
 *
 * @param device - WebInfer device to query.
 * @returns A `CacheStats` value; the type is taken from ../index.ts and its fields are not visible here.
 */
export declare function getMatMulCacheStats(device: WebInferDevice): import("../index.ts").CacheStats;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Normalization Operations
|
|
3
|
+
* LayerNorm and RMSNorm implementations
|
|
4
|
+
*/
|
|
5
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
+
import { Tensor } from "../core/tensor.ts";
|
|
7
|
+
/**
|
|
8
|
+
* Layer Normalization (CPU)
|
|
9
|
+
* Normalizes over the last dimension
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Layer Normalization (CPU). Normalizes over the last dimension.
 *
 * @param x - Input values, interpreted according to `shape`.
 * @param weight - Scale values; presumably one per element of the last dimension (TODO confirm).
 * @param bias - Shift values, or `null`; presumably `null` means no bias is applied (TODO confirm).
 * @param shape - Shape of `x`.
 * @param eps - Optional epsilon; its default value is not visible in this declaration.
 * @returns The normalized values as a Float32Array.
 */
export declare function layerNormCPU(x: Float32Array, weight: Float32Array, bias: Float32Array | null, shape: number[], eps?: number): Float32Array;
|
|
12
|
+
/**
|
|
13
|
+
* RMS Normalization (CPU)
|
|
14
|
+
* Used in Llama and other modern LLMs
|
|
15
|
+
*/
|
|
16
|
+
/**
 * RMS Normalization (CPU). Used in Llama and other modern LLMs.
 *
 * @param x - Input values, interpreted according to `shape`.
 * @param weight - Scale values; presumably one per element of the last dimension (TODO confirm).
 * @param shape - Shape of `x`.
 * @param eps - Optional epsilon; its default value is not visible in this declaration.
 * @returns The normalized values as a Float32Array.
 */
export declare function rmsNormCPU(x: Float32Array, weight: Float32Array, shape: number[], eps?: number): Float32Array;
|
|
17
|
+
/**
|
|
18
|
+
* Layer Normalization (GPU)
|
|
19
|
+
*/
|
|
20
|
+
/**
 * Layer Normalization (GPU).
 *
 * @param device - WebInfer device.
 * @param x - Input tensor.
 * @param weight - Scale tensor.
 * @param bias - Shift tensor, or `null`; presumably `null` means no bias is applied (TODO confirm).
 * @param eps - Optional epsilon; its default value is not visible in this declaration.
 * @returns Promise resolving to the normalized tensor.
 */
export declare function layerNorm(device: WebInferDevice, x: Tensor, weight: Tensor, bias: Tensor | null, eps?: number): Promise<Tensor>;
|
|
21
|
+
/**
|
|
22
|
+
* RMS Normalization (GPU)
|
|
23
|
+
*/
|
|
24
|
+
/**
 * RMS Normalization (GPU).
 *
 * @param device - WebInfer device.
 * @param x - Input tensor.
 * @param weight - Scale tensor.
 * @param eps - Optional epsilon; its default value is not visible in this declaration.
 * @returns Promise resolving to the normalized tensor.
 */
export declare function rmsNorm(device: WebInferDevice, x: Tensor, weight: Tensor, eps?: number): Promise<Tensor>;
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reshape and Transpose Operations
|
|
3
|
+
* Shape manipulation for tensor operations
|
|
4
|
+
*/
|
|
5
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
+
import { Tensor } from "../core/tensor.ts";
|
|
7
|
+
/**
|
|
8
|
+
* Transpose 2D matrix (CPU)
|
|
9
|
+
* [M, N] -> [N, M]
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Transpose 2D matrix (CPU): [M, N] -> [N, M].
 *
 * @param x - Input matrix as a flat array.
 * @param rows - Row count of the input (M).
 * @param cols - Column count of the input (N).
 * @returns The transposed matrix as a flat Float32Array.
 */
export declare function transpose2DCPU(x: Float32Array, rows: number, cols: number): Float32Array;
|
|
12
|
+
/**
|
|
13
|
+
* Transpose last two dimensions (CPU)
|
|
14
|
+
* [..., M, N] -> [..., N, M]
|
|
15
|
+
*/
|
|
16
|
+
/**
 * Transpose last two dimensions (CPU): [..., M, N] -> [..., N, M].
 *
 * @param x - Input values as a flat array.
 * @param shape - Shape of `x`; presumably needs at least two dimensions (TODO confirm).
 * @returns An object with the transposed `data` and its `shape`.
 */
export declare function transposeCPU(x: Float32Array, shape: number[]): {
|
|
17
|
+
data: Float32Array;
|
|
18
|
+
shape: number[];
|
|
19
|
+
};
|
|
20
|
+
/**
|
|
21
|
+
* Reshape tensor (CPU) - just validates and returns new shape
|
|
22
|
+
* Data layout doesn't change, only interpretation
|
|
23
|
+
*/
|
|
24
|
+
/**
 * Reshape tensor (CPU) - validates and returns the new shape.
 * Data layout doesn't change, only its interpretation.
 *
 * @param x - Input values as a flat array.
 * @param oldShape - Current shape of `x`.
 * @param newShape - Requested shape; presumably must describe the same element count (what happens on a mismatch is not visible here — TODO confirm).
 * @returns An object with `data` and the new `shape`; whether `data` is the same array as `x` is not visible here.
 */
export declare function reshapeCPU(x: Float32Array, oldShape: number[], newShape: number[]): {
|
|
25
|
+
data: Float32Array;
|
|
26
|
+
shape: number[];
|
|
27
|
+
};
|
|
28
|
+
/**
|
|
29
|
+
* Transpose 2D matrix (GPU)
|
|
30
|
+
*/
|
|
31
|
+
/**
 * Transpose 2D matrix (GPU).
 *
 * @param device - WebInfer device.
 * @param x - Input tensor; presumably two-dimensional (TODO confirm).
 * @returns Promise resolving to the transposed tensor.
 */
export declare function transpose2D(device: WebInferDevice, x: Tensor): Promise<Tensor>;
|
|
32
|
+
/**
|
|
33
|
+
* Permute dimensions (CPU)
|
|
34
|
+
* Generalized transpose for any dimension ordering
|
|
35
|
+
*/
|
|
36
|
+
/**
 * Permute dimensions (CPU). Generalized transpose for any dimension ordering.
 *
 * @param x - Input values as a flat array.
 * @param shape - Shape of `x`.
 * @param dims - Dimension ordering; presumably a permutation of the indices of `shape` (TODO confirm).
 * @returns An object with the permuted `data` and its `shape`.
 */
export declare function permuteCPU(x: Float32Array, shape: number[], dims: number[]): {
|
|
37
|
+
data: Float32Array;
|
|
38
|
+
shape: number[];
|
|
39
|
+
};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Rotary Position Embedding (RoPE)
|
|
3
|
+
* Used in Llama, Mistral, and most modern LLMs
|
|
4
|
+
*/
|
|
5
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
+
import { Tensor } from "../core/tensor.ts";
|
|
7
|
+
/**
|
|
8
|
+
* RoPE configuration
|
|
9
|
+
*/
|
|
10
|
+
/** RoPE configuration, consumed by `computeRoPEFrequencies` and `rope`. */
export interface RoPEConfig {
|
|
11
|
+
/** Dimension the rotation applies to; presumably the per-head dimension (TODO confirm). */
dim: number;
|
|
12
|
+
/** Maximum sequence length; presumably the number of positions frequencies are precomputed for (TODO confirm). */
maxSeqLen: number;
|
|
13
|
+
/** Optional frequency base; the default is not visible in this declaration. */
base?: number;
|
|
14
|
+
/** Optional scaling value; its exact semantics and default are not visible in this declaration. */
scaling?: number;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Precompute RoPE frequencies
|
|
18
|
+
* Returns cos and sin values for each position and dimension
|
|
19
|
+
*/
|
|
20
|
+
/**
 * Precompute RoPE frequencies.
 * Returns cos and sin values for each position and dimension.
 *
 * @param config - RoPE configuration.
 * @returns An object with flat `cos` and `sin` tables; their exact layout is not visible here (TODO confirm before indexing).
 */
export declare function computeRoPEFrequencies(config: RoPEConfig): {
|
|
21
|
+
cos: Float32Array;
|
|
22
|
+
sin: Float32Array;
|
|
23
|
+
};
|
|
24
|
+
/**
|
|
25
|
+
* Apply RoPE to query/key tensors (CPU implementation)
|
|
26
|
+
* Input shape: [seqLen, numHeads, headDim]
|
|
27
|
+
*/
|
|
28
|
+
/**
 * Apply RoPE to query/key tensors (CPU implementation).
 * Input shape: [seqLen, numHeads, headDim].
 *
 * @param x - Input values as a flat array with the shape above.
 * @param positions - Position indices; presumably one per sequence element (TODO confirm).
 * @param cos - Cosine table; presumably as produced by `computeRoPEFrequencies` (TODO confirm).
 * @param sin - Sine table; presumably as produced by `computeRoPEFrequencies` (TODO confirm).
 * @param seqLen - Sequence length.
 * @param numHeads - Number of heads.
 * @param headDim - Dimension of each head.
 * @returns The rotated values as a Float32Array.
 */
export declare function ropeCPU(x: Float32Array, positions: number[], cos: Float32Array, sin: Float32Array, seqLen: number, numHeads: number, headDim: number): Float32Array;
|
|
29
|
+
/**
|
|
30
|
+
* Apply RoPE to query/key tensor (GPU implementation)
|
|
31
|
+
*/
|
|
32
|
+
/**
 * Apply RoPE to query/key tensor (GPU implementation).
 *
 * @param device - WebInfer device.
 * @param x - Input tensor; presumably [seqLen, numHeads, headDim] like `ropeCPU` (TODO confirm).
 * @param positions - Position indices tensor.
 * @param config - RoPE configuration.
 * @returns Promise resolving to the rotated tensor.
 */
export declare function rope(device: WebInferDevice, x: Tensor, positions: Tensor, config: RoPEConfig): Promise<Tensor>;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Softmax Operation
|
|
3
|
+
* GPU-accelerated softmax for attention and output layers
|
|
4
|
+
*/
|
|
5
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
+
import { Tensor } from "../core/tensor.ts";
|
|
7
|
+
/**
|
|
8
|
+
* Softmax (CPU) - operates along last dimension
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Softmax (CPU) - operates along the last dimension.
 *
 * @param x - Input values as a flat array.
 * @param shape - Shape of `x`; the last entry is the softmax axis length.
 * @returns The softmax values as a Float32Array.
 */
export declare function softmaxCPU(x: Float32Array, shape: number[]): Float32Array;
|
|
11
|
+
/**
|
|
12
|
+
* Log-Softmax (CPU) - more numerically stable for cross-entropy loss
|
|
13
|
+
*/
|
|
14
|
+
/**
 * Log-Softmax (CPU) - more numerically stable for cross-entropy loss.
 *
 * @param x - Input values as a flat array.
 * @param shape - Shape of `x`; presumably reduced along the last dimension like `softmaxCPU` (TODO confirm).
 * @returns The log-softmax values as a Float32Array.
 */
export declare function logSoftmaxCPU(x: Float32Array, shape: number[]): Float32Array;
|
|
15
|
+
/**
|
|
16
|
+
* Softmax (GPU) - operates along last dimension
|
|
17
|
+
*/
|
|
18
|
+
/**
 * Softmax (GPU) - operates along the last dimension.
 *
 * NOTE(review): unlike the other GPU entry points in ops (`add`, `matmul`, `layerNorm`, ...),
 * this one carries a `GPU` suffix.
 *
 * @param device - WebInfer device.
 * @param x - Input tensor.
 * @returns Promise resolving to the softmax tensor.
 */
export declare function softmaxGPU(device: WebInferDevice, x: Tensor): Promise<Tensor>;
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Quantization Module
|
|
3
|
+
* INT4 and INT8 quantization for efficient LLM inference
|
|
4
|
+
*/
|
|
5
|
+
export { quantizeToInt8, quantizeToInt4, dequantizeInt8, dequantizeInt4, quantizationError, getMemorySavings, type QuantConfig, type QuantizedTensor, } from "./quantize.ts";
|
|
6
|
+
export { qmatmulInt8CPU, qmatmulInt4CPU, qmatmulInt8BlockCPU, estimateQMatMulFlops, estimateQMatMulBandwidth, } from "./qmatmul.ts";
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Quantized Matrix Multiplication
|
|
3
|
+
* INT4 and INT8 matmul for efficient LLM inference
|
|
4
|
+
*/
|
|
5
|
+
import type { QuantizedTensor } from "./quantize.ts";
|
|
6
|
+
/**
|
|
7
|
+
* INT8 MatMul (CPU): float32 activations × int8 weights
|
|
8
|
+
* A: [M, K] float32 activations
|
|
9
|
+
* B: QuantizedTensor [K, N] int8 weights
|
|
10
|
+
* Output: [M, N] float32
|
|
11
|
+
*/
|
|
12
|
+
/**
 * INT8 MatMul (CPU): float32 activations × int8 weights.
 *
 * @param A - Activations [M, K] as a flat Float32Array.
 * @param B - Quantized int8 weights [K, N].
 * @param M - Rows of `A` and of the output.
 * @param K - Shared inner dimension.
 * @param N - Columns of `B` and of the output.
 * @returns Output [M, N] as a flat Float32Array.
 */
export declare function qmatmulInt8CPU(A: Float32Array, B: QuantizedTensor, M: number, K: number, N: number): Float32Array;
|
|
13
|
+
/**
|
|
14
|
+
* INT4 MatMul (CPU): float activations × int4 weights
|
|
15
|
+
* A: [M, K] float32 activations
|
|
16
|
+
* B: QuantizedTensor [K, N] int4 weights (packed)
|
|
17
|
+
* Output: [M, N] float32
|
|
18
|
+
*/
|
|
19
|
+
/**
 * INT4 MatMul (CPU): float activations × int4 weights.
 *
 * @param A - Activations [M, K] as a flat Float32Array.
 * @param B - Quantized int4 weights [K, N] (packed).
 * @param M - Rows of `A` and of the output.
 * @param K - Shared inner dimension.
 * @param N - Columns of `B` and of the output.
 * @returns Output [M, N] as a flat Float32Array.
 */
export declare function qmatmulInt4CPU(A: Float32Array, B: QuantizedTensor, M: number, K: number, N: number): Float32Array;
|
|
20
|
+
/**
|
|
21
|
+
* Optimized INT8 MatMul with block accumulation
|
|
22
|
+
* Uses SIMD-friendly access patterns
|
|
23
|
+
*/
|
|
24
|
+
/**
 * Optimized INT8 MatMul with block accumulation.
 * Uses SIMD-friendly access patterns.
 *
 * @param A - Activations; presumably [M, K] like `qmatmulInt8CPU` (TODO confirm).
 * @param B - Quantized weights; presumably [K, N] int8 (TODO confirm).
 * @param M - Rows of the output.
 * @param K - Shared inner dimension.
 * @param N - Columns of the output.
 * @param blockSize - Optional block size; its default and which dimension it tiles are not visible in this declaration.
 * @returns Output as a flat Float32Array.
 */
export declare function qmatmulInt8BlockCPU(A: Float32Array, B: QuantizedTensor, M: number, K: number, N: number, blockSize?: number): Float32Array;
|
|
25
|
+
/**
|
|
26
|
+
* Estimate the number of floating-point operations (FLOPs) for a quantized matmul
|
|
27
|
+
*/
|
|
28
|
+
/**
 * Estimate the floating-point work of a quantized matmul.
 *
 * @param M - Rows of the output.
 * @param K - Shared inner dimension.
 * @param N - Columns of the output.
 * @returns A single number; presumably an operation count rather than a per-second rate, since no timing input is taken (TODO confirm).
 */
export declare function estimateQMatMulFlops(M: number, K: number, N: number): number;
|
|
29
|
+
/**
|
|
30
|
+
* Estimate memory bandwidth for quantized matmul
|
|
31
|
+
*/
|
|
32
|
+
/**
 * Estimate memory bandwidth for quantized matmul.
 *
 * @param M - Rows of the output.
 * @param K - Shared inner dimension.
 * @param N - Columns of the output.
 * @param bits - Weight bit width; only 4 or 8 are accepted by the type.
 * @param groupSize - Quantization group size; presumably determines how many scales are counted (TODO confirm).
 * @returns A per-category byte breakdown plus `totalBytes`; presumably the sum of the other fields (TODO confirm).
 */
export declare function estimateQMatMulBandwidth(M: number, K: number, N: number, bits: 4 | 8, groupSize: number): {
|
|
33
|
+
activationBytes: number;
|
|
34
|
+
weightBytes: number;
|
|
35
|
+
scaleBytes: number;
|
|
36
|
+
outputBytes: number;
|
|
37
|
+
totalBytes: number;
|
|
38
|
+
};
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Quantization Operations
|
|
3
|
+
* INT4 and INT8 quantization for memory-efficient LLM inference
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Quantization configuration
|
|
7
|
+
*/
|
|
8
|
+
/** Quantization configuration, stored on every `QuantizedTensor`. */
export interface QuantConfig {
|
|
9
|
+
/** Bit width of the quantized values; the type allows only 4 or 8. */
bits: 4 | 8;
|
|
10
|
+
/** Group size; presumably the number of consecutive elements sharing one scale (TODO confirm). */
groupSize: number;
|
|
11
|
+
/** Whether symmetric quantization is used; the exact effect on `zeros` is not visible in this declaration. */
symmetric: boolean;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Quantized tensor representation
|
|
15
|
+
*/
|
|
16
|
+
/** Quantized tensor representation shared by the INT4 and INT8 paths. */
export interface QuantizedTensor {
|
|
17
|
+
/** Quantized payload bytes; the INT4 path packs two values per byte (per the `quantizeToInt4` comment). */
data: Uint8Array;
|
|
18
|
+
/** Scale factors; presumably one per quantization group (TODO confirm). */
scales: Float32Array;
|
|
19
|
+
/** Zero points, or `null`; presumably `null` for symmetric quantization (TODO confirm). */
zeros: Float32Array | null;
|
|
20
|
+
/** Tensor shape; how the quantize functions populate it from a flat input is not visible here. */
shape: number[];
|
|
21
|
+
/** Configuration the tensor was quantized with. */
config: QuantConfig;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Quantize Float32 tensor to INT8 (CPU)
|
|
25
|
+
*/
|
|
26
|
+
/**
 * Quantize Float32 tensor to INT8 (CPU).
 *
 * @param x - Values to quantize.
 * @param groupSize - Optional group size; its default is not visible in this declaration.
 * @param symmetric - Optional symmetric-quantization flag; its default is not visible in this declaration.
 * @returns The quantized tensor.
 */
export declare function quantizeToInt8(x: Float32Array, groupSize?: number, symmetric?: boolean): QuantizedTensor;
|
|
27
|
+
/**
|
|
28
|
+
* Quantize Float32 tensor to INT4 (CPU)
|
|
29
|
+
* Packs two INT4 values per byte
|
|
30
|
+
*/
|
|
31
|
+
/**
 * Quantize Float32 tensor to INT4 (CPU).
 * Packs two INT4 values per byte.
 *
 * @param x - Values to quantize; handling of an odd element count is not visible here (TODO confirm).
 * @param groupSize - Optional group size; its default is not visible in this declaration.
 * @param symmetric - Optional symmetric-quantization flag; its default is not visible in this declaration.
 * @returns The quantized tensor.
 */
export declare function quantizeToInt4(x: Float32Array, groupSize?: number, symmetric?: boolean): QuantizedTensor;
|
|
32
|
+
/**
|
|
33
|
+
* Dequantize INT8 tensor back to Float32 (CPU)
|
|
34
|
+
*/
|
|
35
|
+
/**
 * Dequantize INT8 tensor back to Float32 (CPU).
 *
 * @param qt - Quantized tensor; presumably one produced with `bits: 8` (whether this is checked is not visible here).
 * @returns The reconstructed values as a Float32Array.
 */
export declare function dequantizeInt8(qt: QuantizedTensor): Float32Array;
|
|
36
|
+
/**
|
|
37
|
+
* Dequantize INT4 tensor back to Float32 (CPU)
|
|
38
|
+
*/
|
|
39
|
+
/**
 * Dequantize INT4 tensor back to Float32 (CPU).
 *
 * @param qt - Quantized tensor; presumably one produced with `bits: 4` (whether this is checked is not visible here).
 * @returns The reconstructed values as a Float32Array.
 */
export declare function dequantizeInt4(qt: QuantizedTensor): Float32Array;
|
|
40
|
+
/**
|
|
41
|
+
* Compute quantization error (MSE)
|
|
42
|
+
*/
|
|
43
|
+
/**
 * Compute quantization error (MSE).
 *
 * @param original - Values before quantization.
 * @param reconstructed - Values after a quantize/dequantize round trip; presumably the same length as `original` (TODO confirm).
 * @returns The mean squared error between the two arrays.
 */
export declare function quantizationError(original: Float32Array, reconstructed: Float32Array): number;
|
|
44
|
+
/**
|
|
45
|
+
* Get memory savings from quantization
|
|
46
|
+
*/
|
|
47
|
+
/**
 * Get memory savings from quantization.
 *
 * @param originalBytes - Size in bytes of the unquantized data.
 * @param qt - Quantized tensor to compare against.
 * @returns A size comparison. The units of `savings` (bytes vs. fraction) and the direction of `ratio`
 *          are not visible in this declaration — TODO confirm before displaying them.
 */
export declare function getMemorySavings(originalBytes: number, qt: QuantizedTensor): {
|
|
48
|
+
originalBytes: number;
|
|
49
|
+
quantizedBytes: number;
|
|
50
|
+
savings: number;
|
|
51
|
+
ratio: number;
|
|
52
|
+
};
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sampling Module Exports
|
|
3
|
+
*/
|
|
4
|
+
export { topK, topKCPU, topKFilter } from "./top-k.ts";
|
|
5
|
+
export { topPFilter, topPFilterCPU } from "./top-p.ts";
|
|
6
|
+
export { sample, sampleCPU, sampleGreedy, sampleFromProbs, softmax, applyRepetitionPenalty, type SamplingConfig, } from "./sampler.ts";
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token Sampler - Combines filtering and sampling strategies
|
|
3
|
+
*/
|
|
4
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
5
|
+
import { Tensor } from "../core/tensor.ts";
|
|
6
|
+
/**
|
|
7
|
+
* Sampling configuration
|
|
8
|
+
*/
|
|
9
|
+
/** Sampling configuration. Every field is optional; none of the defaults are visible in this declaration. */
export interface SamplingConfig {
|
|
10
|
+
/** Sampling temperature. */
temperature?: number;
|
|
11
|
+
/** Top-k cutoff; presumably the number of highest-scoring tokens kept (TODO confirm). */
topK?: number;
|
|
12
|
+
/** Top-p (nucleus) threshold; presumably a cumulative probability in (0, 1] (TODO confirm). */
topP?: number;
|
|
13
|
+
/** Repetition penalty; presumably forwarded to `applyRepetitionPenalty` (TODO confirm). */
repetitionPenalty?: number;
|
|
14
|
+
/** Seed; presumably makes sampling reproducible (TODO confirm). */
seed?: number;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Apply softmax to logits
|
|
18
|
+
*/
|
|
19
|
+
/**
 * Apply softmax to logits.
 *
 * NOTE(review): this takes a single flat array with no shape argument, unlike `softmaxCPU` in ops/softmax.
 *
 * @param logits - Logits; presumably treated as one distribution over the whole array (TODO confirm).
 * @returns The probabilities as a Float32Array.
 */
export declare function softmax(logits: Float32Array): Float32Array;
|
|
20
|
+
/**
|
|
21
|
+
* Apply repetition penalty to logits
|
|
22
|
+
*/
|
|
23
|
+
/**
 * Apply repetition penalty to logits.
 *
 * @param logits - Logits to penalize.
 * @param previousTokens - Token ids already generated; presumably used as indices into `logits` (TODO confirm).
 * @param penalty - Penalty strength; the exact formula is not visible in this declaration.
 * @returns The penalized logits; whether `logits` is modified in place or copied is not visible here (TODO confirm).
 */
export declare function applyRepetitionPenalty(logits: Float32Array, previousTokens: number[], penalty: number): Float32Array;
|
|
24
|
+
/**
|
|
25
|
+
* Sample a token from probability distribution
|
|
26
|
+
*/
|
|
27
|
+
/**
 * Sample a token from probability distribution.
 *
 * @param probs - Probabilities; presumably expected to sum to 1 (TODO confirm).
 * @param random - Optional random source; presumably returns values in [0, 1), and the fallback used when omitted is not visible here.
 * @returns The sampled token index.
 */
export declare function sampleFromProbs(probs: Float32Array, random?: () => number): number;
|
|
28
|
+
/**
|
|
29
|
+
* Greedy sampling (argmax)
|
|
30
|
+
*/
|
|
31
|
+
/**
 * Greedy sampling (argmax).
 *
 * @param logits - Logits to pick from.
 * @returns The index of the largest logit; tie-breaking and empty-input behaviour are not visible in this declaration.
 */
export declare function sampleGreedy(logits: Float32Array): number;
|
|
32
|
+
/**
|
|
33
|
+
* Full sampling pipeline
|
|
34
|
+
*/
|
|
35
|
+
/**
 * Full sampling pipeline.
 *
 * @param device - WebInfer device.
 * @param logits - Logits tensor.
 * @param config - Optional sampling configuration; defaults are not visible in this declaration.
 * @param previousTokens - Optional previously generated token ids; presumably used for the repetition penalty (TODO confirm).
 * @returns Promise resolving to the sampled token id.
 */
export declare function sample(device: WebInferDevice, logits: Tensor, config?: SamplingConfig, previousTokens?: number[]): Promise<number>;
|
|
36
|
+
/**
|
|
37
|
+
* CPU-only sampling (no GPU tensors)
|
|
38
|
+
*/
|
|
39
|
+
/**
 * CPU-only sampling (no GPU tensors).
 *
 * @param logits - Logits as a flat array.
 * @param config - Optional sampling configuration; defaults are not visible in this declaration.
 * @param previousTokens - Optional previously generated token ids; presumably used for the repetition penalty (TODO confirm).
 * @returns The sampled token id.
 */
export declare function sampleCPU(logits: Float32Array, config?: SamplingConfig, previousTokens?: number[]): number;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Top-K Sampling - GPU-accelerated top-k selection
|
|
3
|
+
*/
|
|
4
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
5
|
+
import { Tensor } from "../core/tensor.ts";
|
|
6
|
+
/**
|
|
7
|
+
* GPU-accelerated Top-K selection
|
|
8
|
+
* Returns the top K values and their indices from logits
|
|
9
|
+
*/
|
|
10
|
+
/**
 * GPU-accelerated Top-K selection.
 * Returns the top K values and their indices from logits.
 *
 * @param device - WebInfer device.
 * @param logits - Logits tensor.
 * @param k - Number of entries to select; behaviour when `k` exceeds the logits length is not visible here (TODO confirm).
 * @returns Promise resolving to the selected `values` and their `indices`; their ordering is not visible here.
 */
export declare function topK(device: WebInferDevice, logits: Tensor, k: number): Promise<{
|
|
11
|
+
values: Tensor;
|
|
12
|
+
indices: Tensor;
|
|
13
|
+
}>;
|
|
14
|
+
/**
|
|
15
|
+
* CPU reference implementation of top-k selection
|
|
16
|
+
*/
|
|
17
|
+
/**
 * CPU reference implementation of top-k selection.
 *
 * @param logits - Logits as a flat array.
 * @param k - Number of entries to select.
 * @param vocabSize - Vocabulary size; presumably the number of leading entries of `logits` to consider (TODO confirm).
 * @returns The selected `values` and their `indices`; their ordering is not visible in this declaration.
 */
export declare function topKCPU(logits: Float32Array, k: number, vocabSize: number): {
|
|
18
|
+
values: Float32Array;
|
|
19
|
+
indices: Uint32Array;
|
|
20
|
+
};
|
|
21
|
+
/**
|
|
22
|
+
* Apply top-k filtering to logits (set non-top-k to -inf)
|
|
23
|
+
*/
|
|
24
|
+
/**
 * Apply top-k filtering to logits (set non-top-k to -inf).
 *
 * @param device - WebInfer device.
 * @param logits - Logits tensor.
 * @param k - Number of highest logits to keep.
 * @returns Promise resolving to the filtered logits tensor; whether the input tensor is reused is not visible here.
 */
export declare function topKFilter(device: WebInferDevice, logits: Tensor, k: number): Promise<Tensor>;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Top-P (Nucleus) Sampling
|
|
3
|
+
*/
|
|
4
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
5
|
+
import { Tensor } from "../core/tensor.ts";
|
|
6
|
+
/**
|
|
7
|
+
* Apply top-p (nucleus) filtering to logits
|
|
8
|
+
* Keeps the smallest set of tokens whose cumulative probability >= p
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Apply top-p (nucleus) filtering to logits.
 * Keeps the smallest set of tokens whose cumulative probability >= p.
 *
 * @param device - WebInfer device.
 * @param logits - Logits tensor.
 * @param p - Cumulative probability threshold.
 * @param temperature - Optional temperature; presumably applied before probabilities are computed, default not visible here (TODO confirm).
 * @returns Promise resolving to the filtered logits tensor; how removed tokens are marked is not visible in this declaration.
 */
export declare function topPFilter(device: WebInferDevice, logits: Tensor, p: number, temperature?: number): Promise<Tensor>;
|
|
11
|
+
/**
|
|
12
|
+
* CPU implementation of top-p filtering
|
|
13
|
+
*/
|
|
14
|
+
/**
 * CPU implementation of top-p filtering.
 *
 * @param logits - Logits as a flat array.
 * @param p - Cumulative probability threshold.
 * @param temperature - Optional temperature; presumably applied before probabilities are computed, default not visible here (TODO confirm).
 * @returns The filtered logits; how removed tokens are marked and whether `logits` is modified in place are not visible here.
 */
export declare function topPFilterCPU(logits: Float32Array, p: number, temperature?: number): Float32Array;
|
package/package.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "webinfer",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "High-performance LLM inference kernels for WebGPU",
|
|
5
|
+
"license": "Apache-2.0",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/guan404ming/webinfer"
|
|
9
|
+
},
|
|
10
|
+
"keywords": [
|
|
11
|
+
"webgpu",
|
|
12
|
+
"llm",
|
|
13
|
+
"inference",
|
|
14
|
+
"gpu",
|
|
15
|
+
"machine-learning",
|
|
16
|
+
"transformer",
|
|
17
|
+
"attention",
|
|
18
|
+
"matmul"
|
|
19
|
+
],
|
|
20
|
+
"type": "module",
|
|
21
|
+
"main": "./dist/index.js",
|
|
22
|
+
"module": "./dist/index.js",
|
|
23
|
+
"types": "./dist/index.d.ts",
|
|
24
|
+
"exports": {
|
|
25
|
+
".": {
|
|
26
|
+
"types": "./dist/index.d.ts",
|
|
27
|
+
"import": "./dist/index.js"
|
|
28
|
+
}
|
|
29
|
+
},
|
|
30
|
+
"files": [
|
|
31
|
+
"dist",
|
|
32
|
+
"README.md",
|
|
33
|
+
"LICENSE"
|
|
34
|
+
],
|
|
35
|
+
"scripts": {
|
|
36
|
+
"dev": "bun --hot benchmarks/server.ts",
|
|
37
|
+
"bench": "open http://localhost:3000 && bun --hot benchmarks/server.ts",
|
|
38
|
+
"build": "bun run build:types && bun run build:js",
|
|
39
|
+
"build:types": "tsc -p tsconfig.build.json",
|
|
40
|
+
"build:js": "bun build ./src/index.ts --outdir ./dist --target browser --format esm",
|
|
41
|
+
"test": "bun test",
|
|
42
|
+
"typecheck": "tsc --noEmit",
|
|
43
|
+
"prepublishOnly": "bun run build"
|
|
44
|
+
},
|
|
45
|
+
"devDependencies": {
|
|
46
|
+
"@types/bun": "latest",
|
|
47
|
+
"@webgpu/types": "^0.1.49",
|
|
48
|
+
"onnxruntime-web": "^1.20.1",
|
|
49
|
+
"typescript": "^5.3.0"
|
|
50
|
+
},
|
|
51
|
+
"peerDependencies": {
|
|
52
|
+
"@webgpu/types": "^0.1.0"
|
|
53
|
+
}
|
|
54
|
+
}
|