webinfer 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@
4
4
  */
5
5
  import type { WebInferDevice } from "../core/device.ts";
6
6
  import type { LoadedModel } from "../model/types.ts";
7
- import type { ModelConfig, InferenceConfig, ForwardResult } from "./types.ts";
7
+ import type { ForwardResult, InferenceConfig, ModelConfig } from "./types.ts";
8
8
  /**
9
9
  * Inference Engine
10
10
  * Manages model weights and provides forward pass functionality
@@ -2,6 +2,6 @@
2
2
  * Inference Module
3
3
  * High-level API for LLM inference
4
4
  */
5
- export { type ModelConfig, type InferenceConfig, type GenerationConfig, type GenerationResult, type StreamToken, type FinishReason, type ForwardResult, type ModelWeights, type LayerWeights, DEFAULT_GENERATION_CONFIG, normalizeGenerationConfig, } from "./types.ts";
6
5
  export { InferenceEngine } from "./engine.ts";
7
6
  export { generate, generateStream, greedyDecode, sampleNextToken, } from "./generate.ts";
7
+ export { DEFAULT_GENERATION_CONFIG, type FinishReason, type ForwardResult, type GenerationConfig, type GenerationResult, type InferenceConfig, type LayerWeights, type ModelConfig, type ModelWeights, normalizeGenerationConfig, type StreamToken, } from "./types.ts";
@@ -1,8 +1,8 @@
1
1
  /**
2
2
  * WGSL Compiler - Generates optimized GPU kernels
3
3
  */
4
- import { KernelCache } from "./kernel-cache.ts";
5
4
  import type { DeviceInfo } from "../core/device.ts";
5
+ import type { KernelCache } from "./kernel-cache.ts";
6
6
  export interface MatMulConfig {
7
7
  M: number;
8
8
  N: number;
@@ -11,7 +11,7 @@
11
11
  * - Padding to alignment
12
12
  * - Tensor data
13
13
  */
14
- import { type GGUFTensorInfo, type TensorInfo, type LoadedModel, type LoadOptions, GGUFQuantType } from "./types.ts";
14
+ import { GGUFQuantType, type GGUFTensorInfo, type LoadedModel, type LoadOptions, type TensorInfo } from "./types.ts";
15
15
  /**
16
16
  * Reader helper for GGUF binary format
17
17
  */
@@ -2,10 +2,10 @@
2
2
  * Model Loading Module
3
3
  * Supports SafeTensors and GGUF formats
4
4
  */
5
- import type { ModelFormat, LoadOptions, LoadedModel } from "./types.ts";
6
- export { type ModelFormat, type SafetensorsDType, GGUFQuantType, GGUFMetadataValueType, type TensorInfo, type SafetensorsHeader, type SafetensorsHeaderEntry, type ModelMetadata, type GGUFTensorInfo, type LoadedTensor, type LoadedModel, type LoadOptions, SAFETENSORS_DTYPE_BYTES, GGUF_QUANT_BLOCK_SIZE, GGUF_QUANT_BYTES_PER_BLOCK, } from "./types.ts";
7
- export { parseSafetensorsHeader, getSafetensorsTensorInfos, loadSafetensorsTensor, loadSafetensors, loadSafetensorsFromUrl, isSafetensors, } from "./safetensors.ts";
8
- export { type GGUFHeader, parseGGUFHeader, parseGGUFMetadata, parseGGUFTensorInfos, calculateGGUFTensorBytes, loadGGUF, loadGGUFFromUrl, loadGGUFTensor, dequantizeQ4_0Block, dequantizeQ8_0Block, isGGUF, } from "./gguf.ts";
5
+ import type { LoadedModel, LoadOptions, ModelFormat } from "./types.ts";
6
+ export { calculateGGUFTensorBytes, dequantizeQ4_0Block, dequantizeQ8_0Block, type GGUFHeader, isGGUF, loadGGUF, loadGGUFFromUrl, loadGGUFTensor, parseGGUFHeader, parseGGUFMetadata, parseGGUFTensorInfos, } from "./gguf.ts";
7
+ export { getSafetensorsTensorInfos, isSafetensors, loadSafetensors, loadSafetensorsFromUrl, loadSafetensorsTensor, parseSafetensorsHeader, } from "./safetensors.ts";
8
+ export { GGUF_QUANT_BLOCK_SIZE, GGUF_QUANT_BYTES_PER_BLOCK, GGUFMetadataValueType, GGUFQuantType, type GGUFTensorInfo, type LoadedModel, type LoadedTensor, type LoadOptions, type ModelFormat, type ModelMetadata, SAFETENSORS_DTYPE_BYTES, type SafetensorsDType, type SafetensorsHeader, type SafetensorsHeaderEntry, type TensorInfo, } from "./types.ts";
9
9
  /**
10
10
  * Auto-detect model format from buffer
11
11
  */
@@ -6,7 +6,7 @@
6
6
  * - N bytes: JSON header (UTF-8)
7
7
  * - Remaining: tensor data (contiguous)
8
8
  */
9
- import { type SafetensorsHeader, type TensorInfo, type LoadedModel, type LoadOptions } from "./types.ts";
9
+ import { type LoadedModel, type LoadOptions, type SafetensorsHeader, type TensorInfo } from "./types.ts";
10
10
  /**
11
11
  * Parse the SafeTensors header from a buffer
12
12
  * @param buffer - ArrayBuffer containing the SafeTensors file
@@ -22,3 +22,42 @@ export declare function layerNorm(device: WebInferDevice, x: Tensor, weight: Ten
22
22
  * RMS Normalization (GPU)
23
23
  */
24
24
  export declare function rmsNorm(device: WebInferDevice, x: Tensor, weight: Tensor, eps?: number): Promise<Tensor>;
25
+ /**
26
+ * Fused Add + RMS Normalization (CPU)
27
+ * Computes: output = rmsNorm(input + residual)
28
+ * Also updates residual in-place: residual = input + residual
29
+ */
30
+ export declare function fusedAddRmsNormCPU(input: Float32Array, residual: Float32Array, weight: Float32Array, shape: number[], eps?: number): {
31
+ output: Float32Array;
32
+ residual: Float32Array;
33
+ };
34
+ /**
35
+ * Fused Add + RMS Normalization (GPU)
36
+ */
37
+ export declare function fusedAddRmsNorm(device: WebInferDevice, input: Tensor, residual: Tensor, weight: Tensor, eps?: number): Promise<{
38
+ output: Tensor;
39
+ residual: Tensor;
40
+ }>;
41
+ /**
42
+ * Gemma-style RMS Normalization (CPU)
43
+ * Uses (1 + weight) instead of weight: output = x * invRms * (1 + weight)
44
+ */
45
+ export declare function gemmaRmsNormCPU(x: Float32Array, weight: Float32Array, shape: number[], eps?: number): Float32Array;
46
+ /**
47
+ * Gemma-style RMS Normalization (GPU)
48
+ */
49
+ export declare function gemmaRmsNorm(device: WebInferDevice, x: Tensor, weight: Tensor, eps?: number): Promise<Tensor>;
50
+ /**
51
+ * Gemma-style Fused Add + RMS Normalization (CPU)
52
+ */
53
+ export declare function gemmaFusedAddRmsNormCPU(input: Float32Array, residual: Float32Array, weight: Float32Array, shape: number[], eps?: number): {
54
+ output: Float32Array;
55
+ residual: Float32Array;
56
+ };
57
+ /**
58
+ * Gemma-style Fused Add + RMS Normalization (GPU)
59
+ */
60
+ export declare function gemmaFusedAddRmsNorm(device: WebInferDevice, input: Tensor, residual: Tensor, weight: Tensor, eps?: number): Promise<{
61
+ output: Tensor;
62
+ residual: Tensor;
63
+ }>;
@@ -2,5 +2,5 @@
2
2
  * Quantization Module
3
3
  * INT4 and INT8 quantization for efficient LLM inference
4
4
  */
5
- export { quantizeToInt8, quantizeToInt4, dequantizeInt8, dequantizeInt4, quantizationError, getMemorySavings, type QuantConfig, type QuantizedTensor, } from "./quantize.ts";
6
- export { qmatmulInt8CPU, qmatmulInt4CPU, qmatmulInt8BlockCPU, estimateQMatMulFlops, estimateQMatMulBandwidth, } from "./qmatmul.ts";
5
+ export { estimateQMatMulBandwidth, estimateQMatMulFlops } from "./qmatmul.ts";
6
+ export { dequantizeInt4, dequantizeInt8, getMemorySavings, type QuantConfig, type QuantizedTensor, quantizationError, quantizeToInt4, quantizeToInt8, } from "./quantize.ts";
@@ -0,0 +1,87 @@
1
+ /**
2
+ * Beam Search Decoding
3
+ * Maintains top-k hypotheses during generation
4
+ */
5
+ /**
6
+ * Beam search configuration
7
+ */
8
+ export interface BeamSearchConfig {
9
+ /** Number of beams to maintain */
10
+ beamWidth: number;
11
+ /** Maximum sequence length */
12
+ maxLength: number;
13
+ /** Length penalty (>1 favors longer, <1 favors shorter) */
14
+ lengthPenalty?: number;
15
+ /** Early stopping when all beams hit EOS */
16
+ earlyStopping?: boolean;
17
+ /** EOS token ID */
18
+ eosTokenId?: number;
19
+ /** Minimum length before EOS allowed */
20
+ minLength?: number;
21
+ /** Number of beams to return */
22
+ numReturn?: number;
23
+ /** Diversity penalty for diverse beam search */
24
+ diversityPenalty?: number;
25
+ /** Number of groups for diverse beam search */
26
+ numBeamGroups?: number;
27
+ }
28
+ /**
29
+ * Single beam hypothesis
30
+ */
31
+ export interface BeamHypothesis {
32
+ /** Token sequence */
33
+ tokens: number[];
34
+ /** Log probability score */
35
+ score: number;
36
+ /** Normalized score (with length penalty) */
37
+ normalizedScore: number;
38
+ /** Whether sequence is complete (hit EOS) */
39
+ isComplete: boolean;
40
+ }
41
+ /**
42
+ * Beam search result
43
+ */
44
+ export interface BeamSearchResult {
45
+ /** Best hypotheses sorted by score */
46
+ hypotheses: BeamHypothesis[];
47
+ /** Number of steps taken */
48
+ numSteps: number;
49
+ }
50
+ /**
51
+ * Beam Search decoder
52
+ */
53
+ export declare class BeamSearch {
54
+ private config;
55
+ constructor(config: BeamSearchConfig);
56
+ /**
57
+ * Run beam search
58
+ *
59
+ * @param scoreFn Function that returns log probabilities for next token
60
+ * @param inputTokens Initial tokens (prompt)
61
+ */
62
+ search(scoreFn: (tokens: number[]) => Promise<Float32Array>, inputTokens: number[]): Promise<BeamSearchResult>;
63
+ /**
64
+ * Apply length penalty to score
65
+ */
66
+ private normalizeScore;
67
+ }
68
+ /**
69
+ * Diverse Beam Search
70
+ * Groups beams and penalizes similarity between groups
71
+ */
72
+ export declare class DiverseBeamSearch {
73
+ private config;
74
+ constructor(config: BeamSearchConfig);
75
+ /**
76
+ * Run diverse beam search
77
+ */
78
+ search(scoreFn: (tokens: number[]) => Promise<Float32Array>, inputTokens: number[]): Promise<BeamSearchResult>;
79
+ private normalizeScore;
80
+ }
81
+ /**
82
+ * Constrained beam search with prefix/suffix constraints
83
+ */
84
+ export declare function constrainedBeamSearch(beams: BeamHypothesis[], constraints: {
85
+ mustInclude?: number[][];
86
+ mustNotInclude?: number[][];
87
+ }): BeamHypothesis[];
@@ -1,6 +1,6 @@
1
1
  /**
2
2
  * Sampling Module Exports
3
3
  */
4
- export { topK, topKCPU, topKFilter } from "./top-k.ts";
5
- export { topPFilter, topPFilterCPU } from "./top-p.ts";
6
- export { sample, sampleCPU, sampleGreedy, sampleFromProbs, softmax, applyRepetitionPenalty, type SamplingConfig, } from "./sampler.ts";
4
+ export { topK, topKFilter } from "./top-k.ts";
5
+ export { topPFilter } from "./top-p.ts";
6
+ export { sample, sampleGreedy, sampleFromProbs, softmax, applyRepetitionPenalty, minPSamplingFromProbs, topKSamplingFromProbs, topPSamplingFromProbs, topKTopPSamplingFromProbs, topKTopPSamplingFromLogits, topPRenormProbs, topKRenormProbs, topKMaskLogits, type SamplingConfig, } from "./sampler.ts";
@@ -33,6 +33,39 @@ export declare function sampleGreedy(logits: Float32Array): number;
33
33
  * Full sampling pipeline
34
34
  */
35
35
  export declare function sample(device: WebInferDevice, logits: Tensor, config?: SamplingConfig, previousTokens?: number[]): Promise<number>;
36
+ /**
37
+ * Min-P sampling from probabilities
38
+ * Keeps tokens with probability >= min_p * max_prob
39
+ */
40
+ export declare function minPSamplingFromProbs(probs: Float32Array, minP: number, random?: () => number): number;
41
+ /**
42
+ * Top-K sampling from probabilities
43
+ */
44
+ export declare function topKSamplingFromProbs(probs: Float32Array, topK: number, random?: () => number): number;
45
+ /**
46
+ * Top-P sampling from probabilities
47
+ */
48
+ export declare function topPSamplingFromProbs(probs: Float32Array, topP: number, random?: () => number): number;
49
+ /**
50
+ * Combined Top-K and Top-P sampling from probabilities
51
+ */
52
+ export declare function topKTopPSamplingFromProbs(probs: Float32Array, topK: number, topP: number, random?: () => number): number;
53
+ /**
54
+ * Combined Top-K and Top-P sampling from logits
55
+ */
56
+ export declare function topKTopPSamplingFromLogits(logits: Float32Array, topK: number, topP: number, temperature?: number, random?: () => number): number;
57
+ /**
58
+ * Renormalize probabilities by top-p thresholding
59
+ */
60
+ export declare function topPRenormProbs(probs: Float32Array, topP: number): Float32Array;
61
+ /**
62
+ * Renormalize probabilities by top-k thresholding
63
+ */
64
+ export declare function topKRenormProbs(probs: Float32Array, topK: number): Float32Array;
65
+ /**
66
+ * Mask logits by top-k thresholding (set non-top-k to -inf)
67
+ */
68
+ export declare function topKMaskLogits(logits: Float32Array, topK: number): Float32Array;
36
69
  /**
37
70
  * CPU-only sampling (no GPU tensors)
38
71
  */
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Speculative Decoding
3
+ * Uses a small draft model to generate candidates, verified by large target model
4
+ */
5
+ /**
6
+ * Speculative decoding configuration
7
+ */
8
+ export interface SpeculativeConfig {
9
+ /** Number of tokens to draft per step */
10
+ numDraftTokens: number;
11
+ /** Temperature for draft sampling */
12
+ draftTemperature?: number;
13
+ /** Temperature for target sampling */
14
+ targetTemperature?: number;
15
+ }
16
+ /**
17
+ * Result of speculative decoding step
18
+ */
19
+ export interface SpeculativeResult {
20
+ /** Accepted tokens */
21
+ tokens: number[];
22
+ /** Number of tokens accepted from draft */
23
+ numAccepted: number;
24
+ /** Number of tokens drafted */
25
+ numDrafted: number;
26
+ /** Acceptance rate for this step */
27
+ acceptanceRate: number;
28
+ }
29
+ /**
30
+ * Verify draft tokens against target model
31
+ * Returns accepted tokens and resampled token if rejection occurs
32
+ */
33
+ export declare function verifyDraft(draftTokens: number[], draftProbs: Float32Array[], // [numDraft][vocabSize]
34
+ targetLogits: Float32Array[], // [numDraft + 1][vocabSize]
35
+ temperature?: number): SpeculativeResult;
36
+ /**
37
+ * Speculative decoding step (CPU implementation)
38
+ *
39
+ * @param draftFn Function to get draft model logits for a token
40
+ * @param targetFn Function to get target model logits for multiple tokens
41
+ * @param inputTokens Current sequence tokens
42
+ * @param config Speculative decoding config
43
+ */
44
+ export declare function speculativeDecodingStep(draftFn: (tokens: number[]) => Promise<Float32Array>, targetFn: (tokens: number[]) => Promise<Float32Array[]>, inputTokens: number[], config: SpeculativeConfig): Promise<SpeculativeResult>;
45
+ /**
46
+ * Calculate expected speedup from acceptance rate
47
+ */
48
+ export declare function expectedSpeedup(acceptanceRate: number, numDraftTokens: number, draftCost: number, // Relative cost of draft model (0-1)
49
+ targetCost?: number): number;
50
+ /**
51
+ * Tree-based speculative decoding candidates
52
+ */
53
+ export interface DraftTree {
54
+ token: number;
55
+ prob: number;
56
+ children: DraftTree[];
57
+ }
58
+ /**
59
+ * Build draft tree with top-k branching
60
+ */
61
+ export declare function buildDraftTree(logits: Float32Array, depth: number, branching: number, temperature?: number): DraftTree[];
62
+ /**
63
+ * Flatten draft tree to sequences for batch verification
64
+ */
65
+ export declare function flattenDraftTree(tree: DraftTree[], prefix?: number[]): number[][];
@@ -0,0 +1,81 @@
1
+ /**
2
+ * TVM-WebInfer Tensor Adapter
3
+ * Bridges TVM tensors with webinfer tensors for zero-copy GPU operations
4
+ */
5
+ import type { WebInferDevice } from "../core/device.ts";
6
+ import { Tensor, type DType } from "../core/tensor.ts";
7
+ import type { Instance, Tensor as TVMTensor } from "@mlc-ai/web-runtime";
8
+ /**
9
+ * Adapter for bridging TVM and WebInfer tensor systems
10
+ */
11
+ export declare class TVMAdapter {
12
+ private tvm;
13
+ private webgpuCtx;
14
+ private device;
15
+ private zeroCopyEnabled;
16
+ constructor(tvm: Instance, device: WebInferDevice);
17
+ /**
18
+ * Check if zero-copy operations are available
19
+ */
20
+ get isZeroCopyEnabled(): boolean;
21
+ /**
22
+ * Try to get the underlying GPUBuffer from a TVM tensor
23
+ * This enables zero-copy operation if TVM exposes buffer access
24
+ */
25
+ getGPUBuffer(tensor: TVMTensor): GPUBuffer | null;
26
+ /**
27
+ * Create a webinfer Tensor that wraps a TVM tensor's buffer (zero-copy)
28
+ * Falls back to copy if buffer access is not available
29
+ */
30
+ wrapTVMTensor(tvmTensor: TVMTensor): Promise<Tensor>;
31
+ /**
32
+ * Create a webinfer Tensor from an existing GPUBuffer (zero-copy)
33
+ * The tensor will share the buffer - do not dispose independently
34
+ */
35
+ private createTensorFromBuffer;
36
+ /**
37
+ * Copy TVM tensor to webinfer tensor via CPU (fallback path)
38
+ */
39
+ copyTVMTensor(tvmTensor: TVMTensor): Promise<Tensor>;
40
+ /**
41
+ * Copy webinfer tensor data back to TVM tensor
42
+ */
43
+ copyToTVMTensor(tensor: Tensor, tvmTensor: TVMTensor): Promise<void>;
44
+ /**
45
+ * Create a new TVM tensor with same shape/dtype and copy data
46
+ */
47
+ toTVMTensor(tensor: Tensor): Promise<TVMTensor>;
48
+ /**
49
+ * Sync GPU operations on both TVM and webinfer sides
50
+ */
51
+ sync(): Promise<void>;
52
+ /**
53
+ * Get the underlying TVM instance
54
+ */
55
+ getTVM(): Instance;
56
+ /**
57
+ * Get the webinfer device
58
+ */
59
+ getDevice(): WebInferDevice;
60
+ }
61
+ /**
62
+ * Tensor wrapper that tracks ownership for safe disposal
63
+ */
64
+ export declare class TensorWrapper {
65
+ private tensor;
66
+ private ownsBuffer;
67
+ constructor(tensor: Tensor, ownsBuffer?: boolean);
68
+ get shape(): readonly number[];
69
+ get dtype(): DType;
70
+ get buffer(): GPUBuffer;
71
+ get device(): WebInferDevice;
72
+ /**
73
+ * Get the underlying tensor for use with webinfer ops
74
+ */
75
+ unwrap(): Tensor;
76
+ /**
77
+ * Whether this wrapper owns the buffer and can dispose it
78
+ */
79
+ get isOwner(): boolean;
80
+ dispose(): void;
81
+ }
@@ -0,0 +1,8 @@
1
+ /**
2
+ * TVM Backend for WebInfer
3
+ * Enables using webinfer ops as TVM external functions (BYOB - Bring Your Own Backend)
4
+ */
5
+ export type { TVMInstance, TVMTensor, PackedFunc, DLDevice, DLDataType, TVMObject, TVMArray, TVMModule, VirtualMachine, Scalar, GPUDeviceDetectOutput, } from "./types.ts";
6
+ export { instantiate, detectGPUDevice, toTVMDType, fromTVMDType, isWebGPUTensor, getTensorDataPtr, } from "./types.ts";
7
+ export { TVMAdapter, TensorWrapper } from "./adapter.ts";
8
+ export { registerWebInferOps, createWebInferBackend } from "./ops.ts";
@@ -0,0 +1,26 @@
1
+ /**
2
+ * WebInfer Ops Registration for TVM
3
+ * Registers webinfer operations as TVM PackedFuncs
4
+ */
5
+ import type { Instance } from "@mlc-ai/web-runtime";
6
+ import { TVMAdapter } from "./adapter.ts";
7
+ import type { WebInferDevice } from "../core/device.ts";
8
+ /**
9
+ * Register all webinfer ops as TVM PackedFuncs
10
+ */
11
+ export declare function registerWebInferOps(tvm: Instance, device: WebInferDevice, prefix?: string): TVMAdapter;
12
+ /**
13
+ * Create a simplified interface for calling webinfer ops from TVM
14
+ */
15
+ export declare function createWebInferBackend(tvm: Instance, device: WebInferDevice, prefix?: string): {
16
+ adapter: TVMAdapter;
17
+ opNames: {
18
+ matmul: string;
19
+ layerNorm: string;
20
+ rmsNorm: string;
21
+ softmax: string;
22
+ flashAttention: string;
23
+ rope: string;
24
+ sync: string;
25
+ };
26
+ };
@@ -0,0 +1,35 @@
1
+ /**
2
+ * TVM Runtime Type Re-exports
3
+ * Re-exports types from @mlc-ai/web-runtime (tvmjs)
4
+ */
5
+ export type { Instance as TVMInstance, Tensor as TVMTensor, PackedFunc, DLDevice, DLDataType, TVMObject, TVMArray, Module as TVMModule, VirtualMachine, Scalar, GPUDeviceDetectOutput, } from "@mlc-ai/web-runtime";
6
+ export { instantiate, detectGPUDevice } from "@mlc-ai/web-runtime";
7
+ import type { Tensor } from "@mlc-ai/web-runtime";
8
+ /**
9
+ * Extended WebGPU context interface for buffer access
10
+ * Note: bufferTable and gpuBufferFromPtr are private in tvmjs
11
+ * This interface is for internal use to attempt zero-copy when possible
12
+ */
13
+ export interface ExtendedWebGPUContext {
14
+ device: GPUDevice;
15
+ sync(): Promise<void>;
16
+ bufferTable?: (GPUBuffer | undefined)[];
17
+ gpuBufferFromPtr?(ptr: number): GPUBuffer;
18
+ }
19
+ /**
20
+ * Map webinfer dtype to TVM dtype string
21
+ */
22
+ export declare function toTVMDType(dtype: "f32" | "f16" | "i32" | "u32"): string;
23
+ /**
24
+ * Map TVM dtype string to webinfer dtype
25
+ */
26
+ export declare function fromTVMDType(dtype: string): "f32" | "f16" | "i32" | "u32";
27
+ /**
28
+ * Check if a TVM tensor is on WebGPU device
29
+ */
30
+ export declare function isWebGPUTensor(tensor: Tensor): boolean;
31
+ /**
32
+ * Get the data pointer from a TVM tensor
33
+ * This is used to look up the GPUBuffer in WebGPUContext.bufferTable
34
+ */
35
+ export declare function getTensorDataPtr(tensor: Tensor): number;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webinfer",
3
- "version": "0.0.1",
3
+ "version": "0.0.3",
4
4
  "description": "High-performance LLM inference kernels for WebGPU",
5
5
  "license": "Apache-2.0",
6
6
  "repository": {