webinfer 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -0
- package/dist/attention/block-sparse/patterns/tree.d.ts +65 -0
- package/dist/attention/cascaded-inference.d.ts +29 -0
- package/dist/attention/index.d.ts +112 -3
- package/dist/attention/paged-attention.d.ts +40 -0
- package/dist/attention/paged-kv/index.d.ts +2 -2
- package/dist/attention/paged-kv/page-table.d.ts +66 -0
- package/dist/core/tdr.d.ts +114 -0
- package/dist/index.d.ts +13 -11
- package/dist/index.js +3638 -2582
- package/dist/inference/engine.d.ts +1 -1
- package/dist/inference/index.d.ts +1 -1
- package/dist/jit/compiler.d.ts +1 -1
- package/dist/model/gguf.d.ts +1 -1
- package/dist/model/index.d.ts +4 -4
- package/dist/model/safetensors.d.ts +1 -1
- package/dist/ops/normalization.d.ts +39 -0
- package/dist/quantization/index.d.ts +2 -2
- package/dist/sampling/beam-search.d.ts +87 -0
- package/dist/sampling/index.d.ts +3 -3
- package/dist/sampling/sampler.d.ts +33 -0
- package/dist/sampling/speculative.d.ts +65 -0
- package/dist/tvm/adapter.d.ts +81 -0
- package/dist/tvm/index.d.ts +8 -0
- package/dist/tvm/ops.d.ts +26 -0
- package/dist/tvm/types.d.ts +35 -0
- package/package.json +1 -1

package/dist/inference/engine.d.ts
CHANGED

@@ -4,7 +4,7 @@
  */
 import type { WebInferDevice } from "../core/device.ts";
 import type { LoadedModel } from "../model/types.ts";
-import type {
+import type { ForwardResult, InferenceConfig, ModelConfig } from "./types.ts";
 /**
  * Inference Engine
  * Manages model weights and provides forward pass functionality

package/dist/inference/index.d.ts
CHANGED

@@ -2,6 +2,6 @@
  * Inference Module
  * High-level API for LLM inference
  */
-export { type ModelConfig, type InferenceConfig, type GenerationConfig, type GenerationResult, type StreamToken, type FinishReason, type ForwardResult, type ModelWeights, type LayerWeights, DEFAULT_GENERATION_CONFIG, normalizeGenerationConfig, } from "./types.ts";
 export { InferenceEngine } from "./engine.ts";
 export { generate, generateStream, greedyDecode, sampleNextToken, } from "./generate.ts";
+export { DEFAULT_GENERATION_CONFIG, type FinishReason, type ForwardResult, type GenerationConfig, type GenerationResult, type InferenceConfig, type LayerWeights, type ModelConfig, type ModelWeights, normalizeGenerationConfig, type StreamToken, } from "./types.ts";

package/dist/jit/compiler.d.ts
CHANGED
@@ -1,8 +1,8 @@
 /**
  * WGSL Compiler - Generates optimized GPU kernels
  */
-import { KernelCache } from "./kernel-cache.ts";
 import type { DeviceInfo } from "../core/device.ts";
+import type { KernelCache } from "./kernel-cache.ts";
 export interface MatMulConfig {
     M: number;
     N: number;

package/dist/model/gguf.d.ts
CHANGED
@@ -11,7 +11,7 @@
  * - Padding to alignment
  * - Tensor data
  */
-import {
+import { GGUFQuantType, type GGUFTensorInfo, type LoadedModel, type LoadOptions, type TensorInfo } from "./types.ts";
 /**
  * Reader helper for GGUF binary format
  */

package/dist/model/index.d.ts
CHANGED
@@ -2,10 +2,10 @@
  * Model Loading Module
  * Supports SafeTensors and GGUF formats
  */
-import type {
-export {
-export {
-export { type
+import type { LoadedModel, LoadOptions, ModelFormat } from "./types.ts";
+export { calculateGGUFTensorBytes, dequantizeQ4_0Block, dequantizeQ8_0Block, type GGUFHeader, isGGUF, loadGGUF, loadGGUFFromUrl, loadGGUFTensor, parseGGUFHeader, parseGGUFMetadata, parseGGUFTensorInfos, } from "./gguf.ts";
+export { getSafetensorsTensorInfos, isSafetensors, loadSafetensors, loadSafetensorsFromUrl, loadSafetensorsTensor, parseSafetensorsHeader, } from "./safetensors.ts";
+export { GGUF_QUANT_BLOCK_SIZE, GGUF_QUANT_BYTES_PER_BLOCK, GGUFMetadataValueType, GGUFQuantType, type GGUFTensorInfo, type LoadedModel, type LoadedTensor, type LoadOptions, type ModelFormat, type ModelMetadata, SAFETENSORS_DTYPE_BYTES, type SafetensorsDType, type SafetensorsHeader, type SafetensorsHeaderEntry, type TensorInfo, } from "./types.ts";
 /**
  * Auto-detect model format from buffer
  */

package/dist/model/safetensors.d.ts
CHANGED

@@ -6,7 +6,7 @@
  * - N bytes: JSON header (UTF-8)
  * - Remaining: tensor data (contiguous)
  */
-import { type
+import { type LoadedModel, type LoadOptions, type SafetensorsHeader, type TensorInfo } from "./types.ts";
 /**
  * Parse the SafeTensors header from a buffer
  * @param buffer - ArrayBuffer containing the SafeTensors file

package/dist/ops/normalization.d.ts
CHANGED

@@ -22,3 +22,42 @@ export declare function layerNorm(device: WebInferDevice, x: Tensor, weight: Ten
  * RMS Normalization (GPU)
  */
 export declare function rmsNorm(device: WebInferDevice, x: Tensor, weight: Tensor, eps?: number): Promise<Tensor>;
+/**
+ * Fused Add + RMS Normalization (CPU)
+ * Computes: output = rmsNorm(input + residual)
+ * Also updates residual in-place: residual = input + residual
+ */
+export declare function fusedAddRmsNormCPU(input: Float32Array, residual: Float32Array, weight: Float32Array, shape: number[], eps?: number): {
+    output: Float32Array;
+    residual: Float32Array;
+};
+/**
+ * Fused Add + RMS Normalization (GPU)
+ */
+export declare function fusedAddRmsNorm(device: WebInferDevice, input: Tensor, residual: Tensor, weight: Tensor, eps?: number): Promise<{
+    output: Tensor;
+    residual: Tensor;
+}>;
+/**
+ * Gemma-style RMS Normalization (CPU)
+ * Uses (1 + weight) instead of weight: output = x * invRms * (1 + weight)
+ */
+export declare function gemmaRmsNormCPU(x: Float32Array, weight: Float32Array, shape: number[], eps?: number): Float32Array;
+/**
+ * Gemma-style RMS Normalization (GPU)
+ */
+export declare function gemmaRmsNorm(device: WebInferDevice, x: Tensor, weight: Tensor, eps?: number): Promise<Tensor>;
+/**
+ * Gemma-style Fused Add + RMS Normalization (CPU)
+ */
+export declare function gemmaFusedAddRmsNormCPU(input: Float32Array, residual: Float32Array, weight: Float32Array, shape: number[], eps?: number): {
+    output: Float32Array;
+    residual: Float32Array;
+};
+/**
+ * Gemma-style Fused Add + RMS Normalization (GPU)
+ */
+export declare function gemmaFusedAddRmsNorm(device: WebInferDevice, input: Tensor, residual: Tensor, weight: Tensor, eps?: number): Promise<{
+    output: Tensor;
+    residual: Tensor;
+}>;

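The doc comments above fully specify the math behind these declarations, so a CPU reference is easy to sketch. This is not the package's implementation; it assumes row-major `[rows, hidden]` input normalized over the last dimension, and the default `eps` is illustrative.

```ts
// Reference sketch of RMSNorm as described by the comments above.
// gemma = true scales by (1 + weight) instead of weight.
function rmsNormRowsCPU(
  x: Float32Array,
  weight: Float32Array,
  hidden: number,
  eps = 1e-6, // illustrative default; the real default is not shown in the diff
  gemma = false,
): Float32Array {
  const out = new Float32Array(x.length);
  for (let base = 0; base < x.length; base += hidden) {
    let sumSq = 0;
    for (let i = 0; i < hidden; i++) sumSq += x[base + i] * x[base + i];
    const invRms = 1 / Math.sqrt(sumSq / hidden + eps);
    for (let i = 0; i < hidden; i++) {
      const w = gemma ? 1 + weight[i] : weight[i];
      out[base + i] = x[base + i] * invRms * w;
    }
  }
  return out;
}

// The fused variants fold the residual in first, per the comment
// "residual = input + residual; output = rmsNorm(residual)":
function fusedAddRmsNormSketch(
  input: Float32Array,
  residual: Float32Array,
  weight: Float32Array,
  hidden: number,
): { output: Float32Array; residual: Float32Array } {
  for (let i = 0; i < input.length; i++) residual[i] += input[i]; // in-place residual update
  return { output: rmsNormRowsCPU(residual, weight, hidden), residual };
}
```
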
package/dist/quantization/index.d.ts
CHANGED

@@ -2,5 +2,5 @@
  * Quantization Module
  * INT4 and INT8 quantization for efficient LLM inference
  */
-export {
-export {
+export { estimateQMatMulBandwidth, estimateQMatMulFlops } from "./qmatmul.ts";
+export { dequantizeInt4, dequantizeInt8, getMemorySavings, type QuantConfig, type QuantizedTensor, quantizationError, quantizeToInt4, quantizeToInt8, } from "./quantize.ts";

package/dist/sampling/beam-search.d.ts
ADDED

@@ -0,0 +1,87 @@
+/**
+ * Beam Search Decoding
+ * Maintains top-k hypotheses during generation
+ */
+/**
+ * Beam search configuration
+ */
+export interface BeamSearchConfig {
+    /** Number of beams to maintain */
+    beamWidth: number;
+    /** Maximum sequence length */
+    maxLength: number;
+    /** Length penalty (>1 favors longer, <1 favors shorter) */
+    lengthPenalty?: number;
+    /** Early stopping when all beams hit EOS */
+    earlyStopping?: boolean;
+    /** EOS token ID */
+    eosTokenId?: number;
+    /** Minimum length before EOS allowed */
+    minLength?: number;
+    /** Number of beams to return */
+    numReturn?: number;
+    /** Diversity penalty for diverse beam search */
+    diversityPenalty?: number;
+    /** Number of groups for diverse beam search */
+    numBeamGroups?: number;
+}
+/**
+ * Single beam hypothesis
+ */
+export interface BeamHypothesis {
+    /** Token sequence */
+    tokens: number[];
+    /** Log probability score */
+    score: number;
+    /** Normalized score (with length penalty) */
+    normalizedScore: number;
+    /** Whether sequence is complete (hit EOS) */
+    isComplete: boolean;
+}
+/**
+ * Beam search result
+ */
+export interface BeamSearchResult {
+    /** Best hypotheses sorted by score */
+    hypotheses: BeamHypothesis[];
+    /** Number of steps taken */
+    numSteps: number;
+}
+/**
+ * Beam Search decoder
+ */
+export declare class BeamSearch {
+    private config;
+    constructor(config: BeamSearchConfig);
+    /**
+     * Run beam search
+     *
+     * @param scoreFn Function that returns log probabilities for next token
+     * @param inputTokens Initial tokens (prompt)
+     */
+    search(scoreFn: (tokens: number[]) => Promise<Float32Array>, inputTokens: number[]): Promise<BeamSearchResult>;
+    /**
+     * Apply length penalty to score
+     */
+    private normalizeScore;
+}
+/**
+ * Diverse Beam Search
+ * Groups beams and penalizes similarity between groups
+ */
+export declare class DiverseBeamSearch {
+    private config;
+    constructor(config: BeamSearchConfig);
+    /**
+     * Run diverse beam search
+     */
+    search(scoreFn: (tokens: number[]) => Promise<Float32Array>, inputTokens: number[]): Promise<BeamSearchResult>;
+    private normalizeScore;
+}
+/**
+ * Constrained beam search with prefix/suffix constraints
+ */
+export declare function constrainedBeamSearch(beams: BeamHypothesis[], constraints: {
+    mustInclude?: number[][];
+    mustNotInclude?: number[][];
+}): BeamHypothesis[];

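Based only on the declarations above, usage would look roughly like the sketch below. The import path, the scoring callback, and the prompt tokens are placeholders; the `scoreFn` contract (log probabilities for the next token) is taken from the JSDoc.

```ts
import { BeamSearch } from "webinfer"; // import path illustrative

// Hypothetical model hook: returns log probabilities over the vocabulary
// for the position following `tokens`.
declare function nextTokenLogProbs(tokens: number[]): Promise<Float32Array>;
declare const promptTokens: number[];

const beam = new BeamSearch({
  beamWidth: 4,
  maxLength: 128,
  lengthPenalty: 1.0,
  earlyStopping: true,
  eosTokenId: 2, // illustrative EOS id
});

const result = await beam.search(nextTokenLogProbs, promptTokens);
const best = result.hypotheses[0]; // hypotheses are sorted by score
console.log(best.tokens, best.normalizedScore, result.numSteps);
```
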
package/dist/sampling/index.d.ts
CHANGED
@@ -1,6 +1,6 @@
 /**
  * Sampling Module Exports
  */
-export { topK,
-export { topPFilter
-export { sample,
+export { topK, topKFilter } from "./top-k.ts";
+export { topPFilter } from "./top-p.ts";
+export { sample, sampleGreedy, sampleFromProbs, softmax, applyRepetitionPenalty, minPSamplingFromProbs, topKSamplingFromProbs, topPSamplingFromProbs, topKTopPSamplingFromProbs, topKTopPSamplingFromLogits, topPRenormProbs, topKRenormProbs, topKMaskLogits, type SamplingConfig, } from "./sampler.ts";

package/dist/sampling/sampler.d.ts
CHANGED

@@ -33,6 +33,39 @@ export declare function sampleGreedy(logits: Float32Array): number;
  * Full sampling pipeline
  */
 export declare function sample(device: WebInferDevice, logits: Tensor, config?: SamplingConfig, previousTokens?: number[]): Promise<number>;
+/**
+ * Min-P sampling from probabilities
+ * Keeps tokens with probability >= min_p * max_prob
+ */
+export declare function minPSamplingFromProbs(probs: Float32Array, minP: number, random?: () => number): number;
+/**
+ * Top-K sampling from probabilities
+ */
+export declare function topKSamplingFromProbs(probs: Float32Array, topK: number, random?: () => number): number;
+/**
+ * Top-P sampling from probabilities
+ */
+export declare function topPSamplingFromProbs(probs: Float32Array, topP: number, random?: () => number): number;
+/**
+ * Combined Top-K and Top-P sampling from probabilities
+ */
+export declare function topKTopPSamplingFromProbs(probs: Float32Array, topK: number, topP: number, random?: () => number): number;
+/**
+ * Combined Top-K and Top-P sampling from logits
+ */
+export declare function topKTopPSamplingFromLogits(logits: Float32Array, topK: number, topP: number, temperature?: number, random?: () => number): number;
+/**
+ * Renormalize probabilities by top-p thresholding
+ */
+export declare function topPRenormProbs(probs: Float32Array, topP: number): Float32Array;
+/**
+ * Renormalize probabilities by top-k thresholding
+ */
+export declare function topKRenormProbs(probs: Float32Array, topK: number): Float32Array;
+/**
+ * Mask logits by top-k thresholding (set non-top-k to -inf)
+ */
+export declare function topKMaskLogits(logits: Float32Array, topK: number): Float32Array;
 /**
  * CPU-only sampling (no GPU tensors)
  */

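All of the new helpers operate on plain Float32Array logits or probabilities, so they can be called directly in a CPU decode loop. A hedged sketch using only the signatures shown above; the import path and hyperparameter values are illustrative.

```ts
import {
  topKTopPSamplingFromLogits,
  minPSamplingFromProbs,
  topPRenormProbs,
} from "webinfer"; // import path illustrative

declare const logits: Float32Array; // next-token logits from the model
declare const probs: Float32Array;  // an already-normalized distribution

// Combined top-k / top-p sampling straight from logits, temperature 0.8.
const tokenA = topKTopPSamplingFromLogits(logits, 40, 0.9, 0.8);

// Min-p keeps tokens whose probability is at least minP * max(prob).
const tokenB = minPSamplingFromProbs(probs, 0.05);

// Or truncate and renormalize by top-p, then sample from the result elsewhere.
const truncated = topPRenormProbs(probs, 0.9);
```
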
package/dist/sampling/speculative.d.ts
ADDED

@@ -0,0 +1,65 @@
+/**
+ * Speculative Decoding
+ * Uses a small draft model to generate candidates, verified by large target model
+ */
+/**
+ * Speculative decoding configuration
+ */
+export interface SpeculativeConfig {
+    /** Number of tokens to draft per step */
+    numDraftTokens: number;
+    /** Temperature for draft sampling */
+    draftTemperature?: number;
+    /** Temperature for target sampling */
+    targetTemperature?: number;
+}
+/**
+ * Result of speculative decoding step
+ */
+export interface SpeculativeResult {
+    /** Accepted tokens */
+    tokens: number[];
+    /** Number of tokens accepted from draft */
+    numAccepted: number;
+    /** Number of tokens drafted */
+    numDrafted: number;
+    /** Acceptance rate for this step */
+    acceptanceRate: number;
+}
+/**
+ * Verify draft tokens against target model
+ * Returns accepted tokens and resampled token if rejection occurs
+ */
+export declare function verifyDraft(draftTokens: number[], draftProbs: Float32Array[], // [numDraft][vocabSize]
+targetLogits: Float32Array[], // [numDraft + 1][vocabSize]
+temperature?: number): SpeculativeResult;
+/**
+ * Speculative decoding step (CPU implementation)
+ *
+ * @param draftFn Function to get draft model logits for a token
+ * @param targetFn Function to get target model logits for multiple tokens
+ * @param inputTokens Current sequence tokens
+ * @param config Speculative decoding config
+ */
+export declare function speculativeDecodingStep(draftFn: (tokens: number[]) => Promise<Float32Array>, targetFn: (tokens: number[]) => Promise<Float32Array[]>, inputTokens: number[], config: SpeculativeConfig): Promise<SpeculativeResult>;
+/**
+ * Calculate expected speedup from acceptance rate
+ */
+export declare function expectedSpeedup(acceptanceRate: number, numDraftTokens: number, draftCost: number, // Relative cost of draft model (0-1)
+targetCost?: number): number;
+/**
+ * Tree-based speculative decoding candidates
+ */
+export interface DraftTree {
+    token: number;
+    prob: number;
+    children: DraftTree[];
+}
+/**
+ * Build draft tree with top-k branching
+ */
+export declare function buildDraftTree(logits: Float32Array, depth: number, branching: number, temperature?: number): DraftTree[];
+/**
+ * Flatten draft tree to sequences for batch verification
+ */
+export declare function flattenDraftTree(tree: DraftTree[], prefix?: number[]): number[][];

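A decode step wires a draft model and a target model into speculativeDecodingStep. The sketch below relies only on the declared signatures; the model hooks, prompt tokens, and the cost figure passed to expectedSpeedup are placeholders.

```ts
import { speculativeDecodingStep, expectedSpeedup } from "webinfer"; // import path illustrative

// Hypothetical model hooks: the draft model returns next-token logits,
// the target model returns one logits vector per drafted position (+1).
declare function draftLogits(tokens: number[]): Promise<Float32Array>;
declare function targetLogits(tokens: number[]): Promise<Float32Array[]>;
declare const promptTokens: number[];

const step = await speculativeDecodingStep(draftLogits, targetLogits, promptTokens, {
  numDraftTokens: 4,
  draftTemperature: 0.8,
  targetTemperature: 0.8,
});
console.log(step.tokens, `${step.numAccepted}/${step.numDrafted} accepted`);

// Rough speedup estimate assuming the draft model costs ~10% of the target per token.
const speedup = expectedSpeedup(step.acceptanceRate, 4, 0.1);
```
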
package/dist/tvm/adapter.d.ts
ADDED

@@ -0,0 +1,81 @@
+/**
+ * TVM-WebInfer Tensor Adapter
+ * Bridges TVM tensors with webinfer tensors for zero-copy GPU operations
+ */
+import type { WebInferDevice } from "../core/device.ts";
+import { Tensor, type DType } from "../core/tensor.ts";
+import type { Instance, Tensor as TVMTensor } from "@mlc-ai/web-runtime";
+/**
+ * Adapter for bridging TVM and WebInfer tensor systems
+ */
+export declare class TVMAdapter {
+    private tvm;
+    private webgpuCtx;
+    private device;
+    private zeroCopyEnabled;
+    constructor(tvm: Instance, device: WebInferDevice);
+    /**
+     * Check if zero-copy operations are available
+     */
+    get isZeroCopyEnabled(): boolean;
+    /**
+     * Try to get the underlying GPUBuffer from a TVM tensor
+     * This enables zero-copy operation if TVM exposes buffer access
+     */
+    getGPUBuffer(tensor: TVMTensor): GPUBuffer | null;
+    /**
+     * Create a webinfer Tensor that wraps a TVM tensor's buffer (zero-copy)
+     * Falls back to copy if buffer access is not available
+     */
+    wrapTVMTensor(tvmTensor: TVMTensor): Promise<Tensor>;
+    /**
+     * Create a webinfer Tensor from an existing GPUBuffer (zero-copy)
+     * The tensor will share the buffer - do not dispose independently
+     */
+    private createTensorFromBuffer;
+    /**
+     * Copy TVM tensor to webinfer tensor via CPU (fallback path)
+     */
+    copyTVMTensor(tvmTensor: TVMTensor): Promise<Tensor>;
+    /**
+     * Copy webinfer tensor data back to TVM tensor
+     */
+    copyToTVMTensor(tensor: Tensor, tvmTensor: TVMTensor): Promise<void>;
+    /**
+     * Create a new TVM tensor with same shape/dtype and copy data
+     */
+    toTVMTensor(tensor: Tensor): Promise<TVMTensor>;
+    /**
+     * Sync GPU operations on both TVM and webinfer sides
+     */
+    sync(): Promise<void>;
+    /**
+     * Get the underlying TVM instance
+     */
+    getTVM(): Instance;
+    /**
+     * Get the webinfer device
+     */
+    getDevice(): WebInferDevice;
+}
+/**
+ * Tensor wrapper that tracks ownership for safe disposal
+ */
+export declare class TensorWrapper {
+    private tensor;
+    private ownsBuffer;
+    constructor(tensor: Tensor, ownsBuffer?: boolean);
+    get shape(): readonly number[];
+    get dtype(): DType;
+    get buffer(): GPUBuffer;
+    get device(): WebInferDevice;
+    /**
+     * Get the underlying tensor for use with webinfer ops
+     */
+    unwrap(): Tensor;
+    /**
+     * Whether this wrapper owns the buffer and can dispose it
+     */
+    get isOwner(): boolean;
+    dispose(): void;
+}

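A usage sketch of the adapter, based only on the declarations above. Obtaining the @mlc-ai/web-runtime Instance and the webinfer device is out of scope here, and the import paths are illustrative.

```ts
import { TVMAdapter } from "webinfer";                        // import path illustrative
import type { Instance, Tensor as TVMTensor } from "@mlc-ai/web-runtime";
import type { WebInferDevice } from "webinfer";               // illustrative

declare const tvm: Instance;          // created elsewhere
declare const device: WebInferDevice; // webinfer GPU device, created elsewhere
declare const tvmTensor: TVMTensor;   // a TVM tensor produced by a compiled module

const adapter = new TVMAdapter(tvm, device);

// Zero-copy wrap when TVM exposes the GPUBuffer, otherwise fall back to a copy.
const t = adapter.isZeroCopyEnabled
  ? await adapter.wrapTVMTensor(tvmTensor)
  : await adapter.copyTVMTensor(tvmTensor);

// ...run webinfer ops on `t`, then push results back and sync both runtimes.
await adapter.copyToTVMTensor(t, tvmTensor);
await adapter.sync();
```
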
package/dist/tvm/index.d.ts
ADDED

@@ -0,0 +1,8 @@
+/**
+ * TVM Backend for WebInfer
+ * Enables using webinfer ops as TVM external functions (BYOB - Bring Your Own Backend)
+ */
+export type { TVMInstance, TVMTensor, PackedFunc, DLDevice, DLDataType, TVMObject, TVMArray, TVMModule, VirtualMachine, Scalar, GPUDeviceDetectOutput, } from "./types.ts";
+export { instantiate, detectGPUDevice, toTVMDType, fromTVMDType, isWebGPUTensor, getTensorDataPtr, } from "./types.ts";
+export { TVMAdapter, TensorWrapper } from "./adapter.ts";
+export { registerWebInferOps, createWebInferBackend } from "./ops.ts";

package/dist/tvm/ops.d.ts
ADDED

@@ -0,0 +1,26 @@
+/**
+ * WebInfer Ops Registration for TVM
+ * Registers webinfer operations as TVM PackedFuncs
+ */
+import type { Instance } from "@mlc-ai/web-runtime";
+import { TVMAdapter } from "./adapter.ts";
+import type { WebInferDevice } from "../core/device.ts";
+/**
+ * Register all webinfer ops as TVM PackedFuncs
+ */
+export declare function registerWebInferOps(tvm: Instance, device: WebInferDevice, prefix?: string): TVMAdapter;
+/**
+ * Create a simplified interface for calling webinfer ops from TVM
+ */
+export declare function createWebInferBackend(tvm: Instance, device: WebInferDevice, prefix?: string): {
+    adapter: TVMAdapter;
+    opNames: {
+        matmul: string;
+        layerNorm: string;
+        rmsNorm: string;
+        softmax: string;
+        flashAttention: string;
+        rope: string;
+        sync: string;
+    };
+};

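The BYOB entry point appears to be createWebInferBackend, which registers the kernels and hands back their PackedFunc names. A sketch against the declared signature only; the prefix value and how compiled TVM modules consume the names are assumptions.

```ts
import { createWebInferBackend } from "webinfer";   // import path illustrative
import type { Instance } from "@mlc-ai/web-runtime";
import type { WebInferDevice } from "webinfer";     // illustrative

declare const tvm: Instance;
declare const device: WebInferDevice;

// Registers webinfer's matmul/norm/attention/rope kernels as TVM PackedFuncs.
const { adapter, opNames } = createWebInferBackend(tvm, device, "webinfer.");

// opNames.* hold the registered function names that compiled TVM modules can
// call as external functions; adapter handles tensor bridging between runtimes.
console.log(opNames.matmul, opNames.flashAttention, adapter.isZeroCopyEnabled);
```
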
package/dist/tvm/types.d.ts
ADDED

@@ -0,0 +1,35 @@
+/**
+ * TVM Runtime Type Re-exports
+ * Re-exports types from @mlc-ai/web-runtime (tvmjs)
+ */
+export type { Instance as TVMInstance, Tensor as TVMTensor, PackedFunc, DLDevice, DLDataType, TVMObject, TVMArray, Module as TVMModule, VirtualMachine, Scalar, GPUDeviceDetectOutput, } from "@mlc-ai/web-runtime";
+export { instantiate, detectGPUDevice } from "@mlc-ai/web-runtime";
+import type { Tensor } from "@mlc-ai/web-runtime";
+/**
+ * Extended WebGPU context interface for buffer access
+ * Note: bufferTable and gpuBufferFromPtr are private in tvmjs
+ * This interface is for internal use to attempt zero-copy when possible
+ */
+export interface ExtendedWebGPUContext {
+    device: GPUDevice;
+    sync(): Promise<void>;
+    bufferTable?: (GPUBuffer | undefined)[];
+    gpuBufferFromPtr?(ptr: number): GPUBuffer;
+}
+/**
+ * Map webinfer dtype to TVM dtype string
+ */
+export declare function toTVMDType(dtype: "f32" | "f16" | "i32" | "u32"): string;
+/**
+ * Map TVM dtype string to webinfer dtype
+ */
+export declare function fromTVMDType(dtype: string): "f32" | "f16" | "i32" | "u32";
+/**
+ * Check if a TVM tensor is on WebGPU device
+ */
+export declare function isWebGPUTensor(tensor: Tensor): boolean;
+/**
+ * Get the data pointer from a TVM tensor
+ * This is used to look up the GPUBuffer in WebGPUContext.bufferTable
+ */
+export declare function getTensorDataPtr(tensor: Tensor): number;

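The comments above describe the zero-copy path: take the tensor's data pointer and use it to look up the GPUBuffer on the (private) WebGPU context. A speculative sketch of that lookup; the export path for ExtendedWebGPUContext and the exact pointer-to-buffer mapping are assumptions.

```ts
import { isWebGPUTensor, getTensorDataPtr, type ExtendedWebGPUContext } from "webinfer"; // paths illustrative
import type { Tensor as TVMTensor } from "@mlc-ai/web-runtime";

function tryGetGPUBuffer(ctx: ExtendedWebGPUContext, tensor: TVMTensor): GPUBuffer | null {
  if (!isWebGPUTensor(tensor)) return null;   // only WebGPU-backed tensors qualify
  const ptr = getTensorDataPtr(tensor);       // data pointer used as the lookup key
  if (ctx.gpuBufferFromPtr) return ctx.gpuBufferFromPtr(ptr);
  return ctx.bufferTable?.[ptr] ?? null;      // fall back to the optional buffer table
}
```
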