npm - webinfer - Versions diffs - 0.0.4 → 0.0.5 - Mend

webinfer 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/jit/index.d.ts ADDED Viewed

@@ -0,0 +1,138 @@
+/**
+ * JIT compilation module for WebInfer
+ *
+ * This module provides functionality to compile kernel specifications into
+ * executable WGSL shaders at runtime. It follows a similar pattern to
+ * FlashInfer's JIT compilation but targets WebGPU instead of CUDA.
+ */
+/**
+ * Kernel specification for JIT compilation.
+ * This is the format used by TVM to pass kernel configurations to WebInfer.
+ */
+export interface KernelSpec {
+    /** Type of kernel to compile */
+    kernel_type: 'batch_prefill_paged' | 'batch_decode_paged' | 'single_prefill' | 'single_decode' | 'rmsnorm' | 'silu_and_mul' | 'gelu_and_mul' | 'rope' | 'sampling';
+    /** Data type for computation */
+    dtype: 'float16' | 'float32';
+    /** Number of query/output heads */
+    num_qo_heads?: number;
+    /** Number of key/value heads (for GQA/MQA) */
+    num_kv_heads?: number;
+    /** Head dimension for query and key */
+    qk_head_dim?: number;
+    /** Head dimension for value (may differ from qk_head_dim) */
+    v_head_dim?: number;
+    /** Page size for paged KV cache */
+    page_size?: number;
+    /** Whether to use causal masking */
+    causal?: boolean;
+    /** Whether to enable inline RoPE */
+    enable_inline_rope?: boolean;
+    /** Hidden dimension (for normalization kernels) */
+    hidden_dim?: number;
+    /** Epsilon for numerical stability (for normalization) */
+    eps?: number;
+    /** RoPE theta base */
+    rope_theta?: number;
+    /** RoPE scaling factor */
+    rope_scale?: number;
+}
+/**
+ * Information about a binding in the compiled shader
+ */
+export interface BindingInfo {
+    /** Binding index */
+    binding: number;
+    /** Name of the binding (for documentation) */
+    name: string;
+    /** Type of binding */
+    type: 'storage' | 'storage_read' | 'uniform';
+    /** Data type */
+    dtype: string;
+}
+/**
+ * Result of JIT compilation
+ */
+export interface CompiledKernel {
+    /** Generated WGSL shader code */
+    wgsl: string;
+    /** Workgroup size [x, y, z] */
+    workgroupSize: [number, number, number];
+    /** Binding layout information */
+    bindings: BindingInfo[];
+    /** Entry point function name */
+    entryPoint: string;
+    /**
+     * Calculate dispatch size based on input parameters
+     * @param params - Parameters like batch_size, seq_len, etc.
+     * @returns Dispatch size [x, y, z]
+     */
+    dispatchSize: (params: Record<string, number>) => [number, number, number];
+    /** Original kernel specification */
+    spec: KernelSpec;
+}
+/**
+ * Plan information returned by plan phase
+ */
+export interface PlanInfo {
+    /** Unique key for this configuration */
+    key: string;
+    /** Required workspace size in bytes */
+    workspaceSize: number;
+    /** Compiled kernel reference */
+    kernel: CompiledKernel;
+    /** Additional configuration for execution */
+    config: Record<string, number | boolean>;
+}
+/**
+ * Compile a kernel from specification.
+ *
+ * This is the main entry point for JIT compilation. It takes a kernel
+ * specification and returns a compiled kernel that can be executed.
+ *
+ * @param spec - Kernel specification
+ * @returns Compiled kernel
+ */
+export declare function compileKernel(spec: KernelSpec): CompiledKernel;
+/**
+ * Generate a unique key for a kernel specification.
+ * Used for caching compiled pipelines.
+ */
+export declare function getSpecKey(spec: KernelSpec): string;
+/**
+ * Registry entry for a compiled kernel
+ */
+export interface CompiledKernelEntry {
+    /** The compiled kernel information */
+    kernel: CompiledKernel;
+    /** The GPU compute pipeline */
+    pipeline: GPUComputePipeline;
+    /** The bind group layout for this kernel */
+    bindGroupLayout: GPUBindGroupLayout;
+}
+/**
+ * Registry of compiled kernels, keyed by spec key
+ */
+export interface CompiledKernelRegistry {
+    [specKey: string]: CompiledKernelEntry;
+}
+/**
+ * Initialize multiple kernels from specifications.
+ *
+ * This function takes a list of kernel specs and compiles them all,
+ * returning a registry that can be used to quickly look up pipelines
+ * at execution time.
+ *
+ * @param device - The GPU device to create pipelines on
+ * @param specs - Array of kernel specifications to compile
+ * @returns Promise resolving to a registry of compiled kernels
+ */
+export declare function initFromSpecs(device: GPUDevice, specs: KernelSpec[]): Promise<CompiledKernelRegistry>;
+/**
+ * Get a compiled kernel entry from the registry.
+ *
+ * @param registry - The kernel registry
+ * @param spec - The kernel specification to look up
+ * @returns The compiled kernel entry, or undefined if not found
+ */
+export declare function getCompiledKernel(registry: CompiledKernelRegistry, spec: KernelSpec): CompiledKernelEntry | undefined;

package/dist/prefill/index.d.ts CHANGED Viewed

@@ -5,6 +5,87 @@ import type { WebInferContext } from '../core/context.ts';
 import type { Tensor } from '../core/tensor.ts';
 import type { PagedKvCache } from '../core/paged-kv-cache.ts';
 import { PosEncodingMode } from '../core/types.ts';
+/**
+ * Options for batch_prefill_plan()
+ */
+export interface BatchPrefillPlanOptions {
+    /** Number of sequences in the batch */
+    batchSize: number;
+    /** Total number of query tokens across all sequences */
+    totalQoLen: number;
+    /** Size of each page in the paged KV cache */
+    pageSize: number;
+    /** Number of query/output heads */
+    numQoHeads: number;
+    /** Number of key/value heads */
+    numKvHeads: number;
+    /** Head dimension for query and key */
+    headDim: number;
+    /** Whether to apply causal masking (default: true) */
+    causal?: boolean;
+}
+/**
+ * Plan information for batch prefill with paged KV cache.
+ * This is returned by batch_prefill_plan() and passed to batch_prefill_run().
+ */
+export interface BatchPrefillPlanInfo {
+    /** Unique key for this configuration */
+    key: string;
+    /** Number of query/output heads */
+    num_qo_heads: number;
+    /** Number of key/value heads */
+    num_kv_heads: number;
+    /** Head dimension */
+    head_dim: number;
+    /** Page size */
+    page_size: number;
+    /** Softmax scale */
+    sm_scale: number;
+    /** Whether causal masking is enabled */
+    causal: boolean;
+    /** Batch size */
+    batch_size: number;
+    /** Total query/output length */
+    total_qo_len: number;
+    /** Required workspace size in bytes (currently 0) */
+    workspaceSize: number;
+}
+/**
+ * Plan batch prefill with paged KV cache.
+ *
+ * @param options - Configuration options
+ * @returns Plan information for execution
+ *
+ * @example
+ * const plan = batch_prefill_plan({
+ *   batchSize: 4,
+ *   totalQoLen: 2048,
+ *   pageSize: 16,
+ *   numQoHeads: 32,
+ *   numKvHeads: 8,
+ *   headDim: 128,
+ *   causal: true
+ * });
+ */
+export declare function batch_prefill_plan(options: BatchPrefillPlanOptions): BatchPrefillPlanInfo;
+/**
+ * Execute batch prefill with paged KV cache.
+ *
+ * This function executes the attention computation using the plan
+ * prepared by batch_prefill_plan().
+ *
+ * @param ctx - WebInfer context
+ * @param planInfo - Plan information from batch_prefill_plan()
+ * @param q - Query tensor [total_qo_len, num_qo_heads, head_dim]
+ * @param pagedKvCache - Paged KV cache
+ * @param qoIndptr - Query indirection pointer [batch_size + 1]
+ * @param pageIndptr - Page indirection pointer [batch_size + 1]
+ * @param pageIndices - Page indices [nnz_pages]
+ * @param lastPageLen - Last page lengths [batch_size]
+ * @param output - Output tensor [total_qo_len, num_qo_heads, head_dim]
+ * @param lse - Log-sum-exp output [total_qo_len, num_qo_heads] (optional)
+ */
+export declare function batch_prefill_run(ctx: WebInferContext, planInfo: BatchPrefillPlanInfo, q: Tensor, pagedKvCache: PagedKvCache, qoIndptr: Tensor, pageIndptr: Tensor, pageIndices: Tensor, lastPageLen: Tensor, output: Tensor, lse?: Tensor): Promise<void>;
 /**
  * Single prefill with KV cache
  *
@@ -15,6 +96,9 @@ import { PosEncodingMode } from '../core/types.ts';
  * @param q Query tensor of shape [qo_len, num_qo_heads, head_dim]
  * @param k Key tensor of shape [kv_len, num_kv_heads, head_dim]
  * @param v Value tensor of shape [kv_len, num_kv_heads, head_dim]
+ * @param output Optional pre-allocated output tensor [qo_len, num_qo_heads, head_dim].
+ *               If provided, results are written to this tensor (zero-copy for TVM integration).
+ *               If not provided, a new tensor is created and returned.
  * @param causal Whether to apply causal masking (default: true)
  * @param pos_encoding_mode Position encoding mode (default: NONE)
  * @param sm_scale Softmax scale (default: 1/sqrt(head_dim))
@@ -22,7 +106,7 @@ import { PosEncodingMode } from '../core/types.ts';
  * @param rope_theta RoPE theta (default: 10000.0) - not yet implemented
  * @returns Output tensor of shape [qo_len, num_qo_heads, head_dim]
  */
-export declare function single_prefill_with_kv_cache(ctx: WebInferContext, q: Tensor, k: Tensor, v: Tensor, causal?: boolean, pos_encoding_mode?: PosEncodingMode, sm_scale?: number, rope_scale?: number, rope_theta?: number): Promise<Tensor>;
+export declare function single_prefill_with_kv_cache(ctx: WebInferContext, q: Tensor, k: Tensor, v: Tensor, output?: Tensor, causal?: boolean, pos_encoding_mode?: PosEncodingMode, sm_scale?: number, rope_scale?: number, rope_theta?: number): Promise<Tensor>;
 /**
  * Batched prefill with paged KV cache wrapper
  *
@@ -63,7 +147,9 @@ export declare class BatchPrefillWithPagedKVCacheWrapper {
      * @param paged_kv_indptr Paged KV indirection pointer [batch_size + 1]
      * @param paged_kv_indices Paged KV indices [nnz_pages]
      * @param paged_kv_last_page_len Last page lengths [batch_size]
+     * @param output Optional pre-allocated output tensor [total_qo_len, num_qo_heads, head_dim].
+     *               If provided, results are written to this tensor (zero-copy for TVM integration).
      * @returns Output tensor [total_qo_len, num_qo_heads, head_dim]
      */
-    run(q: Tensor, paged_kv_cache: PagedKvCache, qo_indptr: Tensor, paged_kv_indptr: Tensor, paged_kv_indices: Tensor, paged_kv_last_page_len: Tensor): Promise<Tensor>;
+    run(q: Tensor, paged_kv_cache: PagedKvCache, qo_indptr: Tensor, paged_kv_indptr: Tensor, paged_kv_indices: Tensor, paged_kv_last_page_len: Tensor, output?: Tensor): Promise<Tensor>;
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webinfer",
-  "version": "0.0.4",
+  "version": "0.0.5",
   "description": "High-performance LLM inference kernels for WebGPU",
   "license": "Apache-2.0",
   "repository": {