webinfer 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -33
- package/dist/activation/index.d.ts +30 -0
- package/dist/core/context.d.ts +60 -0
- package/dist/core/paged-kv-cache.d.ts +33 -0
- package/dist/core/tensor.d.ts +38 -19
- package/dist/core/types.d.ts +27 -0
- package/dist/decode/index.d.ts +65 -0
- package/dist/gemm/index.d.ts +25 -0
- package/dist/index.d.ts +26 -19
- package/dist/index.js +2508 -3885
- package/dist/kernels/activation.wgsl.d.ts +14 -0
- package/dist/kernels/batch-decode-paged.wgsl.d.ts +12 -0
- package/dist/kernels/batch-prefill-paged.wgsl.d.ts +13 -0
- package/dist/kernels/decode-attention.wgsl.d.ts +16 -0
- package/dist/kernels/gemm.wgsl.d.ts +17 -0
- package/dist/kernels/page.wgsl.d.ts +10 -0
- package/dist/kernels/prefill-attention.wgsl.d.ts +17 -0
- package/dist/kernels/rmsnorm.wgsl.d.ts +10 -0
- package/dist/kernels/rope.wgsl.d.ts +19 -0
- package/dist/kernels/sampling.wgsl.d.ts +23 -0
- package/dist/norm/index.d.ts +43 -0
- package/dist/page/index.d.ts +21 -0
- package/dist/prefill/index.d.ts +69 -0
- package/dist/rope/index.d.ts +37 -0
- package/dist/sampling/index.d.ts +53 -4
- package/package.json +1 -1
- package/dist/attention/block-sparse/format.d.ts +0 -52
- package/dist/attention/block-sparse/patterns/causal.d.ts +0 -16
- package/dist/attention/block-sparse/patterns/sliding.d.ts +0 -22
- package/dist/attention/flash-attention.d.ts +0 -30
- package/dist/attention/index.d.ts +0 -9
- package/dist/attention/paged-kv/block-manager.d.ts +0 -102
- package/dist/attention/paged-kv/index.d.ts +0 -5
- package/dist/attention/paged-kv/page-table.d.ts +0 -99
- package/dist/attention/scheduler.d.ts +0 -40
- package/dist/core/buffer-pool.d.ts +0 -18
- package/dist/core/device.d.ts +0 -23
- package/dist/inference/engine.d.ts +0 -69
- package/dist/inference/generate.d.ts +0 -30
- package/dist/inference/index.d.ts +0 -7
- package/dist/inference/types.d.ts +0 -161
- package/dist/jit/compiler.d.ts +0 -23
- package/dist/jit/kernel-cache.d.ts +0 -21
- package/dist/model/gguf.d.ts +0 -90
- package/dist/model/index.d.ts +0 -16
- package/dist/model/safetensors.d.ts +0 -38
- package/dist/model/types.d.ts +0 -182
- package/dist/ops/activations.d.ts +0 -43
- package/dist/ops/elementwise.d.ts +0 -38
- package/dist/ops/embedding.d.ts +0 -30
- package/dist/ops/matmul.d.ts +0 -21
- package/dist/ops/normalization.d.ts +0 -24
- package/dist/ops/reshape.d.ts +0 -39
- package/dist/ops/rope.d.ts +0 -32
- package/dist/ops/softmax.d.ts +0 -18
- package/dist/quantization/index.d.ts +0 -6
- package/dist/quantization/qmatmul.d.ts +0 -38
- package/dist/quantization/quantize.d.ts +0 -52
- package/dist/sampling/sampler.d.ts +0 -39
- package/dist/sampling/top-k.d.ts +0 -24
- package/dist/sampling/top-p.d.ts +0 -14
|
package/dist/attention/paged-kv/page-table.d.ts
DELETED
|
@@ -1,99 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Paged KV Cache - Software page table for efficient memory management
|
|
3
|
-
* Inspired by vLLM's PagedAttention
|
|
4
|
-
*/
|
|
5
|
-
import type { WebInferDevice } from "../../core/device.ts";
|
|
6
|
-
/**
|
|
7
|
-
* Configuration for PagedKVCache
|
|
8
|
-
*/
|
|
9
|
-
export interface PagedKVCacheConfig {
|
|
10
|
-
numLayers: number;
|
|
11
|
-
numHeads: number;
|
|
12
|
-
headDim: number;
|
|
13
|
-
pageSize: number;
|
|
14
|
-
maxPages: number;
|
|
15
|
-
dtype?: "f32" | "f16";
|
|
16
|
-
}
|
|
17
|
-
/**
|
|
18
|
-
* Entry in the page table for a sequence
|
|
19
|
-
*/
|
|
20
|
-
export interface SequenceEntry {
|
|
21
|
-
seqId: number;
|
|
22
|
-
pages: number[];
|
|
23
|
-
length: number;
|
|
24
|
-
}
|
|
25
|
-
/**
|
|
26
|
-
* PagedKVCache - Manages KV cache with paging for efficient memory use
|
|
27
|
-
*
|
|
28
|
-
* Benefits:
|
|
29
|
-
* 1. No memory fragmentation - pages are fixed size
|
|
30
|
-
* 2. Efficient memory sharing - multiple sequences can share cache
|
|
31
|
-
* 3. Dynamic allocation - only allocate pages as needed
|
|
32
|
-
* 4. Easy defragmentation - just remap logical to physical pages
|
|
33
|
-
*/
|
|
34
|
-
export declare class PagedKVCache {
|
|
35
|
-
private device;
|
|
36
|
-
private config;
|
|
37
|
-
private keyCache;
|
|
38
|
-
private valueCache;
|
|
39
|
-
private pageTable;
|
|
40
|
-
private freePages;
|
|
41
|
-
private nextSeqId;
|
|
42
|
-
constructor(device: WebInferDevice, config: PagedKVCacheConfig);
|
|
43
|
-
/**
|
|
44
|
-
* Allocate pages for a new sequence
|
|
45
|
-
*/
|
|
46
|
-
allocateSequence(initialLength?: number): number;
|
|
47
|
-
/**
|
|
48
|
-
* Extend a sequence with new tokens
|
|
49
|
-
*/
|
|
50
|
-
extendSequence(seqId: number, numNewTokens: number): void;
|
|
51
|
-
/**
|
|
52
|
-
* Free a sequence and its pages
|
|
53
|
-
*/
|
|
54
|
-
freeSequence(seqId: number): void;
|
|
55
|
-
/**
|
|
56
|
-
* Get page indices for a sequence
|
|
57
|
-
*/
|
|
58
|
-
getSequencePages(seqId: number): number[] | null;
|
|
59
|
-
/**
|
|
60
|
-
* Get sequence length
|
|
61
|
-
*/
|
|
62
|
-
getSequenceLength(seqId: number): number;
|
|
63
|
-
/**
|
|
64
|
-
* Get the physical page index for a given sequence position
|
|
65
|
-
*/
|
|
66
|
-
getPageForPosition(seqId: number, position: number): number | null;
|
|
67
|
-
/**
|
|
68
|
-
* Get offset within a page for a given position
|
|
69
|
-
*/
|
|
70
|
-
getOffsetInPage(position: number): number;
|
|
71
|
-
private allocatePage;
|
|
72
|
-
private freePage;
|
|
73
|
-
/**
|
|
74
|
-
* Get cache statistics
|
|
75
|
-
*/
|
|
76
|
-
getStats(): {
|
|
77
|
-
totalPages: number;
|
|
78
|
-
usedPages: number;
|
|
79
|
-
freePages: number;
|
|
80
|
-
numSequences: number;
|
|
81
|
-
memoryUsedBytes: number;
|
|
82
|
-
memoryTotalBytes: number;
|
|
83
|
-
};
|
|
84
|
-
/**
|
|
85
|
-
* Get GPU buffers for kernel binding
|
|
86
|
-
*/
|
|
87
|
-
getBuffers(): {
|
|
88
|
-
keyCache: GPUBuffer;
|
|
89
|
-
valueCache: GPUBuffer;
|
|
90
|
-
};
|
|
91
|
-
/**
|
|
92
|
-
* Get configuration
|
|
93
|
-
*/
|
|
94
|
-
getConfig(): PagedKVCacheConfig;
|
|
95
|
-
/**
|
|
96
|
-
* Dispose GPU resources
|
|
97
|
-
*/
|
|
98
|
-
dispose(): void;
|
|
99
|
-
}
|
package/dist/attention/scheduler.d.ts
DELETED
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Attention Scheduler - Prevents TDR (GPU timeout) by splitting long sequences
|
|
3
|
-
*/
|
|
4
|
-
import type { WebInferDevice } from "../core/device.ts";
|
|
5
|
-
export interface ChunkPlan {
|
|
6
|
-
numChunks: number;
|
|
7
|
-
chunkSize: number;
|
|
8
|
-
estimatedTimeMs: number;
|
|
9
|
-
}
|
|
10
|
-
/**
|
|
11
|
-
* Attention Scheduler for TDR prevention
|
|
12
|
-
* Splits long sequences into chunks to avoid GPU timeout
|
|
13
|
-
*/
|
|
14
|
-
export declare class AttentionScheduler {
|
|
15
|
-
private device;
|
|
16
|
-
private tdrLimit;
|
|
17
|
-
constructor(device: WebInferDevice);
|
|
18
|
-
private detectTDRLimit;
|
|
19
|
-
/**
|
|
20
|
-
* Estimate execution time for attention operation
|
|
21
|
-
* Based on empirical formula: time ∝ seqLen² × numHeads × headDim
|
|
22
|
-
*/
|
|
23
|
-
estimateExecutionTime(seqLen: number, numHeads: number, headDim: number): number;
|
|
24
|
-
/**
|
|
25
|
-
* Compute chunk plan for given sequence length
|
|
26
|
-
*/
|
|
27
|
-
computeChunkPlan(seqLen: number, numHeads: number, headDim: number): ChunkPlan;
|
|
28
|
-
/**
|
|
29
|
-
* Yield to main thread to prevent TDR
|
|
30
|
-
*/
|
|
31
|
-
yieldToMain(): Promise<void>;
|
|
32
|
-
/**
|
|
33
|
-
* Check if sequence might cause TDR
|
|
34
|
-
*/
|
|
35
|
-
mightCauseTDR(seqLen: number, numHeads: number, headDim: number): boolean;
|
|
36
|
-
/**
|
|
37
|
-
* Get recommended maximum sequence length for single-pass execution
|
|
38
|
-
*/
|
|
39
|
-
getMaxSinglePassSeqLen(numHeads: number, headDim: number): number;
|
|
40
|
-
}
|
package/dist/core/buffer-pool.d.ts
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GPU Buffer Pool - Reduces allocation overhead and memory fragmentation
|
|
3
|
-
*/
|
|
4
|
-
export declare class BufferPool {
|
|
5
|
-
private device;
|
|
6
|
-
private pools;
|
|
7
|
-
private sizeClasses;
|
|
8
|
-
constructor(device: GPUDevice);
|
|
9
|
-
private getSizeClass;
|
|
10
|
-
acquire(size: number, usage: GPUBufferUsageFlags): GPUBuffer;
|
|
11
|
-
release(buffer: GPUBuffer): void;
|
|
12
|
-
getStats(): {
|
|
13
|
-
totalBuffers: number;
|
|
14
|
-
inUse: number;
|
|
15
|
-
totalBytes: number;
|
|
16
|
-
};
|
|
17
|
-
dispose(): void;
|
|
18
|
-
}
|
package/dist/core/device.d.ts
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* WebGPU Device Management
|
|
3
|
-
*/
|
|
4
|
-
export interface DeviceInfo {
|
|
5
|
-
vendor: "apple" | "nvidia" | "intel" | "amd" | "unknown";
|
|
6
|
-
architecture: string;
|
|
7
|
-
maxWorkgroupSize: number;
|
|
8
|
-
maxComputeInvocationsPerWorkgroup: number;
|
|
9
|
-
maxStorageBufferBindingSize: number;
|
|
10
|
-
}
|
|
11
|
-
export declare class WebInferDevice {
|
|
12
|
-
private _device;
|
|
13
|
-
private _info;
|
|
14
|
-
private constructor();
|
|
15
|
-
static create(): Promise<WebInferDevice>;
|
|
16
|
-
private static detectDeviceInfo;
|
|
17
|
-
get device(): GPUDevice;
|
|
18
|
-
get info(): DeviceInfo;
|
|
19
|
-
get limits(): GPUSupportedLimits;
|
|
20
|
-
createCommandEncoder(): GPUCommandEncoder;
|
|
21
|
-
submit(commandBuffers: GPUCommandBuffer[]): void;
|
|
22
|
-
dispose(): void;
|
|
23
|
-
}
|
package/dist/inference/engine.d.ts
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Inference Engine
|
|
3
|
-
* Core engine for running LLM inference
|
|
4
|
-
*/
|
|
5
|
-
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
-
import type { LoadedModel } from "../model/types.ts";
|
|
7
|
-
import type { ModelConfig, InferenceConfig, ForwardResult } from "./types.ts";
|
|
8
|
-
/**
|
|
9
|
-
* Inference Engine
|
|
10
|
-
* Manages model weights and provides forward pass functionality
|
|
11
|
-
*/
|
|
12
|
-
export declare class InferenceEngine {
|
|
13
|
-
private device;
|
|
14
|
-
private config;
|
|
15
|
-
private modelConfig;
|
|
16
|
-
private weights;
|
|
17
|
-
private loadedModel;
|
|
18
|
-
private kvCache;
|
|
19
|
-
private ropeFreqsCos;
|
|
20
|
-
private ropeFreqsSin;
|
|
21
|
-
constructor(device: WebInferDevice | null, config?: InferenceConfig);
|
|
22
|
-
/**
|
|
23
|
-
* Load model weights from a LoadedModel
|
|
24
|
-
*/
|
|
25
|
-
loadModel(model: LoadedModel, modelConfig: ModelConfig): Promise<void>;
|
|
26
|
-
/**
|
|
27
|
-
* Extract model weights from loaded model
|
|
28
|
-
*/
|
|
29
|
-
private extractWeights;
|
|
30
|
-
/**
|
|
31
|
-
* Initialize KV cache
|
|
32
|
-
*/
|
|
33
|
-
private initKVCache;
|
|
34
|
-
/**
|
|
35
|
-
* Reset KV cache (for new sequence)
|
|
36
|
-
*/
|
|
37
|
-
resetKVCache(): void;
|
|
38
|
-
/**
|
|
39
|
-
* Forward pass (CPU reference implementation)
|
|
40
|
-
* @param inputIds - Input token IDs [seqLen]
|
|
41
|
-
* @param startPos - Starting position for KV cache
|
|
42
|
-
* @returns Logits for the last token
|
|
43
|
-
*/
|
|
44
|
-
forward(inputIds: Uint32Array, startPos?: number): ForwardResult;
|
|
45
|
-
/**
|
|
46
|
-
* Attention forward pass
|
|
47
|
-
*/
|
|
48
|
-
private attentionForward;
|
|
49
|
-
/**
|
|
50
|
-
* Apply RoPE to a single head position
|
|
51
|
-
*/
|
|
52
|
-
private applyRoPE;
|
|
53
|
-
/**
|
|
54
|
-
* FFN forward pass (SwiGLU)
|
|
55
|
-
*/
|
|
56
|
-
private ffnForward;
|
|
57
|
-
/**
|
|
58
|
-
* Get model configuration
|
|
59
|
-
*/
|
|
60
|
-
getModelConfig(): ModelConfig | null;
|
|
61
|
-
/**
|
|
62
|
-
* Check if model is loaded
|
|
63
|
-
*/
|
|
64
|
-
isLoaded(): boolean;
|
|
65
|
-
/**
|
|
66
|
-
* Dispose resources
|
|
67
|
-
*/
|
|
68
|
-
dispose(): void;
|
|
69
|
-
}
|
package/dist/inference/generate.d.ts
DELETED
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Text Generation API
|
|
3
|
-
* High-level API for generating text with LLMs
|
|
4
|
-
*/
|
|
5
|
-
import type { InferenceEngine } from "./engine.ts";
|
|
6
|
-
import type { GenerationConfig, GenerationResult, StreamToken } from "./types.ts";
|
|
7
|
-
/**
|
|
8
|
-
* Apply generation config to logits and sample next token
|
|
9
|
-
*/
|
|
10
|
-
export declare function sampleNextToken(logits: Float32Array, config: GenerationConfig, generatedTokens?: number[]): number;
|
|
11
|
-
/**
|
|
12
|
-
* Generate tokens from a prompt
|
|
13
|
-
* @param engine - Initialized inference engine with loaded model
|
|
14
|
-
* @param promptTokens - Tokenized prompt
|
|
15
|
-
* @param config - Generation configuration
|
|
16
|
-
* @returns Generation result
|
|
17
|
-
*/
|
|
18
|
-
export declare function generate(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): Promise<GenerationResult>;
|
|
19
|
-
/**
|
|
20
|
-
* Generate tokens with streaming (async iterator)
|
|
21
|
-
* @param engine - Initialized inference engine with loaded model
|
|
22
|
-
* @param promptTokens - Tokenized prompt
|
|
23
|
-
* @param config - Generation configuration
|
|
24
|
-
* @yields StreamToken for each generated token
|
|
25
|
-
*/
|
|
26
|
-
export declare function generateStream(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): AsyncGenerator<StreamToken, void, unknown>;
|
|
27
|
-
/**
|
|
28
|
-
* Simple greedy decode (no sampling, fastest)
|
|
29
|
-
*/
|
|
30
|
-
export declare function greedyDecode(engine: InferenceEngine, promptTokens: number[] | Uint32Array, maxTokens: number, eosTokenId?: number): number[];
|
package/dist/inference/index.d.ts
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Inference Module
|
|
3
|
-
* High-level API for LLM inference
|
|
4
|
-
*/
|
|
5
|
-
export { type ModelConfig, type InferenceConfig, type GenerationConfig, type GenerationResult, type StreamToken, type FinishReason, type ForwardResult, type ModelWeights, type LayerWeights, DEFAULT_GENERATION_CONFIG, normalizeGenerationConfig, } from "./types.ts";
|
|
6
|
-
export { InferenceEngine } from "./engine.ts";
|
|
7
|
-
export { generate, generateStream, greedyDecode, sampleNextToken, } from "./generate.ts";
|
package/dist/inference/types.d.ts
DELETED
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Inference Types
|
|
3
|
-
* Configuration and result types for model inference
|
|
4
|
-
*/
|
|
5
|
-
import type { DType } from "../core/tensor.ts";
|
|
6
|
-
/**
|
|
7
|
-
* Model architecture configuration
|
|
8
|
-
*/
|
|
9
|
-
export interface ModelConfig {
|
|
10
|
-
/** Model architecture (e.g., "llama", "mistral", "gpt2") */
|
|
11
|
-
architecture: string;
|
|
12
|
-
/** Number of transformer layers */
|
|
13
|
-
numLayers: number;
|
|
14
|
-
/** Number of attention heads */
|
|
15
|
-
numHeads: number;
|
|
16
|
-
/** Number of key-value heads (for GQA, defaults to numHeads) */
|
|
17
|
-
numKVHeads?: number;
|
|
18
|
-
/** Hidden/embedding dimension */
|
|
19
|
-
hiddenSize: number;
|
|
20
|
-
/** Intermediate size for FFN */
|
|
21
|
-
intermediateSize: number;
|
|
22
|
-
/** Vocabulary size */
|
|
23
|
-
vocabSize: number;
|
|
24
|
-
/** Maximum sequence length */
|
|
25
|
-
maxSeqLen: number;
|
|
26
|
-
/** Head dimension (defaults to hiddenSize / numHeads) */
|
|
27
|
-
headDim?: number;
|
|
28
|
-
/** RoPE frequency base */
|
|
29
|
-
ropeFreqBase?: number;
|
|
30
|
-
/** RMS norm epsilon */
|
|
31
|
-
rmsNormEps?: number;
|
|
32
|
-
/** Data type for computation */
|
|
33
|
-
dtype?: DType;
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Inference engine configuration
|
|
37
|
-
*/
|
|
38
|
-
export interface InferenceConfig {
|
|
39
|
-
/** Maximum batch size */
|
|
40
|
-
maxBatchSize?: number;
|
|
41
|
-
/** Maximum sequence length */
|
|
42
|
-
maxSeqLen?: number;
|
|
43
|
-
/** Use KV cache for generation */
|
|
44
|
-
useKVCache?: boolean;
|
|
45
|
-
/** Memory limit in bytes (optional) */
|
|
46
|
-
memoryLimit?: number;
|
|
47
|
-
/** Enable profiling */
|
|
48
|
-
enableProfiling?: boolean;
|
|
49
|
-
}
|
|
50
|
-
/**
|
|
51
|
-
* Generation / sampling configuration
|
|
52
|
-
*/
|
|
53
|
-
export interface GenerationConfig {
|
|
54
|
-
/** Maximum number of tokens to generate */
|
|
55
|
-
maxTokens: number;
|
|
56
|
-
/** Temperature for sampling (0 = greedy) */
|
|
57
|
-
temperature?: number;
|
|
58
|
-
/** Top-K sampling (0 = disabled) */
|
|
59
|
-
topK?: number;
|
|
60
|
-
/** Top-P / nucleus sampling (1.0 = disabled) */
|
|
61
|
-
topP?: number;
|
|
62
|
-
/** Repetition penalty (1.0 = disabled) */
|
|
63
|
-
repetitionPenalty?: number;
|
|
64
|
-
/** Stop sequences (generation stops when any is generated) */
|
|
65
|
-
stopSequences?: number[][];
|
|
66
|
-
/** EOS token ID */
|
|
67
|
-
eosTokenId?: number;
|
|
68
|
-
/** Pad token ID */
|
|
69
|
-
padTokenId?: number;
|
|
70
|
-
/** BOS token ID */
|
|
71
|
-
bosTokenId?: number;
|
|
72
|
-
/** Stream tokens as they are generated */
|
|
73
|
-
stream?: boolean;
|
|
74
|
-
/** Random seed for reproducibility */
|
|
75
|
-
seed?: number;
|
|
76
|
-
}
|
|
77
|
-
/**
|
|
78
|
-
* Default generation config values
|
|
79
|
-
*/
|
|
80
|
-
export declare const DEFAULT_GENERATION_CONFIG: Required<Omit<GenerationConfig, "stopSequences" | "seed">>;
|
|
81
|
-
/**
|
|
82
|
-
* Reason for generation completion
|
|
83
|
-
*/
|
|
84
|
-
export type FinishReason = "stop" | "length" | "eos";
|
|
85
|
-
/**
|
|
86
|
-
* Result of text generation
|
|
87
|
-
*/
|
|
88
|
-
export interface GenerationResult {
|
|
89
|
-
/** Generated token IDs */
|
|
90
|
-
tokens: number[];
|
|
91
|
-
/** Finish reason */
|
|
92
|
-
finishReason: FinishReason;
|
|
93
|
-
/** Number of prompt tokens */
|
|
94
|
-
promptTokens: number;
|
|
95
|
-
/** Number of generated tokens */
|
|
96
|
-
generatedTokens: number;
|
|
97
|
-
/** Total time in milliseconds */
|
|
98
|
-
totalTimeMs: number;
|
|
99
|
-
/** Tokens per second */
|
|
100
|
-
tokensPerSecond: number;
|
|
101
|
-
}
|
|
102
|
-
/**
|
|
103
|
-
* Streaming generation token
|
|
104
|
-
*/
|
|
105
|
-
export interface StreamToken {
|
|
106
|
-
/** Token ID */
|
|
107
|
-
tokenId: number;
|
|
108
|
-
/** Token index in generation */
|
|
109
|
-
index: number;
|
|
110
|
-
/** Whether this is the final token */
|
|
111
|
-
isLast: boolean;
|
|
112
|
-
/** Finish reason (only set if isLast) */
|
|
113
|
-
finishReason?: FinishReason;
|
|
114
|
-
}
|
|
115
|
-
/**
|
|
116
|
-
* Forward pass result
|
|
117
|
-
*/
|
|
118
|
-
export interface ForwardResult {
|
|
119
|
-
/** Logits [batch, vocabSize] or [batch, seqLen, vocabSize] */
|
|
120
|
-
logits: Float32Array;
|
|
121
|
-
/** Shape of logits */
|
|
122
|
-
logitsShape: number[];
|
|
123
|
-
}
|
|
124
|
-
/**
|
|
125
|
-
* Validate and normalize generation config
|
|
126
|
-
*/
|
|
127
|
-
export declare function normalizeGenerationConfig(config: Partial<GenerationConfig>): GenerationConfig;
|
|
128
|
-
/**
|
|
129
|
-
* Model layer weights
|
|
130
|
-
*/
|
|
131
|
-
export interface LayerWeights {
|
|
132
|
-
/** Attention weights */
|
|
133
|
-
attention: {
|
|
134
|
-
qProj: Float32Array;
|
|
135
|
-
kProj: Float32Array;
|
|
136
|
-
vProj: Float32Array;
|
|
137
|
-
oProj: Float32Array;
|
|
138
|
-
};
|
|
139
|
-
/** FFN weights */
|
|
140
|
-
ffn: {
|
|
141
|
-
gate?: Float32Array;
|
|
142
|
-
up: Float32Array;
|
|
143
|
-
down: Float32Array;
|
|
144
|
-
};
|
|
145
|
-
/** Normalization */
|
|
146
|
-
inputNorm: Float32Array;
|
|
147
|
-
postAttentionNorm: Float32Array;
|
|
148
|
-
}
|
|
149
|
-
/**
|
|
150
|
-
* Full model weights
|
|
151
|
-
*/
|
|
152
|
-
export interface ModelWeights {
|
|
153
|
-
/** Token embeddings */
|
|
154
|
-
embedTokens: Float32Array;
|
|
155
|
-
/** Layer weights */
|
|
156
|
-
layers: LayerWeights[];
|
|
157
|
-
/** Final norm */
|
|
158
|
-
finalNorm: Float32Array;
|
|
159
|
-
/** LM head (output projection) */
|
|
160
|
-
lmHead: Float32Array;
|
|
161
|
-
}
|
package/dist/jit/compiler.d.ts
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* WGSL Compiler - Generates optimized GPU kernels
|
|
3
|
-
*/
|
|
4
|
-
import { KernelCache } from "./kernel-cache.ts";
|
|
5
|
-
import type { DeviceInfo } from "../core/device.ts";
|
|
6
|
-
export interface MatMulConfig {
|
|
7
|
-
M: number;
|
|
8
|
-
N: number;
|
|
9
|
-
K: number;
|
|
10
|
-
tileM?: number;
|
|
11
|
-
tileN?: number;
|
|
12
|
-
tileK?: number;
|
|
13
|
-
}
|
|
14
|
-
export declare class WGSLCompiler {
|
|
15
|
-
private device;
|
|
16
|
-
private cache;
|
|
17
|
-
private deviceInfo;
|
|
18
|
-
constructor(device: GPUDevice, cache: KernelCache, deviceInfo: DeviceInfo);
|
|
19
|
-
private selectTileSize;
|
|
20
|
-
compileMatMul(config: MatMulConfig): GPUComputePipeline;
|
|
21
|
-
private generateMatMulWGSL;
|
|
22
|
-
getCacheStats(): import("./kernel-cache.ts").CacheStats;
|
|
23
|
-
}
|
package/dist/jit/kernel-cache.d.ts
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Kernel Cache - Caches compiled GPU compute pipelines
|
|
3
|
-
*/
|
|
4
|
-
export interface CacheStats {
|
|
5
|
-
hits: number;
|
|
6
|
-
misses: number;
|
|
7
|
-
size: number;
|
|
8
|
-
}
|
|
9
|
-
export declare class KernelCache {
|
|
10
|
-
private device;
|
|
11
|
-
private cache;
|
|
12
|
-
private hits;
|
|
13
|
-
private misses;
|
|
14
|
-
constructor(device: GPUDevice);
|
|
15
|
-
getOrCreate(key: string, createFn: () => GPUComputePipeline): GPUComputePipeline;
|
|
16
|
-
has(key: string): boolean;
|
|
17
|
-
get(key: string): GPUComputePipeline | undefined;
|
|
18
|
-
set(key: string, pipeline: GPUComputePipeline): void;
|
|
19
|
-
getStats(): CacheStats;
|
|
20
|
-
clear(): void;
|
|
21
|
-
}
|
package/dist/model/gguf.d.ts
DELETED
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GGUF Format Parser
|
|
3
|
-
*
|
|
4
|
-
* GGUF file structure:
|
|
5
|
-
* - 4 bytes: magic "GGUF"
|
|
6
|
-
* - 4 bytes: version (3)
|
|
7
|
-
* - 8 bytes: n_tensors (u64)
|
|
8
|
-
* - 8 bytes: n_kv (u64)
|
|
9
|
-
* - Key-value metadata pairs
|
|
10
|
-
* - Tensor info descriptors
|
|
11
|
-
* - Padding to alignment
|
|
12
|
-
* - Tensor data
|
|
13
|
-
*/
|
|
14
|
-
import { type GGUFTensorInfo, type TensorInfo, type LoadedModel, type LoadOptions, GGUFQuantType } from "./types.ts";
|
|
15
|
-
/**
|
|
16
|
-
* Reader helper for GGUF binary format
|
|
17
|
-
*/
|
|
18
|
-
declare class GGUFReader {
|
|
19
|
-
private view;
|
|
20
|
-
private offset;
|
|
21
|
-
private textDecoder;
|
|
22
|
-
constructor(buffer: ArrayBuffer);
|
|
23
|
-
get position(): number;
|
|
24
|
-
set position(pos: number);
|
|
25
|
-
readUint8(): number;
|
|
26
|
-
readInt8(): number;
|
|
27
|
-
readUint16(): number;
|
|
28
|
-
readInt16(): number;
|
|
29
|
-
readUint32(): number;
|
|
30
|
-
readInt32(): number;
|
|
31
|
-
readUint64(): bigint;
|
|
32
|
-
readInt64(): bigint;
|
|
33
|
-
readFloat32(): number;
|
|
34
|
-
readFloat64(): number;
|
|
35
|
-
readBool(): boolean;
|
|
36
|
-
readString(): string;
|
|
37
|
-
alignTo(alignment: number): void;
|
|
38
|
-
}
|
|
39
|
-
/**
|
|
40
|
-
* GGUF header information
|
|
41
|
-
*/
|
|
42
|
-
export interface GGUFHeader {
|
|
43
|
-
magic: number;
|
|
44
|
-
version: number;
|
|
45
|
-
nTensors: bigint;
|
|
46
|
-
nKV: bigint;
|
|
47
|
-
}
|
|
48
|
-
/**
|
|
49
|
-
* Parse the GGUF header
|
|
50
|
-
*/
|
|
51
|
-
export declare function parseGGUFHeader(reader: GGUFReader): GGUFHeader;
|
|
52
|
-
/**
|
|
53
|
-
* Parse all metadata key-value pairs
|
|
54
|
-
*/
|
|
55
|
-
export declare function parseGGUFMetadata(reader: GGUFReader, nKV: bigint): Map<string, unknown>;
|
|
56
|
-
/**
|
|
57
|
-
* Parse tensor info descriptors
|
|
58
|
-
*/
|
|
59
|
-
export declare function parseGGUFTensorInfos(reader: GGUFReader, nTensors: bigint): GGUFTensorInfo[];
|
|
60
|
-
/**
|
|
61
|
-
* Calculate byte size for a GGUF tensor
|
|
62
|
-
*/
|
|
63
|
-
export declare function calculateGGUFTensorBytes(type: GGUFQuantType, shape: number[]): number;
|
|
64
|
-
/**
|
|
65
|
-
* Load a GGUF model from an ArrayBuffer
|
|
66
|
-
*/
|
|
67
|
-
export declare function loadGGUF(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
|
|
68
|
-
/**
|
|
69
|
-
* Load GGUF from a URL
|
|
70
|
-
*/
|
|
71
|
-
export declare function loadGGUFFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
|
|
72
|
-
/**
|
|
73
|
-
* Dequantize Q4_0 block to float32
|
|
74
|
-
* Q4_0: 32 values = 2 bytes scale (f16) + 16 bytes data (4-bit packed)
|
|
75
|
-
*/
|
|
76
|
-
export declare function dequantizeQ4_0Block(data: Uint8Array, offset: number): Float32Array;
|
|
77
|
-
/**
|
|
78
|
-
* Dequantize Q8_0 block to float32
|
|
79
|
-
* Q8_0: 32 values = 2 bytes scale (f16) + 32 bytes data (int8)
|
|
80
|
-
*/
|
|
81
|
-
export declare function dequantizeQ8_0Block(data: Uint8Array, offset: number): Float32Array;
|
|
82
|
-
/**
|
|
83
|
-
* Load and dequantize a GGUF tensor to Float32Array
|
|
84
|
-
*/
|
|
85
|
-
export declare function loadGGUFTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
|
|
86
|
-
/**
|
|
87
|
-
* Check if a buffer is a valid GGUF file
|
|
88
|
-
*/
|
|
89
|
-
export declare function isGGUF(buffer: ArrayBuffer): boolean;
|
|
90
|
-
export {};
|
package/dist/model/index.d.ts
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Model Loading Module
|
|
3
|
-
* Supports SafeTensors and GGUF formats
|
|
4
|
-
*/
|
|
5
|
-
import type { ModelFormat, LoadOptions, LoadedModel } from "./types.ts";
|
|
6
|
-
export { type ModelFormat, type SafetensorsDType, GGUFQuantType, GGUFMetadataValueType, type TensorInfo, type SafetensorsHeader, type SafetensorsHeaderEntry, type ModelMetadata, type GGUFTensorInfo, type LoadedTensor, type LoadedModel, type LoadOptions, SAFETENSORS_DTYPE_BYTES, GGUF_QUANT_BLOCK_SIZE, GGUF_QUANT_BYTES_PER_BLOCK, } from "./types.ts";
|
|
7
|
-
export { parseSafetensorsHeader, getSafetensorsTensorInfos, loadSafetensorsTensor, loadSafetensors, loadSafetensorsFromUrl, isSafetensors, } from "./safetensors.ts";
|
|
8
|
-
export { type GGUFHeader, parseGGUFHeader, parseGGUFMetadata, parseGGUFTensorInfos, calculateGGUFTensorBytes, loadGGUF, loadGGUFFromUrl, loadGGUFTensor, dequantizeQ4_0Block, dequantizeQ8_0Block, isGGUF, } from "./gguf.ts";
|
|
9
|
-
/**
|
|
10
|
-
* Auto-detect model format from buffer
|
|
11
|
-
*/
|
|
12
|
-
export declare function detectModelFormat(buffer: ArrayBuffer): ModelFormat | null;
|
|
13
|
-
/**
|
|
14
|
-
* Load a model file, auto-detecting the format
|
|
15
|
-
*/
|
|
16
|
-
export declare function loadModel(source: ArrayBuffer | string, options?: LoadOptions): Promise<LoadedModel>;
|
package/dist/model/safetensors.d.ts
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* SafeTensors Format Parser
|
|
3
|
-
*
|
|
4
|
-
* SafeTensors file structure:
|
|
5
|
-
* - 8 bytes: header size (little-endian u64)
|
|
6
|
-
* - N bytes: JSON header (UTF-8)
|
|
7
|
-
* - Remaining: tensor data (contiguous)
|
|
8
|
-
*/
|
|
9
|
-
import { type SafetensorsHeader, type TensorInfo, type LoadedModel, type LoadOptions } from "./types.ts";
|
|
10
|
-
/**
|
|
11
|
-
* Parse the SafeTensors header from a buffer
|
|
12
|
-
* @param buffer - ArrayBuffer containing the SafeTensors file
|
|
13
|
-
* @returns Parsed header and data offset
|
|
14
|
-
*/
|
|
15
|
-
export declare function parseSafetensorsHeader(buffer: ArrayBuffer): {
|
|
16
|
-
header: SafetensorsHeader;
|
|
17
|
-
dataOffset: number;
|
|
18
|
-
};
|
|
19
|
-
/**
|
|
20
|
-
* Extract tensor information from SafeTensors header
|
|
21
|
-
*/
|
|
22
|
-
export declare function getSafetensorsTensorInfos(header: SafetensorsHeader, dataOffset: number): Map<string, TensorInfo>;
|
|
23
|
-
/**
|
|
24
|
-
* Load a single tensor's data from the buffer
|
|
25
|
-
*/
|
|
26
|
-
export declare function loadSafetensorsTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
|
|
27
|
-
/**
|
|
28
|
-
* Load a SafeTensors model from an ArrayBuffer
|
|
29
|
-
*/
|
|
30
|
-
export declare function loadSafetensors(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
|
|
31
|
-
/**
|
|
32
|
-
* Load SafeTensors from a URL
|
|
33
|
-
*/
|
|
34
|
-
export declare function loadSafetensorsFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
|
|
35
|
-
/**
|
|
36
|
-
* Check if a buffer is a valid SafeTensors file
|
|
37
|
-
*/
|
|
38
|
-
export declare function isSafetensors(buffer: ArrayBuffer): boolean;
|