webinfer 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -21
- package/dist/activation/index.d.ts +30 -0
- package/dist/core/context.d.ts +70 -0
- package/dist/core/paged-kv-cache.d.ts +33 -0
- package/dist/core/tensor.d.ts +51 -19
- package/dist/core/types.d.ts +27 -0
- package/dist/decode/index.d.ts +140 -0
- package/dist/gemm/index.d.ts +27 -0
- package/dist/index.d.ts +29 -21
- package/dist/index.js +3433 -4809
- package/dist/jit/index.d.ts +138 -0
- package/dist/kernels/activation.wgsl.d.ts +14 -0
- package/dist/kernels/batch-decode-paged.wgsl.d.ts +12 -0
- package/dist/kernels/batch-prefill-paged.wgsl.d.ts +13 -0
- package/dist/kernels/decode-attention.wgsl.d.ts +16 -0
- package/dist/kernels/gemm.wgsl.d.ts +17 -0
- package/dist/kernels/page.wgsl.d.ts +10 -0
- package/dist/kernels/prefill-attention.wgsl.d.ts +17 -0
- package/dist/kernels/rmsnorm.wgsl.d.ts +10 -0
- package/dist/kernels/rope.wgsl.d.ts +19 -0
- package/dist/kernels/sampling.wgsl.d.ts +23 -0
- package/dist/norm/index.d.ts +43 -0
- package/dist/page/index.d.ts +21 -0
- package/dist/prefill/index.d.ts +155 -0
- package/dist/rope/index.d.ts +37 -0
- package/dist/sampling/index.d.ts +53 -4
- package/package.json +1 -1
- package/dist/attention/block-sparse/format.d.ts +0 -52
- package/dist/attention/block-sparse/patterns/causal.d.ts +0 -16
- package/dist/attention/block-sparse/patterns/sliding.d.ts +0 -22
- package/dist/attention/block-sparse/patterns/tree.d.ts +0 -65
- package/dist/attention/cascaded-inference.d.ts +0 -29
- package/dist/attention/flash-attention.d.ts +0 -30
- package/dist/attention/index.d.ts +0 -118
- package/dist/attention/paged-attention.d.ts +0 -40
- package/dist/attention/paged-kv/block-manager.d.ts +0 -102
- package/dist/attention/paged-kv/index.d.ts +0 -5
- package/dist/attention/paged-kv/page-table.d.ts +0 -165
- package/dist/attention/scheduler.d.ts +0 -40
- package/dist/core/buffer-pool.d.ts +0 -18
- package/dist/core/device.d.ts +0 -23
- package/dist/core/tdr.d.ts +0 -114
- package/dist/inference/engine.d.ts +0 -69
- package/dist/inference/generate.d.ts +0 -30
- package/dist/inference/index.d.ts +0 -7
- package/dist/inference/types.d.ts +0 -161
- package/dist/jit/compiler.d.ts +0 -23
- package/dist/jit/kernel-cache.d.ts +0 -21
- package/dist/model/gguf.d.ts +0 -90
- package/dist/model/index.d.ts +0 -16
- package/dist/model/safetensors.d.ts +0 -38
- package/dist/model/types.d.ts +0 -182
- package/dist/ops/activations.d.ts +0 -43
- package/dist/ops/elementwise.d.ts +0 -38
- package/dist/ops/embedding.d.ts +0 -30
- package/dist/ops/matmul.d.ts +0 -21
- package/dist/ops/normalization.d.ts +0 -63
- package/dist/ops/reshape.d.ts +0 -39
- package/dist/ops/rope.d.ts +0 -32
- package/dist/ops/softmax.d.ts +0 -18
- package/dist/quantization/index.d.ts +0 -6
- package/dist/quantization/qmatmul.d.ts +0 -38
- package/dist/quantization/quantize.d.ts +0 -52
- package/dist/sampling/beam-search.d.ts +0 -87
- package/dist/sampling/sampler.d.ts +0 -72
- package/dist/sampling/speculative.d.ts +0 -65
- package/dist/sampling/top-k.d.ts +0 -24
- package/dist/sampling/top-p.d.ts +0 -14
- package/dist/tvm/adapter.d.ts +0 -81
- package/dist/tvm/index.d.ts +0 -8
- package/dist/tvm/ops.d.ts +0 -26
- package/dist/tvm/types.d.ts +0 -35
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GPU Buffer Pool - Reduces allocation overhead and memory fragmentation
|
|
3
|
-
*/
|
|
4
|
-
export declare class BufferPool {
|
|
5
|
-
private device;
|
|
6
|
-
private pools;
|
|
7
|
-
private sizeClasses;
|
|
8
|
-
constructor(device: GPUDevice);
|
|
9
|
-
private getSizeClass;
|
|
10
|
-
acquire(size: number, usage: GPUBufferUsageFlags): GPUBuffer;
|
|
11
|
-
release(buffer: GPUBuffer): void;
|
|
12
|
-
getStats(): {
|
|
13
|
-
totalBuffers: number;
|
|
14
|
-
inUse: number;
|
|
15
|
-
totalBytes: number;
|
|
16
|
-
};
|
|
17
|
-
dispose(): void;
|
|
18
|
-
}
|
package/dist/core/device.d.ts
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* WebGPU Device Management
|
|
3
|
-
*/
|
|
4
|
-
export interface DeviceInfo {
|
|
5
|
-
vendor: "apple" | "nvidia" | "intel" | "amd" | "unknown";
|
|
6
|
-
architecture: string;
|
|
7
|
-
maxWorkgroupSize: number;
|
|
8
|
-
maxComputeInvocationsPerWorkgroup: number;
|
|
9
|
-
maxStorageBufferBindingSize: number;
|
|
10
|
-
}
|
|
11
|
-
export declare class WebInferDevice {
|
|
12
|
-
private _device;
|
|
13
|
-
private _info;
|
|
14
|
-
private constructor();
|
|
15
|
-
static create(): Promise<WebInferDevice>;
|
|
16
|
-
private static detectDeviceInfo;
|
|
17
|
-
get device(): GPUDevice;
|
|
18
|
-
get info(): DeviceInfo;
|
|
19
|
-
get limits(): GPUSupportedLimits;
|
|
20
|
-
createCommandEncoder(): GPUCommandEncoder;
|
|
21
|
-
submit(commandBuffers: GPUCommandBuffer[]): void;
|
|
22
|
-
dispose(): void;
|
|
23
|
-
}
|
package/dist/core/tdr.d.ts
DELETED
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* TDR (Timeout Detection and Recovery) Prevention
|
|
3
|
-
* Handles browser-specific GPU timeout limits and graceful degradation
|
|
4
|
-
*/
|
|
5
|
-
/**
|
|
6
|
-
* Browser detection result
|
|
7
|
-
*/
|
|
8
|
-
export interface BrowserInfo {
|
|
9
|
-
name: "chrome" | "safari" | "firefox" | "edge" | "unknown";
|
|
10
|
-
version: number;
|
|
11
|
-
isMobile: boolean;
|
|
12
|
-
hasWebGPU: boolean;
|
|
13
|
-
}
|
|
14
|
-
/**
|
|
15
|
-
* TDR configuration for each browser
|
|
16
|
-
*/
|
|
17
|
-
export interface TDRConfig {
|
|
18
|
-
timeoutMs: number;
|
|
19
|
-
safetyMargin: number;
|
|
20
|
-
maxChunkSize: number;
|
|
21
|
-
supportsTimestampQuery: boolean;
|
|
22
|
-
}
|
|
23
|
-
/**
|
|
24
|
-
* Detect browser information
|
|
25
|
-
*/
|
|
26
|
-
export declare function detectBrowser(): BrowserInfo;
|
|
27
|
-
/**
|
|
28
|
-
* Get TDR configuration for current browser
|
|
29
|
-
*/
|
|
30
|
-
export declare function getTDRConfig(browser?: BrowserInfo): TDRConfig;
|
|
31
|
-
/**
|
|
32
|
-
* Graceful degradation options
|
|
33
|
-
*/
|
|
34
|
-
export interface DegradationOptions {
|
|
35
|
-
/** Use CPU fallback when GPU fails */
|
|
36
|
-
enableCpuFallback: boolean;
|
|
37
|
-
/** Reduce precision to f16 when memory is tight */
|
|
38
|
-
enablePrecisionReduction: boolean;
|
|
39
|
-
/** Auto-chunk large sequences */
|
|
40
|
-
enableAutoChunking: boolean;
|
|
41
|
-
/** Maximum retries before falling back */
|
|
42
|
-
maxRetries: number;
|
|
43
|
-
}
|
|
44
|
-
/**
|
|
45
|
-
* Error types for graceful handling
|
|
46
|
-
*/
|
|
47
|
-
export type WebGPUErrorType = "device_lost" | "out_of_memory" | "validation" | "timeout" | "unknown";
|
|
48
|
-
/**
|
|
49
|
-
* Classify WebGPU error
|
|
50
|
-
*/
|
|
51
|
-
export declare function classifyError(error: unknown): WebGPUErrorType;
|
|
52
|
-
/**
|
|
53
|
-
* TDR-safe execution wrapper
|
|
54
|
-
*/
|
|
55
|
-
export declare class TDRGuard {
|
|
56
|
-
private browser;
|
|
57
|
-
private config;
|
|
58
|
-
private options;
|
|
59
|
-
private lastExecutionTime;
|
|
60
|
-
constructor(options?: Partial<DegradationOptions>);
|
|
61
|
-
/**
|
|
62
|
-
* Get safe execution time limit
|
|
63
|
-
*/
|
|
64
|
-
getSafeTimeLimit(): number;
|
|
65
|
-
/**
|
|
66
|
-
* Check if operation might cause TDR
|
|
67
|
-
*/
|
|
68
|
-
mightTimeout(estimatedMs: number): boolean;
|
|
69
|
-
/**
|
|
70
|
-
* Calculate chunks needed for safe execution
|
|
71
|
-
*/
|
|
72
|
-
calcChunks(estimatedTotalMs: number): number;
|
|
73
|
-
/**
|
|
74
|
-
* Yield to browser main thread
|
|
75
|
-
*/
|
|
76
|
-
yield(): Promise<void>;
|
|
77
|
-
/**
|
|
78
|
-
* Execute with TDR protection
|
|
79
|
-
*/
|
|
80
|
-
execute<T>(fn: () => Promise<T>, options?: {
|
|
81
|
-
estimatedMs?: number;
|
|
82
|
-
onRetry?: (attempt: number, error: unknown) => void;
|
|
83
|
-
fallback?: () => T | Promise<T>;
|
|
84
|
-
}): Promise<T>;
|
|
85
|
-
/**
|
|
86
|
-
* Get browser info
|
|
87
|
-
*/
|
|
88
|
-
getBrowserInfo(): BrowserInfo;
|
|
89
|
-
/**
|
|
90
|
-
* Get TDR config
|
|
91
|
-
*/
|
|
92
|
-
getTDRConfig(): TDRConfig;
|
|
93
|
-
/**
|
|
94
|
-
* Get last execution time
|
|
95
|
-
*/
|
|
96
|
-
getLastExecutionTime(): number;
|
|
97
|
-
}
|
|
98
|
-
/**
|
|
99
|
-
* Check WebGPU support and capabilities
|
|
100
|
-
*/
|
|
101
|
-
export declare function checkWebGPUSupport(): Promise<{
|
|
102
|
-
supported: boolean;
|
|
103
|
-
reason?: string;
|
|
104
|
-
adapter?: GPUAdapter;
|
|
105
|
-
limits?: GPUSupportedLimits;
|
|
106
|
-
}>;
|
|
107
|
-
/**
|
|
108
|
-
* Create device with graceful degradation
|
|
109
|
-
*/
|
|
110
|
-
export declare function createDeviceWithFallback(adapter: GPUAdapter, options?: {
|
|
111
|
-
requiredFeatures?: GPUFeatureName[];
|
|
112
|
-
requiredLimits?: Record<string, number>;
|
|
113
|
-
onFallback?: (reason: string) => void;
|
|
114
|
-
}): Promise<GPUDevice>;
|
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Inference Engine
|
|
3
|
-
* Core engine for running LLM inference
|
|
4
|
-
*/
|
|
5
|
-
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
-
import type { LoadedModel } from "../model/types.ts";
|
|
7
|
-
import type { ForwardResult, InferenceConfig, ModelConfig } from "./types.ts";
|
|
8
|
-
/**
|
|
9
|
-
* Inference Engine
|
|
10
|
-
* Manages model weights and provides forward pass functionality
|
|
11
|
-
*/
|
|
12
|
-
export declare class InferenceEngine {
|
|
13
|
-
private device;
|
|
14
|
-
private config;
|
|
15
|
-
private modelConfig;
|
|
16
|
-
private weights;
|
|
17
|
-
private loadedModel;
|
|
18
|
-
private kvCache;
|
|
19
|
-
private ropeFreqsCos;
|
|
20
|
-
private ropeFreqsSin;
|
|
21
|
-
constructor(device: WebInferDevice | null, config?: InferenceConfig);
|
|
22
|
-
/**
|
|
23
|
-
* Load model weights from a LoadedModel
|
|
24
|
-
*/
|
|
25
|
-
loadModel(model: LoadedModel, modelConfig: ModelConfig): Promise<void>;
|
|
26
|
-
/**
|
|
27
|
-
* Extract model weights from loaded model
|
|
28
|
-
*/
|
|
29
|
-
private extractWeights;
|
|
30
|
-
/**
|
|
31
|
-
* Initialize KV cache
|
|
32
|
-
*/
|
|
33
|
-
private initKVCache;
|
|
34
|
-
/**
|
|
35
|
-
* Reset KV cache (for new sequence)
|
|
36
|
-
*/
|
|
37
|
-
resetKVCache(): void;
|
|
38
|
-
/**
|
|
39
|
-
* Forward pass (CPU reference implementation)
|
|
40
|
-
* @param inputIds - Input token IDs [seqLen]
|
|
41
|
-
* @param startPos - Starting position for KV cache
|
|
42
|
-
* @returns Logits for the last token
|
|
43
|
-
*/
|
|
44
|
-
forward(inputIds: Uint32Array, startPos?: number): ForwardResult;
|
|
45
|
-
/**
|
|
46
|
-
* Attention forward pass
|
|
47
|
-
*/
|
|
48
|
-
private attentionForward;
|
|
49
|
-
/**
|
|
50
|
-
* Apply RoPE to a single head position
|
|
51
|
-
*/
|
|
52
|
-
private applyRoPE;
|
|
53
|
-
/**
|
|
54
|
-
* FFN forward pass (SwiGLU)
|
|
55
|
-
*/
|
|
56
|
-
private ffnForward;
|
|
57
|
-
/**
|
|
58
|
-
* Get model configuration
|
|
59
|
-
*/
|
|
60
|
-
getModelConfig(): ModelConfig | null;
|
|
61
|
-
/**
|
|
62
|
-
* Check if model is loaded
|
|
63
|
-
*/
|
|
64
|
-
isLoaded(): boolean;
|
|
65
|
-
/**
|
|
66
|
-
* Dispose resources
|
|
67
|
-
*/
|
|
68
|
-
dispose(): void;
|
|
69
|
-
}
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Text Generation API
|
|
3
|
-
* High-level API for generating text with LLMs
|
|
4
|
-
*/
|
|
5
|
-
import type { InferenceEngine } from "./engine.ts";
|
|
6
|
-
import type { GenerationConfig, GenerationResult, StreamToken } from "./types.ts";
|
|
7
|
-
/**
|
|
8
|
-
* Apply generation config to logits and sample next token
|
|
9
|
-
*/
|
|
10
|
-
export declare function sampleNextToken(logits: Float32Array, config: GenerationConfig, generatedTokens?: number[]): number;
|
|
11
|
-
/**
|
|
12
|
-
* Generate tokens from a prompt
|
|
13
|
-
* @param engine - Initialized inference engine with loaded model
|
|
14
|
-
* @param promptTokens - Tokenized prompt
|
|
15
|
-
* @param config - Generation configuration
|
|
16
|
-
* @returns Generation result
|
|
17
|
-
*/
|
|
18
|
-
export declare function generate(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): Promise<GenerationResult>;
|
|
19
|
-
/**
|
|
20
|
-
* Generate tokens with streaming (async iterator)
|
|
21
|
-
* @param engine - Initialized inference engine with loaded model
|
|
22
|
-
* @param promptTokens - Tokenized prompt
|
|
23
|
-
* @param config - Generation configuration
|
|
24
|
-
* @yields StreamToken for each generated token
|
|
25
|
-
*/
|
|
26
|
-
export declare function generateStream(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): AsyncGenerator<StreamToken, void, unknown>;
|
|
27
|
-
/**
|
|
28
|
-
* Simple greedy decode (no sampling, fastest)
|
|
29
|
-
*/
|
|
30
|
-
export declare function greedyDecode(engine: InferenceEngine, promptTokens: number[] | Uint32Array, maxTokens: number, eosTokenId?: number): number[];
|
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Inference Module
|
|
3
|
-
* High-level API for LLM inference
|
|
4
|
-
*/
|
|
5
|
-
export { InferenceEngine } from "./engine.ts";
|
|
6
|
-
export { generate, generateStream, greedyDecode, sampleNextToken, } from "./generate.ts";
|
|
7
|
-
export { DEFAULT_GENERATION_CONFIG, type FinishReason, type ForwardResult, type GenerationConfig, type GenerationResult, type InferenceConfig, type LayerWeights, type ModelConfig, type ModelWeights, normalizeGenerationConfig, type StreamToken, } from "./types.ts";
|
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Inference Types
|
|
3
|
-
* Configuration and result types for model inference
|
|
4
|
-
*/
|
|
5
|
-
import type { DType } from "../core/tensor.ts";
|
|
6
|
-
/**
|
|
7
|
-
* Model architecture configuration
|
|
8
|
-
*/
|
|
9
|
-
export interface ModelConfig {
|
|
10
|
-
/** Model architecture (e.g., "llama", "mistral", "gpt2") */
|
|
11
|
-
architecture: string;
|
|
12
|
-
/** Number of transformer layers */
|
|
13
|
-
numLayers: number;
|
|
14
|
-
/** Number of attention heads */
|
|
15
|
-
numHeads: number;
|
|
16
|
-
/** Number of key-value heads (for GQA, defaults to numHeads) */
|
|
17
|
-
numKVHeads?: number;
|
|
18
|
-
/** Hidden/embedding dimension */
|
|
19
|
-
hiddenSize: number;
|
|
20
|
-
/** Intermediate size for FFN */
|
|
21
|
-
intermediateSize: number;
|
|
22
|
-
/** Vocabulary size */
|
|
23
|
-
vocabSize: number;
|
|
24
|
-
/** Maximum sequence length */
|
|
25
|
-
maxSeqLen: number;
|
|
26
|
-
/** Head dimension (defaults to hiddenSize / numHeads) */
|
|
27
|
-
headDim?: number;
|
|
28
|
-
/** RoPE frequency base */
|
|
29
|
-
ropeFreqBase?: number;
|
|
30
|
-
/** RMS norm epsilon */
|
|
31
|
-
rmsNormEps?: number;
|
|
32
|
-
/** Data type for computation */
|
|
33
|
-
dtype?: DType;
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Inference engine configuration
|
|
37
|
-
*/
|
|
38
|
-
export interface InferenceConfig {
|
|
39
|
-
/** Maximum batch size */
|
|
40
|
-
maxBatchSize?: number;
|
|
41
|
-
/** Maximum sequence length */
|
|
42
|
-
maxSeqLen?: number;
|
|
43
|
-
/** Use KV cache for generation */
|
|
44
|
-
useKVCache?: boolean;
|
|
45
|
-
/** Memory limit in bytes (optional) */
|
|
46
|
-
memoryLimit?: number;
|
|
47
|
-
/** Enable profiling */
|
|
48
|
-
enableProfiling?: boolean;
|
|
49
|
-
}
|
|
50
|
-
/**
|
|
51
|
-
* Generation / sampling configuration
|
|
52
|
-
*/
|
|
53
|
-
export interface GenerationConfig {
|
|
54
|
-
/** Maximum number of tokens to generate */
|
|
55
|
-
maxTokens: number;
|
|
56
|
-
/** Temperature for sampling (0 = greedy) */
|
|
57
|
-
temperature?: number;
|
|
58
|
-
/** Top-K sampling (0 = disabled) */
|
|
59
|
-
topK?: number;
|
|
60
|
-
/** Top-P / nucleus sampling (1.0 = disabled) */
|
|
61
|
-
topP?: number;
|
|
62
|
-
/** Repetition penalty (1.0 = disabled) */
|
|
63
|
-
repetitionPenalty?: number;
|
|
64
|
-
/** Stop sequences (generation stops when any is generated) */
|
|
65
|
-
stopSequences?: number[][];
|
|
66
|
-
/** EOS token ID */
|
|
67
|
-
eosTokenId?: number;
|
|
68
|
-
/** Pad token ID */
|
|
69
|
-
padTokenId?: number;
|
|
70
|
-
/** BOS token ID */
|
|
71
|
-
bosTokenId?: number;
|
|
72
|
-
/** Stream tokens as they are generated */
|
|
73
|
-
stream?: boolean;
|
|
74
|
-
/** Random seed for reproducibility */
|
|
75
|
-
seed?: number;
|
|
76
|
-
}
|
|
77
|
-
/**
|
|
78
|
-
* Default generation config values
|
|
79
|
-
*/
|
|
80
|
-
export declare const DEFAULT_GENERATION_CONFIG: Required<Omit<GenerationConfig, "stopSequences" | "seed">>;
|
|
81
|
-
/**
|
|
82
|
-
* Reason for generation completion
|
|
83
|
-
*/
|
|
84
|
-
export type FinishReason = "stop" | "length" | "eos";
|
|
85
|
-
/**
|
|
86
|
-
* Result of text generation
|
|
87
|
-
*/
|
|
88
|
-
export interface GenerationResult {
|
|
89
|
-
/** Generated token IDs */
|
|
90
|
-
tokens: number[];
|
|
91
|
-
/** Finish reason */
|
|
92
|
-
finishReason: FinishReason;
|
|
93
|
-
/** Number of prompt tokens */
|
|
94
|
-
promptTokens: number;
|
|
95
|
-
/** Number of generated tokens */
|
|
96
|
-
generatedTokens: number;
|
|
97
|
-
/** Total time in milliseconds */
|
|
98
|
-
totalTimeMs: number;
|
|
99
|
-
/** Tokens per second */
|
|
100
|
-
tokensPerSecond: number;
|
|
101
|
-
}
|
|
102
|
-
/**
|
|
103
|
-
* Streaming generation token
|
|
104
|
-
*/
|
|
105
|
-
export interface StreamToken {
|
|
106
|
-
/** Token ID */
|
|
107
|
-
tokenId: number;
|
|
108
|
-
/** Token index in generation */
|
|
109
|
-
index: number;
|
|
110
|
-
/** Whether this is the final token */
|
|
111
|
-
isLast: boolean;
|
|
112
|
-
/** Finish reason (only set if isLast) */
|
|
113
|
-
finishReason?: FinishReason;
|
|
114
|
-
}
|
|
115
|
-
/**
|
|
116
|
-
* Forward pass result
|
|
117
|
-
*/
|
|
118
|
-
export interface ForwardResult {
|
|
119
|
-
/** Logits [batch, vocabSize] or [batch, seqLen, vocabSize] */
|
|
120
|
-
logits: Float32Array;
|
|
121
|
-
/** Shape of logits */
|
|
122
|
-
logitsShape: number[];
|
|
123
|
-
}
|
|
124
|
-
/**
|
|
125
|
-
* Validate and normalize generation config
|
|
126
|
-
*/
|
|
127
|
-
export declare function normalizeGenerationConfig(config: Partial<GenerationConfig>): GenerationConfig;
|
|
128
|
-
/**
|
|
129
|
-
* Model layer weights
|
|
130
|
-
*/
|
|
131
|
-
export interface LayerWeights {
|
|
132
|
-
/** Attention weights */
|
|
133
|
-
attention: {
|
|
134
|
-
qProj: Float32Array;
|
|
135
|
-
kProj: Float32Array;
|
|
136
|
-
vProj: Float32Array;
|
|
137
|
-
oProj: Float32Array;
|
|
138
|
-
};
|
|
139
|
-
/** FFN weights */
|
|
140
|
-
ffn: {
|
|
141
|
-
gate?: Float32Array;
|
|
142
|
-
up: Float32Array;
|
|
143
|
-
down: Float32Array;
|
|
144
|
-
};
|
|
145
|
-
/** Normalization */
|
|
146
|
-
inputNorm: Float32Array;
|
|
147
|
-
postAttentionNorm: Float32Array;
|
|
148
|
-
}
|
|
149
|
-
/**
|
|
150
|
-
* Full model weights
|
|
151
|
-
*/
|
|
152
|
-
export interface ModelWeights {
|
|
153
|
-
/** Token embeddings */
|
|
154
|
-
embedTokens: Float32Array;
|
|
155
|
-
/** Layer weights */
|
|
156
|
-
layers: LayerWeights[];
|
|
157
|
-
/** Final norm */
|
|
158
|
-
finalNorm: Float32Array;
|
|
159
|
-
/** LM head (output projection) */
|
|
160
|
-
lmHead: Float32Array;
|
|
161
|
-
}
|
package/dist/jit/compiler.d.ts
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* WGSL Compiler - Generates optimized GPU kernels
|
|
3
|
-
*/
|
|
4
|
-
import type { DeviceInfo } from "../core/device.ts";
|
|
5
|
-
import type { KernelCache } from "./kernel-cache.ts";
|
|
6
|
-
export interface MatMulConfig {
|
|
7
|
-
M: number;
|
|
8
|
-
N: number;
|
|
9
|
-
K: number;
|
|
10
|
-
tileM?: number;
|
|
11
|
-
tileN?: number;
|
|
12
|
-
tileK?: number;
|
|
13
|
-
}
|
|
14
|
-
export declare class WGSLCompiler {
|
|
15
|
-
private device;
|
|
16
|
-
private cache;
|
|
17
|
-
private deviceInfo;
|
|
18
|
-
constructor(device: GPUDevice, cache: KernelCache, deviceInfo: DeviceInfo);
|
|
19
|
-
private selectTileSize;
|
|
20
|
-
compileMatMul(config: MatMulConfig): GPUComputePipeline;
|
|
21
|
-
private generateMatMulWGSL;
|
|
22
|
-
getCacheStats(): import("./kernel-cache.ts").CacheStats;
|
|
23
|
-
}
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Kernel Cache - Caches compiled GPU compute pipelines
|
|
3
|
-
*/
|
|
4
|
-
export interface CacheStats {
|
|
5
|
-
hits: number;
|
|
6
|
-
misses: number;
|
|
7
|
-
size: number;
|
|
8
|
-
}
|
|
9
|
-
export declare class KernelCache {
|
|
10
|
-
private device;
|
|
11
|
-
private cache;
|
|
12
|
-
private hits;
|
|
13
|
-
private misses;
|
|
14
|
-
constructor(device: GPUDevice);
|
|
15
|
-
getOrCreate(key: string, createFn: () => GPUComputePipeline): GPUComputePipeline;
|
|
16
|
-
has(key: string): boolean;
|
|
17
|
-
get(key: string): GPUComputePipeline | undefined;
|
|
18
|
-
set(key: string, pipeline: GPUComputePipeline): void;
|
|
19
|
-
getStats(): CacheStats;
|
|
20
|
-
clear(): void;
|
|
21
|
-
}
|
package/dist/model/gguf.d.ts
DELETED
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GGUF Format Parser
|
|
3
|
-
*
|
|
4
|
-
* GGUF file structure:
|
|
5
|
-
* - 4 bytes: magic "GGUF"
|
|
6
|
-
* - 4 bytes: version (3)
|
|
7
|
-
* - 8 bytes: n_tensors (u64)
|
|
8
|
-
* - 8 bytes: n_kv (u64)
|
|
9
|
-
* - Key-value metadata pairs
|
|
10
|
-
* - Tensor info descriptors
|
|
11
|
-
* - Padding to alignment
|
|
12
|
-
* - Tensor data
|
|
13
|
-
*/
|
|
14
|
-
import { GGUFQuantType, type GGUFTensorInfo, type LoadedModel, type LoadOptions, type TensorInfo } from "./types.ts";
|
|
15
|
-
/**
|
|
16
|
-
* Reader helper for GGUF binary format
|
|
17
|
-
*/
|
|
18
|
-
declare class GGUFReader {
|
|
19
|
-
private view;
|
|
20
|
-
private offset;
|
|
21
|
-
private textDecoder;
|
|
22
|
-
constructor(buffer: ArrayBuffer);
|
|
23
|
-
get position(): number;
|
|
24
|
-
set position(pos: number);
|
|
25
|
-
readUint8(): number;
|
|
26
|
-
readInt8(): number;
|
|
27
|
-
readUint16(): number;
|
|
28
|
-
readInt16(): number;
|
|
29
|
-
readUint32(): number;
|
|
30
|
-
readInt32(): number;
|
|
31
|
-
readUint64(): bigint;
|
|
32
|
-
readInt64(): bigint;
|
|
33
|
-
readFloat32(): number;
|
|
34
|
-
readFloat64(): number;
|
|
35
|
-
readBool(): boolean;
|
|
36
|
-
readString(): string;
|
|
37
|
-
alignTo(alignment: number): void;
|
|
38
|
-
}
|
|
39
|
-
/**
|
|
40
|
-
* GGUF header information
|
|
41
|
-
*/
|
|
42
|
-
export interface GGUFHeader {
|
|
43
|
-
magic: number;
|
|
44
|
-
version: number;
|
|
45
|
-
nTensors: bigint;
|
|
46
|
-
nKV: bigint;
|
|
47
|
-
}
|
|
48
|
-
/**
|
|
49
|
-
* Parse the GGUF header
|
|
50
|
-
*/
|
|
51
|
-
export declare function parseGGUFHeader(reader: GGUFReader): GGUFHeader;
|
|
52
|
-
/**
|
|
53
|
-
* Parse all metadata key-value pairs
|
|
54
|
-
*/
|
|
55
|
-
export declare function parseGGUFMetadata(reader: GGUFReader, nKV: bigint): Map<string, unknown>;
|
|
56
|
-
/**
|
|
57
|
-
* Parse tensor info descriptors
|
|
58
|
-
*/
|
|
59
|
-
export declare function parseGGUFTensorInfos(reader: GGUFReader, nTensors: bigint): GGUFTensorInfo[];
|
|
60
|
-
/**
|
|
61
|
-
* Calculate byte size for a GGUF tensor
|
|
62
|
-
*/
|
|
63
|
-
export declare function calculateGGUFTensorBytes(type: GGUFQuantType, shape: number[]): number;
|
|
64
|
-
/**
|
|
65
|
-
* Load a GGUF model from an ArrayBuffer
|
|
66
|
-
*/
|
|
67
|
-
export declare function loadGGUF(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
|
|
68
|
-
/**
|
|
69
|
-
* Load GGUF from a URL
|
|
70
|
-
*/
|
|
71
|
-
export declare function loadGGUFFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
|
|
72
|
-
/**
|
|
73
|
-
* Dequantize Q4_0 block to float32
|
|
74
|
-
* Q4_0: 32 values = 2 bytes scale (f16) + 16 bytes data (4-bit packed)
|
|
75
|
-
*/
|
|
76
|
-
export declare function dequantizeQ4_0Block(data: Uint8Array, offset: number): Float32Array;
|
|
77
|
-
/**
|
|
78
|
-
* Dequantize Q8_0 block to float32
|
|
79
|
-
* Q8_0: 32 values = 2 bytes scale (f16) + 32 bytes data (int8)
|
|
80
|
-
*/
|
|
81
|
-
export declare function dequantizeQ8_0Block(data: Uint8Array, offset: number): Float32Array;
|
|
82
|
-
/**
|
|
83
|
-
* Load and dequantize a GGUF tensor to Float32Array
|
|
84
|
-
*/
|
|
85
|
-
export declare function loadGGUFTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
|
|
86
|
-
/**
|
|
87
|
-
* Check if a buffer is a valid GGUF file
|
|
88
|
-
*/
|
|
89
|
-
export declare function isGGUF(buffer: ArrayBuffer): boolean;
|
|
90
|
-
export {};
|
package/dist/model/index.d.ts
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Model Loading Module
|
|
3
|
-
* Supports SafeTensors and GGUF formats
|
|
4
|
-
*/
|
|
5
|
-
import type { LoadedModel, LoadOptions, ModelFormat } from "./types.ts";
|
|
6
|
-
export { calculateGGUFTensorBytes, dequantizeQ4_0Block, dequantizeQ8_0Block, type GGUFHeader, isGGUF, loadGGUF, loadGGUFFromUrl, loadGGUFTensor, parseGGUFHeader, parseGGUFMetadata, parseGGUFTensorInfos, } from "./gguf.ts";
|
|
7
|
-
export { getSafetensorsTensorInfos, isSafetensors, loadSafetensors, loadSafetensorsFromUrl, loadSafetensorsTensor, parseSafetensorsHeader, } from "./safetensors.ts";
|
|
8
|
-
export { GGUF_QUANT_BLOCK_SIZE, GGUF_QUANT_BYTES_PER_BLOCK, GGUFMetadataValueType, GGUFQuantType, type GGUFTensorInfo, type LoadedModel, type LoadedTensor, type LoadOptions, type ModelFormat, type ModelMetadata, SAFETENSORS_DTYPE_BYTES, type SafetensorsDType, type SafetensorsHeader, type SafetensorsHeaderEntry, type TensorInfo, } from "./types.ts";
|
|
9
|
-
/**
|
|
10
|
-
* Auto-detect model format from buffer
|
|
11
|
-
*/
|
|
12
|
-
export declare function detectModelFormat(buffer: ArrayBuffer): ModelFormat | null;
|
|
13
|
-
/**
|
|
14
|
-
* Load a model file, auto-detecting the format
|
|
15
|
-
*/
|
|
16
|
-
export declare function loadModel(source: ArrayBuffer | string, options?: LoadOptions): Promise<LoadedModel>;
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* SafeTensors Format Parser
|
|
3
|
-
*
|
|
4
|
-
* SafeTensors file structure:
|
|
5
|
-
* - 8 bytes: header size (little-endian u64)
|
|
6
|
-
* - N bytes: JSON header (UTF-8)
|
|
7
|
-
* - Remaining: tensor data (contiguous)
|
|
8
|
-
*/
|
|
9
|
-
import { type LoadedModel, type LoadOptions, type SafetensorsHeader, type TensorInfo } from "./types.ts";
|
|
10
|
-
/**
|
|
11
|
-
* Parse the SafeTensors header from a buffer
|
|
12
|
-
* @param buffer - ArrayBuffer containing the SafeTensors file
|
|
13
|
-
* @returns Parsed header and data offset
|
|
14
|
-
*/
|
|
15
|
-
export declare function parseSafetensorsHeader(buffer: ArrayBuffer): {
|
|
16
|
-
header: SafetensorsHeader;
|
|
17
|
-
dataOffset: number;
|
|
18
|
-
};
|
|
19
|
-
/**
|
|
20
|
-
* Extract tensor information from SafeTensors header
|
|
21
|
-
*/
|
|
22
|
-
export declare function getSafetensorsTensorInfos(header: SafetensorsHeader, dataOffset: number): Map<string, TensorInfo>;
|
|
23
|
-
/**
|
|
24
|
-
* Load a single tensor's data from the buffer
|
|
25
|
-
*/
|
|
26
|
-
export declare function loadSafetensorsTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
|
|
27
|
-
/**
|
|
28
|
-
* Load a SafeTensors model from an ArrayBuffer
|
|
29
|
-
*/
|
|
30
|
-
export declare function loadSafetensors(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
|
|
31
|
-
/**
|
|
32
|
-
* Load SafeTensors from a URL
|
|
33
|
-
*/
|
|
34
|
-
export declare function loadSafetensorsFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
|
|
35
|
-
/**
|
|
36
|
-
* Check if a buffer is a valid SafeTensors file
|
|
37
|
-
*/
|
|
38
|
-
export declare function isSafetensors(buffer: ArrayBuffer): boolean;
|