webinfer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/dist/attention/block-sparse/format.d.ts +52 -0
- package/dist/attention/block-sparse/patterns/causal.d.ts +16 -0
- package/dist/attention/block-sparse/patterns/sliding.d.ts +22 -0
- package/dist/attention/flash-attention.d.ts +30 -0
- package/dist/attention/index.d.ts +9 -0
- package/dist/attention/paged-kv/block-manager.d.ts +102 -0
- package/dist/attention/paged-kv/index.d.ts +5 -0
- package/dist/attention/paged-kv/page-table.d.ts +99 -0
- package/dist/attention/scheduler.d.ts +40 -0
- package/dist/core/buffer-pool.d.ts +18 -0
- package/dist/core/device.d.ts +23 -0
- package/dist/core/tensor.d.ts +25 -0
- package/dist/index.d.ts +22 -0
- package/dist/index.js +4228 -0
- package/dist/inference/engine.d.ts +69 -0
- package/dist/inference/generate.d.ts +30 -0
- package/dist/inference/index.d.ts +7 -0
- package/dist/inference/types.d.ts +161 -0
- package/dist/jit/compiler.d.ts +23 -0
- package/dist/jit/kernel-cache.d.ts +21 -0
- package/dist/model/gguf.d.ts +90 -0
- package/dist/model/index.d.ts +16 -0
- package/dist/model/safetensors.d.ts +38 -0
- package/dist/model/types.d.ts +182 -0
- package/dist/ops/activations.d.ts +43 -0
- package/dist/ops/elementwise.d.ts +38 -0
- package/dist/ops/embedding.d.ts +30 -0
- package/dist/ops/matmul.d.ts +21 -0
- package/dist/ops/normalization.d.ts +24 -0
- package/dist/ops/reshape.d.ts +39 -0
- package/dist/ops/rope.d.ts +32 -0
- package/dist/ops/softmax.d.ts +18 -0
- package/dist/quantization/index.d.ts +6 -0
- package/dist/quantization/qmatmul.d.ts +38 -0
- package/dist/quantization/quantize.d.ts +52 -0
- package/dist/sampling/index.d.ts +6 -0
- package/dist/sampling/sampler.d.ts +39 -0
- package/dist/sampling/top-k.d.ts +24 -0
- package/dist/sampling/top-p.d.ts +14 -0
- package/package.json +54 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference Engine
|
|
3
|
+
* Core engine for running LLM inference
|
|
4
|
+
*/
|
|
5
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
+
import type { LoadedModel } from "../model/types.ts";
|
|
7
|
+
import type { ModelConfig, InferenceConfig, ForwardResult } from "./types.ts";
|
|
8
|
+
/**
|
|
9
|
+
* Inference Engine
|
|
10
|
+
* Manages model weights and provides forward pass functionality
|
|
11
|
+
*/
|
|
12
|
+
export declare class InferenceEngine {
|
|
13
|
+
private device;
|
|
14
|
+
private config;
|
|
15
|
+
private modelConfig;
|
|
16
|
+
private weights;
|
|
17
|
+
private loadedModel;
|
|
18
|
+
private kvCache;
|
|
19
|
+
private ropeFreqsCos;
|
|
20
|
+
private ropeFreqsSin;
|
|
21
|
+
constructor(device: WebInferDevice | null, config?: InferenceConfig);
|
|
22
|
+
/**
|
|
23
|
+
* Load model weights from a LoadedModel using the given model configuration (async)
|
|
24
|
+
*/
|
|
25
|
+
loadModel(model: LoadedModel, modelConfig: ModelConfig): Promise<void>;
|
|
26
|
+
/**
|
|
27
|
+
* Extract model weights from loaded model
|
|
28
|
+
*/
|
|
29
|
+
private extractWeights;
|
|
30
|
+
/**
|
|
31
|
+
* Initialize KV cache
|
|
32
|
+
*/
|
|
33
|
+
private initKVCache;
|
|
34
|
+
/**
|
|
35
|
+
* Reset KV cache (for new sequence)
|
|
36
|
+
*/
|
|
37
|
+
resetKVCache(): void;
|
|
38
|
+
/**
|
|
39
|
+
* Forward pass (CPU reference implementation)
|
|
40
|
+
* @param inputIds - Input token IDs [seqLen]
|
|
41
|
+
* @param startPos - Optional starting position for KV cache
|
|
42
|
+
* @returns ForwardResult carrying the logits for the last token together with their shape
|
|
43
|
+
*/
|
|
44
|
+
forward(inputIds: Uint32Array, startPos?: number): ForwardResult;
|
|
45
|
+
/**
|
|
46
|
+
* Attention forward pass
|
|
47
|
+
*/
|
|
48
|
+
private attentionForward;
|
|
49
|
+
/**
|
|
50
|
+
* Apply RoPE to a single head position
|
|
51
|
+
*/
|
|
52
|
+
private applyRoPE;
|
|
53
|
+
/**
|
|
54
|
+
* FFN forward pass (SwiGLU)
|
|
55
|
+
*/
|
|
56
|
+
private ffnForward;
|
|
57
|
+
/**
|
|
58
|
+
* Get model configuration
|
|
59
|
+
*/
|
|
60
|
+
getModelConfig(): ModelConfig | null;
|
|
61
|
+
/**
|
|
62
|
+
* Check if model is loaded
|
|
63
|
+
*/
|
|
64
|
+
isLoaded(): boolean;
|
|
65
|
+
/**
|
|
66
|
+
* Dispose resources
|
|
67
|
+
*/
|
|
68
|
+
dispose(): void;
|
|
69
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text Generation API
|
|
3
|
+
* High-level API for generating text with LLMs
|
|
4
|
+
*/
|
|
5
|
+
import type { InferenceEngine } from "./engine.ts";
|
|
6
|
+
import type { GenerationConfig, GenerationResult, StreamToken } from "./types.ts";
|
|
7
|
+
/**
|
|
8
|
+
* Apply generation config to logits and sample next token
|
|
9
|
+
*/
|
|
10
|
+
export declare function sampleNextToken(logits: Float32Array, config: GenerationConfig, generatedTokens?: number[]): number;
|
|
11
|
+
/**
|
|
12
|
+
* Generate tokens from a prompt
|
|
13
|
+
* @param engine - Initialized inference engine with loaded model
|
|
14
|
+
* @param promptTokens - Tokenized prompt
|
|
15
|
+
* @param config - Generation configuration
|
|
16
|
+
* @returns Generation result
|
|
17
|
+
*/
|
|
18
|
+
export declare function generate(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): Promise<GenerationResult>;
|
|
19
|
+
/**
|
|
20
|
+
* Generate tokens with streaming (async iterator)
|
|
21
|
+
* @param engine - Initialized inference engine with loaded model
|
|
22
|
+
* @param promptTokens - Tokenized prompt
|
|
23
|
+
* @param config - Generation configuration
|
|
24
|
+
* @yields StreamToken for each generated token
|
|
25
|
+
*/
|
|
26
|
+
export declare function generateStream(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): AsyncGenerator<StreamToken, void, unknown>;
|
|
27
|
+
/**
|
|
28
|
+
* Simple greedy decode (no sampling, fastest); returns synchronously, unlike generate()
|
|
29
|
+
*/
|
|
30
|
+
export declare function greedyDecode(engine: InferenceEngine, promptTokens: number[] | Uint32Array, maxTokens: number, eosTokenId?: number): number[];
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference Module
|
|
3
|
+
* High-level API for LLM inference
|
|
4
|
+
*/
|
|
5
|
+
export { type ModelConfig, type InferenceConfig, type GenerationConfig, type GenerationResult, type StreamToken, type FinishReason, type ForwardResult, type ModelWeights, type LayerWeights, DEFAULT_GENERATION_CONFIG, normalizeGenerationConfig, } from "./types.ts";
|
|
6
|
+
export { InferenceEngine } from "./engine.ts";
|
|
7
|
+
export { generate, generateStream, greedyDecode, sampleNextToken, } from "./generate.ts";
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference Types
|
|
3
|
+
* Configuration and result types for model inference
|
|
4
|
+
*/
|
|
5
|
+
import type { DType } from "../core/tensor.ts";
|
|
6
|
+
/**
|
|
7
|
+
* Model architecture configuration
|
|
8
|
+
*/
|
|
9
|
+
export interface ModelConfig {
|
|
10
|
+
/** Model architecture (e.g., "llama", "mistral", "gpt2") */
|
|
11
|
+
architecture: string;
|
|
12
|
+
/** Number of transformer layers */
|
|
13
|
+
numLayers: number;
|
|
14
|
+
/** Number of attention heads */
|
|
15
|
+
numHeads: number;
|
|
16
|
+
/** Number of key-value heads (for GQA, defaults to numHeads) */
|
|
17
|
+
numKVHeads?: number;
|
|
18
|
+
/** Hidden/embedding dimension */
|
|
19
|
+
hiddenSize: number;
|
|
20
|
+
/** Intermediate size for FFN */
|
|
21
|
+
intermediateSize: number;
|
|
22
|
+
/** Vocabulary size */
|
|
23
|
+
vocabSize: number;
|
|
24
|
+
/** Maximum sequence length */
|
|
25
|
+
maxSeqLen: number;
|
|
26
|
+
/** Head dimension (defaults to hiddenSize / numHeads) */
|
|
27
|
+
headDim?: number;
|
|
28
|
+
/** RoPE frequency base */
|
|
29
|
+
ropeFreqBase?: number;
|
|
30
|
+
/** RMS norm epsilon */
|
|
31
|
+
rmsNormEps?: number;
|
|
32
|
+
/** Data type for computation */
|
|
33
|
+
dtype?: DType;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Inference engine configuration
|
|
37
|
+
*/
|
|
38
|
+
export interface InferenceConfig {
|
|
39
|
+
/** Maximum batch size */
|
|
40
|
+
maxBatchSize?: number;
|
|
41
|
+
/** Maximum sequence length */
|
|
42
|
+
maxSeqLen?: number;
|
|
43
|
+
/** Use KV cache for generation */
|
|
44
|
+
useKVCache?: boolean;
|
|
45
|
+
/** Memory limit in bytes (optional) */
|
|
46
|
+
memoryLimit?: number;
|
|
47
|
+
/** Enable profiling */
|
|
48
|
+
enableProfiling?: boolean;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Generation / sampling configuration
|
|
52
|
+
*/
|
|
53
|
+
export interface GenerationConfig {
|
|
54
|
+
/** Maximum number of tokens to generate */
|
|
55
|
+
maxTokens: number;
|
|
56
|
+
/** Temperature for sampling (0 = greedy) */
|
|
57
|
+
temperature?: number;
|
|
58
|
+
/** Top-K sampling (0 = disabled) */
|
|
59
|
+
topK?: number;
|
|
60
|
+
/** Top-P / nucleus sampling (1.0 = disabled) */
|
|
61
|
+
topP?: number;
|
|
62
|
+
/** Repetition penalty (1.0 = disabled) */
|
|
63
|
+
repetitionPenalty?: number;
|
|
64
|
+
/** Stop sequences, each given as a number[] (generation stops when any is generated) */
|
|
65
|
+
stopSequences?: number[][];
|
|
66
|
+
/** EOS token ID */
|
|
67
|
+
eosTokenId?: number;
|
|
68
|
+
/** Pad token ID */
|
|
69
|
+
padTokenId?: number;
|
|
70
|
+
/** BOS token ID */
|
|
71
|
+
bosTokenId?: number;
|
|
72
|
+
/** Stream tokens as they are generated */
|
|
73
|
+
stream?: boolean;
|
|
74
|
+
/** Random seed for reproducibility */
|
|
75
|
+
seed?: number;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Default generation config values (all fields except stopSequences and seed)
|
|
79
|
+
*/
|
|
80
|
+
export declare const DEFAULT_GENERATION_CONFIG: Required<Omit<GenerationConfig, "stopSequences" | "seed">>;
|
|
81
|
+
/**
|
|
82
|
+
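* Reason for generation completion (presumably "stop" = stop sequence hit, "length" = maxTokens reached, "eos" = EOS token produced; confirm against generate())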
|
|
83
|
+
*/
|
|
84
|
+
export type FinishReason = "stop" | "length" | "eos";
|
|
85
|
+
/**
|
|
86
|
+
* Result of text generation
|
|
87
|
+
*/
|
|
88
|
+
export interface GenerationResult {
|
|
89
|
+
/** Generated token IDs */
|
|
90
|
+
tokens: number[];
|
|
91
|
+
/** Finish reason */
|
|
92
|
+
finishReason: FinishReason;
|
|
93
|
+
/** Number of prompt tokens */
|
|
94
|
+
promptTokens: number;
|
|
95
|
+
/** Number of generated tokens */
|
|
96
|
+
generatedTokens: number;
|
|
97
|
+
/** Total time in milliseconds */
|
|
98
|
+
totalTimeMs: number;
|
|
99
|
+
/** Tokens per second */
|
|
100
|
+
tokensPerSecond: number;
|
|
101
|
+
}
|
|
102
|
+
/**
|
|
103
|
+
* Streaming generation token
|
|
104
|
+
*/
|
|
105
|
+
export interface StreamToken {
|
|
106
|
+
/** Token ID */
|
|
107
|
+
tokenId: number;
|
|
108
|
+
/** Token index in generation */
|
|
109
|
+
index: number;
|
|
110
|
+
/** Whether this is the final token */
|
|
111
|
+
isLast: boolean;
|
|
112
|
+
/** Finish reason (only set if isLast) */
|
|
113
|
+
finishReason?: FinishReason;
|
|
114
|
+
}
|
|
115
|
+
/**
|
|
116
|
+
* Forward pass result
|
|
117
|
+
*/
|
|
118
|
+
export interface ForwardResult {
|
|
119
|
+
/** Logits as a flat Float32Array, laid out as [batch, vocabSize] or [batch, seqLen, vocabSize] */
|
|
120
|
+
logits: Float32Array;
|
|
121
|
+
/** Shape of logits, i.e. the dimension sizes describing the flat `logits` array */
|
|
122
|
+
logitsShape: number[];
|
|
123
|
+
}
|
|
124
|
+
/**
|
|
125
|
+
* Validate and normalize generation config
|
|
126
|
+
*/
|
|
127
|
+
export declare function normalizeGenerationConfig(config: Partial<GenerationConfig>): GenerationConfig;
|
|
128
|
+
/**
|
|
129
|
+
* Model layer weights
|
|
130
|
+
*/
|
|
131
|
+
export interface LayerWeights {
|
|
132
|
+
/** Attention weights */
|
|
133
|
+
attention: {
|
|
134
|
+
qProj: Float32Array;
|
|
135
|
+
kProj: Float32Array;
|
|
136
|
+
vProj: Float32Array;
|
|
137
|
+
oProj: Float32Array;
|
|
138
|
+
};
|
|
139
|
+
/** FFN weights */
|
|
140
|
+
ffn: {
|
|
141
|
+
gate?: Float32Array;
|
|
142
|
+
up: Float32Array;
|
|
143
|
+
down: Float32Array;
|
|
144
|
+
};
|
|
145
|
+
/** Normalization */
|
|
146
|
+
inputNorm: Float32Array;
|
|
147
|
+
postAttentionNorm: Float32Array;
|
|
148
|
+
}
|
|
149
|
+
/**
|
|
150
|
+
* Full model weights
|
|
151
|
+
*/
|
|
152
|
+
export interface ModelWeights {
|
|
153
|
+
/** Token embeddings */
|
|
154
|
+
embedTokens: Float32Array;
|
|
155
|
+
/** Layer weights */
|
|
156
|
+
layers: LayerWeights[];
|
|
157
|
+
/** Final norm */
|
|
158
|
+
finalNorm: Float32Array;
|
|
159
|
+
/** LM head (output projection) */
|
|
160
|
+
lmHead: Float32Array;
|
|
161
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WGSL Compiler - Generates optimized GPU kernels
|
|
3
|
+
*/
|
|
4
|
+
import { KernelCache } from "./kernel-cache.ts";
|
|
5
|
+
import type { DeviceInfo } from "../core/device.ts";
|
|
6
|
+
export interface MatMulConfig {
|
|
7
|
+
M: number;
|
|
8
|
+
N: number;
|
|
9
|
+
K: number;
|
|
10
|
+
tileM?: number;
|
|
11
|
+
tileN?: number;
|
|
12
|
+
tileK?: number;
|
|
13
|
+
}
|
|
14
|
+
export declare class WGSLCompiler {
|
|
15
|
+
private device;
|
|
16
|
+
private cache;
|
|
17
|
+
private deviceInfo;
|
|
18
|
+
constructor(device: GPUDevice, cache: KernelCache, deviceInfo: DeviceInfo);
|
|
19
|
+
private selectTileSize;
|
|
20
|
+
compileMatMul(config: MatMulConfig): GPUComputePipeline;
|
|
21
|
+
private generateMatMulWGSL;
|
|
22
|
+
getCacheStats(): import("./kernel-cache.ts").CacheStats;
|
|
23
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Kernel Cache - Caches compiled GPU compute pipelines
|
|
3
|
+
*/
|
|
4
|
+
export interface CacheStats {
|
|
5
|
+
hits: number;
|
|
6
|
+
misses: number;
|
|
7
|
+
size: number;
|
|
8
|
+
}
|
|
9
|
+
export declare class KernelCache {
|
|
10
|
+
private device;
|
|
11
|
+
private cache;
|
|
12
|
+
private hits;
|
|
13
|
+
private misses;
|
|
14
|
+
constructor(device: GPUDevice);
|
|
15
|
+
getOrCreate(key: string, createFn: () => GPUComputePipeline): GPUComputePipeline;
|
|
16
|
+
has(key: string): boolean;
|
|
17
|
+
get(key: string): GPUComputePipeline | undefined;
|
|
18
|
+
set(key: string, pipeline: GPUComputePipeline): void;
|
|
19
|
+
getStats(): CacheStats;
|
|
20
|
+
clear(): void;
|
|
21
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GGUF Format Parser
|
|
3
|
+
*
|
|
4
|
+
* GGUF file structure:
|
|
5
|
+
* - 4 bytes: magic "GGUF"
|
|
6
|
+
* - 4 bytes: version (u32, currently 3)
|
|
7
|
+
* - 8 bytes: n_tensors (u64)
|
|
8
|
+
* - 8 bytes: n_kv (u64)
|
|
9
|
+
* - Key-value metadata pairs
|
|
10
|
+
* - Tensor info descriptors
|
|
11
|
+
* - Padding to alignment
|
|
12
|
+
* - Tensor data
|
|
13
|
+
*/
|
|
14
|
+
import { type GGUFTensorInfo, type TensorInfo, type LoadedModel, type LoadOptions, GGUFQuantType } from "./types.ts";
|
|
15
|
+
/**
|
|
16
|
+
* Reader helper for GGUF binary format (NOTE(review): not exported, although the exported parseGGUF* functions take it as a parameter)
|
|
17
|
+
*/
|
|
18
|
+
declare class GGUFReader {
|
|
19
|
+
private view;
|
|
20
|
+
private offset;
|
|
21
|
+
private textDecoder;
|
|
22
|
+
constructor(buffer: ArrayBuffer);
|
|
23
|
+
get position(): number;
|
|
24
|
+
set position(pos: number);
|
|
25
|
+
readUint8(): number;
|
|
26
|
+
readInt8(): number;
|
|
27
|
+
readUint16(): number;
|
|
28
|
+
readInt16(): number;
|
|
29
|
+
readUint32(): number;
|
|
30
|
+
readInt32(): number;
|
|
31
|
+
readUint64(): bigint;
|
|
32
|
+
readInt64(): bigint;
|
|
33
|
+
readFloat32(): number;
|
|
34
|
+
readFloat64(): number;
|
|
35
|
+
readBool(): boolean;
|
|
36
|
+
readString(): string;
|
|
37
|
+
alignTo(alignment: number): void;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* GGUF header information
|
|
41
|
+
*/
|
|
42
|
+
export interface GGUFHeader {
|
|
43
|
+
magic: number; // corresponds to the 4-byte magic ("GGUF") in the file-structure comment at the top of this file
|
|
44
|
+
version: number; // corresponds to the 4-byte version field
|
|
45
|
+
nTensors: bigint; // n_tensors, a u64 in the file (hence bigint)
|
|
46
|
+
nKV: bigint; // n_kv, a u64 in the file; passed to parseGGUFMetadata as the key-value pair count
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Parse the GGUF header
|
|
50
|
+
*/
|
|
51
|
+
export declare function parseGGUFHeader(reader: GGUFReader): GGUFHeader;
|
|
52
|
+
/**
|
|
53
|
+
* Parse all nKV metadata key-value pairs into a Map from key string to value
|
|
54
|
+
*/
|
|
55
|
+
export declare function parseGGUFMetadata(reader: GGUFReader, nKV: bigint): Map<string, unknown>;
|
|
56
|
+
/**
|
|
57
|
+
* Parse tensor info descriptors
|
|
58
|
+
*/
|
|
59
|
+
export declare function parseGGUFTensorInfos(reader: GGUFReader, nTensors: bigint): GGUFTensorInfo[];
|
|
60
|
+
/**
|
|
61
|
+
* Calculate byte size for a GGUF tensor from its quantization type and shape
|
|
62
|
+
*/
|
|
63
|
+
export declare function calculateGGUFTensorBytes(type: GGUFQuantType, shape: number[]): number;
|
|
64
|
+
/**
|
|
65
|
+
* Load a GGUF model from an ArrayBuffer
|
|
66
|
+
*/
|
|
67
|
+
export declare function loadGGUF(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
|
|
68
|
+
/**
|
|
69
|
+
* Load GGUF from a URL
|
|
70
|
+
*/
|
|
71
|
+
export declare function loadGGUFFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
|
|
72
|
+
/**
|
|
73
|
+
* Dequantize Q4_0 block to float32
|
|
74
|
+
* Q4_0: 32 values = 2 bytes scale (f16) + 16 bytes data (4-bit packed) = 18 bytes per block
|
|
75
|
+
*/
|
|
76
|
+
export declare function dequantizeQ4_0Block(data: Uint8Array, offset: number): Float32Array;
|
|
77
|
+
/**
|
|
78
|
+
* Dequantize Q8_0 block to float32
|
|
79
|
+
* Q8_0: 32 values = 2 bytes scale (f16) + 32 bytes data (int8) = 34 bytes per block
|
|
80
|
+
*/
|
|
81
|
+
export declare function dequantizeQ8_0Block(data: Uint8Array, offset: number): Float32Array;
|
|
82
|
+
/**
|
|
83
|
+
* Load and dequantize a GGUF tensor to Float32Array
|
|
84
|
+
*/
|
|
85
|
+
export declare function loadGGUFTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
|
|
86
|
+
/**
|
|
87
|
+
* Check if a buffer is a valid GGUF file
|
|
88
|
+
*/
|
|
89
|
+
export declare function isGGUF(buffer: ArrayBuffer): boolean;
|
|
90
|
+
export {};
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model Loading Module
|
|
3
|
+
* Supports SafeTensors and GGUF formats
|
|
4
|
+
*/
|
|
5
|
+
import type { ModelFormat, LoadOptions, LoadedModel } from "./types.ts";
|
|
6
|
+
export { type ModelFormat, type SafetensorsDType, GGUFQuantType, GGUFMetadataValueType, type TensorInfo, type SafetensorsHeader, type SafetensorsHeaderEntry, type ModelMetadata, type GGUFTensorInfo, type LoadedTensor, type LoadedModel, type LoadOptions, SAFETENSORS_DTYPE_BYTES, GGUF_QUANT_BLOCK_SIZE, GGUF_QUANT_BYTES_PER_BLOCK, } from "./types.ts";
|
|
7
|
+
export { parseSafetensorsHeader, getSafetensorsTensorInfos, loadSafetensorsTensor, loadSafetensors, loadSafetensorsFromUrl, isSafetensors, } from "./safetensors.ts";
|
|
8
|
+
export { type GGUFHeader, parseGGUFHeader, parseGGUFMetadata, parseGGUFTensorInfos, calculateGGUFTensorBytes, loadGGUF, loadGGUFFromUrl, loadGGUFTensor, dequantizeQ4_0Block, dequantizeQ8_0Block, isGGUF, } from "./gguf.ts";
|
|
9
|
+
/**
|
|
10
|
+
* Auto-detect model format from buffer (returns null, presumably when neither format is recognized)
|
|
11
|
+
*/
|
|
12
|
+
export declare function detectModelFormat(buffer: ArrayBuffer): ModelFormat | null;
|
|
13
|
+
/**
|
|
14
|
+
* Load a model file given as an ArrayBuffer or a string (presumably a URL), auto-detecting the format
|
|
15
|
+
*/
|
|
16
|
+
export declare function loadModel(source: ArrayBuffer | string, options?: LoadOptions): Promise<LoadedModel>;
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SafeTensors Format Parser
|
|
3
|
+
*
|
|
4
|
+
* SafeTensors file structure:
|
|
5
|
+
* - 8 bytes: header size (little-endian u64)
|
|
6
|
+
* - N bytes: JSON header (UTF-8)
|
|
7
|
+
* - Remaining: tensor data (contiguous)
|
|
8
|
+
*/
|
|
9
|
+
import { type SafetensorsHeader, type TensorInfo, type LoadedModel, type LoadOptions } from "./types.ts";
|
|
10
|
+
/**
|
|
11
|
+
* Parse the SafeTensors header from a buffer
|
|
12
|
+
* @param buffer - ArrayBuffer containing the SafeTensors file
|
|
13
|
+
* @returns Parsed header and data offset
|
|
14
|
+
*/
|
|
15
|
+
export declare function parseSafetensorsHeader(buffer: ArrayBuffer): {
|
|
16
|
+
header: SafetensorsHeader;
|
|
17
|
+
dataOffset: number;
|
|
18
|
+
};
|
|
19
|
+
/**
|
|
20
|
+
* Extract tensor information from SafeTensors header
|
|
21
|
+
*/
|
|
22
|
+
export declare function getSafetensorsTensorInfos(header: SafetensorsHeader, dataOffset: number): Map<string, TensorInfo>;
|
|
23
|
+
/**
|
|
24
|
+
* Load a single tensor's data from the buffer
|
|
25
|
+
*/
|
|
26
|
+
export declare function loadSafetensorsTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
|
|
27
|
+
/**
|
|
28
|
+
* Load a SafeTensors model from an ArrayBuffer
|
|
29
|
+
*/
|
|
30
|
+
export declare function loadSafetensors(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
|
|
31
|
+
/**
|
|
32
|
+
* Load SafeTensors from a URL
|
|
33
|
+
*/
|
|
34
|
+
export declare function loadSafetensorsFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
|
|
35
|
+
/**
|
|
36
|
+
* Check if a buffer is a valid SafeTensors file
|
|
37
|
+
*/
|
|
38
|
+
export declare function isSafetensors(buffer: ArrayBuffer): boolean;
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model Loading Types
|
|
3
|
+
* Common types for SafeTensors and GGUF model loading
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Supported model formats
|
|
7
|
+
*/
|
|
8
|
+
export type ModelFormat = "safetensors" | "gguf";
|
|
9
|
+
/**
|
|
10
|
+
* SafeTensors data types
|
|
11
|
+
*/
|
|
12
|
+
export type SafetensorsDType = "F64" | "F32" | "F16" | "BF16" | "I64" | "I32" | "I16" | "I8" | "U8" | "BOOL";
|
|
13
|
+
/**
|
|
14
|
+
* GGUF quantization types
|
|
15
|
+
* Based on llama.cpp GGML types
|
|
16
|
+
*/
|
|
17
|
+
export declare enum GGUFQuantType {
|
|
18
|
+
F32 = 0,
|
|
19
|
+
F16 = 1,
|
|
20
|
+
Q4_0 = 2,
|
|
21
|
+
Q4_1 = 3,
|
|
22
|
+
Q5_0 = 6,
|
|
23
|
+
Q5_1 = 7,
|
|
24
|
+
Q8_0 = 8,
|
|
25
|
+
Q8_1 = 9,
|
|
26
|
+
Q2_K = 10,
|
|
27
|
+
Q3_K = 11,
|
|
28
|
+
Q4_K = 12,
|
|
29
|
+
Q5_K = 13,
|
|
30
|
+
Q6_K = 14,
|
|
31
|
+
Q8_K = 15,
|
|
32
|
+
IQ2_XXS = 16,
|
|
33
|
+
IQ2_XS = 17,
|
|
34
|
+
IQ3_XXS = 18,
|
|
35
|
+
IQ1_S = 19,
|
|
36
|
+
IQ4_NL = 20,
|
|
37
|
+
IQ3_S = 21,
|
|
38
|
+
IQ2_S = 22,
|
|
39
|
+
IQ4_XS = 23,
|
|
40
|
+
I8 = 24,
|
|
41
|
+
I16 = 25,
|
|
42
|
+
I32 = 26,
|
|
43
|
+
I64 = 27,
|
|
44
|
+
F64 = 28,
|
|
45
|
+
BF16 = 29
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* GGUF metadata value types
|
|
49
|
+
*/
|
|
50
|
+
export declare enum GGUFMetadataValueType {
|
|
51
|
+
UINT8 = 0,
|
|
52
|
+
INT8 = 1,
|
|
53
|
+
UINT16 = 2,
|
|
54
|
+
INT16 = 3,
|
|
55
|
+
UINT32 = 4,
|
|
56
|
+
INT32 = 5,
|
|
57
|
+
FLOAT32 = 6,
|
|
58
|
+
BOOL = 7,
|
|
59
|
+
STRING = 8,
|
|
60
|
+
ARRAY = 9,
|
|
61
|
+
UINT64 = 10,
|
|
62
|
+
INT64 = 11,
|
|
63
|
+
FLOAT64 = 12
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* Information about a single tensor in a model file
|
|
67
|
+
*/
|
|
68
|
+
export interface TensorInfo {
|
|
69
|
+
/** Tensor name (e.g., "model.layers.0.attention.wq.weight") */
|
|
70
|
+
name: string;
|
|
71
|
+
/** Tensor shape (e.g., [4096, 4096]) */
|
|
72
|
+
shape: number[];
|
|
73
|
+
/** Data type (SafeTensors dtype string) or quantization type (GGUFQuantType enum value) */
|
|
74
|
+
dtype: SafetensorsDType | GGUFQuantType;
|
|
75
|
+
/** Byte offset in the file's data section */
|
|
76
|
+
offset: number;
|
|
77
|
+
/** Total bytes for this tensor's data */
|
|
78
|
+
byteSize: number;
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* SafeTensors header entry for a single tensor
|
|
82
|
+
*/
|
|
83
|
+
export interface SafetensorsHeaderEntry {
|
|
84
|
+
dtype: SafetensorsDType;
|
|
85
|
+
shape: number[];
|
|
86
|
+
data_offsets: [number, number];
|
|
87
|
+
}
|
|
88
|
+
/**
|
|
89
|
+
* Parsed SafeTensors header
|
|
90
|
+
*/
|
|
91
|
+
export interface SafetensorsHeader {
|
|
92
|
+
tensors: Record<string, SafetensorsHeaderEntry>;
|
|
93
|
+
__metadata__?: Record<string, string>;
|
|
94
|
+
}
|
|
95
|
+
/**
|
|
96
|
+
* Model metadata extracted from file headers
|
|
97
|
+
*/
|
|
98
|
+
export interface ModelMetadata {
|
|
99
|
+
/** Model format */
|
|
100
|
+
format: ModelFormat;
|
|
101
|
+
/** Model name (if available) */
|
|
102
|
+
name?: string;
|
|
103
|
+
/** Model architecture (e.g., "llama", "mistral", "gpt2") */
|
|
104
|
+
architecture?: string;
|
|
105
|
+
/** Context length */
|
|
106
|
+
contextLength?: number;
|
|
107
|
+
/** Embedding dimension */
|
|
108
|
+
embeddingLength?: number;
|
|
109
|
+
/** Number of layers */
|
|
110
|
+
numLayers?: number;
|
|
111
|
+
/** Number of attention heads */
|
|
112
|
+
numHeads?: number;
|
|
113
|
+
/** Number of KV heads (for GQA) */
|
|
114
|
+
numKVHeads?: number;
|
|
115
|
+
/** Vocabulary size */
|
|
116
|
+
vocabSize?: number;
|
|
117
|
+
/** Head dimension */
|
|
118
|
+
headDim?: number;
|
|
119
|
+
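/** Hidden size for FFN (NOTE(review): ModelConfig.hiddenSize is the embedding dimension and its FFN size is intermediateSize; confirm which one this maps to) */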
|
|
120
|
+
hiddenSize?: number;
|
|
121
|
+
/** RoPE frequency base */
|
|
122
|
+
ropeFreqBase?: number;
|
|
123
|
+
/** Additional metadata */
|
|
124
|
+
extra?: Record<string, unknown>;
|
|
125
|
+
}
|
|
126
|
+
/**
|
|
127
|
+
* GGUF tensor descriptor (from file)
|
|
128
|
+
*/
|
|
129
|
+
export interface GGUFTensorInfo {
|
|
130
|
+
name: string;
|
|
131
|
+
nDims: number;
|
|
132
|
+
dimensions: bigint[];
|
|
133
|
+
type: GGUFQuantType;
|
|
134
|
+
offset: bigint;
|
|
135
|
+
}
|
|
136
|
+
/**
|
|
137
|
+
* Loaded tensor data
|
|
138
|
+
*/
|
|
139
|
+
export interface LoadedTensor {
|
|
140
|
+
info: TensorInfo;
|
|
141
|
+
/** Raw data (Float32Array for F32, Uint8Array for quantized; the type also admits Float16Array and Int8Array) */
|
|
142
|
+
data: Float32Array | Float16Array | Uint8Array | Int8Array;
|
|
143
|
+
}
|
|
144
|
+
/**
|
|
145
|
+
* Fully loaded model
|
|
146
|
+
*/
|
|
147
|
+
export interface LoadedModel {
|
|
148
|
+
/** Model metadata */
|
|
149
|
+
metadata: ModelMetadata;
|
|
150
|
+
/** Map of tensor names to tensor info */
|
|
151
|
+
tensorInfos: Map<string, TensorInfo>;
|
|
152
|
+
/** Total size of tensor data in bytes */
|
|
153
|
+
totalBytes: number;
|
|
154
|
+
/** The raw buffer (for lazy loading) */
|
|
155
|
+
buffer: ArrayBuffer;
|
|
156
|
+
/** Offset where tensor data starts */
|
|
157
|
+
dataOffset: number;
|
|
158
|
+
}
|
|
159
|
+
/**
|
|
160
|
+
* Model loading options
|
|
161
|
+
*/
|
|
162
|
+
export interface LoadOptions {
|
|
163
|
+
/** Only load metadata, not tensor data */
|
|
164
|
+
metadataOnly?: boolean;
|
|
165
|
+
/** Filter which tensors to load: a predicate called with the tensor name (presumably true = load; confirm) */
|
|
166
|
+
tensorFilter?: (name: string) => boolean;
|
|
167
|
+
/** Progress callback, called with (loaded, total); units are not stated in this declaration */
|
|
168
|
+
onProgress?: (loaded: number, total: number) => void;
|
|
169
|
+
}
|
|
170
|
+
/**
|
|
171
|
+
* Bytes per element for SafeTensors dtypes
|
|
172
|
+
*/
|
|
173
|
+
export declare const SAFETENSORS_DTYPE_BYTES: Record<SafetensorsDType, number>;
|
|
174
|
+
/**
|
|
175
|
+
* Block size for GGUF quantization types
|
|
176
|
+
* Most quantization types process data in blocks; the map is Partial, so some types have no entry
|
|
177
|
+
*/
|
|
178
|
+
export declare const GGUF_QUANT_BLOCK_SIZE: Partial<Record<GGUFQuantType, number>>;
|
|
179
|
+
/**
|
|
180
|
+
* Bytes per block for GGUF quantization types (Partial map: some types have no entry)
|
|
181
|
+
*/
|
|
182
|
+
export declare const GGUF_QUANT_BYTES_PER_BLOCK: Partial<Record<GGUFQuantType, number>>;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Activation Functions
|
|
3
|
+
* GeLU, SiLU (Swish), ReLU for transformer models
|
|
4
|
+
*/
|
|
5
|
+
import type { WebInferDevice } from "../core/device.ts";
|
|
6
|
+
import { Tensor } from "../core/tensor.ts";
|
|
7
|
+
/**
|
|
8
|
+
* GeLU (Gaussian Error Linear Unit) - CPU
|
|
9
|
+
* Used in BERT, GPT-2, etc.
|
|
10
|
+
* Approximation: x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
|
|
11
|
+
*/
|
|
12
|
+
export declare function geluCPU(x: Float32Array): Float32Array;
|
|
13
|
+
/**
|
|
14
|
+
* GeLU exact (using erf) - CPU
|
|
15
|
+
* More accurate but slower
|
|
16
|
+
*/
|
|
17
|
+
export declare function geluExactCPU(x: Float32Array): Float32Array;
|
|
18
|
+
/**
|
|
19
|
+
* SiLU (Sigmoid Linear Unit / Swish) - CPU
|
|
20
|
+
* Used in Llama, Mistral, etc.
|
|
21
|
+
* Formula: x * sigmoid(x) = x / (1 + exp(-x))
|
|
22
|
+
*/
|
|
23
|
+
export declare function siluCPU(x: Float32Array): Float32Array;
|
|
24
|
+
/**
|
|
25
|
+
* ReLU (Rectified Linear Unit) - CPU
|
|
26
|
+
*/
|
|
27
|
+
export declare function reluCPU(x: Float32Array): Float32Array;
|
|
28
|
+
/**
|
|
29
|
+
* Sigmoid - CPU: 1 / (1 + exp(-x))
|
|
30
|
+
*/
|
|
31
|
+
export declare function sigmoidCPU(x: Float32Array): Float32Array;
|
|
32
|
+
/**
|
|
33
|
+
* GeLU - GPU
|
|
34
|
+
*/
|
|
35
|
+
export declare function gelu(device: WebInferDevice, x: Tensor): Promise<Tensor>;
|
|
36
|
+
/**
|
|
37
|
+
* SiLU - GPU
|
|
38
|
+
*/
|
|
39
|
+
export declare function silu(device: WebInferDevice, x: Tensor): Promise<Tensor>;
|
|
40
|
+
/**
|
|
41
|
+
* ReLU - GPU
|
|
42
|
+
*/
|
|
43
|
+
export declare function relu(device: WebInferDevice, x: Tensor): Promise<Tensor>;
|