webinfer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/LICENSE +201 -0
  2. package/dist/attention/block-sparse/format.d.ts +52 -0
  3. package/dist/attention/block-sparse/patterns/causal.d.ts +16 -0
  4. package/dist/attention/block-sparse/patterns/sliding.d.ts +22 -0
  5. package/dist/attention/flash-attention.d.ts +30 -0
  6. package/dist/attention/index.d.ts +9 -0
  7. package/dist/attention/paged-kv/block-manager.d.ts +102 -0
  8. package/dist/attention/paged-kv/index.d.ts +5 -0
  9. package/dist/attention/paged-kv/page-table.d.ts +99 -0
  10. package/dist/attention/scheduler.d.ts +40 -0
  11. package/dist/core/buffer-pool.d.ts +18 -0
  12. package/dist/core/device.d.ts +23 -0
  13. package/dist/core/tensor.d.ts +25 -0
  14. package/dist/index.d.ts +22 -0
  15. package/dist/index.js +4228 -0
  16. package/dist/inference/engine.d.ts +69 -0
  17. package/dist/inference/generate.d.ts +30 -0
  18. package/dist/inference/index.d.ts +7 -0
  19. package/dist/inference/types.d.ts +161 -0
  20. package/dist/jit/compiler.d.ts +23 -0
  21. package/dist/jit/kernel-cache.d.ts +21 -0
  22. package/dist/model/gguf.d.ts +90 -0
  23. package/dist/model/index.d.ts +16 -0
  24. package/dist/model/safetensors.d.ts +38 -0
  25. package/dist/model/types.d.ts +182 -0
  26. package/dist/ops/activations.d.ts +43 -0
  27. package/dist/ops/elementwise.d.ts +38 -0
  28. package/dist/ops/embedding.d.ts +30 -0
  29. package/dist/ops/matmul.d.ts +21 -0
  30. package/dist/ops/normalization.d.ts +24 -0
  31. package/dist/ops/reshape.d.ts +39 -0
  32. package/dist/ops/rope.d.ts +32 -0
  33. package/dist/ops/softmax.d.ts +18 -0
  34. package/dist/quantization/index.d.ts +6 -0
  35. package/dist/quantization/qmatmul.d.ts +38 -0
  36. package/dist/quantization/quantize.d.ts +52 -0
  37. package/dist/sampling/index.d.ts +6 -0
  38. package/dist/sampling/sampler.d.ts +39 -0
  39. package/dist/sampling/top-k.d.ts +24 -0
  40. package/dist/sampling/top-p.d.ts +14 -0
  41. package/package.json +54 -0
package/dist/inference/engine.d.ts
@@ -0,0 +1,69 @@
+ /**
+  * Inference Engine
+  * Core engine for running LLM inference
+  */
+ import type { WebInferDevice } from "../core/device.ts";
+ import type { LoadedModel } from "../model/types.ts";
+ import type { ModelConfig, InferenceConfig, ForwardResult } from "./types.ts";
+ /**
+  * Inference Engine
+  * Manages model weights and provides forward pass functionality
+  */
+ export declare class InferenceEngine {
+   private device;
+   private config;
+   private modelConfig;
+   private weights;
+   private loadedModel;
+   private kvCache;
+   private ropeFreqsCos;
+   private ropeFreqsSin;
+   constructor(device: WebInferDevice | null, config?: InferenceConfig);
+   /**
+    * Load model weights from a LoadedModel
+    */
+   loadModel(model: LoadedModel, modelConfig: ModelConfig): Promise<void>;
+   /**
+    * Extract model weights from loaded model
+    */
+   private extractWeights;
+   /**
+    * Initialize KV cache
+    */
+   private initKVCache;
+   /**
+    * Reset KV cache (for new sequence)
+    */
+   resetKVCache(): void;
+   /**
+    * Forward pass (CPU reference implementation)
+    * @param inputIds - Input token IDs [seqLen]
+    * @param startPos - Starting position for KV cache
+    * @returns Logits for the last token
+    */
+   forward(inputIds: Uint32Array, startPos?: number): ForwardResult;
+   /**
+    * Attention forward pass
+    */
+   private attentionForward;
+   /**
+    * Apply RoPE to a single head position
+    */
+   private applyRoPE;
+   /**
+    * FFN forward pass (SwiGLU)
+    */
+   private ffnForward;
+   /**
+    * Get model configuration
+    */
+   getModelConfig(): ModelConfig | null;
+   /**
+    * Check if model is loaded
+    */
+   isLoaded(): boolean;
+   /**
+    * Dispose resources
+    */
+   dispose(): void;
+ }
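
For orientation, a minimal usage sketch of the engine API declared above. The import specifier and the ModelConfig values are assumptions (in practice the config comes from the model file's metadata), and passing `null` as the device appears to select the CPU reference path per the `forward` doc comment.

```ts
// Sketch only: assumes InferenceEngine, loadModel and ModelConfig are
// re-exported from the package root (the full export surface lives in
// dist/index.d.ts, not shown here).
import { InferenceEngine, loadModel, type ModelConfig } from "webinfer";

const model = await loadModel("https://example.com/model.gguf"); // placeholder URL

// Hypothetical Llama-style config; real values come from model.metadata.
const config: ModelConfig = {
  architecture: "llama",
  numLayers: 22,
  numHeads: 32,
  numKVHeads: 4,
  hiddenSize: 2048,
  intermediateSize: 5632,
  vocabSize: 32000,
  maxSeqLen: 2048,
};

const engine = new InferenceEngine(null); // null device: CPU reference path
await engine.loadModel(model, config);

const { logits, logitsShape } = engine.forward(new Uint32Array([1, 15043]));
console.log(logitsShape); // e.g. [1, 32000]
engine.dispose();
```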
package/dist/inference/generate.d.ts
@@ -0,0 +1,30 @@
+ /**
+  * Text Generation API
+  * High-level API for generating text with LLMs
+  */
+ import type { InferenceEngine } from "./engine.ts";
+ import type { GenerationConfig, GenerationResult, StreamToken } from "./types.ts";
+ /**
+  * Apply generation config to logits and sample next token
+  */
+ export declare function sampleNextToken(logits: Float32Array, config: GenerationConfig, generatedTokens?: number[]): number;
+ /**
+  * Generate tokens from a prompt
+  * @param engine - Initialized inference engine with loaded model
+  * @param promptTokens - Tokenized prompt
+  * @param config - Generation configuration
+  * @returns Generation result
+  */
+ export declare function generate(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): Promise<GenerationResult>;
+ /**
+  * Generate tokens with streaming (async iterator)
+  * @param engine - Initialized inference engine with loaded model
+  * @param promptTokens - Tokenized prompt
+  * @param config - Generation configuration
+  * @yields StreamToken for each generated token
+  */
+ export declare function generateStream(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): AsyncGenerator<StreamToken, void, unknown>;
+ /**
+  * Simple greedy decode (no sampling, fastest)
+  */
+ export declare function greedyDecode(engine: InferenceEngine, promptTokens: number[] | Uint32Array, maxTokens: number, eosTokenId?: number): number[];
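
A short streaming sketch against the declared `generateStream` signature; `engine` is an InferenceEngine with a model already loaded (see the engine example above), and the prompt token IDs are placeholders.

```ts
const promptTokens = [1, 15043]; // placeholder token IDs
for await (const token of generateStream(engine, promptTokens, {
  maxTokens: 64,
  temperature: 0.7,
  topP: 0.9,
})) {
  process.stdout.write(`${token.tokenId} `);
  if (token.isLast) console.log(`\nfinish reason: ${token.finishReason}`);
}
```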
package/dist/inference/index.d.ts
@@ -0,0 +1,7 @@
+ /**
+  * Inference Module
+  * High-level API for LLM inference
+  */
+ export { type ModelConfig, type InferenceConfig, type GenerationConfig, type GenerationResult, type StreamToken, type FinishReason, type ForwardResult, type ModelWeights, type LayerWeights, DEFAULT_GENERATION_CONFIG, normalizeGenerationConfig, } from "./types.ts";
+ export { InferenceEngine } from "./engine.ts";
+ export { generate, generateStream, greedyDecode, sampleNextToken, } from "./generate.ts";
package/dist/inference/types.d.ts
@@ -0,0 +1,161 @@
+ /**
+  * Inference Types
+  * Configuration and result types for model inference
+  */
+ import type { DType } from "../core/tensor.ts";
+ /**
+  * Model architecture configuration
+  */
+ export interface ModelConfig {
+   /** Model architecture (e.g., "llama", "mistral", "gpt2") */
+   architecture: string;
+   /** Number of transformer layers */
+   numLayers: number;
+   /** Number of attention heads */
+   numHeads: number;
+   /** Number of key-value heads (for GQA, defaults to numHeads) */
+   numKVHeads?: number;
+   /** Hidden/embedding dimension */
+   hiddenSize: number;
+   /** Intermediate size for FFN */
+   intermediateSize: number;
+   /** Vocabulary size */
+   vocabSize: number;
+   /** Maximum sequence length */
+   maxSeqLen: number;
+   /** Head dimension (defaults to hiddenSize / numHeads) */
+   headDim?: number;
+   /** RoPE frequency base */
+   ropeFreqBase?: number;
+   /** RMS norm epsilon */
+   rmsNormEps?: number;
+   /** Data type for computation */
+   dtype?: DType;
+ }
+ /**
+  * Inference engine configuration
+  */
+ export interface InferenceConfig {
+   /** Maximum batch size */
+   maxBatchSize?: number;
+   /** Maximum sequence length */
+   maxSeqLen?: number;
+   /** Use KV cache for generation */
+   useKVCache?: boolean;
+   /** Memory limit in bytes (optional) */
+   memoryLimit?: number;
+   /** Enable profiling */
+   enableProfiling?: boolean;
+ }
+ /**
+  * Generation / sampling configuration
+  */
+ export interface GenerationConfig {
+   /** Maximum number of tokens to generate */
+   maxTokens: number;
+   /** Temperature for sampling (0 = greedy) */
+   temperature?: number;
+   /** Top-K sampling (0 = disabled) */
+   topK?: number;
+   /** Top-P / nucleus sampling (1.0 = disabled) */
+   topP?: number;
+   /** Repetition penalty (1.0 = disabled) */
+   repetitionPenalty?: number;
+   /** Stop sequences (generation stops when any is generated) */
+   stopSequences?: number[][];
+   /** EOS token ID */
+   eosTokenId?: number;
+   /** Pad token ID */
+   padTokenId?: number;
+   /** BOS token ID */
+   bosTokenId?: number;
+   /** Stream tokens as they are generated */
+   stream?: boolean;
+   /** Random seed for reproducibility */
+   seed?: number;
+ }
+ /**
+  * Default generation config values
+  */
+ export declare const DEFAULT_GENERATION_CONFIG: Required<Omit<GenerationConfig, "stopSequences" | "seed">>;
+ /**
+  * Reason for generation completion
+  */
+ export type FinishReason = "stop" | "length" | "eos";
+ /**
+  * Result of text generation
+  */
+ export interface GenerationResult {
+   /** Generated token IDs */
+   tokens: number[];
+   /** Finish reason */
+   finishReason: FinishReason;
+   /** Number of prompt tokens */
+   promptTokens: number;
+   /** Number of generated tokens */
+   generatedTokens: number;
+   /** Total time in milliseconds */
+   totalTimeMs: number;
+   /** Tokens per second */
+   tokensPerSecond: number;
+ }
+ /**
+  * Streaming generation token
+  */
+ export interface StreamToken {
+   /** Token ID */
+   tokenId: number;
+   /** Token index in generation */
+   index: number;
+   /** Whether this is the final token */
+   isLast: boolean;
+   /** Finish reason (only set if isLast) */
+   finishReason?: FinishReason;
+ }
+ /**
+  * Forward pass result
+  */
+ export interface ForwardResult {
+   /** Logits [batch, vocabSize] or [batch, seqLen, vocabSize] */
+   logits: Float32Array;
+   /** Shape of logits */
+   logitsShape: number[];
+ }
+ /**
+  * Validate and normalize generation config
+  */
+ export declare function normalizeGenerationConfig(config: Partial<GenerationConfig>): GenerationConfig;
+ /**
+  * Model layer weights
+  */
+ export interface LayerWeights {
+   /** Attention weights */
+   attention: {
+     qProj: Float32Array;
+     kProj: Float32Array;
+     vProj: Float32Array;
+     oProj: Float32Array;
+   };
+   /** FFN weights */
+   ffn: {
+     gate?: Float32Array;
+     up: Float32Array;
+     down: Float32Array;
+   };
+   /** Normalization */
+   inputNorm: Float32Array;
+   postAttentionNorm: Float32Array;
+ }
+ /**
+  * Full model weights
+  */
+ export interface ModelWeights {
+   /** Token embeddings */
+   embedTokens: Float32Array;
+   /** Layer weights */
+   layers: LayerWeights[];
+   /** Final norm */
+   finalNorm: Float32Array;
+   /** LM head (output projection) */
+   lmHead: Float32Array;
+ }
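
The declarations do not reveal how `normalizeGenerationConfig` fills in defaults. A plausible sketch consistent with the declared types, where optional fields are merged over `DEFAULT_GENERATION_CONFIG` while `stopSequences` and `seed` stay optional (they are omitted from the `Required<...>` default type above); the real implementation is in dist/index.js and may differ:

```ts
// Plausible normalization: user config wins over defaults. Note a field
// explicitly set to undefined would clobber its default in this sketch.
function normalize(config: Partial<GenerationConfig>): GenerationConfig {
  const merged = { ...DEFAULT_GENERATION_CONFIG, ...config };
  if (merged.maxTokens <= 0) throw new Error("maxTokens must be positive");
  return merged;
}
```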
package/dist/jit/compiler.d.ts
@@ -0,0 +1,23 @@
+ /**
+  * WGSL Compiler - Generates optimized GPU kernels
+  */
+ import { KernelCache } from "./kernel-cache.ts";
+ import type { DeviceInfo } from "../core/device.ts";
+ export interface MatMulConfig {
+   M: number;
+   N: number;
+   K: number;
+   tileM?: number;
+   tileN?: number;
+   tileK?: number;
+ }
+ export declare class WGSLCompiler {
+   private device;
+   private cache;
+   private deviceInfo;
+   constructor(device: GPUDevice, cache: KernelCache, deviceInfo: DeviceInfo);
+   private selectTileSize;
+   compileMatMul(config: MatMulConfig): GPUComputePipeline;
+   private generateMatMulWGSL;
+   getCacheStats(): import("./kernel-cache.ts").CacheStats;
+ }
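
Hypothetical usage of the JIT path, assuming a `GPUDevice` and `DeviceInfo` are already in hand from the core module's device setup:

```ts
// Assumed-available bindings for the sketch:
declare const gpuDevice: GPUDevice;
declare const deviceInfo: DeviceInfo; // from ../core/device.ts

// Compile (or fetch from cache) a 4096x4096x4096 matmul pipeline; tile
// sizes are left to selectTileSize since tileM/tileN/tileK are optional.
const compiler = new WGSLCompiler(gpuDevice, new KernelCache(gpuDevice), deviceInfo);
const pipeline = compiler.compileMatMul({ M: 4096, N: 4096, K: 4096 });
console.log(compiler.getCacheStats()); // { hits, misses, size }
```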
package/dist/jit/kernel-cache.d.ts
@@ -0,0 +1,21 @@
+ /**
+  * Kernel Cache - Caches compiled GPU compute pipelines
+  */
+ export interface CacheStats {
+   hits: number;
+   misses: number;
+   size: number;
+ }
+ export declare class KernelCache {
+   private device;
+   private cache;
+   private hits;
+   private misses;
+   constructor(device: GPUDevice);
+   getOrCreate(key: string, createFn: () => GPUComputePipeline): GPUComputePipeline;
+   has(key: string): boolean;
+   get(key: string): GPUComputePipeline | undefined;
+   set(key: string, pipeline: GPUComputePipeline): void;
+   getStats(): CacheStats;
+   clear(): void;
+ }
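
`getOrCreate` implies a cache-aside pattern: compile once per key, reuse thereafter. A sketch of how a caller would use it; the key scheme and shader module are illustrative, not taken from the package:

```ts
// Assumed bindings for the sketch:
declare const cache: KernelCache;
declare const gpuDevice: GPUDevice;
declare const shaderModule: GPUShaderModule;

// Each distinct (M, N, K) shape gets its own pipeline, compiled lazily.
const [M, N, K] = [4096, 4096, 4096];
const pipeline = cache.getOrCreate(`matmul:${M}x${N}x${K}`, () =>
  gpuDevice.createComputePipeline({
    layout: "auto",
    compute: { module: shaderModule, entryPoint: "main" },
  })
);
```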
package/dist/model/gguf.d.ts
@@ -0,0 +1,90 @@
+ /**
+  * GGUF Format Parser
+  *
+  * GGUF file structure:
+  * - 4 bytes: magic "GGUF"
+  * - 4 bytes: version (3)
+  * - 8 bytes: n_tensors (u64)
+  * - 8 bytes: n_kv (u64)
+  * - Key-value metadata pairs
+  * - Tensor info descriptors
+  * - Padding to alignment
+  * - Tensor data
+  */
+ import { type GGUFTensorInfo, type TensorInfo, type LoadedModel, type LoadOptions, GGUFQuantType } from "./types.ts";
+ /**
+  * Reader helper for GGUF binary format
+  */
+ declare class GGUFReader {
+   private view;
+   private offset;
+   private textDecoder;
+   constructor(buffer: ArrayBuffer);
+   get position(): number;
+   set position(pos: number);
+   readUint8(): number;
+   readInt8(): number;
+   readUint16(): number;
+   readInt16(): number;
+   readUint32(): number;
+   readInt32(): number;
+   readUint64(): bigint;
+   readInt64(): bigint;
+   readFloat32(): number;
+   readFloat64(): number;
+   readBool(): boolean;
+   readString(): string;
+   alignTo(alignment: number): void;
+ }
+ /**
+  * GGUF header information
+  */
+ export interface GGUFHeader {
+   magic: number;
+   version: number;
+   nTensors: bigint;
+   nKV: bigint;
+ }
+ /**
+  * Parse the GGUF header
+  */
+ export declare function parseGGUFHeader(reader: GGUFReader): GGUFHeader;
+ /**
+  * Parse all metadata key-value pairs
+  */
+ export declare function parseGGUFMetadata(reader: GGUFReader, nKV: bigint): Map<string, unknown>;
+ /**
+  * Parse tensor info descriptors
+  */
+ export declare function parseGGUFTensorInfos(reader: GGUFReader, nTensors: bigint): GGUFTensorInfo[];
+ /**
+  * Calculate byte size for a GGUF tensor
+  */
+ export declare function calculateGGUFTensorBytes(type: GGUFQuantType, shape: number[]): number;
+ /**
+  * Load a GGUF model from an ArrayBuffer
+  */
+ export declare function loadGGUF(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
+ /**
+  * Load GGUF from a URL
+  */
+ export declare function loadGGUFFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
+ /**
+  * Dequantize Q4_0 block to float32
+  * Q4_0: 32 values = 2 bytes scale (f16) + 16 bytes data (4-bit packed)
+  */
+ export declare function dequantizeQ4_0Block(data: Uint8Array, offset: number): Float32Array;
+ /**
+  * Dequantize Q8_0 block to float32
+  * Q8_0: 32 values = 2 bytes scale (f16) + 32 bytes data (int8)
+  */
+ export declare function dequantizeQ8_0Block(data: Uint8Array, offset: number): Float32Array;
+ /**
+  * Load and dequantize a GGUF tensor to Float32Array
+  */
+ export declare function loadGGUFTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
+ /**
+  * Check if a buffer is a valid GGUF file
+  */
+ export declare function isGGUF(buffer: ArrayBuffer): boolean;
+ export {};
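
The Q4_0 doc comment pins the block layout down exactly: 18 bytes per 32 values. A self-contained sketch of that dequantization; the low/high-nibble ordering follows llama.cpp's Q4_0 convention, and the package's actual dequantizeQ4_0Block may differ in details:

```ts
// Decode one Q4_0 block: f16 scale + 16 bytes holding 32 packed 4-bit values.
// llama.cpp stores element i in the low nibble of byte i and element i+16 in
// the high nibble; nibbles are offset by 8 before scaling.
function dequantizeQ4_0(data: Uint8Array, offset: number): Float32Array {
  const view = new DataView(data.buffer, data.byteOffset + offset, 18);
  const scale = f16ToF32(view.getUint16(0, true));
  const out = new Float32Array(32);
  for (let i = 0; i < 16; i++) {
    const byte = view.getUint8(2 + i);
    out[i] = ((byte & 0x0f) - 8) * scale;
    out[i + 16] = ((byte >> 4) - 8) * scale;
  }
  return out;
}

// Minimal IEEE 754 half -> float conversion (avoids a Float16Array dependency).
function f16ToF32(h: number): number {
  const sign = h & 0x8000 ? -1 : 1;
  const exp = (h >> 10) & 0x1f;
  const frac = h & 0x03ff;
  if (exp === 0) return sign * frac * 2 ** -24; // subnormal
  if (exp === 31) return frac ? NaN : sign * Infinity;
  return sign * (1 + frac / 1024) * 2 ** (exp - 15);
}
```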
package/dist/model/index.d.ts
@@ -0,0 +1,16 @@
+ /**
+  * Model Loading Module
+  * Supports SafeTensors and GGUF formats
+  */
+ import type { ModelFormat, LoadOptions, LoadedModel } from "./types.ts";
+ export { type ModelFormat, type SafetensorsDType, GGUFQuantType, GGUFMetadataValueType, type TensorInfo, type SafetensorsHeader, type SafetensorsHeaderEntry, type ModelMetadata, type GGUFTensorInfo, type LoadedTensor, type LoadedModel, type LoadOptions, SAFETENSORS_DTYPE_BYTES, GGUF_QUANT_BLOCK_SIZE, GGUF_QUANT_BYTES_PER_BLOCK, } from "./types.ts";
+ export { parseSafetensorsHeader, getSafetensorsTensorInfos, loadSafetensorsTensor, loadSafetensors, loadSafetensorsFromUrl, isSafetensors, } from "./safetensors.ts";
+ export { type GGUFHeader, parseGGUFHeader, parseGGUFMetadata, parseGGUFTensorInfos, calculateGGUFTensorBytes, loadGGUF, loadGGUFFromUrl, loadGGUFTensor, dequantizeQ4_0Block, dequantizeQ8_0Block, isGGUF, } from "./gguf.ts";
+ /**
+  * Auto-detect model format from buffer
+  */
+ export declare function detectModelFormat(buffer: ArrayBuffer): ModelFormat | null;
+ /**
+  * Load a model file, auto-detecting the format
+  */
+ export declare function loadModel(source: ArrayBuffer | string, options?: LoadOptions): Promise<LoadedModel>;
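
A plausible sketch of what `detectModelFormat` checks, given the two file layouts documented in gguf.d.ts and safetensors.d.ts: GGUF announces itself with a 4-byte magic, while SafeTensors starts with a u64 header length followed by a JSON object:

```ts
// "GGUF" magic = 0x47 0x47 0x55 0x46; SafeTensors JSON begins with '{' (0x7b)
// immediately after the 8-byte little-endian header size.
function detectFormat(buffer: ArrayBuffer): "gguf" | "safetensors" | null {
  const bytes = new Uint8Array(buffer);
  if (bytes.length >= 4 &&
      bytes[0] === 0x47 && bytes[1] === 0x47 &&
      bytes[2] === 0x55 && bytes[3] === 0x46) return "gguf";
  if (bytes.length >= 9 && bytes[8] === 0x7b) return "safetensors";
  return null;
}
```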
package/dist/model/safetensors.d.ts
@@ -0,0 +1,38 @@
+ /**
+  * SafeTensors Format Parser
+  *
+  * SafeTensors file structure:
+  * - 8 bytes: header size (little-endian u64)
+  * - N bytes: JSON header (UTF-8)
+  * - Remaining: tensor data (contiguous)
+  */
+ import { type SafetensorsHeader, type TensorInfo, type LoadedModel, type LoadOptions } from "./types.ts";
+ /**
+  * Parse the SafeTensors header from a buffer
+  * @param buffer - ArrayBuffer containing the SafeTensors file
+  * @returns Parsed header and data offset
+  */
+ export declare function parseSafetensorsHeader(buffer: ArrayBuffer): {
+   header: SafetensorsHeader;
+   dataOffset: number;
+ };
+ /**
+  * Extract tensor information from SafeTensors header
+  */
+ export declare function getSafetensorsTensorInfos(header: SafetensorsHeader, dataOffset: number): Map<string, TensorInfo>;
+ /**
+  * Load a single tensor's data from the buffer
+  */
+ export declare function loadSafetensorsTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
+ /**
+  * Load a SafeTensors model from an ArrayBuffer
+  */
+ export declare function loadSafetensors(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
+ /**
+  * Load SafeTensors from a URL
+  */
+ export declare function loadSafetensorsFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
+ /**
+  * Check if a buffer is a valid SafeTensors file
+  */
+ export declare function isSafetensors(buffer: ArrayBuffer): boolean;
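
The documented layout makes the header parse nearly mechanical. A minimal sketch; the real parseSafetensorsHeader presumably also validates the header and returns typed entries:

```ts
// Read the little-endian u64 header length, decode that many bytes of
// UTF-8 JSON, and report where tensor data begins.
function parseHeader(buffer: ArrayBuffer): { header: unknown; dataOffset: number } {
  const headerSize = Number(new DataView(buffer).getBigUint64(0, true));
  const json = new TextDecoder().decode(new Uint8Array(buffer, 8, headerSize));
  return { header: JSON.parse(json), dataOffset: 8 + headerSize };
}
```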
package/dist/model/types.d.ts
@@ -0,0 +1,182 @@
+ /**
+  * Model Loading Types
+  * Common types for SafeTensors and GGUF model loading
+  */
+ /**
+  * Supported model formats
+  */
+ export type ModelFormat = "safetensors" | "gguf";
+ /**
+  * SafeTensors data types
+  */
+ export type SafetensorsDType = "F64" | "F32" | "F16" | "BF16" | "I64" | "I32" | "I16" | "I8" | "U8" | "BOOL";
+ /**
+  * GGUF quantization types
+  * Based on llama.cpp GGML types
+  */
+ export declare enum GGUFQuantType {
+   F32 = 0,
+   F16 = 1,
+   Q4_0 = 2,
+   Q4_1 = 3,
+   Q5_0 = 6,
+   Q5_1 = 7,
+   Q8_0 = 8,
+   Q8_1 = 9,
+   Q2_K = 10,
+   Q3_K = 11,
+   Q4_K = 12,
+   Q5_K = 13,
+   Q6_K = 14,
+   Q8_K = 15,
+   IQ2_XXS = 16,
+   IQ2_XS = 17,
+   IQ3_XXS = 18,
+   IQ1_S = 19,
+   IQ4_NL = 20,
+   IQ3_S = 21,
+   IQ2_S = 22,
+   IQ4_XS = 23,
+   I8 = 24,
+   I16 = 25,
+   I32 = 26,
+   I64 = 27,
+   F64 = 28,
+   BF16 = 29
+ }
+ /**
+  * GGUF metadata value types
+  */
+ export declare enum GGUFMetadataValueType {
+   UINT8 = 0,
+   INT8 = 1,
+   UINT16 = 2,
+   INT16 = 3,
+   UINT32 = 4,
+   INT32 = 5,
+   FLOAT32 = 6,
+   BOOL = 7,
+   STRING = 8,
+   ARRAY = 9,
+   UINT64 = 10,
+   INT64 = 11,
+   FLOAT64 = 12
+ }
+ /**
+  * Information about a single tensor in a model file
+  */
+ export interface TensorInfo {
+   /** Tensor name (e.g., "model.layers.0.attention.wq.weight") */
+   name: string;
+   /** Tensor shape (e.g., [4096, 4096]) */
+   shape: number[];
+   /** Data type or quantization type */
+   dtype: SafetensorsDType | GGUFQuantType;
+   /** Byte offset in the file's data section */
+   offset: number;
+   /** Total bytes for this tensor's data */
+   byteSize: number;
+ }
+ /**
+  * SafeTensors header entry for a single tensor
+  */
+ export interface SafetensorsHeaderEntry {
+   dtype: SafetensorsDType;
+   shape: number[];
+   data_offsets: [number, number];
+ }
+ /**
+  * Parsed SafeTensors header
+  */
+ export interface SafetensorsHeader {
+   tensors: Record<string, SafetensorsHeaderEntry>;
+   __metadata__?: Record<string, string>;
+ }
+ /**
+  * Model metadata extracted from file headers
+  */
+ export interface ModelMetadata {
+   /** Model format */
+   format: ModelFormat;
+   /** Model name (if available) */
+   name?: string;
+   /** Model architecture (e.g., "llama", "mistral", "gpt2") */
+   architecture?: string;
+   /** Context length */
+   contextLength?: number;
+   /** Embedding dimension */
+   embeddingLength?: number;
+   /** Number of layers */
+   numLayers?: number;
+   /** Number of attention heads */
+   numHeads?: number;
+   /** Number of KV heads (for GQA) */
+   numKVHeads?: number;
+   /** Vocabulary size */
+   vocabSize?: number;
+   /** Head dimension */
+   headDim?: number;
+   /** Hidden size for FFN */
+   hiddenSize?: number;
+   /** RoPE frequency base */
+   ropeFreqBase?: number;
+   /** Additional metadata */
+   extra?: Record<string, unknown>;
+ }
+ /**
+  * GGUF tensor descriptor (from file)
+  */
+ export interface GGUFTensorInfo {
+   name: string;
+   nDims: number;
+   dimensions: bigint[];
+   type: GGUFQuantType;
+   offset: bigint;
+ }
+ /**
+  * Loaded tensor data
+  */
+ export interface LoadedTensor {
+   info: TensorInfo;
+   /** Raw data (Float32Array for F32, Uint8Array for quantized) */
+   data: Float32Array | Float16Array | Uint8Array | Int8Array;
+ }
+ /**
+  * Fully loaded model
+  */
+ export interface LoadedModel {
+   /** Model metadata */
+   metadata: ModelMetadata;
+   /** Map of tensor names to tensor info */
+   tensorInfos: Map<string, TensorInfo>;
+   /** Total size of tensor data in bytes */
+   totalBytes: number;
+   /** The raw buffer (for lazy loading) */
+   buffer: ArrayBuffer;
+   /** Offset where tensor data starts */
+   dataOffset: number;
+ }
+ /**
+  * Model loading options
+  */
+ export interface LoadOptions {
+   /** Only load metadata, not tensor data */
+   metadataOnly?: boolean;
+   /** Filter which tensors to load by name pattern */
+   tensorFilter?: (name: string) => boolean;
+   /** Progress callback */
+   onProgress?: (loaded: number, total: number) => void;
+ }
+ /**
+  * Bytes per element for SafeTensors dtypes
+  */
+ export declare const SAFETENSORS_DTYPE_BYTES: Record<SafetensorsDType, number>;
+ /**
+  * Block size for GGUF quantization types
+  * Most quantization types process data in blocks
+  */
+ export declare const GGUF_QUANT_BLOCK_SIZE: Partial<Record<GGUFQuantType, number>>;
+ /**
+  * Bytes per block for GGUF quantization types
+  */
+ export declare const GGUF_QUANT_BYTES_PER_BLOCK: Partial<Record<GGUFQuantType, number>>;
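
The two block tables at the end combine to size quantized tensors. For example, under the Q4_0 layout documented in gguf.d.ts (32 values per 18-byte block), a [4096, 4096] tensor occupies (4096 * 4096 / 32) * 18 = 9,437,184 bytes. A sketch of the general rule:

```ts
// Quantized tensor size = ceil(elements / blockSize) * bytesPerBlock.
function quantTensorBytes(shape: number[], blockSize: number, bytesPerBlock: number): number {
  const elements = shape.reduce((a, b) => a * b, 1);
  return Math.ceil(elements / blockSize) * bytesPerBlock;
}

quantTensorBytes([4096, 4096], 32, 18); // 9_437_184
```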
package/dist/ops/activations.d.ts
@@ -0,0 +1,43 @@
+ /**
+  * Activation Functions
+  * GeLU, SiLU (Swish), ReLU for transformer models
+  */
+ import type { WebInferDevice } from "../core/device.ts";
+ import { Tensor } from "../core/tensor.ts";
+ /**
+  * GeLU (Gaussian Error Linear Unit) - CPU
+  * Used in BERT, GPT-2, etc.
+  * Approximation: x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+  */
+ export declare function geluCPU(x: Float32Array): Float32Array;
+ /**
+  * GeLU exact (using erf) - CPU
+  * More accurate but slower
+  */
+ export declare function geluExactCPU(x: Float32Array): Float32Array;
+ /**
+  * SiLU (Sigmoid Linear Unit / Swish) - CPU
+  * Used in Llama, Mistral, etc.
+  * Formula: x * sigmoid(x) = x / (1 + exp(-x))
+  */
+ export declare function siluCPU(x: Float32Array): Float32Array;
+ /**
+  * ReLU (Rectified Linear Unit) - CPU
+  */
+ export declare function reluCPU(x: Float32Array): Float32Array;
+ /**
+  * Sigmoid - CPU
+  */
+ export declare function sigmoidCPU(x: Float32Array): Float32Array;
+ /**
+  * GeLU - GPU
+  */
+ export declare function gelu(device: WebInferDevice, x: Tensor): Promise<Tensor>;
+ /**
+  * SiLU - GPU
+  */
+ export declare function silu(device: WebInferDevice, x: Tensor): Promise<Tensor>;
+ /**
+  * ReLU - GPU
+  */
+ export declare function relu(device: WebInferDevice, x: Tensor): Promise<Tensor>;
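
The CPU doc comments give the formulas outright. A sketch of the two that matter for Llama-style SwiGLU FFNs, matching the expressions quoted above (the package's geluCPU/siluCPU should be equivalent):

```ts
// GeLU tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
function geluApprox(x: Float32Array): Float32Array {
  const out = new Float32Array(x.length);
  const c = Math.sqrt(2 / Math.PI);
  for (let i = 0; i < x.length; i++) {
    const v = x[i];
    out[i] = 0.5 * v * (1 + Math.tanh(c * (v + 0.044715 * v * v * v)));
  }
  return out;
}

// SiLU: x * sigmoid(x) = x / (1 + exp(-x))
function silu(x: Float32Array): Float32Array {
  const out = new Float32Array(x.length);
  for (let i = 0; i < x.length; i++) out[i] = x[i] / (1 + Math.exp(-x[i]));
  return out;
}
```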