webinfer 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +38 -33
  2. package/dist/activation/index.d.ts +30 -0
  3. package/dist/core/context.d.ts +60 -0
  4. package/dist/core/paged-kv-cache.d.ts +33 -0
  5. package/dist/core/tensor.d.ts +38 -19
  6. package/dist/core/types.d.ts +27 -0
  7. package/dist/decode/index.d.ts +65 -0
  8. package/dist/gemm/index.d.ts +25 -0
  9. package/dist/index.d.ts +26 -19
  10. package/dist/index.js +2508 -3885
  11. package/dist/kernels/activation.wgsl.d.ts +14 -0
  12. package/dist/kernels/batch-decode-paged.wgsl.d.ts +12 -0
  13. package/dist/kernels/batch-prefill-paged.wgsl.d.ts +13 -0
  14. package/dist/kernels/decode-attention.wgsl.d.ts +16 -0
  15. package/dist/kernels/gemm.wgsl.d.ts +17 -0
  16. package/dist/kernels/page.wgsl.d.ts +10 -0
  17. package/dist/kernels/prefill-attention.wgsl.d.ts +17 -0
  18. package/dist/kernels/rmsnorm.wgsl.d.ts +10 -0
  19. package/dist/kernels/rope.wgsl.d.ts +19 -0
  20. package/dist/kernels/sampling.wgsl.d.ts +23 -0
  21. package/dist/norm/index.d.ts +43 -0
  22. package/dist/page/index.d.ts +21 -0
  23. package/dist/prefill/index.d.ts +69 -0
  24. package/dist/rope/index.d.ts +37 -0
  25. package/dist/sampling/index.d.ts +53 -4
  26. package/package.json +1 -1
  27. package/dist/attention/block-sparse/format.d.ts +0 -52
  28. package/dist/attention/block-sparse/patterns/causal.d.ts +0 -16
  29. package/dist/attention/block-sparse/patterns/sliding.d.ts +0 -22
  30. package/dist/attention/flash-attention.d.ts +0 -30
  31. package/dist/attention/index.d.ts +0 -9
  32. package/dist/attention/paged-kv/block-manager.d.ts +0 -102
  33. package/dist/attention/paged-kv/index.d.ts +0 -5
  34. package/dist/attention/paged-kv/page-table.d.ts +0 -99
  35. package/dist/attention/scheduler.d.ts +0 -40
  36. package/dist/core/buffer-pool.d.ts +0 -18
  37. package/dist/core/device.d.ts +0 -23
  38. package/dist/inference/engine.d.ts +0 -69
  39. package/dist/inference/generate.d.ts +0 -30
  40. package/dist/inference/index.d.ts +0 -7
  41. package/dist/inference/types.d.ts +0 -161
  42. package/dist/jit/compiler.d.ts +0 -23
  43. package/dist/jit/kernel-cache.d.ts +0 -21
  44. package/dist/model/gguf.d.ts +0 -90
  45. package/dist/model/index.d.ts +0 -16
  46. package/dist/model/safetensors.d.ts +0 -38
  47. package/dist/model/types.d.ts +0 -182
  48. package/dist/ops/activations.d.ts +0 -43
  49. package/dist/ops/elementwise.d.ts +0 -38
  50. package/dist/ops/embedding.d.ts +0 -30
  51. package/dist/ops/matmul.d.ts +0 -21
  52. package/dist/ops/normalization.d.ts +0 -24
  53. package/dist/ops/reshape.d.ts +0 -39
  54. package/dist/ops/rope.d.ts +0 -32
  55. package/dist/ops/softmax.d.ts +0 -18
  56. package/dist/quantization/index.d.ts +0 -6
  57. package/dist/quantization/qmatmul.d.ts +0 -38
  58. package/dist/quantization/quantize.d.ts +0 -52
  59. package/dist/sampling/sampler.d.ts +0 -39
  60. package/dist/sampling/top-k.d.ts +0 -24
  61. package/dist/sampling/top-p.d.ts +0 -14
@@ -1,99 +0,0 @@
1
- /**
2
- * Paged KV Cache - Software page table for efficient memory management
3
- * Inspired by vLLM's PagedAttention
4
- */
5
- import type { WebInferDevice } from "../../core/device.ts";
6
- /**
7
- * Configuration for PagedKVCache
8
- */
9
- export interface PagedKVCacheConfig {
10
- numLayers: number;
11
- numHeads: number;
12
- headDim: number;
13
- pageSize: number;
14
- maxPages: number;
15
- dtype?: "f32" | "f16";
16
- }
17
- /**
18
- * Entry in the page table for a sequence
19
- */
20
- export interface SequenceEntry {
21
- seqId: number;
22
- pages: number[];
23
- length: number;
24
- }
25
- /**
26
- * PagedKVCache - Manages KV cache with paging for efficient memory use
27
- *
28
- * Benefits:
29
- * 1. No memory fragmentation - pages are fixed size
30
- * 2. Efficient memory sharing - multiple sequences can share cache
31
- * 3. Dynamic allocation - only allocate pages as needed
32
- * 4. Easy defragmentation - just remap logical to physical pages
33
- */
34
- export declare class PagedKVCache {
35
- private device;
36
- private config;
37
- private keyCache;
38
- private valueCache;
39
- private pageTable;
40
- private freePages;
41
- private nextSeqId;
42
- constructor(device: WebInferDevice, config: PagedKVCacheConfig);
43
- /**
44
- * Allocate pages for a new sequence
45
- */
46
- allocateSequence(initialLength?: number): number;
47
- /**
48
- * Extend a sequence with new tokens
49
- */
50
- extendSequence(seqId: number, numNewTokens: number): void;
51
- /**
52
- * Free a sequence and its pages
53
- */
54
- freeSequence(seqId: number): void;
55
- /**
56
- * Get page indices for a sequence
57
- */
58
- getSequencePages(seqId: number): number[] | null;
59
- /**
60
- * Get sequence length
61
- */
62
- getSequenceLength(seqId: number): number;
63
- /**
64
- * Get the physical page index for a given sequence position
65
- */
66
- getPageForPosition(seqId: number, position: number): number | null;
67
- /**
68
- * Get offset within a page for a given position
69
- */
70
- getOffsetInPage(position: number): number;
71
- private allocatePage;
72
- private freePage;
73
- /**
74
- * Get cache statistics
75
- */
76
- getStats(): {
77
- totalPages: number;
78
- usedPages: number;
79
- freePages: number;
80
- numSequences: number;
81
- memoryUsedBytes: number;
82
- memoryTotalBytes: number;
83
- };
84
- /**
85
- * Get GPU buffers for kernel binding
86
- */
87
- getBuffers(): {
88
- keyCache: GPUBuffer;
89
- valueCache: GPUBuffer;
90
- };
91
- /**
92
- * Get configuration
93
- */
94
- getConfig(): PagedKVCacheConfig;
95
- /**
96
- * Dispose GPU resources
97
- */
98
- dispose(): void;
99
- }
@@ -1,40 +0,0 @@
1
- /**
2
- * Attention Scheduler - Prevents TDR (GPU timeout) by splitting long sequences
3
- */
4
- import type { WebInferDevice } from "../core/device.ts";
5
- export interface ChunkPlan {
6
- numChunks: number;
7
- chunkSize: number;
8
- estimatedTimeMs: number;
9
- }
10
- /**
11
- * Attention Scheduler for TDR prevention
12
- * Splits long sequences into chunks to avoid GPU timeout
13
- */
14
- export declare class AttentionScheduler {
15
- private device;
16
- private tdrLimit;
17
- constructor(device: WebInferDevice);
18
- private detectTDRLimit;
19
- /**
20
- * Estimate execution time for attention operation
21
- * Based on empirical formula: time ∝ seqLen² × numHeads × headDim
22
- */
23
- estimateExecutionTime(seqLen: number, numHeads: number, headDim: number): number;
24
- /**
25
- * Compute chunk plan for given sequence length
26
- */
27
- computeChunkPlan(seqLen: number, numHeads: number, headDim: number): ChunkPlan;
28
- /**
29
- * Yield to main thread to prevent TDR
30
- */
31
- yieldToMain(): Promise<void>;
32
- /**
33
- * Check if sequence might cause TDR
34
- */
35
- mightCauseTDR(seqLen: number, numHeads: number, headDim: number): boolean;
36
- /**
37
- * Get recommended maximum sequence length for single-pass execution
38
- */
39
- getMaxSinglePassSeqLen(numHeads: number, headDim: number): number;
40
- }
@@ -1,18 +0,0 @@
1
- /**
2
- * GPU Buffer Pool - Reduces allocation overhead and memory fragmentation
3
- */
4
- export declare class BufferPool {
5
- private device;
6
- private pools;
7
- private sizeClasses;
8
- constructor(device: GPUDevice);
9
- private getSizeClass;
10
- acquire(size: number, usage: GPUBufferUsageFlags): GPUBuffer;
11
- release(buffer: GPUBuffer): void;
12
- getStats(): {
13
- totalBuffers: number;
14
- inUse: number;
15
- totalBytes: number;
16
- };
17
- dispose(): void;
18
- }
@@ -1,23 +0,0 @@
1
- /**
2
- * WebGPU Device Management
3
- */
4
- export interface DeviceInfo {
5
- vendor: "apple" | "nvidia" | "intel" | "amd" | "unknown";
6
- architecture: string;
7
- maxWorkgroupSize: number;
8
- maxComputeInvocationsPerWorkgroup: number;
9
- maxStorageBufferBindingSize: number;
10
- }
11
- export declare class WebInferDevice {
12
- private _device;
13
- private _info;
14
- private constructor();
15
- static create(): Promise<WebInferDevice>;
16
- private static detectDeviceInfo;
17
- get device(): GPUDevice;
18
- get info(): DeviceInfo;
19
- get limits(): GPUSupportedLimits;
20
- createCommandEncoder(): GPUCommandEncoder;
21
- submit(commandBuffers: GPUCommandBuffer[]): void;
22
- dispose(): void;
23
- }
@@ -1,69 +0,0 @@
1
- /**
2
- * Inference Engine
3
- * Core engine for running LLM inference
4
- */
5
- import type { WebInferDevice } from "../core/device.ts";
6
- import type { LoadedModel } from "../model/types.ts";
7
- import type { ModelConfig, InferenceConfig, ForwardResult } from "./types.ts";
8
- /**
9
- * Inference Engine
10
- * Manages model weights and provides forward pass functionality
11
- */
12
- export declare class InferenceEngine {
13
- private device;
14
- private config;
15
- private modelConfig;
16
- private weights;
17
- private loadedModel;
18
- private kvCache;
19
- private ropeFreqsCos;
20
- private ropeFreqsSin;
21
- constructor(device: WebInferDevice | null, config?: InferenceConfig);
22
- /**
23
- * Load model weights from a LoadedModel
24
- */
25
- loadModel(model: LoadedModel, modelConfig: ModelConfig): Promise<void>;
26
- /**
27
- * Extract model weights from loaded model
28
- */
29
- private extractWeights;
30
- /**
31
- * Initialize KV cache
32
- */
33
- private initKVCache;
34
- /**
35
- * Reset KV cache (for new sequence)
36
- */
37
- resetKVCache(): void;
38
- /**
39
- * Forward pass (CPU reference implementation)
40
- * @param inputIds - Input token IDs [seqLen]
41
- * @param startPos - Starting position for KV cache
42
- * @returns Logits for the last token
43
- */
44
- forward(inputIds: Uint32Array, startPos?: number): ForwardResult;
45
- /**
46
- * Attention forward pass
47
- */
48
- private attentionForward;
49
- /**
50
- * Apply RoPE to a single head position
51
- */
52
- private applyRoPE;
53
- /**
54
- * FFN forward pass (SwiGLU)
55
- */
56
- private ffnForward;
57
- /**
58
- * Get model configuration
59
- */
60
- getModelConfig(): ModelConfig | null;
61
- /**
62
- * Check if model is loaded
63
- */
64
- isLoaded(): boolean;
65
- /**
66
- * Dispose resources
67
- */
68
- dispose(): void;
69
- }
@@ -1,30 +0,0 @@
1
- /**
2
- * Text Generation API
3
- * High-level API for generating text with LLMs
4
- */
5
- import type { InferenceEngine } from "./engine.ts";
6
- import type { GenerationConfig, GenerationResult, StreamToken } from "./types.ts";
7
- /**
8
- * Apply generation config to logits and sample next token
9
- */
10
- export declare function sampleNextToken(logits: Float32Array, config: GenerationConfig, generatedTokens?: number[]): number;
11
- /**
12
- * Generate tokens from a prompt
13
- * @param engine - Initialized inference engine with loaded model
14
- * @param promptTokens - Tokenized prompt
15
- * @param config - Generation configuration
16
- * @returns Generation result
17
- */
18
- export declare function generate(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): Promise<GenerationResult>;
19
- /**
20
- * Generate tokens with streaming (async iterator)
21
- * @param engine - Initialized inference engine with loaded model
22
- * @param promptTokens - Tokenized prompt
23
- * @param config - Generation configuration
24
- * @yields StreamToken for each generated token
25
- */
26
- export declare function generateStream(engine: InferenceEngine, promptTokens: number[] | Uint32Array, config?: Partial<GenerationConfig>): AsyncGenerator<StreamToken, void, unknown>;
27
- /**
28
- * Simple greedy decode (no sampling, fastest)
29
- */
30
- export declare function greedyDecode(engine: InferenceEngine, promptTokens: number[] | Uint32Array, maxTokens: number, eosTokenId?: number): number[];
@@ -1,7 +0,0 @@
1
- /**
2
- * Inference Module
3
- * High-level API for LLM inference
4
- */
5
- export { type ModelConfig, type InferenceConfig, type GenerationConfig, type GenerationResult, type StreamToken, type FinishReason, type ForwardResult, type ModelWeights, type LayerWeights, DEFAULT_GENERATION_CONFIG, normalizeGenerationConfig, } from "./types.ts";
6
- export { InferenceEngine } from "./engine.ts";
7
- export { generate, generateStream, greedyDecode, sampleNextToken, } from "./generate.ts";
@@ -1,161 +0,0 @@
1
- /**
2
- * Inference Types
3
- * Configuration and result types for model inference
4
- */
5
- import type { DType } from "../core/tensor.ts";
6
- /**
7
- * Model architecture configuration
8
- */
9
- export interface ModelConfig {
10
- /** Model architecture (e.g., "llama", "mistral", "gpt2") */
11
- architecture: string;
12
- /** Number of transformer layers */
13
- numLayers: number;
14
- /** Number of attention heads */
15
- numHeads: number;
16
- /** Number of key-value heads (for GQA, defaults to numHeads) */
17
- numKVHeads?: number;
18
- /** Hidden/embedding dimension */
19
- hiddenSize: number;
20
- /** Intermediate size for FFN */
21
- intermediateSize: number;
22
- /** Vocabulary size */
23
- vocabSize: number;
24
- /** Maximum sequence length */
25
- maxSeqLen: number;
26
- /** Head dimension (defaults to hiddenSize / numHeads) */
27
- headDim?: number;
28
- /** RoPE frequency base */
29
- ropeFreqBase?: number;
30
- /** RMS norm epsilon */
31
- rmsNormEps?: number;
32
- /** Data type for computation */
33
- dtype?: DType;
34
- }
35
- /**
36
- * Inference engine configuration
37
- */
38
- export interface InferenceConfig {
39
- /** Maximum batch size */
40
- maxBatchSize?: number;
41
- /** Maximum sequence length */
42
- maxSeqLen?: number;
43
- /** Use KV cache for generation */
44
- useKVCache?: boolean;
45
- /** Memory limit in bytes (optional) */
46
- memoryLimit?: number;
47
- /** Enable profiling */
48
- enableProfiling?: boolean;
49
- }
50
- /**
51
- * Generation / sampling configuration
52
- */
53
- export interface GenerationConfig {
54
- /** Maximum number of tokens to generate */
55
- maxTokens: number;
56
- /** Temperature for sampling (0 = greedy) */
57
- temperature?: number;
58
- /** Top-K sampling (0 = disabled) */
59
- topK?: number;
60
- /** Top-P / nucleus sampling (1.0 = disabled) */
61
- topP?: number;
62
- /** Repetition penalty (1.0 = disabled) */
63
- repetitionPenalty?: number;
64
- /** Stop sequences (generation stops when any is generated) */
65
- stopSequences?: number[][];
66
- /** EOS token ID */
67
- eosTokenId?: number;
68
- /** Pad token ID */
69
- padTokenId?: number;
70
- /** BOS token ID */
71
- bosTokenId?: number;
72
- /** Stream tokens as they are generated */
73
- stream?: boolean;
74
- /** Random seed for reproducibility */
75
- seed?: number;
76
- }
77
- /**
78
- * Default generation config values
79
- */
80
- export declare const DEFAULT_GENERATION_CONFIG: Required<Omit<GenerationConfig, "stopSequences" | "seed">>;
81
- /**
82
- * Reason for generation completion
83
- */
84
- export type FinishReason = "stop" | "length" | "eos";
85
- /**
86
- * Result of text generation
87
- */
88
- export interface GenerationResult {
89
- /** Generated token IDs */
90
- tokens: number[];
91
- /** Finish reason */
92
- finishReason: FinishReason;
93
- /** Number of prompt tokens */
94
- promptTokens: number;
95
- /** Number of generated tokens */
96
- generatedTokens: number;
97
- /** Total time in milliseconds */
98
- totalTimeMs: number;
99
- /** Tokens per second */
100
- tokensPerSecond: number;
101
- }
102
- /**
103
- * Streaming generation token
104
- */
105
- export interface StreamToken {
106
- /** Token ID */
107
- tokenId: number;
108
- /** Token index in generation */
109
- index: number;
110
- /** Whether this is the final token */
111
- isLast: boolean;
112
- /** Finish reason (only set if isLast) */
113
- finishReason?: FinishReason;
114
- }
115
- /**
116
- * Forward pass result
117
- */
118
- export interface ForwardResult {
119
- /** Logits [batch, vocabSize] or [batch, seqLen, vocabSize] */
120
- logits: Float32Array;
121
- /** Shape of logits */
122
- logitsShape: number[];
123
- }
124
- /**
125
- * Validate and normalize generation config
126
- */
127
- export declare function normalizeGenerationConfig(config: Partial<GenerationConfig>): GenerationConfig;
128
- /**
129
- * Model layer weights
130
- */
131
- export interface LayerWeights {
132
- /** Attention weights */
133
- attention: {
134
- qProj: Float32Array;
135
- kProj: Float32Array;
136
- vProj: Float32Array;
137
- oProj: Float32Array;
138
- };
139
- /** FFN weights */
140
- ffn: {
141
- gate?: Float32Array;
142
- up: Float32Array;
143
- down: Float32Array;
144
- };
145
- /** Normalization */
146
- inputNorm: Float32Array;
147
- postAttentionNorm: Float32Array;
148
- }
149
- /**
150
- * Full model weights
151
- */
152
- export interface ModelWeights {
153
- /** Token embeddings */
154
- embedTokens: Float32Array;
155
- /** Layer weights */
156
- layers: LayerWeights[];
157
- /** Final norm */
158
- finalNorm: Float32Array;
159
- /** LM head (output projection) */
160
- lmHead: Float32Array;
161
- }
@@ -1,23 +0,0 @@
1
- /**
2
- * WGSL Compiler - Generates optimized GPU kernels
3
- */
4
- import { KernelCache } from "./kernel-cache.ts";
5
- import type { DeviceInfo } from "../core/device.ts";
6
- export interface MatMulConfig {
7
- M: number;
8
- N: number;
9
- K: number;
10
- tileM?: number;
11
- tileN?: number;
12
- tileK?: number;
13
- }
14
- export declare class WGSLCompiler {
15
- private device;
16
- private cache;
17
- private deviceInfo;
18
- constructor(device: GPUDevice, cache: KernelCache, deviceInfo: DeviceInfo);
19
- private selectTileSize;
20
- compileMatMul(config: MatMulConfig): GPUComputePipeline;
21
- private generateMatMulWGSL;
22
- getCacheStats(): import("./kernel-cache.ts").CacheStats;
23
- }
@@ -1,21 +0,0 @@
1
- /**
2
- * Kernel Cache - Caches compiled GPU compute pipelines
3
- */
4
- export interface CacheStats {
5
- hits: number;
6
- misses: number;
7
- size: number;
8
- }
9
- export declare class KernelCache {
10
- private device;
11
- private cache;
12
- private hits;
13
- private misses;
14
- constructor(device: GPUDevice);
15
- getOrCreate(key: string, createFn: () => GPUComputePipeline): GPUComputePipeline;
16
- has(key: string): boolean;
17
- get(key: string): GPUComputePipeline | undefined;
18
- set(key: string, pipeline: GPUComputePipeline): void;
19
- getStats(): CacheStats;
20
- clear(): void;
21
- }
@@ -1,90 +0,0 @@
1
- /**
2
- * GGUF Format Parser
3
- *
4
- * GGUF file structure:
5
- * - 4 bytes: magic "GGUF"
6
- * - 4 bytes: version (3)
7
- * - 8 bytes: n_tensors (u64)
8
- * - 8 bytes: n_kv (u64)
9
- * - Key-value metadata pairs
10
- * - Tensor info descriptors
11
- * - Padding to alignment
12
- * - Tensor data
13
- */
14
- import { type GGUFTensorInfo, type TensorInfo, type LoadedModel, type LoadOptions, GGUFQuantType } from "./types.ts";
15
- /**
16
- * Reader helper for GGUF binary format
17
- */
18
- declare class GGUFReader {
19
- private view;
20
- private offset;
21
- private textDecoder;
22
- constructor(buffer: ArrayBuffer);
23
- get position(): number;
24
- set position(pos: number);
25
- readUint8(): number;
26
- readInt8(): number;
27
- readUint16(): number;
28
- readInt16(): number;
29
- readUint32(): number;
30
- readInt32(): number;
31
- readUint64(): bigint;
32
- readInt64(): bigint;
33
- readFloat32(): number;
34
- readFloat64(): number;
35
- readBool(): boolean;
36
- readString(): string;
37
- alignTo(alignment: number): void;
38
- }
39
- /**
40
- * GGUF header information
41
- */
42
- export interface GGUFHeader {
43
- magic: number;
44
- version: number;
45
- nTensors: bigint;
46
- nKV: bigint;
47
- }
48
- /**
49
- * Parse the GGUF header
50
- */
51
- export declare function parseGGUFHeader(reader: GGUFReader): GGUFHeader;
52
- /**
53
- * Parse all metadata key-value pairs
54
- */
55
- export declare function parseGGUFMetadata(reader: GGUFReader, nKV: bigint): Map<string, unknown>;
56
- /**
57
- * Parse tensor info descriptors
58
- */
59
- export declare function parseGGUFTensorInfos(reader: GGUFReader, nTensors: bigint): GGUFTensorInfo[];
60
- /**
61
- * Calculate byte size for a GGUF tensor
62
- */
63
- export declare function calculateGGUFTensorBytes(type: GGUFQuantType, shape: number[]): number;
64
- /**
65
- * Load a GGUF model from an ArrayBuffer
66
- */
67
- export declare function loadGGUF(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
68
- /**
69
- * Load GGUF from a URL
70
- */
71
- export declare function loadGGUFFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
72
- /**
73
- * Dequantize Q4_0 block to float32
74
- * Q4_0: 32 values = 2 bytes scale (f16) + 16 bytes data (4-bit packed)
75
- */
76
- export declare function dequantizeQ4_0Block(data: Uint8Array, offset: number): Float32Array;
77
- /**
78
- * Dequantize Q8_0 block to float32
79
- * Q8_0: 32 values = 2 bytes scale (f16) + 32 bytes data (int8)
80
- */
81
- export declare function dequantizeQ8_0Block(data: Uint8Array, offset: number): Float32Array;
82
- /**
83
- * Load and dequantize a GGUF tensor to Float32Array
84
- */
85
- export declare function loadGGUFTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
86
- /**
87
- * Check if a buffer is a valid GGUF file
88
- */
89
- export declare function isGGUF(buffer: ArrayBuffer): boolean;
90
- export {};
@@ -1,16 +0,0 @@
1
- /**
2
- * Model Loading Module
3
- * Supports SafeTensors and GGUF formats
4
- */
5
- import type { ModelFormat, LoadOptions, LoadedModel } from "./types.ts";
6
- export { type ModelFormat, type SafetensorsDType, GGUFQuantType, GGUFMetadataValueType, type TensorInfo, type SafetensorsHeader, type SafetensorsHeaderEntry, type ModelMetadata, type GGUFTensorInfo, type LoadedTensor, type LoadedModel, type LoadOptions, SAFETENSORS_DTYPE_BYTES, GGUF_QUANT_BLOCK_SIZE, GGUF_QUANT_BYTES_PER_BLOCK, } from "./types.ts";
7
- export { parseSafetensorsHeader, getSafetensorsTensorInfos, loadSafetensorsTensor, loadSafetensors, loadSafetensorsFromUrl, isSafetensors, } from "./safetensors.ts";
8
- export { type GGUFHeader, parseGGUFHeader, parseGGUFMetadata, parseGGUFTensorInfos, calculateGGUFTensorBytes, loadGGUF, loadGGUFFromUrl, loadGGUFTensor, dequantizeQ4_0Block, dequantizeQ8_0Block, isGGUF, } from "./gguf.ts";
9
- /**
10
- * Auto-detect model format from buffer
11
- */
12
- export declare function detectModelFormat(buffer: ArrayBuffer): ModelFormat | null;
13
- /**
14
- * Load a model file, auto-detecting the format
15
- */
16
- export declare function loadModel(source: ArrayBuffer | string, options?: LoadOptions): Promise<LoadedModel>;
@@ -1,38 +0,0 @@
1
- /**
2
- * SafeTensors Format Parser
3
- *
4
- * SafeTensors file structure:
5
- * - 8 bytes: header size (little-endian u64)
6
- * - N bytes: JSON header (UTF-8)
7
- * - Remaining: tensor data (contiguous)
8
- */
9
- import { type SafetensorsHeader, type TensorInfo, type LoadedModel, type LoadOptions } from "./types.ts";
10
- /**
11
- * Parse the SafeTensors header from a buffer
12
- * @param buffer - ArrayBuffer containing the SafeTensors file
13
- * @returns Parsed header and data offset
14
- */
15
- export declare function parseSafetensorsHeader(buffer: ArrayBuffer): {
16
- header: SafetensorsHeader;
17
- dataOffset: number;
18
- };
19
- /**
20
- * Extract tensor information from SafeTensors header
21
- */
22
- export declare function getSafetensorsTensorInfos(header: SafetensorsHeader, dataOffset: number): Map<string, TensorInfo>;
23
- /**
24
- * Load a single tensor's data from the buffer
25
- */
26
- export declare function loadSafetensorsTensor(buffer: ArrayBuffer, info: TensorInfo): Float32Array;
27
- /**
28
- * Load a SafeTensors model from an ArrayBuffer
29
- */
30
- export declare function loadSafetensors(buffer: ArrayBuffer, options?: LoadOptions): LoadedModel;
31
- /**
32
- * Load SafeTensors from a URL
33
- */
34
- export declare function loadSafetensorsFromUrl(url: string, options?: LoadOptions): Promise<LoadedModel>;
35
- /**
36
- * Check if a buffer is a valid SafeTensors file
37
- */
38
- export declare function isSafetensors(buffer: ArrayBuffer): boolean;