@sauravpanda/flare 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/demo/README.md +40 -0
- package/demo/index.html +1767 -0
- package/js/index.ts +91 -0
- package/js/types.ts +136 -0
- package/js/webtransport-loader.js +126 -0
- package/js/worker.ts +159 -0
- package/package.json +58 -0
- package/pkg/flare_web.d.ts +1164 -0
- package/pkg/flare_web.js +2790 -0
- package/pkg/flare_web_bg.wasm +0 -0
- package/pkg/flare_web_bg.wasm.d.ts +105 -0
- package/pkg/package.json +27 -0
|
@@ -0,0 +1,1164 @@
|
|
|
1
|
+
/* tslint:disable */
|
|
2
|
+
/* eslint-disable */
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Flare LLM inference engine, exported to JS.
|
|
6
|
+
*
|
|
7
|
+
* Holds a loaded model and runs greedy/sampled token generation.
|
|
8
|
+
* The detected chat template is available via `chat_template_name` and
|
|
9
|
+
* `apply_chat_template` so the browser demo can format prompts correctly
|
|
10
|
+
* for instruction-tuned models.
|
|
11
|
+
*/
|
|
12
|
+
export class FlareEngine {
|
|
13
|
+
private constructor();
|
|
14
|
+
free(): void;
|
|
15
|
+
[Symbol.dispose](): void;
|
|
16
|
+
/**
|
|
17
|
+
* Register a stop sequence.
|
|
18
|
+
*
|
|
19
|
+
* Generation halts (without emitting the matched tokens) as soon as the
|
|
20
|
+
* decoded output ends with `sequence`. Call once per stop string before
|
|
21
|
+
* `begin_stream` or `generate_with_params`.
|
|
22
|
+
*
|
|
23
|
+
* Stop sequences are cleared by `reset()` or `clear_stop_sequences()`.
|
|
24
|
+
*
|
|
25
|
+
* ```javascript
|
|
26
|
+
* engine.add_stop_sequence("<|im_end|>");
|
|
27
|
+
* engine.add_stop_sequence("</s>");
|
|
28
|
+
* engine.begin_stream_with_params(promptIds, 200, 0.8, 0.95, 40, 1.1);
|
|
29
|
+
* ```
|
|
30
|
+
*/
|
|
31
|
+
add_stop_sequence(sequence: string): void;
|
|
32
|
+
/**
|
|
33
|
+
* Format a user message (and optional system prompt) using the model's
|
|
34
|
+
* auto-detected chat template. Returns the formatted prompt string ready
|
|
35
|
+
* to be passed to `FlareTokenizer.encode()`.
|
|
36
|
+
*
|
|
37
|
+
* Pass an empty string for `system_message` to omit the system turn.
|
|
38
|
+
*
|
|
39
|
+
* # JS example
|
|
40
|
+
* ```javascript
|
|
41
|
+
* const prompt = engine.apply_chat_template(
|
|
42
|
+
* 'Explain quantum computing in simple terms.',
|
|
43
|
+
* 'You are a helpful assistant.'
|
|
44
|
+
* );
|
|
45
|
+
* const ids = tokenizer.encode(prompt);
|
|
46
|
+
* const output = engine.generate_tokens(ids, 128);
|
|
47
|
+
* ```
|
|
48
|
+
*/
|
|
49
|
+
apply_chat_template(user_message: string, system_message: string): string;
|
|
50
|
+
/**
|
|
51
|
+
* Prepare for token-by-token streaming.
|
|
52
|
+
*
|
|
53
|
+
* Runs the prefill pass on `prompt_tokens`, then initialises internal
|
|
54
|
+
* state so that subsequent calls to `next_token()` each produce one
|
|
55
|
+
* output token. Call `engine.reset()` before `begin_stream()` to start
|
|
56
|
+
* a fresh conversation.
|
|
57
|
+
*
|
|
58
|
+
* # JS example
|
|
59
|
+
* ```javascript
|
|
60
|
+
* engine.reset();
|
|
61
|
+
* engine.begin_stream(promptIds, 128);
|
|
62
|
+
* function tick() {
|
|
63
|
+
* const id = engine.next_token();
|
|
64
|
+
* if (id === undefined) return; // done
|
|
65
|
+
* output.textContent += tokenizer.decode_one(id);
|
|
66
|
+
* requestAnimationFrame(tick); // yield to browser, then continue
|
|
67
|
+
* }
|
|
68
|
+
* requestAnimationFrame(tick);
|
|
69
|
+
* ```
|
|
70
|
+
*/
|
|
71
|
+
begin_stream(prompt_tokens: Uint32Array, max_tokens: number): void;
|
|
72
|
+
/**
|
|
73
|
+
* Begin a token-by-token stream, healing the last prompt token.
|
|
74
|
+
*
|
|
75
|
+
* Identical to `begin_stream` but avoids double-processing the final prompt
|
|
76
|
+
* token: the prefill runs only tokens `[0 .. n-2]`, then the first
|
|
77
|
+
* `next_token()` call processes the last prompt token at its correct
|
|
78
|
+
* position `n-1` and produces the first output token. This keeps RoPE
|
|
79
|
+
* positional embeddings consistent and is recommended when the prompt
|
|
80
|
+
* ends at a natural token boundary (e.g. when encoding a user turn in a
|
|
81
|
+
* chat template).
|
|
82
|
+
*
|
|
83
|
+
* Falls back to `begin_stream` for prompts shorter than 2 tokens.
|
|
84
|
+
*
|
|
85
|
+
* # JS example
|
|
86
|
+
* ```javascript
|
|
87
|
+
* engine.reset();
|
|
88
|
+
* const ids = engine.encode_text(engine.apply_chat_template(userMsg, sysMsg));
|
|
89
|
+
* engine.begin_stream_healed(ids, 256);
|
|
90
|
+
* requestAnimationFrame(function tick() {
|
|
91
|
+
* const id = engine.next_token();
|
|
92
|
+
* if (id !== undefined) output.textContent += tokenizer.decode_one(id);
|
|
93
|
+
* if (!engine.stream_done) requestAnimationFrame(tick);
|
|
94
|
+
* });
|
|
95
|
+
* ```
|
|
96
|
+
*/
|
|
97
|
+
begin_stream_healed(prompt_tokens: Uint32Array, max_tokens: number): void;
|
|
98
|
+
/**
|
|
99
|
+
* Like `begin_stream_healed` but with full sampling parameters.
|
|
100
|
+
*
|
|
101
|
+
* Combines position-consistent prefill (see `begin_stream_healed`) with
|
|
102
|
+
* the same temperature / top-p / top-k / repeat-penalty / min-p controls
|
|
103
|
+
* available in `begin_stream_with_params`.
|
|
104
|
+
*
|
|
105
|
+
* # JS example
|
|
106
|
+
* ```javascript
|
|
107
|
+
* engine.reset();
|
|
108
|
+
* const ids = engine.encode_text(engine.apply_chat_template(userMsg, sysMsg));
|
|
109
|
+
* engine.begin_stream_healed_with_params(ids, 256, 0.8, 0.95, 40, 1.1, 0.0);
|
|
110
|
+
* requestAnimationFrame(function tick() {
|
|
111
|
+
* const id = engine.next_token();
|
|
112
|
+
* if (id !== undefined) output.textContent += tokenizer.decode_one(id);
|
|
113
|
+
* if (!engine.stream_done) requestAnimationFrame(tick);
|
|
114
|
+
* });
|
|
115
|
+
* ```
|
|
116
|
+
*/
|
|
117
|
+
begin_stream_healed_with_params(prompt_tokens: Uint32Array, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number): void;
|
|
118
|
+
/**
|
|
119
|
+
* Like `begin_stream` but with temperature / top-p sampling.
|
|
120
|
+
*
|
|
121
|
+
* `temperature`: 0.0 = greedy, 0.7–1.0 = typical creative range.
|
|
122
|
+
* `top_p`: nucleus sampling threshold (0.0–1.0); 0.9 is a good default.
|
|
123
|
+
*
|
|
124
|
+
* # JS example
|
|
125
|
+
* ```javascript
|
|
126
|
+
* engine.reset();
|
|
127
|
+
* engine.begin_stream_with_params(promptIds, 128, 0.8, 0.9, 40, 1.1, 0.0);
|
|
128
|
+
* function tick() {
|
|
129
|
+
* const id = engine.next_token();
|
|
130
|
+
* if (id === undefined) return;
|
|
131
|
+
* output.textContent += tokenizer.decode_one(id);
|
|
132
|
+
* requestAnimationFrame(tick);
|
|
133
|
+
* }
|
|
134
|
+
* requestAnimationFrame(tick);
|
|
135
|
+
* ```
|
|
136
|
+
* Begin a token-by-token stream with sampling parameters including top-k.
|
|
137
|
+
*
|
|
138
|
+
* - `temperature`: controls randomness (0 = greedy, higher = more random)
|
|
139
|
+
* - `top_p`: nucleus sampling — keep the smallest token set whose cumulative
|
|
140
|
+
* probability ≥ `top_p` (1.0 = disabled; applied when < 1.0)
|
|
141
|
+
* - `top_k`: keep only the `top_k` highest-probability tokens before sampling
|
|
142
|
+
* (0 = disabled; applied when `top_p` is 1.0 and `top_k` > 0)
|
|
143
|
+
* - `repeat_penalty`: penalty applied to logits of recently-seen tokens to
|
|
144
|
+
* reduce repetition (1.0 = disabled, 1.1–1.3 = typical range)
|
|
145
|
+
*
|
|
146
|
+
* ```javascript
|
|
147
|
+
* engine.begin_stream_with_params(promptIds, 200, 0.8, 0.95, 40, 1.1, 0.0);
|
|
148
|
+
* ```
|
|
149
|
+
*/
|
|
150
|
+
begin_stream_with_params(prompt_tokens: Uint32Array, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number): void;
|
|
151
|
+
/**
|
|
152
|
+
* Clear any previously loaded raw quantized weights.
|
|
153
|
+
*
|
|
154
|
+
* After calling this the engine uses the f32 dequantized path for all
|
|
155
|
+
* matrix operations until `load_raw_weights` is called again.
|
|
156
|
+
*/
|
|
157
|
+
clear_raw_weights(): void;
|
|
158
|
+
/**
|
|
159
|
+
* Remove all registered stop sequences.
|
|
160
|
+
*/
|
|
161
|
+
clear_stop_sequences(): void;
|
|
162
|
+
/**
|
|
163
|
+
* Compute the perplexity of `text` under the loaded model.
|
|
164
|
+
*
|
|
165
|
+
* Encodes `text` with the embedded GGUF vocabulary, runs one forward pass
|
|
166
|
+
* per token, and measures the log-probability of each correct next-token
|
|
167
|
+
* prediction. Perplexity = exp(−mean(log_probs)).
|
|
168
|
+
*
|
|
169
|
+
* The KV cache is reset **before and after** the evaluation so the engine
|
|
170
|
+
* returns to a clean state.
|
|
171
|
+
*
|
|
172
|
+
* Returns `f32::INFINITY` if the text encodes to fewer than 2 tokens or if
|
|
173
|
+
* no GGUF vocabulary is available.
|
|
174
|
+
*
|
|
175
|
+
* # JS example
|
|
176
|
+
* ```javascript
|
|
177
|
+
* const ppl = engine.compute_perplexity("The quick brown fox");
|
|
178
|
+
* console.log("Perplexity:", ppl);
|
|
179
|
+
* ```
|
|
180
|
+
*/
|
|
181
|
+
compute_perplexity(text: string): number;
|
|
182
|
+
/**
|
|
183
|
+
* Count the number of tokens in `text` using the model's embedded GGUF vocabulary.
|
|
184
|
+
*
|
|
185
|
+
* Returns 0 if the model was not loaded from a GGUF file (e.g. SafeTensors only).
|
|
186
|
+
*
|
|
187
|
+
* # JS example
|
|
188
|
+
* ```javascript
|
|
189
|
+
* const n = engine.count_tokens(textarea.value);
|
|
190
|
+
* counter.textContent = `${n} / ${engine.max_seq_len} tokens`;
|
|
191
|
+
* ```
|
|
192
|
+
*/
|
|
193
|
+
count_tokens(text: string): number;
|
|
194
|
+
/**
|
|
195
|
+
* Decode token IDs to text using the embedded GGUF vocabulary.
|
|
196
|
+
*
|
|
197
|
+
* Returns an empty string if no GGUF vocab is available.
|
|
198
|
+
*
|
|
199
|
+
* # JS example
|
|
200
|
+
* ```javascript
|
|
201
|
+
* const text = engine.decode_ids(generatedIds);
|
|
202
|
+
* ```
|
|
203
|
+
*/
|
|
204
|
+
decode_ids(ids: Uint32Array): string;
|
|
205
|
+
/**
|
|
206
|
+
* Decode a single token ID to its text piece.
|
|
207
|
+
*
|
|
208
|
+
* Convenience wrapper around `decode_ids` for use directly inside a
|
|
209
|
+
* `next_token()` loop so callers don't need a separate `FlareTokenizer`.
|
|
210
|
+
*
|
|
211
|
+
* Returns an empty string if no GGUF vocab is loaded.
|
|
212
|
+
*
|
|
213
|
+
* # JS example
|
|
214
|
+
* ```javascript
|
|
215
|
+
* engine.begin_stream(promptIds, 128);
|
|
216
|
+
* requestAnimationFrame(function tick() {
|
|
217
|
+
* const id = engine.next_token();
|
|
218
|
+
* if (id !== undefined) output.textContent += engine.decode_token(id);
|
|
219
|
+
* if (!engine.stream_done) requestAnimationFrame(tick);
|
|
220
|
+
* });
|
|
221
|
+
* ```
|
|
222
|
+
*/
|
|
223
|
+
decode_token(id: number): string;
|
|
224
|
+
/**
|
|
225
|
+
* Decode a single token ID, correctly handling multi-byte UTF-8 sequences.
|
|
226
|
+
*
|
|
227
|
+
* SentencePiece tokenizers encode non-ASCII characters as consecutive
|
|
228
|
+
* byte-level tokens such as `<0xE4>`, `<0xB8>`, `<0xAD>` (the UTF-8
|
|
229
|
+
* encoding of `中`). The basic `decode_token` function returns incorrect
|
|
230
|
+
* Latin-1 characters in these cases because it treats each byte as an
|
|
231
|
+
* independent Unicode scalar.
|
|
232
|
+
*
|
|
233
|
+
* `decode_token_chunk` accumulates bytes in an internal buffer until a
|
|
234
|
+
* complete, valid UTF-8 sequence is assembled, then returns it as a
|
|
235
|
+
* `String`. While the sequence is incomplete it returns an empty string,
|
|
236
|
+
* and when a regular (non-byte) token is encountered it flushes any
|
|
237
|
+
* buffered bytes (replacing invalid sequences with U+FFFD) before
|
|
238
|
+
* returning the decoded text.
|
|
239
|
+
*
|
|
240
|
+
* **Use this instead of `decode_token` whenever you are streaming tokens
|
|
241
|
+
* that may include non-Latin characters.**
|
|
242
|
+
*
|
|
243
|
+
* ```javascript
|
|
244
|
+
* engine.begin_stream(prompt, 256);
|
|
245
|
+
* function tick() {
|
|
246
|
+
* const id = engine.next_token();
|
|
247
|
+
* if (id !== undefined) output.textContent += engine.decode_token_chunk(id);
|
|
248
|
+
* if (!engine.stream_done) requestAnimationFrame(tick);
|
|
249
|
+
* }
|
|
250
|
+
* requestAnimationFrame(tick);
|
|
251
|
+
* ```
|
|
252
|
+
*/
|
|
253
|
+
decode_token_chunk(id: number): string;
|
|
254
|
+
/**
|
|
255
|
+
* Look up the token embedding row for `token_id` as a flat `Float32Array`.
|
|
256
|
+
*
|
|
257
|
+
* The length of the returned vector is `hidden_dim`. See also
|
|
258
|
+
* [`FlareEngine::output_projection`] for the inverse tail step.
|
|
259
|
+
*/
|
|
260
|
+
embed_token(token_id: number): Float32Array;
|
|
261
|
+
/**
|
|
262
|
+
* # JS example
|
|
263
|
+
* ```javascript
|
|
264
|
+
* const ids = engine.encode_text("Hello, world!");
|
|
265
|
+
* const output = engine.generate_tokens(ids, 64);
|
|
266
|
+
* ```
|
|
267
|
+
*/
|
|
268
|
+
encode_text(text: string): Uint32Array;
|
|
269
|
+
/**
|
|
270
|
+
* Streaming text-in / text-out generation with a per-token JS callback.
|
|
271
|
+
*
|
|
272
|
+
* Encodes `prompt` with the embedded GGUF vocabulary, generates up to
|
|
273
|
+
* `max_tokens` tokens, and calls `on_token(token_str)` with the decoded
|
|
274
|
+
* text for each token as it is produced. Returns the number of tokens
|
|
275
|
+
* generated (excluding any EOS token).
|
|
276
|
+
*
|
|
277
|
+
* Returns 0 if no GGUF vocab is available.
|
|
278
|
+
*
|
|
279
|
+
* # Note on browser streaming
|
|
280
|
+
* `on_token` is called synchronously inside WASM, so the browser will
|
|
281
|
+
* not visually update between tokens. For visible character-by-character
|
|
282
|
+
* output, use `begin_stream` + `next_token` with `requestAnimationFrame`.
|
|
283
|
+
*
|
|
284
|
+
* # JS example
|
|
285
|
+
* ```javascript
|
|
286
|
+
* engine.reset();
|
|
287
|
+
* let out = '';
|
|
288
|
+
* const count = engine.generate_stream("What is Rust?", 128, (token) => {
|
|
289
|
+
* out += token;
|
|
290
|
+
* });
|
|
291
|
+
* output.textContent = out;
|
|
292
|
+
* ```
|
|
293
|
+
*/
|
|
294
|
+
generate_stream(prompt: string, max_tokens: number, on_token: Function): number;
|
|
295
|
+
/**
|
|
296
|
+
* Streaming text-in / text-out with explicit sampling parameters.
|
|
297
|
+
*
|
|
298
|
+
* Like `generate_stream` but with the full set of sampling controls:
|
|
299
|
+
*
|
|
300
|
+
* - `temperature`: 0 = greedy, higher = more diverse
|
|
301
|
+
* - `top_p`: nucleus sampling (1.0 = disabled)
|
|
302
|
+
* - `top_k`: top-k sampling, applied when `top_p` is 1.0 and `min_p` is 0.0 (0 = disabled)
|
|
303
|
+
* - `repeat_penalty`: repetition penalty (1.0 = disabled, 1.1–1.3 = typical)
|
|
304
|
+
* - `min_p`: min-p threshold (0.0 = disabled)
|
|
305
|
+
*
|
|
306
|
+
* Encodes `prompt` with the embedded GGUF vocabulary, generates up to
|
|
307
|
+
* `max_tokens` tokens, and calls `on_token(token_str)` with the decoded
|
|
308
|
+
* text for each token. Respects stop sequences registered via
|
|
309
|
+
* `add_stop_sequence`. Returns the number of tokens generated.
|
|
310
|
+
*
|
|
311
|
+
* Returns 0 if no GGUF vocab is available.
|
|
312
|
+
*
|
|
313
|
+
* # JS example
|
|
314
|
+
* ```javascript
|
|
315
|
+
* engine.add_stop_sequence("<|im_end|>");
|
|
316
|
+
* engine.reset();
|
|
317
|
+
* let out = '';
|
|
318
|
+
* const count = engine.generate_stream_with_params(
|
|
319
|
+
* prompt, 200, 0.8, 0.95, 40, 1.1, 0.0,
|
|
320
|
+
* (token) => { out += token; }
|
|
321
|
+
* );
|
|
322
|
+
* ```
|
|
323
|
+
*/
|
|
324
|
+
generate_stream_with_params(prompt: string, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number, on_token: Function): number;
|
|
325
|
+
/**
|
|
326
|
+
* Full text-in / text-out generation using the embedded GGUF vocabulary.
|
|
327
|
+
*
|
|
328
|
+
* Encodes `prompt` with the embedded vocab, runs greedy generation for up
|
|
329
|
+
* to `max_tokens` steps, then decodes the output back to text. Stops
|
|
330
|
+
* automatically at EOS.
|
|
331
|
+
*
|
|
332
|
+
* Returns an empty string if no GGUF vocab is available.
|
|
333
|
+
*
|
|
334
|
+
* # JS example
|
|
335
|
+
* ```javascript
|
|
336
|
+
* engine.reset();
|
|
337
|
+
* const response = engine.generate_text("What is Rust?", 128);
|
|
338
|
+
* output.textContent = response;
|
|
339
|
+
* ```
|
|
340
|
+
*/
|
|
341
|
+
generate_text(prompt: string, max_tokens: number): string;
|
|
342
|
+
/**
|
|
343
|
+
* Full text-in / text-out generation with explicit sampling parameters.
|
|
344
|
+
*
|
|
345
|
+
* Like `generate_text` but with the full set of sampling controls:
|
|
346
|
+
*
|
|
347
|
+
* - `temperature`: 0 = greedy, higher = more diverse
|
|
348
|
+
* - `top_p`: nucleus sampling (1.0 = disabled)
|
|
349
|
+
* - `top_k`: top-k sampling, applied when `top_p` is 1.0 and `min_p` is 0.0 (0 = disabled)
|
|
350
|
+
* - `repeat_penalty`: repetition penalty (1.0 = disabled)
|
|
351
|
+
* - `min_p`: min-p threshold (0.0 = disabled)
|
|
352
|
+
*
|
|
353
|
+
* Returns the decoded generated text. Returns an empty string if no GGUF vocab is available.
|
|
354
|
+
* Respects stop sequences registered via `add_stop_sequence`.
|
|
355
|
+
*
|
|
356
|
+
* # JS example
|
|
357
|
+
* ```javascript
|
|
358
|
+
* engine.reset();
|
|
359
|
+
* const response = engine.generate_text_with_params(
|
|
360
|
+
* "What is Rust?", 128, 0.8, 0.95, 40, 1.1, 0.0
|
|
361
|
+
* );
|
|
362
|
+
* output.textContent = response;
|
|
363
|
+
* ```
|
|
364
|
+
*/
|
|
365
|
+
generate_text_with_params(prompt: string, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number): string;
|
|
366
|
+
/**
|
|
367
|
+
* Generate `max_tokens` tokens starting from `prompt_tokens` (greedy).
|
|
368
|
+
* Stops early at EOS. Returns a Uint32Array of generated token IDs.
|
|
369
|
+
*/
|
|
370
|
+
generate_tokens(prompt_tokens: Uint32Array, max_tokens: number): Uint32Array;
|
|
371
|
+
/**
|
|
372
|
+
* Generate a batch of tokens with explicit sampling parameters.
|
|
373
|
+
*
|
|
374
|
+
* - `temperature`: 0 = greedy, higher = more diverse
|
|
375
|
+
* - `top_p`: nucleus sampling (1.0 = disabled)
|
|
376
|
+
* - `top_k`: top-k sampling, applied when `top_p` is 1.0 and `min_p` is 0.0 (0 = disabled)
|
|
377
|
+
* - `repeat_penalty`: repetition penalty applied to recently-seen tokens (1.0 = disabled)
|
|
378
|
+
* - `min_p`: min-p threshold (0.0 = disabled); applied after `top_p`, before `top_k`
|
|
379
|
+
*
|
|
380
|
+
* Stops early at EOS. Uses a fixed LCG RNG seed for reproducibility.
|
|
381
|
+
*/
|
|
382
|
+
generate_with_params(prompt_tokens: Uint32Array, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number): Uint32Array;
|
|
383
|
+
/**
|
|
384
|
+
* Try to initialise the WebGPU compute backend.
|
|
385
|
+
*
|
|
386
|
+
* Call this after `load()` to enable GPU-accelerated matrix operations
|
|
387
|
+
* (matvec, matmul, silu_mul). Falls back silently to CPU if WebGPU is
|
|
388
|
+
* unavailable or adapter request fails.
|
|
389
|
+
*
|
|
390
|
+
* Returns `true` if a GPU backend was successfully initialised.
|
|
391
|
+
*
|
|
392
|
+
* ```javascript
|
|
393
|
+
* const engine = FlareEngine.load(bytes);
|
|
394
|
+
* const gpuEnabled = await engine.init_gpu();
|
|
395
|
+
* console.log('GPU:', gpuEnabled);
|
|
396
|
+
* ```
|
|
397
|
+
*/
|
|
398
|
+
init_gpu(): Promise<boolean>;
|
|
399
|
+
/**
|
|
400
|
+
* Initialise the WebGPU backend using previously serialised pipeline cache
|
|
401
|
+
* bytes (from `engine.pipeline_cache_data()`).
|
|
402
|
+
*
|
|
403
|
+
* On backends that support driver-managed pipeline caches (Vulkan native),
|
|
404
|
+
* this allows the driver to reuse compiled GPU machine code from a previous
|
|
405
|
+
* run, eliminating cold-start shader recompilation (typically 100ms–2s).
|
|
406
|
+
*
|
|
407
|
+
* On unsupported backends (WebGPU, Metal, DX12) this behaves identically to
|
|
408
|
+
* `init_gpu()` — the cache bytes are silently ignored.
|
|
409
|
+
*
|
|
410
|
+
* ```javascript
|
|
411
|
+
* const cached = localStorage.getItem('flare-pipeline-cache');
|
|
412
|
+
* const cacheBytes = cached ? new Uint8Array(JSON.parse(cached)) : new Uint8Array();
|
|
413
|
+
* await engine.init_gpu_with_cache(cacheBytes);
|
|
414
|
+
* // After inference, persist the cache:
|
|
415
|
+
* const data = engine.pipeline_cache_data();
|
|
416
|
+
* if (data.length > 0) {
|
|
417
|
+
* localStorage.setItem('flare-pipeline-cache', JSON.stringify(Array.from(data)));
|
|
418
|
+
* }
|
|
419
|
+
* ```
|
|
420
|
+
*/
|
|
421
|
+
init_gpu_with_cache(cache_data: Uint8Array): Promise<boolean>;
|
|
422
|
+
/**
|
|
423
|
+
* Load a GGUF model from a Uint8Array of bytes (e.g. from `fetch`).
|
|
424
|
+
*/
|
|
425
|
+
static load(gguf_bytes: Uint8Array): FlareEngine;
|
|
426
|
+
/**
|
|
427
|
+
* Load raw quantized weights from GGUF bytes so the GPU fused
|
|
428
|
+
* dequant+matvec kernels can be used during inference.
|
|
429
|
+
*
|
|
430
|
+
* Call this **after** `init_gpu()` so the backend is set before the raw
|
|
431
|
+
* weights are attached. The method is a no-op (returns `false`) if a
|
|
432
|
+
* layer's weights are in an unsupported quantization format — the engine
|
|
433
|
+
* continues to work using the f32 path loaded at `FlareEngine.load()`.
|
|
434
|
+
*
|
|
435
|
+
* Returns `true` if all layers were loaded successfully, `false` if any
|
|
436
|
+
* layer fell back to the f32 path.
|
|
437
|
+
*
|
|
438
|
+
* ```javascript
|
|
439
|
+
* const engine = FlareEngine.load(bytes);
|
|
440
|
+
* await engine.init_gpu();
|
|
441
|
+
* const ok = engine.load_raw_weights(bytes);
|
|
442
|
+
* console.log('Raw weights loaded:', ok);
|
|
443
|
+
* ```
|
|
444
|
+
*/
|
|
445
|
+
load_raw_weights(gguf_bytes: Uint8Array): boolean;
|
|
446
|
+
/**
|
|
447
|
+
* Merge a LoRA adapter (SafeTensors format) into the model weights.
|
|
448
|
+
*
|
|
449
|
+
* Pass the raw bytes of a `.safetensors` file containing LoRA A/B matrices.
|
|
450
|
+
* After merging, the adapter's effect is permanent for this engine instance;
|
|
451
|
+
* call `FlareEngine.load()` again to restore the base model.
|
|
452
|
+
*
|
|
453
|
+
* ```javascript
|
|
454
|
+
* const resp = await fetch('lora-adapter.safetensors');
|
|
455
|
+
* const bytes = new Uint8Array(await resp.arrayBuffer());
|
|
456
|
+
* engine.merge_lora(bytes);
|
|
457
|
+
* ```
|
|
458
|
+
*/
|
|
459
|
+
merge_lora(adapter_bytes: Uint8Array): void;
|
|
460
|
+
/**
|
|
461
|
+
* Merge a LoRA adapter with a custom alpha scaling factor.
|
|
462
|
+
*
|
|
463
|
+
* Same as `merge_lora` but overrides the alpha value embedded in the
|
|
464
|
+
* adapter file. The effective scaling is `alpha / rank`.
|
|
465
|
+
*/
|
|
466
|
+
merge_lora_with_alpha(adapter_bytes: Uint8Array, alpha: number): void;
|
|
467
|
+
/**
|
|
468
|
+
* Generate and return the next token ID, or `undefined` when the stream
|
|
469
|
+
* is complete (EOS reached, `max_tokens` exhausted, or `stop_stream()`
|
|
470
|
+
* was called).
|
|
471
|
+
*
|
|
472
|
+
* Sampling parameters are those set by the most recent `begin_stream` or
|
|
473
|
+
* `begin_stream_with_params` call. Call this inside
|
|
474
|
+
* `requestAnimationFrame` so the browser can update the DOM between
|
|
475
|
+
* tokens and the page remains responsive.
|
|
476
|
+
*/
|
|
477
|
+
next_token(): number | undefined;
|
|
478
|
+
/**
|
|
479
|
+
* Apply final RMSNorm + output projection to a hidden state and
|
|
480
|
+
* return logits over the vocabulary.
|
|
481
|
+
*
|
|
482
|
+
* `hidden` must have length `hidden_dim`. The returned vector has
|
|
483
|
+
* length `vocab_size`.
|
|
484
|
+
*/
|
|
485
|
+
output_projection(hidden: Float32Array): Float32Array;
|
|
486
|
+
/**
|
|
487
|
+
* Return a JSON string summarising the performance metrics from the last
|
|
488
|
+
* generation call.
|
|
489
|
+
*
|
|
490
|
+
* ```javascript
|
|
491
|
+
* const perf = JSON.parse(engine.performance_summary());
|
|
492
|
+
* console.log(`TTFT: ${perf.prefill_ms.toFixed(1)} ms`);
|
|
493
|
+
* console.log(`Decode: ${perf.tokens_per_second.toFixed(1)} tok/s`);
|
|
494
|
+
* ```
|
|
495
|
+
*/
|
|
496
|
+
performance_summary(): string;
|
|
497
|
+
/**
|
|
498
|
+
* Reset the KV cache (start a new conversation).
|
|
499
|
+
*
|
|
500
|
+
* Also clears stop sequences, the internal text accumulator, and
|
|
501
|
+
* restores the RNG seed to the default `0x12345678`.
|
|
502
|
+
*/
|
|
503
|
+
reset(): void;
|
|
504
|
+
/**
|
|
505
|
+
* Set the repetition-penalty look-back window (number of recent tokens to
|
|
506
|
+
* penalise). Use `0` to disable repetition penalty entirely. Default: 64.
|
|
507
|
+
*
|
|
508
|
+
* Takes effect on the next `begin_stream*` call.
|
|
509
|
+
*
|
|
510
|
+
* # JS example
|
|
511
|
+
* ```javascript
|
|
512
|
+
* engine.set_repeat_last_n(128); // wider window for creative writing
|
|
513
|
+
* engine.set_repeat_last_n(0); // disable repeat penalty
|
|
514
|
+
* ```
|
|
515
|
+
*/
|
|
516
|
+
set_repeat_last_n(n: number): void;
|
|
517
|
+
/**
|
|
518
|
+
* Set the LCG RNG seed used for the next sampled generation call.
|
|
519
|
+
*
|
|
520
|
+
* Controls the random state passed to `begin_stream_with_params` and
|
|
521
|
+
* `generate_with_params`, enabling reproducible outputs. The seed is
|
|
522
|
+
* applied on the next call and then *not* automatically reset, so the
|
|
523
|
+
* same seed will be reused on subsequent calls unless `set_rng_seed` or
|
|
524
|
+
* `reset()` is called again.
|
|
525
|
+
*
|
|
526
|
+
* `reset()` restores the seed to the default `0x12345678`.
|
|
527
|
+
*
|
|
528
|
+
* ```javascript
|
|
529
|
+
* engine.set_rng_seed(42);
|
|
530
|
+
* const out1 = engine.generate_text("Hello", 50);
|
|
531
|
+
* engine.set_rng_seed(42);
|
|
532
|
+
* const out2 = engine.generate_text("Hello", 50);
|
|
533
|
+
* // out1 === out2
|
|
534
|
+
* ```
|
|
535
|
+
*/
|
|
536
|
+
set_rng_seed(seed: number): void;
|
|
537
|
+
/**
|
|
538
|
+
* Set how many top log-probability entries to capture after each forward
|
|
539
|
+
* pass. Pass `0` (the default) to disable and save the computation.
|
|
540
|
+
*
|
|
541
|
+
* When enabled, `top_logprobs` is populated after every `next_token()`
|
|
542
|
+
* call and after every token in `generate_stream_with_params`.
|
|
543
|
+
*
|
|
544
|
+
* # JS example
|
|
545
|
+
* ```javascript
|
|
546
|
+
* engine.set_top_logprobs(5);
|
|
547
|
+
* engine.begin_stream(promptIds, 64);
|
|
548
|
+
* while (!engine.stream_done) {
|
|
549
|
+
* engine.next_token();
|
|
550
|
+
* const lp = engine.top_logprobs; // Float32Array [id0, lp0, id1, lp1, ...]
|
|
551
|
+
* }
|
|
552
|
+
* ```
|
|
553
|
+
*/
|
|
554
|
+
set_top_logprobs(n: number): void;
|
|
555
|
+
/**
|
|
556
|
+
* Signal the current stream to stop after the next `next_token()` call.
|
|
557
|
+
* The JS Stop button should call this, then wait for `next_token()` to
|
|
558
|
+
* return `undefined` before updating the UI.
|
|
559
|
+
*/
|
|
560
|
+
stop_stream(): void;
|
|
561
|
+
/**
|
|
562
|
+
* Truncate `text` so that it fits within `budget` tokens when encoded.
|
|
563
|
+
*
|
|
564
|
+
* Encodes `text` with the embedded GGUF vocabulary, keeps the **last**
|
|
565
|
+
* `budget` tokens (tail of the text is preferred, so recent context is
|
|
566
|
+
* preserved), and decodes them back to a string. Returns `text` unchanged
|
|
567
|
+
* if it already fits or if no vocab is available.
|
|
568
|
+
*
|
|
569
|
+
* A typical call reserves space for the system prompt + generated output:
|
|
570
|
+
*
|
|
571
|
+
* ```javascript
|
|
572
|
+
* // Keep only the tail of the conversation that fits in the context
|
|
573
|
+
* const budget = engine.max_seq_len - 256; // leave 256 tokens for output
|
|
574
|
+
* const trimmed = engine.truncate_to_context(conversationText, budget);
|
|
575
|
+
* ```
|
|
576
|
+
*/
|
|
577
|
+
truncate_to_context(text: string, budget: number): string;
|
|
578
|
+
/**
|
|
579
|
+
* Run a single dummy forward pass to pre-compile WebGPU shader pipelines.
|
|
580
|
+
*
|
|
581
|
+
* WebGPU (and wgpu on native) compiles shader pipelines lazily on the
|
|
582
|
+
* first dispatch. This causes a noticeable latency spike — often 100ms
|
|
583
|
+
* to several seconds — when the user makes their first inference request.
|
|
584
|
+
*
|
|
585
|
+
* Call `warmup()` once after `init_gpu()` completes to trigger all shader
|
|
586
|
+
* compilations in the background so the first real request feels fast.
|
|
587
|
+
* The KV cache is reset after the warmup so the engine is in a clean state.
|
|
588
|
+
*
|
|
589
|
+
* Returns `true` if the warmup forward pass ran without error, `false` if
|
|
590
|
+
* the model has not been loaded.
|
|
591
|
+
*
|
|
592
|
+
* # JS example
|
|
593
|
+
* ```javascript
|
|
594
|
+
* const engine = FlareEngine.load(bytes);
|
|
595
|
+
* await engine.init_gpu();
|
|
596
|
+
* engine.warmup(); // trigger shader compilation
|
|
597
|
+
* // First real inference is now fast
|
|
598
|
+
* engine.begin_stream(promptIds, 128);
|
|
599
|
+
* ```
|
|
600
|
+
*/
|
|
601
|
+
warmup(): boolean;
|
|
602
|
+
/**
|
|
603
|
+
* Whether the model requests automatic BOS token prepending.
|
|
604
|
+
*
|
|
605
|
+
* Sourced from `tokenizer.ggml.add_bos_token` in the GGUF metadata.
|
|
606
|
+
* When `true`, all generation methods (`generate_tokens`, `begin_stream`,
|
|
607
|
+
* `generate_text`, `generate_stream`) automatically prepend the BOS token
|
|
608
|
+
* to the input token sequence unless it is already the first token.
|
|
609
|
+
*/
|
|
610
|
+
readonly add_bos_token: boolean;
|
|
611
|
+
/**
|
|
612
|
+
* Model architecture name from `general.architecture` in the GGUF metadata.
|
|
613
|
+
*
|
|
614
|
+
* Returns a lowercase string such as `"llama"`, `"mistral"`, `"gemma2"`,
|
|
615
|
+
* `"phi3"`, or `"qwen2"`. Returns `"unknown"` if the field is absent.
|
|
616
|
+
*/
|
|
617
|
+
readonly architecture: string;
|
|
618
|
+
/**
|
|
619
|
+
* BOS (beginning of sequence) token ID from the GGUF model metadata, if present.
|
|
620
|
+
* Some models require this to be prepended to the input token sequence.
|
|
621
|
+
*/
|
|
622
|
+
readonly bos_token_id: number | undefined;
|
|
623
|
+
/**
|
|
624
|
+
* Name of the auto-detected chat template (e.g. `"ChatML"`, `"Llama3"`,
|
|
625
|
+
* `"Alpaca"`, `"Raw"`). Use this to display the template in the UI and
|
|
626
|
+
* decide whether to call `apply_chat_template` before encoding.
|
|
627
|
+
*/
|
|
628
|
+
readonly chat_template_name: string;
|
|
629
|
+
/**
|
|
630
|
+
* Fraction of the context window consumed (0.0 = empty, 1.0 = full).
|
|
631
|
+
*
|
|
632
|
+
* Equivalent to `tokens_used / max_seq_len`. Returns 0.0 if `max_seq_len` is 0.
|
|
633
|
+
*/
|
|
634
|
+
readonly context_window_pct: number;
|
|
635
|
+
/**
|
|
636
|
+
* EOS (end of sequence) token ID from the GGUF model metadata, if present.
|
|
637
|
+
* Generation stops automatically when this token is produced.
|
|
638
|
+
*/
|
|
639
|
+
readonly eos_token_id: number | undefined;
|
|
640
|
+
/**
|
|
641
|
+
* Returns `true` if raw quantized weights are currently loaded.
|
|
642
|
+
*/
|
|
643
|
+
readonly has_raw_weights: boolean;
|
|
644
|
+
/**
|
|
645
|
+
* Get the hidden dimension.
|
|
646
|
+
*/
|
|
647
|
+
readonly hidden_dim: number;
|
|
648
|
+
/**
|
|
649
|
+
* Milliseconds spent in decode steps of the last generation call.
|
|
650
|
+
*
|
|
651
|
+
* For batch generation (`generate_tokens` etc.) this is always 0 — see
|
|
652
|
+
* `last_prefill_ms` for the total time. For the streaming API this
|
|
653
|
+
* accumulates across all `next_token()` calls since the last
|
|
654
|
+
* `begin_stream()`.
|
|
655
|
+
*/
|
|
656
|
+
readonly last_decode_ms: number;
|
|
657
|
+
/**
|
|
658
|
+
* Raw pre-temperature logits from the most recent forward pass.
|
|
659
|
+
*
|
|
660
|
+
* Returns the full vocabulary logit vector as a `Float32Array`. These
|
|
661
|
+
* are the raw values *before* temperature scaling, repetition penalty,
|
|
662
|
+
* or any sampling filter — equivalent to the model's raw next-token
|
|
663
|
+
* distribution.
|
|
664
|
+
*
|
|
665
|
+
* Useful for:
|
|
666
|
+
* - Scoring candidate continuations (classification, ranking)
|
|
667
|
+
* - Computing perplexity / cross-entropy
|
|
668
|
+
* - Inspecting the model's "confidence" about the next token
|
|
669
|
+
*
|
|
670
|
+
* Returns an empty array before any inference has been run, and is
|
|
671
|
+
* cleared by `reset()`.
|
|
672
|
+
*
|
|
673
|
+
* ```javascript
|
|
674
|
+
* engine.begin_stream(promptIds, 1); // one token prefill+decode
|
|
675
|
+
* engine.next_token();
|
|
676
|
+
* const logits = engine.last_logits; // Float32Array of vocab_size
|
|
677
|
+
* const topTokenId = logits.indexOf(Math.max(...logits));
|
|
678
|
+
* ```
|
|
679
|
+
*/
|
|
680
|
+
readonly last_logits: Float32Array;
|
|
681
|
+
/**
|
|
682
|
+
* Milliseconds spent in the last prefill (prompt processing) phase.
|
|
683
|
+
*
|
|
684
|
+
* For `generate_tokens` / `generate_text` / `generate_with_params` this
|
|
685
|
+
* covers the entire call (prefill + decode are not separated internally).
|
|
686
|
+
* For the streaming API (`begin_stream` + `next_token`) this covers only
|
|
687
|
+
* the `begin_stream()` call.
|
|
688
|
+
*/
|
|
689
|
+
readonly last_prefill_ms: number;
|
|
690
|
+
/**
|
|
691
|
+
* Number of tokens generated by the last generation call (excludes prompt
|
|
692
|
+
* tokens and the EOS token itself).
|
|
693
|
+
*/
|
|
694
|
+
readonly last_tokens_generated: number;
|
|
695
|
+
/**
|
|
696
|
+
* Maximum sequence length (context window size) of the loaded model.
|
|
697
|
+
*
|
|
698
|
+
* Use this to warn users when their prompt is approaching the limit.
|
|
699
|
+
*/
|
|
700
|
+
readonly max_seq_len: number;
|
|
701
|
+
/**
|
|
702
|
+
* All GGUF model metadata as a JSON string.
|
|
703
|
+
*
|
|
704
|
+
* Returns a JSON object mapping each metadata key to its value.
|
|
705
|
+
* Large vocabulary arrays (`tokenizer.ggml.tokens`, `.merges`, `.scores`,
|
|
706
|
+
* `.added_tokens`) are omitted to keep the payload practical.
|
|
707
|
+
* Small arrays (≤ 64 entries) are included as JSON arrays.
|
|
708
|
+
*
|
|
709
|
+
* Returns `"{}"` if the model was not loaded from a GGUF file.
|
|
710
|
+
*
|
|
711
|
+
* ```javascript
|
|
712
|
+
* const meta = JSON.parse(engine.metadata_json);
|
|
713
|
+
* console.log(meta["llama.context_length"]); // e.g. 4096
|
|
714
|
+
* ```
|
|
715
|
+
*/
|
|
716
|
+
readonly metadata_json: string;
|
|
717
|
+
/**
|
|
718
|
+
* Model display name from `general.name` in the GGUF metadata.
|
|
719
|
+
*
|
|
720
|
+
* Returns the human-readable name embedded by the model author (e.g.
|
|
721
|
+
* `"Llama 3.2 1B Instruct"`). Returns an empty string if the field is absent.
|
|
722
|
+
*/
|
|
723
|
+
readonly model_name: string;
|
|
724
|
+
/**
|
|
725
|
+
* Get the number of attention heads.
|
|
726
|
+
*/
|
|
727
|
+
readonly num_heads: number;
|
|
728
|
+
/**
|
|
729
|
+
* Get the number of layers.
|
|
730
|
+
*/
|
|
731
|
+
readonly num_layers: number;
|
|
732
|
+
/**
|
|
733
|
+
* Serialise the driver-managed GPU pipeline cache to bytes.
|
|
734
|
+
*
|
|
735
|
+
* Returns an opaque blob that can be passed to `init_gpu_with_cache()` on
|
|
736
|
+
* the next startup to skip shader recompilation. Store it in
|
|
737
|
+
* `localStorage` or `IndexedDB` between page loads.
|
|
738
|
+
*
|
|
739
|
+
* Returns an empty `Uint8Array` if no GPU is active, or if the current
|
|
740
|
+
* backend does not support pipeline caching (WebGPU, Metal, DX12).
|
|
741
|
+
*/
|
|
742
|
+
readonly pipeline_cache_data: Uint8Array;
|
|
743
|
+
/**
|
|
744
|
+
* Raw Jinja2 chat template string from the GGUF model metadata, if present.
|
|
745
|
+
*
|
|
746
|
+
* This is the `tokenizer.chat_template` field embedded by the model author.
|
|
747
|
+
* Use this with a JavaScript Jinja2 renderer (e.g. `nunjucks`) for accurate
|
|
748
|
+
* prompt formatting across all model families, rather than relying on the
|
|
749
|
+
* simplified built-in `apply_chat_template`.
|
|
750
|
+
*
|
|
751
|
+
* Returns `undefined` if the GGUF file did not include a chat template.
|
|
752
|
+
*/
|
|
753
|
+
readonly raw_chat_template: string | undefined;
|
|
754
|
+
/**
|
|
755
|
+
* Current repetition-penalty window size (0 = disabled).
|
|
756
|
+
*/
|
|
757
|
+
readonly repeat_last_n: number;
|
|
758
|
+
/**
|
|
759
|
+
* Whether the current stream has finished.
|
|
760
|
+
*/
|
|
761
|
+
readonly stream_done: boolean;
|
|
762
|
+
/**
|
|
763
|
+
* Why the most-recent stream stopped.
|
|
764
|
+
*
|
|
765
|
+
* Returns one of:
|
|
766
|
+
* - `"eos"` — the model emitted the EOS token
|
|
767
|
+
* - `"length"` — `max_tokens` budget was exhausted
|
|
768
|
+
* - `"stop_sequence"` — a registered stop sequence was matched
|
|
769
|
+
* - `"user"` — `stop_stream()` was called
|
|
770
|
+
* - `""` (empty) — stream not yet started or still running
|
|
771
|
+
*
|
|
772
|
+
* # JS example
|
|
773
|
+
* ```javascript
|
|
774
|
+
* while (!engine.stream_done) engine.next_token();
|
|
775
|
+
* console.log("Stopped because:", engine.stream_stop_reason);
|
|
776
|
+
* ```
|
|
777
|
+
*/
|
|
778
|
+
readonly stream_stop_reason: string;
|
|
779
|
+
/**
|
|
780
|
+
* Decode throughput in tokens per second for the last generation call.
|
|
781
|
+
*
|
|
782
|
+
* For the streaming API this is calculated from `last_decode_ms`.
|
|
783
|
+
* For batch generation this is calculated from `last_prefill_ms`
|
|
784
|
+
* (the total call duration).
|
|
785
|
+
*
|
|
786
|
+
* Returns 0.0 if no generation has been run or if timing data is
|
|
787
|
+
* unavailable.
|
|
788
|
+
*/
|
|
789
|
+
readonly tokens_per_second: number;
|
|
790
|
+
/**
|
|
791
|
+
* How many tokens of context space remain before the window is full.
|
|
792
|
+
*
|
|
793
|
+
* Equivalent to `max_seq_len - tokens_used`. Returns 0 when the context is
|
|
794
|
+
* already full or `max_seq_len` is 0.
|
|
795
|
+
*
|
|
796
|
+
* # JS example
|
|
797
|
+
* ```javascript
|
|
798
|
+
* if (engine.tokens_remaining < 64) {
|
|
799
|
+
* console.warn("Context window almost full — consider resetting.");
|
|
800
|
+
* }
|
|
801
|
+
* ```
|
|
802
|
+
*/
|
|
803
|
+
readonly tokens_remaining: number;
|
|
804
|
+
/**
|
|
805
|
+
* Number of tokens currently consumed in the KV-cache session (prompt + generated).
|
|
806
|
+
*
|
|
807
|
+
* Updated after every generation call; reset to 0 by `engine.reset()`.
|
|
808
|
+
* Use with `max_seq_len` to build a context-usage progress bar.
|
|
809
|
+
*/
|
|
810
|
+
readonly tokens_used: number;
|
|
811
|
+
/**
|
|
812
|
+
* Interleaved top-N log-probabilities from the last forward pass.
|
|
813
|
+
*
|
|
814
|
+
* Layout: `[token_id_0 as f32, log_prob_0, token_id_1 as f32, log_prob_1, ...]`
|
|
815
|
+
* sorted by descending log-probability. Length is `top_logprobs_n * 2`.
|
|
816
|
+
*
|
|
817
|
+
* Returns an empty array if `set_top_logprobs(0)` (default) or before
|
|
818
|
+
* any inference has been run.
|
|
819
|
+
*/
|
|
820
|
+
readonly top_logprobs: Float32Array;
|
|
821
|
+
/**
|
|
822
|
+
* Get the vocabulary size of the loaded model.
|
|
823
|
+
*/
|
|
824
|
+
readonly vocab_size: number;
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
/**
|
|
828
|
+
* Progressive loader that fetches a GGUF model from a URL with streaming
|
|
829
|
+
* download progress.
|
|
830
|
+
*
|
|
831
|
+
* This enables the browser demo to show download progress as the model
|
|
832
|
+
* arrives over the network, then layer-loading progress as the model is
|
|
833
|
+
* parsed. For a 500MB Q4 model the download phase dominates; displaying
|
|
834
|
+
* progress prevents the page from appearing frozen.
|
|
835
|
+
*
|
|
836
|
+
* # JS example
|
|
837
|
+
*
|
|
838
|
+
* ```javascript
|
|
839
|
+
* const loader = new FlareProgressiveLoader('https://example.com/model.gguf');
|
|
840
|
+
* const engine = await loader.load((loaded, total) => {
|
|
841
|
+
* const pct = total > 0 ? Math.round(loaded / total * 100) : 0;
|
|
842
|
+
* progressBar.value = pct / 100;
|
|
843
|
+
* statusText.textContent = `Downloading… ${pct}%`;
|
|
844
|
+
* });
|
|
845
|
+
* ```
|
|
846
|
+
*/
|
|
847
|
+
export class FlareProgressiveLoader {
|
|
848
|
+
free(): void;
|
|
849
|
+
[Symbol.dispose](): void;
|
|
850
|
+
/**
|
|
851
|
+
* Fetch the model from the URL, calling `on_progress(loaded_bytes, total_bytes)`
|
|
852
|
+
* as each chunk arrives, then parse and return a `FlareEngine`.
|
|
853
|
+
*
|
|
854
|
+
* `total_bytes` is 0 when the server does not send a `Content-Length` header
|
|
855
|
+
* (e.g. when the response is gzip-compressed or chunked).
|
|
856
|
+
*/
|
|
857
|
+
load(on_progress: Function): Promise<FlareEngine>;
|
|
858
|
+
/**
|
|
859
|
+
* Create a loader for the given model URL.
|
|
860
|
+
*/
|
|
861
|
+
constructor(url: string);
|
|
862
|
+
}
|
|
863
|
+
|
|
864
|
+
/**
|
|
865
|
+
* BPE tokenizer exported to JS for encoding prompts and decoding generated tokens.
|
|
866
|
+
*
|
|
867
|
+
* Load from a HuggingFace `tokenizer.json` string, then use `encode` / `decode`
|
|
868
|
+
* in coordination with `FlareEngine` to run full text-in / text-out inference.
|
|
869
|
+
*
|
|
870
|
+
* # JS example
|
|
871
|
+
*
|
|
872
|
+
* ```javascript
|
|
873
|
+
* const resp = await fetch('tokenizer.json');
|
|
874
|
+
* const json = await resp.text();
|
|
875
|
+
* const tok = FlareTokenizer.from_json(json);
|
|
876
|
+
*
|
|
877
|
+
* const ids = tok.encode("Hello, world!");
|
|
878
|
+
* const engine = FlareEngine.load(modelBytes);
|
|
879
|
+
* const out = engine.generate_tokens(ids, 64);
|
|
880
|
+
* console.log(tok.decode(out));
|
|
881
|
+
* ```
|
|
882
|
+
*/
|
|
883
|
+
export class FlareTokenizer {
|
|
884
|
+
private constructor();
|
|
885
|
+
free(): void;
|
|
886
|
+
[Symbol.dispose](): void;
|
|
887
|
+
/**
|
|
888
|
+
* Decode a sequence of token IDs to text.
|
|
889
|
+
*/
|
|
890
|
+
decode(tokens: Uint32Array): string;
|
|
891
|
+
/**
|
|
892
|
+
* Decode a single token ID to text (useful for streaming output).
|
|
893
|
+
*/
|
|
894
|
+
decode_one(token_id: number): string;
|
|
895
|
+
/**
|
|
896
|
+
* Encode text to a sequence of token IDs.
|
|
897
|
+
*/
|
|
898
|
+
encode(text: string): Uint32Array;
|
|
899
|
+
/**
|
|
900
|
+
* Load a tokenizer from the text of a HuggingFace `tokenizer.json` file.
|
|
901
|
+
*/
|
|
902
|
+
static from_json(json: string): FlareTokenizer;
|
|
903
|
+
/**
|
|
904
|
+
* BOS (beginning of sequence) token ID, if defined.
|
|
905
|
+
*/
|
|
906
|
+
readonly bos_token_id: number | undefined;
|
|
907
|
+
/**
|
|
908
|
+
* EOS (end of sequence) token ID, if defined.
|
|
909
|
+
*/
|
|
910
|
+
readonly eos_token_id: number | undefined;
|
|
911
|
+
/**
|
|
912
|
+
* Vocabulary size.
|
|
913
|
+
*/
|
|
914
|
+
readonly vocab_size: number;
|
|
915
|
+
}
|
|
916
|
+
|
|
917
|
+
/**
|
|
918
|
+
* Save model bytes to OPFS.
|
|
919
|
+
*
|
|
920
|
+
* Creates the `flare-models` directory if it does not exist. Overwrites any
|
|
921
|
+
* existing file with the same name.
|
|
922
|
+
*/
|
|
923
|
+
export function cache_model(model_name: string, data: Uint8Array): Promise<void>;
|
|
924
|
+
|
|
925
|
+
/**
|
|
926
|
+
* Delete a cached model from OPFS.
|
|
927
|
+
*/
|
|
928
|
+
export function delete_cached_model(model_name: string): Promise<void>;
|
|
929
|
+
|
|
930
|
+
/**
|
|
931
|
+
* Get basic device info as a JSON string.
|
|
932
|
+
*/
|
|
933
|
+
export function device_info(): string;
|
|
934
|
+
|
|
935
|
+
/**
|
|
936
|
+
* Check if a model is cached in OPFS by name.
|
|
937
|
+
*
|
|
938
|
+
* Returns `false` if OPFS is unavailable or the model is not found.
|
|
939
|
+
*/
|
|
940
|
+
export function is_model_cached(model_name: string): Promise<boolean>;
|
|
941
|
+
|
|
942
|
+
/**
|
|
943
|
+
* List all cached models with their sizes (in bytes).
|
|
944
|
+
*
|
|
945
|
+
* Returns a JSON-serialised array of objects: `[{name: string, size: number}, ...]`.
|
|
946
|
+
* Returns `"[]"` if OPFS is unavailable or the models directory does not exist.
|
|
947
|
+
*/
|
|
948
|
+
export function list_cached_models(): Promise<any>;
|
|
949
|
+
|
|
950
|
+
/**
|
|
951
|
+
* Load model bytes from OPFS.
|
|
952
|
+
*
|
|
953
|
+
* Returns `null` (JS) / `None` (Rust) if the model is not cached or OPFS is
|
|
954
|
+
* unavailable.
|
|
955
|
+
*/
|
|
956
|
+
export function load_cached_model(model_name: string): Promise<any>;
|
|
957
|
+
|
|
958
|
+
/**
|
|
959
|
+
* Set up better panic messages in the browser console.
|
|
960
|
+
*/
|
|
961
|
+
export function start(): void;
|
|
962
|
+
|
|
963
|
+
/**
|
|
964
|
+
* Get storage usage and quota estimate.
|
|
965
|
+
*
|
|
966
|
+
* Returns a JSON string: `{usage: number, quota: number}`.
|
|
967
|
+
* Returns `"{}"` if the Storage API is unavailable.
|
|
968
|
+
*/
|
|
969
|
+
export function storage_estimate(): Promise<any>;
|
|
970
|
+
|
|
971
|
+
/**
|
|
972
|
+
* Check if this WASM build was compiled with relaxed SIMD support.
|
|
973
|
+
*
|
|
974
|
+
* Relaxed SIMD provides hardware-specific faster operations like fused
|
|
975
|
+
* multiply-add (`f32x4_relaxed_madd`) that map directly to ARM NEON and
|
|
976
|
+
* x86 SSE/AVX FMA instructions. When enabled, matvec operations use FMA
|
|
977
|
+
* for ~15-30% speedup.
|
|
978
|
+
*
|
|
979
|
+
* This is a compile-time feature: the WASM binary either includes relaxed
|
|
980
|
+
* SIMD instructions or it does not. The browser validates them at module
|
|
981
|
+
* load time, so if this module loaded successfully and returns `true`,
|
|
982
|
+
* relaxed SIMD is active.
|
|
983
|
+
*/
|
|
984
|
+
export function supports_relaxed_simd(): boolean;
|
|
985
|
+
|
|
986
|
+
/**
|
|
987
|
+
* Check if the browser exposes the Web Speech API for speech recognition.
|
|
988
|
+
*
|
|
989
|
+
* This probes `window.SpeechRecognition` and the WebKit-prefixed
|
|
990
|
+
* `window.webkitSpeechRecognition`. Returning `true` means the demo voice
|
|
991
|
+
* mode can capture microphone input and produce transcripts through the
|
|
992
|
+
* platform speech engine. This is a foundation for the voice pipeline
|
|
993
|
+
* (issue #395); a fully offline path will eventually run Whisper in WASM.
|
|
994
|
+
*/
|
|
995
|
+
export function supports_speech_recognition(): boolean;
|
|
996
|
+
|
|
997
|
+
/**
|
|
998
|
+
* Check if the browser exposes the Web Speech API for speech synthesis.
|
|
999
|
+
*
|
|
1000
|
+
* Returns `true` when `window.speechSynthesis` is available, enabling the
|
|
1001
|
+
* demo voice mode to speak model responses. A fully offline path will
|
|
1002
|
+
* eventually run a neural TTS model in WASM.
|
|
1003
|
+
*/
|
|
1004
|
+
export function supports_speech_synthesis(): boolean;
|
|
1005
|
+
|
|
1006
|
+
/**
|
|
1007
|
+
* Check if WebNN is available in the current browser.
|
|
1008
|
+
*
|
|
1009
|
+
* WebNN (`navigator.ml`) exposes neural-network acceleration through
|
|
1010
|
+
* platform NPUs/DSPs. This is a foundation check so JS code can decide
|
|
1011
|
+
* whether to build a WebNN graph from exported weights.
|
|
1012
|
+
*/
|
|
1013
|
+
export function supports_webnn(): boolean;
|
|
1014
|
+
|
|
1015
|
+
/**
|
|
1016
|
+
* Check if WebTransport is available in the current browser.
|
|
1017
|
+
*
|
|
1018
|
+
* WebTransport (`window.WebTransport`) is a modern transport API built on
|
|
1019
|
+
* HTTP/3 QUIC streams. It allows opening multiple parallel bidirectional
|
|
1020
|
+
* streams to the same origin with lower head-of-line blocking than fetch().
|
|
1021
|
+
* Useful for progressive model loading where different byte ranges of the
|
|
1022
|
+
* GGUF file can be downloaded concurrently.
|
|
1023
|
+
*
|
|
1024
|
+
* Note: actually using WebTransport for parallel range downloads requires
|
|
1025
|
+
* server-side support (HTTP/3 endpoint that accepts byte-range requests
|
|
1026
|
+
* on streams). This check only reports browser capability — the JS loader
|
|
1027
|
+
* will fall back to `fetch()` when the server does not cooperate.
|
|
1028
|
+
*/
|
|
1029
|
+
export function supports_webtransport(): boolean;
|
|
1030
|
+
|
|
1031
|
+
/**
|
|
1032
|
+
* Check if WebGPU is available in the current browser.
|
|
1033
|
+
*/
|
|
1034
|
+
export function webgpu_available(): boolean;
|
|
1035
|
+
|
|
1036
|
+
export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
|
|
1037
|
+
|
|
1038
|
+
export interface InitOutput {
|
|
1039
|
+
readonly memory: WebAssembly.Memory;
|
|
1040
|
+
readonly __wbg_flareengine_free: (a: number, b: number) => void;
|
|
1041
|
+
readonly __wbg_flareprogressiveloader_free: (a: number, b: number) => void;
|
|
1042
|
+
readonly __wbg_flaretokenizer_free: (a: number, b: number) => void;
|
|
1043
|
+
readonly cache_model: (a: number, b: number, c: number, d: number) => any;
|
|
1044
|
+
readonly delete_cached_model: (a: number, b: number) => any;
|
|
1045
|
+
readonly device_info: () => [number, number];
|
|
1046
|
+
readonly flareengine_add_bos_token: (a: number) => number;
|
|
1047
|
+
readonly flareengine_add_stop_sequence: (a: number, b: number, c: number) => void;
|
|
1048
|
+
readonly flareengine_apply_chat_template: (a: number, b: number, c: number, d: number, e: number) => [number, number];
|
|
1049
|
+
readonly flareengine_architecture: (a: number) => [number, number];
|
|
1050
|
+
readonly flareengine_begin_stream: (a: number, b: number, c: number, d: number) => void;
|
|
1051
|
+
readonly flareengine_begin_stream_healed: (a: number, b: number, c: number, d: number) => void;
|
|
1052
|
+
readonly flareengine_begin_stream_healed_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number) => void;
|
|
1053
|
+
readonly flareengine_begin_stream_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number) => void;
|
|
1054
|
+
readonly flareengine_bos_token_id: (a: number) => number;
|
|
1055
|
+
readonly flareengine_chat_template_name: (a: number) => [number, number];
|
|
1056
|
+
readonly flareengine_clear_raw_weights: (a: number) => void;
|
|
1057
|
+
readonly flareengine_clear_stop_sequences: (a: number) => void;
|
|
1058
|
+
readonly flareengine_compute_perplexity: (a: number, b: number, c: number) => number;
|
|
1059
|
+
readonly flareengine_context_window_pct: (a: number) => number;
|
|
1060
|
+
readonly flareengine_count_tokens: (a: number, b: number, c: number) => number;
|
|
1061
|
+
readonly flareengine_decode_ids: (a: number, b: number, c: number) => [number, number];
|
|
1062
|
+
readonly flareengine_decode_token: (a: number, b: number) => [number, number];
|
|
1063
|
+
readonly flareengine_decode_token_chunk: (a: number, b: number) => [number, number];
|
|
1064
|
+
readonly flareengine_embed_token: (a: number, b: number) => [number, number];
|
|
1065
|
+
readonly flareengine_encode_text: (a: number, b: number, c: number) => [number, number];
|
|
1066
|
+
readonly flareengine_eos_token_id: (a: number) => number;
|
|
1067
|
+
readonly flareengine_generate_stream: (a: number, b: number, c: number, d: number, e: any) => number;
|
|
1068
|
+
readonly flareengine_generate_stream_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number, j: any) => number;
|
|
1069
|
+
readonly flareengine_generate_text: (a: number, b: number, c: number, d: number) => [number, number];
|
|
1070
|
+
readonly flareengine_generate_text_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number) => [number, number];
|
|
1071
|
+
readonly flareengine_generate_tokens: (a: number, b: number, c: number, d: number) => [number, number];
|
|
1072
|
+
readonly flareengine_generate_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number) => [number, number];
|
|
1073
|
+
readonly flareengine_has_raw_weights: (a: number) => number;
|
|
1074
|
+
readonly flareengine_hidden_dim: (a: number) => number;
|
|
1075
|
+
readonly flareengine_init_gpu: (a: number) => any;
|
|
1076
|
+
readonly flareengine_init_gpu_with_cache: (a: number, b: number, c: number) => any;
|
|
1077
|
+
readonly flareengine_last_decode_ms: (a: number) => number;
|
|
1078
|
+
readonly flareengine_last_logits: (a: number) => [number, number];
|
|
1079
|
+
readonly flareengine_last_prefill_ms: (a: number) => number;
|
|
1080
|
+
readonly flareengine_last_tokens_generated: (a: number) => number;
|
|
1081
|
+
readonly flareengine_load: (a: number, b: number) => [number, number, number];
|
|
1082
|
+
readonly flareengine_load_raw_weights: (a: number, b: number, c: number) => number;
|
|
1083
|
+
readonly flareengine_max_seq_len: (a: number) => number;
|
|
1084
|
+
readonly flareengine_merge_lora: (a: number, b: number, c: number) => [number, number];
|
|
1085
|
+
readonly flareengine_merge_lora_with_alpha: (a: number, b: number, c: number, d: number) => [number, number];
|
|
1086
|
+
readonly flareengine_metadata_json: (a: number) => [number, number];
|
|
1087
|
+
readonly flareengine_model_name: (a: number) => [number, number];
|
|
1088
|
+
readonly flareengine_next_token: (a: number) => number;
|
|
1089
|
+
readonly flareengine_num_heads: (a: number) => number;
|
|
1090
|
+
readonly flareengine_num_layers: (a: number) => number;
|
|
1091
|
+
readonly flareengine_output_projection: (a: number, b: number, c: number) => [number, number];
|
|
1092
|
+
readonly flareengine_performance_summary: (a: number) => [number, number];
|
|
1093
|
+
readonly flareengine_pipeline_cache_data: (a: number) => [number, number];
|
|
1094
|
+
readonly flareengine_raw_chat_template: (a: number) => [number, number];
|
|
1095
|
+
readonly flareengine_repeat_last_n: (a: number) => number;
|
|
1096
|
+
readonly flareengine_reset: (a: number) => void;
|
|
1097
|
+
readonly flareengine_set_repeat_last_n: (a: number, b: number) => void;
|
|
1098
|
+
readonly flareengine_set_rng_seed: (a: number, b: number) => void;
|
|
1099
|
+
readonly flareengine_set_top_logprobs: (a: number, b: number) => void;
|
|
1100
|
+
readonly flareengine_stop_stream: (a: number) => void;
|
|
1101
|
+
readonly flareengine_stream_done: (a: number) => number;
|
|
1102
|
+
readonly flareengine_stream_stop_reason: (a: number) => [number, number];
|
|
1103
|
+
readonly flareengine_tokens_per_second: (a: number) => number;
|
|
1104
|
+
readonly flareengine_tokens_remaining: (a: number) => number;
|
|
1105
|
+
readonly flareengine_tokens_used: (a: number) => number;
|
|
1106
|
+
readonly flareengine_top_logprobs: (a: number) => [number, number];
|
|
1107
|
+
readonly flareengine_truncate_to_context: (a: number, b: number, c: number, d: number) => [number, number];
|
|
1108
|
+
readonly flareengine_vocab_size: (a: number) => number;
|
|
1109
|
+
readonly flareengine_warmup: (a: number) => number;
|
|
1110
|
+
readonly flareprogressiveloader_load: (a: number, b: any) => any;
|
|
1111
|
+
readonly flareprogressiveloader_new: (a: number, b: number) => number;
|
|
1112
|
+
readonly flaretokenizer_decode: (a: number, b: number, c: number) => [number, number, number, number];
|
|
1113
|
+
readonly flaretokenizer_decode_one: (a: number, b: number) => [number, number, number, number];
|
|
1114
|
+
readonly flaretokenizer_encode: (a: number, b: number, c: number) => [number, number, number, number];
|
|
1115
|
+
readonly flaretokenizer_from_json: (a: number, b: number) => [number, number, number];
|
|
1116
|
+
readonly flaretokenizer_vocab_size: (a: number) => number;
|
|
1117
|
+
readonly is_model_cached: (a: number, b: number) => any;
|
|
1118
|
+
readonly list_cached_models: () => any;
|
|
1119
|
+
readonly load_cached_model: (a: number, b: number) => any;
|
|
1120
|
+
readonly storage_estimate: () => any;
|
|
1121
|
+
readonly supports_relaxed_simd: () => number;
|
|
1122
|
+
readonly supports_speech_recognition: () => number;
|
|
1123
|
+
readonly supports_speech_synthesis: () => number;
|
|
1124
|
+
readonly supports_webnn: () => number;
|
|
1125
|
+
readonly supports_webtransport: () => number;
|
|
1126
|
+
readonly webgpu_available: () => number;
|
|
1127
|
+
readonly start: () => void;
|
|
1128
|
+
readonly flaretokenizer_bos_token_id: (a: number) => number;
|
|
1129
|
+
readonly flaretokenizer_eos_token_id: (a: number) => number;
|
|
1130
|
+
readonly wasm_bindgen__convert__closures_____invoke__h7ed8ea06cc0c8ca5: (a: number, b: number, c: any) => [number, number];
|
|
1131
|
+
readonly wasm_bindgen__convert__closures_____invoke__hcdfd434894ba1863: (a: number, b: number, c: any, d: any) => void;
|
|
1132
|
+
readonly wasm_bindgen__convert__closures_____invoke__h235e00bf230ad8a4: (a: number, b: number, c: any) => void;
|
|
1133
|
+
readonly __wbindgen_malloc: (a: number, b: number) => number;
|
|
1134
|
+
readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
|
|
1135
|
+
readonly __externref_table_alloc: () => number;
|
|
1136
|
+
readonly __wbindgen_externrefs: WebAssembly.Table;
|
|
1137
|
+
readonly __wbindgen_exn_store: (a: number) => void;
|
|
1138
|
+
readonly __wbindgen_free: (a: number, b: number, c: number) => void;
|
|
1139
|
+
readonly __wbindgen_destroy_closure: (a: number, b: number) => void;
|
|
1140
|
+
readonly __externref_table_dealloc: (a: number) => void;
|
|
1141
|
+
readonly __wbindgen_start: () => void;
|
|
1142
|
+
}
|
|
1143
|
+
|
|
1144
|
+
export type SyncInitInput = BufferSource | WebAssembly.Module;
|
|
1145
|
+
|
|
1146
|
+
/**
|
|
1147
|
+
* Instantiates the given `module`, which can either be bytes or
|
|
1148
|
+
* a precompiled `WebAssembly.Module`.
|
|
1149
|
+
*
|
|
1150
|
+
* @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
|
|
1151
|
+
*
|
|
1152
|
+
* @returns {InitOutput}
|
|
1153
|
+
*/
|
|
1154
|
+
export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
|
|
1155
|
+
|
|
1156
|
+
/**
|
|
1157
|
+
* If `module_or_path` is {RequestInfo} or {URL}, makes a request and
|
|
1158
|
+
* for everything else, calls `WebAssembly.instantiate` directly.
|
|
1159
|
+
*
|
|
1160
|
+
* @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
|
|
1161
|
+
*
|
|
1162
|
+
* @returns {Promise<InitOutput>}
|
|
1163
|
+
*/
|
|
1164
|
+
export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;
|