@sauravpanda/flare 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1164 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+
4
+ /**
5
+ * Flare LLM inference engine, exported to JS.
6
+ *
7
+ * Holds a loaded model and runs greedy/sampled token generation.
8
+ * The detected chat template is available via `chat_template_name` and
9
+ * `apply_chat_template` so the browser demo can format prompts correctly
10
+ * for instruction-tuned models.
11
+ */
12
+ export class FlareEngine {
13
+ private constructor();
14
+ free(): void;
15
+ [Symbol.dispose](): void;
16
+ /**
17
+ * Register a stop sequence.
18
+ *
19
+ * Generation halts (without emitting the matched tokens) as soon as the
20
+ * decoded output ends with `sequence`. Call once per stop string before
21
+ * `begin_stream` or `generate_with_params`.
22
+ *
23
+ * Stop sequences are cleared by `reset()` or `clear_stop_sequences()`.
24
+ *
25
+ * ```javascript
26
+ * engine.add_stop_sequence("<|im_end|>");
27
+ * engine.add_stop_sequence("</s>");
28
+ * engine.begin_stream_with_params(promptIds, 200, 0.8, 0.95, 40, 1.1, 0.0);
29
+ * ```
30
+ */
31
+ add_stop_sequence(sequence: string): void;
32
+ /**
33
+ * Format a user message (and optional system prompt) using the model's
34
+ * auto-detected chat template. Returns the formatted prompt string ready
35
+ * to be passed to `FlareTokenizer.encode()`.
36
+ *
37
+ * Pass an empty string for `system_message` to omit the system turn.
38
+ *
39
+ * # JS example
40
+ * ```javascript
41
+ * const prompt = engine.apply_chat_template(
42
+ * 'Explain quantum computing in simple terms.',
43
+ * 'You are a helpful assistant.'
44
+ * );
45
+ * const ids = tokenizer.encode(prompt);
46
+ * const output = engine.generate_tokens(ids, 128);
47
+ * ```
48
+ */
49
+ apply_chat_template(user_message: string, system_message: string): string;
50
+ /**
51
+ * Prepare for token-by-token streaming.
52
+ *
53
+ * Runs the prefill pass on `prompt_tokens`, then initialises internal
54
+ * state so that subsequent calls to `next_token()` each produce one
55
+ * output token. Call `engine.reset()` before `begin_stream()` to start
56
+ * a fresh conversation.
57
+ *
58
+ * # JS example
59
+ * ```javascript
60
+ * engine.reset();
61
+ * engine.begin_stream(promptIds, 128);
62
+ * function tick() {
63
+ * const id = engine.next_token();
64
+ * if (id === undefined) { return; } // done
65
+ * output.textContent += tokenizer.decode_one(id);
66
+ * requestAnimationFrame(tick); // yield to browser, then continue
67
+ * }
68
+ * requestAnimationFrame(tick);
69
+ * ```
70
+ */
71
+ begin_stream(prompt_tokens: Uint32Array, max_tokens: number): void;
72
+ /**
73
+ * Begin a token-by-token stream, healing the last prompt token.
74
+ *
75
+ * Identical to `begin_stream` but avoids double-processing the final prompt
76
+ * token: the prefill runs only tokens `[0 .. n-2]`, then the first
77
+ * `next_token()` call processes the last prompt token at its correct
78
+ * position `n-1` and produces the first output token. This keeps RoPE
79
+ * positional embeddings consistent and is recommended when the prompt
80
+ * ends at a natural token boundary (e.g. when encoding a user turn in a
81
+ * chat template).
82
+ *
83
+ * Falls back to `begin_stream` for prompts shorter than 2 tokens.
84
+ *
85
+ * # JS example
86
+ * ```javascript
87
+ * engine.reset();
88
+ * const ids = engine.encode_text(engine.apply_chat_template(userMsg, sysMsg));
89
+ * engine.begin_stream_healed(ids, 256);
90
+ * requestAnimationFrame(function tick() {
91
+ * const id = engine.next_token();
92
+ * if (id !== undefined) output.textContent += tokenizer.decode_one(id);
93
+ * if (!engine.stream_done) requestAnimationFrame(tick);
94
+ * });
95
+ * ```
96
+ */
97
+ begin_stream_healed(prompt_tokens: Uint32Array, max_tokens: number): void;
98
+ /**
99
+ * Like `begin_stream_healed` but with full sampling parameters.
100
+ *
101
+ * Combines position-consistent prefill (see `begin_stream_healed`) with
102
+ * the same temperature / top-p / top-k / repeat-penalty / min-p controls
103
+ * available in `begin_stream_with_params`.
104
+ *
105
+ * # JS example
106
+ * ```javascript
107
+ * engine.reset();
108
+ * const ids = engine.encode_text(engine.apply_chat_template(userMsg, sysMsg));
109
+ * engine.begin_stream_healed_with_params(ids, 256, 0.8, 0.95, 40, 1.1, 0.0);
110
+ * requestAnimationFrame(function tick() {
111
+ * const id = engine.next_token();
112
+ * if (id !== undefined) output.textContent += tokenizer.decode_one(id);
113
+ * if (!engine.stream_done) requestAnimationFrame(tick);
114
+ * });
115
+ * ```
116
+ */
117
+ begin_stream_healed_with_params(prompt_tokens: Uint32Array, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number): void;
118
+ /**
119
+ * Like `begin_stream` but with temperature / top-p sampling.
120
+ *
121
+ * `temperature`: 0.0 = greedy, 0.7–1.0 = typical creative range.
122
+ * `top_p`: nucleus sampling threshold (0.0–1.0); 0.9 is a good default.
123
+ *
124
+ * # JS example
125
+ * ```javascript
126
+ * engine.reset();
127
+ * engine.begin_stream_with_params(promptIds, 128, 0.8, 0.9, 0, 1.0, 0.0);
128
+ * function tick() {
129
+ * const id = engine.next_token();
130
+ * if (id === undefined) return;
131
+ * output.textContent += tokenizer.decode_one(id);
132
+ * requestAnimationFrame(tick);
133
+ * }
134
+ * requestAnimationFrame(tick);
135
+ * ```
136
+ * Begin a token-by-token stream with sampling parameters including top-k.
137
+ *
138
+ * - `temperature`: controls randomness (0 = greedy, higher = more random)
139
+ * - `top_p`: nucleus sampling — keep the smallest token set whose cumulative
140
+ * probability ≥ `top_p` (1.0 = disabled; applied when < 1.0)
141
+ * - `top_k`: keep only the `top_k` highest-probability tokens before sampling
142
+ * (0 = disabled; applied when `top_p` is 1.0 and `top_k` > 0)
143
+ * - `repeat_penalty`: penalty applied to logits of recently-seen tokens to
144
+ * reduce repetition (1.0 = disabled, 1.1–1.3 = typical range)
145
+ *
146
+ * ```javascript
147
+ * engine.begin_stream_with_params(promptIds, 200, 0.8, 0.95, 40, 1.1, 0.0);
148
+ * ```
149
+ */
150
+ begin_stream_with_params(prompt_tokens: Uint32Array, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number): void;
151
+ /**
152
+ * Clear any previously loaded raw quantized weights.
153
+ *
154
+ * After calling this the engine uses the f32 dequantized path for all
155
+ * matrix operations until `load_raw_weights` is called again.
156
+ */
157
+ clear_raw_weights(): void;
158
+ /**
159
+ * Remove all registered stop sequences.
160
+ */
161
+ clear_stop_sequences(): void;
162
+ /**
163
+ * Compute the perplexity of `text` under the loaded model.
164
+ *
165
+ * Encodes `text` with the embedded GGUF vocabulary, runs one forward pass
166
+ * per token, and measures the log-probability of each correct next-token
167
+ * prediction. Perplexity = exp(−mean(log_probs)).
168
+ *
169
+ * The KV cache is reset **before and after** the evaluation so the engine
170
+ * returns to a clean state.
171
+ *
172
+ * Returns `f32::INFINITY` if the text encodes to fewer than 2 tokens or if
173
+ * no GGUF vocabulary is available.
174
+ *
175
+ * # JS example
176
+ * ```javascript
177
+ * const ppl = engine.compute_perplexity("The quick brown fox");
178
+ * console.log("Perplexity:", ppl);
179
+ * ```
180
+ */
181
+ compute_perplexity(text: string): number;
182
+ /**
183
+ * Count the number of tokens in `text` using the model's embedded GGUF vocabulary.
184
+ *
185
+ * Returns 0 if the model was not loaded from a GGUF file (e.g. SafeTensors only).
186
+ *
187
+ * # JS example
188
+ * ```javascript
189
+ * const n = engine.count_tokens(textarea.value);
190
+ * counter.textContent = `${n} / ${engine.max_seq_len} tokens`;
191
+ * ```
192
+ */
193
+ count_tokens(text: string): number;
194
+ /**
195
+ * Decode token IDs to text using the embedded GGUF vocabulary.
196
+ *
197
+ * Returns an empty string if no GGUF vocab is available.
198
+ *
199
+ * # JS example
200
+ * ```javascript
201
+ * const text = engine.decode_ids(generatedIds);
202
+ * ```
203
+ */
204
+ decode_ids(ids: Uint32Array): string;
205
+ /**
206
+ * Decode a single token ID to its text piece.
207
+ *
208
+ * Convenience wrapper around `decode_ids` for use directly inside a
209
+ * `next_token()` loop so callers don't need a separate `FlareTokenizer`.
210
+ *
211
+ * Returns an empty string if no GGUF vocab is loaded.
212
+ *
213
+ * # JS example
214
+ * ```javascript
215
+ * engine.begin_stream(promptIds, 128);
216
+ * requestAnimationFrame(function tick() {
217
+ * const id = engine.next_token();
218
+ * if (id !== undefined) output.textContent += engine.decode_token(id);
219
+ * if (!engine.stream_done) requestAnimationFrame(tick);
220
+ * });
221
+ * ```
222
+ */
223
+ decode_token(id: number): string;
224
+ /**
225
+ * Decode a single token ID, correctly handling multi-byte UTF-8 sequences.
226
+ *
227
+ * SentencePiece tokenizers encode non-ASCII characters as consecutive
228
+ * byte-level tokens such as `<0xE4>`, `<0xB8>`, `<0xAD>` (the UTF-8
229
+ * encoding of `中`). The basic `decode_token` function returns incorrect
230
+ * Latin-1 characters in these cases because it treats each byte as an
231
+ * independent Unicode scalar.
232
+ *
233
+ * `decode_token_chunk` accumulates bytes in an internal buffer until a
234
+ * complete, valid UTF-8 sequence is assembled, then returns it as a
235
+ * `String`. While the sequence is incomplete it returns an empty string,
236
+ * and when a regular (non-byte) token is encountered it flushes any
237
+ * buffered bytes (replacing invalid sequences with U+FFFD) before
238
+ * returning the decoded text.
239
+ *
240
+ * **Use this instead of `decode_token` whenever you are streaming tokens
241
+ * that may include non-Latin characters.**
242
+ *
243
+ * ```javascript
244
+ * engine.begin_stream(prompt, 256);
245
+ * function tick() {
246
+ * const id = engine.next_token();
247
+ * if (id !== undefined) output.textContent += engine.decode_token_chunk(id);
248
+ * if (!engine.stream_done) requestAnimationFrame(tick);
249
+ * }
250
+ * requestAnimationFrame(tick);
251
+ * ```
252
+ */
253
+ decode_token_chunk(id: number): string;
254
+ /**
255
+ * Look up the token embedding row for `token_id` as a flat `Float32Array`.
256
+ *
257
+ * The length of the returned vector is `hidden_dim`. See also
258
+ * [`FlareEngine::output_projection`] for the inverse tail step.
259
+ */
260
+ embed_token(token_id: number): Float32Array;
261
+ /**
262
+ * # JS example
263
+ * ```javascript
264
+ * const ids = engine.encode_text("Hello, world!");
265
+ * const output = engine.generate_tokens(ids, 64);
266
+ * ```
267
+ */
268
+ encode_text(text: string): Uint32Array;
269
+ /**
270
+ * Streaming text-in / text-out generation with a per-token JS callback.
271
+ *
272
+ * Encodes `prompt` with the embedded GGUF vocabulary, generates up to
273
+ * `max_tokens` tokens, and calls `on_token(token_str)` with the decoded
274
+ * text for each token as it is produced. Returns the number of tokens
275
+ * generated (excluding any EOS token).
276
+ *
277
+ * Returns 0 if no GGUF vocab is available.
278
+ *
279
+ * # Note on browser streaming
280
+ * `on_token` is called synchronously inside WASM, so the browser will
281
+ * not visually update between tokens. For visible character-by-character
282
+ * output, use `begin_stream` + `next_token` with `requestAnimationFrame`.
283
+ *
284
+ * # JS example
285
+ * ```javascript
286
+ * engine.reset();
287
+ * let out = '';
288
+ * const count = engine.generate_stream("What is Rust?", 128, (token) => {
289
+ * out += token;
290
+ * });
291
+ * output.textContent = out;
292
+ * ```
293
+ */
294
+ generate_stream(prompt: string, max_tokens: number, on_token: Function): number;
295
+ /**
296
+ * Streaming text-in / text-out with explicit sampling parameters.
297
+ *
298
+ * Like `generate_stream` but with the full set of sampling controls:
299
+ *
300
+ * - `temperature`: 0 = greedy, higher = more diverse
301
+ * - `top_p`: nucleus sampling (1.0 = disabled)
302
+ * - `top_k`: top-k sampling, applied when `top_p` is 1.0 and `min_p` is 0.0 (0 = disabled)
303
+ * - `repeat_penalty`: repetition penalty (1.0 = disabled, 1.1–1.3 = typical)
304
+ * - `min_p`: min-p threshold (0.0 = disabled)
305
+ *
306
+ * Encodes `prompt` with the embedded GGUF vocabulary, generates up to
307
+ * `max_tokens` tokens, and calls `on_token(token_str)` with the decoded
308
+ * text for each token. Respects stop sequences registered via
309
+ * `add_stop_sequence`. Returns the number of tokens generated.
310
+ *
311
+ * Returns 0 if no GGUF vocab is available.
312
+ *
313
+ * # JS example
314
+ * ```javascript
315
+ * engine.add_stop_sequence("<|im_end|>");
316
+ * engine.reset();
317
+ * let out = '';
318
+ * const count = engine.generate_stream_with_params(
319
+ * prompt, 200, 0.8, 0.95, 40, 1.1, 0.0,
320
+ * (token) => { out += token; }
321
+ * );
322
+ * ```
323
+ */
324
+ generate_stream_with_params(prompt: string, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number, on_token: Function): number;
325
+ /**
326
+ * Full text-in / text-out generation using the embedded GGUF vocabulary.
327
+ *
328
+ * Encodes `prompt` with the embedded vocab, runs greedy generation for up
329
+ * to `max_tokens` steps, then decodes the output back to text. Stops
330
+ * automatically at EOS.
331
+ *
332
+ * Returns an empty string if no GGUF vocab is available.
333
+ *
334
+ * # JS example
335
+ * ```javascript
336
+ * engine.reset();
337
+ * const response = engine.generate_text("What is Rust?", 128);
338
+ * output.textContent = response;
339
+ * ```
340
+ */
341
+ generate_text(prompt: string, max_tokens: number): string;
342
+ /**
343
+ * Full text-in / text-out generation with explicit sampling parameters.
344
+ *
345
+ * Like `generate_text` but with the full set of sampling controls:
346
+ *
347
+ * - `temperature`: 0 = greedy, higher = more diverse
348
+ * - `top_p`: nucleus sampling (1.0 = disabled)
349
+ * - `top_k`: top-k sampling, applied when `top_p` is 1.0 and `min_p` is 0.0 (0 = disabled)
350
+ * - `repeat_penalty`: repetition penalty (1.0 = disabled)
351
+ * - `min_p`: min-p threshold (0.0 = disabled)
352
+ *
353
+ * Returns the decoded generated text. Returns an empty string if no GGUF vocab is available.
354
+ * Respects stop sequences registered via `add_stop_sequence`.
355
+ *
356
+ * # JS example
357
+ * ```javascript
358
+ * engine.reset();
359
+ * const response = engine.generate_text_with_params(
360
+ * "What is Rust?", 128, 0.8, 0.95, 40, 1.1, 0.0
361
+ * );
362
+ * output.textContent = response;
363
+ * ```
364
+ */
365
+ generate_text_with_params(prompt: string, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number): string;
366
+ /**
367
+ * Generate `max_tokens` tokens starting from `prompt_tokens` (greedy).
368
+ * Stops early at EOS. Returns a Uint32Array of generated token IDs.
369
+ */
370
+ generate_tokens(prompt_tokens: Uint32Array, max_tokens: number): Uint32Array;
371
+ /**
372
+ * Generate a batch of tokens with explicit sampling parameters.
373
+ *
374
+ * - `temperature`: 0 = greedy, higher = more diverse
375
+ * - `top_p`: nucleus sampling (1.0 = disabled)
376
+ * - `top_k`: top-k sampling, applied when `top_p` is 1.0 and `min_p` is 0.0 (0 = disabled)
377
+ * - `repeat_penalty`: repetition penalty applied to recently-seen tokens (1.0 = disabled)
378
+ * - `min_p`: min-p threshold (0.0 = disabled); applied after `top_p`, before `top_k`
379
+ *
380
+ * Stops early at EOS. Uses a fixed LCG RNG seed for reproducibility.
381
+ */
382
+ generate_with_params(prompt_tokens: Uint32Array, max_tokens: number, temperature: number, top_p: number, top_k: number, repeat_penalty: number, min_p: number): Uint32Array;
383
+ /**
384
+ * Try to initialise the WebGPU compute backend.
385
+ *
386
+ * Call this after `load()` to enable GPU-accelerated matrix operations
387
+ * (matvec, matmul, silu_mul). Falls back silently to CPU if WebGPU is
388
+ * unavailable or adapter request fails.
389
+ *
390
+ * Returns `true` if a GPU backend was successfully initialised.
391
+ *
392
+ * ```javascript
393
+ * const engine = FlareEngine.load(bytes);
394
+ * const gpuEnabled = await engine.init_gpu();
395
+ * console.log('GPU:', gpuEnabled);
396
+ * ```
397
+ */
398
+ init_gpu(): Promise<boolean>;
399
+ /**
400
+ * Initialise the WebGPU backend using previously serialised pipeline cache
401
+ * bytes (from the `engine.pipeline_cache_data` property).
402
+ *
403
+ * On backends that support driver-managed pipeline caches (Vulkan native),
404
+ * this allows the driver to reuse compiled GPU machine code from a previous
405
+ * run, eliminating cold-start shader recompilation (typically 100ms–2s).
406
+ *
407
+ * On unsupported backends (WebGPU, Metal, DX12) this behaves identically to
408
+ * `init_gpu()` — the cache bytes are silently ignored.
409
+ *
410
+ * ```javascript
411
+ * const cached = localStorage.getItem('flare-pipeline-cache');
412
+ * const cacheBytes = cached ? new Uint8Array(JSON.parse(cached)) : new Uint8Array();
413
+ * await engine.init_gpu_with_cache(cacheBytes);
414
+ * // After inference, persist the cache:
415
+ * const data = engine.pipeline_cache_data;
416
+ * if (data.length > 0) {
417
+ * localStorage.setItem('flare-pipeline-cache', JSON.stringify(Array.from(data)));
418
+ * }
419
+ * ```
420
+ */
421
+ init_gpu_with_cache(cache_data: Uint8Array): Promise<boolean>;
422
+ /**
423
+ * Load a GGUF model from a Uint8Array of bytes (e.g. from `fetch`).
424
+ */
425
+ static load(gguf_bytes: Uint8Array): FlareEngine;
426
+ /**
427
+ * Load raw quantized weights from GGUF bytes so the GPU fused
428
+ * dequant+matvec kernels can be used during inference.
429
+ *
430
+ * Call this **after** `init_gpu()` so the backend is set before the raw
431
+ * weights are attached. The method is a no-op (returns `false`) if a
432
+ * layer's weights are in an unsupported quantization format — the engine
433
+ * continues to work using the f32 path loaded at `FlareEngine.load()`.
434
+ *
435
+ * Returns `true` if all layers were loaded successfully, `false` if any
436
+ * layer fell back to the f32 path.
437
+ *
438
+ * ```javascript
439
+ * const engine = FlareEngine.load(bytes);
440
+ * await engine.init_gpu();
441
+ * const ok = engine.load_raw_weights(bytes);
442
+ * console.log('Raw weights loaded:', ok);
443
+ * ```
444
+ */
445
+ load_raw_weights(gguf_bytes: Uint8Array): boolean;
446
+ /**
447
+ * Merge a LoRA adapter (SafeTensors format) into the model weights.
448
+ *
449
+ * Pass the raw bytes of a `.safetensors` file containing LoRA A/B matrices.
450
+ * After merging, the adapter's effect is permanent for this engine instance;
451
+ * call `FlareEngine.load()` again to restore the base model.
452
+ *
453
+ * ```javascript
454
+ * const resp = await fetch('lora-adapter.safetensors');
455
+ * const bytes = new Uint8Array(await resp.arrayBuffer());
456
+ * engine.merge_lora(bytes);
457
+ * ```
458
+ */
459
+ merge_lora(adapter_bytes: Uint8Array): void;
460
+ /**
461
+ * Merge a LoRA adapter with a custom alpha scaling factor.
462
+ *
463
+ * Same as `merge_lora` but overrides the alpha value embedded in the
464
+ * adapter file. The effective scaling is `alpha / rank`.
465
+ */
466
+ merge_lora_with_alpha(adapter_bytes: Uint8Array, alpha: number): void;
467
+ /**
468
+ * Generate and return the next token ID, or `undefined` when the stream
469
+ * is complete (EOS reached, `max_tokens` exhausted, or `stop_stream()`
470
+ * was called).
471
+ *
472
+ * Sampling parameters are those set by the most recent `begin_stream` or
473
+ * `begin_stream_with_params` call. Call this inside
474
+ * `requestAnimationFrame` so the browser can update the DOM between
475
+ * tokens and the page remains responsive.
476
+ */
477
+ next_token(): number | undefined;
478
+ /**
479
+ * Apply final RMSNorm + output projection to a hidden state and
480
+ * return logits over the vocabulary.
481
+ *
482
+ * `hidden` must have length `hidden_dim`. The returned vector has
483
+ * length `vocab_size`.
484
+ */
485
+ output_projection(hidden: Float32Array): Float32Array;
486
+ /**
487
+ * Return a JSON string summarising the performance metrics from the last
488
+ * generation call.
489
+ *
490
+ * ```javascript
491
+ * const perf = JSON.parse(engine.performance_summary());
492
+ * console.log(`TTFT: ${perf.prefill_ms.toFixed(1)} ms`);
493
+ * console.log(`Decode: ${perf.tokens_per_second.toFixed(1)} tok/s`);
494
+ * ```
495
+ */
496
+ performance_summary(): string;
497
+ /**
498
+ * Reset the KV cache (start a new conversation).
499
+ *
500
+ * Also clears stop sequences, the internal text accumulator, and
501
+ * restores the RNG seed to the default `0x12345678`.
502
+ */
503
+ reset(): void;
504
+ /**
505
+ * Set the repetition-penalty look-back window (number of recent tokens to
506
+ * penalise). Use `0` to disable repetition penalty entirely. Default: 64.
507
+ *
508
+ * Takes effect on the next `begin_stream*` call.
509
+ *
510
+ * # JS example
511
+ * ```javascript
512
+ * engine.set_repeat_last_n(128); // wider window for creative writing
513
+ * engine.set_repeat_last_n(0); // disable repeat penalty
514
+ * ```
515
+ */
516
+ set_repeat_last_n(n: number): void;
517
+ /**
518
+ * Set the LCG RNG seed used for the next sampled generation call.
519
+ *
520
+ * Controls the random state passed to `begin_stream_with_params` and
521
+ * `generate_with_params`, enabling reproducible outputs. The seed is
522
+ * applied on the next call and then *not* automatically reset, so the
523
+ * same seed will be reused on subsequent calls unless `set_rng_seed` or
524
+ * `reset()` is called again.
525
+ *
526
+ * `reset()` restores the seed to the default `0x12345678`.
527
+ *
528
+ * ```javascript
529
+ * engine.set_rng_seed(42);
530
+ * const out1 = engine.generate_text("Hello", 50);
531
+ * engine.set_rng_seed(42);
532
+ * const out2 = engine.generate_text("Hello", 50);
533
+ * // out1 === out2
534
+ * ```
535
+ */
536
+ set_rng_seed(seed: number): void;
537
+ /**
538
+ * Set how many top log-probability entries to capture after each forward
539
+ * pass. Pass `0` (the default) to disable and save the computation.
540
+ *
541
+ * When enabled, `top_logprobs` is populated after every `next_token()`
542
+ * call and after every token in `generate_stream_with_params`.
543
+ *
544
+ * # JS example
545
+ * ```javascript
546
+ * engine.set_top_logprobs(5);
547
+ * engine.begin_stream(promptIds, 64);
548
+ * while (!engine.stream_done) {
549
+ * engine.next_token();
550
+ * const lp = engine.top_logprobs; // Float32Array [id0, lp0, id1, lp1, ...]
551
+ * }
552
+ * ```
553
+ */
554
+ set_top_logprobs(n: number): void;
555
+ /**
556
+ * Signal the current stream to stop after the next `next_token()` call.
557
+ * The JS Stop button should call this, then wait for `next_token()` to
558
+ * return `undefined` before updating the UI.
559
+ */
560
+ stop_stream(): void;
561
+ /**
562
+ * Truncate `text` so that it fits within `budget` tokens when encoded.
563
+ *
564
+ * Encodes `text` with the embedded GGUF vocabulary, keeps the **last**
565
+ * `budget` tokens (tail of the text is preferred, so recent context is
566
+ * preserved), and decodes them back to a string. Returns `text` unchanged
567
+ * if it already fits or if no vocab is available.
568
+ *
569
+ * A typical call reserves space for the system prompt + generated output:
570
+ *
571
+ * ```javascript
572
+ * // Keep only the tail of the conversation that fits in the context
573
+ * const budget = engine.max_seq_len - 256; // leave 256 tokens for output
574
+ * const trimmed = engine.truncate_to_context(conversationText, budget);
575
+ * ```
576
+ */
577
+ truncate_to_context(text: string, budget: number): string;
578
+ /**
579
+ * Run a single dummy forward pass to pre-compile WebGPU shader pipelines.
580
+ *
581
+ * WebGPU (and wgpu on native) compiles shader pipelines lazily on the
582
+ * first dispatch. This causes a noticeable latency spike — often 100ms
583
+ * to several seconds — when the user makes their first inference request.
584
+ *
585
+ * Call `warmup()` once after `init_gpu()` completes to trigger all shader
586
+ * compilations in the background so the first real request feels fast.
587
+ * The KV cache is reset after the warmup so the engine is in a clean state.
588
+ *
589
+ * Returns `true` if the warmup forward pass ran without error, `false` if
590
+ * the model has not been loaded.
591
+ *
592
+ * # JS example
593
+ * ```javascript
594
+ * const engine = FlareEngine.load(bytes);
595
+ * await engine.init_gpu();
596
+ * engine.warmup(); // trigger shader compilation
597
+ * // First real inference is now fast
598
+ * engine.begin_stream(promptIds, 128);
599
+ * ```
600
+ */
601
+ warmup(): boolean;
602
+ /**
603
+ * Whether the model requests automatic BOS token prepending.
604
+ *
605
+ * Sourced from `tokenizer.ggml.add_bos_token` in the GGUF metadata.
606
+ * When `true`, all generation methods (`generate_tokens`, `begin_stream`,
607
+ * `generate_text`, `generate_stream`) automatically prepend the BOS token
608
+ * to the input token sequence unless it is already the first token.
609
+ */
610
+ readonly add_bos_token: boolean;
611
+ /**
612
+ * Model architecture name from `general.architecture` in the GGUF metadata.
613
+ *
614
+ * Returns a lowercase string such as `"llama"`, `"mistral"`, `"gemma2"`,
615
+ * `"phi3"`, or `"qwen2"`. Returns `"unknown"` if the field is absent.
616
+ */
617
+ readonly architecture: string;
618
+ /**
619
+ * BOS (beginning of sequence) token ID from the GGUF model metadata, if present.
620
+ * Some models require this to be prepended to the input token sequence.
621
+ */
622
+ readonly bos_token_id: number | undefined;
623
+ /**
624
+ * Name of the auto-detected chat template (e.g. `"ChatML"`, `"Llama3"`,
625
+ * `"Alpaca"`, `"Raw"`). Use this to display the template in the UI and
626
+ * decide whether to call `apply_chat_template` before encoding.
627
+ */
628
+ readonly chat_template_name: string;
629
+ /**
630
+ * Fraction of the context window consumed (0.0 = empty, 1.0 = full).
631
+ *
632
+ * Equivalent to `tokens_used / max_seq_len`. Returns 0.0 if `max_seq_len` is 0.
633
+ */
634
+ readonly context_window_pct: number;
635
+ /**
636
+ * EOS (end of sequence) token ID from the GGUF model metadata, if present.
637
+ * Generation stops automatically when this token is produced.
638
+ */
639
+ readonly eos_token_id: number | undefined;
640
+ /**
641
+ * Returns `true` if raw quantized weights are currently loaded.
642
+ */
643
+ readonly has_raw_weights: boolean;
644
+ /**
645
+ * Get the hidden dimension.
646
+ */
647
+ readonly hidden_dim: number;
648
+ /**
649
+ * Milliseconds spent in decode steps of the last generation call.
650
+ *
651
+ * For batch generation (`generate_tokens` etc.) this is always 0 — see
652
+ * `last_prefill_ms` for the total time. For the streaming API this
653
+ * accumulates across all `next_token()` calls since the last
654
+ * `begin_stream()`.
655
+ */
656
+ readonly last_decode_ms: number;
657
+ /**
658
+ * Raw pre-temperature logits from the most recent forward pass.
659
+ *
660
+ * Returns the full vocabulary logit vector as a `Float32Array`. These
661
+ * are the raw values *before* temperature scaling, repetition penalty,
662
+ * or any sampling filter — equivalent to the model's raw next-token
663
+ * distribution.
664
+ *
665
+ * Useful for:
666
+ * - Scoring candidate continuations (classification, ranking)
667
+ * - Computing perplexity / cross-entropy
668
+ * - Inspecting the model's "confidence" about the next token
669
+ *
670
+ * Returns an empty array before any inference has been run, and is
671
+ * cleared by `reset()`.
672
+ *
673
+ * ```javascript
674
+ * engine.begin_stream(promptIds, 1); // one token prefill+decode
675
+ * engine.next_token();
676
+ * const logits = engine.last_logits; // Float32Array of vocab_size
677
+ * const topTokenId = logits.indexOf(Math.max(...logits));
678
+ * ```
679
+ */
680
+ readonly last_logits: Float32Array;
681
+ /**
682
+ * Milliseconds spent in the last prefill (prompt processing) phase.
683
+ *
684
+ * For `generate_tokens` / `generate_text` / `generate_with_params` this
685
+ * covers the entire call (prefill + decode are not separated internally).
686
+ * For the streaming API (`begin_stream` + `next_token`) this covers only
687
+ * the `begin_stream()` call.
688
+ */
689
+ readonly last_prefill_ms: number;
690
+ /**
691
+ * Number of tokens generated by the last generation call (excludes prompt
692
+ * tokens and the EOS token itself).
693
+ */
694
+ readonly last_tokens_generated: number;
695
+ /**
696
+ * Maximum sequence length (context window size) of the loaded model.
697
+ *
698
+ * Use this to warn users when their prompt is approaching the limit.
699
+ */
700
+ readonly max_seq_len: number;
701
+ /**
702
+ * All GGUF model metadata as a JSON string.
703
+ *
704
+ * Returns a JSON object mapping each metadata key to its value.
705
+ * Large vocabulary arrays (`tokenizer.ggml.tokens`, `.merges`, `.scores`,
706
+ * `.added_tokens`) are omitted to keep the payload practical.
707
+ * Small arrays (≤ 64 entries) are included as JSON arrays.
708
+ *
709
+ * Returns `"{}"` if the model was not loaded from a GGUF file.
710
+ *
711
+ * ```javascript
712
+ * const meta = JSON.parse(engine.metadata_json);
713
+ * console.log(meta["llama.context_length"]); // e.g. 4096
714
+ * ```
715
+ */
716
+ readonly metadata_json: string;
717
+ /**
718
+ * Model display name from `general.name` in the GGUF metadata.
719
+ *
720
+ * Returns the human-readable name embedded by the model author (e.g.
721
+ * `"Llama 3.2 1B Instruct"`). Returns an empty string if the field is absent.
722
+ */
723
+ readonly model_name: string;
724
+ /**
725
+ * Get the number of attention heads.
726
+ */
727
+ readonly num_heads: number;
728
+ /**
729
+ * Get the number of layers.
730
+ */
731
+ readonly num_layers: number;
732
+ /**
733
+ * Serialise the driver-managed GPU pipeline cache to bytes.
734
+ *
735
+ * Returns an opaque blob that can be passed to `init_gpu_with_cache()` on
736
+ * the next startup to skip shader recompilation. Store it in
737
+ * `localStorage` or `IndexedDB` between page loads.
738
+ *
739
+ * Returns an empty `Uint8Array` if no GPU is active, or if the current
740
+ * backend does not support pipeline caching (WebGPU, Metal, DX12).
741
+ */
742
+ readonly pipeline_cache_data: Uint8Array;
743
+ /**
744
+ * Raw Jinja2 chat template string from the GGUF model metadata, if present.
745
+ *
746
+ * This is the `tokenizer.chat_template` field embedded by the model author.
747
+ * Use this with a JavaScript Jinja2 renderer (e.g. `nunjucks`) for accurate
748
+ * prompt formatting across all model families, rather than relying on the
749
+ * simplified built-in `apply_chat_template`.
750
+ *
751
+ * Returns `undefined` if the GGUF file did not include a chat template.
752
+ */
753
+ readonly raw_chat_template: string | undefined;
754
+ /**
755
+ * Current repetition-penalty window size (0 = disabled).
756
+ */
757
+ readonly repeat_last_n: number;
758
+ /**
759
+ * Whether the current stream has finished.
760
+ */
761
+ readonly stream_done: boolean;
762
/**
 * Why the most-recent stream stopped.
 *
 * Returns one of:
 * - `"eos"` — the model emitted the EOS token
 * - `"length"` — the `max_tokens` budget was exhausted
 * - `"stop_sequence"` — a stop sequence registered via `add_stop_sequence`
 *   was matched
 * - `"user"` — `stop_stream()` was called
 * - `""` (empty) — stream not yet started or still running
 *
 * # JS example
 * ```javascript
 * while (!engine.stream_done) engine.next_token();
 * console.log("Stopped because:", engine.stream_stop_reason);
 * ```
 */
readonly stream_stop_reason: string;
779
/**
 * Decode throughput in tokens per second for the last generation call.
 *
 * For the streaming API this is calculated from `last_decode_ms`.
 * For batch generation this is calculated from `last_prefill_ms`
 * (the total call duration), so the two modes are not directly
 * comparable.
 *
 * Returns 0.0 if no generation has been run or if timing data is
 * unavailable.
 */
readonly tokens_per_second: number;
790
/**
 * How many tokens of context space remain before the window is full.
 *
 * Equivalent to `max_seq_len - tokens_used`. Returns 0 when the context is
 * already full or `max_seq_len` is 0 (never negative).
 *
 * # JS example
 * ```javascript
 * if (engine.tokens_remaining < 64) {
 *   console.warn("Context window almost full — consider resetting.");
 * }
 * ```
 */
readonly tokens_remaining: number;
804
/**
 * Number of tokens currently consumed in the KV-cache session
 * (prompt + generated).
 *
 * Updated after every generation call; reset to 0 by `engine.reset()`.
 * Use together with `max_seq_len` to build a context-usage progress bar.
 */
readonly tokens_used: number;
811
/**
 * Interleaved top-N log-probabilities from the last forward pass.
 *
 * Layout: `[token_id_0 as f32, log_prob_0, token_id_1 as f32, log_prob_1, ...]`
 * sorted by descending log-probability. Length is `top_logprobs_n * 2`.
 *
 * NOTE: token IDs are stored as f32, so IDs above 2^24 would lose
 * precision — not a concern for typical vocabulary sizes.
 *
 * Returns an empty array if `set_top_logprobs(0)` (the default) or before
 * any inference has been run.
 */
readonly top_logprobs: Float32Array;
821
/**
 * Vocabulary size of the loaded model.
 */
readonly vocab_size: number;
825
+ }
826
+
827
+ /**
828
+ * Progressive loader that fetches a GGUF model from a URL with streaming
829
+ * download progress.
830
+ *
831
+ * This enables the browser demo to show download progress as the model
832
+ * arrives over the network, then layer-loading progress as the model is
833
+ * parsed. For a 500MB Q4 model the download phase dominates; displaying
834
+ * progress prevents the page from appearing frozen.
835
+ *
836
+ * # JS example
837
+ *
838
+ * ```javascript
839
+ * const loader = new FlareProgressiveLoader('https://example.com/model.gguf');
840
+ * const engine = await loader.load((loaded, total) => {
841
+ * const pct = total > 0 ? Math.round(loaded / total * 100) : 0;
842
+ * progressBar.value = pct / 100;
843
+ * statusText.textContent = `Downloading… ${pct}%`;
844
+ * });
845
+ * ```
846
+ */
847
+ export class FlareProgressiveLoader {
848
+ free(): void;
849
+ [Symbol.dispose](): void;
850
+ /**
851
+ * Fetch the model from the URL, calling `on_progress(loaded_bytes, total_bytes)`
852
+ * as each chunk arrives, then parse and return a `FlareEngine`.
853
+ *
854
+ * `total_bytes` is 0 when the server does not send a `Content-Length` header
855
+ * (e.g. when the response is gzip-compressed or chunked).
856
+ */
857
+ load(on_progress: Function): Promise<FlareEngine>;
858
+ /**
859
+ * Create a loader for the given model URL.
860
+ */
861
+ constructor(url: string);
862
+ }
863
+
864
/**
 * BPE tokenizer exported to JS for encoding prompts and decoding generated tokens.
 *
 * Load from a HuggingFace `tokenizer.json` string, then use `encode` / `decode`
 * in coordination with `FlareEngine` to run full text-in / text-out inference.
 *
 * # JS example
 *
 * ```javascript
 * const resp = await fetch('tokenizer.json');
 * const json = await resp.text();
 * const tok = FlareTokenizer.from_json(json);
 *
 * const ids = tok.encode("Hello, world!");
 * const engine = FlareEngine.load(modelBytes);
 * const out = engine.generate_tokens(ids, 64);
 * console.log(tok.decode(out));
 * ```
 */
export class FlareTokenizer {
  // Instances are created via the static `from_json` factory, not `new`.
  private constructor();
  free(): void;
  [Symbol.dispose](): void;
  /**
   * Decode a sequence of token IDs back to text.
   */
  decode(tokens: Uint32Array): string;
  /**
   * Decode a single token ID to text (useful for streaming output).
   */
  decode_one(token_id: number): string;
  /**
   * Encode text to a sequence of token IDs.
   */
  encode(text: string): Uint32Array;
  /**
   * Load a tokenizer from the text of a HuggingFace `tokenizer.json` file.
   *
   * NOTE(review): presumably throws if the JSON is not a valid tokenizer
   * definition — confirm the error shape against the Rust implementation.
   */
  static from_json(json: string): FlareTokenizer;
  /**
   * BOS (beginning of sequence) token ID, if defined by the tokenizer.
   */
  readonly bos_token_id: number | undefined;
  /**
   * EOS (end of sequence) token ID, if defined by the tokenizer.
   */
  readonly eos_token_id: number | undefined;
  /**
   * Vocabulary size.
   */
  readonly vocab_size: number;
}
916
+
917
/**
 * Save model bytes to OPFS (Origin Private File System).
 *
 * Creates the `flare-models` directory if it does not exist. Overwrites any
 * existing file with the same name. OPFS is only available in secure
 * contexts (HTTPS or localhost).
 */
export function cache_model(model_name: string, data: Uint8Array): Promise<void>;
924
+
925
/**
 * Delete a cached model from OPFS by name.
 */
export function delete_cached_model(model_name: string): Promise<void>;
929
+
930
/**
 * Get basic device info as a JSON string.
 *
 * NOTE(review): the exact set of fields is not documented here — inspect
 * the returned JSON or the Rust implementation for the schema.
 */
export function device_info(): string;
934
+
935
/**
 * Check if a model is cached in OPFS by name.
 *
 * Returns `false` if OPFS is unavailable or the model is not found
 * (it never rejects for a missing file).
 */
export function is_model_cached(model_name: string): Promise<boolean>;
941
+
942
/**
 * List all cached models with their sizes (in bytes).
 *
 * Returns a JSON-serialised array of objects: `[{name: string, size: number}, ...]`.
 * Returns `"[]"` if OPFS is unavailable or the models directory does not exist.
 *
 * NOTE(review): the docs describe a JSON string, but the generated binding
 * is typed `Promise<any>` — confirm whether the resolved value is a string
 * or an already-parsed array before use.
 */
export function list_cached_models(): Promise<any>;
949
+
950
/**
 * Load model bytes from OPFS.
 *
 * Returns `null` (JS) / `None` (Rust) if the model is not cached or OPFS is
 * unavailable.
 *
 * NOTE(review): the binding is typed `Promise<any>`; on success the resolved
 * value is presumably a `Uint8Array` of model bytes — confirm against the
 * Rust implementation.
 */
export function load_cached_model(model_name: string): Promise<any>;
957
+
958
/**
 * Set up better panic messages in the browser console.
 *
 * Call once after module init; safe to call before any other API.
 */
export function start(): void;
962
+
963
/**
 * Get storage usage and quota estimate.
 *
 * Returns a JSON string: `{usage: number, quota: number}`.
 * Returns `"{}"` if the Storage API is unavailable.
 *
 * NOTE(review): typed `Promise<any>` by the generated binding although the
 * docs describe a JSON string — confirm the resolved value's type.
 */
export function storage_estimate(): Promise<any>;
970
+
971
/**
 * Check if this WASM build was compiled with relaxed SIMD support.
 *
 * Relaxed SIMD provides hardware-specific faster operations like fused
 * multiply-add (`f32x4_relaxed_madd`) that map directly to ARM NEON and
 * x86 SSE/AVX FMA instructions. When enabled, matvec operations use FMA
 * for ~15-30% speedup.
 *
 * This is a compile-time feature: the WASM binary either includes relaxed
 * SIMD instructions or it does not. The browser validates them at module
 * load time, so if this module loaded successfully and this function
 * returns `true`, relaxed SIMD is active.
 */
export function supports_relaxed_simd(): boolean;
985
+
986
/**
 * Check if the browser exposes the Web Speech API for speech recognition.
 *
 * This probes `window.SpeechRecognition` and the WebKit-prefixed
 * `window.webkitSpeechRecognition`. Returning `true` means the demo voice
 * mode can capture microphone input and produce transcripts through the
 * platform speech engine. This is a foundation for the voice pipeline
 * (issue #395); a fully offline path will eventually run Whisper in WASM.
 */
export function supports_speech_recognition(): boolean;
996
+
997
/**
 * Check if the browser exposes the Web Speech API for speech synthesis.
 *
 * Returns `true` when `window.speechSynthesis` is available, enabling the
 * demo voice mode to speak model responses. A fully offline path will
 * eventually run a neural TTS model in WASM.
 */
export function supports_speech_synthesis(): boolean;
1005
+
1006
/**
 * Check if WebNN is available in the current browser.
 *
 * WebNN (`navigator.ml`) exposes neural-network acceleration through
 * platform NPUs/DSPs. This is a capability check only, so JS code can
 * decide whether to build a WebNN graph from exported weights.
 */
export function supports_webnn(): boolean;
1014
+
1015
/**
 * Check if WebTransport is available in the current browser.
 *
 * WebTransport (`window.WebTransport`) is a modern transport API built on
 * HTTP/3 QUIC streams. It allows opening multiple parallel bidirectional
 * streams to the same origin with lower head-of-line blocking than `fetch()`.
 * Useful for progressive model loading, where different byte ranges of the
 * GGUF file can be downloaded concurrently.
 *
 * Note: actually using WebTransport for parallel range downloads requires
 * server-side support (an HTTP/3 endpoint that accepts byte-range requests
 * on streams). This check only reports browser capability — the JS loader
 * will fall back to `fetch()` when the server does not cooperate.
 */
export function supports_webtransport(): boolean;
1030
+
1031
/**
 * Check if WebGPU is available in the current browser
 * (presumably probes `navigator.gpu` — confirm against implementation).
 */
export function webgpu_available(): boolean;
1035
+
1036
/** Anything `__wbg_init` accepts as a wasm module source. */
export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
1037
+
1038
/**
 * Raw exports of the instantiated wasm module, as returned by `initSync`
 * and `__wbg_init`.
 *
 * These are low-level wasm-bindgen ABI entry points (pointers and lengths
 * passed as numbers); application code should use the `FlareEngine`,
 * `FlareProgressiveLoader` and `FlareTokenizer` wrappers instead.
 */
export interface InitOutput {
  readonly memory: WebAssembly.Memory;
  // Destructors for the exported wrapper classes.
  readonly __wbg_flareengine_free: (a: number, b: number) => void;
  readonly __wbg_flareprogressiveloader_free: (a: number, b: number) => void;
  readonly __wbg_flaretokenizer_free: (a: number, b: number) => void;
  // Free functions (OPFS caching, capability probes, device info).
  readonly cache_model: (a: number, b: number, c: number, d: number) => any;
  readonly delete_cached_model: (a: number, b: number) => any;
  readonly device_info: () => [number, number];
  // FlareEngine methods (first arg is the instance pointer).
  readonly flareengine_add_bos_token: (a: number) => number;
  readonly flareengine_add_stop_sequence: (a: number, b: number, c: number) => void;
  readonly flareengine_apply_chat_template: (a: number, b: number, c: number, d: number, e: number) => [number, number];
  readonly flareengine_architecture: (a: number) => [number, number];
  readonly flareengine_begin_stream: (a: number, b: number, c: number, d: number) => void;
  readonly flareengine_begin_stream_healed: (a: number, b: number, c: number, d: number) => void;
  readonly flareengine_begin_stream_healed_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number) => void;
  readonly flareengine_begin_stream_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number) => void;
  readonly flareengine_bos_token_id: (a: number) => number;
  readonly flareengine_chat_template_name: (a: number) => [number, number];
  readonly flareengine_clear_raw_weights: (a: number) => void;
  readonly flareengine_clear_stop_sequences: (a: number) => void;
  readonly flareengine_compute_perplexity: (a: number, b: number, c: number) => number;
  readonly flareengine_context_window_pct: (a: number) => number;
  readonly flareengine_count_tokens: (a: number, b: number, c: number) => number;
  readonly flareengine_decode_ids: (a: number, b: number, c: number) => [number, number];
  readonly flareengine_decode_token: (a: number, b: number) => [number, number];
  readonly flareengine_decode_token_chunk: (a: number, b: number) => [number, number];
  readonly flareengine_embed_token: (a: number, b: number) => [number, number];
  readonly flareengine_encode_text: (a: number, b: number, c: number) => [number, number];
  readonly flareengine_eos_token_id: (a: number) => number;
  readonly flareengine_generate_stream: (a: number, b: number, c: number, d: number, e: any) => number;
  readonly flareengine_generate_stream_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number, j: any) => number;
  readonly flareengine_generate_text: (a: number, b: number, c: number, d: number) => [number, number];
  readonly flareengine_generate_text_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number) => [number, number];
  readonly flareengine_generate_tokens: (a: number, b: number, c: number, d: number) => [number, number];
  readonly flareengine_generate_with_params: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number) => [number, number];
  readonly flareengine_has_raw_weights: (a: number) => number;
  readonly flareengine_hidden_dim: (a: number) => number;
  readonly flareengine_init_gpu: (a: number) => any;
  readonly flareengine_init_gpu_with_cache: (a: number, b: number, c: number) => any;
  readonly flareengine_last_decode_ms: (a: number) => number;
  readonly flareengine_last_logits: (a: number) => [number, number];
  readonly flareengine_last_prefill_ms: (a: number) => number;
  readonly flareengine_last_tokens_generated: (a: number) => number;
  readonly flareengine_load: (a: number, b: number) => [number, number, number];
  readonly flareengine_load_raw_weights: (a: number, b: number, c: number) => number;
  readonly flareengine_max_seq_len: (a: number) => number;
  readonly flareengine_merge_lora: (a: number, b: number, c: number) => [number, number];
  readonly flareengine_merge_lora_with_alpha: (a: number, b: number, c: number, d: number) => [number, number];
  readonly flareengine_metadata_json: (a: number) => [number, number];
  readonly flareengine_model_name: (a: number) => [number, number];
  readonly flareengine_next_token: (a: number) => number;
  readonly flareengine_num_heads: (a: number) => number;
  readonly flareengine_num_layers: (a: number) => number;
  readonly flareengine_output_projection: (a: number, b: number, c: number) => [number, number];
  readonly flareengine_performance_summary: (a: number) => [number, number];
  readonly flareengine_pipeline_cache_data: (a: number) => [number, number];
  readonly flareengine_raw_chat_template: (a: number) => [number, number];
  readonly flareengine_repeat_last_n: (a: number) => number;
  readonly flareengine_reset: (a: number) => void;
  readonly flareengine_set_repeat_last_n: (a: number, b: number) => void;
  readonly flareengine_set_rng_seed: (a: number, b: number) => void;
  readonly flareengine_set_top_logprobs: (a: number, b: number) => void;
  readonly flareengine_stop_stream: (a: number) => void;
  readonly flareengine_stream_done: (a: number) => number;
  readonly flareengine_stream_stop_reason: (a: number) => [number, number];
  readonly flareengine_tokens_per_second: (a: number) => number;
  readonly flareengine_tokens_remaining: (a: number) => number;
  readonly flareengine_tokens_used: (a: number) => number;
  readonly flareengine_top_logprobs: (a: number) => [number, number];
  readonly flareengine_truncate_to_context: (a: number, b: number, c: number, d: number) => [number, number];
  readonly flareengine_vocab_size: (a: number) => number;
  readonly flareengine_warmup: (a: number) => number;
  // FlareProgressiveLoader methods.
  readonly flareprogressiveloader_load: (a: number, b: any) => any;
  readonly flareprogressiveloader_new: (a: number, b: number) => number;
  // FlareTokenizer methods.
  readonly flaretokenizer_decode: (a: number, b: number, c: number) => [number, number, number, number];
  readonly flaretokenizer_decode_one: (a: number, b: number) => [number, number, number, number];
  readonly flaretokenizer_encode: (a: number, b: number, c: number) => [number, number, number, number];
  readonly flaretokenizer_from_json: (a: number, b: number) => [number, number, number];
  readonly flaretokenizer_vocab_size: (a: number) => number;
  readonly is_model_cached: (a: number, b: number) => any;
  readonly list_cached_models: () => any;
  readonly load_cached_model: (a: number, b: number) => any;
  readonly storage_estimate: () => any;
  readonly supports_relaxed_simd: () => number;
  readonly supports_speech_recognition: () => number;
  readonly supports_speech_synthesis: () => number;
  readonly supports_webnn: () => number;
  readonly supports_webtransport: () => number;
  readonly webgpu_available: () => number;
  readonly start: () => void;
  readonly flaretokenizer_bos_token_id: (a: number) => number;
  readonly flaretokenizer_eos_token_id: (a: number) => number;
  // wasm-bindgen closure trampolines and allocator/externref plumbing.
  readonly wasm_bindgen__convert__closures_____invoke__h7ed8ea06cc0c8ca5: (a: number, b: number, c: any) => [number, number];
  readonly wasm_bindgen__convert__closures_____invoke__hcdfd434894ba1863: (a: number, b: number, c: any, d: any) => void;
  readonly wasm_bindgen__convert__closures_____invoke__h235e00bf230ad8a4: (a: number, b: number, c: any) => void;
  readonly __wbindgen_malloc: (a: number, b: number) => number;
  readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
  readonly __externref_table_alloc: () => number;
  readonly __wbindgen_externrefs: WebAssembly.Table;
  readonly __wbindgen_exn_store: (a: number) => void;
  readonly __wbindgen_free: (a: number, b: number, c: number) => void;
  readonly __wbindgen_destroy_closure: (a: number, b: number) => void;
  readonly __externref_table_dealloc: (a: number) => void;
  readonly __wbindgen_start: () => void;
}
1143
+
1144
/** Module sources accepted by the synchronous `initSync` (no fetching). */
export type SyncInitInput = BufferSource | WebAssembly.Module;
1145
+
1146
/**
 * Synchronously instantiates the given `module`, which can either be bytes
 * or a precompiled `WebAssembly.Module`.
 *
 * @param module - An object with a `module` property. Passing a
 * `SyncInitInput` directly is deprecated.
 *
 * @returns the raw wasm exports (`InitOutput`).
 */
export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
1155
+
1156
/**
 * Default async initializer. If `module_or_path` is a `RequestInfo` or
 * `URL`, makes a request; for everything else, calls
 * `WebAssembly.instantiate` directly.
 *
 * @param module_or_path - An object with a `module_or_path` property.
 * Passing an `InitInput` directly is deprecated.
 *
 * @returns a promise resolving to the raw wasm exports (`InitOutput`).
 */
export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;