@web-llm-wrappers/react 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1904 @@
1
+ import * as tvmjs from "@mlc-ai/web-runtime";
2
+ import { Tokenizer } from "@mlc-ai/web-tokenizers";
3
+
4
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/types.d.ts
5
+
6
+ /**
7
+ * Report during initialization.
8
+ */
9
+ interface InitProgressReport {
10
+ progress: number;
11
+ timeElapsed: number;
12
+ text: string;
13
+ }
14
+ /**
15
+ * Callbacks used to report initialization process.
16
+ */
17
+ type InitProgressCallback = (report: InitProgressReport) => void;
18
+ /**
19
+ * A stateful logitProcessor used to post-process logits after forwarding the input and before
20
+ * sampling the next token. If used with `GenerationConfig.logit_bias`, logit_bias is applied after
21
+ * `processLogits()` is called.
22
+ */
23
+ interface LogitProcessor {
24
+ /**
25
+ * Process logits after forward() and before sampling; this happens implicitly on the CPU.
26
+ * @param logits The logits right after forward().
27
+ * @returns The processed logits.
28
+ */
29
+ processLogits: (logits: Float32Array) => Float32Array;
30
+ /**
31
+ * Use the sampled token to update the LogitProcessor's internal state. Called implicitly
32
+ * right after the next token is sampled/committed.
33
+ * @param token Token sampled from the processed logits.
34
+ */
35
+ processSampledToken: (token: number) => void;
36
+ /**
37
+ * Called when in `MLCEngine.resetChat()`. Can clear internal states.
38
+ */
39
+ resetState: () => void;
40
+ }
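A minimal sketch of a custom LogitProcessor implementing the interface above, assuming it would be registered through `MLCEngineConfig.logitProcessorRegistry`; the class name and the banned token id are purely illustrative:

import type { LogitProcessor } from "@mlc-ai/web-llm";

// Illustrative processor that bans one token id by forcing its logit to -Infinity.
// Look up real token ids in the model's tokenizer.json; 16230 is just an example.
class BanTokenLogitProcessor implements LogitProcessor {
  private readonly bannedTokenId = 16230;

  processLogits(logits: Float32Array): Float32Array {
    logits[this.bannedTokenId] = Number.NEGATIVE_INFINITY;
    return logits;
  }

  processSampledToken(_token: number): void {
    // This example keeps no internal state, so there is nothing to update here.
  }

  resetState(): void {
    // Nothing to reset for this stateless example.
  }
}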
41
+ /**
42
+ * Common interface of MLCEngine that the UI can interact with
43
+ */
44
+ interface MLCEngineInterface {
45
+ /**
46
+ * An object that exposes chat-related APIs.
47
+ */
48
+ chat: Chat;
49
+ /**
50
+ * An object that exposes text completion APIs.
51
+ */
52
+ completions: Completions;
53
+ /**
54
+ * An object that exposes embeddings APIs.
55
+ */
56
+ embeddings: Embeddings;
57
+ /**
58
+ * Set an initialization progress callback function
59
+ * which reports the progress of model loading.
60
+ *
61
+ * This function can be useful to implement a UI that
62
+ * updates as we load the model.
63
+ *
64
+ * @param initProgressCallback The callback function
65
+ */
66
+ setInitProgressCallback: (initProgressCallback: InitProgressCallback) => void;
67
+ /**
68
+ * @returns The current initialization progress callback function.
69
+ */
70
+ getInitProgressCallback: () => InitProgressCallback | undefined;
71
+ /**
72
+ * Setter for the engine's appConfig.
73
+ */
74
+ setAppConfig: (appConfig: AppConfig) => void;
75
+ /**
76
+ * Reload the chat with a new model.
77
+ *
78
+ * @param modelId model_id of the model to load, either string or string[]. When multiple models
79
+ * are provided, we load all models sequentially. Each modelId needs to either be in
80
+ * `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.
81
+ * @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of `modelId`.
82
+ * If an array, its length needs to match that of `modelId`; chatOpts[i] will be used for modelId[i].
83
+ * @returns A promise when reload finishes.
84
+ * @throws Throws an error when the device is lost (mostly due to OOM); users should re-call reload(),
85
+ * potentially with a smaller model or smaller context window size.
86
+ * @note This is an async function.
87
+ */
88
+ reload: (modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]) => Promise<void>;
89
+ /**
90
+ * OpenAI-style API. Generate a chat completion response for the given conversation and
91
+ * configuration. Use `engine.chat.completions.create()` to invoke this API.
92
+ *
93
+ * @param request An OpenAI-style ChatCompletion request.
94
+ *
95
+ * @note The API is completely functional in behavior. That is, a previous request would not
96
+ * affect the current request's result. Thus, for multi-round chatting, users are responsible for
97
+ * maintaining the chat history. With that being said, as an implicit internal optimization, if we
98
+ * detect that the user is performing multi-round chatting, we will preserve the KV cache and only
99
+ * prefill the new tokens.
100
+ * @note For requests sent to the same modelId, this call will block until all previous requests finish.
101
+ * @note For more, see https://platform.openai.com/docs/api-reference/chat
102
+ */
103
+ chatCompletion(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
104
+ chatCompletion(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
105
+ chatCompletion(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
106
+ chatCompletion(request: ChatCompletionRequest): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
107
+ /**
108
+ * OpenAI-style API. Completes a CompletionCreateParams, a text completion with no chat template.
109
+ * Use `engine.completions.create()` to invoke this API.
110
+ *
111
+ * @param request An OpenAI-style Completion request.
112
+ *
113
+ * @note For requests sent to the same modelId, this call will block until all previous requests finish.
114
+ * @note For more, see https://platform.openai.com/docs/api-reference/completions
115
+ */
116
+ completion(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
117
+ completion(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
118
+ completion(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
119
+ completion(request: CompletionCreateParams): Promise<AsyncIterable<Completion> | Completion>;
120
+ /**
121
+ * OpenAI-style API. Creates an embedding vector representing the input text.
122
+ * Use `engine.embeddings.create()` to invoke this API.
123
+ *
124
+ * @param request An OpenAI-style Embeddings request.
125
+ *
126
+ * @note For requests sent to the same modelId, this call will block until all previous requests finish.
127
+ * @note For more, see https://platform.openai.com/docs/api-reference/embeddings/create
128
+ */
129
+ embedding(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
130
+ /**
131
+ * @returns A text summarizing the runtime stats.
132
+ * @param modelId Only required when multiple models are loaded.
133
+ * @note This is an async function
134
+ */
135
+ runtimeStatsText: (modelId?: string) => Promise<string>;
136
+ /**
137
+ * Interrupt the generate process if it is already running.
138
+ */
139
+ interruptGenerate: () => void;
140
+ /**
141
+ * Explicitly unload the currently loaded model(s) and release the related resources. Waits until
142
+ * the webgpu device finishes all submitted work and destroys itself.
143
+ * @note This is an asynchronous function.
144
+ */
145
+ unload: () => Promise<void>;
146
+ /**
147
+ * Reset the current chat session by clearing all memories.
148
+ * @param keepStats If true, do not reset the statistics.
149
+ * @param modelId Only required when multiple models are loaded.
150
+ */
151
+ resetChat: (keepStats?: boolean, modelId?: string) => Promise<void>;
152
+ /**
153
+ * Get the current generated response.
154
+ * @param modelId Only required when multiple models are loaded.
155
+ * @returns The current output message.
156
+ */
157
+ getMessage: (modelId?: string) => Promise<string>;
158
+ /**
159
+ * Returns the device's maxStorageBufferBindingSize, which can be used to guess whether the device
160
+ * has limited resources like an Android phone.
161
+ */
162
+ getMaxStorageBufferBindingSize(): Promise<number>;
163
+ /**
164
+ * Returns the device's GPU vendor (e.g. arm, qualcomm, apple) if available. Otherwise returns
165
+ * an empty string.
166
+ */
167
+ getGPUVendor(): Promise<string>;
168
+ /**
169
+ * Forward the given input tokens to the model, then sample the next token.
170
+ *
171
+ * This function has side effects as the model will update its KV cache.
172
+ *
173
+ * @param inputIds The input tokens.
174
+ * @param isPrefill True if prefill, false if decode; only used for statistics.
175
+ * @param modelId Only required when multiple models are loaded.
176
+ * @returns Next token sampled.
177
+ * @note This is an async function.
178
+ */
179
+ forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean, modelId?: string): Promise<number>;
180
+ /**
181
+ * Set MLCEngine logging output level
182
+ *
183
+ * @param logLevel The new log level
184
+ */
185
+ setLogLevel(logLevel: LogLevel): void;
186
+ }
187
+ declare const LOG_LEVELS: {
188
+ TRACE: number;
189
+ DEBUG: number;
190
+ INFO: number;
191
+ WARN: number;
192
+ ERROR: number;
193
+ SILENT: number;
194
+ };
195
+ type LogLevel = keyof typeof LOG_LEVELS;
196
+ //#endregion
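A usage sketch for the engine interface above, assuming the `CreateMLCEngine` factory exported by `@mlc-ai/web-llm`; the model id is illustrative and must exist in `prebuiltAppConfig` or a custom `AppConfig`:

import { CreateMLCEngine } from "@mlc-ai/web-llm";

async function loadEngine() {
  // Reports loading progress through the InitProgressCallback described above.
  const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct-q4f32_1-MLC", {
    initProgressCallback: (report) => {
      console.log(`${Math.round(report.progress * 100)}% - ${report.text}`);
    },
    logLevel: "INFO",
  });
  return engine;
}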
197
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/openai_api_protocols/chat_completion.d.ts
198
+ declare class Chat {
199
+ private engine;
200
+ completions: Completions$1;
201
+ constructor(engine: MLCEngineInterface);
202
+ }
203
+ declare class Completions$1 {
204
+ private engine;
205
+ constructor(engine: MLCEngineInterface);
206
+ create(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
207
+ create(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
208
+ create(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
209
+ }
210
+ /**
211
+ * OpenAI chat completion request protocol.
212
+ *
213
+ * API reference: https://platform.openai.com/docs/api-reference/chat/create
214
+ * Followed: https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts
215
+ *
216
+ * @note `model` is excluded. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` explicitly before calling this API.
217
+ */
218
+ interface ChatCompletionRequestBase {
219
+ /**
220
+ * A list of messages comprising the conversation so far.
221
+ */
222
+ messages: Array<ChatCompletionMessageParam>;
223
+ /**
224
+ * If set, partial message deltas will be sent. It will be terminated by an empty chunk.
225
+ */
226
+ stream?: boolean | null;
227
+ /**
228
+ * Options for streaming response. Only set this when you set `stream: true`.
229
+ */
230
+ stream_options?: ChatCompletionStreamOptions | null;
231
+ /**
232
+ * How many chat completion choices to generate for each input message.
233
+ */
234
+ n?: number | null;
235
+ /**
236
+ * Number between -2.0 and 2.0. Positive values penalize new tokens based on their
237
+ * existing frequency in the text so far, decreasing the model's likelihood to
238
+ * repeat the same line verbatim.
239
+ *
240
+ * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
241
+ */
242
+ frequency_penalty?: number | null;
243
+ /**
244
+ * Number between -2.0 and 2.0. Positive values penalize new tokens based on
245
+ * whether they appear in the text so far, increasing the model's likelihood to
246
+ * talk about new topics.
247
+ *
248
+ * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
249
+ */
250
+ presence_penalty?: number | null;
251
+ /**
252
+ * The maximum number of [tokens](/tokenizer) that can be generated in the chat
253
+ * completion.
254
+ *
255
+ * The total length of input tokens and generated tokens is limited by the model's
256
+ * context length.
257
+ */
258
+ max_tokens?: number | null;
259
+ /**
260
+ * Sequences where the API will stop generating further tokens.
261
+ */
262
+ stop?: string | null | Array<string>;
263
+ /**
264
+ * What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
265
+ * make the output more random, while lower values like 0.2 will make it more
266
+ * focused and deterministic.
267
+ */
268
+ temperature?: number | null;
269
+ /**
270
+ * An alternative to sampling with temperature, called nucleus sampling, where the
271
+ * model considers the results of the tokens with top_p probability mass. So 0.1
272
+ * means only the tokens comprising the top 10% probability mass are considered.
273
+ */
274
+ top_p?: number | null;
275
+ /**
276
+ * Modify the likelihood of specified tokens appearing in the completion.
277
+ *
278
+ * Accepts a JSON object that maps tokens (specified by their token ID, which varies per model)
279
+ * to an associated bias value from -100 to 100. Typically, you can check the model's
280
+ * `tokenizer.json` to see which token ID maps to which string. Mathematically, the bias is added to the
281
+ * logits generated by the model prior to sampling. The exact effect will vary per model, but
282
+ * values between -1 and 1 should decrease or increase likelihood of selection; values like -100
283
+ * or 100 should result in a ban or exclusive selection of the relevant token.
284
+ *
285
+ * As an example, you can pass `{"16230": -100}` to prevent the `Hello` token from being
286
+ * generated in Mistral-7B-Instruct-v0.2, according to the mapping in
287
+ * https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/raw/main/tokenizer.json.
288
+ *
289
+ * @note For stateful and customizable / flexible logit processing, see `webllm.LogitProcessor`.
290
+ * @note If used in combination with `webllm.LogitProcessor`, `logit_bias` is applied after
291
+ * `LogitProcessor.processLogits()` is called.
292
+ */
293
+ logit_bias?: Record<string, number> | null;
294
+ /**
295
+ * Whether to return log probabilities of the output tokens or not.
296
+ *
297
+ * If true, returns the log probabilities of each output token returned in the `content` of
298
+ * `message`.
299
+ */
300
+ logprobs?: boolean | null;
301
+ /**
302
+ * An integer between 0 and 5 specifying the number of most likely tokens to return
303
+ * at each token position, each with an associated log probability. `logprobs` must
304
+ * be set to `true` if this parameter is used.
305
+ */
306
+ top_logprobs?: number | null;
307
+ /**
308
+ * If specified, our system will make a best effort to sample deterministically, such that
309
+ * repeated requests with the same `seed` and parameters should return the same result.
310
+ *
311
+ * @note Seeding is done on a request-level rather than choice-level. That is, if `n > 1`, you
312
+ * would still get different content for each `Choice`. But if two requests with `n = 2` are
313
+ * processed with the same seed, the two results should be the same (though the two choices within each request still differ).
314
+ */
315
+ seed?: number | null;
316
+ /**
317
+ * Controls which (if any) function is called by the model. `none` means the model
318
+ * will not call a function and instead generates a message. `auto` means the model
319
+ * can pick between generating a message or calling a function. Specifying a
320
+ * particular function via
321
+ * `{"type": "function", "function": {"name": "my_function"}}` forces the model to
322
+ * call that function.
323
+ *
324
+ * `none` is the default when no functions are present. `auto` is the default if
325
+ * functions are present.
326
+ */
327
+ tool_choice?: ChatCompletionToolChoiceOption;
328
+ /**
329
+ * A list of tools the model may call. Currently, only functions are supported as a
330
+ * tool. Use this to provide a list of functions the model may generate JSON inputs
331
+ * for.
332
+ *
333
+ * The corresponding reply would populate the `tool_calls` field. If used with streaming,
334
+ * the last chunk would contain the `tool_calls` field, while the intermediate chunks would
335
+ * contain the raw string.
336
+ *
337
+ * If the generation terminates due to a FinishReason other than "stop" (i.e. "length" or "abort"),
338
+ * then no `tool_calls` will be returned. The user can still get the raw string output.
339
+ */
340
+ tools?: Array<ChatCompletionTool>;
341
+ /**
342
+ * An object specifying the format that the model must output.
343
+ *
344
+ * Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
345
+ * message the model generates is valid JSON.
346
+ *
347
+ * **Important:** when using JSON mode, you **must** also instruct the model to
348
+ * produce JSON yourself via a system or user message. Without this, the model may
349
+ * generate an unending stream of whitespace until the generation reaches the token
350
+ * limit, resulting in a long-running and seemingly "stuck" request. Also note that
351
+ * the message content may be partially cut off if `finish_reason="length"`, which
352
+ * indicates the generation exceeded `max_tokens` or the conversation exceeded the
353
+ * max context length.
354
+ */
355
+ response_format?: ResponseFormat;
356
+ /**
357
+ * If true, will ignore stop string and stop token and generate until max_tokens hit.
358
+ * If unset, will treat as false.
359
+ */
360
+ ignore_eos?: boolean;
361
+ /**
362
+ * ID of the model to use. This is equal to `ModelRecord.model_id`, which needs to either be in
363
+ * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.
364
+ *
365
+ * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.
366
+ * @note If only one model is loaded in the engine, this field is optional. If multiple models
367
+ * are loaded, this is required.
368
+ */
369
+ model?: string | null;
370
+ /**
371
+ * Fields specific to WebLLM, not present in OpenAI.
372
+ */
373
+ extra_body?: {
374
+ /**
375
+ * If set to false, prepends a "<think>\n\n</think>\n\n" to the response, preventing the
376
+ * model from generating thinking tokens. If set to true or undefined, does nothing.
377
+ *
378
+ * @note Currently only allowed to be used for Qwen3 models, though not explicitly checked.
379
+ */
380
+ enable_thinking?: boolean | null;
381
+ };
382
+ }
383
+ interface ChatCompletionRequestNonStreaming extends ChatCompletionRequestBase {
384
+ /**
385
+ * If set, partial message deltas will be sent. It will be terminated by an empty chunk.
386
+ */
387
+ stream?: false | null;
388
+ }
389
+ interface ChatCompletionRequestStreaming extends ChatCompletionRequestBase {
390
+ /**
391
+ * If set, partial message deltas will be sent. It will be terminated by an empty chunk.
392
+ */
393
+ stream: true;
394
+ }
395
+ type ChatCompletionRequest = ChatCompletionRequestNonStreaming | ChatCompletionRequestStreaming;
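A streaming sketch against the request types above, assuming `engine` is an already-loaded `MLCEngineInterface`:

// Stream deltas and read token usage from the final chunk.
const chunks = await engine.chat.completions.create({
  messages: [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "Explain WebGPU in one sentence." },
  ],
  stream: true,
  stream_options: { include_usage: true },
  temperature: 0.7,
});

let reply = "";
for await (const chunk of chunks) {
  reply += chunk.choices[0]?.delta.content ?? "";
  if (chunk.usage) {
    console.log("prompt/completion tokens:", chunk.usage.prompt_tokens, chunk.usage.completion_tokens);
  }
}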
396
+ /**
397
+ * Represents a chat completion response returned by model, based on the provided input.
398
+ */
399
+ interface ChatCompletion {
400
+ /**
401
+ * A unique identifier for the chat completion.
402
+ */
403
+ id: string;
404
+ /**
405
+ * A list of chat completion choices. Can be more than one if `n` is greater than 1.
406
+ */
407
+ choices: Array<ChatCompletion.Choice>;
408
+ /**
409
+ * The model used for the chat completion.
410
+ */
411
+ model: string;
412
+ /**
413
+ * The object type, which is always `chat.completion`.
414
+ */
415
+ object: "chat.completion";
416
+ /**
417
+ * The Unix timestamp (in seconds) of when the chat completion was created.
418
+ *
419
+ */
420
+ created: number;
421
+ /**
422
+ * Usage statistics for the completion request.
423
+ *
424
+ * @note If we detect that the user is performing multi-round chatting, only the new portion of the
425
+ * prompt is counted for prompt_tokens. If `n > 1`, all choices' generation usages are combined.
426
+ */
427
+ usage?: CompletionUsage;
428
+ /**
429
+ * This fingerprint represents the backend configuration that the model runs with.
430
+ *
431
+ * Can be used in conjunction with the `seed` request parameter to understand when
432
+ * backend changes have been made that might impact determinism.
433
+ *
434
+ * @note Not supported yet.
435
+ */
436
+ system_fingerprint?: string;
437
+ }
438
+ /**
439
+ * Represents a streamed chunk of a chat completion response returned by model,
440
+ * based on the provided input.
441
+ */
442
+ interface ChatCompletionChunk {
443
+ /**
444
+ * A unique identifier for the chat completion. Each chunk has the same ID.
445
+ */
446
+ id: string;
447
+ /**
448
+ * A list of chat completion choices. Can contain more than one element if `n` is
449
+ * greater than 1. Can also be empty for the last chunk if you set
450
+ * `stream_options: {"include_usage": true}`.
451
+ */
452
+ choices: Array<ChatCompletionChunk.Choice>;
453
+ /**
454
+ * The Unix timestamp (in seconds) of when the chat completion was created. Each
455
+ * chunk has the same timestamp.
456
+ */
457
+ created: number;
458
+ /**
459
+ * The model to generate the completion.
460
+ */
461
+ model: string;
462
+ /**
463
+ * The object type, which is always `chat.completion.chunk`.
464
+ */
465
+ object: "chat.completion.chunk";
466
+ /**
467
+ * This fingerprint represents the backend configuration that the model runs with.
468
+ * Can be used in conjunction with the `seed` request parameter to understand when
469
+ * backend changes have been made that might impact determinism.
470
+ *
471
+ * @note Not supported yet.
472
+ */
473
+ system_fingerprint?: string;
474
+ /**
475
+ * An optional field that will only be present when you set
476
+ * `stream_options: {"include_usage": true}` in your request. When present, it
477
+ * contains a null value except for the last chunk which contains the token usage
478
+ * statistics for the entire request.
479
+ */
480
+ usage?: CompletionUsage;
481
+ }
482
+ type ChatCompletionContentPart = ChatCompletionContentPartText | ChatCompletionContentPartImage;
483
+ interface ChatCompletionContentPartText {
484
+ /**
485
+ * The text content.
486
+ */
487
+ text: string;
488
+ /**
489
+ * The type of the content part.
490
+ */
491
+ type: "text";
492
+ }
493
+ declare namespace ChatCompletionContentPartImage {
494
+ interface ImageURL {
495
+ /**
496
+ * Either a URL of the image or the base64 encoded image data.
497
+ */
498
+ url: string;
499
+ /**
500
+ * Specifies the detail level of the image.
501
+ */
502
+ detail?: "auto" | "low" | "high";
503
+ }
504
+ }
505
+ interface ChatCompletionContentPartImage {
506
+ image_url: ChatCompletionContentPartImage.ImageURL;
507
+ /**
508
+ * The type of the content part.
509
+ */
510
+ type: "image_url";
511
+ }
512
+ interface ChatCompletionMessageToolCall {
513
+ /**
514
+ * The ID of the tool call. In WebLLM, it is used as the index of the tool call among all
515
+ * the tool calls in this request generation.
516
+ */
517
+ id: string;
518
+ /**
519
+ * The function that the model called.
520
+ */
521
+ function: ChatCompletionMessageToolCall.Function;
522
+ /**
523
+ * The type of the tool. Currently, only `function` is supported.
524
+ */
525
+ type: "function";
526
+ }
527
+ declare namespace ChatCompletionMessageToolCall {
528
+ /**
529
+ * The function that the model called.
530
+ */
531
+ interface Function {
532
+ /**
533
+ * The arguments to call the function with, as generated by the model in JSON
534
+ * format.
535
+ */
536
+ arguments: string;
537
+ /**
538
+ * The name of the function to call.
539
+ */
540
+ name: string;
541
+ }
542
+ }
543
+ /**
544
+ * Options for streaming response. Only set this when you set `stream: true`.
545
+ */
546
+ interface ChatCompletionStreamOptions {
547
+ /**
548
+ * If set, an additional chunk will be streamed after the last empty chunk.
549
+ * The `usage` field on this chunk shows the token usage statistics for the entire
550
+ * request, and the `choices` field will always be an empty array. All other chunks
551
+ * will also include a `usage` field, but with a null value.
552
+ */
553
+ include_usage?: boolean;
554
+ }
555
+ interface ChatCompletionSystemMessageParam {
556
+ /**
557
+ * The contents of the system message.
558
+ */
559
+ content: string;
560
+ /**
561
+ * The role of the messages author, in this case `system`.
562
+ */
563
+ role: "system";
564
+ }
565
+ interface ChatCompletionUserMessageParam {
566
+ /**
567
+ * The contents of the user message.
568
+ */
569
+ content: string | Array<ChatCompletionContentPart>;
570
+ /**
571
+ * The role of the messages author, in this case `user`.
572
+ */
573
+ role: "user";
574
+ /**
575
+ * An optional name for the participant. Provides the model information to
576
+ * differentiate between participants of the same role.
577
+ *
578
+ * @note This is experimental, as models typically have predefined names for the user.
579
+ */
580
+ name?: string;
581
+ }
582
+ interface ChatCompletionAssistantMessageParam {
583
+ /**
584
+ * The role of the messages author, in this case `assistant`.
585
+ */
586
+ role: "assistant";
587
+ /**
588
+ * The contents of the assistant message. Required unless `tool_calls` is specified.
589
+ */
590
+ content?: string | null;
591
+ /**
592
+ * An optional name for the participant. Provides the model information to
593
+ * differentiate between participants of the same role.
594
+ *
595
+ * @note This is experimental, as models typically have predefined names for the user.
596
+ */
597
+ name?: string;
598
+ /**
599
+ * The tool calls generated by the model, such as function calls.
600
+ */
601
+ tool_calls?: Array<ChatCompletionMessageToolCall>;
602
+ }
603
+ interface ChatCompletionToolMessageParam {
604
+ /**
605
+ * The contents of the tool message.
606
+ */
607
+ content: string;
608
+ /**
609
+ * The role of the messages author, in this case `tool`.
610
+ */
611
+ role: "tool";
612
+ /**
613
+ * Tool call that this message is responding to.
614
+ */
615
+ tool_call_id: string;
616
+ }
617
+ type ChatCompletionMessageParam = ChatCompletionSystemMessageParam | ChatCompletionUserMessageParam | ChatCompletionAssistantMessageParam | ChatCompletionToolMessageParam;
618
+ /**
619
+ * The parameters the function accepts, described as a JSON Schema object. See the
620
+ * [guide](https://platform.openai.com/docs/guides/text-generation/function-calling)
621
+ * for examples, and the
622
+ * [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for
623
+ * documentation about the format.
624
+ *
625
+ * Omitting `parameters` defines a function with an empty parameter list.
626
+ */
627
+ type FunctionParameters = Record<string, unknown>;
628
+ interface FunctionDefinition {
629
+ /**
630
+ * The name of the function to be called. Must be a-z, A-Z, 0-9, or contain
631
+ * underscores and dashes, with a maximum length of 64.
632
+ */
633
+ name: string;
634
+ /**
635
+ * A description of what the function does, used by the model to choose when and
636
+ * how to call the function.
637
+ */
638
+ description?: string;
639
+ /**
640
+ * The parameters the function accepts, described as a JSON Schema object. See the
641
+ * [guide](https://platform.openai.com/docs/guides/text-generation/function-calling)
642
+ * for examples, and the
643
+ * [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for
644
+ * documentation about the format.
645
+ *
646
+ * Omitting `parameters` defines a function with an empty parameter list.
647
+ */
648
+ parameters?: FunctionParameters;
649
+ }
650
+ interface ChatCompletionTool {
651
+ function: FunctionDefinition;
652
+ /**
653
+ * The type of the tool. Currently, only `function` is supported.
654
+ */
655
+ type: "function";
656
+ }
657
+ /**
658
+ * Specifies a tool the model should use. Use to force the model to call a specific
659
+ * function.
660
+ */
661
+ interface ChatCompletionNamedToolChoice {
662
+ function: ChatCompletionNamedToolChoice.Function;
663
+ /**
664
+ * The type of the tool. Currently, only `function` is supported.
665
+ */
666
+ type: "function";
667
+ }
668
+ declare namespace ChatCompletionNamedToolChoice {
669
+ interface Function {
670
+ /**
671
+ * The name of the function to call.
672
+ */
673
+ name: string;
674
+ }
675
+ }
676
+ /**
677
+ * Controls which (if any) function is called by the model. `none` means the model
678
+ * will not call a function and instead generates a message. `auto` means the model
679
+ * can pick between generating a message or calling a function. Specifying a
680
+ * particular function via
681
+ * `{"type": "function", "function": {"name": "my_function"}}` forces the model to
682
+ * call that function.
683
+ *
684
+ * `none` is the default when no functions are present. `auto` is the default if
685
+ * functions are present.
686
+ */
687
+ type ChatCompletionToolChoiceOption = "none" | "auto" | ChatCompletionNamedToolChoice;
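A tool-calling sketch against the types above; the `get_weather` tool and its schema are hypothetical, and `engine` is assumed to be an already-loaded `MLCEngineInterface`:

const toolResponse = await engine.chat.completions.create({
  messages: [{ role: "user", content: "What is the weather in Paris?" }],
  tools: [
    {
      type: "function",
      function: {
        name: "get_weather",
        description: "Get the current weather for a city.",
        parameters: {
          type: "object",
          properties: { city: { type: "string" } },
          required: ["city"],
        },
      },
    },
  ],
  // Force the model to call get_weather rather than answer directly.
  tool_choice: { type: "function", function: { name: "get_weather" } },
});

for (const call of toolResponse.choices[0].message.tool_calls ?? []) {
  // `arguments` is a model-generated JSON string; validate it before use.
  console.log(call.function.name, JSON.parse(call.function.arguments));
}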
688
+ interface TopLogprob {
689
+ /**
690
+ * The token.
691
+ */
692
+ token: string;
693
+ /**
694
+ * A list of integers representing the UTF-8 bytes representation of the token.
695
+ * Useful in instances where characters are represented by multiple tokens and
696
+ * their byte representations must be combined to generate the correct text
697
+ * representation. Can be `null` if there is no bytes representation for the token.
698
+ *
699
+ * @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`.
700
+ * For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode.
701
+ */
702
+ bytes: Array<number> | null;
703
+ /**
704
+ * The log probability of this token.
705
+ */
706
+ logprob: number;
707
+ }
708
+ interface ChatCompletionTokenLogprob {
709
+ /**
710
+ * The token.
711
+ */
712
+ token: string;
713
+ /**
714
+ * A list of integers representing the UTF-8 bytes representation of the token.
715
+ * Useful in instances where characters are represented by multiple tokens and
716
+ * their byte representations must be combined to generate the correct text
717
+ * representation. Can be `null` if there is no bytes representation for the token.
718
+ *
719
+ * @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`.
720
+ * For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode.
721
+ */
722
+ bytes: Array<number> | null;
723
+ /**
724
+ * The log probability of this token.
725
+ */
726
+ logprob: number;
727
+ /**
728
+ * List of the most likely tokens and their log probability, at this token
729
+ * position. In rare cases, there may be fewer than the number of requested
730
+ * `top_logprobs` returned.
731
+ */
732
+ top_logprobs: Array<TopLogprob>;
733
+ }
734
+ /**
735
+ * A chat completion message generated by the model.
736
+ */
737
+ interface ChatCompletionMessage {
738
+ /**
739
+ * The contents of the message.
740
+ */
741
+ content: string | null;
742
+ /**
743
+ * The role of the author of this message.
744
+ */
745
+ role: "assistant";
746
+ /**
747
+ * The tool calls generated by the model, such as function calls.
748
+ */
749
+ tool_calls?: Array<ChatCompletionMessageToolCall>;
750
+ }
751
+ /**
752
+ * Usage statistics for the completion request.
753
+ */
754
+ interface CompletionUsage {
755
+ /**
756
+ * Number of tokens in the generated completion.
757
+ */
758
+ completion_tokens: number;
759
+ /**
760
+ * Number of tokens in the prompt.
761
+ *
762
+ * @note If we detect that the user is performing multi-round chatting, only the new portion of the
763
+ * prompt is counted for prompt_tokens.
764
+ */
765
+ prompt_tokens: number;
766
+ /**
767
+ * Total number of tokens used in the request (prompt + completion).
768
+ */
769
+ total_tokens: number;
770
+ /**
771
+ * Fields specific to WebLLM, not present in OpenAI.
772
+ */
773
+ extra: {
774
+ /**
775
+ * Total seconds spent on this request, from receiving the request, to generating the response.
776
+ */
777
+ e2e_latency_s: number;
778
+ /**
779
+ * Number of tokens per second for prefilling.
780
+ */
781
+ prefill_tokens_per_s: number;
782
+ /**
783
+ * Number of tokens per second for autoregressive decoding.
784
+ */
785
+ decode_tokens_per_s: number;
786
+ /**
787
+ * Seconds spent to generate the first token since receiving the request. Mainly contains
788
+ * prefilling overhead. If n > 1, it is the sum over all choices.
789
+ */
790
+ time_to_first_token_s: number;
791
+ /**
792
+ * Seconds in between generated tokens. Mainly contains decoding overhead. If n > 1, it
793
+ * is the average over all choices.
794
+ */
795
+ time_per_output_token_s: number;
796
+ /**
797
+ * Seconds spent on initializing the grammar matcher for structured output. If n > 1, it
798
+ * is the sum over all choices.
799
+ */
800
+ grammar_init_s?: number;
801
+ /**
802
+ * Seconds per token that the grammar matcher spent on creating the bitmask and accepting tokens for
803
+ * structured output. If n > 1, it is the average over all choices.
804
+ */
805
+ grammar_per_token_s?: number;
806
+ };
807
+ }
808
+ /**
809
+ * The reason the model stopped generating tokens. This will be `stop` if the model
810
+ * hit a natural stop point or a provided stop sequence, `length` if the maximum
811
+ * number of tokens specified in the request was reached or the context_window_size would
812
+ * be exceeded, `tool_calls` if the model called a tool, or `abort` if the user manually stops the
813
+ * generation.
814
+ */
815
+ type ChatCompletionFinishReason = "stop" | "length" | "tool_calls" | "abort";
816
+ declare namespace ChatCompletion {
817
+ interface Choice {
818
+ /**
819
+ * The reason the model stopped generating tokens. This will be `stop` if the model
820
+ * hit a natural stop point or a provided stop sequence, `length` if the maximum
821
+ * number of tokens specified in the request was reached, `tool_calls` if the
822
+ * model called a tool, or `abort` if the user manually stops the generation.
823
+ */
824
+ finish_reason: ChatCompletionFinishReason;
825
+ /**
826
+ * The index of the choice in the list of choices.
827
+ */
828
+ index: number;
829
+ /**
830
+ * Log probability information for the choice.
831
+ */
832
+ logprobs: Choice.Logprobs | null;
833
+ /**
834
+ * A chat completion message generated by the model.
835
+ */
836
+ message: ChatCompletionMessage;
837
+ }
838
+ namespace Choice {
839
+ /**
840
+ * Log probability information for the choice.
841
+ */
842
+ interface Logprobs {
843
+ /**
844
+ * A list of message content tokens with log probability information.
845
+ */
846
+ content: Array<ChatCompletionTokenLogprob> | null;
847
+ }
848
+ }
849
+ }
850
+ declare namespace ChatCompletionChunk {
851
+ interface Choice {
852
+ /**
853
+ * A chat completion delta generated by streamed model responses.
854
+ */
855
+ delta: Choice.Delta;
856
+ /**
857
+ * The reason the model stopped generating tokens. This will be `stop` if the model
858
+ * hit a natural stop point or a provided stop sequence, `length` if the maximum
859
+ * number of tokens specified in the request was reached, `tool_calls` if the
860
+ * model called a tool, or `abort` if the user manually stops the generation.
861
+ */
862
+ finish_reason: ChatCompletionFinishReason | null;
863
+ /**
864
+ * The index of the choice in the list of choices.
865
+ */
866
+ index: number;
867
+ /**
868
+ * Log probability information for the choice.
869
+ */
870
+ logprobs?: Choice.Logprobs | null;
871
+ }
872
+ namespace Choice {
873
+ /**
874
+ * A chat completion delta generated by streamed model responses.
875
+ */
876
+ interface Delta {
877
+ /**
878
+ * The contents of the chunk message.
879
+ */
880
+ content?: string | null;
881
+ /**
882
+ * The role of the author of this message.
883
+ */
884
+ role?: "system" | "user" | "assistant" | "tool";
885
+ tool_calls?: Array<Delta.ToolCall>;
886
+ }
887
+ namespace Delta {
888
+ interface ToolCall {
889
+ /**
890
+ * The index of the tool call among all the tool calls in this request generation.
891
+ */
892
+ index: number;
893
+ /**
894
+ * The ID of the tool call. Not used in WebLLM.
895
+ */
896
+ id?: string;
897
+ function?: ToolCall.Function;
898
+ /**
899
+ * The type of the tool. Currently, only `function` is supported.
900
+ */
901
+ type?: "function";
902
+ }
903
+ namespace ToolCall {
904
+ interface Function {
905
+ /**
906
+ * The arguments to call the function with, as generated by the model in JSON
907
+ * format. Note that the model does not always generate valid JSON, and may
908
+ * hallucinate parameters not defined by your function schema. Validate the
909
+ * arguments in your code before calling your function.
910
+ */
911
+ arguments?: string;
912
+ /**
913
+ * The name of the function to call.
914
+ */
915
+ name?: string;
916
+ }
917
+ }
918
+ }
919
+ /**
920
+ * Log probability information for the choice.
921
+ */
922
+ interface Logprobs {
923
+ /**
924
+ * A list of message content tokens with log probability information.
925
+ */
926
+ content: Array<ChatCompletionTokenLogprob> | null;
927
+ }
928
+ }
929
+ }
930
+ /**
931
+ * An object specifying the format that the model must output.
932
+ *
933
+ * Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
934
+ * message the model generates is valid JSON.
935
+ *
936
+ * Setting to `{ "type": "grammar" }` requires you to also specify the `grammar` field, which
937
+ * is a BNFGrammar string.
938
+ *
939
+ * Setting `schema` specifies the output format of the JSON object, such as which properties to include.
940
+ *
941
+ * **Important:** when using JSON mode, you **must** also instruct the model to produce JSON
942
+ * following the schema (if specified) yourself via a system or user message. Without this,
943
+ * the model may generate an unending stream of whitespace until the generation reaches the token
944
+ * limit, resulting in a long-running and seemingly "stuck" request. Also note that
945
+ * the message content may be partially cut off if `finish_reason="length"`, which
946
+ * indicates the generation exceeded `max_tokens` or the conversation exceeded the
947
+ * max context length.
948
+ */
949
+ interface ResponseFormat {
950
+ /**
951
+ * Must be one of `text`, `json_object`, or `grammar`.
952
+ */
953
+ type?: "text" | "json_object" | "grammar";
954
+ /**
955
+ * A schema string in the format of the schema of a JSON file. `type` needs to be `json_object`.
956
+ */
957
+ schema?: string;
958
+ /**
959
+ * An EBNF-formatted string. Needs to be specified when, and only specified when,
960
+ * `type` is `grammar`. The grammar will be normalized (simplified) by default.
961
+ * EBNF grammar: see https://www.w3.org/TR/xml/#sec-notation. Note:
962
+ *   1. Use # as the comment mark
963
+ *   2. Use C-style unicode escape sequences \u01AB, \U000001AB, \xAB
964
+ *   3. A-B (match A and not match B) is not supported yet
965
+ *   4. Lookahead assertions can be added at the end of a rule to speed up matching. E.g.
966
+ *   ```
967
+ *   main ::= "ab" a [a-z]
968
+ *   a ::= "cd" (=[a-z])
969
+ *   ```
970
+ *   The assertion (=[a-z]) means a must be followed by [a-z].
971
+ */
972
+ grammar?: string;
973
+ }
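A JSON-mode sketch for the `ResponseFormat` above; note that the prompt itself also instructs the model to answer in JSON, as the documentation requires, and `engine` is assumed to be loaded:

const jsonResponse = await engine.chat.completions.create({
  messages: [
    {
      role: "system",
      content: 'Answer with a JSON object of the form {"city": string, "country": string}.',
    },
    { role: "user", content: "Where is the Eiffel Tower located?" },
  ],
  response_format: {
    type: "json_object",
    // The schema is passed as a string, per the field documentation above.
    schema: JSON.stringify({
      type: "object",
      properties: { city: { type: "string" }, country: { type: "string" } },
      required: ["city", "country"],
    }),
  },
});
console.log(JSON.parse(jsonResponse.choices[0].message.content ?? "{}"));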
974
+ //#endregion
975
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/openai_api_protocols/completion.d.ts
976
+ declare class Completions {
977
+ private engine;
978
+ constructor(engine: MLCEngineInterface);
979
+ create(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
980
+ create(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
981
+ create(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
982
+ }
983
+ /**
984
+ * OpenAI completion request protocol.
985
+ *
986
+ * API reference: https://platform.openai.com/docs/api-reference/completions/create
987
+ * Followed: https://github.com/openai/openai-node/blob/master/src/resources/completions.ts
988
+ *
989
+ * @note `model` is excluded. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` explicitly before calling this API.
990
+ */
991
+ interface CompletionCreateParamsBase {
992
+ /**
993
+ * The prompt to generate a completion for, encoded as a string.
994
+ */
995
+ prompt: string;
996
+ /**
997
+ * Echo back the prompt in addition to the completion
998
+ */
999
+ echo?: boolean | null;
1000
+ /**
1001
+ * Number between -2.0 and 2.0. Positive values penalize new tokens based on their
1002
+ * existing frequency in the text so far, decreasing the model's likelihood to
1003
+ * repeat the same line verbatim.
1004
+ *
1005
+ * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
1006
+ */
1007
+ frequency_penalty?: number | null;
1008
+ /**
1009
+ * Modify the likelihood of specified tokens appearing in the completion.
1010
+ *
1011
+ * Accepts a JSON object that maps tokens (specified by their token ID, which varies per model)
1012
+ * to an associated bias value from -100 to 100. Typically, you can check the model's
1013
+ * `tokenizer.json` to see which token ID maps to which string. Mathematically, the bias is added to the
1014
+ * logits generated by the model prior to sampling. The exact effect will vary per model, but
1015
+ * values between -1 and 1 should decrease or increase likelihood of selection; values like -100
1016
+ * or 100 should result in a ban or exclusive selection of the relevant token.
1017
+ *
1018
+ * As an example, you can pass `{"16230": -100}` to prevent the `Hello` token from being
1019
+ * generated in Mistral-7B-Instruct-v0.2, according to the mapping in
1020
+ * https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/raw/main/tokenizer.json.
1021
+ *
1022
+ * @note For stateful and customizable / flexible logit processing, see `webllm.LogitProcessor`.
1023
+ * @note If used in combination with `webllm.LogitProcessor`, `logit_bias` is applied after
1024
+ * `LogitProcessor.processLogits()` is called.
1025
+ */
1026
+ logit_bias?: Record<string, number> | null;
1027
+ /**
1028
+ * Whether to return log probabilities of the output tokens or not.
1029
+ *
1030
+ * If true, returns the log probabilities of each output token returned in the `content` of
1031
+ * `message`.
1032
+ */
1033
+ logprobs?: boolean | null;
1034
+ /**
1035
+ * An integer between 0 and 5 specifying the number of most likely tokens to return
1036
+ * at each token position, each with an associated log probability. `logprobs` must
1037
+ * be set to `true` if this parameter is used.
1038
+ */
1039
+ top_logprobs?: number | null;
1040
+ /**
1041
+ * The maximum number of [tokens](/tokenizer) that can be generated in the
1042
+ * completion.
1043
+ *
1044
+ * The total length of input tokens and generated tokens is limited by the model's
1045
+ * context length.
1046
+ */
1047
+ max_tokens?: number | null;
1048
+ /**
1049
+ * How many completions to generate for each prompt.
1050
+ */
1051
+ n?: number | null;
1052
+ /**
1053
+ * Number between -2.0 and 2.0. Positive values penalize new tokens based on
1054
+ * whether they appear in the text so far, increasing the model's likelihood to
1055
+ * talk about new topics.
1056
+ *
1057
+ * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
1058
+ */
1059
+ presence_penalty?: number | null;
1060
+ /**
1061
+ * If specified, our system will make a best effort to sample deterministically,
1062
+ * such that repeated requests with the same `seed` and parameters should return
1063
+ * the same result.
1064
+ *
1065
+ * @note Seeding is done on a request-level rather than choice-level. That is, if `n > 1`, you
1066
+ * would still get different content for each `Choice`. But if two requests with `n = 2` are
1067
+ * processed with the same seed, the two results should be the same (though the two choices within each request still differ).
1068
+ */
1069
+ seed?: number | null;
1070
+ /**
1071
+ * Up to 4 sequences where the API will stop generating further tokens. The
1072
+ * returned text will not contain the stop sequence.
1073
+ */
1074
+ stop?: string | null | Array<string>;
1075
+ /**
1076
+ * If set, partial deltas will be sent. It will be terminated by an empty chunk.
1077
+ */
1078
+ stream?: boolean | null;
1079
+ /**
1080
+ * Options for streaming response. Only set this when you set `stream: true`.
1081
+ */
1082
+ stream_options?: ChatCompletionStreamOptions | null;
1083
+ /**
1084
+ * What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
1085
+ * make the output more random, while lower values like 0.2 will make it more
1086
+ * focused and deterministic.
1087
+ *
1088
+ * We generally recommend altering this or `top_p` but not both.
1089
+ */
1090
+ temperature?: number | null;
1091
+ /**
1092
+ * An alternative to sampling with temperature, called nucleus sampling, where the
1093
+ * model considers the results of the tokens with top_p probability mass. So 0.1
1094
+ * means only the tokens comprising the top 10% probability mass are considered.
1095
+ *
1096
+ * We generally recommend altering this or `temperature` but not both.
1097
+ */
1098
+ top_p?: number | null;
1099
+ /**
1100
+ * If true, will ignore stop string and stop token and generate until max_tokens hit.
1101
+ * If unset, will treat as false.
1102
+ */
1103
+ ignore_eos?: boolean;
1104
+ /**
1105
+ * ID of the model to use. This is equal to `ModelRecord.model_id`, which needs to either be in
1106
+ * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.
1107
+ *
1108
+ * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.
1109
+ * @note If only one model is loaded in the engine, this field is optional. If multiple models
1110
+ * are loaded, this is required.
1111
+ */
1112
+ model?: string | null;
1113
+ /**
1114
+ * The suffix that comes after a completion of inserted text.
1115
+ *
1116
+ * @note This field is not supported.
1117
+ */
1118
+ suffix?: string | null;
1119
+ /**
1120
+ * A unique identifier representing your end-user, which can help OpenAI to monitor
1121
+ * and detect abuse.
1122
+ *
1123
+ * @note This field is not supported.
1124
+ */
1125
+ user?: string;
1126
+ /**
1127
+ * Generates `best_of` completions server-side and returns the "best" (the one with
1128
+ * the highest log probability per token). Results cannot be streamed.
1129
+ *
1130
+ * When used with `n`, `best_of` controls the number of candidate completions and
1131
+ * `n` specifies how many to return – `best_of` must be greater than `n`.
1132
+ *
1133
+ * @note This field is not supported.
1134
+ */
1135
+ best_of?: number | null;
1136
+ }
1137
+ type CompletionCreateParams = CompletionCreateParamsNonStreaming | CompletionCreateParamsStreaming;
1138
+ interface CompletionCreateParamsNonStreaming extends CompletionCreateParamsBase {
1139
+ /**
1140
+ * If set, partial deltas will be sent. It will be terminated by an empty chunk.
1141
+ */
1142
+ stream?: false | null;
1143
+ }
1144
+ interface CompletionCreateParamsStreaming extends CompletionCreateParamsBase {
1145
+ /**
1146
+ * If set, partial deltas will be sent. It will be terminated by an empty chunk.
1147
+ */
1148
+ stream: true;
1149
+ }
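A text-completion sketch against the params above (no chat template applied), again assuming `engine` is an already-loaded `MLCEngineInterface`:

const completionStream = await engine.completions.create({
  prompt: "function fibonacci(n: number): number {",
  max_tokens: 64,
  temperature: 0.2,
  stream: true,
});

let completionText = "";
for await (const part of completionStream) {
  completionText += part.choices[0]?.text ?? "";
}
console.log(completionText);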
1150
+ /**
1151
+ * Represents a completion response returned by model, based on the provided input.
1152
+ */
1153
+ interface Completion {
1154
+ /**
1155
+ * A unique identifier for the completion.
1156
+ */
1157
+ id: string;
1158
+ /**
1159
+ * The list of completion choices the model generated for the input prompt.
1160
+ */
1161
+ choices: Array<CompletionChoice>;
1162
+ /**
1163
+ * The Unix timestamp (in seconds) of when the completion was created.
1164
+ */
1165
+ created: number;
1166
+ /**
1167
+ * The model used for completion.
1168
+ */
1169
+ model: string;
1170
+ /**
1171
+ * The object type, which is always "text_completion"
1172
+ */
1173
+ object: "text_completion";
1174
+ /**
1175
+ * This fingerprint represents the backend configuration that the model runs with.
1176
+ *
1177
+ * Can be used in conjunction with the `seed` request parameter to understand when
1178
+ * backend changes have been made that might impact determinism.
1179
+ *
1180
+ * @note Not supported yet.
1181
+ */
1182
+ system_fingerprint?: string;
1183
+ /**
1184
+ * Usage statistics for the completion request.
1185
+ */
1186
+ usage?: CompletionUsage;
1187
+ }
1188
+ interface CompletionChoice {
1189
+ /**
1190
+ * The reason the model stopped generating tokens. This will be `stop` if the model
1191
+ * hit a natural stop point or a provided stop sequence, or `length` if the maximum
1192
+ * number of tokens specified in the request was reached.
1193
+ */
1194
+ finish_reason: ChatCompletionFinishReason | null;
1195
+ index: number;
1196
+ /**
1197
+ * A list of message content tokens with log probability information.
1198
+ * @note Different from openai-node, we reuse ChatCompletion's Logprobs.
1199
+ */
1200
+ logprobs?: ChatCompletion.Choice.Logprobs | null;
1201
+ text: string;
1202
+ }
1203
+ //#endregion
1204
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/openai_api_protocols/embedding.d.ts
1205
+ declare class Embeddings {
1206
+ private engine;
1207
+ constructor(engine: MLCEngineInterface);
1208
+ /**
1209
+ * Creates an embedding vector representing the input text.
1210
+ */
1211
+ create(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
1212
+ }
1213
+ interface CreateEmbeddingResponse {
1214
+ /**
1215
+ * The list of embeddings generated by the model.
1216
+ */
1217
+ data: Array<Embedding>;
1218
+ /**
1219
+ * The name of the model used to generate the embedding.
1220
+ */
1221
+ model: string;
1222
+ /**
1223
+ * The object type, which is always "list".
1224
+ */
1225
+ object: "list";
1226
+ /**
1227
+ * The usage information for the request.
1228
+ */
1229
+ usage: CreateEmbeddingResponse.Usage;
1230
+ }
1231
+ declare namespace CreateEmbeddingResponse {
1232
+ /**
1233
+ * The usage information for the request.
1234
+ */
1235
+ interface Usage {
1236
+ /**
1237
+ * The number of tokens used by the prompt.
1238
+ */
1239
+ prompt_tokens: number;
1240
+ /**
1241
+ * The total number of tokens used by the request.
1242
+ */
1243
+ total_tokens: number;
1244
+ /**
1245
+ * Fields specific to WebLLM, not present in OpenAI.
1246
+ */
1247
+ extra: {
1248
+ /**
1249
+ * Number of tokens per second for prefilling.
1250
+ */
1251
+ prefill_tokens_per_s: number;
1252
+ };
1253
+ }
1254
+ }
1255
+ /**
1256
+ * Represents an embedding vector returned by embedding endpoint.
1257
+ */
1258
+ interface Embedding {
1259
+ /**
1260
+ * The embedding vector, which is a list of floats. The length of vector depends on
1261
+ * the model.
1262
+ */
1263
+ embedding: Array<number>;
1264
+ /**
1265
+ * The index of the embedding in the list of embeddings.
1266
+ */
1267
+ index: number;
1268
+ /**
1269
+ * The object type, which is always "embedding".
1270
+ */
1271
+ object: "embedding";
1272
+ }
1273
+ interface EmbeddingCreateParams {
1274
+ /**
1275
+ * Input text to embed, encoded as a string or array of tokens. To embed multiple
1276
+ * inputs in a single request, pass an array of strings or array of token arrays.
1277
+ * The input must not exceed the max input tokens for the model, and cannot be an empty string.
1278
+ * If the batch size is too large, multiple forward passes will take place.
1279
+ */
1280
+ input: string | Array<string> | Array<number> | Array<Array<number>>;
1281
+ /**
1282
+ * ID of the model to use. This is equal to `ModelRecord.model_id`, which needs to either be in
1283
+ * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.
1284
+ *
1285
+ * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.
1286
+ * @note If only one model is loaded in the engine, this field is optional. If multiple models
1287
+ * are loaded, this is required.
1288
+ */
1289
+ model?: string | null;
1290
+ /**
1291
+ * The format to return the embeddings in.
1292
+ *
1293
+ * @note Currently, only `float` is supported.
1294
+ */
1295
+ encoding_format?: "float" | "base64";
1296
+ /**
1297
+ * The number of dimensions the resulting output embeddings should have.
1298
+ *
1299
+ * @note Not supported.
1300
+ */
1301
+ dimensions?: number;
1302
+ /**
1303
+ * A unique identifier representing your end-user, which can help OpenAI to monitor
1304
+ * and detect abuse.
1305
+ *
1306
+ * @note Not supported.
1307
+ */
1308
+ user?: string;
1309
+ }
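An embeddings sketch for the params above, assuming an embedding model (ModelType.embedding) has been loaded into `engine`; the input strings are arbitrary:

const embeddingResponse = await engine.embeddings.create({
  input: [
    "WebLLM runs language models in the browser.",
    "WebGPU provides the compute backend.",
  ],
});
for (const item of embeddingResponse.data) {
  // Each entry carries its index and a float vector whose length depends on the model.
  console.log(item.index, item.embedding.length);
}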
1310
+ //#endregion
1311
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/config.d.ts
1312
+ /**
1313
+ * Conversation template config
1314
+ */
1315
+ interface ConvTemplateConfig {
1316
+ system_template: string;
1317
+ system_message: string;
1318
+ roles: Record<Role, string>;
1319
+ role_templates?: Partial<Record<Role, string>>;
1320
+ seps: Array<string>;
1321
+ role_content_sep?: string;
1322
+ role_empty_sep?: string;
1323
+ stop_str: Array<string>;
1324
+ system_prefix_token_ids?: Array<number>;
1325
+ stop_token_ids: Array<number>;
1326
+ add_role_after_system_message?: boolean;
1327
+ }
1328
+ declare enum Role {
1329
+ user = "user",
1330
+ assistant = "assistant",
1331
+ tool = "tool",
1332
+ }
1333
+ /**
1334
+ * Information about the tokenizer. Currently, only `token_postproc_method` is used to
1335
+ * post-process the token table when using grammar.
1336
+ */
1337
+ interface TokenizerInfo {
1338
+ token_postproc_method: string;
1339
+ prepend_space_in_encode: boolean;
1340
+ strip_space_in_decode: boolean;
1341
+ }
1342
+ /**
1343
+ * Config of one chat model, a data structure representing `mlc-chat-config.json`.
1344
+ * This only corresponds to the chat-related fields and `tokenizer_files` of `mlc-chat-config.json`.
1345
+ * Only these fields affect the conversation at runtime,
1346
+ * i.e. the third part in https://llm.mlc.ai/docs/get_started/mlc_chat_config.html.
1347
+ *
1348
+ * This is initialized in `MLCEngine.reload()` with the model's `mlc-chat-config.json`.
1349
+ */
1350
+ interface ChatConfig {
1351
+ tokenizer_files: Array<string>;
1352
+ tokenizer_info?: TokenizerInfo;
1353
+ token_table_postproc_method?: string;
1354
+ vocab_size: number;
1355
+ conv_config?: Partial<ConvTemplateConfig>;
1356
+ conv_template: ConvTemplateConfig;
1357
+ context_window_size: number;
1358
+ sliding_window_size: number;
1359
+ attention_sink_size: number;
1360
+ repetition_penalty: number;
1361
+ frequency_penalty: number;
1362
+ presence_penalty: number;
1363
+ top_p: number;
1364
+ temperature: number;
1365
+ bos_token_id?: number;
1366
+ }
1367
+ /**
1368
+ * Custom options that can be used to override known config values.
1369
+ */
1370
+ interface ChatOptions extends Partial<ChatConfig> {}
1371
+ /**
1372
+ * Optional configurations for `CreateMLCEngine()` and `CreateWebWorkerMLCEngine()`.
1373
+ *
1374
+ * appConfig: Configure the app, including the list of models and whether to use IndexedDB cache.
1375
+ * initProgressCallback: A callback for showing the progress of loading the model.
1376
+ * logitProcessorRegistry: A register for stateful logit processors, see `webllm.LogitProcessor`.
1377
+ *
1378
+ * @note All fields are optional, and `logitProcessorRegistry` is only used for `MLCEngine` and not
1379
+ * other `MLCEngine` variants (such as the worker-based engines).
1380
+ */
1381
+ interface MLCEngineConfig {
1382
+ appConfig?: AppConfig;
1383
+ initProgressCallback?: InitProgressCallback;
1384
+ logitProcessorRegistry?: Map<string, LogitProcessor>;
1385
+ logLevel?: LogLevel;
1386
+ }
1387
+ /**
1388
+ * Config for a single generation.
1389
+ * Essentially `ChatConfig` without `tokenizer_files`, `conv_config`, or `conv_template`.
1390
+ * We also support additional fields not present in `mlc-chat-config.json` due to OpenAI-like APIs.
1391
+ *
1392
+ * Note that all values are optional. If unspecified, we use whatever values in `ChatConfig`
1393
+ * initialized during `MLCEngine.reload()`.
1394
+ */
1395
+ interface GenerationConfig {
1396
+ repetition_penalty?: number;
1397
+ ignore_eos?: boolean;
1398
+ top_p?: number | null;
1399
+ temperature?: number | null;
1400
+ max_tokens?: number | null;
1401
+ frequency_penalty?: number | null;
1402
+ presence_penalty?: number | null;
1403
+ stop?: string | null | Array<string>;
1404
+ n?: number | null;
1405
+ logit_bias?: Record<string, number> | null;
1406
+ logprobs?: boolean | null;
1407
+ top_logprobs?: number | null;
1408
+ response_format?: ResponseFormat | null;
1409
+ enable_thinking?: boolean | null;
1410
+ }
1411
+ declare enum ModelType {
1412
+ "LLM" = 0,
1413
+ "embedding" = 1,
1414
+ "VLM" = 2,
1415
+ }
1416
+ /**
1417
+ * Information for a model.
1418
+ * @param model: the huggingface link to download the model weights, accepting four formats:
1419
+ * - https://huggingface.co/{USERNAME}/{MODEL}, for which we automatically use the main branch
1420
+ * - https://huggingface.co/{USERNAME}/{MODEL}/, for which we automatically use the main branch
1421
+ * - https://huggingface.co/{USERNAME}/{MODEL}/resolve/{BRANCH}
1422
+ * - https://huggingface.co/{USERNAME}/{MODEL}/resolve/{BRANCH}/
1423
+ * @param model_id: what we call the model.
1424
+ * @param model_lib: link to the model library (wasm file) the model uses.
1425
+ * @param overrides: partial ChatConfig to override mlc-chat-config.json; can be used to change KVCache settings.
1426
+ * @param vram_required_MB: amount of vram in MB required to run the model (can use
1427
+ * `utils/vram_requirements` to calculate).
1428
+ * @param low_resource_required: whether the model can run on limited devices (e.g. Android phone).
1429
+ * @param buffer_size_required_bytes: required `maxStorageBufferBindingSize`, different for each device.
1430
+ * @param required_features: features needed to run this model (e.g. shader-f16).
1431
+ * @param model_type: the intended use case for the model; if unspecified, defaults to LLM.
1432
+ */
1433
+ interface ModelRecord {
1434
+ model: string;
1435
+ model_id: string;
1436
+ model_lib: string;
1437
+ overrides?: ChatOptions;
1438
+ vram_required_MB?: number;
1439
+ low_resource_required?: boolean;
1440
+ buffer_size_required_bytes?: number;
1441
+ required_features?: Array<string>;
1442
+ model_type?: ModelType;
1443
+ }
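The `ModelRecord` below is a hypothetical entry sketched from the fields documented above; the URLs, sizes, and ids are placeholders, not published artifacts:

// Hypothetical ModelRecord (placeholder URLs and numbers).
const myModelRecord: ModelRecord = {
  model: "https://huggingface.co/{USERNAME}/{MODEL}", // main branch resolved automatically
  model_id: "MyModel-q4f16_1-MLC",
  model_lib: "https://example.com/libs/my-model-webgpu.wasm",
  vram_required_MB: 4096,
  low_resource_required: false,
  required_features: ["shader-f16"],
  model_type: ModelType.LLM,
  overrides: { context_window_size: 2048 },
};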
1444
+ /**
1445
+ * Extra configuration that can be
1446
+ * passed when loading the engine.
1447
+ *
1448
+ * @param model_list: models to be used.
1449
+ * @param useIndexedDBCache: if true, will use IndexedDBCache to cache models and other artifacts.
1450
+ * If false or unspecified, will use the Cache API. For more information on the two, see:
1451
+ * https://developer.mozilla.org/en-US/docs/Web/API/Storage_API/Storage_quotas_and_eviction_criteria#what_technologies_store_data_in_the_browser
1452
+ *
1453
+ * @note The Cache API is currently the better-tested option in WebLLM.
1454
+ */
1455
+ interface AppConfig {
1456
+ model_list: Array<ModelRecord>;
1457
+ useIndexedDBCache?: boolean;
1458
+ }
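Continuing the hypothetical record above, an `AppConfig` might bundle it as follows; whether to enable the IndexedDB cache is a deployment decision, so this is just one assumed setup:

// Illustrative AppConfig reusing the hypothetical ModelRecord from the previous sketch.
const appConfig: AppConfig = {
  model_list: [myModelRecord],
  useIndexedDBCache: false, // i.e. use the Cache API, noted above as better tested
};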
1459
+ //#endregion
1460
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/conversation.d.ts
1461
+ type ImageURL = ChatCompletionContentPartImage.ImageURL;
1462
+ /**
1463
+ * Helper to keep track of history conversations.
1464
+ */
1465
+ declare class Conversation {
1466
+ /** Each message is a tuple of (Role, role_name_str, message), where message can be either a
1467
+ * string or an array of contentPart for possible image input.
1468
+ */
1469
+ messages: Array<[Role, string, string | Array<ChatCompletionContentPart> | undefined]>;
1470
+ readonly config: ConvTemplateConfig;
1471
+ /** Whether the Conversation object is for text completion with no conversation-style formatting */
1472
+ isTextCompletion: boolean;
1473
+ /** Used when isTextCompletion is true */
1474
+ prompt: string | undefined;
1475
+ function_string: string;
1476
+ use_function_calling: boolean;
1477
+ override_system_message?: string;
1478
+ /**
1479
+ * Tracks whether the last message is an empty thinking block. Should only
1480
+ * be true when we are in the middle of a generation. Will be set to
1481
+ * false when the reply is finished with `finishReply()`.
1482
+ */
1483
+ private isLastMessageEmptyThinkingReplyHeader;
1484
+ constructor(config: ConvTemplateConfig, isTextCompletion?: boolean);
1485
+ private getPromptArrayInternal;
1486
+ /**
1487
+ * Get prompt arrays with the first one as system.
1488
+ *
1489
+ * It is returned as an array of `string | Array<string | ImageURL>`, where each element of
1490
+ * the array represents the formatted message of a role/turn. If the message only contains text,
1491
+ * it will be a string that concatenates the role string, message, and separators. If the
1492
+ * message contains image(s), it will be an array of string and ImageURL in the order of which
1493
+ * they will be prefilled into the model. e.g. it can be something like
1494
+ * [
1495
+ * "<|system|>\nSome system prompt\n",
1496
+ * [
1497
+ * "<|user|>\n",
1498
+ * imageURL1,
1499
+ * "\n",
1500
+ * imageURL2,
1501
+ * "\n",
1502
+ * "Some user input<|end|>\n"
1503
+ * ],
1504
+ * ]
1505
+ *
1506
+ * @returns The prompt array.
1507
+ */
1508
+ getPromptArray(): Array<string | Array<string | ImageURL>>;
1509
+ /**
1510
+ * Get the last round of the prompt that has not been fed as input.
1511
+ *
1512
+ * @note This function needs to be used with the assumption that
1513
+ * the caller calls appendMessage and then appendReplyHeader.
1514
+ *
1515
+ * @returns The prompt array.
1516
+ */
1517
+ getPromptArrayLastRound(): (string | (string | ChatCompletionContentPartImage.ImageURL)[])[];
1518
+ /**
1519
+ * Return prompt in an array for non-conversation text completion.
1520
+ */
1521
+ getPromptArrayTextCompletion(): Array<string>;
1522
+ /**
1523
+ * Resets all states for this.conversation.
1524
+ */
1525
+ reset(): void;
1526
+ getStopStr(): string[];
1527
+ getStopTokens(): number[];
1528
+ appendMessage(role: Role, message: string | Array<ChatCompletionContentPart>, role_name?: string): void;
1529
+ appendReplyHeader(role: Role): void;
1530
+ appendEmptyThinkingReplyHeader(role: Role, emptyThinkingBlockStr: string): void;
1531
+ finishReply(message: string): void;
1532
+ }
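A hedged sketch of the Conversation flow, assuming `convTemplateConfig` is a valid `ConvTemplateConfig` (declared elsewhere in this file) and that the `Role` enum exposes `user` and `assistant` members; Conversation is normally managed internally by the pipeline and engine:

// Sketch only: build a conversation, append a user turn, and read the prompt array.
const conv = new Conversation(convTemplateConfig);
conv.appendMessage(Role.user, "What is WebGPU?");
conv.appendReplyHeader(Role.assistant);
const prompts = conv.getPromptArray(); // formatted turns, ready for prefill
conv.finishReply("WebGPU is a modern GPU API for the web.");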
1533
+ //#endregion
1534
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/llm_chat.d.ts
1535
+ declare class LLMChatPipeline {
1536
+ private config;
1537
+ private tokenizer;
1538
+ private tvm;
1539
+ private device;
1540
+ private vm;
1541
+ private prefill;
1542
+ private decoding;
1543
+ private image_embed;
1544
+ private embed;
1545
+ private fapplyBitmask;
1546
+ private fclearKVCaches;
1547
+ private fKVCacheAddSequence;
1548
+ private fKVCacheRemoveSequence;
1549
+ private fKVCacheBeginForward;
1550
+ private fKVCacheEndForward;
1551
+ private fKVCacheEnableSlidingWindowForSeq;
1552
+ private params;
1553
+ private kvCache;
1554
+ private logitsOnCPU?;
1555
+ private filledKVCacheLength;
1556
+ private bosTokenId;
1557
+ private contextWindowSize;
1558
+ private slidingWindowSize;
1559
+ private attentionSinkSize;
1560
+ private prefillChunkSize;
1561
+ private resetStatsPerPrefill;
1562
+ private stopStr;
1563
+ private stopTokens;
1564
+ private outputMessage;
1565
+ private outputIds;
1566
+ private stopTriggered;
1567
+ private finishReason;
1568
+ private appearedTokensFreq;
1569
+ private conversation;
1570
+ private tokenLogprobArray;
1571
+ private decodingTotalTime;
1572
+ private decodingTotalTokens;
1573
+ private prefillTotalTime;
1574
+ private prefillTotalTokens;
1575
+ private curRoundDecodingTotalTokens;
1576
+ private curRoundPrefillTotalTokens;
1577
+ private curRoundDecodingTotalTime;
1578
+ private curRoundPrefillTotalTime;
1579
+ private logitProcessor?;
1580
+ private grammarMatcher?;
1581
+ private schemaOrGrammarStr?;
1582
+ private xgTokenizerInfo?;
1583
+ private grammarCompiler?;
1584
+ private bitmaskSize;
1585
+ private fullVocabSize;
1586
+ private token_postproc_method;
1587
+ private prepend_space_in_encode;
1588
+ private curRoundGrammarInitTotalTime;
1589
+ private curRoundGrammarPerTokenTotalTime;
1590
+ constructor(tvm: tvmjs.Instance, tokenizer: Tokenizer, config: ChatConfig, logitProcessor?: LogitProcessor);
1591
+ dispose(): void;
1592
+ /**
1593
+ * Get the current message.
1594
+ */
1595
+ getMessage(): string;
1596
+ /**
1597
+ * Reset the runtime statistics
1598
+ */
1599
+ resetRuntimeStats(): void;
1600
+ /**
1601
+ * Reset the chat history
1602
+ */
1603
+ resetChat(keepStats?: boolean): void;
1604
+ /**
1605
+ * Reset KV Cache
1606
+ */
1607
+ resetKVCache(): void;
1608
+ /**
1609
+ * @returns Whether stop is triggered.
1610
+ */
1611
+ stopped(): boolean;
1612
+ /**
1613
+ * @returns Finish reason; undefined if generation not started/stopped yet.
1614
+ */
1615
+ getFinishReason(): ChatCompletionFinishReason | undefined;
1616
+ /**
1617
+ * @returns tokenLogprobArray for this current round of autoregressive generation.
1618
+ * Updated upon each sampled token, cleared upon each prefillStep().
1619
+ */
1620
+ getTokenLogprobArray(): Array<ChatCompletionTokenLogprob>;
1621
+ /**
1622
+ * @returns the number of tokens decoded for a single request or a single choice in the request.
1623
+ */
1624
+ getCurRoundDecodingTotalTokens(): number;
1625
+ /**
1626
+ * @returns the number of tokens prefilled for a single request or a single choice in the request.
1627
+ */
1628
+ getCurRoundPrefillTotalTokens(): number;
1629
+ /**
1630
+ * @returns the time spent on decode for a single request or a single choice in the request.
1631
+ */
1632
+ getCurRoundDecodingTotalTime(): number;
1633
+ /**
1634
+ * @returns the time spent on prefill for a single request or a single choice in the request.
1635
+ */
1636
+ getCurRoundPrefillTotalTime(): number;
1637
+ /**
1638
+ * @returns the time (seconds) spent on initializing the grammar matcher for a single request.
1639
+ */
1640
+ getCurRoundGrammarInitTotalTime(): number;
1641
+ /**
1642
+ * @returns the total time (seconds) spent on creating the bitmask and accepting tokens with the grammar matcher
1643
+ * for all the generated tokens in a single request.
1644
+ */
1645
+ getCurRoundGrammarPerTokenTotalTime(): number;
1646
+ /**
1647
+ * @returns Runtime stats information.
1648
+ */
1649
+ runtimeStatsText(): string;
1650
+ /**
1651
+ * @returns Runtime stats information, starting from the last prefill performed.
1652
+ */
1653
+ curRoundRuntimeStatsText(): string;
1654
+ /**
1655
+ * @returns Prefill tokens per second, starting from the last prefill performed.
1656
+ */
1657
+ getCurRoundPrefillTokensPerSec(): number;
1658
+ /**
1659
+ * @returns Decoding tokens per second, starting from the last prefill performed.
1660
+ */
1661
+ getCurRoundDecodingTokensPerSec(): number;
1662
+ /**
1663
+ * Set the seed for the RNG `this.tvm.rng`.
1664
+ */
1665
+ setSeed(seed: number): void;
1666
+ /**
1667
+ * @returns The conversation object (not a deep copy).
1668
+ */
1669
+ getConversationObject(): Conversation;
1670
+ /**
1671
+ * Set this.conversation to a new conversation object.
1672
+ */
1673
+ setConversation(newConv: Conversation): void;
1674
+ asyncLoadWebGPUPipelines(): Promise<void>;
1675
+ /**
1676
+ * Generate the first token given the input prompt
1677
+ */
1678
+ prefillStep(inp: string, msgRole: Role,
1679
+ // either user or tool
1680
+ inp_role_str?: string, genConfig?: GenerationConfig): Promise<void>;
1681
+ decodeStep(genConfig?: GenerationConfig): Promise<void>;
1682
+ /**
1683
+ * Manually trigger stop if it is not stopped.
1684
+ */
1685
+ triggerStop(): void;
1686
+ /**
1687
+ * Add a generated token and check for stop.
1688
+ *
1689
+ * @param nextToken The next token.
1690
+ * @param genConfig Configs that override `this.config` for this round of generation.
1691
+ */
1692
+ private processNextToken;
1693
+ /**
1694
+ * Given input tokens, return their embeddings by calling the embed kernel.
1695
+ *
1696
+ * @note precondition: inputTokens.length <= prefillChunkSize, since we take care of
1697
+ * chunking in `getChunkedPrefillInputData()`.
1698
+ */
1699
+ private getTokensEmbeddings;
1700
+ /**
1701
+ * Embed an image input.
1702
+ */
1703
+ private getImageEmbeddings;
1704
+ /**
1705
+ * Embed and forward input data, which can be either an array of tokens or an image.
1706
+ * This will increment `this.filledKVCacheLength`.
1707
+ *
1708
+ * @param inputData data to embed and forward
1709
+ * @param inputDataLen length of this inputData; should be smaller than the prefill chunk size.
1710
+ * @returns The logits returned by this forward as tvmjs.NDArray on GPU.
1711
+ *
1712
+ * @note Precondition: inputData's data length is smaller than prefill chunk size
1713
+ */
1714
+ private embedAndForward;
1715
+ private updateLogitsOnCPU;
1716
+ private sampleTokenFromLogits;
1717
+ /**
1718
+ * Return an array that is a mixture of token arrays and imageURLs (which cannot be represented
1719
+ * as tokens). Also return the number of tokens this represents.
1720
+ *
1721
+ * We first convert the Conversation into a prompt array to be prefilled. Then we encode the
1722
+ * text parts, leaving the imageURLs as they are.
1723
+ * Example prompts:
1724
+ * [
1725
+ * "<|system|>\nSome system prompt\n",
1726
+ * [
1727
+ * "<|user|>\n",
1728
+ * imageURL1,
1729
+ * "\n",
1730
+ * imageURL2,
1731
+ * "\n",
1732
+ * "Some user input<|end|>\n"
1733
+ * ],
1734
+ * ]
1735
+ *
1736
+ * Expected output:
1737
+ * [
1738
+ * token array for "<|system|>\nSome system prompt\n<|user|>\n",
1739
+ * imageUrl1,
1740
+ * token array for "\n",
1741
+ * imageUrl2,
1742
+ * token array for "\nSome user input<|end|>\n"
1743
+ */
1744
+ private getInputData;
1745
+ forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean): Promise<number>;
1746
+ /**
1747
+ * Based on `sampledToken` and `this.logitsOnCPU`, which becomes a distribution after
1748
+ * calling `this.tvm.applySoftmaxWithTemperature()`, generate `ChatCompletionTokenLogprob` and
1749
+ * update `this.tokenLogprobArray`.
1750
+ *
1751
+ * @param sampledToken The token ID sampled.
1752
+ * @param top_logprobs Number of top tokens to include; `top_logprobs` in `ChatCompletionRequest`.
1753
+ *
1754
+ * @return The `ChatCompletionTokenLogprob` for this single autoregressive step.
1755
+ */
1756
+ private getTokenLogprob;
1757
+ /**
1758
+ * Synchronize the device.
1759
+ */
1760
+ sync(): Promise<void>;
1761
+ evaluate(): Promise<void>;
1762
+ }
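The prefill/decode loop might look roughly like the sketch below, assuming `pipeline` was already constructed and loaded by `MLCEngine` and that `Role.user` exists; applications are expected to use the engine's chatCompletion API rather than drive the pipeline directly:

// Hedged sketch of one round of autoregressive generation.
async function generateOnce(pipeline: LLMChatPipeline, prompt: string): Promise<string> {
  await pipeline.prefillStep(prompt, Role.user); // produces the first token
  while (!pipeline.stopped()) {
    await pipeline.decodeStep(); // one token per step
  }
  console.log(pipeline.curRoundRuntimeStatsText()); // prefill/decode throughput
  return pipeline.getMessage();
}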
1763
+ //#endregion
1764
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/engine.d.ts
1765
+ /**
1766
+ * The main interface of MLCEngine, which loads a model and performs tasks.
1767
+ *
1768
+ * You can either initialize one with `webllm.CreateMLCEngine(modelId)`, or
1769
+ * `new webllm.MLCEngine()` followed by `engine.reload(modelId)`.
1770
+ */
1771
+ declare class MLCEngine implements MLCEngineInterface {
1772
+ /** For chat.completions.create() */
1773
+ chat: Chat;
1774
+ /** For completions.create() */
1775
+ completions: Completions;
1776
+ /** For embeddings.create() */
1777
+ embeddings: Embeddings;
1778
+ /** Maps each loaded model's modelId to its pipeline */
1779
+ private loadedModelIdToPipeline;
1780
+ /** Maps each loaded model's modelId to its chatConfig */
1781
+ private loadedModelIdToChatConfig;
1782
+ /** Maps each loaded model's modelId to its modelType */
1783
+ private loadedModelIdToModelType;
1784
+ /** Maps each loaded model's modelId to a lock. Ensures
1785
+ * each model only processes one request at a time.
1786
+ */
1787
+ private loadedModelIdToLock;
1788
+ private logger;
1789
+ private logitProcessorRegistry?;
1790
+ private initProgressCallback?;
1791
+ private appConfig;
1792
+ private interruptSignal;
1793
+ private deviceLostIsError;
1794
+ private reloadController;
1795
+ constructor(engineConfig?: MLCEngineConfig);
1796
+ setAppConfig(appConfig: AppConfig): void;
1797
+ setInitProgressCallback(initProgressCallback?: InitProgressCallback): void;
1798
+ getInitProgressCallback(): InitProgressCallback | undefined;
1799
+ setLogitProcessorRegistry(logitProcessorRegistry?: Map<string, LogitProcessor>): void;
1800
+ /**
1801
+ * Set MLCEngine logging output level
1802
+ *
1803
+ * @param logLevel The new log level
1804
+ */
1805
+ setLogLevel(logLevel: LogLevel): void;
1806
+ reload(modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]): Promise<void>;
1807
+ private reloadInternal;
1808
+ unload(): Promise<void>;
1809
+ private _generate;
1810
+ /**
1811
+ * Similar to `_generate()`, but instead of using a callback, we use an async iterable.
1812
+ */
1813
+ asyncGenerate(request: ChatCompletionRequestStreaming, model: string, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig, timeReceived: number): AsyncGenerator<ChatCompletionChunk, void, void>;
1814
+ asyncGenerate(request: CompletionCreateParamsStreaming, model: string, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig, timeReceived: number): AsyncGenerator<Completion, void, void>;
1815
+ interruptGenerate(): Promise<void>;
1816
+ /**
1817
+ * Completes a single ChatCompletionRequest.
1818
+ *
1819
+ * @param request An OpenAI-style ChatCompletion request.
1820
+ *
1821
+ * @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple
1822
+ * `decode()`. This is important as it determines the behavior of various fields including `seed`.
1823
+ */
1824
+ chatCompletion(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
1825
+ chatCompletion(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
1826
+ chatCompletion(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
1827
+ /**
1828
+ * Completes a single CompletionCreateParams, a text completion with no chat template.
1829
+ *
1830
+ * @param request An OpenAI-style Completion request.
1831
+ *
1832
+ * @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple
1833
+ * `decode()`. This is important as it determines the behavior of various fields including `seed`.
1834
+ */
1835
+ completion(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
1836
+ completion(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
1837
+ completion(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
1838
+ embedding(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
1839
+ getMaxStorageBufferBindingSize(): Promise<number>;
1840
+ getGPUVendor(): Promise<string>;
1841
+ private getLLMStates;
1842
+ private getEmbeddingStates;
1843
+ /**
1844
+ * Return the model, its LLMChatPipeline, and ChatConfig to use. Throws an error when it is unclear
1845
+ * which model to load. Ensures all loadedModelIdToXXX maps contain an entry for the selected modelId.
1846
+ * @param requestName The type of request or API to load the model for. Needed for error throwing.
1847
+ * @param modelType The type of model, determining what type of pipeline to expect.
1848
+ * @param modelId Model the user specified to load via the request. Required when multiple
1849
+ * models are loaded.
1850
+ */
1851
+ private getModelStates;
1852
+ forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean, modelId?: string): Promise<number>;
1853
+ /**
1854
+ * Get the current generated response.
1855
+ *
1856
+ * @returns The current output message.
1857
+ */
1858
+ getMessage(modelId?: string): Promise<string>;
1859
+ runtimeStatsText(modelId?: string): Promise<string>;
1860
+ resetChat(keepStats?: boolean, modelId?: string): Promise<void>;
1861
+ /**
1862
+ * Run a prefill step with a given input.
1863
+ *
1864
+ * If `input` is a chatCompletionRequest, we treat `input.messages[-1]` as the usual user input.
1865
+ * We then convert `input.messages[:-1]` to a `Conversation` object, representing a conversation
1866
+ * history.
1867
+ *
1868
+ * If the new `Conversation` object matches the current one loaded, it means we are
1869
+ * performing multi-round chatting, so we do not reset, hence reusing KV cache. Otherwise, we
1870
+ * reset everything, treating the request as something completely new.
1871
+ *
1872
+ * @param input The OpenAI-style prompt to prefill.
1873
+ * @param pipeline The loaded pipeline, hence model, to carry out this prefill.
1874
+ * @param chatConfig The chat config to use for this model.
1875
+ * @param genConfig Generation config.
1876
+ */
1877
+ prefill(input: ChatCompletionRequest | CompletionCreateParams, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig): Promise<void>;
1878
+ /**
1879
+ * Run a decode step to decode the next token.
1880
+ */
1881
+ decode(pipeline: LLMChatPipeline, genConfig?: GenerationConfig): Promise<void>;
1882
+ }
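An illustrative end-to-end use of `MLCEngine`; the import path and model_id are assumptions (the class declared here is re-bundled from @mlc-ai/web-llm), and the chunk shape follows the OpenAI-style types referenced above:

// Hedged sketch: streaming chat completion with a single loaded model.
import { MLCEngine } from "@mlc-ai/web-llm";

async function streamHello(): Promise<string> {
  const engine = new MLCEngine({ initProgressCallback: (r) => console.log(r.text) });
  await engine.reload("Llama-3.1-8B-Instruct-q4f16_1-MLC"); // assumed prebuilt model_id
  const chunks = await engine.chatCompletion({
    messages: [{ role: "user", content: "Say hello in one sentence." }],
    stream: true,
    temperature: 0.7,
  });
  let reply = "";
  for await (const chunk of chunks) {
    reply += chunk.choices[0]?.delta?.content ?? "";
  }
  console.log(await engine.runtimeStatsText());
  return reply;
}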
1883
+ //#endregion
1884
+ //#region src/index.d.ts
1885
+ declare function getAvailableModels(): ModelRecord[];
1886
+ declare function useWebLLM({
1887
+ modelId,
1888
+ engineConfig,
1889
+ chatOptions,
1890
+ debug
1891
+ }: {
1892
+ modelId?: string | string[];
1893
+ engineConfig?: MLCEngineConfig;
1894
+ chatOptions?: ChatOptions;
1895
+ debug?: boolean;
1896
+ }): {
1897
+ engine: MLCEngine;
1898
+ progressReport: InitProgressReport;
1899
+ getAvailableModels: typeof getAvailableModels;
1900
+ loadModel: (modelId: string) => Promise<void>;
1901
+ };
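Finally, a hedged sketch of the hook this package exports; the component structure and model_id are assumptions, and only the declared return shape of `useWebLLM` is relied on:

// Minimal React (TSX) usage sketch of useWebLLM.
import { useWebLLM } from "@web-llm-wrappers/react";

export function ChatDemo() {
  const { engine, progressReport, loadModel } = useWebLLM({
    modelId: "Llama-3.1-8B-Instruct-q4f16_1-MLC", // assumed to be among getAvailableModels()
  });

  async function ask() {
    // Assumes the model has finished loading; error handling omitted.
    const res = await engine.chatCompletion({
      messages: [{ role: "user", content: "Hello!" }],
    });
    console.log(res.choices[0]?.message.content);
  }

  return (
    <div>
      <p>{progressReport.text}</p>
      <button onClick={ask}>Ask</button>
      <button onClick={() => loadModel("Llama-3.1-8B-Instruct-q4f16_1-MLC")}>Load model</button>
    </div>
  );
}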
1902
+ //#endregion
1903
+ export { useWebLLM };
1904
+ //# sourceMappingURL=index.d.mts.map