@web-llm-wrappers/react 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1904 @@
1
+ import * as tvmjs from "@mlc-ai/web-runtime";
2
+ import { Tokenizer } from "@mlc-ai/web-tokenizers";
3
+
4
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/types.d.ts
5
+
6
+ /**
7
+ * Report during initialization.
8
+ */
9
+ interface InitProgressReport {
10
+ progress: number;
11
+ timeElapsed: number;
12
+ text: string;
13
+ }
14
+ /**
15
+ * Callbacks used to report initialization process.
16
+ */
17
+ type InitProgressCallback = (report: InitProgressReport) => void;
18
+ /**
19
+ * A stateful logitProcessor used to post-process logits after forwarding the input and before
20
+ * sampling the next token. If used with `GenerationConfig.logit_bias`, logit_bias is applied after
21
+ * `processLogits()` is called.
22
+ */
23
+ interface LogitProcessor {
24
+ /**
25
+ * Process logits after forward() and before sampling; this happens implicitly on the CPU.
26
+ * @param logits The logits right after forward().
27
+ * @returns The processed logits.
28
+ */
29
+ processLogits: (logits: Float32Array) => Float32Array;
30
+ /**
31
+ * Use the sampled token to update the LogitProcessor's internal state. Called implicitly
32
+ * right after the next token is sampled/committed.
33
+ * @param token Token sampled from the processed logits.
34
+ */
35
+ processSampledToken: (token: number) => void;
36
+ /**
37
+ * Called when in `MLCEngine.resetChat()`. Can clear internal states.
38
+ */
39
+ resetState: () => void;
40
+ }
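A minimal sketch of a custom LogitProcessor implementing the interface above, assuming it would be registered through `MLCEngineConfig.logitProcessorRegistry`; the class name and the banned token id are purely illustrative:

import type { LogitProcessor } from "@mlc-ai/web-llm";

// Illustrative processor that bans one token id by forcing its logit to -Infinity.
// Look up real token ids in the model's tokenizer.json; 16230 is just an example.
class BanTokenLogitProcessor implements LogitProcessor {
  private readonly bannedTokenId = 16230;

  processLogits(logits: Float32Array): Float32Array {
    logits[this.bannedTokenId] = Number.NEGATIVE_INFINITY;
    return logits;
  }

  processSampledToken(_token: number): void {
    // This example keeps no internal state, so there is nothing to update here.
  }

  resetState(): void {
    // Nothing to reset for this stateless example.
  }
}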
41
+ /**
42
+ * Common interface of MLCEngine that the UI can interact with
43
+ */
44
+ interface MLCEngineInterface {
45
+ /**
46
+ * An object that exposes chat-related APIs.
47
+ */
48
+ chat: Chat;
49
+ /**
50
+ * An object that exposes text completion APIs.
51
+ */
52
+ completions: Completions;
53
+ /**
54
+ * An object that exposes embeddings APIs.
55
+ */
56
+ embeddings: Embeddings;
57
+ /**
58
+ * Set an initialization progress callback function
59
+ * which reports the progress of model loading.
60
+ *
61
+ * This function can be useful to implement a UI that
62
+ * updates as we load the model.
63
+ *
64
+ * @param initProgressCallback The callback function
65
+ */
66
+ setInitProgressCallback: (initProgressCallback: InitProgressCallback) => void;
67
+ /**
68
+ * @returns The current initialization progress callback function.
69
+ */
70
+ getInitProgressCallback: () => InitProgressCallback | undefined;
71
+ /**
72
+ * Setter for the engine's appConfig.
73
+ */
74
+ setAppConfig: (appConfig: AppConfig) => void;
75
+ /**
76
+ * Reload the chat with a new model.
77
+ *
78
+ * @param modelId model_id of the model to load, either string or string[]. When multiple models
79
+ * are provided, we load all models sequentially. Each modelId needs to either be in
80
+ * `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.
81
+ * @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of `modelId`.
82
+ * If an array, its length needs to match that of `modelId`; chatOpts[i] will be used for modelId[i].
83
+ * @returns A promise when reload finishes.
84
+ * @throws Throws an error when the device is lost (mostly due to OOM); users should re-call reload(),
85
+ * potentially with a smaller model or smaller context window size.
86
+ * @note This is an async function.
87
+ */
88
+ reload: (modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]) => Promise<void>;
89
+ /**
90
+ * OpenAI-style API. Generate a chat completion response for the given conversation and
91
+ * configuration. Use `engine.chat.completions.create()` to invoke this API.
92
+ *
93
+ * @param request An OpenAI-style ChatCompletion request.
94
+ *
95
+ * @note The API is completely functional in behavior. That is, a previous request would not
96
+ * affect the current request's result. Thus, for multi-round chatting, users are responsible for
97
+ * maintaining the chat history. With that being said, as an implicit internal optimization, if we
98
+ * detect that the user is performing multi-round chatting, we will preserve the KV cache and only
99
+ * prefill the new tokens.
100
+ * @note For requests sent to the same modelId, this call will block until all previous requests finish.
101
+ * @note For more, see https://platform.openai.com/docs/api-reference/chat
102
+ */
103
+ chatCompletion(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
104
+ chatCompletion(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
105
+ chatCompletion(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
106
+ chatCompletion(request: ChatCompletionRequest): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
107
+ /**
108
+ * OpenAI-style API. Completes a CompletionCreateParams, a text completion with no chat template.
109
+ * Use `engine.completions.create()` to invoke this API.
110
+ *
111
+ * @param request An OpenAI-style Completion request.
112
+ *
113
+ * @note For requests sent to the same modelId, this call will block until all previous requests finish.
114
+ * @note For more, see https://platform.openai.com/docs/api-reference/completions
115
+ */
116
+ completion(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
117
+ completion(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
118
+ completion(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
119
+ completion(request: CompletionCreateParams): Promise<AsyncIterable<Completion> | Completion>;
120
+ /**
121
+ * OpenAI-style API. Creates an embedding vector representing the input text.
122
+ * Use `engine.embeddings.create()` to invoke this API.
123
+ *
124
+ * @param request An OpenAI-style Embeddings request.
125
+ *
126
+ * @note For requests sent to the same modelId, this call will block until all previous requests finish.
127
+ * @note For more, see https://platform.openai.com/docs/api-reference/embeddings/create
128
+ */
129
+ embedding(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
130
+ /**
131
+ * @returns A text summarizing the runtime stats.
132
+ * @param modelId Only required when multiple models are loaded.
133
+ * @note This is an async function
134
+ */
135
+ runtimeStatsText: (modelId?: string) => Promise<string>;
136
+ /**
137
+ * Interrupt the generate process if it is already running.
138
+ */
139
+ interruptGenerate: () => void;
140
+ /**
141
+ * Explicitly unload the currently loaded model(s) and release the related resources. Waits until
142
+ * the webgpu device finishes all submitted work and destroys itself.
143
+ * @note This is an asynchronous function.
144
+ */
145
+ unload: () => Promise<void>;
146
+ /**
147
+ * Reset the current chat session by clearing all memories.
148
+ * @param keepStats If true, do not reset the statistics.
149
+ * @param modelId Only required when multiple models are loaded.
150
+ */
151
+ resetChat: (keepStats?: boolean, modelId?: string) => Promise<void>;
152
+ /**
153
+ * Get the current generated response.
154
+ * @param modelId Only required when multiple models are loaded.
155
+ * @returns The current output message.
156
+ */
157
+ getMessage: (modelId?: string) => Promise<string>;
158
+ /**
159
+ * Returns the device's maxStorageBufferBindingSize, which can be used to guess whether the device
160
+ * has limited resources like an Android phone.
161
+ */
162
+ getMaxStorageBufferBindingSize(): Promise<number>;
163
+ /**
164
+ * Returns the device's GPU vendor (e.g. arm, qualcomm, apple) if available. Otherwise returns
165
+ * an empty string.
166
+ */
167
+ getGPUVendor(): Promise<string>;
168
+ /**
169
+ * Forward the given input tokens to the model, then sample the next token.
170
+ *
171
+ * This function has side effects as the model will update its KV cache.
172
+ *
173
+ * @param inputIds The input tokens.
174
+ * @param isPrefill True if prefill, false if decode; only used for statistics.
175
+ * @param modelId Only required when multiple models are loaded.
176
+ * @returns Next token sampled.
177
+ * @note This is an async function.
178
+ */
179
+ forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean, modelId?: string): Promise<number>;
180
+ /**
181
+ * Set MLCEngine logging output level
182
+ *
183
+ * @param logLevel The new log level
184
+ */
185
+ setLogLevel(logLevel: LogLevel): void;
186
+ }
187
+ declare const LOG_LEVELS: {
188
+ TRACE: number;
189
+ DEBUG: number;
190
+ INFO: number;
191
+ WARN: number;
192
+ ERROR: number;
193
+ SILENT: number;
194
+ };
195
+ type LogLevel = keyof typeof LOG_LEVELS;
196
+ //#endregion
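A usage sketch for the engine interface above, assuming the `CreateMLCEngine` factory exported by `@mlc-ai/web-llm`; the model id is illustrative and must exist in `prebuiltAppConfig` or a custom `AppConfig`:

import { CreateMLCEngine } from "@mlc-ai/web-llm";

async function loadEngine() {
  // Reports loading progress through the InitProgressCallback described above.
  const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct-q4f32_1-MLC", {
    initProgressCallback: (report) => {
      console.log(`${Math.round(report.progress * 100)}% - ${report.text}`);
    },
    logLevel: "INFO",
  });
  return engine;
}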
197
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/openai_api_protocols/chat_completion.d.ts
198
+ declare class Chat {
199
+ private engine;
200
+ completions: Completions$1;
201
+ constructor(engine: MLCEngineInterface);
202
+ }
203
+ declare class Completions$1 {
204
+ private engine;
205
+ constructor(engine: MLCEngineInterface);
206
+ create(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
207
+ create(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
208
+ create(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
209
+ }
210
+ /**
211
+ * OpenAI chat completion request protocol.
212
+ *
213
+ * API reference: https://platform.openai.com/docs/api-reference/chat/create
214
+ * Followed: https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts
215
+ *
216
+ * @note `model` is excluded. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` explicitly before calling this API.
217
+ */
218
+ interface ChatCompletionRequestBase {
219
+ /**
220
+ * A list of messages comprising the conversation so far.
221
+ */
222
+ messages: Array<ChatCompletionMessageParam>;
223
+ /**
224
+ * If set, partial message deltas will be sent. It will be terminated by an empty chunk.
225
+ */
226
+ stream?: boolean | null;
227
+ /**
228
+ * Options for streaming response. Only set this when you set `stream: true`.
229
+ */
230
+ stream_options?: ChatCompletionStreamOptions | null;
231
+ /**
232
+ * How many chat completion choices to generate for each input message.
233
+ */
234
+ n?: number | null;
235
+ /**
236
+ * Number between -2.0 and 2.0. Positive values penalize new tokens based on their
237
+ * existing frequency in the text so far, decreasing the model's likelihood to
238
+ * repeat the same line verbatim.
239
+ *
240
+ * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
241
+ */
242
+ frequency_penalty?: number | null;
243
+ /**
244
+ * Number between -2.0 and 2.0. Positive values penalize new tokens based on
245
+ * whether they appear in the text so far, increasing the model's likelihood to
246
+ * talk about new topics.
247
+ *
248
+ * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
249
+ */
250
+ presence_penalty?: number | null;
251
+ /**
252
+ * The maximum number of [tokens](/tokenizer) that can be generated in the chat
253
+ * completion.
254
+ *
255
+ * The total length of input tokens and generated tokens is limited by the model's
256
+ * context length.
257
+ */
258
+ max_tokens?: number | null;
259
+ /**
260
+ * Sequences where the API will stop generating further tokens.
261
+ */
262
+ stop?: string | null | Array<string>;
263
+ /**
264
+ * What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
265
+ * make the output more random, while lower values like 0.2 will make it more
266
+ * focused and deterministic.
267
+ */
268
+ temperature?: number | null;
269
+ /**
270
+ * An alternative to sampling with temperature, called nucleus sampling, where the
271
+ * model considers the results of the tokens with top_p probability mass. So 0.1
272
+ * means only the tokens comprising the top 10% probability mass are considered.
273
+ */
274
+ top_p?: number | null;
275
+ /**
276
+ * Modify the likelihood of specified tokens appearing in the completion.
277
+ *
278
+ * Accepts a JSON object that maps tokens (specified by their token ID, which varies per model)
279
+ * to an associated bias value from -100 to 100. Typically, you can check the model's
280
+ * `tokenizer.json` to see which token ID maps to which string. Mathematically, the bias is added to the
281
+ * logits generated by the model prior to sampling. The exact effect will vary per model, but
282
+ * values between -1 and 1 should decrease or increase likelihood of selection; values like -100
283
+ * or 100 should result in a ban or exclusive selection of the relevant token.
284
+ *
285
+ * As an example, you can pass `{"16230": -100}` to prevent the `Hello` token from being
286
+ * generated in Mistral-7B-Instruct-v0.2, according to the mapping in
287
+ * https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/raw/main/tokenizer.json.
288
+ *
289
+ * @note For stateful and customizable / flexible logit processing, see `webllm.LogitProcessor`.
290
+ * @note If used in combination with `webllm.LogitProcessor`, `logit_bias` is applied after
291
+ * `LogitProcessor.processLogits()` is called.
292
+ */
293
+ logit_bias?: Record<string, number> | null;
294
+ /**
295
+ * Whether to return log probabilities of the output tokens or not.
296
+ *
297
+ * If true, returns the log probabilities of each output token returned in the `content` of
298
+ * `message`.
299
+ */
300
+ logprobs?: boolean | null;
301
+ /**
302
+ * An integer between 0 and 5 specifying the number of most likely tokens to return
303
+ * at each token position, each with an associated log probability. `logprobs` must
304
+ * be set to `true` if this parameter is used.
305
+ */
306
+ top_logprobs?: number | null;
307
+ /**
308
+ * If specified, our system will make a best effort to sample deterministically, such that
309
+ * repeated requests with the same `seed` and parameters should return the same result.
310
+ *
311
+ * @note Seeding is done on a request-level rather than choice-level. That is, if `n > 1`, you
312
+ * would still get different content for each `Choice`. But if two requests with `n = 2` are
313
+ * processed with the same seed, the two results should be the same (though the two choices within each request still differ).
314
+ */
315
+ seed?: number | null;
316
+ /**
317
+ * Controls which (if any) function is called by the model. `none` means the model
318
+ * will not call a function and instead generates a message. `auto` means the model
319
+ * can pick between generating a message or calling a function. Specifying a
320
+ * particular function via
321
+ * `{"type": "function", "function": {"name": "my_function"}}` forces the model to
322
+ * call that function.
323
+ *
324
+ * `none` is the default when no functions are present. `auto` is the default if
325
+ * functions are present.
326
+ */
327
+ tool_choice?: ChatCompletionToolChoiceOption;
328
+ /**
329
+ * A list of tools the model may call. Currently, only functions are supported as a
330
+ * tool. Use this to provide a list of functions the model may generate JSON inputs
331
+ * for.
332
+ *
333
+ * The corresponding reply would populate the `tool_calls` field. If used with streaming,
334
+ * the last chunk would contain the `tool_calls` field, while the intermediate chunks would
335
+ * contain the raw string.
336
+ *
337
+ * If the generation terminates due to a FinishReason other than "stop" (i.e. "length" or "abort"),
338
+ * then no `tool_calls` will be returned. The user can still get the raw string output.
339
+ */
340
+ tools?: Array<ChatCompletionTool>;
341
+ /**
342
+ * An object specifying the format that the model must output.
343
+ *
344
+ * Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
345
+ * message the model generates is valid JSON.
346
+ *
347
+ * **Important:** when using JSON mode, you **must** also instruct the model to
348
+ * produce JSON yourself via a system or user message. Without this, the model may
349
+ * generate an unending stream of whitespace until the generation reaches the token
350
+ * limit, resulting in a long-running and seemingly "stuck" request. Also note that
351
+ * the message content may be partially cut off if `finish_reason="length"`, which
352
+ * indicates the generation exceeded `max_tokens` or the conversation exceeded the
353
+ * max context length.
354
+ */
355
+ response_format?: ResponseFormat;
356
+ /**
357
+ * If true, will ignore stop string and stop token and generate until max_tokens hit.
358
+ * If unset, will treat as false.
359
+ */
360
+ ignore_eos?: boolean;
361
+ /**
362
+ * ID of the model to use. This is equal to `ModelRecord.model_id`, which needs to either be in
363
+ * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.
364
+ *
365
+ * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.
366
+ * @note If only one model is loaded in the engine, this field is optional. If multiple models
367
+ * are loaded, this is required.
368
+ */
369
+ model?: string | null;
370
+ /**
371
+ * Fields specific to WebLLM, not present in OpenAI.
372
+ */
373
+ extra_body?: {
374
+ /**
375
+ * If set to false, prepends a "<think>\n\n</think>\n\n" to the response, preventing the
376
+ * model from generating thinking tokens. If set to true or undefined, does nothing.
377
+ *
378
+ * @note Currently only allowed to be used for Qwen3 models, though not explicitly checked.
379
+ */
380
+ enable_thinking?: boolean | null;
381
+ };
382
+ }
383
+ interface ChatCompletionRequestNonStreaming extends ChatCompletionRequestBase {
384
+ /**
385
+ * If set, partial message deltas will be sent. It will be terminated by an empty chunk.
386
+ */
387
+ stream?: false | null;
388
+ }
389
+ interface ChatCompletionRequestStreaming extends ChatCompletionRequestBase {
390
+ /**
391
+ * If set, partial message deltas will be sent. It will be terminated by an empty chunk.
392
+ */
393
+ stream: true;
394
+ }
395
+ type ChatCompletionRequest = ChatCompletionRequestNonStreaming | ChatCompletionRequestStreaming;
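A streaming sketch against the request types above, assuming `engine` is an already-loaded `MLCEngineInterface`:

// Stream deltas and read token usage from the final chunk.
const chunks = await engine.chat.completions.create({
  messages: [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "Explain WebGPU in one sentence." },
  ],
  stream: true,
  stream_options: { include_usage: true },
  temperature: 0.7,
});

let reply = "";
for await (const chunk of chunks) {
  reply += chunk.choices[0]?.delta.content ?? "";
  if (chunk.usage) {
    console.log("prompt/completion tokens:", chunk.usage.prompt_tokens, chunk.usage.completion_tokens);
  }
}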
396
+ /**
397
+ * Represents a chat completion response returned by model, based on the provided input.
398
+ */
399
+ interface ChatCompletion {
400
+ /**
401
+ * A unique identifier for the chat completion.
402
+ */
403
+ id: string;
404
+ /**
405
+ * A list of chat completion choices. Can be more than one if `n` is greater than 1.
406
+ */
407
+ choices: Array<ChatCompletion.Choice>;
408
+ /**
409
+ * The model used for the chat completion.
410
+ */
411
+ model: string;
412
+ /**
413
+ * The object type, which is always `chat.completion`.
414
+ */
415
+ object: "chat.completion";
416
+ /**
417
+ * The Unix timestamp (in seconds) of when the chat completion was created.
418
+ *
419
+ */
420
+ created: number;
421
+ /**
422
+ * Usage statistics for the completion request.
423
+ *
424
+ * @note If we detect that the user is performing multi-round chatting, only the new portion of the
425
+ * prompt is counted for prompt_tokens. If `n > 1`, all choices' generation usages are combined.
426
+ */
427
+ usage?: CompletionUsage;
428
+ /**
429
+ * This fingerprint represents the backend configuration that the model runs with.
430
+ *
431
+ * Can be used in conjunction with the `seed` request parameter to understand when
432
+ * backend changes have been made that might impact determinism.
433
+ *
434
+ * @note Not supported yet.
435
+ */
436
+ system_fingerprint?: string;
437
+ }
438
+ /**
439
+ * Represents a streamed chunk of a chat completion response returned by model,
440
+ * based on the provided input.
441
+ */
442
+ interface ChatCompletionChunk {
443
+ /**
444
+ * A unique identifier for the chat completion. Each chunk has the same ID.
445
+ */
446
+ id: string;
447
+ /**
448
+ * A list of chat completion choices. Can contain more than one element if `n` is
449
+ * greater than 1. Can also be empty for the last chunk if you set
450
+ * `stream_options: {"include_usage": true}`.
451
+ */
452
+ choices: Array<ChatCompletionChunk.Choice>;
453
+ /**
454
+ * The Unix timestamp (in seconds) of when the chat completion was created. Each
455
+ * chunk has the same timestamp.
456
+ */
457
+ created: number;
458
+ /**
459
+ * The model to generate the completion.
460
+ */
461
+ model: string;
462
+ /**
463
+ * The object type, which is always `chat.completion.chunk`.
464
+ */
465
+ object: "chat.completion.chunk";
466
+ /**
467
+ * This fingerprint represents the backend configuration that the model runs with.
468
+ * Can be used in conjunction with the `seed` request parameter to understand when
469
+ * backend changes have been made that might impact determinism.
470
+ *
471
+ * @note Not supported yet.
472
+ */
473
+ system_fingerprint?: string;
474
+ /**
475
+ * An optional field that will only be present when you set
476
+ * `stream_options: {"include_usage": true}` in your request. When present, it
477
+ * contains a null value except for the last chunk which contains the token usage
478
+ * statistics for the entire request.
479
+ */
480
+ usage?: CompletionUsage;
481
+ }
482
+ type ChatCompletionContentPart = ChatCompletionContentPartText | ChatCompletionContentPartImage;
483
+ interface ChatCompletionContentPartText {
484
+ /**
485
+ * The text content.
486
+ */
487
+ text: string;
488
+ /**
489
+ * The type of the content part.
490
+ */
491
+ type: "text";
492
+ }
493
+ declare namespace ChatCompletionContentPartImage {
494
+ interface ImageURL {
495
+ /**
496
+ * Either a URL of the image or the base64 encoded image data.
497
+ */
498
+ url: string;
499
+ /**
500
+ * Specifies the detail level of the image.
501
+ */
502
+ detail?: "auto" | "low" | "high";
503
+ }
504
+ }
505
+ interface ChatCompletionContentPartImage {
506
+ image_url: ChatCompletionContentPartImage.ImageURL;
507
+ /**
508
+ * The type of the content part.
509
+ */
510
+ type: "image_url";
511
+ }
512
+ interface ChatCompletionMessageToolCall {
513
+ /**
514
+ * The ID of the tool call. In WebLLM, it is used as the index of the tool call among all
515
+ * the tool calls in this request generation.
516
+ */
517
+ id: string;
518
+ /**
519
+ * The function that the model called.
520
+ */
521
+ function: ChatCompletionMessageToolCall.Function;
522
+ /**
523
+ * The type of the tool. Currently, only `function` is supported.
524
+ */
525
+ type: "function";
526
+ }
527
+ declare namespace ChatCompletionMessageToolCall {
528
+ /**
529
+ * The function that the model called.
530
+ */
531
+ interface Function {
532
+ /**
533
+ * The arguments to call the function with, as generated by the model in JSON
534
+ * format.
535
+ */
536
+ arguments: string;
537
+ /**
538
+ * The name of the function to call.
539
+ */
540
+ name: string;
541
+ }
542
+ }
543
+ /**
544
+ * Options for streaming response. Only set this when you set `stream: true`.
545
+ */
546
+ interface ChatCompletionStreamOptions {
547
+ /**
548
+ * If set, an additional chunk will be streamed after the last empty chunk.
549
+ * The `usage` field on this chunk shows the token usage statistics for the entire
550
+ * request, and the `choices` field will always be an empty array. All other chunks
551
+ * will also include a `usage` field, but with a null value.
552
+ */
553
+ include_usage?: boolean;
554
+ }
555
+ interface ChatCompletionSystemMessageParam {
556
+ /**
557
+ * The contents of the system message.
558
+ */
559
+ content: string;
560
+ /**
561
+ * The role of the messages author, in this case `system`.
562
+ */
563
+ role: "system";
564
+ }
565
+ interface ChatCompletionUserMessageParam {
566
+ /**
567
+ * The contents of the user message.
568
+ */
569
+ content: string | Array<ChatCompletionContentPart>;
570
+ /**
571
+ * The role of the messages author, in this case `user`.
572
+ */
573
+ role: "user";
574
+ /**
575
+ * An optional name for the participant. Provides the model information to
576
+ * differentiate between participants of the same role.
577
+ *
578
+ * @note This is experimental, as models typically have predefined names for the user.
579
+ */
580
+ name?: string;
581
+ }
582
+ interface ChatCompletionAssistantMessageParam {
583
+ /**
584
+ * The role of the messages author, in this case `assistant`.
585
+ */
586
+ role: "assistant";
587
+ /**
588
+ * The contents of the assistant message. Required unless `tool_calls` is specified.
589
+ */
590
+ content?: string | null;
591
+ /**
592
+ * An optional name for the participant. Provides the model information to
593
+ * differentiate between participants of the same role.
594
+ *
595
+ * @note This is experimental, as models typically have predefined names for the user.
596
+ */
597
+ name?: string;
598
+ /**
599
+ * The tool calls generated by the model, such as function calls.
600
+ */
601
+ tool_calls?: Array<ChatCompletionMessageToolCall>;
602
+ }
603
+ interface ChatCompletionToolMessageParam {
604
+ /**
605
+ * The contents of the tool message.
606
+ */
607
+ content: string;
608
+ /**
609
+ * The role of the messages author, in this case `tool`.
610
+ */
611
+ role: "tool";
612
+ /**
613
+ * Tool call that this message is responding to.
614
+ */
615
+ tool_call_id: string;
616
+ }
617
+ type ChatCompletionMessageParam = ChatCompletionSystemMessageParam | ChatCompletionUserMessageParam | ChatCompletionAssistantMessageParam | ChatCompletionToolMessageParam;
618
+ /**
619
+ * The parameters the function accepts, described as a JSON Schema object. See the
620
+ * [guide](https://platform.openai.com/docs/guides/text-generation/function-calling)
621
+ * for examples, and the
622
+ * [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for
623
+ * documentation about the format.
624
+ *
625
+ * Omitting `parameters` defines a function with an empty parameter list.
626
+ */
627
+ type FunctionParameters = Record<string, unknown>;
628
+ interface FunctionDefinition {
629
+ /**
630
+ * The name of the function to be called. Must be a-z, A-Z, 0-9, or contain
631
+ * underscores and dashes, with a maximum length of 64.
632
+ */
633
+ name: string;
634
+ /**
635
+ * A description of what the function does, used by the model to choose when and
636
+ * how to call the function.
637
+ */
638
+ description?: string;
639
+ /**
640
+ * The parameters the function accepts, described as a JSON Schema object. See the
641
+ * [guide](https://platform.openai.com/docs/guides/text-generation/function-calling)
642
+ * for examples, and the
643
+ * [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for
644
+ * documentation about the format.
645
+ *
646
+ * Omitting `parameters` defines a function with an empty parameter list.
647
+ */
648
+ parameters?: FunctionParameters;
649
+ }
650
+ interface ChatCompletionTool {
651
+ function: FunctionDefinition;
652
+ /**
653
+ * The type of the tool. Currently, only `function` is supported.
654
+ */
655
+ type: "function";
656
+ }
657
+ /**
658
+ * Specifies a tool the model should use. Use to force the model to call a specific
659
+ * function.
660
+ */
661
+ interface ChatCompletionNamedToolChoice {
662
+ function: ChatCompletionNamedToolChoice.Function;
663
+ /**
664
+ * The type of the tool. Currently, only `function` is supported.
665
+ */
666
+ type: "function";
667
+ }
668
+ declare namespace ChatCompletionNamedToolChoice {
669
+ interface Function {
670
+ /**
671
+ * The name of the function to call.
672
+ */
673
+ name: string;
674
+ }
675
+ }
676
+ /**
677
+ * Controls which (if any) function is called by the model. `none` means the model
678
+ * will not call a function and instead generates a message. `auto` means the model
679
+ * can pick between generating a message or calling a function. Specifying a
680
+ * particular function via
681
+ * `{"type": "function", "function": {"name": "my_function"}}` forces the model to
682
+ * call that function.
683
+ *
684
+ * `none` is the default when no functions are present. `auto` is the default if
685
+ * functions are present.
686
+ */
687
+ type ChatCompletionToolChoiceOption = "none" | "auto" | ChatCompletionNamedToolChoice;
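A tool-calling sketch against the types above; the `get_weather` tool and its schema are hypothetical, and `engine` is assumed to be an already-loaded `MLCEngineInterface`:

const toolResponse = await engine.chat.completions.create({
  messages: [{ role: "user", content: "What is the weather in Paris?" }],
  tools: [
    {
      type: "function",
      function: {
        name: "get_weather",
        description: "Get the current weather for a city.",
        parameters: {
          type: "object",
          properties: { city: { type: "string" } },
          required: ["city"],
        },
      },
    },
  ],
  // Force the model to call get_weather rather than answer directly.
  tool_choice: { type: "function", function: { name: "get_weather" } },
});

for (const call of toolResponse.choices[0].message.tool_calls ?? []) {
  // `arguments` is a model-generated JSON string; validate it before use.
  console.log(call.function.name, JSON.parse(call.function.arguments));
}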
688
+ interface TopLogprob {
689
+ /**
690
+ * The token.
691
+ */
692
+ token: string;
693
+ /**
694
+ * A list of integers representing the UTF-8 bytes representation of the token.
695
+ * Useful in instances where characters are represented by multiple tokens and
696
+ * their byte representations must be combined to generate the correct text
697
+ * representation. Can be `null` if there is no bytes representation for the token.
698
+ *
699
+ * @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`.
700
+ * For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode.
701
+ */
702
+ bytes: Array<number> | null;
703
+ /**
704
+ * The log probability of this token.
705
+ */
706
+ logprob: number;
707
+ }
708
+ interface ChatCompletionTokenLogprob {
709
+ /**
710
+ * The token.
711
+ */
712
+ token: string;
713
+ /**
714
+ * A list of integers representing the UTF-8 bytes representation of the token.
715
+ * Useful in instances where characters are represented by multiple tokens and
716
+ * their byte representations must be combined to generate the correct text
717
+ * representation. Can be `null` if there is no bytes representation for the token.
718
+ *
719
+ * @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`.
720
+ * For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode.
721
+ */
722
+ bytes: Array<number> | null;
723
+ /**
724
+ * The log probability of this token.
725
+ */
726
+ logprob: number;
727
+ /**
728
+ * List of the most likely tokens and their log probability, at this token
729
+ * position. In rare cases, there may be fewer than the number of requested
730
+ * `top_logprobs` returned.
731
+ */
732
+ top_logprobs: Array<TopLogprob>;
733
+ }
734
+ /**
735
+ * A chat completion message generated by the model.
736
+ */
737
+ interface ChatCompletionMessage {
738
+ /**
739
+ * The contents of the message.
740
+ */
741
+ content: string | null;
742
+ /**
743
+ * The role of the author of this message.
744
+ */
745
+ role: "assistant";
746
+ /**
747
+ * The tool calls generated by the model, such as function calls.
748
+ */
749
+ tool_calls?: Array<ChatCompletionMessageToolCall>;
750
+ }
751
+ /**
752
+ * Usage statistics for the completion request.
753
+ */
754
+ interface CompletionUsage {
755
+ /**
756
+ * Number of tokens in the generated completion.
757
+ */
758
+ completion_tokens: number;
759
+ /**
760
+ * Number of tokens in the prompt.
761
+ *
762
+ * @note If we detect that the user is performing multi-round chatting, only the new portion of the
763
+ * prompt is counted for prompt_tokens.
764
+ */
765
+ prompt_tokens: number;
766
+ /**
767
+ * Total number of tokens used in the request (prompt + completion).
768
+ */
769
+ total_tokens: number;
770
+ /**
771
+ * Fields specific to WebLLM, not present in OpenAI.
772
+ */
773
+ extra: {
774
+ /**
775
+ * Total seconds spent on this request, from receiving the request, to generating the response.
776
+ */
777
+ e2e_latency_s: number;
778
+ /**
779
+ * Number of tokens per second for prefilling.
780
+ */
781
+ prefill_tokens_per_s: number;
782
+ /**
783
+ * Number of tokens per second for autoregressive decoding.
784
+ */
785
+ decode_tokens_per_s: number;
786
+ /**
787
+ * Seconds spent to generate the first token since receiving the request. Mainly contains
788
+ * prefilling overhead. If n > 1, it is the sum over all choices.
789
+ */
790
+ time_to_first_token_s: number;
791
+ /**
792
+ * Seconds in between generated tokens. Mainly contains decoding overhead. If n > 1, it
793
+ * is the average over all choices.
794
+ */
795
+ time_per_output_token_s: number;
796
+ /**
797
+ * Seconds spent on initializing the grammar matcher for structured output. If n > 1, it
798
+ * is the sum over all choices.
799
+ */
800
+ grammar_init_s?: number;
801
+ /**
802
+ * Seconds per token that the grammar matcher spent on creating the bitmask and accepting tokens for
803
+ * structured output. If n > 1, it is the average over all choices.
804
+ */
805
+ grammar_per_token_s?: number;
806
+ };
807
+ }
808
+ /**
809
+ * The reason the model stopped generating tokens. This will be `stop` if the model
810
+ * hit a natural stop point or a provided stop sequence, `length` if the maximum
811
+ * number of tokens specified in the request was reached or the context_window_size would
812
+ * be exceeded, `tool_calls` if the model called a tool, or `abort` if the user manually stops the
813
+ * generation.
814
+ */
815
+ type ChatCompletionFinishReason = "stop" | "length" | "tool_calls" | "abort";
816
+ declare namespace ChatCompletion {
817
+ interface Choice {
818
+ /**
819
+ * The reason the model stopped generating tokens. This will be `stop` if the model
820
+ * hit a natural stop point or a provided stop sequence, `length` if the maximum
821
+ * number of tokens specified in the request was reached, `tool_calls` if the
822
+ * model called a tool, or `abort` if the user manually stops the generation.
823
+ */
824
+ finish_reason: ChatCompletionFinishReason;
825
+ /**
826
+ * The index of the choice in the list of choices.
827
+ */
828
+ index: number;
829
+ /**
830
+ * Log probability information for the choice.
831
+ */
832
+ logprobs: Choice.Logprobs | null;
833
+ /**
834
+ * A chat completion message generated by the model.
835
+ */
836
+ message: ChatCompletionMessage;
837
+ }
838
+ namespace Choice {
839
+ /**
840
+ * Log probability information for the choice.
841
+ */
842
+ interface Logprobs {
843
+ /**
844
+ * A list of message content tokens with log probability information.
845
+ */
846
+ content: Array<ChatCompletionTokenLogprob> | null;
847
+ }
848
+ }
849
+ }
850
+ declare namespace ChatCompletionChunk {
851
+ interface Choice {
852
+ /**
853
+ * A chat completion delta generated by streamed model responses.
854
+ */
855
+ delta: Choice.Delta;
856
+ /**
857
+ * The reason the model stopped generating tokens. This will be `stop` if the model
858
+ * hit a natural stop point or a provided stop sequence, `length` if the maximum
859
+ * number of tokens specified in the request was reached, `tool_calls` if the
860
+ * model called a tool, or `abort` if the user manually stops the generation.
861
+ */
862
+ finish_reason: ChatCompletionFinishReason | null;
863
+ /**
864
+ * The index of the choice in the list of choices.
865
+ */
866
+ index: number;
867
+ /**
868
+ * Log probability information for the choice.
869
+ */
870
+ logprobs?: Choice.Logprobs | null;
871
+ }
872
+ namespace Choice {
873
+ /**
874
+ * A chat completion delta generated by streamed model responses.
875
+ */
876
+ interface Delta {
877
+ /**
878
+ * The contents of the chunk message.
879
+ */
880
+ content?: string | null;
881
+ /**
882
+ * The role of the author of this message.
883
+ */
884
+ role?: "system" | "user" | "assistant" | "tool";
885
+ tool_calls?: Array<Delta.ToolCall>;
886
+ }
887
+ namespace Delta {
888
+ interface ToolCall {
889
+ /**
890
+ * The index of the tool call among all the tool calls in this request generation.
891
+ */
892
+ index: number;
893
+ /**
894
+ * The ID of the tool call. Not used in WebLLM.
895
+ */
896
+ id?: string;
897
+ function?: ToolCall.Function;
898
+ /**
899
+ * The type of the tool. Currently, only `function` is supported.
900
+ */
901
+ type?: "function";
902
+ }
903
+ namespace ToolCall {
904
+ interface Function {
905
+ /**
906
+ * The arguments to call the function with, as generated by the model in JSON
907
+ * format. Note that the model does not always generate valid JSON, and may
908
+ * hallucinate parameters not defined by your function schema. Validate the
909
+ * arguments in your code before calling your function.
910
+ */
911
+ arguments?: string;
912
+ /**
913
+ * The name of the function to call.
914
+ */
915
+ name?: string;
916
+ }
917
+ }
918
+ }
919
+ /**
920
+ * Log probability information for the choice.
921
+ */
922
+ interface Logprobs {
923
+ /**
924
+ * A list of message content tokens with log probability information.
925
+ */
926
+ content: Array<ChatCompletionTokenLogprob> | null;
927
+ }
928
+ }
929
+ }
930
+ /**
931
+ * An object specifying the format that the model must output.
932
+ *
933
+ * Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
934
+ * message the model generates is valid JSON.
935
+ *
936
+ * Setting to `{ "type": "grammar" }` requires you to also specify the `grammar` field, which
937
+ * is a BNFGrammar string.
938
+ *
939
+ * Setting `schema` specifies the output format of the JSON object, such as which properties to include.
940
+ *
941
+ * **Important:** when using JSON mode, you **must** also instruct the model to produce JSON
942
+ * following the schema (if specified) yourself via a system or user message. Without this,
943
+ * the model may generate an unending stream of whitespace until the generation reaches the token
944
+ * limit, resulting in a long-running and seemingly "stuck" request. Also note that
945
+ * the message content may be partially cut off if `finish_reason="length"`, which
946
+ * indicates the generation exceeded `max_tokens` or the conversation exceeded the
947
+ * max context length.
948
+ */
949
+ interface ResponseFormat {
950
+ /**
951
+ * Must be one of `text`, `json_object`, or `grammar`.
952
+ */
953
+ type?: "text" | "json_object" | "grammar";
954
+ /**
955
+ * A schema string in the format of the schema of a JSON file. `type` needs to be `json_object`.
956
+ */
957
+ schema?: string;
958
+ /**
959
+ * An EBNF-formatted string. Needs to be specified when, and only specified when,
960
+ * `type` is `grammar`. The grammar will be normalized (simplified) by default.
961
+ * EBNF grammar: see https://www.w3.org/TR/xml/#sec-notation. Note:
962
+ *   1. Use # as the comment mark
963
+ *   2. Use C-style unicode escape sequences \u01AB, \U000001AB, \xAB
964
+ *   3. A-B (match A and not match B) is not supported yet
965
+ *   4. Lookahead assertions can be added at the end of a rule to speed up matching. E.g.
966
+ *   ```
967
+ *   main ::= "ab" a [a-z]
968
+ *   a ::= "cd" (=[a-z])
969
+ *   ```
970
+ *   The assertion (=[a-z]) means a must be followed by [a-z].
971
+ */
972
+ grammar?: string;
973
+ }
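A JSON-mode sketch for the `ResponseFormat` above; note that the prompt itself also instructs the model to answer in JSON, as the documentation requires, and `engine` is assumed to be loaded:

const jsonResponse = await engine.chat.completions.create({
  messages: [
    {
      role: "system",
      content: 'Answer with a JSON object of the form {"city": string, "country": string}.',
    },
    { role: "user", content: "Where is the Eiffel Tower located?" },
  ],
  response_format: {
    type: "json_object",
    // The schema is passed as a string, per the field documentation above.
    schema: JSON.stringify({
      type: "object",
      properties: { city: { type: "string" }, country: { type: "string" } },
      required: ["city", "country"],
    }),
  },
});
console.log(JSON.parse(jsonResponse.choices[0].message.content ?? "{}"));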
974
+ //#endregion
975
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/openai_api_protocols/completion.d.ts
976
+ declare class Completions {
977
+ private engine;
978
+ constructor(engine: MLCEngineInterface);
979
+ create(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
980
+ create(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
981
+ create(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
982
+ }
983
+ /**
984
+ * OpenAI completion request protocol.
985
+ *
986
+ * API reference: https://platform.openai.com/docs/api-reference/completions/create
987
+ * Followed: https://github.com/openai/openai-node/blob/master/src/resources/completions.ts
988
+ *
989
+ * @note `model` is excluded. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` explicitly before calling this API.
990
+ */
991
+ interface CompletionCreateParamsBase {
992
+ /**
993
+ * The prompt to generate a completion for, encoded as a string.
994
+ */
995
+ prompt: string;
996
+ /**
997
+ * Echo back the prompt in addition to the completion
998
+ */
999
+ echo?: boolean | null;
1000
+ /**
1001
+ * Number between -2.0 and 2.0. Positive values penalize new tokens based on their
1002
+ * existing frequency in the text so far, decreasing the model's likelihood to
1003
+ * repeat the same line verbatim.
1004
+ *
1005
+ * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
1006
+ */
1007
+ frequency_penalty?: number | null;
1008
+ /**
1009
+ * Modify the likelihood of specified tokens appearing in the completion.
1010
+ *
1011
+ * Accepts a JSON object that maps tokens (specified by their token ID, which varies per model)
1012
+ * to an associated bias value from -100 to 100. Typically, you can check the model's
1013
+ * `tokenizer.json` to see which token ID maps to which string. Mathematically, the bias is added to the
1014
+ * logits generated by the model prior to sampling. The exact effect will vary per model, but
1015
+ * values between -1 and 1 should decrease or increase likelihood of selection; values like -100
1016
+ * or 100 should result in a ban or exclusive selection of the relevant token.
1017
+ *
1018
+ * As an example, you can pass `{"16230": -100}` to prevent the `Hello` token from being
1019
+ * generated in Mistral-7B-Instruct-v0.2, according to the mapping in
1020
+ * https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/raw/main/tokenizer.json.
1021
+ *
1022
+ * @note For stateful and customizable / flexible logit processing, see `webllm.LogitProcessor`.
1023
+ * @note If used in combination with `webllm.LogitProcessor`, `logit_bias` is applied after
1024
+ * `LogitProcessor.processLogits()` is called.
1025
+ */
1026
+ logit_bias?: Record<string, number> | null;
1027
+ /**
1028
+ * Whether to return log probabilities of the output tokens or not.
1029
+ *
1030
+ * If true, returns the log probabilities of each output token returned in the `content` of
1031
+ * `message`.
1032
+ */
1033
+ logprobs?: boolean | null;
1034
+ /**
1035
+ * An integer between 0 and 5 specifying the number of most likely tokens to return
1036
+ * at each token position, each with an associated log probability. `logprobs` must
1037
+ * be set to `true` if this parameter is used.
1038
+ */
1039
+ top_logprobs?: number | null;
1040
+ /**
1041
+ * The maximum number of [tokens](/tokenizer) that can be generated in the
1042
+ * completion.
1043
+ *
1044
+ * The total length of input tokens and generated tokens is limited by the model's
1045
+ * context length.
1046
+ */
1047
+ max_tokens?: number | null;
1048
+ /**
1049
+ * How many completions to generate for each prompt.
1050
+ */
1051
+ n?: number | null;
1052
+ /**
1053
+ * Number between -2.0 and 2.0. Positive values penalize new tokens based on
1054
+ * whether they appear in the text so far, increasing the model's likelihood to
1055
+ * talk about new topics.
1056
+ *
1057
+ * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
1058
+ */
1059
+ presence_penalty?: number | null;
1060
+ /**
1061
+ * If specified, our system will make a best effort to sample deterministically,
1062
+ * such that repeated requests with the same `seed` and parameters should return
1063
+ * the same result.
1064
+ *
1065
+ * @note Seeding is done on a request-level rather than choice-level. That is, if `n > 1`, you
1066
+ * would still get different content for each `Choice`. But if two requests with `n = 2` are
1067
+ * processed with the same seed, the two results should be the same (though the two choices within each request still differ).
1068
+ */
1069
+ seed?: number | null;
1070
+ /**
1071
+ * Up to 4 sequences where the API will stop generating further tokens. The
1072
+ * returned text will not contain the stop sequence.
1073
+ */
1074
+ stop?: string | null | Array<string>;
1075
+ /**
1076
+ * If set, partial deltas will be sent. It will be terminated by an empty chunk.
1077
+ */
1078
+ stream?: boolean | null;
1079
+ /**
1080
+ * Options for streaming response. Only set this when you set `stream: true`.
1081
+ */
1082
+ stream_options?: ChatCompletionStreamOptions | null;
1083
+ /**
1084
+ * What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
1085
+ * make the output more random, while lower values like 0.2 will make it more
1086
+ * focused and deterministic.
1087
+ *
1088
+ * We generally recommend altering this or `top_p` but not both.
1089
+ */
1090
+ temperature?: number | null;
1091
+ /**
1092
+ * An alternative to sampling with temperature, called nucleus sampling, where the
1093
+ * model considers the results of the tokens with top_p probability mass. So 0.1
1094
+ * means only the tokens comprising the top 10% probability mass are considered.
1095
+ *
1096
+ * We generally recommend altering this or `temperature` but not both.
1097
+ */
1098
+ top_p?: number | null;
1099
+ /**
1100
+ * If true, will ignore stop string and stop token and generate until max_tokens hit.
1101
+ * If unset, will treat as false.
1102
+ */
1103
+ ignore_eos?: boolean;
1104
+ /**
1105
+ * ID of the model to use. This is equal to `ModelRecord.model_id`, which needs to either be in
1106
+ * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.
1107
+ *
1108
+ * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.
1109
+ * @note If only one model is loaded in the engine, this field is optional. If multiple models
1110
+ * are loaded, this is required.
1111
+ */
1112
+ model?: string | null;
1113
+ /**
1114
+ * The suffix that comes after a completion of inserted text.
1115
+ *
1116
+ * @note This field is not supported.
1117
+ */
1118
+ suffix?: string | null;
1119
+ /**
1120
+ * A unique identifier representing your end-user, which can help OpenAI to monitor
1121
+ * and detect abuse.
1122
+ *
1123
+ * @note This field is not supported.
1124
+ */
1125
+ user?: string;
1126
+ /**
1127
+ * Generates `best_of` completions server-side and returns the "best" (the one with
1128
+ * the highest log probability per token). Results cannot be streamed.
1129
+ *
1130
+ * When used with `n`, `best_of` controls the number of candidate completions and
1131
+ * `n` specifies how many to return – `best_of` must be greater than `n`.
1132
+ *
1133
+ * @note This field is not supported.
1134
+ */
1135
+ best_of?: number | null;
1136
+ }
1137
+ type CompletionCreateParams = CompletionCreateParamsNonStreaming | CompletionCreateParamsStreaming;
1138
+ interface CompletionCreateParamsNonStreaming extends CompletionCreateParamsBase {
1139
+ /**
1140
+ * If set, partial deltas will be sent. It will be terminated by an empty chunk.
1141
+ */
1142
+ stream?: false | null;
1143
+ }
1144
+ interface CompletionCreateParamsStreaming extends CompletionCreateParamsBase {
1145
+ /**
1146
+ * If set, partial deltas will be sent. It will be terminated by an empty chunk.
1147
+ */
1148
+ stream: true;
1149
+ }
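A text-completion sketch against the params above (no chat template applied), again assuming `engine` is an already-loaded `MLCEngineInterface`:

const completionStream = await engine.completions.create({
  prompt: "function fibonacci(n: number): number {",
  max_tokens: 64,
  temperature: 0.2,
  stream: true,
});

let completionText = "";
for await (const part of completionStream) {
  completionText += part.choices[0]?.text ?? "";
}
console.log(completionText);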
1150
+ /**
1151
+ * Represents a completion response returned by model, based on the provided input.
1152
+ */
1153
+ interface Completion {
1154
+ /**
1155
+ * A unique identifier for the completion.
1156
+ */
1157
+ id: string;
1158
+ /**
1159
+ * The list of completion choices the model generated for the input prompt.
1160
+ */
1161
+ choices: Array<CompletionChoice>;
1162
+ /**
1163
+ * The Unix timestamp (in seconds) of when the completion was created.
1164
+ */
1165
+ created: number;
1166
+ /**
1167
+ * The model used for completion.
1168
+ */
1169
+ model: string;
1170
+ /**
1171
+ * The object type, which is always "text_completion"
1172
+ */
1173
+ object: "text_completion";
1174
+ /**
1175
+ * This fingerprint represents the backend configuration that the model runs with.
1176
+ *
1177
+ * Can be used in conjunction with the `seed` request parameter to understand when
1178
+ * backend changes have been made that might impact determinism.
1179
+ *
1180
+ * @note Not supported yet.
1181
+ */
1182
+ system_fingerprint?: string;
1183
+ /**
1184
+ * Usage statistics for the completion request.
1185
+ */
1186
+ usage?: CompletionUsage;
1187
+ }
1188
+ interface CompletionChoice {
1189
+ /**
1190
+ * The reason the model stopped generating tokens. This will be `stop` if the model
1191
+ * hit a natural stop point or a provided stop sequence, or `length` if the maximum
1192
+ * number of tokens specified in the request was reached.
1193
+ */
1194
+ finish_reason: ChatCompletionFinishReason | null;
1195
+ index: number;
1196
+ /**
1197
+ * A list of message content tokens with log probability information.
1198
+ * @note Different from openai-node, we reuse ChatCompletion's Logprobs.
1199
+ */
1200
+ logprobs?: ChatCompletion.Choice.Logprobs | null;
1201
+ text: string;
1202
+ }
1203
+ //#endregion
1204
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/openai_api_protocols/embedding.d.ts
1205
+ declare class Embeddings {
1206
+ private engine;
1207
+ constructor(engine: MLCEngineInterface);
1208
+ /**
1209
+ * Creates an embedding vector representing the input text.
1210
+ */
1211
+ create(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
1212
+ }
1213
+ interface CreateEmbeddingResponse {
1214
+ /**
1215
+ * The list of embeddings generated by the model.
1216
+ */
1217
+ data: Array<Embedding>;
1218
+ /**
1219
+ * The name of the model used to generate the embedding.
1220
+ */
1221
+ model: string;
1222
+ /**
1223
+ * The object type, which is always "list".
1224
+ */
1225
+ object: "list";
1226
+ /**
1227
+ * The usage information for the request.
1228
+ */
1229
+ usage: CreateEmbeddingResponse.Usage;
1230
+ }
1231
+ declare namespace CreateEmbeddingResponse {
1232
+ /**
1233
+ * The usage information for the request.
1234
+ */
1235
+ interface Usage {
1236
+ /**
1237
+ * The number of tokens used by the prompt.
1238
+ */
1239
+ prompt_tokens: number;
1240
+ /**
1241
+ * The total number of tokens used by the request.
1242
+ */
1243
+ total_tokens: number;
1244
+ /**
1245
+ * Fields specific to WebLLM, not present in OpenAI.
1246
+ */
1247
+ extra: {
1248
+ /**
1249
+ * Number of tokens per second for prefilling.
1250
+ */
1251
+ prefill_tokens_per_s: number;
1252
+ };
1253
+ }
1254
+ }
1255
+ /**
1256
+ * Represents an embedding vector returned by embedding endpoint.
1257
+ */
1258
+ interface Embedding {
1259
+ /**
1260
+ * The embedding vector, which is a list of floats. The length of vector depends on
1261
+ * the model.
1262
+ */
1263
+ embedding: Array<number>;
1264
+ /**
1265
+ * The index of the embedding in the list of embeddings.
1266
+ */
1267
+ index: number;
1268
+ /**
1269
+ * The object type, which is always "embedding".
1270
+ */
1271
+ object: "embedding";
1272
+ }
1273
+ interface EmbeddingCreateParams {
1274
+ /**
1275
+ * Input text to embed, encoded as a string or array of tokens. To embed multiple
1276
+ * inputs in a single request, pass an array of strings or array of token arrays.
1277
+ * The input must not exceed the max input tokens for the model, and cannot be an empty string.
1278
+ * If the batch size is too large, multiple forward passes will take place.
1279
+ */
1280
+ input: string | Array<string> | Array<number> | Array<Array<number>>;
1281
+ /**
1282
+ * ID of the model to use. This is equal to `ModelRecord.model_id`, which needs to either be in
1283
+ * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.
1284
+ *
1285
+ * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.
1286
+ * @note If only one model is loaded in the engine, this field is optional. If multiple models
1287
+ * are loaded, this is required.
1288
+ */
1289
+ model?: string | null;
1290
+ /**
1291
+ * The format to return the embeddings in.
1292
+ *
1293
+ * @note Currently, only `float` is supported.
1294
+ */
1295
+ encoding_format?: "float" | "base64";
1296
+ /**
1297
+ * The number of dimensions the resulting output embeddings should have.
1298
+ *
1299
+ * @note Not supported.
1300
+ */
1301
+ dimensions?: number;
1302
+ /**
1303
+ * A unique identifier representing your end-user, which can help OpenAI to monitor
1304
+ * and detect abuse.
1305
+ *
1306
+ * @note Not supported.
1307
+ */
1308
+ user?: string;
1309
+ }
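An embeddings sketch for the params above, assuming an embedding model (ModelType.embedding) has been loaded into `engine`; the input strings are arbitrary:

const embeddingResponse = await engine.embeddings.create({
  input: [
    "WebLLM runs language models in the browser.",
    "WebGPU provides the compute backend.",
  ],
});
for (const item of embeddingResponse.data) {
  // Each entry carries its index and a float vector whose length depends on the model.
  console.log(item.index, item.embedding.length);
}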
1310
+ //#endregion
1311
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/config.d.ts
1312
+ /**
1313
+ * Conversation template config
1314
+ */
1315
+ interface ConvTemplateConfig {
1316
+ system_template: string;
1317
+ system_message: string;
1318
+ roles: Record<Role, string>;
1319
+ role_templates?: Partial<Record<Role, string>>;
1320
+ seps: Array<string>;
1321
+ role_content_sep?: string;
1322
+ role_empty_sep?: string;
1323
+ stop_str: Array<string>;
1324
+ system_prefix_token_ids?: Array<number>;
1325
+ stop_token_ids: Array<number>;
1326
+ add_role_after_system_message?: boolean;
1327
+ }
1328
+ declare enum Role {
1329
+ user = "user",
1330
+ assistant = "assistant",
1331
+ tool = "tool",
1332
+ }
1333
+ /**
1334
+ * Information about the tokenizer. Currently, only `token_postproc_method` is used to
1335
+ * post-process the token table when using grammar.
1336
+ */
1337
+ interface TokenizerInfo {
1338
+ token_postproc_method: string;
1339
+ prepend_space_in_encode: boolean;
1340
+ strip_space_in_decode: boolean;
1341
+ }
1342
+ /**
1343
+ * Config of one chat model, a data structure representing `mlc-chat-config.json`.
1344
+ * This only corresponds to the chat-related fields and `tokenizer_files` of `mlc-chat-config.json`.
1345
+ * Only these fields affect the conversation at runtime,
1346
+ * i.e. the third part in https://llm.mlc.ai/docs/get_started/mlc_chat_config.html.
1347
+ *
1348
+ * This is initialized in `MLCEngine.reload()` with the model's `mlc-chat-config.json`.
1349
+ */
1350
+ interface ChatConfig {
1351
+ tokenizer_files: Array<string>;
1352
+ tokenizer_info?: TokenizerInfo;
1353
+ token_table_postproc_method?: string;
1354
+ vocab_size: number;
1355
+ conv_config?: Partial<ConvTemplateConfig>;
1356
+ conv_template: ConvTemplateConfig;
1357
+ context_window_size: number;
1358
+ sliding_window_size: number;
1359
+ attention_sink_size: number;
1360
+ repetition_penalty: number;
1361
+ frequency_penalty: number;
1362
+ presence_penalty: number;
1363
+ top_p: number;
1364
+ temperature: number;
1365
+ bos_token_id?: number;
1366
+ }
1367
+ /**
1368
+ * Custom options that can be used to override known config values.
1369
+ */
1370
+ interface ChatOptions extends Partial<ChatConfig> {}
1371
+ /**
1372
+ * Optional configurations for `CreateMLCEngine()` and `CreateWebWorkerMLCEngine()`.
1373
+ *
1374
+ * appConfig: Configure the app, including the list of models and whether to use IndexedDB cache.
1375
+ * initProgressCallback: A callback for showing the progress of loading the model.
1376
+ * logitProcessorRegistry: A register for stateful logit processors, see `webllm.LogitProcessor`.
1377
+ *
1378
+ * @note All fields are optional, and `logitProcessorRegistry` is only used for `MLCEngine` and not
1379
+ * other `MLCEngine` variants (such as the worker-based engines).
1380
+ */
1381
+ interface MLCEngineConfig {
1382
+ appConfig?: AppConfig;
1383
+ initProgressCallback?: InitProgressCallback;
1384
+ logitProcessorRegistry?: Map<string, LogitProcessor>;
1385
+ logLevel?: LogLevel;
1386
+ }
1387
+ /**
1388
+ * Config for a single generation.
1389
+ * Essentially `ChatConfig` without `tokenizer_files`, `conv_config`, or `conv_template`.
1390
+ * We also support additional fields not present in `mlc-chat-config.json` due to OpenAI-like APIs.
1391
+ *
1392
+ * Note that all values are optional. If unspecified, we use whatever values in `ChatConfig`
1393
+ * initialized during `MLCEngine.reload()`.
1394
+ */
1395
+ interface GenerationConfig {
1396
+ repetition_penalty?: number;
1397
+ ignore_eos?: boolean;
1398
+ top_p?: number | null;
1399
+ temperature?: number | null;
1400
+ max_tokens?: number | null;
1401
+ frequency_penalty?: number | null;
1402
+ presence_penalty?: number | null;
1403
+ stop?: string | null | Array<string>;
1404
+ n?: number | null;
1405
+ logit_bias?: Record<string, number> | null;
1406
+ logprobs?: boolean | null;
1407
+ top_logprobs?: number | null;
1408
+ response_format?: ResponseFormat | null;
1409
+ enable_thinking?: boolean | null;
1410
+ }
1411
+ declare enum ModelType {
1412
+ "LLM" = 0,
1413
+ "embedding" = 1,
1414
+ "VLM" = 2,
1415
+ }
1416
+ /**
1417
+ * Information for a model.
1418
+ * @param model: the huggingface link to download the model weights, accepting four formats:
1419
+ * - https://huggingface.co/{USERNAME}/{MODEL}, for which we automatically use the main branch
1420
+ * - https://huggingface.co/{USERNAME}/{MODEL}/, for which we automatically use the main branch
1421
+ * - https://huggingface.co/{USERNAME}/{MODEL}/resolve/{BRANCH}
1422
+ * - https://huggingface.co/{USERNAME}/{MODEL}/resolve/{BRANCH}/
1423
+ * @param model_id: what we call the model.
1424
+ * @param model_lib: link to the model library (wasm file) the model uses.
1425
+ * @param overrides: partial ChatConfig to override mlc-chat-config.json; can be used to change KVCache settings.
1426
+ * @param vram_required_MB: amount of vram in MB required to run the model (can use
1427
+ * `utils/vram_requirements` to calculate).
1428
+ * @param low_resource_required: whether the model can run on limited devices (e.g. Android phone).
1429
+ * @param buffer_size_required_bytes: required `maxStorageBufferBindingSize`, different for each device.
1430
+ * @param required_features: features needed to run this model (e.g. shader-f16).
1431
+ * @param model_type: the intended use case for the model; if unspecified, defaults to LLM.
1432
+ */
1433
+ interface ModelRecord {
1434
+ model: string;
1435
+ model_id: string;
1436
+ model_lib: string;
1437
+ overrides?: ChatOptions;
1438
+ vram_required_MB?: number;
1439
+ low_resource_required?: boolean;
1440
+ buffer_size_required_bytes?: number;
1441
+ required_features?: Array<string>;
1442
+ model_type?: ModelType;
1443
+ }
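The `ModelRecord` below is a hypothetical entry sketched from the fields documented above; the URLs, sizes, and ids are placeholders, not published artifacts:

// Hypothetical ModelRecord (placeholder URLs and numbers).
const myModelRecord: ModelRecord = {
  model: "https://huggingface.co/{USERNAME}/{MODEL}", // main branch resolved automatically
  model_id: "MyModel-q4f16_1-MLC",
  model_lib: "https://example.com/libs/my-model-webgpu.wasm",
  vram_required_MB: 4096,
  low_resource_required: false,
  required_features: ["shader-f16"],
  model_type: ModelType.LLM,
  overrides: { context_window_size: 2048 },
};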
1444
+ /**
1445
+ * Extra configuration that can be
1446
+ * passed when loading the engine.
1447
+ *
1448
+ * @param model_list: models to be used.
1449
+ * @param useIndexedDBCache: if true, will use IndexedDBCache to cache models and other artifacts.
1450
+ * If false or unspecified, will use the Cache API. For more information on the two, see:
1451
+ * https://developer.mozilla.org/en-US/docs/Web/API/Storage_API/Storage_quotas_and_eviction_criteria#what_technologies_store_data_in_the_browser
1452
+ *
1453
+ * @note The Cache API is currently the better-tested option in WebLLM.
1454
+ */
1455
+ interface AppConfig {
1456
+ model_list: Array<ModelRecord>;
1457
+ useIndexedDBCache?: boolean;
1458
+ }
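Continuing the hypothetical record above, an `AppConfig` might bundle it as follows; whether to enable the IndexedDB cache is a deployment decision, so this is just one assumed setup:

// Illustrative AppConfig reusing the hypothetical ModelRecord from the previous sketch.
const appConfig: AppConfig = {
  model_list: [myModelRecord],
  useIndexedDBCache: false, // i.e. use the Cache API, noted above as better tested
};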
1459
+ //#endregion
1460
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/conversation.d.ts
1461
+ type ImageURL = ChatCompletionContentPartImage.ImageURL;
1462
+ /**
1463
+ * Helper to keep track of history conversations.
1464
+ */
1465
+ declare class Conversation {
1466
+ /** Each message is a tuple of (Role, role_name_str, message), where message can be either a
1467
+ * string or an array of contentPart for possible image input.
1468
+ */
1469
+ messages: Array<[Role, string, string | Array<ChatCompletionContentPart> | undefined]>;
1470
+ readonly config: ConvTemplateConfig;
1471
+ /** Whether the Conversation object is for text completion with no conversation-style formatting */
1472
+ isTextCompletion: boolean;
1473
+ /** Used when isTextCompletion is true */
1474
+ prompt: string | undefined;
1475
+ function_string: string;
1476
+ use_function_calling: boolean;
1477
+ override_system_message?: string;
1478
+ /**
1479
+ * Tracks whether the last message is an empty thinking block. Should only
1480
+ * be true when we are in the middle of a generation. Will be set to
1481
+ * false when the reply is finished with `finishReply()`.
1482
+ */
1483
+ private isLastMessageEmptyThinkingReplyHeader;
1484
+ constructor(config: ConvTemplateConfig, isTextCompletion?: boolean);
1485
+ private getPromptArrayInternal;
1486
+ /**
1487
+ * Get prompt arrays with the first one as system.
1488
+ *
1489
+ * It is returned as an array of `string | Array<string | ImageURL>`, where each element of
1490
+ * the array represents the formatted message of a role/turn. If the message only contains text,
1491
+ * it will be a string that concatenates the role string, message, and separators. If the
1492
+ * message contains image(s), it will be an array of string and ImageURL in the order of which
1493
+ * they will be prefilled into the model. e.g. it can be something like
1494
+ * [
1495
+ * "<|system|>\nSome system prompt\n",
1496
+ * [
1497
+ * "<|user|>\n",
1498
+ * imageURL1,
1499
+ * "\n",
1500
+ * imageURL2,
1501
+ * "\n",
1502
+ * "Some user input<|end|>\n"
1503
+ * ],
1504
+ * ]
1505
+ *
1506
+ * @returns The prompt array.
1507
+ */
1508
+ getPromptArray(): Array<string | Array<string | ImageURL>>;
1509
+ /**
1510
+ * Get the last round of the prompt that has not been fed as input.
1511
+ *
1512
+ * @note This function needs to be used with the assumption that
1513
+ * the caller calls appendMessage and then appendReplyHeader.
1514
+ *
1515
+ * @returns The prompt array.
1516
+ */
1517
+ getPromptArrayLastRound(): (string | (string | ChatCompletionContentPartImage.ImageURL)[])[];
1518
+ /**
1519
+ * Return prompt in an array for non-conversation text completion.
1520
+ */
1521
+ getPromptArrayTextCompletion(): Array<string>;
1522
+ /**
1523
+ * Resets all states for this.conversation.
1524
+ */
1525
+ reset(): void;
1526
+ getStopStr(): string[];
1527
+ getStopTokens(): number[];
1528
+ appendMessage(role: Role, message: string | Array<ChatCompletionContentPart>, role_name?: string): void;
1529
+ appendReplyHeader(role: Role): void;
1530
+ appendEmptyThinkingReplyHeader(role: Role, emptyThinkingBlockStr: string): void;
1531
+ finishReply(message: string): void;
1532
+ }
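A hedged sketch of the Conversation flow, assuming `convTemplateConfig` is a valid `ConvTemplateConfig` (declared elsewhere in this file) and that the `Role` enum exposes `user` and `assistant` members; Conversation is normally managed internally by the pipeline and engine:

// Sketch only: build a conversation, append a user turn, and read the prompt array.
const conv = new Conversation(convTemplateConfig);
conv.appendMessage(Role.user, "What is WebGPU?");
conv.appendReplyHeader(Role.assistant);
const prompts = conv.getPromptArray(); // formatted turns, ready for prefill
conv.finishReply("WebGPU is a modern GPU API for the web.");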
1533
+ //#endregion
1534
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/llm_chat.d.ts
1535
+ declare class LLMChatPipeline {
1536
+ private config;
1537
+ private tokenizer;
1538
+ private tvm;
1539
+ private device;
1540
+ private vm;
1541
+ private prefill;
1542
+ private decoding;
1543
+ private image_embed;
1544
+ private embed;
1545
+ private fapplyBitmask;
1546
+ private fclearKVCaches;
1547
+ private fKVCacheAddSequence;
1548
+ private fKVCacheRemoveSequence;
1549
+ private fKVCacheBeginForward;
1550
+ private fKVCacheEndForward;
1551
+ private fKVCacheEnableSlidingWindowForSeq;
1552
+ private params;
1553
+ private kvCache;
1554
+ private logitsOnCPU?;
1555
+ private filledKVCacheLength;
1556
+ private bosTokenId;
1557
+ private contextWindowSize;
1558
+ private slidingWindowSize;
1559
+ private attentionSinkSize;
1560
+ private prefillChunkSize;
1561
+ private resetStatsPerPrefill;
1562
+ private stopStr;
1563
+ private stopTokens;
1564
+ private outputMessage;
1565
+ private outputIds;
1566
+ private stopTriggered;
1567
+ private finishReason;
1568
+ private appearedTokensFreq;
1569
+ private conversation;
1570
+ private tokenLogprobArray;
1571
+ private decodingTotalTime;
1572
+ private decodingTotalTokens;
1573
+ private prefillTotalTime;
1574
+ private prefillTotalTokens;
1575
+ private curRoundDecodingTotalTokens;
1576
+ private curRoundPrefillTotalTokens;
1577
+ private curRoundDecodingTotalTime;
1578
+ private curRoundPrefillTotalTime;
1579
+ private logitProcessor?;
1580
+ private grammarMatcher?;
1581
+ private schemaOrGrammarStr?;
1582
+ private xgTokenizerInfo?;
1583
+ private grammarCompiler?;
1584
+ private bitmaskSize;
1585
+ private fullVocabSize;
1586
+ private token_postproc_method;
1587
+ private prepend_space_in_encode;
1588
+ private curRoundGrammarInitTotalTime;
1589
+ private curRoundGrammarPerTokenTotalTime;
1590
+ constructor(tvm: tvmjs.Instance, tokenizer: Tokenizer, config: ChatConfig, logitProcessor?: LogitProcessor);
1591
+ dispose(): void;
1592
+ /**
1593
+ * Get the current message.
1594
+ */
1595
+ getMessage(): string;
1596
+ /**
1597
+ * Reset the runtime statistics
1598
+ */
1599
+ resetRuntimeStats(): void;
1600
+ /**
1601
+ * Reset the chat history
1602
+ */
1603
+ resetChat(keepStats?: boolean): void;
1604
+ /**
1605
+ * Reset KV Cache
1606
+ */
1607
+ resetKVCache(): void;
1608
+ /**
1609
+ * @returns Whether stop is triggered.
1610
+ */
1611
+ stopped(): boolean;
1612
+ /**
1613
+ * @returns Finish reason; undefined if generation not started/stopped yet.
1614
+ */
1615
+ getFinishReason(): ChatCompletionFinishReason | undefined;
1616
+ /**
1617
+ * @returns tokenLogprobArray for this current round of autoregressive generation.
1618
+ * Updated upon each sampled token, cleared upon each prefillStep().
1619
+ */
1620
+ getTokenLogprobArray(): Array<ChatCompletionTokenLogprob>;
1621
+ /**
1622
+ * @returns the number of tokens decoded for a single request or a single choice in the request.
1623
+ */
1624
+ getCurRoundDecodingTotalTokens(): number;
1625
+ /**
1626
+ * @returns the number of tokens prefilled for a single request or a single choice in the request.
1627
+ */
1628
+ getCurRoundPrefillTotalTokens(): number;
1629
+ /**
1630
+ * @returns the time spent on decode for a single request or a single choice in the request.
1631
+ */
1632
+ getCurRoundDecodingTotalTime(): number;
1633
+ /**
1634
+ * @returns the time spent on prefill for a single request or a single choice in the request.
1635
+ */
1636
+ getCurRoundPrefillTotalTime(): number;
1637
+ /**
1638
+ * @returns the time (seconds) spent on initializing the grammar matcher for a single request.
1639
+ */
1640
+ getCurRoundGrammarInitTotalTime(): number;
1641
+ /**
1642
+ * @returns the total time (seconds) spent on creating the bitmask and accepting tokens with the grammar matcher
1643
+ * for all the generated tokens in a single request.
1644
+ */
1645
+ getCurRoundGrammarPerTokenTotalTime(): number;
1646
+ /**
1647
+ * @returns Runtime stats information.
1648
+ */
1649
+ runtimeStatsText(): string;
1650
+ /**
1651
+ * @returns Runtime stats information, starting from the last prefill performed.
1652
+ */
1653
+ curRoundRuntimeStatsText(): string;
1654
+ /**
1655
+ * @returns Prefill tokens per second, starting from the last prefill performed.
1656
+ */
1657
+ getCurRoundPrefillTokensPerSec(): number;
1658
+ /**
1659
+ * @returns Decoding tokens per second, starting from the last prefill performed.
1660
+ */
1661
+ getCurRoundDecodingTokensPerSec(): number;
1662
+ /**
1663
+ * Set the seed for the RNG `this.tvm.rng`.
1664
+ */
1665
+ setSeed(seed: number): void;
1666
+ /**
1667
+ * @returns The conversation object (not a deep copy).
1668
+ */
1669
+ getConversationObject(): Conversation;
1670
+ /**
1671
+ * Set this.conversation to a new conversation object.
1672
+ */
1673
+ setConversation(newConv: Conversation): void;
1674
+ asyncLoadWebGPUPipelines(): Promise<void>;
1675
+ /**
1676
+ * Generate the first token given the input prompt
1677
+ */
1678
+ prefillStep(inp: string, msgRole: Role,
1679
+ // either user or tool
1680
+ inp_role_str?: string, genConfig?: GenerationConfig): Promise<void>;
1681
+ decodeStep(genConfig?: GenerationConfig): Promise<void>;
1682
+ /**
1683
+ * Manually trigger stop if it is not stopped.
1684
+ */
1685
+ triggerStop(): void;
1686
+ /**
1687
+ * Add a generated token and check for stop.
1688
+ *
1689
+ * @param nextToken The next token.
1690
+ * @param genConfig Configs that override `this.config` for this round of generation.
1691
+ */
1692
+ private processNextToken;
1693
+ /**
1694
+ * Given input tokens, return their embeddings by calling the embed kernel.
1695
+ *
1696
+ * @note precondition: inputTokens.length <= prefillChunkSize, since we take care of
1697
+ * chunking in `getChunkedPrefillInputData()`.
1698
+ */
1699
+ private getTokensEmbeddings;
1700
+ /**
1701
+ * Embed an image input.
1702
+ */
1703
+ private getImageEmbeddings;
1704
+ /**
1705
+ * Embed and forward input data, which can be either an array of tokens or an image.
1706
+ * This will increment `this.filledKVCacheLength`.
1707
+ *
1708
+ * @param inputData data to embed and forward
1709
+ * @param inputDataLen length of this inputData; should be smaller than the prefill chunk size.
1710
+ * @returns The logits returned by this forward as tvmjs.NDArray on GPU.
1711
+ *
1712
+ * @note Precondition: inputData's data length is smaller than prefill chunk size
1713
+ */
1714
+ private embedAndForward;
1715
+ private updateLogitsOnCPU;
1716
+ private sampleTokenFromLogits;
1717
+ /**
1718
+ * Return an array that is a mixture of token arrays and imageURLs (which cannot be represented
1719
+ * as tokens). Also return the number of tokens this represents.
1720
+ *
1721
+ * We first convert the Conversation into a prompt array to be prefilled. Then we encode the
1722
+ * text parts, leaving the imageURLs as they are.
1723
+ * Example prompts:
1724
+ * [
1725
+ * "<|system|>\nSome system prompt\n",
1726
+ * [
1727
+ * "<|user|>\n",
1728
+ * imageURL1,
1729
+ * "\n",
1730
+ * imageURL2,
1731
+ * "\n",
1732
+ * "Some user input<|end|>\n"
1733
+ * ],
1734
+ * ]
1735
+ *
1736
+ * Expected output:
1737
+ * [
1738
+ * token array for "<|system|>\nSome system prompt\n<|user|>\n",
1739
+ * imageUrl1,
1740
+ * token array for "\n",
1741
+ * imageUrl2,
1742
+ * token array for "\nSome user input<|end|>\n"
1743
+ */
1744
+ private getInputData;
1745
+ forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean): Promise<number>;
1746
+ /**
1747
+ * Based on `sampledToken` and `this.logitsOnCPU`, which becomes a distribution after
1748
+ * calling `this.tvm.applySoftmaxWithTemperature()`, generate `ChatCompletionTokenLogprob` and
1749
+ * update `this.tokenLogprobArray`.
1750
+ *
1751
+ * @param sampledToken The token ID sampled.
1752
+ * @param top_logprobs Number of top tokens to include; `top_logprobs` in `ChatCompletionRequest`.
1753
+ *
1754
+ * @return The `ChatCompletionTokenLogprob` for this single autoregressive step.
1755
+ */
1756
+ private getTokenLogprob;
1757
+ /**
1758
+ * Synchronize the device.
1759
+ */
1760
+ sync(): Promise<void>;
1761
+ evaluate(): Promise<void>;
1762
+ }
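The prefill/decode loop might look roughly like the sketch below, assuming `pipeline` was already constructed and loaded by `MLCEngine` and that `Role.user` exists; applications are expected to use the engine's chatCompletion API rather than drive the pipeline directly:

// Hedged sketch of one round of autoregressive generation.
async function generateOnce(pipeline: LLMChatPipeline, prompt: string): Promise<string> {
  await pipeline.prefillStep(prompt, Role.user); // produces the first token
  while (!pipeline.stopped()) {
    await pipeline.decodeStep(); // one token per step
  }
  console.log(pipeline.curRoundRuntimeStatsText()); // prefill/decode throughput
  return pipeline.getMessage();
}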
1763
+ //#endregion
1764
+ //#region ../../node_modules/.pnpm/@mlc-ai+web-llm@0.2.79/node_modules/@mlc-ai/web-llm/lib/engine.d.ts
1765
+ /**
1766
+ * The main interface of MLCEngine, which loads a model and performs tasks.
1767
+ *
1768
+ * You can either initialize one with `webllm.CreateMLCEngine(modelId)`, or
1769
+ * `new webllm.MLCEngine()` followed by `engine.reload(modelId)`.
1770
+ */
1771
+ declare class MLCEngine implements MLCEngineInterface {
1772
+ /** For chat.completions.create() */
1773
+ chat: Chat;
1774
+ /** For completions.create() */
1775
+ completions: Completions;
1776
+ /** For embeddings.create() */
1777
+ embeddings: Embeddings;
1778
+ /** Maps each loaded model's modelId to its pipeline */
1779
+ private loadedModelIdToPipeline;
1780
+ /** Maps each loaded model's modelId to its chatConfig */
1781
+ private loadedModelIdToChatConfig;
1782
+ /** Maps each loaded model's modelId to its modelType */
1783
+ private loadedModelIdToModelType;
1784
+ /** Maps each loaded model's modelId to a lock. Ensures
1785
+ * each model only processes one request at a time.
1786
+ */
1787
+ private loadedModelIdToLock;
1788
+ private logger;
1789
+ private logitProcessorRegistry?;
1790
+ private initProgressCallback?;
1791
+ private appConfig;
1792
+ private interruptSignal;
1793
+ private deviceLostIsError;
1794
+ private reloadController;
1795
+ constructor(engineConfig?: MLCEngineConfig);
1796
+ setAppConfig(appConfig: AppConfig): void;
1797
+ setInitProgressCallback(initProgressCallback?: InitProgressCallback): void;
1798
+ getInitProgressCallback(): InitProgressCallback | undefined;
1799
+ setLogitProcessorRegistry(logitProcessorRegistry?: Map<string, LogitProcessor>): void;
1800
+ /**
1801
+ * Set MLCEngine logging output level
1802
+ *
1803
+ * @param logLevel The new log level
1804
+ */
1805
+ setLogLevel(logLevel: LogLevel): void;
1806
+ reload(modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]): Promise<void>;
1807
+ private reloadInternal;
1808
+ unload(): Promise<void>;
1809
+ private _generate;
1810
+ /**
1811
+ * Similar to `_generate()`, but instead of using a callback, we use an async iterable.
1812
+ */
1813
+ asyncGenerate(request: ChatCompletionRequestStreaming, model: string, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig, timeReceived: number): AsyncGenerator<ChatCompletionChunk, void, void>;
1814
+ asyncGenerate(request: CompletionCreateParamsStreaming, model: string, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig, timeReceived: number): AsyncGenerator<Completion, void, void>;
1815
+ interruptGenerate(): Promise<void>;
1816
+ /**
1817
+ * Completes a single ChatCompletionRequest.
1818
+ *
1819
+ * @param request An OpenAI-style ChatCompletion request.
1820
+ *
1821
+ * @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple
1822
+ * `decode()`. This is important as it determines the behavior of various fields including `seed`.
1823
+ */
1824
+ chatCompletion(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
1825
+ chatCompletion(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
1826
+ chatCompletion(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
1827
+ /**
1828
+ * Completes a single CompletionCreateParams, a text completion with no chat template.
1829
+ *
1830
+ * @param request An OpenAI-style Completion request.
1831
+ *
1832
+ * @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple
1833
+ * `decode()`. This is important as it determines the behavior of various fields including `seed`.
1834
+ */
1835
+ completion(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
1836
+ completion(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
1837
+ completion(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
1838
+ embedding(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
1839
+ getMaxStorageBufferBindingSize(): Promise<number>;
1840
+ getGPUVendor(): Promise<string>;
1841
+ private getLLMStates;
1842
+ private getEmbeddingStates;
1843
+ /**
1844
+ * Return the model, its LLMChatPipeline, and ChatConfig to use. Throws an error when it is unclear
1845
+ * which model to load. Ensures all loadedModelIdToXXX maps contain an entry for the selected modelId.
1846
+ * @param requestName The type of request or API to load the model for. Needed for error throwing.
1847
+ * @param modelType The type of model, determining what type of pipeline to expect.
1848
+ * @param modelId Model the user specified to load via the request. Required when multiple
1849
+ * models are loaded.
1850
+ */
1851
+ private getModelStates;
1852
+ forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean, modelId?: string): Promise<number>;
1853
+ /**
1854
+ * Get the current generated response.
1855
+ *
1856
+ * @returns The current output message.
1857
+ */
1858
+ getMessage(modelId?: string): Promise<string>;
1859
+ runtimeStatsText(modelId?: string): Promise<string>;
1860
+ resetChat(keepStats?: boolean, modelId?: string): Promise<void>;
1861
+ /**
1862
+ * Run a prefill step with a given input.
1863
+ *
1864
+ * If `input` is a chatCompletionRequest, we treat `input.messages[-1]` as the usual user input.
1865
+ * We then convert `input.messages[:-1]` to a `Conversation` object, representing a conversation
1866
+ * history.
1867
+ *
1868
+ * If the new `Conversation` object matches the current one loaded, it means we are
1869
+ * performing multi-round chatting, so we do not reset, hence reusing KV cache. Otherwise, we
1870
+ * reset everything, treating the request as something completely new.
1871
+ *
1872
+ * @param input The OpenAI-style prompt to prefill.
1873
+ * @param pipeline The loaded pipeline, hence model, to carry out this prefill.
1874
+ * @param chatConfig The chat config to use for this model.
1875
+ * @param genConfig Generation config.
1876
+ */
1877
+ prefill(input: ChatCompletionRequest | CompletionCreateParams, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig): Promise<void>;
1878
+ /**
1879
+ * Run a decode step to decode the next token.
1880
+ */
1881
+ decode(pipeline: LLMChatPipeline, genConfig?: GenerationConfig): Promise<void>;
1882
+ }
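An illustrative end-to-end use of `MLCEngine`; the import path and model_id are assumptions (the class declared here is re-bundled from @mlc-ai/web-llm), and the chunk shape follows the OpenAI-style types referenced above:

// Hedged sketch: streaming chat completion with a single loaded model.
import { MLCEngine } from "@mlc-ai/web-llm";

async function streamHello(): Promise<string> {
  const engine = new MLCEngine({ initProgressCallback: (r) => console.log(r.text) });
  await engine.reload("Llama-3.1-8B-Instruct-q4f16_1-MLC"); // assumed prebuilt model_id
  const chunks = await engine.chatCompletion({
    messages: [{ role: "user", content: "Say hello in one sentence." }],
    stream: true,
    temperature: 0.7,
  });
  let reply = "";
  for await (const chunk of chunks) {
    reply += chunk.choices[0]?.delta?.content ?? "";
  }
  console.log(await engine.runtimeStatsText());
  return reply;
}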
1883
+ //#endregion
1884
+ //#region src/index.d.ts
1885
+ declare function getAvailableModels(): ModelRecord[];
1886
+ declare function useWebLLM({
1887
+ modelId,
1888
+ engineConfig,
1889
+ chatOptions,
1890
+ debug
1891
+ }: {
1892
+ modelId?: string | string[];
1893
+ engineConfig?: MLCEngineConfig;
1894
+ chatOptions?: ChatOptions;
1895
+ debug?: boolean;
1896
+ }): {
1897
+ engine: MLCEngine;
1898
+ progressReport: InitProgressReport;
1899
+ getAvailableModels: typeof getAvailableModels;
1900
+ loadModel: (modelId: string) => Promise<void>;
1901
+ };
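Finally, a hedged sketch of the hook this package exports; the component structure and model_id are assumptions, and only the declared return shape of `useWebLLM` is relied on:

// Minimal React (TSX) usage sketch of useWebLLM.
import { useWebLLM } from "@web-llm-wrappers/react";

export function ChatDemo() {
  const { engine, progressReport, loadModel } = useWebLLM({
    modelId: "Llama-3.1-8B-Instruct-q4f16_1-MLC", // assumed to be among getAvailableModels()
  });

  async function ask() {
    // Assumes the model has finished loading; error handling omitted.
    const res = await engine.chatCompletion({
      messages: [{ role: "user", content: "Hello!" }],
    });
    console.log(res.choices[0]?.message.content);
  }

  return (
    <div>
      <p>{progressReport.text}</p>
      <button onClick={ask}>Ask</button>
      <button onClick={() => loadModel("Llama-3.1-8B-Instruct-q4f16_1-MLC")}>Load model</button>
    </div>
  );
}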
1902
+ //#endregion
1903
+ export { useWebLLM };
1904
+ //# sourceMappingURL=index.d.mts.map