npm - @agorapete/wllama - Versions diffs - 3.5.1-q2.0 - Mend

@agorapete/wllama 3.5.1-q2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

package/.gitmodules +3 -0
package/.prettierignore +38 -0
package/AGENTS.md +1 -0
package/CMakeLists.txt +131 -0
package/LICENCE +21 -0
package/README-dev.md +178 -0
package/README.md +225 -0
package/README_banner.png +0 -0
package/assets/screenshot_0.png +0 -0
package/cpp/generate_glue_prototype.js +115 -0
package/cpp/glue.hpp +664 -0
package/cpp/test_glue.cpp +80 -0
package/cpp/wllama-context.h +1172 -0
package/cpp/wllama-fs.h +148 -0
package/cpp/wllama.cpp +187 -0
package/cpp/wllama.h +6 -0
package/esm/cache-manager.d.ts +130 -0
package/esm/debug.d.ts +28 -0
package/esm/glue/glue.d.ts +22 -0
package/esm/glue/messages.d.ts +146 -0
package/esm/huggingface.d.ts +31 -0
package/esm/index.cjs +3406 -0
package/esm/index.d.ts +8 -0
package/esm/index.js +3387 -0
package/esm/index.min.js +1 -0
package/esm/index.min.js.map +1 -0
package/esm/model-manager.d.ts +136 -0
package/esm/storage/cos.d.ts +36 -0
package/esm/storage/index.d.ts +33 -0
package/esm/storage/opfs.d.ts +12 -0
package/esm/types/oai-compat.d.ts +278 -0
package/esm/types/types.d.ts +112 -0
package/esm/utils.d.ts +119 -0
package/esm/wasm/source-map.d.ts +1 -0
package/esm/wasm/wllama.wasm +0 -0
package/esm/wasm-from-cdn.d.ts +8 -0
package/esm/wllama.d.ts +397 -0
package/esm/worker.d.ts +92 -0
package/esm/workers-code/generated.d.ts +4 -0
package/guides/intro-v2.md +132 -0
package/guides/intro-v3.1.md +40 -0
package/guides/intro-v3.md +230 -0
package/index.ts +1 -0
package/package.json +71 -0
package/scripts/bisect_test.sh +33 -0
package/scripts/build_hf_space.sh +26 -0
package/scripts/build_source_map.js +269 -0
package/scripts/build_wasm.sh +19 -0
package/scripts/build_worker.sh +38 -0
package/scripts/check_debug_build.js +30 -0
package/scripts/check_package_size.js +25 -0
package/scripts/docker-compose.yml +76 -0
package/scripts/generate_wasm_from_cdn.js +24 -0
package/scripts/http_server.js +44 -0
package/scripts/post_build.sh +32 -0
package/src/cache-manager.ts +358 -0
package/src/debug.ts +111 -0
package/src/glue/glue.ts +291 -0
package/src/glue/messages.ts +773 -0
package/src/huggingface.ts +151 -0
package/src/index.ts +8 -0
package/src/mjs.test.ts +44 -0
package/src/model-manager.test.ts +200 -0
package/src/model-manager.ts +359 -0
package/src/storage/cos.test.ts +83 -0
package/src/storage/cos.ts +171 -0
package/src/storage/index.ts +40 -0
package/src/storage/opfs.ts +119 -0
package/src/types/oai-compat.ts +342 -0
package/src/types/types.ts +133 -0
package/src/utils.test.ts +231 -0
package/src/utils.ts +403 -0
package/src/wasm/source-map.ts +7 -0
package/src/wasm/wllama.js +1 -0
package/src/wasm/wllama.wasm +0 -0
package/src/wasm-from-cdn.ts +13 -0
package/src/wllama.test.ts +392 -0
package/src/wllama.ts +1138 -0
package/src/wllama.wgpu.test.ts +62 -0
package/src/worker.ts +443 -0
package/src/workers-code/generated.ts +11 -0
package/src/workers-code/llama-cpp.js +511 -0
package/src/workers-code/opfs-utils.js +150 -0
package/tsconfig.build.json +34 -0
package/tsup.config.ts +23 -0
package/vitest.config.ts +61 -0

package/esm/types/types.d.ts ADDED Viewed

@@ -0,0 +1,112 @@
+export interface LoadModelParams { // Load-time options; every field is optional. Accepted by Wllama.loadModel(), loadModelFromUrl() and loadModelFromHF().
+    log_level?: LogLevel;
+    seed?: number;
+    n_ctx?: number;
+    n_batch?: number;
+    n_gpu_layers?: number;
+    n_threads?: number;
+    embeddings?: boolean;
+    offload_kqv?: boolean; // NOTE(review): 'no_kv_offload' is also declared below; the two look like inverse flags — confirm which wins if both are set
+    pooling_type?: 'LLAMA_POOLING_TYPE_UNSPECIFIED' | 'LLAMA_POOLING_TYPE_NONE' | 'LLAMA_POOLING_TYPE_MEAN' | 'LLAMA_POOLING_TYPE_CLS' | 'unspecified' | 'none' | 'mean' | 'cls' | 'last' | 'rank'; // accepts LLAMA_POOLING_TYPE_* names and short aliases; 'last' and 'rank' exist only as short aliases
+    rope_scaling_type?: 'LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED' | 'LLAMA_ROPE_SCALING_TYPE_NONE' | 'LLAMA_ROPE_SCALING_TYPE_LINEAR' | 'LLAMA_ROPE_SCALING_TYPE_YARN'; // NOTE(review): only the LLAMA_ROPE_SCALING_TYPE_* names are accepted, no short aliases (unlike pooling_type)
+    rope_freq_base?: number;
+    rope_freq_scale?: number;
+    yarn_ext_factor?: number;
+    yarn_attn_factor?: number;
+    yarn_beta_fast?: number;
+    yarn_beta_slow?: number;
+    yarn_orig_ctx?: number;
+    cache_type_k?: 'f32' | 'f16' | 'q8_0' | 'q5_1' | 'q5_0' | 'q4_1' | 'q4_0';
+    cache_type_v?: 'f32' | 'f16' | 'q8_0' | 'q5_1' | 'q5_0' | 'q4_1' | 'q4_0'; // same set of literals as cache_type_k
+    flash_attn?: boolean;
+    swa_full?: boolean;
+    chat_template?: string;
+    jinja?: boolean;
+    reasoning?: boolean;
+    image_min_tokens?: number;
+    image_max_tokens?: number;
+    warmup?: boolean;
+    no_kv_offload?: boolean;
+    mmproj_offload?: boolean;
+    cont_batching?: boolean;
+    n_keep?: number;
+    ctx_shift?: boolean;
+    cache_idle_slots?: boolean;
+    n_cache_reuse?: number;
+    lora_adapters?: { // list of adapters; each entry needs a path, scale is optional
+        path: string;
+        scale?: number;
+    }[];
+    lora_init_without_apply?: boolean;
+    spec_draft_model?: string;
+    spec_draft_ngl?: number;
+    spec_draft_n_max?: number;
+    spec_draft_n_min?: number;
+    spec_draft_p_min?: number;
+    spec_draft_threads?: number;
+    spec_draft_threads_batch?: number;
+    kv_overrides?: Record<string, string>;
+    reasoning_budget_tokens?: number;
+    reasoning_budget_message?: string;
+    reasoning_format?: 'none' | 'deepseek-legacy' | 'deepseek';
+    skip_chat_parsing?: boolean;
+    prefill_assistant?: boolean;
+    default_template_kwargs?: Record<string, any>; // NOTE(review): values are typed 'any', so they are not type-checked
+}
+export interface LoadedContextInfo { // Return type of Wllama.getLoadedContextInfo(); every field is required.
+    n_vocab: number;
+    n_ctx: number;
+    n_batch: number;
+    n_ubatch: number;
+    n_ctx_train: number;
+    n_embd: number;
+    n_layer: number;
+    metadata: Record<string, string>;
+    token_bos: number;
+    token_eos: number;
+    token_eot: number;
+    list_tokens_eog: number[];
+    has_encoder: boolean;
+    token_decoder_start: number;
+    add_bos_token: boolean;
+    add_eos_token: boolean;
+    has_image_input: boolean;
+    has_audio_input: boolean;
+}
+export interface SamplingParams { // Sampling options; every field is optional.
+    seed?: number; // NOTE(review): declared without '| undefined', unlike most fields here (same for 'grammar')
+    mirostat?: number | undefined;
+    mirostat_eta?: number | undefined;
+    mirostat_tau?: number | undefined;
+    samplers_sequence?: string[] | undefined;
+    temp?: number | undefined;
+    top_p?: number | undefined;
+    top_k?: number | undefined;
+    penalty_last_n?: number | undefined;
+    penalty_repeat?: number | undefined;
+    penalty_freq?: number | undefined;
+    penalty_present?: number | undefined;
+    dynatemp_range?: number | undefined;
+    dynatemp_exponent?: number | undefined;
+    grammar?: string;
+    n_prev?: number | undefined;
+    n_probs?: number | undefined;
+    min_p?: number | undefined;
+    typ_p?: number | undefined; // NOTE(review): 'typical_p' is declared on the next line — presumably aliases; confirm which one is read
+    typical_p?: number | undefined;
+    logit_bias?: { // list of per-token entries, each with a token id and a bias value
+        token: number;
+        bias: number;
+    }[] | undefined;
+    ignore_eos?: boolean | undefined;
+}
+export interface StreamParams<T> { // Callback-style streaming options; intersected with the completion params in the Promise<void> overloads of Wllama.createChatCompletion()/createCompletion().
+    stream: true; // literal type: only the value `true` is accepted
+    onData: (data: T) => void; // receives values of type T; presumably invoked once per streamed chunk — confirm in src/wllama.ts
+}
+export declare enum LogLevel { // Numeric enum (values 1-4); type of LoadModelParams.log_level.
+    DEBUG = 1,
+    INFO = 2,
+    WARN = 3,
+    ERROR = 4
+}

package/esm/utils.d.ts ADDED Viewed

@@ -0,0 +1,119 @@
+export declare const joinBuffers: (buffers: Uint8Array[]) => Uint8Array; // presumably concatenates the buffers in order into one Uint8Array — confirm in src/utils.ts
+/**
+ * Convert a buffer of bytes to text
+ * @param buffer bytes to convert (ArrayBuffer or Uint8Array)
+ * @returns a string
+ */
+export declare const bufToText: (buffer: ArrayBuffer | Uint8Array) => string;
+/**
+ * Get default stdout/stderr config for wasm module (an object with noInitialRun, print, printErr and locateFile)
+ */
+export declare const getWModuleConfig: (pathConfig: {
+    [filename: string]: string;
+}) => {
+    noInitialRun: boolean;
+    print: (text: any) => void;
+    printErr: (text: any) => void;
+    locateFile: (filename: string, basePath: string) => string;
+};
+export interface ShardInfo { // return type of parseShardNumber()
+    baseURL: string;
+    current: number;
+    total: number;
+}
+/**
+ * Parse the shard number and the total number of shards from a file name or URL
+ */
+export declare const parseShardNumber: (fnameOrUrl: string) => ShardInfo;
+/**
+ * Parses a model URL and returns an array of URLs based on the following patterns:
+ * - If the input URL is an array, it returns the array itself.
+ * - If the input URL is a string in the `gguf-split` format, it returns an array containing the URL of each shard in ascending order.
+ * - Otherwise, it returns an array containing the input URL as its single element.
+ * @param modelUrl URL or list of URLs (NOTE(review): the declared parameter type is `string` only, so the array case above is not reachable through this typed signature — confirm)
+ */
+export declare const parseModelUrl: (modelUrl: string) => string[];
+/**
+ * Check if the given blobs are files or not, then sort them by shard number (returns void, so presumably the array is sorted in place — confirm)
+ */
+export declare const sortFileByShard: (blobs: Blob[]) => void;
+export declare const isMmproj: (blob: Blob) => Promise<boolean>;
+export declare const delay: (ms: number) => Promise<unknown>;
+export declare const absoluteUrl: (relativePath: string) => string;
+export declare const padDigits: (number: number, digits: number) => string;
+export declare const sumArr: (arr: number[]) => number;
+export declare const isString: (value: any) => boolean; // NOTE(review): returns plain boolean, not a `value is string` type predicate, so callers get no narrowing
+export declare const MMPROJ_FILE_NAME = "mmproj.gguf";
+type ModelShard = { // not exported; used only in the return type of prepareBlobs below
+    blob: Blob;
+    name: string;
+};
+export declare const prepareBlobs: (blobsInp: Blob[]) => Promise<{
+    llm: ModelShard[];
+    mmproj: ModelShard | null;
+    all: ModelShard[];
+}>;
+/**
+ * Browser feature detection
+ * Copied from https://unpkg.com/wasm-feature-detect?module (Apache License)
+ */
+/**
+ * @returns true if the browser supports multi-threading
+ */
+export declare const isSupportMultiThread: () => Promise<boolean>;
+/**
+ * @returns true if the browser supports JSPI
+ */
+export declare const isSupportJSPI: () => boolean;
+/**
+ * @returns true if the browser supports WebGPU. Note: for browsers without JSPI support, compat mode will be used.
+ */
+export declare const isSupportWebGPU: () => boolean;
+/**
+ * @returns true if the browser supports WASM Memory64
+ */
+export declare const isSupportMem64: () => boolean;
+/**
+ * Throws an error if the environment is not compatible
+ */
+export declare const checkEnvironmentCompatible: () => Promise<void>;
+/**
+ * Check if the browser is Safari
+ * Source: https://github.com/DamonOehlman/detect-browser/blob/master/src/index.ts
+ */
+export declare const isSafari: () => boolean;
+/**
+ * Check if the browser is Firefox
+ */
+export declare const isFirefox: () => boolean;
+/**
+ * Regular expression to validate GGUF file paths/URLs
+ * Matches paths ending with .gguf and optional query parameters
+ */
+export declare const GGUF_FILE_REGEX: RegExp;
+/**
+ * Validates if a given string is a valid GGUF file path/URL
+ * @param path The file path or URL to validate
+ * @returns true if the path is a valid GGUF file path/URL
+ */
+export declare const isValidGgufFile: (path: string) => boolean;
+/**
+ * Check if the browser is Safari on iOS (iPad / iPhone)
+ * Source: https://github.com/DamonOehlman/detect-browser/blob/master/src/index.ts
+ */
+export declare const isSafariMobile: () => boolean;
+/**
+ * Create a worker from its code, given as a string or a Blob
+ */
+export declare const createWorker: (workerCode: string | Blob) => Worker;
+/**
+ * Convert a callback-style function into one that returns an async iterable (the callback receives `val`, `done` and `err`)
+ */
+export declare const cbToAsyncIter: <A extends any[], T>(fn: (...args: [...args: A, callback: (val?: T, done?: boolean, err?: Error) => void]) => void) => (...args: A) => AsyncIterable<T>;
+/**
+ * Check if we can use async file read, where the wasm env can asynchronously read a Blob.
+ * Please refer to README-dev.md for more details.
+ */
+export declare const canUseAsyncFileRead: (compat: boolean) => boolean;
+export declare const needCompat: () => boolean;
+export {};

package/esm/wasm/source-map.d.ts ADDED Viewed

	@@ -0,0 +1 @@
1	+ export declare const WASM_SOURCE_MAP: Record<string, string>;

package/esm/wasm/wllama.wasm ADDED Viewed

Binary file

package/esm/wasm-from-cdn.d.ts ADDED Viewed

@@ -0,0 +1,8 @@
+declare const WasmFromCDN: { // default export of this module (see the last line)
+    default: string;
+};
+export declare const WasmCompatFromCDN: { // named export; has the `worker` and `wasm` string fields
+    worker: string;
+    wasm: string;
+};
+export default WasmFromCDN;

package/esm/wllama.d.ts ADDED Viewed

@@ -0,0 +1,397 @@
+/// <reference types="node" />
+/// <reference types="node" />
+import { type WllamaWorkerResources } from './worker';
+import CacheManager, { type DownloadOptions } from './cache-manager';
+import { ModelManager, Model, type ModelSource } from './model-manager';
+import type { LoadedContextInfo, LoadModelParams, StreamParams } from './types/types';
+import type { ChatCompletionChunk, ChatCompletionParams, ChatCompletionResponse, CreateEmbeddingResponse, EmbeddingCreateParams, RawCompletionChunk, RawCompletionParams, RawCompletionResponse, RerankParams, RerankResponse } from './types/oai-compat';
+import { type HuggingFaceParams } from './huggingface';
+export interface WllamaLogger { // Type of WllamaConfig.logger; each member has the type of the matching console method.
+    debug: typeof console.debug;
+    log: typeof console.log;
+    warn: typeof console.warn;
+    error: typeof console.error;
+}
+export interface WllamaConfig { // Optional second argument of the Wllama constructor; every field is optional.
+    /**
+     * If true, suppress all log messages from native CPP code
+     */
+    suppressNativeLog?: boolean;
+    /**
+     * Custom logger functions
+     */
+    logger?: WllamaLogger;
+    /**
+     * Maximum number of files to be downloaded in parallel
+     *
+     * Default: parallelDownloads = 3
+     */
+    parallelDownloads?: number;
+    /**
+     * Allow offline mode. If true, the model will be loaded from cache if it's available.
+     *
+     * Default: allowOffline = false
+     */
+    allowOffline?: boolean;
+    /**
+     * Custom cache manager (only for advanced usage)
+     */
+    cacheManager?: CacheManager;
+    /**
+     * Custom model manager (only for advanced usage)
+     */
+    modelManager?: ModelManager;
+}
+export interface WllamaChatMessage { // A chat message: a role plus its text content.
+    role: 'system' | 'user' | 'assistant';
+    content: string;
+}
+export interface AssetsPathConfig { // First argument of the Wllama constructor; only `default` is required.
+    default: string;
+    'single-thread/wllama.wasm'?: string;
+    'multi-thread/wllama.wasm'?: string;
+}
+export interface ModelMetadata { // Return type of Wllama.getModelMetadata().
+    hparams: {
+        nVocab: number;
+        nCtxTrain: number;
+        nEmbd: number;
+        nLayer: number;
+    };
+    meta: Record<string, string>;
+}
+/**
+ * Logger preset with debug messages suppressed: `debug` is declared as `() => void`, the other members are console method signatures. NOTE(review): most signatures appear twice and `import("util")` / `console.ConsoleConstructor` are referenced, which looks like DOM and Node console typings merged by the declaration emitter — confirm the published types do not force @types/node on browser projects.
+ */
+export declare const LoggerWithoutDebug: {
+    debug: () => void;
+    assert(condition?: boolean | undefined, ...data: any[]): void;
+    assert(value: any, message?: string | undefined, ...optionalParams: any[]): void;
+    clear(): void;
+    clear(): void;
+    count(label?: string | undefined): void;
+    count(label?: string | undefined): void;
+    countReset(label?: string | undefined): void;
+    countReset(label?: string | undefined): void;
+    dir(item?: any, options?: any): void;
+    dir(obj: any, options?: import("util").InspectOptions | undefined): void;
+    dirxml(...data: any[]): void;
+    dirxml(...data: any[]): void;
+    error(...data: any[]): void;
+    error(message?: any, ...optionalParams: any[]): void;
+    group(...data: any[]): void;
+    group(...label: any[]): void;
+    groupCollapsed(...data: any[]): void;
+    groupCollapsed(...label: any[]): void;
+    groupEnd(): void;
+    groupEnd(): void;
+    info(...data: any[]): void;
+    info(message?: any, ...optionalParams: any[]): void;
+    log(...data: any[]): void;
+    log(message?: any, ...optionalParams: any[]): void;
+    table(tabularData?: any, properties?: string[] | undefined): void;
+    table(tabularData: any, properties?: readonly string[] | undefined): void;
+    time(label?: string | undefined): void;
+    time(label?: string | undefined): void;
+    timeEnd(label?: string | undefined): void;
+    timeEnd(label?: string | undefined): void;
+    timeLog(label?: string | undefined, ...data: any[]): void;
+    timeLog(label?: string | undefined, ...data: any[]): void;
+    timeStamp(label?: string | undefined): void;
+    timeStamp(label?: string | undefined): void;
+    trace(...data: any[]): void;
+    trace(message?: any, ...optionalParams: any[]): void;
+    warn(...data: any[]): void;
+    warn(message?: any, ...optionalParams: any[]): void;
+    Console: console.ConsoleConstructor;
+    profile(label?: string | undefined): void;
+    profileEnd(label?: string | undefined): void;
+};
+export type WllamaErrorType = 'model_not_loaded' | 'download_error' | 'load_error' | 'kv_cache_full' | 'unknown_error' | 'inference_error'; // category tag carried by WllamaError.type
+export declare class WllamaError extends Error {
+    type: WllamaErrorType;
+    constructor(message: string, type?: WllamaErrorType); // `type` is optional; the default used when omitted is not visible in this declaration
+}
+/**
+ * WllamaAbortError is thrown when the user wants to abort the current operation.
+ * This is equivalent to AbortError in the Fetch API.
+ */
+export declare class WllamaAbortError extends Error {
+    name: string;
+    constructor();
+}
+/**
+ * WllamaRuntimeError is thrown when there is an error in the WASM runtime, such as stack overflow, OOM, etc.
+ * The stack trace of the error in the WASM runtime can be included in the error object for debugging purposes.
+ */
+export declare class WllamaRuntimeError extends Error {
+    name: string;
+    stack: string;
+    constructor(message: string, stack: string);
+}
+/**
+ * Compatibility resources for Wllama, as passed to `Wllama.setCompat()`.
+ * By default, these are set to the URLs of the latest builds on CDN, which requires internet access to download. If you want to use local assets or have your own CDN, follow the instructions from the @wllama/wllama-compat package.
+ */
+export interface WllamaCompat {
+    worker: string | {
+        code: string;
+    };
+    wasm: string;
+}
+export declare class Wllama {
+    cacheManager: CacheManager;
+    modelManager: ModelManager;
+    private compat;
+    private proxy;
+    private config;
+    private pathConfig;
+    private useMultiThread;
+    private nbThreads;
+    private useEmbeddings;
+    private useRerank;
+    private loadedContextInfo;
+    private seed;
+    private bosToken;
+    private eosToken;
+    private eotToken;
+    private eogTokens;
+    private addBosToken;
+    private addEosToken;
+    private mediaMarker?;
+    private chatTemplate?;
+    private metadata?;
+    private hasEncoder;
+    private decoderStartToken;
+    private chatTemplateKwargs;
+    constructor(pathConfig: AssetsPathConfig, wllamaConfig?: WllamaConfig);
+    private logger;
+    private checkModelLoaded;
+    /**
+     * Get the libllama version string, e.g. "b6327-4d74393".
+     *
+     * @returns version string embedded at build time.
+     */
+    static getLibllamaVersion(): string;
+    /**
+     * Set compatibility options for Wllama.
+     * @param compat Set to null to disable compatibility, or 'default' to use the default compat resources from CDN.
+     * @param mode 'safari' by default; If set to 'firefox_safari', the compat mode will **also** be enabled on Firefox, which will significantly degrade the performance but allow using WebGPU on Firefox.
+     */
+    setCompat(compat: WllamaCompat | null | 'default', mode?: 'safari' | 'firefox_safari'): void;
+    /**
+     * Check if the model is loaded via `loadModel()`
+     */
+    isModelLoaded(): boolean;
+    /**
+     * Get token ID associated to BOS (begin of sentence) token.
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns -1 if the model is not loaded.
+     */
+    getBOS(): number;
+    /**
+     * Get token ID associated to EOS (end of sentence) token.
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns -1 if the model is not loaded.
+     */
+    getEOS(): number;
+    /**
+     * Get token ID associated to EOT (end of turn) token.
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns -1 if the model is not loaded.
+     */
+    getEOT(): number;
+    /**
+     * Check if a given token is end-of-generation token (e.g. EOS, EOT, etc.)
+     *
+     * @param token the token ID to be checked
+     * @returns true if the token is EOS, EOT, or any other end-of-generation tokens
+     */
+    isTokenEOG(token: number): boolean;
+    /**
+     * Get token ID associated to token used by decoder, to start generating output sequence (only usable for encoder-decoder architecture). In other words, encoder uses normal BOS and decoder uses this token.
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns -1 if the model is not loaded.
+     */
+    getDecoderStartToken(): number;
+    /**
+     * Get model hyper-parameters and metadata
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns ModelMetadata
+     */
+    getModelMetadata(): ModelMetadata;
+    /**
+     * Check if we're currently using multi-thread build.
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns true if multi-thread is used.
+     */
+    isMultithread(): boolean;
+    /**
+     * Get number of threads used in the current context.
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns number of threads
+     */
+    getNumThreads(): number;
+    /**
+     * Check if the current model uses encoder-decoder architecture
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns true if the current model uses encoder-decoder architecture.
+     */
+    isEncoderDecoderArchitecture(): boolean;
+    /**
+     * Must we add BOS token to the tokenized sequence?
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns true if BOS token must be added to the sequence
+     */
+    mustAddBosToken(): boolean;
+    /**
+     * Must we add EOS token to the tokenized sequence?
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns true if EOS token must be added to the sequence
+     */
+    mustAddEosToken(): boolean;
+    /**
+     * Get the jinja chat template that comes with the model. It is only available if the original model (before converting to gguf) has the template in `tokenizer_config.json`
+     *
+     * NOTE: This can only be used after `loadModel` is called.
+     *
+     * @returns the jinja template. null if there is no template in gguf
+     */
+    getChatTemplate(): string | null;
+    /**
+     * Check if WebGPU is supported by the current environment.
+     * @returns true if WebGPU is supported
+     */
+    isSupportWebGPU(): boolean;
+    /**
+     * Load model from a given URL (or a list of URLs, in case the model is split into smaller files)
+     * - If the model has already been downloaded (via `downloadModel()`), then we will use the cached model
+     * - Else, we download the model from the internet
+     * @param modelSourceOrURL
+     * @param params
+     */
+    loadModelFromUrl(modelSourceOrURL: ModelSource | string, params?: LoadModelParams & DownloadOptions & {
+        useCache?: boolean;
+    }): Promise<void>;
+    /**
+     * Load model from a given Hugging Face model ID and file path.
+     *
+     * @param hfOptions
+     * @param params
+     */
+    loadModelFromHF(hfOptions: HuggingFaceParams, params?: LoadModelParams & DownloadOptions & {
+        useCache?: boolean;
+    }): Promise<void>;
+    /**
+     * Load model from a given list of Blobs.
+     *
+     * You can pass multiple buffers into the function (in case the model contains multiple shards).
+     *
+     * @param ggufBlobsOrModel Can be either list of Blobs (in case you use local file), or a Model object (in case you use ModelManager)
+     * @param params LoadModelParams
+     */
+    loadModel(ggufBlobsOrModel: Blob[] | Model, params?: LoadModelParams): Promise<void>;
+    getLoadedContextInfo(): LoadedContextInfo;
+    /**
+     * Calculate embedding vector for a given text.
+     * By default, BOS and EOS tokens will be added automatically. You can use the "skipBOS" and "skipEOS" option to disable it.
+     * @param options OAI-compatible embedding creation options
+     * @returns OAI-compatible embedding response
+     */
+    createEmbedding(options: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
+    /**
+     * Rerank a list of documents against a query.
+     * Requires the model to be loaded with embeddings: true and pooling_type: 'rank'.
+     * @param options Reranking options (query, documents, top_n)
+     * @returns Reranking response with relevance scores sorted highest first
+     */
+    createRerank(options: RerankParams): Promise<RerankResponse>;
+    /**
+     * Make chat completion for a given chat messages.
+     * @param options OAI-compatible chat completion options
+     * @returns OAI-compatible chat completion response (only the final result when stream=false), void when done (stream=true + onData), or an async iterator of completion chunks (stream=true, no onData)
+     */
+    createChatCompletion(options: ChatCompletionParams & {
+        stream?: false;
+    }): Promise<ChatCompletionResponse>;
+    createChatCompletion(options: ChatCompletionParams & StreamParams<ChatCompletionChunk>): Promise<void>;
+    createChatCompletion(options: ChatCompletionParams & {
+        stream: true;
+    }): Promise<AsyncIterable<ChatCompletionChunk>>;
+    /**
+     * Make (raw) completion for a given text.
+     * @param options OAI-compatible completion options
+     * @returns OAI-compatible completion response (stream=false), void when done (stream=true + onData), or async iterator (stream=true, no onData)
+     */
+    createCompletion(options: RawCompletionParams & {
+        stream?: false;
+    }): Promise<RawCompletionResponse>;
+    createCompletion(options: RawCompletionParams & StreamParams<RawCompletionChunk>): Promise<void>;
+    createCompletion(options: RawCompletionParams & {
+        stream: true;
+    }): Promise<AsyncIterable<RawCompletionChunk>>;
+    /**
+     * Private implementation of createCompletion
+     */
+    private createCompletionImpl;
+    /**
+     * Same as `createCompletion`, but returns an async iterator instead.
+     * Only called when stream=true and no onData is provided.
+     */
+    private createCompletionGenerator;
+    /**
+     * Whether the currently loaded model supports a specific input modality (e.g. image or audio).
+     * @param modality
+     * @returns
+     */
+    supportInputModality(modality: 'image' | 'audio'): boolean;
+    /**
+     * Unload the model and free all memory.
+     *
+     * Note: This function will NOT crash if model is not yet loaded
+     */
+    exit(): Promise<void>;
+    /**
+     * [FOR DEBUGGING ONLY] Run ggml backend ops tests without loading any model.
+     *
+     * Initializes the wasm runtime, executes `test-backend-ops` with the given args, then shuts down.
+     *
+     * For more info, please refer to guides/debug.md (NOTE(review): no guides/debug.md is listed among the files of this package version — confirm the path)
+     *
+     * @param args Arguments forwarded to test-backend-ops (e.g. ["-o", "ADD"])
+     * @returns retcode (0 = all tests passed) and success flag
+     */
+    testBackendOps(args?: string[]): Promise<{
+        retcode: number;
+        success: boolean;
+    }>;
+    /**
+     * Get debug info
+     */
+    _getDebugInfo(): Promise<any>;
+    private jsonDecode;
+    private prepareMultimodalInput;
+    private getRerankResult;
+    private getResponse;
+    getWorkerResources(): WllamaWorkerResources;
+}