npm - @superlinked/sie-sdk - Versions diffs - 0.3.4 → 0.4.1 - Mend

@superlinked/sie-sdk 0.3.4 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.d.cts CHANGED

@@ -239,9 +239,9 @@ interface CapacityInfo {
     gpuCount: number;
     /** Number of unique models loaded across all workers */
     modelsLoaded: number;
-    /** GPU types configured in the cluster */
+    /** Canonical machine profiles configured in the cluster */
     configuredGpuTypes: string[];
-    /** GPU types currently running */
+    /** Machine profiles currently running */
     liveGpuTypes: string[];
     /** List of worker details */
     workers: WorkerInfo[];
@@ -250,10 +250,14 @@ interface CapacityInfo {
  * Pool specification for creating resource pools.
  */
 interface PoolSpec {
-    /** Pool name (used in GPU param as "poolName/gpuType") */
+    /** Pool name (used in GPU param as "poolName/machineProfile") */
     name: string;
-    /** GPU requirements, e.g., { l4: 2, "a100-40gb": 1 } */
+    /** Machine profile requirements for pool readiness, e.g., { l4: 2, "a100-40gb": 1 } */
     gpus?: Record<string, number>;
+    /** Optional maximum assigned workers per machine profile */
+    gpuCaps?: Record<string, number>;
+    /** Optional maximum assigned workers per machine profile, as returned by the gateway */
+    gpu_caps?: Record<string, number>;
 }
 /**
  * Pool status information.
@@ -281,6 +285,7 @@ interface PoolInfo {
     /** Pool specification */
     spec: {
         gpus?: Record<string, number>;
+        gpu_caps?: Record<string, number>;
     };
     /** Pool status */
     status: PoolStatus;
@@ -405,6 +410,324 @@ interface ScoreOptions {
     /** Whether to wait for capacity */
     waitForCapacity?: boolean;
 }
+/** Reason the generation terminated. */
+type FinishReason = "stop" | "length" | "cancelled" | "content_filter" | "error";
+/** Token usage for a single generation call. */
+interface GenerationUsage {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+}
+/** Options for the generate operation. */
+interface GenerateOptions {
+    /** Hard cap on output tokens. Required. */
+    maxNewTokens: number;
+    /** Sampling temperature. */
+    temperature?: number;
+    /** Nucleus sampling cutoff. */
+    topP?: number;
+    /** Optional list of stop strings. */
+    stop?: string[];
+    /** GPU type / pool spec, e.g. `"l4"` or `"eval-bench/l4"`. */
+    gpu?: string;
+    /** Auto-retry under provisioning. */
+    waitForCapacity?: boolean;
+}
+/** Aggregated generation result. */
+interface GenerateResult {
+    /** Model id the gateway dispatched to. */
+    model: string;
+    /** Full generated text (concatenation of all streamed deltas). */
+    text: string;
+    /** Termination reason. */
+    finishReason: FinishReason;
+    /** Prompt / completion / total token counts. */
+    usage: GenerationUsage;
+    /** Worker-generated attempt id. */
+    attemptId?: string;
+    /** Time-to-first-token in milliseconds. */
+    ttftMs?: number;
+    /** Average time per output token in milliseconds. */
+    tpotMs?: number;
+}
+/**
+ * A single message in a chat completion request.
+ *
+ * Accepted roles: `system`, `user`, `assistant`, `tool`, `developer`. The
+ * gateway normalises `developer` → `system` before forwarding to the worker
+ * (the OpenAI 2024-08 rename — most chat templates only have `system`).
+ *
+ * `content` may be a string OR an array of typed content parts. The gateway
+ * concatenates `text` / `input_text` parts; `image_url` / `input_image` parts
+ * are rejected with `400 unsupported_field` because no vision-capable
+ * generation model is configured today (the contract is forward-ready). See
+ * `packages/sie_gateway/src/openapi.rs` and `proxy.rs::chat_params_from_json`
+ * for the canonical accepted subset.
+ */
+interface ChatMessage {
+    role: "system" | "user" | "assistant" | "tool" | "developer";
+    content: string | ChatContentPart[] | null;
+    name?: string;
+    /** Required when `role === "tool"`. */
+    tool_call_id?: string;
+    /** Populated by the model when calling tools (assistant turns only). */
+    tool_calls?: ToolCall[];
+}
+/**
+ * One content part inside a multimodal `messages[*].content` array. Only the
+ * text variants are accepted today; image parts are omitted from this union so
+ * callers see the rejection at the type layer instead of at runtime.
+ */
+type ChatContentPart = {
+    type: "text";
+    text: string;
+} | {
+    type: "input_text";
+    text: string;
+};
+/** A tool call emitted by the model. */
+interface ToolCall {
+    id: string;
+    type: "function";
+    function: {
+        name: string;
+        arguments: string;
+    };
+}
+/** A tool the model is allowed to call. */
+interface ToolSpec {
+    type: "function";
+    function: {
+        name: string;
+        description?: string;
+        /** JSON Schema describing the function arguments. */
+        parameters?: Record<string, unknown>;
+    };
+}
+/** Tool-routing directive. */
+type ToolChoice = "auto" | "none" | "required" | {
+    type: "function";
+    function: {
+        name: string;
+    };
+};
+/** Structured-output `response_format` envelope. */
+interface ResponseFormat {
+    type: "json_schema" | "json_object" | "text";
+    /** JSON Schema body when `type === "json_schema"`. */
+    json_schema?: unknown;
+}
+/** OpenAI-compatible chat-completion finish reason. */
+type ChatFinishReason = "stop" | "length" | "tool_calls" | "content_filter" | null;
+/**
+ * Request body for `chatCompletions` / `streamChatCompletions`.
+ *
+ * Field names are snake_case (the wire shape) so the SDK can hand the object
+ * to `JSON.stringify` without further translation. SIE-specific routing
+ * fields (`routing_key`, `prompt_cache_key`) match the gateway schema in
+ * `packages/sie_gateway/src/openapi.rs`.
+ *
+ * The gateway honours: `model`, `messages`, `max_tokens` /
+ * `max_completion_tokens`, `temperature`, `top_p`, `top_k`, `stop`, `stream`,
+ * `stream_options`, `tools`, `tool_choice`, `parallel_tool_calls`,
+ * `response_format`, `frequency_penalty`, `presence_penalty` (each in
+ * `[-2, 2]`), `repetition_penalty`, `n`, `best_of`, `logprobs`,
+ * `top_logprobs`, `logit_bias`, `seed`, `user`, `safety_identifier`,
+ * `lora_adapter`, `routing_key`, and `prompt_cache_key`. Unknown fields
+ * are rejected with `400 unsupported_field`.
+ */
+interface ChatCompletionRequest {
+    model: string;
+    messages: ChatMessage[];
+    /** Legacy alias; the gateway prefers `max_completion_tokens` when both set. */
+    max_tokens?: number;
+    max_completion_tokens?: number;
+    temperature?: number;
+    top_p?: number;
+    /**
+     * Non-OpenAI sampling knob (vLLM / SGLang). Integer `>= 1`; absent →
+     * sampler default (top-k disabled).
+     */
+    top_k?: number;
+    /**
+     * Non-OpenAI repetition penalty (SGLang). Float in `(0.0, 2.0]`; `1.0`
+     * means no penalty. Absent → sampler default.
+     */
+    repetition_penalty?: number;
+    /** Single stop string or list of stop strings. */
+    stop?: string | string[];
+    /** Set to `true` to use `streamChatCompletions`. `chatCompletions` rejects this. */
+    stream?: boolean;
+    /** Streaming-only: ask the server to emit a final usage-only chunk before `[DONE]`. */
+    stream_options?: {
+        include_usage?: boolean;
+    };
+    tools?: ToolSpec[];
+    tool_choice?: ToolChoice;
+    /** OpenAI parallel-tool-calls toggle (default `true`). */
+    parallel_tool_calls?: boolean;
+    response_format?: ResponseFormat;
+    /** Accepted in the OpenAI range [-2, 2]; out-of-range values are rejected. */
+    frequency_penalty?: number;
+    presence_penalty?: number;
+    /**
+     * Multi-candidate count. Default `1`. `n > 1 && stream === true` is
+     * rejected by the gateway with 400.
+     */
+    n?: number;
+    /**
+     * Generate this many candidates and return the top `n` by cumulative
+     * logprob. Range `[1, 128]`; requires `best_of >= n` and `stream: false`.
+     */
+    best_of?: number;
+    /**
+     * `true` requests per-token log-probabilities on each chunk / on the
+     * aggregate response. Required when `top_logprobs > 0`.
+     */
+    logprobs?: boolean;
+    /**
+     * How many alternate-token logprobs to return per position. Range
+     * `[0, 20]` per the OpenAI spec; implies `logprobs: true` when `> 0`.
+     */
+    top_logprobs?: number;
+    /**
+     * `{token_id: bias_float}` map. Gateway validates per-value range
+     * `[-100, 100]` and caps map size.
+     */
+    logit_bias?: Record<string, number>;
+    seed?: number;
+    /**
+     * OpenAI's free-text end-user identifier. Accepted and logged at debug
+     * level by the gateway.
+     */
+    user?: string;
+    /**
+     * OpenAI's free-text safety-tier identifier (replacement for `user` on
+     * safety-sensitive accounts). Accepted but intentionally not logged.
+     */
+    safety_identifier?: string;
+    /**
+     * Multi-LoRA: served-name of the adapter to apply on the worker (SIE
+     * extension). Must be a non-empty string; unknown names are rejected by
+     * the gateway with 400 `unknown_lora`.
+     */
+    lora_adapter?: string;
+    /** SIE-native routing affinity hint. */
+    routing_key?: string;
+    /** SIE-native prompt-cache hint. */
+    prompt_cache_key?: string;
+}
+/** Token usage block (snake_case, matches the wire shape). */
+interface ChatUsage {
+    prompt_tokens: number;
+    completion_tokens: number;
+    total_tokens: number;
+}
+/** A single choice in a `ChatCompletion` (non-streaming). */
+interface ChatChoice {
+    index: number;
+    message: ChatMessage;
+    finish_reason: ChatFinishReason;
+    logprobs: null;
+}
+/** Non-streaming response from `chatCompletions`. */
+interface ChatCompletion {
+    id: string;
+    object: "chat.completion";
+    created: number;
+    model: string;
+    system_fingerprint: string | null;
+    choices: ChatChoice[];
+    usage: ChatUsage;
+}
+/** Incremental delta emitted on each streaming chunk. */
+interface ChatDelta {
+    /** First chunk only, per the OpenAI streaming contract. */
+    role?: "assistant";
+    content?: string;
+    tool_calls?: ToolCallDelta[];
+}
+/** Partial tool-call materialised across multiple streaming chunks. */
+interface ToolCallDelta {
+    index: number;
+    id?: string;
+    type?: "function";
+    function?: {
+        name?: string;
+        arguments?: string;
+    };
+}
+/** A single choice in a streaming `ChatCompletionChunk`. */
+interface ChatChunkChoice {
+    index: number;
+    delta: ChatDelta;
+    finish_reason: ChatFinishReason;
+    logprobs: null;
+}
+/**
+ * One SSE event from `streamChatCompletions`.
+ *
+ * The terminal-usage chunk (emitted when `stream_options.include_usage` is
+ * `true`) sets `choices: []` and populates `usage`.
+ */
+interface ChatCompletionChunk {
+    id: string;
+    object: "chat.completion.chunk";
+    created: number;
+    model: string;
+    system_fingerprint: string | null;
+    choices: ChatChunkChoice[];
+    usage?: ChatUsage;
+}
+/**
+ * Per-call options for `chatCompletions` controlling the pre-execution
+ * provisioning / retry loop. The request body itself is the separate
+ * {@link ChatCompletionRequest} argument; these knobs only govern HOW the
+ * SDK talks to the gateway, not WHAT it asks for.
+ *
+ * All fields are optional and fall back to the client-level defaults
+ * (`waitForCapacity`, `provisionTimeout`) when omitted.
+ */
+interface ChatCompletionOptions {
+    /**
+     * When `true`, retry the SAFE pre-execution capacity signals
+     * (`202 Accepted`, `503 MODEL_LOADING`, generic `503`) until
+     * `provisionTimeoutMs` elapses. When `false`, the first such signal
+     * throws (`ProvisioningError` / `ModelLoadingError` / `ServerError`).
+     * Defaults to the client's `waitForCapacity` (false unless the
+     * constructor opted in).
+     */
+    waitForCapacity?: boolean;
+    /**
+     * Total cumulative wall-clock budget (ms) for provisioning retries.
+     * Independent of the per-attempt `timeout`. Defaults to the client's
+     * `provisionTimeout` (typically 5 minutes).
+     */
+    provisionTimeoutMs?: number;
+}
+/**
+ * One SSE event from `streamGenerate`.
+ *
+ * SIE-native shape — see `packages/sie_gateway/src/handlers/sse.rs`
+ * (`build_generate_chunk_event`). `usage` and `ttft_ms` only land on the
+ * terminal chunk; `error` is populated when generation failed mid-stream
+ * (handled by throwing `SIEStreamError`, never yielded).
+ */
+interface GenerateChunk {
+    request_id: string;
+    seq: number;
+    text_delta: string;
+    done: boolean;
+    finish_reason?: "stop" | "length" | "cancelled" | "error";
+    usage?: ChatUsage;
+    /** Time-to-first-token, milliseconds. Terminal chunk only. */
+    ttft_ms?: number;
+    /** Populated when the worker / gateway errored mid-stream. */
+    error?: {
+        code: string;
+        message: string;
+    };
+}
 /**
  * Options for extract operation.
  */
@@ -579,6 +902,177 @@ declare class SIEClient {
      * console.log(result.scores[0].itemId); // most relevant
      * ```
      */
+    /**
+     * Generate text from a prompt (walking-skeleton SDK surface).
+     *
+     * This method does not expose streaming chunks (use {@link streamGenerate}
+     * for that). The worker streams to the gateway, the gateway aggregates,
+     * and the SDK returns the assembled result plus SIE-native timing
+     * metadata (TTFT, TPOT, attempt id).
+     *
+     * @example
+     * ```typescript
+     * const result = await client.generate(
+     *   "Qwen__Qwen3-4B-Instruct-2507",
+     *   "Write a haiku about the sea.",
+     *   { maxNewTokens: 64, temperature: 0.7 },
+     * );
+     * console.log(result.text);
+     * console.log(`TTFT: ${result.ttftMs}ms`);
+     * ```
+     */
+    generate(model: string, prompt: string, options: GenerateOptions): Promise<GenerateResult>;
+    /**
+     * Per-attempt JSON POST used by the non-streaming surfaces
+     * ({@link generate}, {@link chatCompletions}) inside the
+     * {@link withProvisioningRetry} loop.
+     *
+     * Translates low-level transport failures into typed errors that the
+     * retry loop will surface verbatim:
+     *   - `AbortError` → `SIEConnectionError` (per-attempt timeout)
+     *   - `TypeError`  → `SIEConnectionError` (NOT retried — generation is
+     *     non-idempotent, so a mid-flight drop must surface instead of
+     *     silently re-issuing a billable generation)
+     *
+     * Each call uses a fresh `AbortController` so concurrent retries don't
+     * share state, and the per-attempt timeout is bounded by `this.timeout`
+     * (NOT the cumulative provisioning budget).
+     */
+    private performJsonPost;
+    /**
+     * Non-streaming chat-completion call against `/v1/chat/completions`.
+     *
+     * This is the OpenAI-compatible surface. The request body is forwarded
+     * verbatim as JSON, so any field documented at
+     * <https://platform.openai.com/docs/api-reference/chat/create> can be set;
+     * the gateway will reject fields it does not yet support with
+     * `400 unsupported_field`. SIE-native routing hints (`routing_key`,
+     * `prompt_cache_key`) are part of the same request shape.
+     *
+     * Error semantics mirror `generate()`: 4xx → `RequestError`, 5xx →
+     * `ServerError` (or the more specific `ModelLoadFailedError` for 502
+     * `MODEL_LOAD_FAILED`), connection / timeout failures →
+     * `SIEConnectionError`.
+     *
+     * If `req.stream === true`, this method throws `RequestError` immediately —
+     * use {@link streamChatCompletions} instead. We do not auto-route because
+     * the return type is fundamentally different (`Promise` vs
+     * `AsyncGenerator`) and silently flipping would mis-type the call site.
+     *
+     * @example
+     * ```typescript
+     * const reply = await client.chatCompletions({
+     *   model: "Qwen/Qwen3-4B-Instruct-2507",
+     *   messages: [{ role: "user", content: "Write a haiku about the sea." }],
+     *   max_completion_tokens: 64,
+     * });
+     * console.log(reply.choices[0]?.message.content);
+     * ```
+     */
+    chatCompletions(req: ChatCompletionRequest, options?: ChatCompletionOptions): Promise<ChatCompletion>;
+    /**
+     * Streaming chat-completion call against `/v1/chat/completions` with
+     * `Accept: text/event-stream`.
+     *
+     * Yields `ChatCompletionChunk` events in the order the gateway emits them.
+     * The terminal chunk carries `finish_reason`; if
+     * `req.stream_options.include_usage === true`, a final usage-only chunk
+     * (`choices: []`, populated `usage`) follows it. The generator completes
+     * cleanly on the `data: [DONE]` sentinel.
+     *
+     * Error semantics:
+     *
+     *   - HTTP 4xx / 5xx **before** the stream opens → throws `RequestError` /
+     *     `ServerError` (same as {@link chatCompletions}).
+     *   - A chunk containing `error: { ... }` mid-stream → throws
+     *     {@link SIEStreamError}. The error chunk is consumed, never yielded.
+     *   - `signal.abort()` mid-stream → the generator throws
+     *     `SIEConnectionError` and releases the underlying reader, which
+     *     fires `StreamCancelGuard` on the gateway side.
+     *
+     * `req.stream` is set to `true` automatically; any existing value is
+     * overwritten. We do not validate `req.stream === false` because the
+     * call-site intent is unambiguous.
+     *
+     * @param req     The chat-completion request. See {@link ChatCompletionRequest}.
+     * @param signal  Optional `AbortSignal` for cooperative cancellation.
+     *
+     * @example
+     * ```typescript
+     * const controller = new AbortController();
+     * try {
+     *   for await (const chunk of client.streamChatCompletions(
+     *     {
+     *       model: "Qwen/Qwen3-4B-Instruct-2507",
+     *       messages: [{ role: "user", content: "Count to ten." }],
+     *       stream_options: { include_usage: true },
+     *     },
+     *     controller.signal,
+     *   )) {
+     *     process.stdout.write(chunk.choices[0]?.delta.content ?? "");
+     *   }
+     * } catch (err) {
+     *   if (err instanceof SIEStreamError) {
+     *     console.error(`mid-stream error: ${err.code} — ${err.message}`);
+     *   } else throw err;
+     * }
+     * ```
+     */
+    streamChatCompletions(req: ChatCompletionRequest, signal?: AbortSignal): AsyncGenerator<ChatCompletionChunk, void, undefined>;
+    /**
+     * Streaming companion to {@link generate} — opens an SSE connection to
+     * `/v1/generate/{model}` with `stream: true` and yields the SIE-native
+     * chunk shape documented in
+     * `packages/sie_gateway/src/handlers/sse.rs::build_generate_chunk_event`.
+     *
+     * The first delta carries `seq: 0` and `text_delta` populated; the
+     * terminal chunk has `done: true`, `finish_reason`, and (typically)
+     * `usage` + `ttft_ms`. The generator completes on the `data: [DONE]`
+     * sentinel.
+     *
+     * Error semantics match {@link streamChatCompletions}: pre-stream HTTP
+     * errors throw normally, mid-stream `error` chunks throw
+     * {@link SIEStreamError}.
+     *
+     * @example
+     * ```typescript
+     * for await (const chunk of client.streamGenerate(
+     *   "Qwen/Qwen3-4B-Instruct-2507",
+     *   "Write a haiku.",
+     *   { maxNewTokens: 64, temperature: 0.7 },
+     * )) {
+     *   process.stdout.write(chunk.text_delta);
+     *   if (chunk.done) console.log(`\nTTFT: ${chunk.ttft_ms}ms`);
+     * }
+     * ```
+     */
+    streamGenerate(model: string, prompt: string, options: GenerateOptions, signal?: AbortSignal): AsyncGenerator<GenerateChunk, void, undefined>;
+    /**
+     * Shared SSE consumption helper for the streaming methods.
+     *
+     * Performs a pre-stream provisioning retry loop (honoring
+     * `waitForCapacity`/`provisionTimeout`), surfaces pre-stream errors via
+     * {@link handleError} (so callers see the same `RequestError` /
+     * `ServerError` hierarchy as the non-streaming endpoints), then iterates
+     * the SSE payloads via {@link parseSseStream}. Each payload is JSON-parsed;
+     * if the consumer-supplied `extractError` returns an `SIEStreamError`, the
+     * generator throws it instead of yielding the chunk.
+     *
+     * Retry policy mirrors {@link generate}: only the SAFE pre-execution
+     * capacity signals — `202` (provisioning) and `503 MODEL_LOADING` — are
+     * retried, and only while `waitForCapacity` is set and the provision
+     * budget remains. Once the body opens we never retry (the call is
+     * non-idempotent; a mid-stream failure must not re-issue generation).
+     *
+     * @internal
+     */
+    private consumeSseStream;
+    /**
+     * Build the standard JSON header set for the chat-completions surface.
+     * Pulled out so both the streaming and non-streaming paths agree on
+     * auth / version / content-type wiring.
+     */
+    private buildChatHeaders;
     score(model: string, query: Item, items: Item[], options?: ScoreOptions): Promise<ScoreResult>;
     /**
      * Extract entities from a single item.
@@ -607,17 +1101,18 @@ declare class SIEClient {
      */
     close(): Promise<void>;
     /**
-     * Create a resource pool for isolated capacity.
+     * Create or update a resource pool for isolated capacity.
      *
      * Pools provide dedicated worker capacity, isolated from other clients.
      * Workers are assigned to pools and only serve requests from that pool.
      *
      * @param name - Pool name (used in GPU param as "poolName/machineProfile")
-     * @param gpus - Machine profile requirements, e.g., { "l4": 2, "l4-spot": 1 }
+     * @param gpus - Optional machine profile requirements for pool readiness, e.g., { "l4": 2, "l4-spot": 1 }
+     * @param gpuCaps - Optional maximum assigned workers per machine profile
      *
      * @example
      * ```typescript
-     * // Create a pool with 2 L4 GPUs
+     * // Create or update a pool with 2 L4 GPUs
      * await client.createPool("eval-bench", { l4: 2 });
      *
      * // Use the pool for requests
@@ -627,7 +1122,7 @@ declare class SIEClient {
      * await client.deletePool("eval-bench");
      * ```
      */
-    createPool(name: string, gpus: Record<string, number>): Promise<void>;
+    createPool(name: string, gpus?: Record<string, number>, gpuCaps?: Record<string, number>): Promise<void>;
     /**
      * Get information about a pool.
      *
@@ -740,7 +1235,7 @@ declare class SIEClient {
     private detectEndpointType;
 }
-declare const SDK_VERSION = "0.3.4";
+declare const SDK_VERSION = "0.4.1";
 /**
  * Helpers for converting SIE encode results to plain JavaScript types.
@@ -956,6 +1451,33 @@ declare class ModelLoadingError extends SIEError {
     readonly model: string | undefined;
     constructor(message: string, model?: string);
 }
+/**
+ * Error surfaced mid-stream from `streamChatCompletions` / `streamGenerate`.
+ *
+ * The SSE wire shape includes optional `error: {message, type, param, code}`
+ * (chat) or `error: {code, message}` (SIE-native generate) on the terminal
+ * chunk. When the SDK sees such a chunk it does NOT yield the chunk; instead
+ * it throws `SIEStreamError`, mirroring the non-streaming `handleError` path
+ * so callers can catch the same way they would for HTTP-level failures.
+ *
+ * Compare with `RequestError` / `ServerError`: those fire before the SSE
+ * stream opens (HTTP 4xx / 5xx). `SIEStreamError` fires after at least one
+ * byte has gone out — the connection itself was healthy, but the worker /
+ * gateway emitted an error envelope partway through generation.
+ */
+declare class SIEStreamError extends SIEError {
+    /** SIE-native error code (e.g. `context_exceeded`, `cancelled`). */
+    readonly code: string | undefined;
+    /** OpenAI-style error type (e.g. `context_length_exceeded`, `server_error`). */
+    readonly errorType: string | undefined;
+    /** Offending field name when known (chat shape only). */
+    readonly param: string | null | undefined;
+    constructor(message: string, options?: {
+        code?: string;
+        errorType?: string;
+        param?: string | null;
+    });
+}
 /**
  * Error when the server reports a *terminal* model-load failure.
  *
@@ -1114,4 +1636,4 @@ declare function toImageWireFormat(input: ImageInput, format?: "jpeg" | "png" |
  */
 declare function detectImageFormat(bytes: Uint8Array): "jpeg" | "png" | "webp" | "unknown";
-export { type CapacityInfo, type Classification, type ClusterStatusMessage, type ClusterSummary, type ClusterWorkerInfo, type DType, type DetectedObject, type EncodeOptions, type EncodeResult, type Entity, type ExtractOptions, type ExtractResult, type GPUMetrics, type ImageInput, type ImageWireFormat, InputTooLongError, type Item, LoraLoadingError, type ModelConfig, type ModelDims, type ModelInfo, ModelLoadFailedError, ModelLoadingError, type ModelState, type ModelStatus, type ModelSummary, type OutputType, PoolError, type PoolInfo, type PoolSpec, type PoolStatus, ProvisioningError, type Relation, RequestError, SDK_VERSION, SIEClient, type SIEClientOptions, SIEConnectionError, SIEError, type ScoreEntry, type ScoreOptions, type ScoreResult, ServerError, type ServerInfo, type SparseResult, type SparseVector, type StatusMessage, type TimingInfo, type WorkerInfo, type WorkerStatusMessage, denseEmbedding, detectImageFormat, multivectorEmbedding, normalizeSparseVector, packMessage, sparseEmbedding, sparseEmbeddingMap, toFloat32Array, toImageBytes, toImageWireFormat, toNumberArray, unpackMessage };
+export { type CapacityInfo, type ChatChoice, type ChatChunkChoice, type ChatCompletion, type ChatCompletionChunk, type ChatCompletionRequest, type ChatDelta, type ChatFinishReason, type ChatMessage, type ChatUsage, type Classification, type ClusterStatusMessage, type ClusterSummary, type ClusterWorkerInfo, type DType, type DetectedObject, type EncodeOptions, type EncodeResult, type Entity, type ExtractOptions, type ExtractResult, type FinishReason, type GPUMetrics, type GenerateChunk, type GenerateOptions, type GenerateResult, type GenerationUsage, type ImageInput, type ImageWireFormat, InputTooLongError, type Item, LoraLoadingError, type ModelConfig, type ModelDims, type ModelInfo, ModelLoadFailedError, ModelLoadingError, type ModelState, type ModelStatus, type ModelSummary, type OutputType, PoolError, type PoolInfo, type PoolSpec, type PoolStatus, ProvisioningError, type Relation, RequestError, type ResponseFormat, SDK_VERSION, SIEClient, type SIEClientOptions, SIEConnectionError, SIEError, SIEStreamError, type ScoreEntry, type ScoreOptions, type ScoreResult, ServerError, type ServerInfo, type SparseResult, type SparseVector, type StatusMessage, type TimingInfo, type ToolCall, type ToolCallDelta, type ToolChoice, type ToolSpec, type WorkerInfo, type WorkerStatusMessage, denseEmbedding, detectImageFormat, multivectorEmbedding, normalizeSparseVector, packMessage, sparseEmbedding, sparseEmbeddingMap, toFloat32Array, toImageBytes, toImageWireFormat, toNumberArray, unpackMessage };