@superlinked/sie-sdk 0.3.4 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -239,9 +239,9 @@ interface CapacityInfo {
239
239
  gpuCount: number;
240
240
  /** Number of unique models loaded across all workers */
241
241
  modelsLoaded: number;
242
- /** GPU types configured in the cluster */
242
+ /** Canonical machine profiles configured in the cluster */
243
243
  configuredGpuTypes: string[];
244
- /** GPU types currently running */
244
+ /** Machine profiles currently running */
245
245
  liveGpuTypes: string[];
246
246
  /** List of worker details */
247
247
  workers: WorkerInfo[];
@@ -250,10 +250,14 @@ interface CapacityInfo {
250
250
  * Pool specification for creating resource pools.
251
251
  */
252
252
  interface PoolSpec {
253
- /** Pool name (used in GPU param as "poolName/gpuType") */
253
+ /** Pool name (used in GPU param as "poolName/machineProfile") */
254
254
  name: string;
255
- /** GPU requirements, e.g., { l4: 2, "a100-40gb": 1 } */
255
+ /** Machine profile requirements for pool readiness, e.g., { l4: 2, "a100-40gb": 1 } */
256
256
  gpus?: Record<string, number>;
257
+ /** Optional maximum assigned workers per machine profile */
258
+ gpuCaps?: Record<string, number>;
259
+ /** Optional maximum assigned workers per machine profile, as returned by the gateway */
260
+ gpu_caps?: Record<string, number>;
257
261
  }
258
262
  /**
259
263
  * Pool status information.
@@ -281,6 +285,7 @@ interface PoolInfo {
281
285
  /** Pool specification */
282
286
  spec: {
283
287
  gpus?: Record<string, number>;
288
+ gpu_caps?: Record<string, number>;
284
289
  };
285
290
  /** Pool status */
286
291
  status: PoolStatus;
@@ -405,6 +410,324 @@ interface ScoreOptions {
405
410
  /** Whether to wait for capacity */
406
411
  waitForCapacity?: boolean;
407
412
  }
413
+ /** Reason the generation terminated. */
414
+ type FinishReason = "stop" | "length" | "cancelled" | "content_filter" | "error";
415
+ /** Token usage for a single generation call. */
416
+ interface GenerationUsage {
417
+ promptTokens: number;
418
+ completionTokens: number;
419
+ totalTokens: number;
420
+ }
421
+ /** Options for the generate operation. */
422
+ interface GenerateOptions {
423
+ /** Hard cap on output tokens. Required. */
424
+ maxNewTokens: number;
425
+ /** Sampling temperature. */
426
+ temperature?: number;
427
+ /** Nucleus sampling cutoff. */
428
+ topP?: number;
429
+ /** Optional list of stop strings. */
430
+ stop?: string[];
431
+ /** GPU type / pool spec, e.g. `"l4"` or `"eval-bench/l4"`. */
432
+ gpu?: string;
433
+ /** Auto-retry under provisioning. */
434
+ waitForCapacity?: boolean;
435
+ }
436
+ /** Aggregated generation result. */
437
+ interface GenerateResult {
438
+ /** Model id the gateway dispatched to. */
439
+ model: string;
440
+ /** Full generated text (concatenation of all streamed deltas). */
441
+ text: string;
442
+ /** Termination reason. */
443
+ finishReason: FinishReason;
444
+ /** Prompt / completion / total token counts. */
445
+ usage: GenerationUsage;
446
+ /** Worker-generated attempt id. */
447
+ attemptId?: string;
448
+ /** Time-to-first-token in milliseconds. */
449
+ ttftMs?: number;
450
+ /** Average time per output token in milliseconds. */
451
+ tpotMs?: number;
452
+ }
453
+ /**
454
+ * A single message in a chat completion request.
455
+ *
456
+ * Accepted roles: `system`, `user`, `assistant`, `tool`, `developer`. The
457
+ * gateway normalises `developer` → `system` before forwarding to the worker
458
+ * (the OpenAI 2024-08 rename — most chat templates only have `system`).
459
+ *
460
+ * `content` may be a string OR an array of typed content parts. The gateway
461
+ * concatenates `text` / `input_text` parts; `image_url` / `input_image` parts
462
+ * are rejected with `400 unsupported_field` because no vision-capable
463
+ * generation model is configured today (the contract is forward-ready). See
464
+ * `packages/sie_gateway/src/openapi.rs` and `proxy.rs::chat_params_from_json`
465
+ * for the canonical accepted subset.
466
+ */
467
+ interface ChatMessage {
468
+ role: "system" | "user" | "assistant" | "tool" | "developer";
469
+ content: string | ChatContentPart[] | null;
470
+ name?: string;
471
+ /** Required when `role === "tool"`. */
472
+ tool_call_id?: string;
473
+ /** Populated by the model when calling tools (assistant turns only). */
474
+ tool_calls?: ToolCall[];
475
+ }
476
+ /**
477
+ * One content part inside a multimodal `messages[*].content` array. Only the
478
+ * text variants are accepted today; image parts are deliberately omitted so
479
+ * callers see the rejection at the type layer instead of at runtime.
480
+ */
481
+ type ChatContentPart = {
482
+ type: "text";
483
+ text: string;
484
+ } | {
485
+ type: "input_text";
486
+ text: string;
487
+ };
488
+ /** A tool call emitted by the model. */
489
+ interface ToolCall {
490
+ id: string;
491
+ type: "function";
492
+ function: {
493
+ name: string;
494
+ arguments: string;
495
+ };
496
+ }
497
+ /** A tool the model is allowed to call. */
498
+ interface ToolSpec {
499
+ type: "function";
500
+ function: {
501
+ name: string;
502
+ description?: string;
503
+ /** JSON Schema describing the function arguments. */
504
+ parameters?: Record<string, unknown>;
505
+ };
506
+ }
507
+ /** Tool-routing directive. */
508
+ type ToolChoice = "auto" | "none" | "required" | {
509
+ type: "function";
510
+ function: {
511
+ name: string;
512
+ };
513
+ };
514
+ /** Structured-output `response_format` envelope. */
515
+ interface ResponseFormat {
516
+ type: "json_schema" | "json_object" | "text";
517
+ /** JSON Schema body when `type === "json_schema"`. */
518
+ json_schema?: unknown;
519
+ }
520
+ /** OpenAI-compatible chat-completion finish reason. */
521
+ type ChatFinishReason = "stop" | "length" | "tool_calls" | "content_filter" | null;
522
+ /**
523
+ * Request body for `chatCompletions` / `streamChatCompletions`.
524
+ *
525
+ * Field names are snake_case (the wire shape) so the SDK can hand the object
526
+ * to `JSON.stringify` without further translation. SIE-specific routing
527
+ * fields (`routing_key`, `prompt_cache_key`) match the gateway schema in
528
+ * `packages/sie_gateway/src/openapi.rs`.
529
+ *
530
+ * The gateway honours: `model`, `messages`, `max_tokens` /
531
+ * `max_completion_tokens`, `temperature`, `top_p`, `top_k`, `stop`, `stream`,
532
+ * `stream_options`, `tools`, `tool_choice`, `parallel_tool_calls`,
533
+ * `response_format`, `frequency_penalty`, `presence_penalty` (each in
534
+ * `[-2, 2]`), `repetition_penalty`, `n`, `best_of`, `logprobs`,
535
+ * `top_logprobs`, `logit_bias`, `seed`, `user`, `safety_identifier`,
536
+ * `lora_adapter`, `routing_key`, and `prompt_cache_key`. Unknown fields
537
+ * are rejected with `400 unsupported_field`.
538
+ */
539
+ interface ChatCompletionRequest {
540
+ model: string;
541
+ messages: ChatMessage[];
542
+ /** Legacy alias; the gateway prefers `max_completion_tokens` when both set. */
543
+ max_tokens?: number;
544
+ max_completion_tokens?: number;
545
+ temperature?: number;
546
+ top_p?: number;
547
+ /**
548
+ * Non-OpenAI sampling knob (vLLM / SGLang). Integer `>= 1`; absent →
549
+ * sampler default (top-k disabled).
550
+ */
551
+ top_k?: number;
552
+ /**
553
+ * Non-OpenAI repetition penalty (SGLang). Float in `(0.0, 2.0]`; `1.0`
554
+ * means no penalty. Absent → sampler default.
555
+ */
556
+ repetition_penalty?: number;
557
+ /** Single stop string or list of stop strings. */
558
+ stop?: string | string[];
559
+ /** `streamChatCompletions` sets this to `true` automatically; `chatCompletions` rejects `true`. */
560
+ stream?: boolean;
561
+ /** Streaming-only: ask the server to emit a final usage-only chunk before `[DONE]`. */
562
+ stream_options?: {
563
+ include_usage?: boolean;
564
+ };
565
+ tools?: ToolSpec[];
566
+ tool_choice?: ToolChoice;
567
+ /** OpenAI parallel-tool-calls toggle (default `true`). */
568
+ parallel_tool_calls?: boolean;
569
+ response_format?: ResponseFormat;
570
+ /** Accepted in the OpenAI range [-2, 2]; out-of-range values are rejected. */
571
+ frequency_penalty?: number;
572
+ presence_penalty?: number;
573
+ /**
574
+ * Multi-candidate count. Default `1`. `n > 1 && stream === true` is
575
+ * rejected by the gateway with 400.
576
+ */
577
+ n?: number;
578
+ /**
579
+ * Generate this many candidates and return the top `n` by cumulative
580
+ * logprob. Range `[1, 128]`; requires `best_of >= n` and `stream: false`.
581
+ */
582
+ best_of?: number;
583
+ /**
584
+ * `true` requests per-token log-probabilities on each chunk / on the
585
+ * aggregate response. Required when `top_logprobs > 0`.
586
+ */
587
+ logprobs?: boolean;
588
+ /**
589
+ * How many alternate-token logprobs to return per position. Range
590
+ * `[0, 20]` per the OpenAI spec; implies `logprobs: true` when `> 0`.
591
+ */
592
+ top_logprobs?: number;
593
+ /**
594
+ * `{token_id: bias_float}` map. Gateway validates per-value range
595
+ * `[-100, 100]` and caps map size.
596
+ */
597
+ logit_bias?: Record<string, number>;
598
+ seed?: number;
599
+ /**
600
+ * OpenAI's free-text end-user identifier. Accepted and logged at debug
601
+ * level by the gateway.
602
+ */
603
+ user?: string;
604
+ /**
605
+ * OpenAI's free-text safety-tier identifier (replacement for `user` on
606
+ * safety-sensitive accounts). Accepted but intentionally not logged.
607
+ */
608
+ safety_identifier?: string;
609
+ /**
610
+ * Multi-LoRA: served-name of the adapter to apply on the worker (SIE
611
+ * extension). Must be a non-empty string; unknown names are rejected by
612
+ * the gateway with 400 `unknown_lora`.
613
+ */
614
+ lora_adapter?: string;
615
+ /** SIE-native routing affinity hint. */
616
+ routing_key?: string;
617
+ /** SIE-native prompt-cache hint. */
618
+ prompt_cache_key?: string;
619
+ }
620
+ /** Token usage block (snake_case, matches the wire shape). */
621
+ interface ChatUsage {
622
+ prompt_tokens: number;
623
+ completion_tokens: number;
624
+ total_tokens: number;
625
+ }
626
+ /** A single choice in a `ChatCompletion` (non-streaming). */
627
+ interface ChatChoice {
628
+ index: number;
629
+ message: ChatMessage;
630
+ finish_reason: ChatFinishReason;
631
+ logprobs: null;
632
+ }
633
+ /** Non-streaming response from `chatCompletions`. */
634
+ interface ChatCompletion {
635
+ id: string;
636
+ object: "chat.completion";
637
+ created: number;
638
+ model: string;
639
+ system_fingerprint: string | null;
640
+ choices: ChatChoice[];
641
+ usage: ChatUsage;
642
+ }
643
+ /** Incremental delta emitted on each streaming chunk. */
644
+ interface ChatDelta {
645
+ /** First chunk only, per the OpenAI streaming contract. */
646
+ role?: "assistant";
647
+ content?: string;
648
+ tool_calls?: ToolCallDelta[];
649
+ }
650
+ /** Partial tool-call materialised across multiple streaming chunks. */
651
+ interface ToolCallDelta {
652
+ index: number;
653
+ id?: string;
654
+ type?: "function";
655
+ function?: {
656
+ name?: string;
657
+ arguments?: string;
658
+ };
659
+ }
660
+ /** A single choice in a streaming `ChatCompletionChunk`. */
661
+ interface ChatChunkChoice {
662
+ index: number;
663
+ delta: ChatDelta;
664
+ finish_reason: ChatFinishReason;
665
+ logprobs: null;
666
+ }
667
+ /**
668
+ * One SSE event from `streamChatCompletions`.
669
+ *
670
+ * The terminal-usage chunk (emitted when `stream_options.include_usage` is
671
+ * `true`) sets `choices: []` and populates `usage`.
672
+ */
673
+ interface ChatCompletionChunk {
674
+ id: string;
675
+ object: "chat.completion.chunk";
676
+ created: number;
677
+ model: string;
678
+ system_fingerprint: string | null;
679
+ choices: ChatChunkChoice[];
680
+ usage?: ChatUsage;
681
+ }
682
+ /**
683
+ * Per-call options for `chatCompletions` controlling the pre-execution
684
+ * provisioning / retry loop. The request body itself is the separate
685
+ * {@link ChatCompletionRequest} argument; these knobs only govern HOW the
686
+ * SDK talks to the gateway, not WHAT it asks for.
687
+ *
688
+ * All fields are optional and fall back to the client-level defaults
689
+ * (`waitForCapacity`, `provisionTimeout`) when omitted.
690
+ */
691
+ interface ChatCompletionOptions {
692
+ /**
693
+ * When `true`, retry the SAFE pre-execution capacity signals
694
+ * (`202 Accepted`, `503 MODEL_LOADING`, generic `503`) until
695
+ * `provisionTimeoutMs` elapses. When `false`, the first such signal
696
+ * throws (`ProvisioningError` / `ModelLoadingError` / `ServerError`).
697
+ * Defaults to the client's `waitForCapacity` (false unless the
698
+ * constructor opted in).
699
+ */
700
+ waitForCapacity?: boolean;
701
+ /**
702
+ * Total cumulative wall-clock budget (ms) for provisioning retries.
703
+ * Independent of the per-attempt `timeout`. Defaults to the client's
704
+ * `provisionTimeout` (typically 5 minutes).
705
+ */
706
+ provisionTimeoutMs?: number;
707
+ }
708
+ /**
709
+ * One SSE event from `streamGenerate`.
710
+ *
711
+ * SIE-native shape — see `packages/sie_gateway/src/handlers/sse.rs`
712
+ * (`build_generate_chunk_event`). `usage` and `ttft_ms` only land on the
713
+ * terminal chunk; `error` is populated when generation failed mid-stream
714
+ * (handled by throwing `SIEStreamError`, never yielded).
715
+ */
716
+ interface GenerateChunk {
717
+ request_id: string;
718
+ seq: number;
719
+ text_delta: string;
720
+ done: boolean;
721
+ finish_reason?: "stop" | "length" | "cancelled" | "error";
722
+ usage?: ChatUsage;
723
+ /** Time-to-first-token, milliseconds. Terminal chunk only. */
724
+ ttft_ms?: number;
725
+ /** Populated when the worker / gateway errored mid-stream. */
726
+ error?: {
727
+ code: string;
728
+ message: string;
729
+ };
730
+ }
408
731
  /**
409
732
  * Options for extract operation.
410
733
  */
@@ -579,6 +902,177 @@ declare class SIEClient {
579
902
  * console.log(result.scores[0].itemId); // most relevant
580
903
  * ```
581
904
  */
905
+ /**
906
+ * Generate text from a prompt (walking-skeleton SDK surface).
907
+ *
908
+ * This method does not expose streaming chunks (see {@link streamGenerate}). The worker streams
909
+ * to the gateway, the gateway aggregates, and the SDK returns the
910
+ * assembled result plus SIE-native timing metadata (TTFT, TPOT,
911
+ * attempt id).
912
+ *
913
+ * @example
914
+ * ```typescript
915
+ * const result = await client.generate(
916
+ * "Qwen__Qwen3-4B-Instruct-2507",
917
+ * "Write a haiku about the sea.",
918
+ * { maxNewTokens: 64, temperature: 0.7 },
919
+ * );
920
+ * console.log(result.text);
921
+ * console.log(`TTFT: ${result.ttftMs}ms`);
922
+ * ```
923
+ */
924
+ generate(model: string, prompt: string, options: GenerateOptions): Promise<GenerateResult>;
925
+ /**
926
+ * Per-attempt JSON POST used by the non-streaming surfaces
927
+ * ({@link generate}, {@link chatCompletions}) inside the
928
+ * {@link withProvisioningRetry} loop.
929
+ *
930
+ * Translates low-level transport failures into typed errors that the
931
+ * retry loop will surface verbatim:
932
+ * - `AbortError` → `SIEConnectionError` (per-attempt timeout)
933
+ * - `TypeError` → `SIEConnectionError` (NOT retried — generation is
934
+ * non-idempotent, so a mid-flight drop must surface instead of
935
+ * silently re-issuing a billable generation)
936
+ *
937
+ * Each call uses a fresh `AbortController` so concurrent retries don't
938
+ * share state, and the per-attempt timeout is bounded by `this.timeout`
939
+ * (NOT the cumulative provisioning budget).
940
+ */
941
+ private performJsonPost;
942
+ /**
943
+ * Non-streaming chat-completion call against `/v1/chat/completions`.
944
+ *
945
+ * This is the OpenAI-compatible surface. The request body is forwarded
946
+ * verbatim as JSON, so any field documented at
947
+ * <https://platform.openai.com/docs/api-reference/chat/create> can be set;
948
+ * the gateway will reject fields it does not yet support with
949
+ * `400 unsupported_field`. SIE-native routing hints (`routing_key`,
950
+ * `prompt_cache_key`) are part of the same request shape.
951
+ *
952
+ * Error semantics mirror `generate()`: 4xx → `RequestError`, 5xx →
953
+ * `ServerError` (or the more specific `ModelLoadFailedError` for 502
954
+ * `MODEL_LOAD_FAILED`), connection / timeout failures →
955
+ * `SIEConnectionError`.
956
+ *
957
+ * If `req.stream === true`, this method throws `RequestError` immediately —
958
+ * use {@link streamChatCompletions} instead. We do not auto-route because
959
+ * the return type is fundamentally different (`Promise` vs
960
+ * `AsyncGenerator`) and silently flipping would mis-type the call site.
961
+ *
962
+ * @example
963
+ * ```typescript
964
+ * const reply = await client.chatCompletions({
965
+ * model: "Qwen/Qwen3-4B-Instruct-2507",
966
+ * messages: [{ role: "user", content: "Write a haiku about the sea." }],
967
+ * max_completion_tokens: 64,
968
+ * });
969
+ * console.log(reply.choices[0]?.message.content);
970
+ * ```
971
+ */
972
+ chatCompletions(req: ChatCompletionRequest, options?: ChatCompletionOptions): Promise<ChatCompletion>;
973
+ /**
974
+ * Streaming chat-completion call against `/v1/chat/completions` with
975
+ * `Accept: text/event-stream`.
976
+ *
977
+ * Yields `ChatCompletionChunk` events in the order the gateway emits them.
978
+ * The terminal chunk carries `finish_reason`; if
979
+ * `req.stream_options.include_usage === true`, a final usage-only chunk
980
+ * (`choices: []`, populated `usage`) follows it. The generator completes
981
+ * cleanly on the `data: [DONE]` sentinel.
982
+ *
983
+ * Error semantics:
984
+ *
985
+ * - HTTP 4xx / 5xx **before** the stream opens → throws `RequestError` /
986
+ * `ServerError` (same as {@link chatCompletions}).
987
+ * - A chunk containing `error: { ... }` mid-stream → throws
988
+ * {@link SIEStreamError}. The error chunk is consumed, never yielded.
989
+ * - `signal.abort()` mid-stream → the generator throws
990
+ * `SIEConnectionError` and releases the underlying reader, which
991
+ * fires `StreamCancelGuard` on the gateway side.
992
+ *
993
+ * `req.stream` is set to `true` automatically; any existing value is
994
+ * overwritten. We do not validate `req.stream === false` because the
995
+ * call-site intent is unambiguous.
996
+ *
997
+ * @param req The chat-completion request. See {@link ChatCompletionRequest}.
998
+ * @param signal Optional `AbortSignal` for cooperative cancellation.
999
+ *
1000
+ * @example
1001
+ * ```typescript
1002
+ * const controller = new AbortController();
1003
+ * try {
1004
+ * for await (const chunk of client.streamChatCompletions(
1005
+ * {
1006
+ * model: "Qwen/Qwen3-4B-Instruct-2507",
1007
+ * messages: [{ role: "user", content: "Count to ten." }],
1008
+ * stream_options: { include_usage: true },
1009
+ * },
1010
+ * controller.signal,
1011
+ * )) {
1012
+ * process.stdout.write(chunk.choices[0]?.delta.content ?? "");
1013
+ * }
1014
+ * } catch (err) {
1015
+ * if (err instanceof SIEStreamError) {
1016
+ * console.error(`mid-stream error: ${err.code} — ${err.message}`);
1017
+ * } else throw err;
1018
+ * }
1019
+ * ```
1020
+ */
1021
+ streamChatCompletions(req: ChatCompletionRequest, signal?: AbortSignal): AsyncGenerator<ChatCompletionChunk, void, undefined>;
1022
+ /**
1023
+ * Streaming companion to {@link generate} — opens an SSE connection to
1024
+ * `/v1/generate/{model}` with `stream: true` and yields the SIE-native
1025
+ * chunk shape documented in
1026
+ * `packages/sie_gateway/src/handlers/sse.rs::build_generate_chunk_event`.
1027
+ *
1028
+ * The first delta carries `seq: 0` and `text_delta` populated; the
1029
+ * terminal chunk has `done: true`, `finish_reason`, and (typically)
1030
+ * `usage` + `ttft_ms`. The generator completes on the `data: [DONE]`
1031
+ * sentinel.
1032
+ *
1033
+ * Error semantics match {@link streamChatCompletions}: pre-stream HTTP
1034
+ * errors throw normally, mid-stream `error` chunks throw
1035
+ * {@link SIEStreamError}.
1036
+ *
1037
+ * @example
1038
+ * ```typescript
1039
+ * for await (const chunk of client.streamGenerate(
1040
+ * "Qwen/Qwen3-4B-Instruct-2507",
1041
+ * "Write a haiku.",
1042
+ * { maxNewTokens: 64, temperature: 0.7 },
1043
+ * )) {
1044
+ * process.stdout.write(chunk.text_delta);
1045
+ * if (chunk.done) console.log(`\nTTFT: ${chunk.ttft_ms}ms`);
1046
+ * }
1047
+ * ```
1048
+ */
1049
+ streamGenerate(model: string, prompt: string, options: GenerateOptions, signal?: AbortSignal): AsyncGenerator<GenerateChunk, void, undefined>;
1050
+ /**
1051
+ * Shared SSE consumption helper for the streaming methods.
1052
+ *
1053
+ * Performs a pre-stream provisioning retry loop (honoring
1054
+ * `waitForCapacity`/`provisionTimeout`), surfaces pre-stream errors via
1055
+ * {@link handleError} (so callers see the same `RequestError` /
1056
+ * `ServerError` hierarchy as the non-streaming endpoints), then iterates
1057
+ * the SSE payloads via {@link parseSseStream}. Each payload is JSON-parsed;
1058
+ * if the consumer-supplied `extractError` returns an `SIEStreamError`, the
1059
+ * generator throws it instead of yielding the chunk.
1060
+ *
1061
+ * Retry policy mirrors {@link generate}: only the SAFE pre-execution
1062
+ * capacity signals — `202` (provisioning) and `503 MODEL_LOADING` — are
1063
+ * retried, and only while `waitForCapacity` is set and the provision
1064
+ * budget remains. Once the body opens we never retry (the call is
1065
+ * non-idempotent; a mid-stream failure must not re-issue generation).
1066
+ *
1067
+ * @internal
1068
+ */
1069
+ private consumeSseStream;
1070
+ /**
1071
+ * Build the standard JSON header set for the chat-completions surface.
1072
+ * Pulled out so both the streaming and non-streaming paths agree on
1073
+ * auth / version / content-type wiring.
1074
+ */
1075
+ private buildChatHeaders;
582
1076
  score(model: string, query: Item, items: Item[], options?: ScoreOptions): Promise<ScoreResult>;
583
1077
  /**
584
1078
  * Extract entities from a single item.
@@ -607,17 +1101,18 @@ declare class SIEClient {
607
1101
  */
608
1102
  close(): Promise<void>;
609
1103
  /**
610
- * Create a resource pool for isolated capacity.
1104
+ * Create or update a resource pool for isolated capacity.
611
1105
  *
612
1106
  * Pools provide dedicated worker capacity, isolated from other clients.
613
1107
  * Workers are assigned to pools and only serve requests from that pool.
614
1108
  *
615
1109
  * @param name - Pool name (used in GPU param as "poolName/machineProfile")
616
- * @param gpus - Machine profile requirements, e.g., { "l4": 2, "l4-spot": 1 }
1110
+ * @param gpus - Optional machine profile requirements for pool readiness, e.g., { "l4": 2, "l4-spot": 1 }
1111
+ * @param gpuCaps - Optional maximum assigned workers per machine profile
617
1112
  *
618
1113
  * @example
619
1114
  * ```typescript
620
- * // Create a pool with 2 L4 GPUs
1115
+ * // Create or update a pool with 2 L4 GPUs
621
1116
  * await client.createPool("eval-bench", { l4: 2 });
622
1117
  *
623
1118
  * // Use the pool for requests
@@ -627,7 +1122,7 @@ declare class SIEClient {
627
1122
  * await client.deletePool("eval-bench");
628
1123
  * ```
629
1124
  */
630
- createPool(name: string, gpus: Record<string, number>): Promise<void>;
1125
+ createPool(name: string, gpus?: Record<string, number>, gpuCaps?: Record<string, number>): Promise<void>;
631
1126
  /**
632
1127
  * Get information about a pool.
633
1128
  *
@@ -740,7 +1235,7 @@ declare class SIEClient {
740
1235
  private detectEndpointType;
741
1236
  }
742
1237
 
743
- declare const SDK_VERSION = "0.3.4";
1238
+ declare const SDK_VERSION = "0.4.1";
744
1239
 
745
1240
  /**
746
1241
  * Helpers for converting SIE encode results to plain JavaScript types.
@@ -956,6 +1451,33 @@ declare class ModelLoadingError extends SIEError {
956
1451
  readonly model: string | undefined;
957
1452
  constructor(message: string, model?: string);
958
1453
  }
1454
+ /**
1455
+ * Error surfaced mid-stream from `streamChatCompletions` / `streamGenerate`.
1456
+ *
1457
+ * The SSE wire shape includes optional `error: {message, type, param, code}`
1458
+ * (chat) or `error: {code, message}` (SIE-native generate) on the terminal
1459
+ * chunk. When the SDK sees such a chunk it does NOT yield the chunk; instead
1460
+ * it throws `SIEStreamError`, mirroring the non-streaming `handleError` path
1461
+ * so callers can catch the same way they would for HTTP-level failures.
1462
+ *
1463
+ * Compare with `RequestError` / `ServerError`: those fire before the SSE
1464
+ * stream opens (HTTP 4xx / 5xx). `SIEStreamError` fires after at least one
1465
+ * byte has gone out — the connection itself was healthy, but the worker /
1466
+ * gateway emitted an error envelope partway through generation.
1467
+ */
1468
+ declare class SIEStreamError extends SIEError {
1469
+ /** SIE-native error code (e.g. `context_exceeded`, `cancelled`). */
1470
+ readonly code: string | undefined;
1471
+ /** OpenAI-style error type (e.g. `context_length_exceeded`, `server_error`). */
1472
+ readonly errorType: string | undefined;
1473
+ /** Offending field name when known (chat shape only). */
1474
+ readonly param: string | null | undefined;
1475
+ constructor(message: string, options?: {
1476
+ code?: string;
1477
+ errorType?: string;
1478
+ param?: string | null;
1479
+ });
1480
+ }
959
1481
  /**
960
1482
  * Error when the server reports a *terminal* model-load failure.
961
1483
  *
@@ -1114,4 +1636,4 @@ declare function toImageWireFormat(input: ImageInput, format?: "jpeg" | "png" |
1114
1636
  */
1115
1637
  declare function detectImageFormat(bytes: Uint8Array): "jpeg" | "png" | "webp" | "unknown";
1116
1638
 
1117
- export { type CapacityInfo, type Classification, type ClusterStatusMessage, type ClusterSummary, type ClusterWorkerInfo, type DType, type DetectedObject, type EncodeOptions, type EncodeResult, type Entity, type ExtractOptions, type ExtractResult, type GPUMetrics, type ImageInput, type ImageWireFormat, InputTooLongError, type Item, LoraLoadingError, type ModelConfig, type ModelDims, type ModelInfo, ModelLoadFailedError, ModelLoadingError, type ModelState, type ModelStatus, type ModelSummary, type OutputType, PoolError, type PoolInfo, type PoolSpec, type PoolStatus, ProvisioningError, type Relation, RequestError, SDK_VERSION, SIEClient, type SIEClientOptions, SIEConnectionError, SIEError, type ScoreEntry, type ScoreOptions, type ScoreResult, ServerError, type ServerInfo, type SparseResult, type SparseVector, type StatusMessage, type TimingInfo, type WorkerInfo, type WorkerStatusMessage, denseEmbedding, detectImageFormat, multivectorEmbedding, normalizeSparseVector, packMessage, sparseEmbedding, sparseEmbeddingMap, toFloat32Array, toImageBytes, toImageWireFormat, toNumberArray, unpackMessage };
1639
+ export { type CapacityInfo, type ChatChoice, type ChatChunkChoice, type ChatCompletion, type ChatCompletionChunk, type ChatCompletionRequest, type ChatDelta, type ChatFinishReason, type ChatMessage, type ChatUsage, type Classification, type ClusterStatusMessage, type ClusterSummary, type ClusterWorkerInfo, type DType, type DetectedObject, type EncodeOptions, type EncodeResult, type Entity, type ExtractOptions, type ExtractResult, type FinishReason, type GPUMetrics, type GenerateChunk, type GenerateOptions, type GenerateResult, type GenerationUsage, type ImageInput, type ImageWireFormat, InputTooLongError, type Item, LoraLoadingError, type ModelConfig, type ModelDims, type ModelInfo, ModelLoadFailedError, ModelLoadingError, type ModelState, type ModelStatus, type ModelSummary, type OutputType, PoolError, type PoolInfo, type PoolSpec, type PoolStatus, ProvisioningError, type Relation, RequestError, type ResponseFormat, SDK_VERSION, SIEClient, type SIEClientOptions, SIEConnectionError, SIEError, SIEStreamError, type ScoreEntry, type ScoreOptions, type ScoreResult, ServerError, type ServerInfo, type SparseResult, type SparseVector, type StatusMessage, type TimingInfo, type ToolCall, type ToolCallDelta, type ToolChoice, type ToolSpec, type WorkerInfo, type WorkerStatusMessage, denseEmbedding, detectImageFormat, multivectorEmbedding, normalizeSparseVector, packMessage, sparseEmbedding, sparseEmbeddingMap, toFloat32Array, toImageBytes, toImageWireFormat, toNumberArray, unpackMessage };