@superlinked/sie-sdk 0.3.4 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +709 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +532 -10
- package/dist/index.d.ts +532 -10
- package/dist/index.js +709 -18
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -239,9 +239,9 @@ interface CapacityInfo {
|
|
|
239
239
|
gpuCount: number;
|
|
240
240
|
/** Number of unique models loaded across all workers */
|
|
241
241
|
modelsLoaded: number;
|
|
242
|
-
/**
|
|
242
|
+
/** Canonical machine profiles configured in the cluster */
|
|
243
243
|
configuredGpuTypes: string[];
|
|
244
|
-
/**
|
|
244
|
+
/** Machine profiles currently running */
|
|
245
245
|
liveGpuTypes: string[];
|
|
246
246
|
/** List of worker details */
|
|
247
247
|
workers: WorkerInfo[];
|
|
@@ -250,10 +250,14 @@ interface CapacityInfo {
|
|
|
250
250
|
* Pool specification for creating resource pools.
|
|
251
251
|
*/
|
|
252
252
|
interface PoolSpec {
|
|
253
|
-
/** Pool name (used in GPU param as "poolName/
|
|
253
|
+
/** Pool name (used in GPU param as "poolName/machineProfile") */
|
|
254
254
|
name: string;
|
|
255
|
-
/**
|
|
255
|
+
/** Machine profile requirements for pool readiness, e.g., { l4: 2, "a100-40gb": 1 } */
|
|
256
256
|
gpus?: Record<string, number>;
|
|
257
|
+
/** Optional maximum assigned workers per machine profile */
|
|
258
|
+
gpuCaps?: Record<string, number>;
|
|
259
|
+
/** Optional maximum assigned workers per machine profile, as returned by the gateway */
|
|
260
|
+
gpu_caps?: Record<string, number>;
|
|
257
261
|
}
|
|
258
262
|
/**
|
|
259
263
|
* Pool status information.
|
|
@@ -281,6 +285,7 @@ interface PoolInfo {
|
|
|
281
285
|
/** Pool specification */
|
|
282
286
|
spec: {
|
|
283
287
|
gpus?: Record<string, number>;
|
|
288
|
+
gpu_caps?: Record<string, number>;
|
|
284
289
|
};
|
|
285
290
|
/** Pool status */
|
|
286
291
|
status: PoolStatus;
|
|
@@ -405,6 +410,324 @@ interface ScoreOptions {
|
|
|
405
410
|
/** Whether to wait for capacity */
|
|
406
411
|
waitForCapacity?: boolean;
|
|
407
412
|
}
|
|
413
|
+
/** Reason the generation terminated. */
|
|
414
|
+
type FinishReason = "stop" | "length" | "cancelled" | "content_filter" | "error";
|
|
415
|
+
/** Token usage for a single generation call. */
|
|
416
|
+
interface GenerationUsage {
|
|
417
|
+
promptTokens: number;
|
|
418
|
+
completionTokens: number;
|
|
419
|
+
totalTokens: number;
|
|
420
|
+
}
|
|
421
|
+
/** Options for the generate operation. */
|
|
422
|
+
interface GenerateOptions {
|
|
423
|
+
/** Hard cap on output tokens. Required. */
|
|
424
|
+
maxNewTokens: number;
|
|
425
|
+
/** Sampling temperature. */
|
|
426
|
+
temperature?: number;
|
|
427
|
+
/** Nucleus sampling cutoff. */
|
|
428
|
+
topP?: number;
|
|
429
|
+
/** Optional list of stop strings. */
|
|
430
|
+
stop?: string[];
|
|
431
|
+
/** GPU type / pool spec, e.g. ``"l4"`` or ``"eval-bench/l4"``. */
|
|
432
|
+
gpu?: string;
|
|
433
|
+
/** Auto-retry under provisioning. */
|
|
434
|
+
waitForCapacity?: boolean;
|
|
435
|
+
}
|
|
436
|
+
/** Aggregated generation result. */
|
|
437
|
+
interface GenerateResult {
|
|
438
|
+
/** Model id the gateway dispatched to. */
|
|
439
|
+
model: string;
|
|
440
|
+
/** Full generated text (concatenation of all streamed deltas). */
|
|
441
|
+
text: string;
|
|
442
|
+
/** Termination reason. */
|
|
443
|
+
finishReason: FinishReason;
|
|
444
|
+
/** Prompt / completion / total token counts. */
|
|
445
|
+
usage: GenerationUsage;
|
|
446
|
+
/** Worker-generated attempt id. */
|
|
447
|
+
attemptId?: string;
|
|
448
|
+
/** Time-to-first-token in milliseconds. */
|
|
449
|
+
ttftMs?: number;
|
|
450
|
+
/** Average time per output token in milliseconds. */
|
|
451
|
+
tpotMs?: number;
|
|
452
|
+
}
|
|
453
|
+
/**
|
|
454
|
+
* A single message in a chat completion request.
|
|
455
|
+
*
|
|
456
|
+
* Accepted roles: `system`, `user`, `assistant`, `tool`, `developer`. The
|
|
457
|
+
* gateway normalises `developer` → `system` before forwarding to the worker
|
|
458
|
+
* (the OpenAI 2024-08 rename — most chat templates only have `system`).
|
|
459
|
+
*
|
|
460
|
+
* `content` may be a string OR an array of typed content parts. The gateway
|
|
461
|
+
* concatenates `text` / `input_text` parts; `image_url` / `input_image` parts
|
|
462
|
+
* are rejected with `400 unsupported_field` because no vision-capable
|
|
463
|
+
* generation model is configured today (the contract is forward-ready). See
|
|
464
|
+
* `packages/sie_gateway/src/openapi.rs` and `proxy.rs::chat_params_from_json`
|
|
465
|
+
* for the canonical accepted subset.
|
|
466
|
+
*/
|
|
467
|
+
interface ChatMessage {
|
|
468
|
+
role: "system" | "user" | "assistant" | "tool" | "developer";
|
|
469
|
+
content: string | ChatContentPart[] | null;
|
|
470
|
+
name?: string;
|
|
471
|
+
/** Required when `role === "tool"`. */
|
|
472
|
+
tool_call_id?: string;
|
|
473
|
+
/** Populated by the model when calling tools (assistant turns only). */
|
|
474
|
+
tool_calls?: ToolCall[];
|
|
475
|
+
}
|
|
476
|
+
/**
|
|
477
|
+
* One content part inside a multimodal `messages[*].content` array. Only the
|
|
478
|
+
* text variants are accepted today; image parts are intentionally omitted so callers can
|
|
479
|
+
* see the rejection at the type layer instead of at runtime.
|
|
480
|
+
*/
|
|
481
|
+
type ChatContentPart = {
|
|
482
|
+
type: "text";
|
|
483
|
+
text: string;
|
|
484
|
+
} | {
|
|
485
|
+
type: "input_text";
|
|
486
|
+
text: string;
|
|
487
|
+
};
|
|
488
|
+
/** A tool call emitted by the model. */
|
|
489
|
+
interface ToolCall {
|
|
490
|
+
id: string;
|
|
491
|
+
type: "function";
|
|
492
|
+
function: {
|
|
493
|
+
name: string;
|
|
494
|
+
arguments: string;
|
|
495
|
+
};
|
|
496
|
+
}
|
|
497
|
+
/** A tool the model is allowed to call. */
|
|
498
|
+
interface ToolSpec {
|
|
499
|
+
type: "function";
|
|
500
|
+
function: {
|
|
501
|
+
name: string;
|
|
502
|
+
description?: string;
|
|
503
|
+
/** JSON Schema describing the function arguments. */
|
|
504
|
+
parameters?: Record<string, unknown>;
|
|
505
|
+
};
|
|
506
|
+
}
|
|
507
|
+
/** Tool-routing directive. */
|
|
508
|
+
type ToolChoice = "auto" | "none" | "required" | {
|
|
509
|
+
type: "function";
|
|
510
|
+
function: {
|
|
511
|
+
name: string;
|
|
512
|
+
};
|
|
513
|
+
};
|
|
514
|
+
/** Structured-output `response_format` envelope. */
|
|
515
|
+
interface ResponseFormat {
|
|
516
|
+
type: "json_schema" | "json_object" | "text";
|
|
517
|
+
/** JSON Schema body when `type === "json_schema"`. */
|
|
518
|
+
json_schema?: unknown;
|
|
519
|
+
}
|
|
520
|
+
/** OpenAI-compatible chat-completion finish reason. */
|
|
521
|
+
type ChatFinishReason = "stop" | "length" | "tool_calls" | "content_filter" | null;
|
|
522
|
+
/**
|
|
523
|
+
* Request body for `chatCompletions` / `streamChatCompletions`.
|
|
524
|
+
*
|
|
525
|
+
* Field names are snake_case (the wire shape) so the SDK can hand the object
|
|
526
|
+
* to `JSON.stringify` without further translation. SIE-specific routing
|
|
527
|
+
* fields (`routing_key`, `prompt_cache_key`) match the gateway schema in
|
|
528
|
+
* `packages/sie_gateway/src/openapi.rs`.
|
|
529
|
+
*
|
|
530
|
+
* The gateway honours: `model`, `messages`, `max_tokens` /
|
|
531
|
+
* `max_completion_tokens`, `temperature`, `top_p`, `top_k`, `stop`, `stream`,
|
|
532
|
+
* `stream_options`, `tools`, `tool_choice`, `parallel_tool_calls`,
|
|
533
|
+
* `response_format`, `frequency_penalty`, `presence_penalty` (each in
|
|
534
|
+
* `[-2, 2]`), `repetition_penalty`, `n`, `best_of`, `logprobs`,
|
|
535
|
+
* `top_logprobs`, `logit_bias`, `seed`, `user`, `safety_identifier`,
|
|
536
|
+
* `lora_adapter`, `routing_key`, and `prompt_cache_key`. Unknown fields
|
|
537
|
+
* are rejected with `400 unsupported_field`.
|
|
538
|
+
*/
|
|
539
|
+
interface ChatCompletionRequest {
|
|
540
|
+
model: string;
|
|
541
|
+
messages: ChatMessage[];
|
|
542
|
+
/** Legacy alias; the gateway prefers `max_completion_tokens` when both set. */
|
|
543
|
+
max_tokens?: number;
|
|
544
|
+
max_completion_tokens?: number;
|
|
545
|
+
temperature?: number;
|
|
546
|
+
top_p?: number;
|
|
547
|
+
/**
|
|
548
|
+
* Non-OpenAI sampling knob (vLLM / SGLang). Integer `>= 1`; absent →
|
|
549
|
+
* sampler default (top-k disabled).
|
|
550
|
+
*/
|
|
551
|
+
top_k?: number;
|
|
552
|
+
/**
|
|
553
|
+
* Non-OpenAI repetition penalty (SGLang). Float in `(0.0, 2.0]`; `1.0`
|
|
554
|
+
* means no penalty. Absent → sampler default.
|
|
555
|
+
*/
|
|
556
|
+
repetition_penalty?: number;
|
|
557
|
+
/** Single stop string or list of stop strings. */
|
|
558
|
+
stop?: string | string[];
|
|
559
|
+
/** Streaming flag. `streamChatCompletions` forces this to `true`; `chatCompletions` throws `RequestError` when it is `true`. */
|
|
560
|
+
stream?: boolean;
|
|
561
|
+
/** Streaming-only: ask the server to emit a final usage-only chunk before `[DONE]`. */
|
|
562
|
+
stream_options?: {
|
|
563
|
+
include_usage?: boolean;
|
|
564
|
+
};
|
|
565
|
+
tools?: ToolSpec[];
|
|
566
|
+
tool_choice?: ToolChoice;
|
|
567
|
+
/** OpenAI parallel-tool-calls toggle (default `true`). */
|
|
568
|
+
parallel_tool_calls?: boolean;
|
|
569
|
+
response_format?: ResponseFormat;
|
|
570
|
+
/** Accepted in the OpenAI range [-2, 2]; out-of-range values are rejected. */
|
|
571
|
+
frequency_penalty?: number;
|
|
572
|
+
presence_penalty?: number;
|
|
573
|
+
/**
|
|
574
|
+
* Multi-candidate count. Default `1`. `n > 1 && stream === true` is
|
|
575
|
+
* rejected by the gateway with 400.
|
|
576
|
+
*/
|
|
577
|
+
n?: number;
|
|
578
|
+
/**
|
|
579
|
+
* Generate this many candidates and return the top `n` by cumulative
|
|
580
|
+
* logprob. Range `[1, 128]`; requires `best_of >= n` and `stream: false`.
|
|
581
|
+
*/
|
|
582
|
+
best_of?: number;
|
|
583
|
+
/**
|
|
584
|
+
* `true` requests per-token log-probabilities on each chunk / on the
|
|
585
|
+
* aggregate response. Required when `top_logprobs > 0`.
|
|
586
|
+
*/
|
|
587
|
+
logprobs?: boolean;
|
|
588
|
+
/**
|
|
589
|
+
* How many alternate-token logprobs to return per position. Range
|
|
590
|
+
* `[0, 20]` per the OpenAI spec; implies `logprobs: true` when `> 0`.
|
|
591
|
+
*/
|
|
592
|
+
top_logprobs?: number;
|
|
593
|
+
/**
|
|
594
|
+
* `{token_id: bias_float}` map. Gateway validates per-value range
|
|
595
|
+
* `[-100, 100]` and caps map size.
|
|
596
|
+
*/
|
|
597
|
+
logit_bias?: Record<string, number>;
|
|
598
|
+
seed?: number;
|
|
599
|
+
/**
|
|
600
|
+
* OpenAI's free-text end-user identifier. Accepted and logged at debug
|
|
601
|
+
* level by the gateway.
|
|
602
|
+
*/
|
|
603
|
+
user?: string;
|
|
604
|
+
/**
|
|
605
|
+
* OpenAI's free-text safety-tier identifier (replacement for `user` on
|
|
606
|
+
* safety-sensitive accounts). Accepted but intentionally not logged.
|
|
607
|
+
*/
|
|
608
|
+
safety_identifier?: string;
|
|
609
|
+
/**
|
|
610
|
+
* Multi-LoRA: served-name of the adapter to apply on the worker (SIE
|
|
611
|
+
* extension). Must be a non-empty string; unknown names are rejected by
|
|
612
|
+
* the gateway with 400 `unknown_lora`.
|
|
613
|
+
*/
|
|
614
|
+
lora_adapter?: string;
|
|
615
|
+
/** SIE-native routing affinity hint. */
|
|
616
|
+
routing_key?: string;
|
|
617
|
+
/** SIE-native prompt-cache hint. */
|
|
618
|
+
prompt_cache_key?: string;
|
|
619
|
+
}
|
|
620
|
+
/** Token usage block (snake_case, matches the wire shape). */
|
|
621
|
+
interface ChatUsage {
|
|
622
|
+
prompt_tokens: number;
|
|
623
|
+
completion_tokens: number;
|
|
624
|
+
total_tokens: number;
|
|
625
|
+
}
|
|
626
|
+
/** A single choice in a `ChatCompletion` (non-streaming). */
|
|
627
|
+
interface ChatChoice {
|
|
628
|
+
index: number;
|
|
629
|
+
message: ChatMessage;
|
|
630
|
+
finish_reason: ChatFinishReason;
|
|
631
|
+
logprobs: null;
|
|
632
|
+
}
|
|
633
|
+
/** Non-streaming response from `chatCompletions`. */
|
|
634
|
+
interface ChatCompletion {
|
|
635
|
+
id: string;
|
|
636
|
+
object: "chat.completion";
|
|
637
|
+
created: number;
|
|
638
|
+
model: string;
|
|
639
|
+
system_fingerprint: string | null;
|
|
640
|
+
choices: ChatChoice[];
|
|
641
|
+
usage: ChatUsage;
|
|
642
|
+
}
|
|
643
|
+
/** Incremental delta emitted on each streaming chunk. */
|
|
644
|
+
interface ChatDelta {
|
|
645
|
+
/** First chunk only, per the OpenAI streaming contract. */
|
|
646
|
+
role?: "assistant";
|
|
647
|
+
content?: string;
|
|
648
|
+
tool_calls?: ToolCallDelta[];
|
|
649
|
+
}
|
|
650
|
+
/** Partial tool-call materialised across multiple streaming chunks. */
|
|
651
|
+
interface ToolCallDelta {
|
|
652
|
+
index: number;
|
|
653
|
+
id?: string;
|
|
654
|
+
type?: "function";
|
|
655
|
+
function?: {
|
|
656
|
+
name?: string;
|
|
657
|
+
arguments?: string;
|
|
658
|
+
};
|
|
659
|
+
}
|
|
660
|
+
/** A single choice in a streaming `ChatCompletionChunk`. */
|
|
661
|
+
interface ChatChunkChoice {
|
|
662
|
+
index: number;
|
|
663
|
+
delta: ChatDelta;
|
|
664
|
+
finish_reason: ChatFinishReason;
|
|
665
|
+
logprobs: null;
|
|
666
|
+
}
|
|
667
|
+
/**
|
|
668
|
+
* One SSE event from `streamChatCompletions`.
|
|
669
|
+
*
|
|
670
|
+
* The terminal-usage chunk (emitted when `stream_options.include_usage` is
|
|
671
|
+
* `true`) sets `choices: []` and populates `usage`.
|
|
672
|
+
*/
|
|
673
|
+
interface ChatCompletionChunk {
|
|
674
|
+
id: string;
|
|
675
|
+
object: "chat.completion.chunk";
|
|
676
|
+
created: number;
|
|
677
|
+
model: string;
|
|
678
|
+
system_fingerprint: string | null;
|
|
679
|
+
choices: ChatChunkChoice[];
|
|
680
|
+
usage?: ChatUsage;
|
|
681
|
+
}
|
|
682
|
+
/**
|
|
683
|
+
* Per-call options for `chatCompletions` controlling the pre-execution
|
|
684
|
+
* provisioning / retry loop. The request body itself is the separate
|
|
685
|
+
* {@link ChatCompletionRequest} argument; these knobs only govern HOW the
|
|
686
|
+
* SDK talks to the gateway, not WHAT it asks for.
|
|
687
|
+
*
|
|
688
|
+
* All fields are optional and fall back to the client-level defaults
|
|
689
|
+
* (`waitForCapacity`, `provisionTimeout`) when omitted.
|
|
690
|
+
*/
|
|
691
|
+
interface ChatCompletionOptions {
|
|
692
|
+
/**
|
|
693
|
+
* When `true`, retry the SAFE pre-execution capacity signals
|
|
694
|
+
* (`202 Accepted`, `503 MODEL_LOADING`, generic `503`) until
|
|
695
|
+
* `provisionTimeoutMs` elapses. When `false`, the first such signal
|
|
696
|
+
* throws (`ProvisioningError` / `ModelLoadingError` / `ServerError`).
|
|
697
|
+
* Defaults to the client's `waitForCapacity` (false unless the
|
|
698
|
+
* constructor opted in).
|
|
699
|
+
*/
|
|
700
|
+
waitForCapacity?: boolean;
|
|
701
|
+
/**
|
|
702
|
+
* Total cumulative wall-clock budget (ms) for provisioning retries.
|
|
703
|
+
* Independent of the per-attempt `timeout`. Defaults to the client's
|
|
704
|
+
* `provisionTimeout` (typically 5 minutes).
|
|
705
|
+
*/
|
|
706
|
+
provisionTimeoutMs?: number;
|
|
707
|
+
}
|
|
708
|
+
/**
|
|
709
|
+
* One SSE event from `streamGenerate`.
|
|
710
|
+
*
|
|
711
|
+
* SIE-native shape — see `packages/sie_gateway/src/handlers/sse.rs`
|
|
712
|
+
* (`build_generate_chunk_event`). `usage` and `ttft_ms` only land on the
|
|
713
|
+
* terminal chunk; `error` is populated when generation failed mid-stream
|
|
714
|
+
* (handled by throwing `SIEStreamError`, never yielded).
|
|
715
|
+
*/
|
|
716
|
+
interface GenerateChunk {
|
|
717
|
+
request_id: string;
|
|
718
|
+
seq: number;
|
|
719
|
+
text_delta: string;
|
|
720
|
+
done: boolean;
|
|
721
|
+
finish_reason?: "stop" | "length" | "cancelled" | "error";
|
|
722
|
+
usage?: ChatUsage;
|
|
723
|
+
/** Time-to-first-token, milliseconds. Terminal chunk only. */
|
|
724
|
+
ttft_ms?: number;
|
|
725
|
+
/** Populated when the worker / gateway errored mid-stream. */
|
|
726
|
+
error?: {
|
|
727
|
+
code: string;
|
|
728
|
+
message: string;
|
|
729
|
+
};
|
|
730
|
+
}
|
|
408
731
|
/**
|
|
409
732
|
* Options for extract operation.
|
|
410
733
|
*/
|
|
@@ -579,6 +902,177 @@ declare class SIEClient {
|
|
|
579
902
|
* console.log(result.scores[0].itemId); // most relevant
|
|
580
903
|
* ```
|
|
581
904
|
*/
|
|
905
|
+
/**
|
|
906
|
+
* Generate text from a prompt (walking-skeleton SDK surface).
|
|
907
|
+
*
|
|
908
|
+
* This method does not expose streaming chunks (use {@link streamGenerate} for that). The worker streams
|
|
909
|
+
* to the gateway, the gateway aggregates, and the SDK returns the
|
|
910
|
+
* assembled result plus SIE-native timing metadata (TTFT, TPOT,
|
|
911
|
+
* attempt id).
|
|
912
|
+
*
|
|
913
|
+
* @example
|
|
914
|
+
* ```typescript
|
|
915
|
+
* const result = await client.generate(
|
|
916
|
+
* "Qwen__Qwen3-4B-Instruct-2507",
|
|
917
|
+
* "Write a haiku about the sea.",
|
|
918
|
+
* { maxNewTokens: 64, temperature: 0.7 },
|
|
919
|
+
* );
|
|
920
|
+
* console.log(result.text);
|
|
921
|
+
* console.log(`TTFT: ${result.ttftMs}ms`);
|
|
922
|
+
* ```
|
|
923
|
+
*/
|
|
924
|
+
generate(model: string, prompt: string, options: GenerateOptions): Promise<GenerateResult>;
|
|
925
|
+
/**
|
|
926
|
+
* Per-attempt JSON POST used by the non-streaming surfaces
|
|
927
|
+
* ({@link generate}, {@link chatCompletions}) inside the
|
|
928
|
+
* {@link withProvisioningRetry} loop.
|
|
929
|
+
*
|
|
930
|
+
* Translates low-level transport failures into typed errors that the
|
|
931
|
+
* retry loop will surface verbatim:
|
|
932
|
+
* - `AbortError` → `SIEConnectionError` (per-attempt timeout)
|
|
933
|
+
* - `TypeError` → `SIEConnectionError` (NOT retried — generation is
|
|
934
|
+
* non-idempotent, so a mid-flight drop must surface instead of
|
|
935
|
+
* silently re-issuing a billable generation)
|
|
936
|
+
*
|
|
937
|
+
* Each call uses a fresh `AbortController` so concurrent retries don't
|
|
938
|
+
* share state, and the per-attempt timeout is bounded by `this.timeout`
|
|
939
|
+
* (NOT the cumulative provisioning budget).
|
|
940
|
+
*/
|
|
941
|
+
private performJsonPost;
|
|
942
|
+
/**
|
|
943
|
+
* Non-streaming chat-completion call against `/v1/chat/completions`.
|
|
944
|
+
*
|
|
945
|
+
* This is the OpenAI-compatible surface. The request body is forwarded
|
|
946
|
+
* verbatim as JSON, so any field documented at
|
|
947
|
+
* <https://platform.openai.com/docs/api-reference/chat/create> can be set;
|
|
948
|
+
* the gateway will reject fields it does not yet support with
|
|
949
|
+
* `400 unsupported_field`. SIE-native routing hints (`routing_key`,
|
|
950
|
+
* `prompt_cache_key`) are part of the same request shape.
|
|
951
|
+
*
|
|
952
|
+
* Error semantics mirror `generate()`: 4xx → `RequestError`, 5xx →
|
|
953
|
+
* `ServerError` (or the more specific `ModelLoadFailedError` for 502
|
|
954
|
+
* `MODEL_LOAD_FAILED`), connection / timeout failures →
|
|
955
|
+
* `SIEConnectionError`.
|
|
956
|
+
*
|
|
957
|
+
* If `req.stream === true`, this method throws `RequestError` immediately —
|
|
958
|
+
* use {@link streamChatCompletions} instead. We do not auto-route because
|
|
959
|
+
* the return type is fundamentally different (`Promise` vs
|
|
960
|
+
* `AsyncGenerator`) and silently flipping would mis-type the call site.
|
|
961
|
+
*
|
|
962
|
+
* @example
|
|
963
|
+
* ```typescript
|
|
964
|
+
* const reply = await client.chatCompletions({
|
|
965
|
+
* model: "Qwen/Qwen3-4B-Instruct-2507",
|
|
966
|
+
* messages: [{ role: "user", content: "Write a haiku about the sea." }],
|
|
967
|
+
* max_completion_tokens: 64,
|
|
968
|
+
* });
|
|
969
|
+
* console.log(reply.choices[0]?.message.content);
|
|
970
|
+
* ```
|
|
971
|
+
*/
|
|
972
|
+
chatCompletions(req: ChatCompletionRequest, options?: ChatCompletionOptions): Promise<ChatCompletion>;
|
|
973
|
+
/**
|
|
974
|
+
* Streaming chat-completion call against `/v1/chat/completions` with
|
|
975
|
+
* `Accept: text/event-stream`.
|
|
976
|
+
*
|
|
977
|
+
* Yields `ChatCompletionChunk` events in the order the gateway emits them.
|
|
978
|
+
* The terminal chunk carries `finish_reason`; if
|
|
979
|
+
* `req.stream_options.include_usage === true`, a final usage-only chunk
|
|
980
|
+
* (`choices: []`, populated `usage`) follows it. The generator completes
|
|
981
|
+
* cleanly on the `data: [DONE]` sentinel.
|
|
982
|
+
*
|
|
983
|
+
* Error semantics:
|
|
984
|
+
*
|
|
985
|
+
* - HTTP 4xx / 5xx **before** the stream opens → throws `RequestError` /
|
|
986
|
+
* `ServerError` (same as {@link chatCompletions}).
|
|
987
|
+
* - A chunk containing `error: { ... }` mid-stream → throws
|
|
988
|
+
* {@link SIEStreamError}. The error chunk is consumed, never yielded.
|
|
989
|
+
* - `signal.abort()` mid-stream → the generator throws
|
|
990
|
+
* `SIEConnectionError` and releases the underlying reader, which
|
|
991
|
+
* fires `StreamCancelGuard` on the gateway side.
|
|
992
|
+
*
|
|
993
|
+
* `req.stream` is set to `true` automatically; any existing value is
|
|
994
|
+
* overwritten. We do not validate `req.stream === false` because the
|
|
995
|
+
* call-site intent is unambiguous.
|
|
996
|
+
*
|
|
997
|
+
* @param req The chat-completion request. See {@link ChatCompletionRequest}.
|
|
998
|
+
* @param signal Optional `AbortSignal` for cooperative cancellation.
|
|
999
|
+
*
|
|
1000
|
+
* @example
|
|
1001
|
+
* ```typescript
|
|
1002
|
+
* const controller = new AbortController();
|
|
1003
|
+
* try {
|
|
1004
|
+
* for await (const chunk of client.streamChatCompletions(
|
|
1005
|
+
* {
|
|
1006
|
+
* model: "Qwen/Qwen3-4B-Instruct-2507",
|
|
1007
|
+
* messages: [{ role: "user", content: "Count to ten." }],
|
|
1008
|
+
* stream_options: { include_usage: true },
|
|
1009
|
+
* },
|
|
1010
|
+
* controller.signal,
|
|
1011
|
+
* )) {
|
|
1012
|
+
* process.stdout.write(chunk.choices[0]?.delta.content ?? "");
|
|
1013
|
+
* }
|
|
1014
|
+
* } catch (err) {
|
|
1015
|
+
* if (err instanceof SIEStreamError) {
|
|
1016
|
+
* console.error(`mid-stream error: ${err.code} — ${err.message}`);
|
|
1017
|
+
* } else throw err;
|
|
1018
|
+
* }
|
|
1019
|
+
* ```
|
|
1020
|
+
*/
|
|
1021
|
+
streamChatCompletions(req: ChatCompletionRequest, signal?: AbortSignal): AsyncGenerator<ChatCompletionChunk, void, undefined>;
|
|
1022
|
+
/**
|
|
1023
|
+
* Streaming companion to {@link generate} — opens an SSE connection to
|
|
1024
|
+
* `/v1/generate/{model}` with `stream: true` and yields the SIE-native
|
|
1025
|
+
* chunk shape documented in
|
|
1026
|
+
* `packages/sie_gateway/src/handlers/sse.rs::build_generate_chunk_event`.
|
|
1027
|
+
*
|
|
1028
|
+
* The first delta carries `seq: 0` and `text_delta` populated; the
|
|
1029
|
+
* terminal chunk has `done: true`, `finish_reason`, and (typically)
|
|
1030
|
+
* `usage` + `ttft_ms`. The generator completes on the `data: [DONE]`
|
|
1031
|
+
* sentinel.
|
|
1032
|
+
*
|
|
1033
|
+
* Error semantics match {@link streamChatCompletions}: pre-stream HTTP
|
|
1034
|
+
* errors throw normally, mid-stream `error` chunks throw
|
|
1035
|
+
* {@link SIEStreamError}.
|
|
1036
|
+
*
|
|
1037
|
+
* @example
|
|
1038
|
+
* ```typescript
|
|
1039
|
+
* for await (const chunk of client.streamGenerate(
|
|
1040
|
+
* "Qwen/Qwen3-4B-Instruct-2507",
|
|
1041
|
+
* "Write a haiku.",
|
|
1042
|
+
* { maxNewTokens: 64, temperature: 0.7 },
|
|
1043
|
+
* )) {
|
|
1044
|
+
* process.stdout.write(chunk.text_delta);
|
|
1045
|
+
* if (chunk.done) console.log(`\nTTFT: ${chunk.ttft_ms}ms`);
|
|
1046
|
+
* }
|
|
1047
|
+
* ```
|
|
1048
|
+
*/
|
|
1049
|
+
streamGenerate(model: string, prompt: string, options: GenerateOptions, signal?: AbortSignal): AsyncGenerator<GenerateChunk, void, undefined>;
|
|
1050
|
+
/**
|
|
1051
|
+
* Shared SSE consumption helper for the streaming methods.
|
|
1052
|
+
*
|
|
1053
|
+
* Performs a pre-stream provisioning retry loop (honoring
|
|
1054
|
+
* `waitForCapacity`/`provisionTimeout`), surfaces pre-stream errors via
|
|
1055
|
+
* {@link handleError} (so callers see the same `RequestError` /
|
|
1056
|
+
* `ServerError` hierarchy as the non-streaming endpoints), then iterates
|
|
1057
|
+
* the SSE payloads via {@link parseSseStream}. Each payload is JSON-parsed;
|
|
1058
|
+
* if the consumer-supplied `extractError` returns an `SIEStreamError`, the
|
|
1059
|
+
* generator throws it instead of yielding the chunk.
|
|
1060
|
+
*
|
|
1061
|
+
* Retry policy mirrors {@link generate}: only the SAFE pre-execution
|
|
1062
|
+
* capacity signals — `202` (provisioning) and `503 MODEL_LOADING` — are
|
|
1063
|
+
* retried, and only while `waitForCapacity` is set and the provision
|
|
1064
|
+
* budget remains. Once the body opens we never retry (the call is
|
|
1065
|
+
* non-idempotent; a mid-stream failure must not re-issue generation).
|
|
1066
|
+
*
|
|
1067
|
+
* @internal
|
|
1068
|
+
*/
|
|
1069
|
+
private consumeSseStream;
|
|
1070
|
+
/**
|
|
1071
|
+
* Build the standard JSON header set for the chat-completions surface.
|
|
1072
|
+
* Pulled out so both the streaming and non-streaming paths agree on
|
|
1073
|
+
* auth / version / content-type wiring.
|
|
1074
|
+
*/
|
|
1075
|
+
private buildChatHeaders;
|
|
582
1076
|
score(model: string, query: Item, items: Item[], options?: ScoreOptions): Promise<ScoreResult>;
|
|
583
1077
|
/**
|
|
584
1078
|
* Extract entities from a single item.
|
|
@@ -607,17 +1101,18 @@ declare class SIEClient {
|
|
|
607
1101
|
*/
|
|
608
1102
|
close(): Promise<void>;
|
|
609
1103
|
/**
|
|
610
|
-
* Create a resource pool for isolated capacity.
|
|
1104
|
+
* Create or update a resource pool for isolated capacity.
|
|
611
1105
|
*
|
|
612
1106
|
* Pools provide dedicated worker capacity, isolated from other clients.
|
|
613
1107
|
* Workers are assigned to pools and only serve requests from that pool.
|
|
614
1108
|
*
|
|
615
1109
|
* @param name - Pool name (used in GPU param as "poolName/machineProfile")
|
|
616
|
-
* @param gpus -
|
|
1110
|
+
* @param gpus - Optional machine profile requirements for pool readiness, e.g., { "l4": 2, "l4-spot": 1 }
|
|
1111
|
+
* @param gpuCaps - Optional maximum assigned workers per machine profile
|
|
617
1112
|
*
|
|
618
1113
|
* @example
|
|
619
1114
|
* ```typescript
|
|
620
|
-
* // Create a pool with 2 L4 GPUs
|
|
1115
|
+
* // Create or update a pool with 2 L4 GPUs
|
|
621
1116
|
* await client.createPool("eval-bench", { l4: 2 });
|
|
622
1117
|
*
|
|
623
1118
|
* // Use the pool for requests
|
|
@@ -627,7 +1122,7 @@ declare class SIEClient {
|
|
|
627
1122
|
* await client.deletePool("eval-bench");
|
|
628
1123
|
* ```
|
|
629
1124
|
*/
|
|
630
|
-
createPool(name: string, gpus
|
|
1125
|
+
createPool(name: string, gpus?: Record<string, number>, gpuCaps?: Record<string, number>): Promise<void>;
|
|
631
1126
|
/**
|
|
632
1127
|
* Get information about a pool.
|
|
633
1128
|
*
|
|
@@ -740,7 +1235,7 @@ declare class SIEClient {
|
|
|
740
1235
|
private detectEndpointType;
|
|
741
1236
|
}
|
|
742
1237
|
|
|
743
|
-
declare const SDK_VERSION = "0.3.4";
|
|
1238
|
+
declare const SDK_VERSION = "0.4.1";
|
|
744
1239
|
|
|
745
1240
|
/**
|
|
746
1241
|
* Helpers for converting SIE encode results to plain JavaScript types.
|
|
@@ -956,6 +1451,33 @@ declare class ModelLoadingError extends SIEError {
|
|
|
956
1451
|
readonly model: string | undefined;
|
|
957
1452
|
constructor(message: string, model?: string);
|
|
958
1453
|
}
|
|
1454
|
+
/**
|
|
1455
|
+
* Error surfaced mid-stream from `streamChatCompletions` / `streamGenerate`.
|
|
1456
|
+
*
|
|
1457
|
+
* The SSE wire shape includes optional `error: {message, type, param, code}`
|
|
1458
|
+
* (chat) or `error: {code, message}` (SIE-native generate) on the terminal
|
|
1459
|
+
* chunk. When the SDK sees such a chunk it does NOT yield the chunk; instead
|
|
1460
|
+
* it throws `SIEStreamError`, mirroring the non-streaming `handleError` path
|
|
1461
|
+
* so callers can catch the same way they would for HTTP-level failures.
|
|
1462
|
+
*
|
|
1463
|
+
* Compare with `RequestError` / `ServerError`: those fire before the SSE
|
|
1464
|
+
* stream opens (HTTP 4xx / 5xx). `SIEStreamError` fires after at least one
|
|
1465
|
+
* byte has gone out — the connection itself was healthy, but the worker /
|
|
1466
|
+
* gateway emitted an error envelope partway through generation.
|
|
1467
|
+
*/
|
|
1468
|
+
declare class SIEStreamError extends SIEError {
|
|
1469
|
+
/** SIE-native error code (e.g. `context_exceeded`, `cancelled`). */
|
|
1470
|
+
readonly code: string | undefined;
|
|
1471
|
+
/** OpenAI-style error type (e.g. `context_length_exceeded`, `server_error`). */
|
|
1472
|
+
readonly errorType: string | undefined;
|
|
1473
|
+
/** Offending field name when known (chat shape only). */
|
|
1474
|
+
readonly param: string | null | undefined;
|
|
1475
|
+
constructor(message: string, options?: {
|
|
1476
|
+
code?: string;
|
|
1477
|
+
errorType?: string;
|
|
1478
|
+
param?: string | null;
|
|
1479
|
+
});
|
|
1480
|
+
}
|
|
959
1481
|
/**
|
|
960
1482
|
* Error when the server reports a *terminal* model-load failure.
|
|
961
1483
|
*
|
|
@@ -1114,4 +1636,4 @@ declare function toImageWireFormat(input: ImageInput, format?: "jpeg" | "png" |
|
|
|
1114
1636
|
*/
|
|
1115
1637
|
declare function detectImageFormat(bytes: Uint8Array): "jpeg" | "png" | "webp" | "unknown";
|
|
1116
1638
|
|
|
1117
|
-
export { type CapacityInfo, type Classification, type ClusterStatusMessage, type ClusterSummary, type ClusterWorkerInfo, type DType, type DetectedObject, type EncodeOptions, type EncodeResult, type Entity, type ExtractOptions, type ExtractResult, type GPUMetrics, type ImageInput, type ImageWireFormat, InputTooLongError, type Item, LoraLoadingError, type ModelConfig, type ModelDims, type ModelInfo, ModelLoadFailedError, ModelLoadingError, type ModelState, type ModelStatus, type ModelSummary, type OutputType, PoolError, type PoolInfo, type PoolSpec, type PoolStatus, ProvisioningError, type Relation, RequestError, SDK_VERSION, SIEClient, type SIEClientOptions, SIEConnectionError, SIEError, type ScoreEntry, type ScoreOptions, type ScoreResult, ServerError, type ServerInfo, type SparseResult, type SparseVector, type StatusMessage, type TimingInfo, type WorkerInfo, type WorkerStatusMessage, denseEmbedding, detectImageFormat, multivectorEmbedding, normalizeSparseVector, packMessage, sparseEmbedding, sparseEmbeddingMap, toFloat32Array, toImageBytes, toImageWireFormat, toNumberArray, unpackMessage };
|
|
1639
|
+
export { type CapacityInfo, type ChatChoice, type ChatChunkChoice, type ChatCompletion, type ChatCompletionChunk, type ChatCompletionRequest, type ChatDelta, type ChatFinishReason, type ChatMessage, type ChatUsage, type Classification, type ClusterStatusMessage, type ClusterSummary, type ClusterWorkerInfo, type DType, type DetectedObject, type EncodeOptions, type EncodeResult, type Entity, type ExtractOptions, type ExtractResult, type FinishReason, type GPUMetrics, type GenerateChunk, type GenerateOptions, type GenerateResult, type GenerationUsage, type ImageInput, type ImageWireFormat, InputTooLongError, type Item, LoraLoadingError, type ModelConfig, type ModelDims, type ModelInfo, ModelLoadFailedError, ModelLoadingError, type ModelState, type ModelStatus, type ModelSummary, type OutputType, PoolError, type PoolInfo, type PoolSpec, type PoolStatus, ProvisioningError, type Relation, RequestError, type ResponseFormat, SDK_VERSION, SIEClient, type SIEClientOptions, SIEConnectionError, SIEError, SIEStreamError, type ScoreEntry, type ScoreOptions, type ScoreResult, ServerError, type ServerInfo, type SparseResult, type SparseVector, type StatusMessage, type TimingInfo, type ToolCall, type ToolCallDelta, type ToolChoice, type ToolSpec, type WorkerInfo, type WorkerStatusMessage, denseEmbedding, detectImageFormat, multivectorEmbedding, normalizeSparseVector, packMessage, sparseEmbedding, sparseEmbeddingMap, toFloat32Array, toImageBytes, toImageWireFormat, toNumberArray, unpackMessage };
|