@superlinked/sie-sdk 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -87,6 +87,21 @@ var ModelLoadingError = class extends SIEError {
87
87
  this.model = model;
88
88
  }
89
89
  };
90
/**
 * Error type for failures that surface while a stream is being consumed.
 * All metadata is optional; anything not supplied is left `undefined`.
 */
var SIEStreamError = class extends SIEError {
  /** SIE-native error code, e.g. `context_exceeded` or `cancelled`. */
  code;
  /** OpenAI-style error type, e.g. `context_length_exceeded` or `server_error`. */
  errorType;
  /** Name of the offending field when it is known (chat shape only). */
  param;
  /**
   * @param message - Human-readable description of the failure.
   * @param options - Optional `{ code, errorType, param }` metadata bag.
   */
  constructor(message, options) {
    super(message);
    const { code, errorType, param } = options ?? {};
    this.name = "SIEStreamError";
    this.code = code;
    this.errorType = errorType;
    this.param = param;
  }
};
90
105
  var ModelLoadFailedError = class extends ServerError {
91
106
  /** The model that was requested */
92
107
  model;
@@ -109,6 +124,15 @@ var ModelLoadFailedError = class extends ServerError {
109
124
  this.attempts = options?.attempts ?? 1;
110
125
  }
111
126
  };
127
/**
 * Request error for input the server rejected as too long.
 * Always carries the code `INPUT_TOO_LONG` and HTTP status 400.
 */
var InputTooLongError = class extends RequestError {
  /** The model that was requested (undefined when the caller did not pass one). */
  model;
  /**
   * @param message - Human-readable description of the failure.
   * @param options - Optional `{ model }` metadata bag.
   */
  constructor(message, options) {
    super(message, "INPUT_TOO_LONG", 400);
    const { model } = options ?? {};
    this.name = "InputTooLongError";
    this.model = model;
  }
};
112
136
 
113
137
  // src/internal/constants.ts
114
138
  var MSGPACK_CONTENT_TYPE = "application/msgpack";
@@ -317,7 +341,7 @@ function unpackMessage(data) {
317
341
  function getRetryAfter(header) {
318
342
  if (!header) return void 0;
319
343
  const seconds = Number.parseInt(header, 10);
320
- if (!Number.isNaN(seconds) && seconds > 0) {
344
+ if (!Number.isNaN(seconds) && seconds >= 0) {
321
345
  return seconds * 1e3;
322
346
  }
323
347
  const date = new Date(header);
@@ -380,6 +404,14 @@ async function throwIfModelLoadFailed(response, model) {
380
404
  attempts
381
405
  });
382
406
  }
407
/**
 * Throws `InputTooLongError` when `response` is a 400 whose error detail
 * carries the code `INPUT_TOO_LONG`; otherwise resolves without doing anything.
 *
 * The detail is read from a clone, so the caller's response body stays unread.
 *
 * @param response - The HTTP response to inspect.
 * @param model - Model name attached to the thrown error.
 */
async function throwIfInputTooLong(response, model) {
  if (response.status === 400) {
    const errorDetail = await getErrorDetail(response.clone());
    if (errorDetail && errorDetail.code === "INPUT_TOO_LONG") {
      // Prefer the server's wording; fall back to a generic description.
      let text = "Input exceeds the model's maximum token capacity";
      if (typeof errorDetail.message === "string") {
        text = errorDetail.message;
      }
      throw new InputTooLongError(text, { model });
    }
  }
}
383
415
  async function handleError(response, gpu) {
384
416
  const { status } = response;
385
417
  const detail = await getErrorDetail(response.clone());
@@ -413,6 +445,9 @@ async function handleError(response, gpu) {
413
445
  throw new ProvisioningError(message, gpu, retryAfter);
414
446
  }
415
447
  if (status >= HTTP_CLIENT_ERROR_MIN && status <= HTTP_CLIENT_ERROR_MAX) {
448
+ if (status === 400 && code === "INPUT_TOO_LONG") {
449
+ throw new InputTooLongError(message);
450
+ }
416
451
  throw new RequestError(message, code, status);
417
452
  }
418
453
  if (status >= HTTP_SERVER_ERROR_MIN && status <= HTTP_SERVER_ERROR_MAX) {
@@ -505,6 +540,41 @@ function parseExtractResult(data) {
505
540
  function parseExtractResults(data) {
506
541
  return data.map(parseExtractResult);
507
542
  }
543
/**
 * Returns the `typeof` name of `value`, except that `null` is reported as
 * "null" rather than "object".
 */
function describeType(value) {
  return value === null ? "null" : typeof value;
}
547
/**
 * Normalizes a token count: finite numbers are truncated toward zero,
 * anything else (missing, non-number, NaN, ±Infinity) becomes 0.
 */
function coerceTokenCount(v) {
  if (typeof v !== "number" || !Number.isFinite(v)) {
    return 0;
  }
  return Math.trunc(v);
}
550
/**
 * Converts a snake_case generate response into the camelCase result shape.
 *
 * `model` and `text` must be strings. `finish_reason` defaults to "stop",
 * missing or invalid token counts become 0, and `attempt_id`, `ttft_ms`
 * and `tpot_ms` are passed through untouched.
 *
 * @param data - Parsed response body (the caller has already checked it is an object).
 * @returns `{ model, text, finishReason, usage, attemptId, ttftMs, tpotMs }`
 * @throws RequestError when `model` or `text` is not a string.
 */
function parseGenerateResult(data) {
  for (const field of ["model", "text"]) {
    if (typeof data[field] !== "string") {
      throw new RequestError(
        `Generate response missing string '${field}' field: got ${describeType(data[field])}`
      );
    }
  }
  const tokenUsage = data.usage ?? {};
  return {
    model: data.model,
    text: data.text,
    finishReason: data.finish_reason ?? "stop",
    usage: {
      promptTokens: coerceTokenCount(tokenUsage.prompt_tokens),
      completionTokens: coerceTokenCount(tokenUsage.completion_tokens),
      totalTokens: coerceTokenCount(tokenUsage.total_tokens)
    },
    attemptId: data.attempt_id,
    ttftMs: data.ttft_ms,
    tpotMs: data.tpot_ms
  };
}
508
578
  function parseCapacityInfo(data, gpuFilter) {
509
579
  const wire = data;
510
580
  let workers = wire.workers ?? [];
@@ -530,11 +600,208 @@ function parseCapacityInfo(data, gpuFilter) {
530
600
  };
531
601
  }
532
602
 
603
+ // src/internal/provisioning.ts
604
/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
607
/**
 * Repeats `performFetch` until it produces a 200 response, backing off on
 * capacity signals for as long as the provisioning budget allows.
 *
 * - `202`: retried only when `opts.waitForCapacity` is set, otherwise a
 *   `ProvisioningError` is thrown.
 * - `503` with the model-loading error code: retried until the budget runs out,
 *   then `ModelLoadingError`.
 * - any other `503`: retried while `opts.waitForCapacity` is set and budget remains.
 * - every other non-OK response goes to `handleError`; an OK status other
 *   than 200 raises `RequestError`.
 *
 * @param performFetch - Zero-argument async function performing one attempt.
 * @param opts - `{ model, gpu, waitForCapacity, provisionTimeoutMs }`.
 * @returns The first response with status 200.
 */
async function withProvisioningRetry(performFetch, opts) {
  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;
  // Sleep for the server-suggested delay (or the fallback), never longer
  // than what is left of the provisioning budget.
  const pause = (response, fallbackMs, spentMs) => {
    const waitMs = getRetryAfter2(response) ?? fallbackMs;
    return sleep(Math.min(waitMs, opts.provisionTimeoutMs - spentMs));
  };
  for (;;) {
    const response = await performFetch();
    if (response.status === HTTP_ACCEPTED) {
      if (!opts.waitForCapacity) {
        throw new ProvisioningError(
          "No capacity available. Server is provisioning.",
          opts.gpu,
          getRetryAfter2(response)
        );
      }
      const spentMs = elapsedMs();
      if (spentMs >= opts.provisionTimeoutMs) {
        throw new ProvisioningError(
          `Provisioning timeout after ${spentMs}ms`,
          opts.gpu,
          getRetryAfter2(response)
        );
      }
      await pause(response, DEFAULT_RETRY_DELAY, spentMs);
      continue;
    }
    await throwIfModelLoadFailed(response, opts.model);
    if (response.status === 503) {
      // Read the code from a clone so the body stays available to handleError.
      const errorCode = await getErrorCode(response.clone());
      if (errorCode === MODEL_LOADING_ERROR_CODE) {
        const spentMs = elapsedMs();
        if (spentMs >= opts.provisionTimeoutMs) {
          throw new ModelLoadingError(`Model loading timeout for '${opts.model}'`, opts.model);
        }
        await pause(response, MODEL_LOADING_DEFAULT_DELAY, spentMs);
        continue;
      }
      if (opts.waitForCapacity) {
        const spentMs = elapsedMs();
        if (spentMs < opts.provisionTimeoutMs) {
          await pause(response, DEFAULT_RETRY_DELAY, spentMs);
          continue;
        }
      }
    }
    if (!response.ok) {
      await handleError(response);
    }
    if (response.status !== 200) {
      throw new RequestError(`Unexpected response status ${response.status}`);
    }
    return response;
  }
}
661
+
662
+ // src/sse.ts
663
/** Sentinel payload that marks the clean end of a stream. */
var SSE_DONE = "[DONE]";
/** Upper bound on the size (in UTF-16 units) of a single unterminated event. */
var MAX_SSE_BUFFER_CHARS = 8 * 1024 * 1024;
/**
 * Incrementally parses a server-sent-events byte stream and yields the
 * `data:` payload of every event, in order.
 *
 * Events are separated by a blank line (`\n\n` or `\r\n\r\n`). Events with no
 * `data:` field are skipped. The generator finishes when the `[DONE]`
 * sentinel is seen or when the stream ends; a trailing event that was not
 * followed by a blank line is still delivered.
 *
 * Cleanup: after a clean finish the reader lock is released; on any other
 * exit (error, abort, consumer stopping early) the reader is cancelled.
 *
 * @param reader - A stream reader exposing `read()`, `cancel()` and `releaseLock()`.
 * @param signal - Optional `AbortSignal`; aborting rejects the pending read.
 * @throws SIEConnectionError when `signal` is (or becomes) aborted.
 * @throws SIEStreamError when one unterminated event exceeds `MAX_SSE_BUFFER_CHARS`.
 */
async function* parseSseStream(reader, signal) {
  // One left-to-right scan finds whichever terminator comes first.
  const eventSeparator = /\r\n\r\n|\n\n/;
  const decoder = new TextDecoder("utf-8");
  let buffer = "";
  let completedCleanly = false;
  const abortedByCaller = () => new SIEConnectionError("Stream aborted by caller", "other");
  const onAbort = () => {
    // Best effort: a failing cancel must not mask the abort itself.
    reader.cancel().catch(() => {
    });
  };
  // Reads the next chunk, rejecting as soon as the signal aborts so a
  // stalled connection cannot keep the consumer waiting.
  const readNext = () => {
    if (!signal) return reader.read();
    return new Promise((resolve, reject) => {
      const onAbortRace = () => reject(abortedByCaller());
      signal.addEventListener("abort", onAbortRace, { once: true });
      reader.read().then(resolve, reject).finally(() => {
        signal.removeEventListener("abort", onAbortRace);
      });
    });
  };
  try {
    if (signal) {
      // Checked inside the try so that an already-aborted signal still goes
      // through the cleanup below instead of leaving the reader locked.
      if (signal.aborted) {
        throw new SIEConnectionError("Stream aborted before first read", "other");
      }
      signal.addEventListener("abort", onAbort, { once: true });
    }
    while (true) {
      if (signal?.aborted) {
        throw abortedByCaller();
      }
      let result;
      try {
        result = await readNext();
      } catch (err) {
        if (err instanceof SIEConnectionError) throw err;
        // A read that failed because of the abort is reported as the abort.
        if (signal?.aborted) {
          throw abortedByCaller();
        }
        throw err;
      }
      if (result.done) {
        // Flush any bytes the decoder was still holding back.
        buffer += decoder.decode();
        break;
      }
      buffer += decoder.decode(result.value, { stream: true });
      let match = eventSeparator.exec(buffer);
      while (match !== null) {
        const eventBlock = buffer.slice(0, match.index);
        buffer = buffer.slice(match.index + match[0].length);
        const payload = extractDataPayload(eventBlock);
        if (payload !== null) {
          if (payload === SSE_DONE) {
            completedCleanly = true;
            return;
          }
          yield payload;
        }
        match = eventSeparator.exec(buffer);
      }
      // Only what is left after draining complete events counts against the
      // cap; a large read made of well-terminated events is legitimate.
      if (buffer.length > MAX_SSE_BUFFER_CHARS) {
        throw new SIEStreamError(
          `SSE event buffer exceeded ${MAX_SSE_BUFFER_CHARS} chars without an event terminator`
        );
      }
    }
    // The stream ended without a final blank line: deliver what is left.
    const tail = buffer.replace(/\r?\n$/, "");
    if (tail !== "") {
      const payload = extractDataPayload(tail);
      if (payload !== null && payload !== SSE_DONE) {
        yield payload;
      }
    }
    completedCleanly = true;
  } finally {
    if (signal) signal.removeEventListener("abort", onAbort);
    if (completedCleanly) {
      try {
        reader.releaseLock();
      } catch {
        // Releasing an already-released or errored reader is harmless here.
      }
    } else {
      await reader.cancel().catch(() => {
      });
    }
  }
}
785
/**
 * Extracts the data payload from one SSE event block.
 *
 * Only lines beginning with `data:` contribute; a single space directly
 * after the colon is dropped. Multiple data lines are joined with "\n".
 *
 * @param block - Text of one event, without its terminating blank line.
 * @returns The joined payload, or `null` when the block has no data line.
 */
function extractDataPayload(block) {
  const values = block
    .split(/\r?\n/)
    .filter((line) => line.startsWith("data:"))
    .map((line) => {
      const raw = line.slice(5);
      return raw.startsWith(" ") ? raw.slice(1) : raw;
    });
  return values.length === 0 ? null : values.join("\n");
}
799
+
533
800
  // src/version.ts
534
- var SDK_VERSION = "0.3.3";
801
+ var SDK_VERSION = "0.4.0";
535
802
 
536
803
  // src/client.ts
537
- function sleep(ms) {
804
/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function sleep2(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
540
807
  function abortableSleep(ms, signal) {
@@ -552,6 +819,19 @@ function abortableSleep(ms, signal) {
552
819
  });
553
820
  }
554
821
  var _LEASE_RENEWAL_MAX_RETRIES = 5;
822
/**
 * Turns the `error` member of a chat-completion stream chunk into an
 * `SIEStreamError`.
 *
 * @param chunk - A JSON-parsed stream chunk; may be any JSON value.
 * @returns An `SIEStreamError` when the chunk carries an error, else `null`.
 */
function extractChatChunkError(chunk) {
  // `chunk` is whatever JSON.parse produced, so it can legitimately be null.
  const err = chunk?.error;
  if (!err) return null;
  // A bare string error has no fields to map; keep its text as the message.
  if (typeof err === "string") return new SIEStreamError(err);
  return new SIEStreamError(err.message ?? "stream error", {
    code: err.code,
    errorType: err.type,
    param: err.param
  });
}
831
/**
 * Turns the `error` member of a generate stream chunk into an
 * `SIEStreamError`.
 *
 * @param chunk - A JSON-parsed stream chunk; may be any JSON value.
 * @returns An `SIEStreamError` when the chunk carries an error, else `null`.
 */
function extractGenerateChunkError(chunk) {
  // `chunk` is whatever JSON.parse produced, so it can legitimately be null.
  const err = chunk?.error;
  if (!err) return null;
  // A bare string error has no fields to map; keep its text as the message.
  if (typeof err === "string") return new SIEStreamError(err);
  // Same fallback text as the chat variant, so the message is never empty.
  return new SIEStreamError(err.message ?? "stream error", { code: err.code });
}
555
835
  var SIEClient = class {
556
836
  baseUrl;
557
837
  timeout;
@@ -771,6 +1051,427 @@ var SIEClient = class {
771
1051
  * console.log(result.scores[0].itemId); // most relevant
772
1052
  * ```
773
1053
  */
1054
+ /**
1055
+ * Generate text from a prompt (walking-skeleton SDK surface).
1056
+ *
1057
+ * This method does not expose streaming chunks (use `streamGenerate` for
1058
+ * that). The worker streams to the gateway, the gateway aggregates, and the
1059
+ * SDK returns the assembled result plus SIE-native timing metadata (TTFT,
1060
+ * TPOT, attempt id).
1061
+ *
1062
+ * @example
1063
+ * ```typescript
1064
+ * const result = await client.generate(
1065
+ * "Qwen__Qwen3-4B-Instruct-2507",
1066
+ * "Write a haiku about the sea.",
1067
+ * { maxNewTokens: 64, temperature: 0.7 },
1068
+ * );
1069
+ * console.log(result.text);
1070
+ * console.log(`TTFT: ${result.ttftMs}ms`);
1071
+ * ```
1072
+ */
1073
+ async generate(model, prompt, options) {
1074
+ const body = {
1075
+ prompt,
1076
+ max_new_tokens: options.maxNewTokens,
1077
+ temperature: options.temperature ?? 1,
1078
+ top_p: options.topP ?? 1
1079
+ };
1080
+ if (options.stop !== void 0) {
1081
+ body.stop = options.stop;
1082
+ }
1083
+ const { pool, gpu } = this.parseGpuParam(options.gpu);
1084
+ const headers = {
1085
+ Accept: "application/json",
1086
+ "Content-Type": JSON_CONTENT_TYPE,
1087
+ [SDK_VERSION_HEADER]: SDK_VERSION
1088
+ };
1089
+ if (pool) headers["X-SIE-Pool"] = pool;
1090
+ if (gpu) headers["X-SIE-MACHINE-PROFILE"] = gpu;
1091
+ if (this.apiKey) headers.Authorization = `Bearer ${this.apiKey}`;
1092
+ const safeModel = model.replaceAll("/", "__");
1093
+ const url = `${this.baseUrl}/v1/generate/${encodeURIComponent(safeModel)}`;
1094
+ const waitForCapacity = options.waitForCapacity ?? this.defaultWaitForCapacity;
1095
+ const response = await withProvisioningRetry(() => this.performJsonPost(url, body, headers), {
1096
+ model,
1097
+ gpu,
1098
+ waitForCapacity,
1099
+ provisionTimeoutMs: this.provisionTimeout
1100
+ });
1101
+ const data = await response.json();
1102
+ if (data === null || typeof data !== "object") {
1103
+ throw new RequestError("Unexpected generate response shape");
1104
+ }
1105
+ return parseGenerateResult(data);
1106
+ }
1107
+ /**
1108
+ * Per-attempt JSON POST used by the non-streaming surfaces
1109
+ * ({@link generate}, {@link chatCompletions}) inside the
1110
+ * {@link withProvisioningRetry} loop.
1111
+ *
1112
+ * Translates low-level transport failures into typed errors that the
1113
+ * retry loop will surface verbatim:
1114
+ * - `AbortError` → `SIEConnectionError` (per-attempt timeout)
1115
+ * - `TypeError` → `SIEConnectionError` (NOT retried — generation is
1116
+ * non-idempotent, so a mid-flight drop must surface instead of
1117
+ * silently re-issuing a billable generation)
1118
+ *
1119
+ * Each call uses a fresh `AbortController` so concurrent retries don't
1120
+ * share state, and the per-attempt timeout is bounded by `this.timeout`
1121
+ * (NOT the cumulative provisioning budget).
1122
+ */
1123
+ async performJsonPost(url, body, headers) {
1124
+ const controller = new AbortController();
1125
+ const timeoutId = setTimeout(() => controller.abort(), this.timeout);
1126
+ try {
1127
+ return await fetch(url, {
1128
+ method: "POST",
1129
+ headers,
1130
+ body: JSON.stringify(body),
1131
+ signal: controller.signal
1132
+ });
1133
+ } catch (err) {
1134
+ if (err instanceof Error && err.name === "AbortError") {
1135
+ throw new SIEConnectionError(`Request timeout after ${this.timeout}ms`, "timeout");
1136
+ }
1137
+ if (err instanceof TypeError) {
1138
+ throw new SIEConnectionError(`Connection failed: ${err.message}`, "connect");
1139
+ }
1140
+ throw err;
1141
+ } finally {
1142
+ clearTimeout(timeoutId);
1143
+ }
1144
+ }
1145
+ /**
1146
+ * Non-streaming chat-completion call against `/v1/chat/completions`.
1147
+ *
1148
+ * This is the OpenAI-compatible surface. The request body is forwarded
1149
+ * verbatim as JSON, so any field documented at
1150
+ * <https://platform.openai.com/docs/api-reference/chat/create> can be set;
1151
+ * the gateway will reject fields it does not yet support with
1152
+ * `400 unsupported_field`. SIE-native routing hints (`routing_key`,
1153
+ * `prompt_cache_key`) are part of the same request shape.
1154
+ *
1155
+ * Error semantics mirror `generate()`: 4xx → `RequestError`, 5xx →
1156
+ * `ServerError` (or the more specific `ModelLoadFailedError` for 502
1157
+ * `MODEL_LOAD_FAILED`), connection / timeout failures →
1158
+ * `SIEConnectionError`.
1159
+ *
1160
+ * If `req.stream === true`, this method throws `RequestError` immediately —
1161
+ * use {@link streamChatCompletions} instead. We do not auto-route because
1162
+ * the return type is fundamentally different (`Promise` vs
1163
+ * `AsyncGenerator`) and silently flipping would mis-type the call site.
1164
+ *
1165
+ * @example
1166
+ * ```typescript
1167
+ * const reply = await client.chatCompletions({
1168
+ * model: "Qwen/Qwen3-4B-Instruct-2507",
1169
+ * messages: [{ role: "user", content: "Write a haiku about the sea." }],
1170
+ * max_completion_tokens: 64,
1171
+ * });
1172
+ * console.log(reply.choices[0]?.message.content);
1173
+ * ```
1174
+ */
1175
+ async chatCompletions(req, options = {}) {
1176
+ if (req.stream === true) {
1177
+ throw new RequestError(
1178
+ "chatCompletions() cannot be used with stream:true \u2014 use streamChatCompletions() instead.",
1179
+ "invalid_request",
1180
+ 400
1181
+ );
1182
+ }
1183
+ const body = { ...req, stream: false };
1184
+ const url = `${this.baseUrl}/v1/chat/completions`;
1185
+ const headers = this.buildChatHeaders("application/json");
1186
+ const waitForCapacity = options.waitForCapacity ?? this.defaultWaitForCapacity;
1187
+ const provisionTimeoutMs = options.provisionTimeoutMs ?? this.provisionTimeout;
1188
+ const response = await withProvisioningRetry(() => this.performJsonPost(url, body, headers), {
1189
+ model: req.model,
1190
+ gpu: void 0,
1191
+ waitForCapacity,
1192
+ provisionTimeoutMs
1193
+ });
1194
+ this.checkServerVersion(response);
1195
+ const data = await response.json();
1196
+ if (data === null || typeof data !== "object") {
1197
+ throw new RequestError("Unexpected chat.completion response shape");
1198
+ }
1199
+ return data;
1200
+ }
1201
+ /**
1202
+ * Streaming chat-completion call against `/v1/chat/completions` with
1203
+ * `Accept: text/event-stream`.
1204
+ *
1205
+ * Yields `ChatCompletionChunk` events in the order the gateway emits them.
1206
+ * The terminal chunk carries `finish_reason`; if
1207
+ * `req.stream_options.include_usage === true`, a final usage-only chunk
1208
+ * (`choices: []`, populated `usage`) follows it. The generator completes
1209
+ * cleanly on the `data: [DONE]` sentinel.
1210
+ *
1211
+ * Error semantics:
1212
+ *
1213
+ * - HTTP 4xx / 5xx **before** the stream opens → throws `RequestError` /
1214
+ * `ServerError` (same as {@link chatCompletions}).
1215
+ * - A chunk containing `error: { ... }` mid-stream → throws
1216
+ * {@link SIEStreamError}. The error chunk is consumed, never yielded.
1217
+ * - `signal.abort()` mid-stream → the generator throws
1218
+ * `SIEConnectionError` and releases the underlying reader, which
1219
+ * fires `StreamCancelGuard` on the gateway side.
1220
+ *
1221
+ * `req.stream` is set to `true` automatically; any existing value is
1222
+ * overwritten. We do not validate `req.stream === false` because the
1223
+ * call-site intent is unambiguous.
1224
+ *
1225
+ * @param req The chat-completion request. See {@link ChatCompletionRequest}.
1226
+ * @param signal Optional `AbortSignal` for cooperative cancellation.
1227
+ *
1228
+ * @example
1229
+ * ```typescript
1230
+ * const controller = new AbortController();
1231
+ * try {
1232
+ * for await (const chunk of client.streamChatCompletions(
1233
+ * {
1234
+ * model: "Qwen/Qwen3-4B-Instruct-2507",
1235
+ * messages: [{ role: "user", content: "Count to ten." }],
1236
+ * stream_options: { include_usage: true },
1237
+ * },
1238
+ * controller.signal,
1239
+ * )) {
1240
+ * process.stdout.write(chunk.choices[0]?.delta.content ?? "");
1241
+ * }
1242
+ * } catch (err) {
1243
+ * if (err instanceof SIEStreamError) {
1244
+ * console.error(`mid-stream error: ${err.code} — ${err.message}`);
1245
+ * } else throw err;
1246
+ * }
1247
+ * ```
1248
+ */
1249
+ async *streamChatCompletions(req, signal) {
1250
+ const body = { ...req, stream: true };
1251
+ const url = `${this.baseUrl}/v1/chat/completions`;
1252
+ yield* this.consumeSseStream(
1253
+ url,
1254
+ body,
1255
+ req.model,
1256
+ signal,
1257
+ (chunk) => extractChatChunkError(chunk)
1258
+ );
1259
+ }
1260
+ /**
1261
+ * Streaming companion to {@link generate} — opens an SSE connection to
1262
+ * `/v1/generate/{model}` with `stream: true` and yields the SIE-native
1263
+ * chunk shape documented in
1264
+ * `packages/sie_gateway/src/handlers/sse.rs::build_generate_chunk_event`.
1265
+ *
1266
+ * The first delta carries `seq: 0` and `text_delta` populated; the
1267
+ * terminal chunk has `done: true`, `finish_reason`, and (typically)
1268
+ * `usage` + `ttft_ms`. The generator completes on the `data: [DONE]`
1269
+ * sentinel.
1270
+ *
1271
+ * Error semantics match {@link streamChatCompletions}: pre-stream HTTP
1272
+ * errors throw normally, mid-stream `error` chunks throw
1273
+ * {@link SIEStreamError}.
1274
+ *
1275
+ * @example
1276
+ * ```typescript
1277
+ * for await (const chunk of client.streamGenerate(
1278
+ * "Qwen/Qwen3-4B-Instruct-2507",
1279
+ * "Write a haiku.",
1280
+ * { maxNewTokens: 64, temperature: 0.7 },
1281
+ * )) {
1282
+ * process.stdout.write(chunk.text_delta);
1283
+ * if (chunk.done) console.log(`\nTTFT: ${chunk.ttft_ms}ms`);
1284
+ * }
1285
+ * ```
1286
+ */
1287
+ async *streamGenerate(model, prompt, options, signal) {
1288
+ const body = {
1289
+ prompt,
1290
+ max_new_tokens: options.maxNewTokens,
1291
+ temperature: options.temperature ?? 1,
1292
+ top_p: options.topP ?? 1,
1293
+ stream: true
1294
+ };
1295
+ if (options.stop !== void 0) body.stop = options.stop;
1296
+ const safeModel = model.replaceAll("/", "__");
1297
+ const url = `${this.baseUrl}/v1/generate/${encodeURIComponent(safeModel)}`;
1298
+ const { pool, gpu } = this.parseGpuParam(options.gpu);
1299
+ const waitForCapacity = options.waitForCapacity ?? this.defaultWaitForCapacity;
1300
+ yield* this.consumeSseStream(
1301
+ url,
1302
+ body,
1303
+ model,
1304
+ signal,
1305
+ (chunk) => extractGenerateChunkError(chunk),
1306
+ { pool, gpu },
1307
+ { waitForCapacity }
1308
+ );
1309
+ }
1310
+ /**
1311
+ * Shared SSE consumption helper for the streaming methods.
1312
+ *
1313
+ * Performs a pre-stream provisioning retry loop (honoring
1314
+ * `waitForCapacity`/`provisionTimeout`), surfaces pre-stream errors via
1315
+ * {@link handleError} (so callers see the same `RequestError` /
1316
+ * `ServerError` hierarchy as the non-streaming endpoints), then iterates
1317
+ * the SSE payloads via {@link parseSseStream}. Each payload is JSON-parsed;
1318
+ * if the consumer-supplied `extractError` returns an `SIEStreamError`, the
1319
+ * generator throws it instead of yielding the chunk.
1320
+ *
1321
+ * Retry policy: only pre-stream capacity signals — `202` (provisioning)
1322
+ * and `503` (`MODEL_LOADING` or any other `503`) — are retried, and
1323
+ * only while `waitForCapacity` is set and the provision budget
1324
+ * remains. Once the body opens we never retry (the call is
1325
+ * non-idempotent; a mid-stream failure must not re-issue generation).
1326
+ *
1327
+ * @internal
1328
+ */
1329
+ async *consumeSseStream(url, body, model, signal, extractError, routing, provisioning) {
1330
+ const headers = this.buildChatHeaders("text/event-stream");
1331
+ if (routing?.pool) headers["X-SIE-Pool"] = routing.pool;
1332
+ if (routing?.gpu) headers["X-SIE-MACHINE-PROFILE"] = routing.gpu;
1333
+ const waitForCapacity = provisioning?.waitForCapacity ?? this.defaultWaitForCapacity;
1334
+ const gpu = routing?.gpu;
1335
+ const controller = new AbortController();
1336
+ const onCallerAbort = () => controller.abort();
1337
+ if (signal) {
1338
+ if (signal.aborted) {
1339
+ throw new SIEConnectionError("Stream aborted before request", "other");
1340
+ }
1341
+ signal.addEventListener("abort", onCallerAbort, { once: true });
1342
+ }
1343
+ try {
1344
+ const startTime = Date.now();
1345
+ let response;
1346
+ while (true) {
1347
+ if (signal?.aborted) {
1348
+ throw new SIEConnectionError("Stream aborted before request", "other");
1349
+ }
1350
+ const preStreamTimeoutId = setTimeout(() => controller.abort(), this.timeout);
1351
+ let attemptResponse;
1352
+ try {
1353
+ attemptResponse = await fetch(url, {
1354
+ method: "POST",
1355
+ headers,
1356
+ body: JSON.stringify(body),
1357
+ signal: controller.signal
1358
+ });
1359
+ } catch (error) {
1360
+ if (signal?.aborted) {
1361
+ throw new SIEConnectionError("Stream aborted before response", "other");
1362
+ }
1363
+ if (error instanceof Error && error.name === "AbortError") {
1364
+ throw new SIEConnectionError(`Stream open timeout after ${this.timeout}ms`, "timeout");
1365
+ }
1366
+ if (error instanceof TypeError) {
1367
+ throw new SIEConnectionError(`Connection failed: ${error.message}`, "connect");
1368
+ }
1369
+ throw error;
1370
+ } finally {
1371
+ clearTimeout(preStreamTimeoutId);
1372
+ }
1373
+ if (attemptResponse.status === HTTP_ACCEPTED) {
1374
+ if (!waitForCapacity) {
1375
+ throw new ProvisioningError(
1376
+ "No capacity available. Server is provisioning.",
1377
+ gpu,
1378
+ getRetryAfter2(attemptResponse)
1379
+ );
1380
+ }
1381
+ const elapsed = Date.now() - startTime;
1382
+ if (elapsed >= this.provisionTimeout) {
1383
+ throw new ProvisioningError(
1384
+ `Provisioning timeout after ${elapsed}ms`,
1385
+ gpu,
1386
+ getRetryAfter2(attemptResponse)
1387
+ );
1388
+ }
1389
+ const delay = getRetryAfter2(attemptResponse) ?? DEFAULT_RETRY_DELAY;
1390
+ if (await abortableSleep(
1391
+ Math.min(delay, this.provisionTimeout - elapsed),
1392
+ controller.signal
1393
+ )) {
1394
+ throw new SIEConnectionError("Stream aborted while provisioning", "other");
1395
+ }
1396
+ continue;
1397
+ }
1398
+ await throwIfModelLoadFailed(attemptResponse, model);
1399
+ if (attemptResponse.status === 503) {
1400
+ const errorCode = await getErrorCode(attemptResponse.clone());
1401
+ if (errorCode === MODEL_LOADING_ERROR_CODE && waitForCapacity) {
1402
+ const elapsed = Date.now() - startTime;
1403
+ if (elapsed >= this.provisionTimeout) {
1404
+ throw new ModelLoadingError(`Model loading timeout for '${model}'`, model);
1405
+ }
1406
+ const delay = getRetryAfter2(attemptResponse) ?? MODEL_LOADING_DEFAULT_DELAY;
1407
+ if (await abortableSleep(
1408
+ Math.min(delay, this.provisionTimeout - elapsed),
1409
+ controller.signal
1410
+ )) {
1411
+ throw new SIEConnectionError("Stream aborted while provisioning", "other");
1412
+ }
1413
+ continue;
1414
+ }
1415
+ if (waitForCapacity) {
1416
+ const elapsed = Date.now() - startTime;
1417
+ if (elapsed < this.provisionTimeout) {
1418
+ const delay = getRetryAfter2(attemptResponse) ?? DEFAULT_RETRY_DELAY;
1419
+ if (await abortableSleep(
1420
+ Math.min(delay, this.provisionTimeout - elapsed),
1421
+ controller.signal
1422
+ )) {
1423
+ throw new SIEConnectionError("Stream aborted while provisioning", "other");
1424
+ }
1425
+ continue;
1426
+ }
1427
+ }
1428
+ }
1429
+ if (attemptResponse.status !== 200) {
1430
+ await handleError(attemptResponse);
1431
+ }
1432
+ response = attemptResponse;
1433
+ break;
1434
+ }
1435
+ if (!response) {
1436
+ throw new RequestError("Streaming request failed without producing a response");
1437
+ }
1438
+ this.checkServerVersion(response);
1439
+ const bodyStream = response.body;
1440
+ if (!bodyStream) {
1441
+ throw new RequestError("Streaming response has no body");
1442
+ }
1443
+ const reader = bodyStream.getReader();
1444
+ for await (const payload of parseSseStream(reader, signal ?? controller.signal)) {
1445
+ let chunk;
1446
+ try {
1447
+ chunk = JSON.parse(payload);
1448
+ } catch (err) {
1449
+ throw new RequestError(
1450
+ `Failed to parse SSE chunk as JSON: ${err instanceof Error ? err.message : String(err)}`
1451
+ );
1452
+ }
1453
+ const streamErr = extractError(chunk);
1454
+ if (streamErr) throw streamErr;
1455
+ yield chunk;
1456
+ }
1457
+ } finally {
1458
+ if (signal) signal.removeEventListener("abort", onCallerAbort);
1459
+ }
1460
+ }
1461
+ /**
1462
+ * Build the standard JSON header set for the chat-completions surface.
1463
+ * Pulled out so both the streaming and non-streaming paths agree on
1464
+ * auth / version / content-type wiring.
1465
+ */
1466
+ buildChatHeaders(accept) {
1467
+ const headers = {
1468
+ Accept: accept,
1469
+ "Content-Type": JSON_CONTENT_TYPE,
1470
+ [SDK_VERSION_HEADER]: SDK_VERSION
1471
+ };
1472
+ if (this.apiKey) headers.Authorization = `Bearer ${this.apiKey}`;
1473
+ return headers;
1474
+ }
774
1475
  async score(model, query, items, options = {}) {
775
1476
  const body = {
776
1477
  query,
@@ -820,6 +1521,9 @@ var SIEClient = class {
820
1521
  if (options.threshold !== void 0) {
821
1522
  params.threshold = options.threshold;
822
1523
  }
1524
+ if (options.adapterOptions !== void 0) {
1525
+ params.options = options.adapterOptions;
1526
+ }
823
1527
  body.params = params;
824
1528
  const waitForCapacity = options.waitForCapacity ?? this.defaultWaitForCapacity;
825
1529
  const { pool, gpu } = this.parseGpuParam(options.gpu);
@@ -859,17 +1563,18 @@ var SIEClient = class {
859
1563
  this.pools.clear();
860
1564
  }
861
1565
  /**
862
- * Create a resource pool for isolated capacity.
1566
+ * Create or update a resource pool for isolated capacity.
863
1567
  *
864
1568
  * Pools provide dedicated worker capacity, isolated from other clients.
865
1569
  * Workers are assigned to pools and only serve requests from that pool.
866
1570
  *
867
1571
  * @param name - Pool name (used in GPU param as "poolName/machineProfile")
868
- * @param gpus - Machine profile requirements, e.g., { "l4": 2, "l4-spot": 1 }
1572
+ * @param gpus - Optional machine profile requirements for pool readiness, e.g., { "l4": 2, "l4-spot": 1 }
1573
+ * @param gpuCaps - Optional maximum assigned workers per machine profile
869
1574
  *
870
1575
  * @example
871
1576
  * ```typescript
872
- * // Create a pool with 2 L4 GPUs
1577
+ * // Create or update a pool with 2 L4 GPUs
873
1578
  * await client.createPool("eval-bench", { l4: 2 });
874
1579
  *
875
1580
  * // Use the pool for requests
@@ -879,11 +1584,17 @@ var SIEClient = class {
879
1584
  * await client.deletePool("eval-bench");
880
1585
  * ```
881
1586
  */
882
- async createPool(name, gpus) {
883
- if (this.pools.has(name)) {
884
- return;
1587
+ async createPool(name, gpus, gpuCaps) {
1588
+ const alreadyTracking = this.pools.has(name);
1589
+ const requestBody = {
1590
+ name
1591
+ };
1592
+ if (gpus !== void 0) {
1593
+ requestBody.gpus = gpus;
1594
+ }
1595
+ if (gpuCaps) {
1596
+ requestBody.gpu_caps = gpuCaps;
885
1597
  }
886
- const requestBody = { name, gpus };
887
1598
  const url = `${this.baseUrl}/v1/pools`;
888
1599
  const headers = {
889
1600
  "Content-Type": JSON_CONTENT_TYPE,
@@ -911,6 +1622,9 @@ var SIEClient = class {
911
1622
  }
912
1623
  throw new PoolError(`Failed to create pool '${name}': ${errorMsg}`, name);
913
1624
  }
1625
+ if (alreadyTracking || this.pools.has(name)) {
1626
+ return;
1627
+ }
914
1628
  const abortController = new AbortController();
915
1629
  const poolState = {
916
1630
  timeoutId: null,
@@ -942,7 +1656,7 @@ var SIEClient = class {
942
1656
  signal: perAttempt.signal
943
1657
  });
944
1658
  if (resp.ok) break;
945
- } catch (error) {
1659
+ } catch {
946
1660
  if (abortController.signal.aborted) return;
947
1661
  } finally {
948
1662
  clearTimeout(attemptTimeout);
@@ -1179,7 +1893,7 @@ var SIEClient = class {
1179
1893
  }
1180
1894
  const remaining = timeout - elapsed;
1181
1895
  const delay = Math.min(pollInterval, remaining);
1182
- await sleep(delay);
1896
+ await sleep2(delay);
1183
1897
  }
1184
1898
  }
1185
1899
  /**
@@ -1206,7 +1920,7 @@ var SIEClient = class {
1206
1920
  if (elapsed < this.provisionTimeout) {
1207
1921
  const remaining = this.provisionTimeout - elapsed;
1208
1922
  const delay = Math.min(DEFAULT_RETRY_DELAY, remaining);
1209
- await sleep(delay);
1923
+ await sleep2(delay);
1210
1924
  continue;
1211
1925
  }
1212
1926
  }
@@ -1232,10 +1946,11 @@ var SIEClient = class {
1232
1946
  const delay = retryAfter ?? DEFAULT_RETRY_DELAY;
1233
1947
  const remaining = this.provisionTimeout - elapsed;
1234
1948
  const actualDelay = Math.min(delay, remaining);
1235
- await sleep(actualDelay);
1949
+ await sleep2(actualDelay);
1236
1950
  continue;
1237
1951
  }
1238
1952
  await throwIfModelLoadFailed(response, model);
1953
+ await throwIfInputTooLong(response, model);
1239
1954
  if (response.status === 503) {
1240
1955
  const clonedResponse = response.clone();
1241
1956
  const errorCode = await getErrorCode(clonedResponse);
@@ -1251,7 +1966,7 @@ var SIEClient = class {
1251
1966
  }
1252
1967
  const retryAfter = getRetryAfter2(response);
1253
1968
  const delay = retryAfter ?? LORA_LOADING_DEFAULT_DELAY;
1254
- await sleep(delay);
1969
+ await sleep2(delay);
1255
1970
  continue;
1256
1971
  }
1257
1972
  if (errorCode === MODEL_LOADING_ERROR_CODE) {
@@ -1266,7 +1981,7 @@ var SIEClient = class {
1266
1981
  const delay = retryAfter ?? MODEL_LOADING_DEFAULT_DELAY;
1267
1982
  const remaining = this.provisionTimeout - elapsed;
1268
1983
  const actualDelay = Math.min(delay, remaining);
1269
- await sleep(actualDelay);
1984
+ await sleep2(actualDelay);
1270
1985
  continue;
1271
1986
  }
1272
1987
  if (waitForCapacity) {
@@ -1276,7 +1991,7 @@ var SIEClient = class {
1276
1991
  const delay = retryAfter ?? DEFAULT_RETRY_DELAY;
1277
1992
  const remaining = this.provisionTimeout - elapsed;
1278
1993
  const actualDelay = Math.min(delay, remaining);
1279
- await sleep(actualDelay);
1994
+ await sleep2(actualDelay);
1280
1995
  continue;
1281
1996
  }
1282
1997
  }
@@ -1557,6 +2272,7 @@ function detectImageFormat(bytes) {
1557
2272
  return "unknown";
1558
2273
  }
1559
2274
 
2275
+ exports.InputTooLongError = InputTooLongError;
1560
2276
  exports.LoraLoadingError = LoraLoadingError;
1561
2277
  exports.ModelLoadFailedError = ModelLoadFailedError;
1562
2278
  exports.ModelLoadingError = ModelLoadingError;
@@ -1567,6 +2283,7 @@ exports.SDK_VERSION = SDK_VERSION;
1567
2283
  exports.SIEClient = SIEClient;
1568
2284
  exports.SIEConnectionError = SIEConnectionError;
1569
2285
  exports.SIEError = SIEError;
2286
+ exports.SIEStreamError = SIEStreamError;
1570
2287
  exports.ServerError = ServerError;
1571
2288
  exports.denseEmbedding = denseEmbedding;
1572
2289
  exports.detectImageFormat = detectImageFormat;