@superlinked/sie-sdk 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -85,6 +85,21 @@ var ModelLoadingError = class extends SIEError {
85
85
  this.model = model;
86
86
  }
87
87
  };
88
+ var SIEStreamError = class extends SIEError {
89
+ /** SIE-native error code (e.g. `context_exceeded`, `cancelled`). */
90
+ code;
91
+ /** OpenAI-style error type (e.g. `context_length_exceeded`, `server_error`). */
92
+ errorType;
93
+ /** Offending field name when known (chat shape only). */
94
+ param;
95
+ constructor(message, options) {
96
+ super(message);
97
+ this.name = "SIEStreamError";
98
+ this.code = options?.code;
99
+ this.errorType = options?.errorType;
100
+ this.param = options?.param;
101
+ }
102
+ };
88
103
  var ModelLoadFailedError = class extends ServerError {
89
104
  /** The model that was requested */
90
105
  model;
@@ -107,6 +122,15 @@ var ModelLoadFailedError = class extends ServerError {
107
122
  this.attempts = options?.attempts ?? 1;
108
123
  }
109
124
  };
125
+ var InputTooLongError = class extends RequestError {
126
+ /** The model that was requested */
127
+ model;
128
+ constructor(message, options) {
129
+ super(message, "INPUT_TOO_LONG", 400);
130
+ this.name = "InputTooLongError";
131
+ this.model = options?.model;
132
+ }
133
+ };
110
134
 
111
135
  // src/internal/constants.ts
112
136
  var MSGPACK_CONTENT_TYPE = "application/msgpack";
@@ -315,7 +339,7 @@ function unpackMessage(data) {
315
339
  function getRetryAfter(header) {
316
340
  if (!header) return void 0;
317
341
  const seconds = Number.parseInt(header, 10);
318
- if (!Number.isNaN(seconds) && seconds > 0) {
342
+ if (!Number.isNaN(seconds) && seconds >= 0) {
319
343
  return seconds * 1e3;
320
344
  }
321
345
  const date = new Date(header);
@@ -378,6 +402,14 @@ async function throwIfModelLoadFailed(response, model) {
378
402
  attempts
379
403
  });
380
404
  }
405
+ async function throwIfInputTooLong(response, model) {
406
+ if (response.status !== 400) return;
407
+ const detail = await getErrorDetail(response.clone());
408
+ if (!detail) return;
409
+ if (detail.code !== "INPUT_TOO_LONG") return;
410
+ const message = typeof detail.message === "string" ? detail.message : "Input exceeds the model's maximum token capacity";
411
+ throw new InputTooLongError(message, { model });
412
+ }
381
413
  async function handleError(response, gpu) {
382
414
  const { status } = response;
383
415
  const detail = await getErrorDetail(response.clone());
@@ -411,6 +443,9 @@ async function handleError(response, gpu) {
411
443
  throw new ProvisioningError(message, gpu, retryAfter);
412
444
  }
413
445
  if (status >= HTTP_CLIENT_ERROR_MIN && status <= HTTP_CLIENT_ERROR_MAX) {
446
+ if (status === 400 && code === "INPUT_TOO_LONG") {
447
+ throw new InputTooLongError(message);
448
+ }
414
449
  throw new RequestError(message, code, status);
415
450
  }
416
451
  if (status >= HTTP_SERVER_ERROR_MIN && status <= HTTP_SERVER_ERROR_MAX) {
@@ -503,6 +538,41 @@ function parseExtractResult(data) {
503
538
  function parseExtractResults(data) {
504
539
  return data.map(parseExtractResult);
505
540
  }
541
+ function describeType(value) {
542
+ if (value === null) return "null";
543
+ return typeof value;
544
+ }
545
+ function coerceTokenCount(v) {
546
+ return typeof v === "number" && Number.isFinite(v) ? Math.trunc(v) : 0;
547
+ }
548
+ function parseGenerateResult(data) {
549
+ const wire = data;
550
+ if (typeof wire.model !== "string") {
551
+ throw new RequestError(
552
+ `Generate response missing string 'model' field: got ${describeType(wire.model)}`
553
+ );
554
+ }
555
+ if (typeof wire.text !== "string") {
556
+ throw new RequestError(
557
+ `Generate response missing string 'text' field: got ${describeType(wire.text)}`
558
+ );
559
+ }
560
+ const usage = wire.usage ?? {};
561
+ const finish = wire.finish_reason ?? "stop";
562
+ return {
563
+ model: wire.model,
564
+ text: wire.text,
565
+ finishReason: finish,
566
+ usage: {
567
+ promptTokens: coerceTokenCount(usage.prompt_tokens),
568
+ completionTokens: coerceTokenCount(usage.completion_tokens),
569
+ totalTokens: coerceTokenCount(usage.total_tokens)
570
+ },
571
+ attemptId: wire.attempt_id,
572
+ ttftMs: wire.ttft_ms,
573
+ tpotMs: wire.tpot_ms
574
+ };
575
+ }
506
576
  function parseCapacityInfo(data, gpuFilter) {
507
577
  const wire = data;
508
578
  let workers = wire.workers ?? [];
@@ -528,11 +598,208 @@ function parseCapacityInfo(data, gpuFilter) {
528
598
  };
529
599
  }
530
600
 
601
+ // src/internal/provisioning.ts
602
+ function sleep(ms) {
603
+ return new Promise((resolve) => setTimeout(resolve, ms));
604
+ }
605
+ async function withProvisioningRetry(performFetch, opts) {
606
+ const startTime = Date.now();
607
+ while (true) {
608
+ const response = await performFetch();
609
+ if (response.status === HTTP_ACCEPTED) {
610
+ if (!opts.waitForCapacity) {
611
+ throw new ProvisioningError(
612
+ "No capacity available. Server is provisioning.",
613
+ opts.gpu,
614
+ getRetryAfter2(response)
615
+ );
616
+ }
617
+ const elapsed = Date.now() - startTime;
618
+ if (elapsed >= opts.provisionTimeoutMs) {
619
+ throw new ProvisioningError(
620
+ `Provisioning timeout after ${elapsed}ms`,
621
+ opts.gpu,
622
+ getRetryAfter2(response)
623
+ );
624
+ }
625
+ const delay = getRetryAfter2(response) ?? DEFAULT_RETRY_DELAY;
626
+ await sleep(Math.min(delay, opts.provisionTimeoutMs - elapsed));
627
+ continue;
628
+ }
629
+ await throwIfModelLoadFailed(response, opts.model);
630
+ if (response.status === 503) {
631
+ const errorCode = await getErrorCode(response.clone());
632
+ if (errorCode === MODEL_LOADING_ERROR_CODE) {
633
+ const elapsed = Date.now() - startTime;
634
+ if (elapsed >= opts.provisionTimeoutMs) {
635
+ throw new ModelLoadingError(`Model loading timeout for '${opts.model}'`, opts.model);
636
+ }
637
+ const delay = getRetryAfter2(response) ?? MODEL_LOADING_DEFAULT_DELAY;
638
+ await sleep(Math.min(delay, opts.provisionTimeoutMs - elapsed));
639
+ continue;
640
+ }
641
+ if (opts.waitForCapacity) {
642
+ const elapsed = Date.now() - startTime;
643
+ if (elapsed < opts.provisionTimeoutMs) {
644
+ const delay = getRetryAfter2(response) ?? DEFAULT_RETRY_DELAY;
645
+ await sleep(Math.min(delay, opts.provisionTimeoutMs - elapsed));
646
+ continue;
647
+ }
648
+ }
649
+ }
650
+ if (!response.ok) {
651
+ await handleError(response);
652
+ }
653
+ if (response.status !== 200) {
654
+ throw new RequestError(`Unexpected response status ${response.status}`);
655
+ }
656
+ return response;
657
+ }
658
+ }
659
+
660
+ // src/sse.ts
661
+ var SSE_DONE = "[DONE]";
662
+ var MAX_SSE_BUFFER_CHARS = 8 * 1024 * 1024;
663
+ async function* parseSseStream(reader, signal) {
664
+ const decoder = new TextDecoder("utf-8");
665
+ let buffer = "";
666
+ let completedCleanly = false;
667
+ const onAbort = () => {
668
+ reader.cancel().catch(() => {
669
+ });
670
+ };
671
+ if (signal) {
672
+ if (signal.aborted) {
673
+ throw new SIEConnectionError("Stream aborted before first read", "other");
674
+ }
675
+ signal.addEventListener("abort", onAbort, { once: true });
676
+ }
677
+ try {
678
+ while (true) {
679
+ if (signal?.aborted) {
680
+ throw new SIEConnectionError("Stream aborted by caller", "other");
681
+ }
682
+ let result;
683
+ try {
684
+ if (signal) {
685
+ if (signal.aborted) {
686
+ throw new SIEConnectionError("Stream aborted by caller", "other");
687
+ }
688
+ result = await new Promise((resolve, reject) => {
689
+ let settled = false;
690
+ const onAbortRace = () => {
691
+ if (settled) return;
692
+ settled = true;
693
+ signal.removeEventListener("abort", onAbortRace);
694
+ reject(new SIEConnectionError("Stream aborted by caller", "other"));
695
+ };
696
+ signal.addEventListener("abort", onAbortRace, { once: true });
697
+ reader.read().then(
698
+ (r) => {
699
+ if (settled) return;
700
+ settled = true;
701
+ signal.removeEventListener("abort", onAbortRace);
702
+ resolve(r);
703
+ },
704
+ (err) => {
705
+ if (settled) return;
706
+ settled = true;
707
+ signal.removeEventListener("abort", onAbortRace);
708
+ reject(err);
709
+ }
710
+ );
711
+ });
712
+ } else {
713
+ result = await reader.read();
714
+ }
715
+ } catch (err) {
716
+ if (err instanceof SIEConnectionError) throw err;
717
+ if (signal?.aborted) {
718
+ throw new SIEConnectionError("Stream aborted by caller", "other");
719
+ }
720
+ throw err;
721
+ }
722
+ if (result.done) {
723
+ buffer += decoder.decode();
724
+ break;
725
+ }
726
+ buffer += decoder.decode(result.value, { stream: true });
727
+ if (buffer.length > MAX_SSE_BUFFER_CHARS) {
728
+ throw new SIEStreamError(
729
+ `SSE event buffer exceeded ${MAX_SSE_BUFFER_CHARS} chars without an event terminator`
730
+ );
731
+ }
732
+ let sepIdx;
733
+ while (true) {
734
+ const lfIdx = buffer.indexOf("\n\n");
735
+ const crlfIdx = buffer.indexOf("\r\n\r\n");
736
+ if (lfIdx === -1 && crlfIdx === -1) break;
737
+ let sepLen = 2;
738
+ if (lfIdx === -1) {
739
+ sepIdx = crlfIdx;
740
+ sepLen = 4;
741
+ } else if (crlfIdx === -1) {
742
+ sepIdx = lfIdx;
743
+ } else {
744
+ if (lfIdx < crlfIdx) {
745
+ sepIdx = lfIdx;
746
+ } else {
747
+ sepIdx = crlfIdx;
748
+ sepLen = 4;
749
+ }
750
+ }
751
+ const eventBlock = buffer.slice(0, sepIdx);
752
+ buffer = buffer.slice(sepIdx + sepLen);
753
+ const payload = extractDataPayload(eventBlock);
754
+ if (payload === null) continue;
755
+ if (payload === SSE_DONE) {
756
+ completedCleanly = true;
757
+ return;
758
+ }
759
+ yield payload;
760
+ }
761
+ }
762
+ const tail = buffer.replace(/\r?\n$/, "");
763
+ if (tail !== "") {
764
+ const payload = extractDataPayload(tail);
765
+ if (payload !== null && payload !== SSE_DONE) {
766
+ yield payload;
767
+ }
768
+ }
769
+ completedCleanly = true;
770
+ } finally {
771
+ if (signal) signal.removeEventListener("abort", onAbort);
772
+ if (completedCleanly) {
773
+ try {
774
+ reader.releaseLock();
775
+ } catch {
776
+ }
777
+ } else {
778
+ await reader.cancel().catch(() => {
779
+ });
780
+ }
781
+ }
782
+ }
783
+ function extractDataPayload(block) {
784
+ const lines = block.split(/\r?\n/);
785
+ const parts = [];
786
+ for (const line of lines) {
787
+ if (line === "" || line.startsWith(":")) continue;
788
+ if (line.startsWith("data:")) {
789
+ let value = line.slice(5);
790
+ if (value.startsWith(" ")) value = value.slice(1);
791
+ parts.push(value);
792
+ }
793
+ }
794
+ if (parts.length === 0) return null;
795
+ return parts.join("\n");
796
+ }
797
+
531
798
  // src/version.ts
532
- var SDK_VERSION = "0.3.3";
799
+ var SDK_VERSION = "0.4.0";
533
800
 
534
801
  // src/client.ts
535
- function sleep(ms) {
802
+ function sleep2(ms) {
536
803
  return new Promise((resolve) => setTimeout(resolve, ms));
537
804
  }
538
805
  function abortableSleep(ms, signal) {
@@ -550,6 +817,19 @@ function abortableSleep(ms, signal) {
550
817
  });
551
818
  }
552
819
  var _LEASE_RENEWAL_MAX_RETRIES = 5;
820
+ function extractChatChunkError(chunk) {
821
+ const err = chunk.error;
822
+ if (!err) return null;
823
+ return new SIEStreamError(err.message ?? "stream error", {
824
+ code: err.code,
825
+ errorType: err.type,
826
+ param: err.param
827
+ });
828
+ }
829
+ function extractGenerateChunkError(chunk) {
830
+ if (!chunk.error) return null;
831
+ return new SIEStreamError(chunk.error.message, { code: chunk.error.code });
832
+ }
553
833
  var SIEClient = class {
554
834
  baseUrl;
555
835
  timeout;
@@ -769,6 +1049,427 @@ var SIEClient = class {
769
1049
  * console.log(result.scores[0].itemId); // most relevant
770
1050
  * ```
771
1051
  */
1052
+ /**
1053
+ * Generate text from a prompt (walking-skeleton SDK surface).
1054
+ *
1055
+ * The SDK does not currently expose streaming chunks. The worker streams
1056
+ * to the gateway, the gateway aggregates, and the SDK returns the
1057
+ * assembled result plus SIE-native timing metadata (TTFT, TPOT,
1058
+ * attempt id).
1059
+ *
1060
+ * @example
1061
+ * ```typescript
1062
+ * const result = await client.generate(
1063
+ * "Qwen__Qwen3-4B-Instruct-2507",
1064
+ * "Write a haiku about the sea.",
1065
+ * { maxNewTokens: 64, temperature: 0.7 },
1066
+ * );
1067
+ * console.log(result.text);
1068
+ * console.log(`TTFT: ${result.ttftMs}ms`);
1069
+ * ```
1070
+ */
1071
+ async generate(model, prompt, options) {
1072
+ const body = {
1073
+ prompt,
1074
+ max_new_tokens: options.maxNewTokens,
1075
+ temperature: options.temperature ?? 1,
1076
+ top_p: options.topP ?? 1
1077
+ };
1078
+ if (options.stop !== void 0) {
1079
+ body.stop = options.stop;
1080
+ }
1081
+ const { pool, gpu } = this.parseGpuParam(options.gpu);
1082
+ const headers = {
1083
+ Accept: "application/json",
1084
+ "Content-Type": JSON_CONTENT_TYPE,
1085
+ [SDK_VERSION_HEADER]: SDK_VERSION
1086
+ };
1087
+ if (pool) headers["X-SIE-Pool"] = pool;
1088
+ if (gpu) headers["X-SIE-MACHINE-PROFILE"] = gpu;
1089
+ if (this.apiKey) headers.Authorization = `Bearer ${this.apiKey}`;
1090
+ const safeModel = model.replaceAll("/", "__");
1091
+ const url = `${this.baseUrl}/v1/generate/${encodeURIComponent(safeModel)}`;
1092
+ const waitForCapacity = options.waitForCapacity ?? this.defaultWaitForCapacity;
1093
+ const response = await withProvisioningRetry(() => this.performJsonPost(url, body, headers), {
1094
+ model,
1095
+ gpu,
1096
+ waitForCapacity,
1097
+ provisionTimeoutMs: this.provisionTimeout
1098
+ });
1099
+ const data = await response.json();
1100
+ if (data === null || typeof data !== "object") {
1101
+ throw new RequestError("Unexpected generate response shape");
1102
+ }
1103
+ return parseGenerateResult(data);
1104
+ }
1105
+ /**
1106
+ * Per-attempt JSON POST used by the non-streaming surfaces
1107
+ * ({@link generate}, {@link chatCompletions}) inside the
1108
+ * {@link withProvisioningRetry} loop.
1109
+ *
1110
+ * Translates low-level transport failures into typed errors that the
1111
+ * retry loop will surface verbatim:
1112
+ * - `AbortError` → `SIEConnectionError` (per-attempt timeout)
1113
+ * - `TypeError` → `SIEConnectionError` (NOT retried — generation is
1114
+ * non-idempotent, so a mid-flight drop must surface instead of
1115
+ * silently re-issuing a billable generation)
1116
+ *
1117
+ * Each call uses a fresh `AbortController` so concurrent retries don't
1118
+ * share state, and the per-attempt timeout is bounded by `this.timeout`
1119
+ * (NOT the cumulative provisioning budget).
1120
+ */
1121
+ async performJsonPost(url, body, headers) {
1122
+ const controller = new AbortController();
1123
+ const timeoutId = setTimeout(() => controller.abort(), this.timeout);
1124
+ try {
1125
+ return await fetch(url, {
1126
+ method: "POST",
1127
+ headers,
1128
+ body: JSON.stringify(body),
1129
+ signal: controller.signal
1130
+ });
1131
+ } catch (err) {
1132
+ if (err instanceof Error && err.name === "AbortError") {
1133
+ throw new SIEConnectionError(`Request timeout after ${this.timeout}ms`, "timeout");
1134
+ }
1135
+ if (err instanceof TypeError) {
1136
+ throw new SIEConnectionError(`Connection failed: ${err.message}`, "connect");
1137
+ }
1138
+ throw err;
1139
+ } finally {
1140
+ clearTimeout(timeoutId);
1141
+ }
1142
+ }
1143
+ /**
1144
+ * Non-streaming chat-completion call against `/v1/chat/completions`.
1145
+ *
1146
+ * This is the OpenAI-compatible surface. The request body is forwarded
1147
+ * verbatim as JSON, so any field documented at
1148
+ * <https://platform.openai.com/docs/api-reference/chat/create> can be set;
1149
+ * the gateway will reject fields it does not yet support with
1150
+ * `400 unsupported_field`. SIE-native routing hints (`routing_key`,
1151
+ * `prompt_cache_key`) are part of the same request shape.
1152
+ *
1153
+ * Error semantics mirror `generate()`: 4xx → `RequestError`, 5xx →
1154
+ * `ServerError` (or the more specific `ModelLoadFailedError` for 502
1155
+ * `MODEL_LOAD_FAILED`), connection / timeout failures →
1156
+ * `SIEConnectionError`.
1157
+ *
1158
+ * If `req.stream === true`, this method throws `RequestError` immediately —
1159
+ * use {@link streamChatCompletions} instead. We do not auto-route because
1160
+ * the return type is fundamentally different (`Promise` vs
1161
+ * `AsyncGenerator`) and silently flipping would mis-type the call site.
1162
+ *
1163
+ * @example
1164
+ * ```typescript
1165
+ * const reply = await client.chatCompletions({
1166
+ * model: "Qwen/Qwen3-4B-Instruct-2507",
1167
+ * messages: [{ role: "user", content: "Write a haiku about the sea." }],
1168
+ * max_completion_tokens: 64,
1169
+ * });
1170
+ * console.log(reply.choices[0]?.message.content);
1171
+ * ```
1172
+ */
1173
+ async chatCompletions(req, options = {}) {
1174
+ if (req.stream === true) {
1175
+ throw new RequestError(
1176
+ "chatCompletions() cannot be used with stream:true \u2014 use streamChatCompletions() instead.",
1177
+ "invalid_request",
1178
+ 400
1179
+ );
1180
+ }
1181
+ const body = { ...req, stream: false };
1182
+ const url = `${this.baseUrl}/v1/chat/completions`;
1183
+ const headers = this.buildChatHeaders("application/json");
1184
+ const waitForCapacity = options.waitForCapacity ?? this.defaultWaitForCapacity;
1185
+ const provisionTimeoutMs = options.provisionTimeoutMs ?? this.provisionTimeout;
1186
+ const response = await withProvisioningRetry(() => this.performJsonPost(url, body, headers), {
1187
+ model: req.model,
1188
+ gpu: void 0,
1189
+ waitForCapacity,
1190
+ provisionTimeoutMs
1191
+ });
1192
+ this.checkServerVersion(response);
1193
+ const data = await response.json();
1194
+ if (data === null || typeof data !== "object") {
1195
+ throw new RequestError("Unexpected chat.completion response shape");
1196
+ }
1197
+ return data;
1198
+ }
1199
+ /**
1200
+ * Streaming chat-completion call against `/v1/chat/completions` with
1201
+ * `Accept: text/event-stream`.
1202
+ *
1203
+ * Yields `ChatCompletionChunk` events in the order the gateway emits them.
1204
+ * The terminal chunk carries `finish_reason`; if
1205
+ * `req.stream_options.include_usage === true`, a final usage-only chunk
1206
+ * (`choices: []`, populated `usage`) follows it. The generator completes
1207
+ * cleanly on the `data: [DONE]` sentinel.
1208
+ *
1209
+ * Error semantics:
1210
+ *
1211
+ * - HTTP 4xx / 5xx **before** the stream opens → throws `RequestError` /
1212
+ * `ServerError` (same as {@link chatCompletions}).
1213
+ * - A chunk containing `error: { ... }` mid-stream → throws
1214
+ * {@link SIEStreamError}. The error chunk is consumed, never yielded.
1215
+ * - `signal.abort()` mid-stream → the generator throws
1216
+ * `SIEConnectionError` and releases the underlying reader, which
1217
+ * fires `StreamCancelGuard` on the gateway side.
1218
+ *
1219
+ * `req.stream` is set to `true` automatically; any existing value is
1220
+ * overwritten. We do not validate `req.stream === false` because the
1221
+ * call-site intent is unambiguous.
1222
+ *
1223
+ * @param req The chat-completion request. See {@link ChatCompletionRequest}.
1224
+ * @param signal Optional `AbortSignal` for cooperative cancellation.
1225
+ *
1226
+ * @example
1227
+ * ```typescript
1228
+ * const controller = new AbortController();
1229
+ * try {
1230
+ * for await (const chunk of client.streamChatCompletions(
1231
+ * {
1232
+ * model: "Qwen/Qwen3-4B-Instruct-2507",
1233
+ * messages: [{ role: "user", content: "Count to ten." }],
1234
+ * stream_options: { include_usage: true },
1235
+ * },
1236
+ * controller.signal,
1237
+ * )) {
1238
+ * process.stdout.write(chunk.choices[0]?.delta.content ?? "");
1239
+ * }
1240
+ * } catch (err) {
1241
+ * if (err instanceof SIEStreamError) {
1242
+ * console.error(`mid-stream error: ${err.code} — ${err.message}`);
1243
+ * } else throw err;
1244
+ * }
1245
+ * ```
1246
+ */
1247
+ async *streamChatCompletions(req, signal) {
1248
+ const body = { ...req, stream: true };
1249
+ const url = `${this.baseUrl}/v1/chat/completions`;
1250
+ yield* this.consumeSseStream(
1251
+ url,
1252
+ body,
1253
+ req.model,
1254
+ signal,
1255
+ (chunk) => extractChatChunkError(chunk)
1256
+ );
1257
+ }
1258
+ /**
1259
+ * Streaming companion to {@link generate} — opens an SSE connection to
1260
+ * `/v1/generate/{model}` with `stream: true` and yields the SIE-native
1261
+ * chunk shape documented in
1262
+ * `packages/sie_gateway/src/handlers/sse.rs::build_generate_chunk_event`.
1263
+ *
1264
+ * The first delta carries `seq: 0` and `text_delta` populated; the
1265
+ * terminal chunk has `done: true`, `finish_reason`, and (typically)
1266
+ * `usage` + `ttft_ms`. The generator completes on the `data: [DONE]`
1267
+ * sentinel.
1268
+ *
1269
+ * Error semantics match {@link streamChatCompletions}: pre-stream HTTP
1270
+ * errors throw normally, mid-stream `error` chunks throw
1271
+ * {@link SIEStreamError}.
1272
+ *
1273
+ * @example
1274
+ * ```typescript
1275
+ * for await (const chunk of client.streamGenerate(
1276
+ * "Qwen/Qwen3-4B-Instruct-2507",
1277
+ * "Write a haiku.",
1278
+ * { maxNewTokens: 64, temperature: 0.7 },
1279
+ * )) {
1280
+ * process.stdout.write(chunk.text_delta);
1281
+ * if (chunk.done) console.log(`\nTTFT: ${chunk.ttft_ms}ms`);
1282
+ * }
1283
+ * ```
1284
+ */
1285
+ async *streamGenerate(model, prompt, options, signal) {
1286
+ const body = {
1287
+ prompt,
1288
+ max_new_tokens: options.maxNewTokens,
1289
+ temperature: options.temperature ?? 1,
1290
+ top_p: options.topP ?? 1,
1291
+ stream: true
1292
+ };
1293
+ if (options.stop !== void 0) body.stop = options.stop;
1294
+ const safeModel = model.replaceAll("/", "__");
1295
+ const url = `${this.baseUrl}/v1/generate/${encodeURIComponent(safeModel)}`;
1296
+ const { pool, gpu } = this.parseGpuParam(options.gpu);
1297
+ const waitForCapacity = options.waitForCapacity ?? this.defaultWaitForCapacity;
1298
+ yield* this.consumeSseStream(
1299
+ url,
1300
+ body,
1301
+ model,
1302
+ signal,
1303
+ (chunk) => extractGenerateChunkError(chunk),
1304
+ { pool, gpu },
1305
+ { waitForCapacity }
1306
+ );
1307
+ }
1308
+ /**
1309
+ * Shared SSE consumption helper for the streaming methods.
1310
+ *
1311
+ * Performs a pre-stream provisioning retry loop (honoring
1312
+ * `waitForCapacity`/`provisionTimeout`), surfaces pre-stream errors via
1313
+ * {@link handleError} (so callers see the same `RequestError` /
1314
+ * `ServerError` hierarchy as the non-streaming endpoints), then iterates
1315
+ * the SSE payloads via {@link parseSseStream}. Each payload is JSON-parsed;
1316
+ * if the consumer-supplied `extractError` returns an `SIEStreamError`, the
1317
+ * generator throws it instead of yielding the chunk.
1318
+ *
1319
+ * Retry policy mirrors {@link generate}: only the SAFE pre-execution
1320
+ * capacity signals — `202` (provisioning) and `503 MODEL_LOADING` — are
1321
+ * retried, and only while `waitForCapacity` is set and the provision
1322
+ * budget remains. Once the body opens we never retry (the call is
1323
+ * non-idempotent; a mid-stream failure must not re-issue generation).
1324
+ *
1325
+ * @internal
1326
+ */
1327
+ async *consumeSseStream(url, body, model, signal, extractError, routing, provisioning) {
1328
+ const headers = this.buildChatHeaders("text/event-stream");
1329
+ if (routing?.pool) headers["X-SIE-Pool"] = routing.pool;
1330
+ if (routing?.gpu) headers["X-SIE-MACHINE-PROFILE"] = routing.gpu;
1331
+ const waitForCapacity = provisioning?.waitForCapacity ?? this.defaultWaitForCapacity;
1332
+ const gpu = routing?.gpu;
1333
+ const controller = new AbortController();
1334
+ const onCallerAbort = () => controller.abort();
1335
+ if (signal) {
1336
+ if (signal.aborted) {
1337
+ throw new SIEConnectionError("Stream aborted before request", "other");
1338
+ }
1339
+ signal.addEventListener("abort", onCallerAbort, { once: true });
1340
+ }
1341
+ try {
1342
+ const startTime = Date.now();
1343
+ let response;
1344
+ while (true) {
1345
+ if (signal?.aborted) {
1346
+ throw new SIEConnectionError("Stream aborted before request", "other");
1347
+ }
1348
+ const preStreamTimeoutId = setTimeout(() => controller.abort(), this.timeout);
1349
+ let attemptResponse;
1350
+ try {
1351
+ attemptResponse = await fetch(url, {
1352
+ method: "POST",
1353
+ headers,
1354
+ body: JSON.stringify(body),
1355
+ signal: controller.signal
1356
+ });
1357
+ } catch (error) {
1358
+ if (signal?.aborted) {
1359
+ throw new SIEConnectionError("Stream aborted before response", "other");
1360
+ }
1361
+ if (error instanceof Error && error.name === "AbortError") {
1362
+ throw new SIEConnectionError(`Stream open timeout after ${this.timeout}ms`, "timeout");
1363
+ }
1364
+ if (error instanceof TypeError) {
1365
+ throw new SIEConnectionError(`Connection failed: ${error.message}`, "connect");
1366
+ }
1367
+ throw error;
1368
+ } finally {
1369
+ clearTimeout(preStreamTimeoutId);
1370
+ }
1371
+ if (attemptResponse.status === HTTP_ACCEPTED) {
1372
+ if (!waitForCapacity) {
1373
+ throw new ProvisioningError(
1374
+ "No capacity available. Server is provisioning.",
1375
+ gpu,
1376
+ getRetryAfter2(attemptResponse)
1377
+ );
1378
+ }
1379
+ const elapsed = Date.now() - startTime;
1380
+ if (elapsed >= this.provisionTimeout) {
1381
+ throw new ProvisioningError(
1382
+ `Provisioning timeout after ${elapsed}ms`,
1383
+ gpu,
1384
+ getRetryAfter2(attemptResponse)
1385
+ );
1386
+ }
1387
+ const delay = getRetryAfter2(attemptResponse) ?? DEFAULT_RETRY_DELAY;
1388
+ if (await abortableSleep(
1389
+ Math.min(delay, this.provisionTimeout - elapsed),
1390
+ controller.signal
1391
+ )) {
1392
+ throw new SIEConnectionError("Stream aborted while provisioning", "other");
1393
+ }
1394
+ continue;
1395
+ }
1396
+ await throwIfModelLoadFailed(attemptResponse, model);
1397
+ if (attemptResponse.status === 503) {
1398
+ const errorCode = await getErrorCode(attemptResponse.clone());
1399
+ if (errorCode === MODEL_LOADING_ERROR_CODE && waitForCapacity) {
1400
+ const elapsed = Date.now() - startTime;
1401
+ if (elapsed >= this.provisionTimeout) {
1402
+ throw new ModelLoadingError(`Model loading timeout for '${model}'`, model);
1403
+ }
1404
+ const delay = getRetryAfter2(attemptResponse) ?? MODEL_LOADING_DEFAULT_DELAY;
1405
+ if (await abortableSleep(
1406
+ Math.min(delay, this.provisionTimeout - elapsed),
1407
+ controller.signal
1408
+ )) {
1409
+ throw new SIEConnectionError("Stream aborted while provisioning", "other");
1410
+ }
1411
+ continue;
1412
+ }
1413
+ if (waitForCapacity) {
1414
+ const elapsed = Date.now() - startTime;
1415
+ if (elapsed < this.provisionTimeout) {
1416
+ const delay = getRetryAfter2(attemptResponse) ?? DEFAULT_RETRY_DELAY;
1417
+ if (await abortableSleep(
1418
+ Math.min(delay, this.provisionTimeout - elapsed),
1419
+ controller.signal
1420
+ )) {
1421
+ throw new SIEConnectionError("Stream aborted while provisioning", "other");
1422
+ }
1423
+ continue;
1424
+ }
1425
+ }
1426
+ }
1427
+ if (attemptResponse.status !== 200) {
1428
+ await handleError(attemptResponse);
1429
+ }
1430
+ response = attemptResponse;
1431
+ break;
1432
+ }
1433
+ if (!response) {
1434
+ throw new RequestError("Streaming request failed without producing a response");
1435
+ }
1436
+ this.checkServerVersion(response);
1437
+ const bodyStream = response.body;
1438
+ if (!bodyStream) {
1439
+ throw new RequestError("Streaming response has no body");
1440
+ }
1441
+ const reader = bodyStream.getReader();
1442
+ for await (const payload of parseSseStream(reader, signal ?? controller.signal)) {
1443
+ let chunk;
1444
+ try {
1445
+ chunk = JSON.parse(payload);
1446
+ } catch (err) {
1447
+ throw new RequestError(
1448
+ `Failed to parse SSE chunk as JSON: ${err instanceof Error ? err.message : String(err)}`
1449
+ );
1450
+ }
1451
+ const streamErr = extractError(chunk);
1452
+ if (streamErr) throw streamErr;
1453
+ yield chunk;
1454
+ }
1455
+ } finally {
1456
+ if (signal) signal.removeEventListener("abort", onCallerAbort);
1457
+ }
1458
+ }
1459
+ /**
1460
+ * Build the standard JSON header set for the chat-completions surface.
1461
+ * Pulled out so both the streaming and non-streaming paths agree on
1462
+ * auth / version / content-type wiring.
1463
+ */
1464
+ buildChatHeaders(accept) {
1465
+ const headers = {
1466
+ Accept: accept,
1467
+ "Content-Type": JSON_CONTENT_TYPE,
1468
+ [SDK_VERSION_HEADER]: SDK_VERSION
1469
+ };
1470
+ if (this.apiKey) headers.Authorization = `Bearer ${this.apiKey}`;
1471
+ return headers;
1472
+ }
772
1473
  async score(model, query, items, options = {}) {
773
1474
  const body = {
774
1475
  query,
@@ -818,6 +1519,9 @@ var SIEClient = class {
818
1519
  if (options.threshold !== void 0) {
819
1520
  params.threshold = options.threshold;
820
1521
  }
1522
+ if (options.adapterOptions !== void 0) {
1523
+ params.options = options.adapterOptions;
1524
+ }
821
1525
  body.params = params;
822
1526
  const waitForCapacity = options.waitForCapacity ?? this.defaultWaitForCapacity;
823
1527
  const { pool, gpu } = this.parseGpuParam(options.gpu);
@@ -857,17 +1561,18 @@ var SIEClient = class {
857
1561
  this.pools.clear();
858
1562
  }
859
1563
  /**
860
- * Create a resource pool for isolated capacity.
1564
+ * Create or update a resource pool for isolated capacity.
861
1565
  *
862
1566
  * Pools provide dedicated worker capacity, isolated from other clients.
863
1567
  * Workers are assigned to pools and only serve requests from that pool.
864
1568
  *
865
1569
  * @param name - Pool name (used in GPU param as "poolName/machineProfile")
866
- * @param gpus - Machine profile requirements, e.g., { "l4": 2, "l4-spot": 1 }
1570
+ * @param gpus - Optional machine profile requirements for pool readiness, e.g., { "l4": 2, "l4-spot": 1 }
1571
+ * @param gpuCaps - Optional maximum assigned workers per machine profile
867
1572
  *
868
1573
  * @example
869
1574
  * ```typescript
870
- * // Create a pool with 2 L4 GPUs
1575
+ * // Create or update a pool with 2 L4 GPUs
871
1576
  * await client.createPool("eval-bench", { l4: 2 });
872
1577
  *
873
1578
  * // Use the pool for requests
@@ -877,11 +1582,17 @@ var SIEClient = class {
877
1582
  * await client.deletePool("eval-bench");
878
1583
  * ```
879
1584
  */
880
- async createPool(name, gpus) {
881
- if (this.pools.has(name)) {
882
- return;
1585
+ async createPool(name, gpus, gpuCaps) {
1586
+ const alreadyTracking = this.pools.has(name);
1587
+ const requestBody = {
1588
+ name
1589
+ };
1590
+ if (gpus !== void 0) {
1591
+ requestBody.gpus = gpus;
1592
+ }
1593
+ if (gpuCaps) {
1594
+ requestBody.gpu_caps = gpuCaps;
883
1595
  }
884
- const requestBody = { name, gpus };
885
1596
  const url = `${this.baseUrl}/v1/pools`;
886
1597
  const headers = {
887
1598
  "Content-Type": JSON_CONTENT_TYPE,
@@ -909,6 +1620,9 @@ var SIEClient = class {
909
1620
  }
910
1621
  throw new PoolError(`Failed to create pool '${name}': ${errorMsg}`, name);
911
1622
  }
1623
+ if (alreadyTracking || this.pools.has(name)) {
1624
+ return;
1625
+ }
912
1626
  const abortController = new AbortController();
913
1627
  const poolState = {
914
1628
  timeoutId: null,
@@ -940,7 +1654,7 @@ var SIEClient = class {
940
1654
  signal: perAttempt.signal
941
1655
  });
942
1656
  if (resp.ok) break;
943
- } catch (error) {
1657
+ } catch {
944
1658
  if (abortController.signal.aborted) return;
945
1659
  } finally {
946
1660
  clearTimeout(attemptTimeout);
@@ -1177,7 +1891,7 @@ var SIEClient = class {
1177
1891
  }
1178
1892
  const remaining = timeout - elapsed;
1179
1893
  const delay = Math.min(pollInterval, remaining);
1180
- await sleep(delay);
1894
+ await sleep2(delay);
1181
1895
  }
1182
1896
  }
1183
1897
  /**
@@ -1204,7 +1918,7 @@ var SIEClient = class {
1204
1918
  if (elapsed < this.provisionTimeout) {
1205
1919
  const remaining = this.provisionTimeout - elapsed;
1206
1920
  const delay = Math.min(DEFAULT_RETRY_DELAY, remaining);
1207
- await sleep(delay);
1921
+ await sleep2(delay);
1208
1922
  continue;
1209
1923
  }
1210
1924
  }
@@ -1230,10 +1944,11 @@ var SIEClient = class {
1230
1944
  const delay = retryAfter ?? DEFAULT_RETRY_DELAY;
1231
1945
  const remaining = this.provisionTimeout - elapsed;
1232
1946
  const actualDelay = Math.min(delay, remaining);
1233
- await sleep(actualDelay);
1947
+ await sleep2(actualDelay);
1234
1948
  continue;
1235
1949
  }
1236
1950
  await throwIfModelLoadFailed(response, model);
1951
+ await throwIfInputTooLong(response, model);
1237
1952
  if (response.status === 503) {
1238
1953
  const clonedResponse = response.clone();
1239
1954
  const errorCode = await getErrorCode(clonedResponse);
@@ -1249,7 +1964,7 @@ var SIEClient = class {
1249
1964
  }
1250
1965
  const retryAfter = getRetryAfter2(response);
1251
1966
  const delay = retryAfter ?? LORA_LOADING_DEFAULT_DELAY;
1252
- await sleep(delay);
1967
+ await sleep2(delay);
1253
1968
  continue;
1254
1969
  }
1255
1970
  if (errorCode === MODEL_LOADING_ERROR_CODE) {
@@ -1264,7 +1979,7 @@ var SIEClient = class {
1264
1979
  const delay = retryAfter ?? MODEL_LOADING_DEFAULT_DELAY;
1265
1980
  const remaining = this.provisionTimeout - elapsed;
1266
1981
  const actualDelay = Math.min(delay, remaining);
1267
- await sleep(actualDelay);
1982
+ await sleep2(actualDelay);
1268
1983
  continue;
1269
1984
  }
1270
1985
  if (waitForCapacity) {
@@ -1274,7 +1989,7 @@ var SIEClient = class {
1274
1989
  const delay = retryAfter ?? DEFAULT_RETRY_DELAY;
1275
1990
  const remaining = this.provisionTimeout - elapsed;
1276
1991
  const actualDelay = Math.min(delay, remaining);
1277
- await sleep(actualDelay);
1992
+ await sleep2(actualDelay);
1278
1993
  continue;
1279
1994
  }
1280
1995
  }
@@ -1555,6 +2270,6 @@ function detectImageFormat(bytes) {
1555
2270
  return "unknown";
1556
2271
  }
1557
2272
 
1558
- export { LoraLoadingError, ModelLoadFailedError, ModelLoadingError, PoolError, ProvisioningError, RequestError, SDK_VERSION, SIEClient, SIEConnectionError, SIEError, ServerError, denseEmbedding, detectImageFormat, maxsim, maxsimBatch, maxsimDocuments, multivectorEmbedding, normalizeSparseVector, packMessage, sparseEmbedding, sparseEmbeddingMap, toFloat32Array, toImageBytes, toImageWireFormat, toNumberArray, unpackMessage };
2273
+ export { InputTooLongError, LoraLoadingError, ModelLoadFailedError, ModelLoadingError, PoolError, ProvisioningError, RequestError, SDK_VERSION, SIEClient, SIEConnectionError, SIEError, SIEStreamError, ServerError, denseEmbedding, detectImageFormat, maxsim, maxsimBatch, maxsimDocuments, multivectorEmbedding, normalizeSparseVector, packMessage, sparseEmbedding, sparseEmbeddingMap, toFloat32Array, toImageBytes, toImageWireFormat, toNumberArray, unpackMessage };
1559
2274
  //# sourceMappingURL=index.js.map
1560
2275
  //# sourceMappingURL=index.js.map