@blockrun/franklin 3.24.0 → 3.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/llm.js +17 -4
- package/package.json +1 -1
package/dist/agent/llm.js
CHANGED
|
@@ -79,6 +79,11 @@ function getModelRequestTimeoutMs() {
|
|
|
79
79
|
180_000);
|
|
80
80
|
}
|
|
81
81
|
function getModelStreamIdleTimeoutMs() {
|
|
82
|
+
// Inter-chunk idle budget: the max gap allowed *between* SSE chunks once the
|
|
83
|
+
// stream is flowing. It does NOT cover time-to-first-token — that first read
|
|
84
|
+
// uses the larger request budget (see getModelRequestTimeoutMs + the
|
|
85
|
+
// firstRead branch in parseSSEStream). Conflating the two regressed #74:
|
|
86
|
+
// reasoning models taking 60–120s to first token aborted at this 90s wall.
|
|
82
87
|
return (parseTimeoutEnv('FRANKLIN_MODEL_STREAM_IDLE_TIMEOUT_MS') ??
|
|
83
88
|
parseTimeoutEnv('FRANKLIN_MODEL_IDLE_TIMEOUT_MS') ??
|
|
84
89
|
90_000);
|
|
@@ -597,8 +602,11 @@ export class ModelClient {
|
|
|
597
602
|
yield* this.parseNonStreamingMessage(response, request.model);
|
|
598
603
|
return;
|
|
599
604
|
}
|
|
600
|
-
// Parse SSE stream
|
|
601
|
-
|
|
605
|
+
// Parse SSE stream. The first read waits for time-to-first-token (which
|
|
606
|
+
// the gateway does *not* cover with the request timeout — it flushes SSE
|
|
607
|
+
// headers before the first content chunk), so it gets the larger request
|
|
608
|
+
// budget; subsequent reads use the tighter stream-idle budget.
|
|
609
|
+
yield* this.parseSSEStream(response, requestController, streamTimeoutMs, request.model, requestTimeoutMs);
|
|
602
610
|
}
|
|
603
611
|
finally {
|
|
604
612
|
unlinkAbort();
|
|
@@ -1087,7 +1095,7 @@ export class ModelClient {
|
|
|
1087
1095
|
return header;
|
|
1088
1096
|
}
|
|
1089
1097
|
// ─── SSE Parsing ───────────────────────────────────────────────────────
|
|
1090
|
-
async *parseSSEStream(response, controller, timeoutMs, model) {
|
|
1098
|
+
async *parseSSEStream(response, controller, timeoutMs, model, firstReadTimeoutMs = timeoutMs) {
|
|
1091
1099
|
const reader = response.body?.getReader();
|
|
1092
1100
|
if (!reader) {
|
|
1093
1101
|
yield { kind: 'error', payload: { message: 'No response body' } };
|
|
@@ -1097,12 +1105,17 @@ export class ModelClient {
|
|
|
1097
1105
|
let buffer = '';
|
|
1098
1106
|
// Persist across read() calls — event: and data: may arrive in separate chunks
|
|
1099
1107
|
let currentEvent = '';
|
|
1108
|
+
// The first read waits for time-to-first-token (60–120s for reasoning
|
|
1109
|
+
// models on cache-cold prompts); only later reads measure inter-chunk idle.
|
|
1110
|
+
let firstRead = true;
|
|
1100
1111
|
const MAX_BUFFER = 1_000_000; // 1MB buffer cap
|
|
1101
1112
|
try {
|
|
1102
1113
|
while (true) {
|
|
1103
1114
|
if (controller.signal.aborted)
|
|
1104
1115
|
break;
|
|
1105
|
-
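const { done, value } = await withAbortableTimeout(() => reader.read(), controller, createModelTimeoutError('stream', model, timeoutMs), timeoutMs);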
|
|
1116
|
+
const budgetMs = firstRead ? firstReadTimeoutMs : timeoutMs;
|
|
1117
|
+
firstRead = false;
|
|
1118
|
+
const { done, value } = await withAbortableTimeout(() => reader.read(), controller, createModelTimeoutError('stream', model, budgetMs), budgetMs);
|
|
1106
1119
|
if (done)
|
|
1107
1120
|
break;
|
|
1108
1121
|
buffer += decoder.decode(value, { stream: true });
|
package/package.json
CHANGED