@oh-my-pi/pi-ai 14.7.1 → 14.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/package.json +3 -3
- package/src/provider-models/openai-compat.ts +104 -33
- package/src/providers/anthropic.ts +2 -144
- package/src/utils/event-stream.ts +1 -100
package/CHANGELOG.md
CHANGED
@@ -2,6 +2,14 @@
 
 ## [Unreleased]
 
+## [14.7.2] - 2026-05-06
+
+### Fixed
+
+- Fixed VLLM model discovery to use `max_model_len` as the context window when the endpoint reports it.
+- Fixed custom Ollama Cloud/local-proxy model aliases (for example `deepseek-v4-pro:cloud`) to inherit bundled cache-pricing metadata when the upstream model is known ([#937](https://github.com/can1357/oh-my-pi/issues/937)).
+- Fixed local Ollama model discovery to apply `/api/show` thinking and vision capabilities in addition to native context windows ([#928](https://github.com/can1357/oh-my-pi/issues/928)).
+
 ## [14.7.0] - 2026-05-04
 ### Breaking Changes
 
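The vLLM fix above is the smallest of the three: when the server's model listing reports `max_model_len`, that value wins over the bundled default. A rough sketch of that behaviour, assuming an OpenAI-compatible `/v1/models` endpoint that includes `max_model_len` in each entry (the path and response shape are assumptions, not shown in this diff):

```ts
// Minimal sketch: prefer the context length a vLLM server reports over a
// hard-coded default. The endpoint path and payload shape are assumptions.
interface DiscoveredModel {
	id: string;
	contextWindow: number;
}

async function discoverVllmModels(baseUrl: string, fallbackContextWindow = 32_000): Promise<DiscoveredModel[]> {
	const response = await fetch(`${baseUrl}/v1/models`);
	if (!response.ok) throw new Error(`model listing failed: ${response.status}`);
	const payload = (await response.json()) as { data?: Array<{ id: string; max_model_len?: unknown }> };
	return (payload.data ?? []).map(entry => {
		// Only trust a positive numeric max_model_len; otherwise keep the fallback.
		const reported = typeof entry.max_model_len === "number" && entry.max_model_len > 0 ? entry.max_model_len : undefined;
		return { id: entry.id, contextWindow: reported ?? fallbackContextWindow };
	});
}
```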
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
 	"type": "module",
 	"name": "@oh-my-pi/pi-ai",
-	"version": "14.7.1",
+	"version": "14.7.3",
 	"description": "Unified LLM API with automatic model discovery and provider configuration",
 	"homepage": "https://github.com/can1357/oh-my-pi",
 	"author": "Can Boluk",
@@ -46,8 +46,8 @@
 		"@aws-sdk/credential-provider-node": "^3.972.39",
 		"@bufbuild/protobuf": "^2.12.0",
 		"@google/genai": "^1.52.0",
-		"@oh-my-pi/pi-natives": "14.7.1",
-		"@oh-my-pi/pi-utils": "14.7.1",
+		"@oh-my-pi/pi-natives": "14.7.3",
+		"@oh-my-pi/pi-utils": "14.7.3",
 		"@sinclair/typebox": "^0.34.49",
 		"@smithy/node-http-handler": "^4.6.1",
 		"ajv": "^8.20.0",
package/src/provider-models/openai-compat.ts
CHANGED
@@ -1,6 +1,7 @@
 import type { ModelManagerOptions } from "../model-manager";
+import { Effort } from "../model-thinking";
 import { getBundledModels } from "../models";
-import type { Api, Model } from "../types";
+import type { Api, Model, ThinkingConfig } from "../types";
 import { isAnthropicOAuthToken, isRecord, toBoolean, toNumber, toPositiveNumber } from "../utils";
 import {
 	fetchOpenAICompatibleModels,
@@ -192,7 +193,7 @@ function toOllamaNativeBaseUrl(baseUrl: string): string {
 
 async function fetchOllamaNativeModels(
 	baseUrl: string,
-
+	resolveMetadata: (modelId: string) => Promise<OllamaResolvedMetadata>,
 ): Promise<Model<"openai-responses">[] | null> {
 	const nativeBaseUrl = toOllamaNativeBaseUrl(baseUrl);
 	let response: Response;
@@ -213,18 +214,19 @@ async function fetchOllamaNativeModels(
 		entries.map(async (entry): Promise<Model<"openai-responses"> | null> => {
 			const id = entry.model ?? entry.name;
 			if (!id) return null;
-			const
+			const metadata = await resolveMetadata(id);
 			return {
 				id,
 				name: entry.name ?? id,
 				api: "openai-responses",
 				provider: "ollama",
 				baseUrl,
-				reasoning: false,
-
+				reasoning: metadata.reasoning ?? false,
+				thinking: metadata.thinking,
+				input: metadata.input ?? ["text"],
 				cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
-				contextWindow,
-				maxTokens,
+				contextWindow: metadata.contextWindow,
+				maxTokens: metadata.maxTokens,
 			};
 		}),
 	);
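For reference, roughly what the updated mapper above would now produce for a local model whose `/api/show` metadata reports vision but not thinking. The field names come from the hunk; the id, base URL, and numbers are invented for illustration.

```ts
// Illustrative only: a hypothetical record built by the updated Ollama mapper.
const exampleModel = {
	id: "llava:13b",
	name: "llava:13b",
	api: "openai-responses",
	provider: "ollama",
	baseUrl: "http://127.0.0.1:11434/v1",
	reasoning: false,                       // no "thinking" capability reported
	input: ["text", "image"] as const,      // "vision" capability reported
	cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
	contextWindow: 32_768,                  // from model_info "*.context_length"
	maxTokens: 8192,                        // OLLAMA_DEFAULT_MAX_TOKENS
};
```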
@@ -241,18 +243,65 @@ const OLLAMA_FALLBACK_CONTEXT_WINDOW = 128_000;
 /** Cap max output tokens at a value that matches OMP's other openai-responses defaults. */
 const OLLAMA_DEFAULT_MAX_TOKENS = 8192;
 
-interface
+interface OllamaResolvedMetadata {
 	contextWindow: number;
 	maxTokens: number;
+	capabilities?: string[];
+	reasoning?: boolean;
+	thinking?: ThinkingConfig;
+	input?: ("text" | "image")[];
+}
+
+interface OllamaShowMetadata {
+	contextWindow?: number;
+	maxTokens?: number;
+	capabilities?: string[];
+	reasoning?: boolean;
+	thinking?: ThinkingConfig;
+	input?: ("text" | "image")[];
+}
+
+function getOllamaContextWindow(modelInfo: Record<string, unknown> | undefined): number | undefined {
+	if (!modelInfo) {
+		return undefined;
+	}
+	for (const [key, value] of Object.entries(modelInfo)) {
+		if (typeof value !== "number" || value <= 0) {
+			continue;
+		}
+		if (key.endsWith(".context_length") || key.endsWith(".num_ctx") || key.endsWith(".context_window")) {
+			return value;
+		}
+	}
+}
+
+function getOllamaCapabilities(value: unknown): string[] | undefined {
+	if (!Array.isArray(value)) {
+		return undefined;
+	}
+	return value.filter((item): item is string => typeof item === "string");
+}
+
+function getOllamaThinkingConfig(capabilities: string[] | undefined): ThinkingConfig | undefined {
+	if (!capabilities?.includes("thinking")) {
+		return undefined;
+	}
+	return {
+		mode: "effort",
+		minLevel: Effort.Minimal,
+		maxLevel: Effort.High,
+	};
 }
 
 /**
- * Query Ollama's `/api/show` endpoint for a single model and pull
- * context
- *
- * unavailable so callers can layer their own fallback.
+ * Query Ollama's `/api/show` endpoint for a single model and pull native
+ * context and capability metadata from the response. Returns `undefined` when
+ * the endpoint is unavailable so callers can layer their own fallback.
  */
-async function
+async function fetchOllamaShowMetadata(
+	nativeBaseUrl: string,
+	modelId: string,
+): Promise<OllamaShowMetadata | undefined> {
 	try {
 		const response = await fetch(`${nativeBaseUrl}/api/show`, {
 			method: "POST",
@@ -262,13 +311,21 @@ async function fetchOllamaShowLimits(nativeBaseUrl: string, modelId: string): Pr
 		if (!response.ok) {
 			return undefined;
 		}
-		const payload = (await response.json()) as { model_info?: Record<string, unknown> };
-		const
-
-
-
-
-
+		const payload = (await response.json()) as { capabilities?: unknown; model_info?: Record<string, unknown> };
+		const capabilities = getOllamaCapabilities(payload.capabilities);
+		const contextWindow = getOllamaContextWindow(payload.model_info);
+		return {
+			contextWindow,
+			maxTokens: contextWindow ? OLLAMA_DEFAULT_MAX_TOKENS : undefined,
+			capabilities,
+			reasoning: capabilities ? capabilities.includes("thinking") : undefined,
+			thinking: getOllamaThinkingConfig(capabilities),
+			input: capabilities
+				? capabilities.includes("vision")
+					? (["text", "image"] as Array<"text" | "image">)
+					: (["text"] as Array<"text">)
+				: undefined,
+		};
 	} catch {
 		// fall through; caller decides on the fallback
 	}
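To make the new helpers concrete, here is an illustrative `/api/show`-style payload and the metadata they would derive from it. The field names (`capabilities`, `model_info`, the `.context_length` suffix) mirror the hunk above; the concrete values are invented.

```ts
// Illustrative only: a hand-written /api/show-style payload and the values the
// helpers above would extract from it.
const samplePayload = {
	capabilities: ["completion", "thinking", "vision"],
	model_info: { "llama.context_length": 131_072 },
};

const capabilities = samplePayload.capabilities;
// Same lookup rule as getOllamaContextWindow: first positive numeric entry
// whose key ends with a known context-length suffix.
const contextWindow = Object.entries(samplePayload.model_info).find(
	([key, value]) => typeof value === "number" && value > 0 && key.endsWith(".context_length"),
)?.[1];

const derived = {
	contextWindow,                                 // 131072
	reasoning: capabilities.includes("thinking"),  // true
	input: capabilities.includes("vision") ? ["text", "image"] : ["text"],
};
console.log(derived);
```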
@@ -276,23 +333,27 @@ async function fetchOllamaShowLimits(nativeBaseUrl: string, modelId: string): Pr
 	}
 }
 
 /**
- * Build a resolver that fetches `/api/show`
- * result in-memory for the lifetime of the manager. Successful lookups are
+ * Build a resolver that fetches `/api/show` metadata per model id and caches
+ * the result in-memory for the lifetime of the manager. Successful lookups are
  * cached so repeated `fetchDynamicModels` calls do not refetch; failed
  * lookups stay uncached so a later refresh can recover.
  */
-function
-	const cache = new Map<string, Promise<
+function createOllamaMetadataResolver(nativeBaseUrl: string): (modelId: string) => Promise<OllamaResolvedMetadata> {
+	const cache = new Map<string, Promise<OllamaResolvedMetadata>>();
 	return modelId => {
 		const cached = cache.get(modelId);
 		if (cached) return cached;
 		const pending = (async () => {
-			const
-			if (!
+			const metadata = await fetchOllamaShowMetadata(nativeBaseUrl, modelId);
+			if (!metadata) {
 				cache.delete(modelId);
 				return { contextWindow: OLLAMA_FALLBACK_CONTEXT_WINDOW, maxTokens: OLLAMA_DEFAULT_MAX_TOKENS };
 			}
-			return
+			return {
+				...metadata,
+				contextWindow: metadata.contextWindow ?? OLLAMA_FALLBACK_CONTEXT_WINDOW,
+				maxTokens: metadata.maxTokens ?? OLLAMA_DEFAULT_MAX_TOKENS,
+			};
 		})();
 		cache.set(modelId, pending);
 		void pending.catch(() => cache.delete(modelId));
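The resolver above follows a common memoization pattern for async lookups: cache the in-flight promise, keep it on success, and evict it on failure so a later refresh can retry. A generic sketch of that pattern, with illustrative names not taken from the package:

```ts
// Generic version of the caching pattern used by the resolver above: memoize
// in-flight and successful lookups, evict rejected promises so the next call
// retries instead of replaying a permanently failed result.
function memoizeAsync<T>(lookup: (key: string) => Promise<T>): (key: string) => Promise<T> {
	const cache = new Map<string, Promise<T>>();
	return key => {
		const cached = cache.get(key);
		if (cached) return cached;
		const pending = lookup(key);
		cache.set(key, pending);
		// Drop failed entries so a later refresh can recover.
		void pending.catch(() => cache.delete(key));
		return pending;
	};
}
```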
@@ -702,7 +763,7 @@ export function ollamaModelManagerOptions(config?: OllamaModelManagerConfig): Mo
 	const baseUrl = normalizeOllamaBaseUrl(config?.baseUrl);
 	const nativeBaseUrl = toOllamaNativeBaseUrl(baseUrl);
 	const references = createBundledReferenceMap<"openai-responses">("ollama" as Parameters<typeof getBundledModels>[0]);
-	const
+	const resolveMetadata = createOllamaMetadataResolver(nativeBaseUrl);
 	return {
 		providerId: "ollama",
 		fetchDynamicModels: async () => {
@@ -727,13 +788,20 @@ export function ollamaModelManagerOptions(config?: OllamaModelManagerConfig): Mo
 			if (openAiCompatible && openAiCompatible.length > 0) {
 				await Promise.all(
 					openAiCompatible.map(async model => {
-						const
-						model.contextWindow =
+						const metadata = await resolveMetadata(model.id);
+						model.contextWindow = metadata.contextWindow;
+						if (metadata.reasoning !== undefined) {
+							model.reasoning = metadata.reasoning;
+							model.thinking = metadata.thinking;
+						}
+						if (metadata.input) {
+							model.input = metadata.input;
+						}
 					}),
 				);
 				return openAiCompatible;
 			}
-			const nativeFallback = await fetchOllamaNativeModels(baseUrl,
+			const nativeFallback = await fetchOllamaNativeModels(baseUrl, resolveMetadata);
 			if (nativeFallback && nativeFallback.length > 0) {
 				return nativeFallback;
 			}
@@ -1407,8 +1475,11 @@ export function vllmModelManagerOptions(config?: VllmModelManagerConfig): ModelM
 			baseUrl,
 			apiKey,
 			mapModel: (entry, defaults) => {
-				const
-				return
+				const model = mapWithBundledReference(entry, defaults, references.get(defaults.id));
+				return {
+					...model,
+					contextWindow: toPositiveNumber(entry.max_model_len, model.contextWindow),
+				};
 			},
 		}),
 	};
package/src/providers/anthropic.ts
CHANGED
@@ -8,7 +8,7 @@ import type {
 	MessageParam,
 	RawMessageStreamEvent,
 } from "@anthropic-ai/sdk/resources/messages";
-import { $env, abortableSleep, isEnoent } from "@oh-my-pi/pi-utils";
+import { $env, abortableSleep, isEnoent, readSseEvents } from "@oh-my-pi/pi-utils";
 import { hasOpus47ApiRestrictions, mapEffortToAnthropicAdaptiveEffort } from "../model-thinking";
 import { calculateCost } from "../models";
 import { getEnvApiKey, OUTPUT_FALLBACK_BUFFER } from "../stream";
@@ -658,18 +658,6 @@ function mergeHeaders(...headerSources: (Record<string, string> | undefined)[]):
 // We surface the resulting provider error ourselves, so keep the SDK quiet.
 const ANTHROPIC_SDK_LOG_LEVEL = "off" as const;
 
-interface ServerSentEvent {
-	event: string | null;
-	data: string;
-	raw: string[];
-}
-
-interface SseDecoderState {
-	event: string | null;
-	data: string[];
-	raw: string[];
-}
-
 const ANTHROPIC_MESSAGE_EVENTS: ReadonlySet<string> = new Set([
 	"message_start",
 	"message_delta",
@@ -679,136 +667,6 @@ const ANTHROPIC_MESSAGE_EVENTS: ReadonlySet<string> = new Set([
 	"content_block_stop",
 ]);
 
-function flushSseEvent(state: SseDecoderState): ServerSentEvent | null {
-	if (!state.event && state.data.length === 0) {
-		return null;
-	}
-
-	const event: ServerSentEvent = {
-		event: state.event,
-		data: state.data.join("\n"),
-		raw: [...state.raw],
-	};
-	state.event = null;
-	state.data = [];
-	state.raw = [];
-	return event;
-}
-
-function decodeSseLine(line: string, state: SseDecoderState): ServerSentEvent | null {
-	if (line === "") {
-		return flushSseEvent(state);
-	}
-
-	state.raw.push(line);
-	if (line.startsWith(":")) {
-		return null;
-	}
-
-	const delimiterIndex = line.indexOf(":");
-	const fieldName = delimiterIndex === -1 ? line : line.slice(0, delimiterIndex);
-	let value = delimiterIndex === -1 ? "" : line.slice(delimiterIndex + 1);
-	if (value.startsWith(" ")) {
-		value = value.slice(1);
-	}
-
-	if (fieldName === "event") {
-		state.event = value;
-	} else if (fieldName === "data") {
-		state.data.push(value);
-	}
-
-	return null;
-}
-
-function nextLineBreakIndex(text: string): number {
-	const carriageReturnIndex = text.indexOf("\r");
-	const newlineIndex = text.indexOf("\n");
-	if (carriageReturnIndex === -1) {
-		return newlineIndex;
-	}
-	if (newlineIndex === -1) {
-		return carriageReturnIndex;
-	}
-	return Math.min(carriageReturnIndex, newlineIndex);
-}
-
-function consumeLine(text: string): { line: string; rest: string } | null {
-	const lineBreakIndex = nextLineBreakIndex(text);
-	if (lineBreakIndex === -1) {
-		return null;
-	}
-
-	let nextIndex = lineBreakIndex + 1;
-	if (text[lineBreakIndex] === "\r" && text[nextIndex] === "\n") {
-		nextIndex += 1;
-	}
-
-	return {
-		line: text.slice(0, lineBreakIndex),
-		rest: text.slice(nextIndex),
-	};
-}
-
-async function* iterateSseMessages(
-	body: ReadableStream<Uint8Array>,
-	signal?: AbortSignal,
-): AsyncGenerator<ServerSentEvent> {
-	const reader = body.getReader();
-	const decoder = new TextDecoder();
-	const state: SseDecoderState = { event: null, data: [], raw: [] };
-	let buffer = "";
-
-	try {
-		while (true) {
-			if (signal?.aborted) {
-				throw new Error("Request was aborted");
-			}
-
-			const { value, done } = await reader.read();
-			if (done) {
-				break;
-			}
-
-			buffer += decoder.decode(value, { stream: true });
-			let consumed = consumeLine(buffer);
-			while (consumed) {
-				buffer = consumed.rest;
-				const event = decodeSseLine(consumed.line, state);
-				if (event) {
-					yield event;
-				}
-				consumed = consumeLine(buffer);
-			}
-		}
-
-		buffer += decoder.decode();
-		let consumed = consumeLine(buffer);
-		while (consumed) {
-			buffer = consumed.rest;
-			const event = decodeSseLine(consumed.line, state);
-			if (event) {
-				yield event;
-			}
-			consumed = consumeLine(buffer);
-		}
-
-		if (buffer.length > 0) {
-			const event = decodeSseLine(buffer, state);
-			if (event) {
-				yield event;
-			}
-		}
-
-		const trailingEvent = flushSseEvent(state);
-		if (trailingEvent) {
-			yield trailingEvent;
-		}
-	} finally {
-		reader.releaseLock();
-	}
-}
-
 async function* iterateAnthropicEvents(
 	response: Response,
 	signal?: AbortSignal,
@@ -820,7 +678,7 @@ async function* iterateAnthropicEvents(
 	let sawMessageStart = false;
 	let sawMessageEnd = false;
 
-	for await (const sse of
+	for await (const sse of readSseEvents(response.body, signal)) {
 		if (sse.event === "error") {
 			throw new Error(sse.data);
 		}
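The hand-rolled SSE decoder is replaced by `readSseEvents` from `@oh-my-pi/pi-utils`. Only the call shape visible in the hunk is known from this diff: a `ReadableStream` body plus an optional `AbortSignal`, yielding events with `event` and `data` fields. With that assumption, consuming a provider stream looks roughly like:

```ts
// Sketch of consuming a provider SSE stream through the shared helper the diff
// switches to; everything beyond the call shape shown in the hunk is assumed.
import { readSseEvents } from "@oh-my-pi/pi-utils";

async function logAnthropicEvents(response: Response, signal?: AbortSignal): Promise<void> {
	if (!response.body) throw new Error("response has no body");
	for await (const sse of readSseEvents(response.body, signal)) {
		if (sse.event === "error") throw new Error(sse.data);
		// message_start / content_block_delta / message_stop, etc.
		// Assumes each data payload is JSON, as Anthropic's stream sends it.
		console.log(sse.event, JSON.parse(sse.data));
	}
}
```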
package/src/utils/event-stream.ts
CHANGED
@@ -105,23 +105,7 @@ export class EventStream<T, R = T> implements AsyncIterable<T> {
 	}
 }
 
-// Delta events that can be batched for throttling
-type DeltaEvent =
-	| { type: "text_delta"; contentIndex: number; delta: string; partial: AssistantMessage }
-	| { type: "thinking_delta"; contentIndex: number; delta: string; partial: AssistantMessage }
-	| { type: "toolcall_delta"; contentIndex: number; delta: string; partial: AssistantMessage };
-
-function isDeltaEvent(event: AssistantMessageEvent): event is DeltaEvent {
-	return event.type === "text_delta" || event.type === "thinking_delta" || event.type === "toolcall_delta";
-}
-
 export class AssistantMessageEventStream extends EventStream<AssistantMessageEvent, AssistantMessage> {
-	// Throttling state
-	#deltaBuffer: DeltaEvent[] = [];
-	#flushTimer?: NodeJS.Timeout;
-	#lastFlushTime = 0;
-	readonly #throttleMs = 50; // 20 updates/sec
-
 	constructor() {
 		super(
 			event => event.type === "done" || event.type === "error",
@@ -139,103 +123,20 @@ export class AssistantMessageEventStream extends EventStream<AssistantMessageEve
 	override push(event: AssistantMessageEvent): void {
 		if (this.done) return;
 
-		//
+		// Completion resolves the final result and still emits the terminal event.
 		if (this.isComplete(event)) {
-			this.#flushDeltas(); // Flush any pending deltas before completing
 			this.done = true;
 			this.resolveFinalResult(this.extractResult(event));
 		}
 
-		// Delta events get batched and throttled
-		if (isDeltaEvent(event)) {
-			this.#deltaBuffer.push(event);
-			this.#scheduleFlush();
-			return;
-		}
-
-		// Non-delta events flush pending deltas immediately, then emit
-		this.#flushDeltas();
 		this.deliver(event);
 	}
 
 	override end(result?: AssistantMessage): void {
-		this.#flushDeltas();
 		this.done = true;
 		if (result !== undefined) {
 			this.resolveFinalResult(result);
 		}
 		this.endWaiting();
 	}
-
-	override fail(err: unknown): void {
-		if (this.#flushTimer) {
-			clearTimeout(this.#flushTimer);
-			this.#flushTimer = undefined;
-		}
-		this.#deltaBuffer = [];
-		super.fail(err);
-	}
-
-	#scheduleFlush(): void {
-		if (this.#flushTimer) return; // Already scheduled
-
-		const now = Bun.nanoseconds();
-		const timeSinceLastFlush = (now - this.#lastFlushTime) / 1e6;
-
-		if (timeSinceLastFlush >= this.#throttleMs) {
-			// Flush immediately if throttle window has passed
-			this.#flushDeltas();
-		} else {
-			// Schedule flush for when throttle window expires
-			const delay = this.#throttleMs - timeSinceLastFlush;
-			this.#flushTimer = setTimeout(() => {
-				this.#flushTimer = undefined;
-				this.#flushDeltas();
-			}, delay);
-		}
-	}
-
-	#flushDeltas(): void {
-		if (this.#flushTimer) {
-			clearTimeout(this.#flushTimer);
-			this.#flushTimer = undefined;
-		}
-
-		if (this.#deltaBuffer.length === 0) return;
-
-		// Merge consecutive deltas for the same content block and type
-		const merged = this.#mergeDeltas(this.#deltaBuffer);
-		this.#deltaBuffer = [];
-		this.#lastFlushTime = Bun.nanoseconds();
-
-		for (const event of merged) {
-			this.deliver(event);
-		}
-	}
-
-	#mergeDeltas(deltas: DeltaEvent[]): AssistantMessageEvent[] {
-		if (deltas.length === 0) return [];
-		if (deltas.length === 1) return [deltas[0]];
-
-		const result: AssistantMessageEvent[] = [];
-		let current = deltas[0];
-
-		for (let i = 1; i < deltas.length; i++) {
-			const next = deltas[i];
-			// Can merge if same type, same content index
-			if (next.type === current.type && next.contentIndex === current.contentIndex) {
-				current = {
-					...current,
-					delta: current.delta + next.delta,
-					partial: next.partial, // Use latest partial
-				} as DeltaEvent;
-			} else {
-				result.push(current);
-				current = next;
-			}
-		}
-		result.push(current);
-
-		return result;
-	}
 }