@hebo-ai/gateway 0.4.0-beta.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -0
- package/dist/endpoints/chat-completions/converters.d.ts +3 -3
- package/dist/endpoints/chat-completions/converters.js +15 -7
- package/dist/endpoints/chat-completions/handler.js +9 -9
- package/dist/endpoints/chat-completions/otel.js +10 -4
- package/dist/endpoints/embeddings/handler.js +5 -4
- package/dist/errors/gateway.d.ts +1 -1
- package/dist/errors/gateway.js +3 -3
- package/dist/errors/openai.js +2 -1
- package/dist/errors/utils.d.ts +2 -1
- package/dist/errors/utils.js +1 -0
- package/dist/lifecycle.js +14 -6
- package/dist/telemetry/gen-ai.d.ts +2 -1
- package/dist/telemetry/gen-ai.js +21 -3
- package/dist/telemetry/memory.d.ts +2 -0
- package/dist/telemetry/memory.js +27 -0
- package/dist/telemetry/span.js +1 -1
- package/dist/telemetry/stream.d.ts +1 -1
- package/dist/telemetry/stream.js +25 -28
- package/dist/types.d.ts +2 -3
- package/package.json +2 -1
- package/src/endpoints/chat-completions/converters.ts +17 -10
- package/src/endpoints/chat-completions/handler.ts +13 -9
- package/src/endpoints/chat-completions/otel.ts +11 -4
- package/src/endpoints/embeddings/handler.ts +9 -4
- package/src/errors/gateway.ts +5 -4
- package/src/errors/openai.ts +2 -1
- package/src/errors/utils.ts +1 -0
- package/src/lifecycle.ts +17 -6
- package/src/telemetry/gen-ai.ts +31 -3
- package/src/telemetry/memory.ts +36 -0
- package/src/telemetry/span.ts +1 -1
- package/src/telemetry/stream.ts +31 -31
- package/src/types.ts +3 -6
package/README.md
CHANGED
|
@@ -19,6 +19,7 @@ Learn more in our blog post: [Yet Another AI Gateway?](https://hebo.ai/blog/2601
|
|
|
19
19
|
- 🗂️ Model catalog with extensible metadata capabilities.
|
|
20
20
|
- 🪝 Hook system to customize routing, auth, rate limits, and shape responses.
|
|
21
21
|
- 🧰 Low-level OpenAI-compatible schema, converters, and middleware helpers.
|
|
22
|
+
- 👁️ OpenTelemetry support for GenAI semantic conventions (Langfuse-compatible).
|
|
22
23
|
|
|
23
24
|
## 📦 Installation
|
|
24
25
|
|
|
@@ -641,6 +642,32 @@ https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/
|
|
|
641
642
|
|
|
642
643
|
For observability integrations that are not OTel-compliant, you can disable built-in telemetry and manually instrument requests during `before` / `after` hooks.
|
|
643
644
|
|
|
645
|
+
#### Langfuse
|
|
646
|
+
|
|
647
|
+
Hebo telemetry spans are OpenTelemetry-compatible, so you can send them to Langfuse via `@langfuse/otel`.
|
|
648
|
+
|
|
649
|
+
```ts
|
|
650
|
+
import { gateway } from "@hebo-ai/gateway";
|
|
651
|
+
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
652
|
+
import { context } from "@opentelemetry/api";
|
|
653
|
+
import { AsyncLocalStorageContextManager } from "@opentelemetry/context-async-hooks";
|
|
654
|
+
import { BasicTracerProvider } from "@opentelemetry/sdk-trace-base";
|
|
655
|
+
|
|
656
|
+
context.setGlobalContextManager(new AsyncLocalStorageContextManager().enable());
|
|
657
|
+
|
|
658
|
+
const gw = gateway({
|
|
659
|
+
// ...
|
|
660
|
+
telemetry: {
|
|
661
|
+
enabled: true,
|
|
662
|
+
tracer: new BasicTracerProvider({
|
|
663
|
+
spanProcessors: [new LangfuseSpanProcessor()],
|
|
664
|
+
}).getTracer("hebo"),
|
|
665
|
+
},
|
|
666
|
+
});
|
|
667
|
+
```
|
|
668
|
+
|
|
669
|
+
Langfuse credentials are read from environment variables by the Langfuse OTel SDK (`LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, `LANGFUSE_BASE_URL`).
|
|
670
|
+
|
|
644
671
|
### Passing Framework State to Hooks
|
|
645
672
|
|
|
646
673
|
You can pass per-request info from your framework into the gateway via the second `state` argument on the handler, then read it in hooks through `ctx.state`.
|
|
@@ -25,10 +25,10 @@ export declare const convertToToolSet: (tools: ChatCompletionsTool[] | undefined
|
|
|
25
25
|
export declare const convertToToolChoice: (toolChoice: ChatCompletionsToolChoice | undefined) => ToolChoice<ToolSet> | undefined;
|
|
26
26
|
export declare function toChatCompletions(result: GenerateTextResult<ToolSet, Output.Output>, model: string): ChatCompletions;
|
|
27
27
|
export declare function toChatCompletionsResponse(result: GenerateTextResult<ToolSet, Output.Output>, model: string, responseInit?: ResponseInit): Response;
|
|
28
|
-
export declare function toChatCompletionsStream(result: StreamTextResult<ToolSet, Output.Output>, model: string): ReadableStream<ChatCompletionsChunk | OpenAIError>;
|
|
28
|
+
export declare function toChatCompletionsStream<E extends boolean = false>(result: StreamTextResult<ToolSet, Output.Output>, model: string, wrapErrors?: E): ReadableStream<ChatCompletionsChunk | (E extends true ? OpenAIError : Error)>;
|
|
29
29
|
export declare function toChatCompletionsStreamResponse(result: StreamTextResult<ToolSet, Output.Output>, model: string, responseInit?: ResponseInit): Response;
|
|
30
|
-
export declare class ChatCompletionsStream extends TransformStream<TextStreamPart<ToolSet>, ChatCompletionsChunk | OpenAIError> {
|
|
31
|
-
constructor(model: string);
|
|
30
|
+
export declare class ChatCompletionsStream<E extends boolean = false> extends TransformStream<TextStreamPart<ToolSet>, ChatCompletionsChunk | (E extends true ? OpenAIError : Error)> {
|
|
31
|
+
constructor(model: string, wrapErrors?: E);
|
|
32
32
|
}
|
|
33
33
|
export declare const toChatCompletionsAssistantMessage: (result: GenerateTextResult<ToolSet, Output.Output>) => ChatCompletionsAssistantMessage;
|
|
34
34
|
export declare function toReasoningDetail(reasoning: ReasoningOutput, id: string, index: number): ChatCompletionsReasoningDetail;
|
|
@@ -275,14 +275,14 @@ export function toChatCompletions(result, model) {
|
|
|
275
275
|
export function toChatCompletionsResponse(result, model, responseInit) {
|
|
276
276
|
return toResponse(toChatCompletions(result, model), responseInit);
|
|
277
277
|
}
|
|
278
|
-
export function toChatCompletionsStream(result, model) {
|
|
279
|
-
return result.fullStream.pipeThrough(new ChatCompletionsStream(model));
|
|
278
|
+
export function toChatCompletionsStream(result, model, wrapErrors) {
|
|
279
|
+
return result.fullStream.pipeThrough(new ChatCompletionsStream(model, wrapErrors));
|
|
280
280
|
}
|
|
281
281
|
export function toChatCompletionsStreamResponse(result, model, responseInit) {
|
|
282
|
-
return toResponse(toChatCompletionsStream(result, model), responseInit);
|
|
282
|
+
return toResponse(toChatCompletionsStream(result, model, true), responseInit);
|
|
283
283
|
}
|
|
284
284
|
export class ChatCompletionsStream extends TransformStream {
|
|
285
|
-
constructor(model) {
|
|
285
|
+
constructor(model, wrapErrors) {
|
|
286
286
|
const streamId = `chatcmpl-${crypto.randomUUID()}`;
|
|
287
287
|
const creationTime = Math.floor(Date.now() / 1000);
|
|
288
288
|
let toolCallIndexCounter = 0;
|
|
@@ -348,9 +348,17 @@ export class ChatCompletionsStream extends TransformStream {
|
|
|
348
348
|
break;
|
|
349
349
|
}
|
|
350
350
|
case "error": {
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
351
|
+
let err;
|
|
352
|
+
if (wrapErrors) {
|
|
353
|
+
err = toOpenAIError(part.error);
|
|
354
|
+
}
|
|
355
|
+
else if (part.error instanceof Error) {
|
|
356
|
+
err = part.error;
|
|
357
|
+
}
|
|
358
|
+
else {
|
|
359
|
+
err = new Error(String(part.error));
|
|
360
|
+
}
|
|
361
|
+
controller.enqueue(err);
|
|
354
362
|
}
|
|
355
363
|
}
|
|
356
364
|
},
|
|
@@ -5,7 +5,7 @@ import { winterCgHandler } from "../../lifecycle";
|
|
|
5
5
|
import { logger } from "../../logger";
|
|
6
6
|
import { modelMiddlewareMatcher } from "../../middleware/matcher";
|
|
7
7
|
import { resolveProvider } from "../../providers/registry";
|
|
8
|
-
import { recordRequestDuration, recordTokenUsage } from "../../telemetry/gen-ai";
|
|
8
|
+
import { recordRequestDuration, recordTimePerOutputToken, recordTokenUsage, } from "../../telemetry/gen-ai";
|
|
9
9
|
import { addSpanEvent, setSpanAttributes } from "../../telemetry/span";
|
|
10
10
|
import { resolveRequestId } from "../../utils/headers";
|
|
11
11
|
import { prepareForwardHeaders } from "../../utils/request";
|
|
@@ -33,8 +33,8 @@ export const chatCompletions = (config) => {
|
|
|
33
33
|
addSpanEvent("hebo.request.deserialized");
|
|
34
34
|
const parsed = ChatCompletionsBodySchema.safeParse(ctx.body);
|
|
35
35
|
if (!parsed.success) {
|
|
36
|
-
// FUTURE:
|
|
37
|
-
throw new GatewayError(z.prettifyError(parsed.error), 400);
|
|
36
|
+
// FUTURE: consider adding body shape to metadata
|
|
37
|
+
throw new GatewayError(z.prettifyError(parsed.error), 400, undefined, parsed.error);
|
|
38
38
|
}
|
|
39
39
|
ctx.body = parsed.data;
|
|
40
40
|
addSpanEvent("hebo.request.parsed");
|
|
@@ -84,13 +84,12 @@ export const chatCompletions = (config) => {
|
|
|
84
84
|
const result = streamText({
|
|
85
85
|
model: languageModelWithMiddleware,
|
|
86
86
|
headers: prepareForwardHeaders(ctx.request),
|
|
87
|
-
|
|
88
|
-
// abortSignal: ctx.request.signal,
|
|
87
|
+
abortSignal: ctx.request.signal,
|
|
89
88
|
timeout: {
|
|
90
89
|
totalMs: 5 * 60 * 1000,
|
|
91
90
|
},
|
|
92
91
|
onAbort: () => {
|
|
93
|
-
throw new DOMException("
|
|
92
|
+
throw new DOMException("The operation was aborted.", "AbortError");
|
|
94
93
|
},
|
|
95
94
|
onError: () => { },
|
|
96
95
|
onFinish: (res) => {
|
|
@@ -100,7 +99,8 @@ export const chatCompletions = (config) => {
|
|
|
100
99
|
const genAiResponseAttrs = getChatResponseAttributes(streamResult, genAiSignalLevel);
|
|
101
100
|
setSpanAttributes(genAiResponseAttrs);
|
|
102
101
|
recordTokenUsage(genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
|
|
103
|
-
|
|
102
|
+
recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
|
|
103
|
+
recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
|
|
104
104
|
},
|
|
105
105
|
experimental_include: {
|
|
106
106
|
requestBody: false,
|
|
@@ -119,7 +119,6 @@ export const chatCompletions = (config) => {
|
|
|
119
119
|
const result = await generateText({
|
|
120
120
|
model: languageModelWithMiddleware,
|
|
121
121
|
headers: prepareForwardHeaders(ctx.request),
|
|
122
|
-
// FUTURE: currently can't tell whether upstream or downstream abort
|
|
123
122
|
abortSignal: ctx.request.signal,
|
|
124
123
|
timeout: 5 * 60 * 1000,
|
|
125
124
|
experimental_include: {
|
|
@@ -140,7 +139,8 @@ export const chatCompletions = (config) => {
|
|
|
140
139
|
ctx.result = (await hooks.after(ctx)) ?? ctx.result;
|
|
141
140
|
addSpanEvent("hebo.hooks.after.completed");
|
|
142
141
|
}
|
|
143
|
-
|
|
142
|
+
recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
|
|
143
|
+
recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
|
|
144
144
|
return ctx.result;
|
|
145
145
|
};
|
|
146
146
|
return { handler: winterCgHandler(handler, config) };
|
|
@@ -44,6 +44,10 @@ const toMessageParts = (message) => {
|
|
|
44
44
|
}
|
|
45
45
|
return parts;
|
|
46
46
|
}
|
|
47
|
+
// FUTURE: remove once Langfuse supports gen_ai.system_instructions
|
|
48
|
+
if (message.role === "system") {
|
|
49
|
+
return [toTextPart(message.content)];
|
|
50
|
+
}
|
|
47
51
|
return [];
|
|
48
52
|
};
|
|
49
53
|
export const getChatGeneralAttributes = (ctx, signalLevel) => {
|
|
@@ -81,11 +85,13 @@ export const getChatRequestAttributes = (inputs, signalLevel) => {
|
|
|
81
85
|
}
|
|
82
86
|
if (signalLevel === "full") {
|
|
83
87
|
Object.assign(attrs, {
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
88
|
+
// FUTURE: move system instructions from messages to here
|
|
89
|
+
// blocker: https://github.com/langfuse/langfuse/issues/11607
|
|
90
|
+
// "gen_ai.system_instructions": inputs.messages
|
|
91
|
+
// .filter((m) => m.role === "system")
|
|
92
|
+
// .map((m) => JSON.stringify(toTextPart(m.content))),
|
|
87
93
|
"gen_ai.input.messages": inputs.messages
|
|
88
|
-
|
|
94
|
+
//.filter((m) => m.role !== "system")
|
|
89
95
|
.map((m) => JSON.stringify({ role: m.role, parts: toMessageParts(m) })),
|
|
90
96
|
"gen_ai.tool.definitions": JSON.stringify(inputs.tools),
|
|
91
97
|
});
|
|
@@ -5,7 +5,7 @@ import { winterCgHandler } from "../../lifecycle";
|
|
|
5
5
|
import { logger } from "../../logger";
|
|
6
6
|
import { modelMiddlewareMatcher } from "../../middleware/matcher";
|
|
7
7
|
import { resolveProvider } from "../../providers/registry";
|
|
8
|
-
import { recordRequestDuration, recordTokenUsage } from "../../telemetry/gen-ai";
|
|
8
|
+
import { recordRequestDuration, recordTimePerOutputToken, recordTokenUsage, } from "../../telemetry/gen-ai";
|
|
9
9
|
import { addSpanEvent, setSpanAttributes } from "../../telemetry/span";
|
|
10
10
|
import { resolveRequestId } from "../../utils/headers";
|
|
11
11
|
import { prepareForwardHeaders } from "../../utils/request";
|
|
@@ -33,8 +33,8 @@ export const embeddings = (config) => {
|
|
|
33
33
|
addSpanEvent("hebo.request.deserialized");
|
|
34
34
|
const parsed = EmbeddingsBodySchema.safeParse(ctx.body);
|
|
35
35
|
if (!parsed.success) {
|
|
36
|
-
// FUTURE:
|
|
37
|
-
throw new GatewayError(z.prettifyError(parsed.error), 400);
|
|
36
|
+
// FUTURE: consider adding body shape to metadata
|
|
37
|
+
throw new GatewayError(z.prettifyError(parsed.error), 400, undefined, parsed.error);
|
|
38
38
|
}
|
|
39
39
|
ctx.body = parsed.data;
|
|
40
40
|
addSpanEvent("hebo.request.parsed");
|
|
@@ -95,7 +95,8 @@ export const embeddings = (config) => {
|
|
|
95
95
|
ctx.result = (await hooks.after(ctx)) ?? ctx.result;
|
|
96
96
|
addSpanEvent("hebo.hooks.after.completed");
|
|
97
97
|
}
|
|
98
|
-
|
|
98
|
+
recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
|
|
99
|
+
recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
|
|
99
100
|
return ctx.result;
|
|
100
101
|
};
|
|
101
102
|
return { handler: winterCgHandler(handler, config) };
|
package/dist/errors/gateway.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export declare class GatewayError extends Error {
|
|
2
2
|
readonly status: number;
|
|
3
3
|
readonly code: string;
|
|
4
|
-
constructor(error:
|
|
4
|
+
constructor(error: unknown, status: number, code?: string, cause?: unknown);
|
|
5
5
|
}
|
package/dist/errors/gateway.js
CHANGED
|
@@ -3,10 +3,10 @@ export class GatewayError extends Error {
|
|
|
3
3
|
status;
|
|
4
4
|
code;
|
|
5
5
|
constructor(error, status, code, cause) {
|
|
6
|
-
const
|
|
7
|
-
super(
|
|
6
|
+
const isError = error instanceof Error;
|
|
7
|
+
super(isError ? error.message : String(error));
|
|
8
|
+
this.cause = cause ?? (isError ? error : undefined);
|
|
8
9
|
this.status = status;
|
|
9
10
|
this.code = code ?? STATUS_CODE(status);
|
|
10
|
-
this.cause = cause ?? (typeof error === "string" ? undefined : error);
|
|
11
11
|
}
|
|
12
12
|
}
|
package/dist/errors/openai.js
CHANGED
|
@@ -19,7 +19,8 @@ export class OpenAIError {
|
|
|
19
19
|
}
|
|
20
20
|
const mapType = (status) => (status < 500 ? "invalid_request_error" : "server_error");
|
|
21
21
|
const maybeMaskMessage = (meta, requestId) => {
|
|
22
|
-
|
|
22
|
+
// FUTURE: consider masking all upstream errors, also 4xx
|
|
23
|
+
if (!(isProduction() && meta.status >= 500)) {
|
|
23
24
|
return meta.message;
|
|
24
25
|
}
|
|
25
26
|
// FUTURE: always attach requestId to errors (masked and unmasked)
|
package/dist/errors/utils.d.ts
CHANGED
|
@@ -8,12 +8,13 @@ export declare const STATUS_CODES: {
|
|
|
8
8
|
readonly 409: "CONFLICT";
|
|
9
9
|
readonly 422: "UNPROCESSABLE_ENTITY";
|
|
10
10
|
readonly 429: "TOO_MANY_REQUESTS";
|
|
11
|
+
readonly 499: "CLIENT_CLOSED_REQUEST";
|
|
11
12
|
readonly 500: "INTERNAL_SERVER_ERROR";
|
|
12
13
|
readonly 502: "BAD_GATEWAY";
|
|
13
14
|
readonly 503: "SERVICE_UNAVAILABLE";
|
|
14
15
|
readonly 504: "GATEWAY_TIMEOUT";
|
|
15
16
|
};
|
|
16
|
-
export declare const STATUS_CODE: (status: number) => "BAD_REQUEST" | "UNAUTHORIZED" | "PAYMENT_REQUIRED" | "FORBIDDEN" | "NOT_FOUND" | "METHOD_NOT_ALLOWED" | "CONFLICT" | "UNPROCESSABLE_ENTITY" | "TOO_MANY_REQUESTS" | "INTERNAL_SERVER_ERROR" | "BAD_GATEWAY" | "SERVICE_UNAVAILABLE" | "GATEWAY_TIMEOUT";
|
|
17
|
+
export declare const STATUS_CODE: (status: number) => "BAD_REQUEST" | "UNAUTHORIZED" | "PAYMENT_REQUIRED" | "FORBIDDEN" | "NOT_FOUND" | "METHOD_NOT_ALLOWED" | "CONFLICT" | "UNPROCESSABLE_ENTITY" | "TOO_MANY_REQUESTS" | "CLIENT_CLOSED_REQUEST" | "INTERNAL_SERVER_ERROR" | "BAD_GATEWAY" | "SERVICE_UNAVAILABLE" | "GATEWAY_TIMEOUT";
|
|
17
18
|
export declare function getErrorMeta(error: unknown): {
|
|
18
19
|
status: number;
|
|
19
20
|
code: string;
|
package/dist/errors/utils.js
CHANGED
package/dist/lifecycle.js
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import { parseConfig } from "./config";
|
|
2
|
+
import { GatewayError } from "./errors/gateway";
|
|
2
3
|
import { toOpenAIErrorResponse } from "./errors/openai";
|
|
3
4
|
import { logger } from "./logger";
|
|
4
5
|
import { getBaggageAttributes } from "./telemetry/baggage";
|
|
5
6
|
import { initFetch } from "./telemetry/fetch";
|
|
6
7
|
import { getRequestAttributes, getResponseAttributes } from "./telemetry/http";
|
|
8
|
+
import { recordV8jsMemory } from "./telemetry/memory";
|
|
7
9
|
import { addSpanEvent, setSpanEventsEnabled, setSpanTracer, startSpan } from "./telemetry/span";
|
|
8
10
|
import { wrapStream } from "./telemetry/stream";
|
|
9
11
|
import { resolveRequestId } from "./utils/headers";
|
|
@@ -11,7 +13,7 @@ import { maybeApplyRequestPatch, prepareRequestHeaders } from "./utils/request";
|
|
|
11
13
|
import { prepareResponseInit, toResponse } from "./utils/response";
|
|
12
14
|
export const winterCgHandler = (run, config) => {
|
|
13
15
|
const parsedConfig = parseConfig(config);
|
|
14
|
-
if (parsedConfig.telemetry
|
|
16
|
+
if (parsedConfig.telemetry?.enabled) {
|
|
15
17
|
setSpanTracer(parsedConfig.telemetry?.tracer);
|
|
16
18
|
setSpanEventsEnabled(parsedConfig.telemetry?.signals?.hebo);
|
|
17
19
|
initFetch(parsedConfig.telemetry?.signals?.hebo);
|
|
@@ -39,17 +41,21 @@ export const winterCgHandler = (run, config) => {
|
|
|
39
41
|
// FUTURE add http.server.request.duration
|
|
40
42
|
span.setAttributes(getResponseAttributes(ctx.response, parsedConfig.telemetry?.signals?.http));
|
|
41
43
|
}
|
|
42
|
-
|
|
44
|
+
let realStatus = status;
|
|
45
|
+
if (ctx.request.signal.aborted)
|
|
46
|
+
realStatus = 499;
|
|
47
|
+
else if (status === 200 && ctx.response?.status)
|
|
48
|
+
realStatus = ctx.response.status;
|
|
43
49
|
if (realStatus !== 200) {
|
|
44
|
-
// FUTURE: in-stream errors are redacted in prod
|
|
45
50
|
(realStatus >= 500 ? logger.error : logger.warn)({
|
|
46
51
|
requestId: resolveRequestId(ctx.request),
|
|
47
|
-
err: reason,
|
|
52
|
+
err: reason ?? ctx.request.signal.reason,
|
|
48
53
|
});
|
|
49
54
|
if (realStatus >= 500)
|
|
50
55
|
span.recordError(reason);
|
|
51
56
|
}
|
|
52
57
|
span.setAttributes({ "http.response.status_code_effective": realStatus });
|
|
58
|
+
recordV8jsMemory(parsedConfig.telemetry?.signals?.hebo);
|
|
53
59
|
span.finish();
|
|
54
60
|
};
|
|
55
61
|
try {
|
|
@@ -66,7 +72,7 @@ export const winterCgHandler = (run, config) => {
|
|
|
66
72
|
if (!ctx.response) {
|
|
67
73
|
ctx.result = (await span.runWithContext(() => run(ctx)));
|
|
68
74
|
if (ctx.result instanceof ReadableStream) {
|
|
69
|
-
ctx.result = wrapStream(ctx.result, { onDone: finalize }
|
|
75
|
+
ctx.result = wrapStream(ctx.result, { onDone: finalize });
|
|
70
76
|
}
|
|
71
77
|
ctx.response = toResponse(ctx.result, prepareResponseInit(ctx.request));
|
|
72
78
|
}
|
|
@@ -83,7 +89,9 @@ export const winterCgHandler = (run, config) => {
|
|
|
83
89
|
}
|
|
84
90
|
}
|
|
85
91
|
catch (error) {
|
|
86
|
-
ctx.response = toOpenAIErrorResponse(
|
|
92
|
+
ctx.response = toOpenAIErrorResponse(ctx.request.signal.aborted
|
|
93
|
+
? new GatewayError(error ?? ctx.request.signal.reason, 499)
|
|
94
|
+
: error, prepareResponseInit(ctx.request));
|
|
87
95
|
finalize(ctx.response.status, error);
|
|
88
96
|
}
|
|
89
97
|
return ctx.response ?? new Response("Internal Server Error", { status: 500 });
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { type Attributes } from "@opentelemetry/api";
|
|
2
2
|
import type { TelemetrySignalLevel } from "../types";
|
|
3
|
-
export declare const recordRequestDuration: (
|
|
3
|
+
export declare const recordRequestDuration: (start: number, attrs: Attributes, signalLevel?: TelemetrySignalLevel) => void;
|
|
4
|
+
export declare const recordTimePerOutputToken: (start: number, tokenAttrs: Attributes, metricAttrs: Attributes, signalLevel?: TelemetrySignalLevel) => void;
|
|
4
5
|
export declare const recordTokenUsage: (tokenAttrs: Attributes, metricAttrs: Attributes, signalLevel?: TelemetrySignalLevel) => void;
|
package/dist/telemetry/gen-ai.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { metrics } from "@opentelemetry/api";
|
|
2
|
-
const meter = metrics.getMeter("@hebo
|
|
2
|
+
const meter = metrics.getMeter("@hebo/gateway");
|
|
3
3
|
const requestDurationHistogram = meter.createHistogram("gen_ai.server.request.duration", {
|
|
4
4
|
description: "End-to-end gateway request duration",
|
|
5
5
|
unit: "s",
|
|
@@ -9,6 +9,15 @@ const requestDurationHistogram = meter.createHistogram("gen_ai.server.request.du
|
|
|
9
9
|
],
|
|
10
10
|
},
|
|
11
11
|
});
|
|
12
|
+
const timePerOutputTokenHistogram = meter.createHistogram("gen_ai.server.time_per_output_token", {
|
|
13
|
+
description: "End-to-end gateway request duration per output token",
|
|
14
|
+
unit: "s",
|
|
15
|
+
advice: {
|
|
16
|
+
explicitBucketBoundaries: [
|
|
17
|
+
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
|
|
18
|
+
],
|
|
19
|
+
},
|
|
20
|
+
});
|
|
12
21
|
const tokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
|
|
13
22
|
description: "Token usage reported by upstream model responses",
|
|
14
23
|
unit: "{token}",
|
|
@@ -20,10 +29,19 @@ const tokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
|
|
|
20
29
|
},
|
|
21
30
|
});
|
|
22
31
|
// FUTURE: record unsuccessful calls
|
|
23
|
-
export const recordRequestDuration = (
|
|
32
|
+
export const recordRequestDuration = (start, attrs, signalLevel) => {
|
|
24
33
|
if (!signalLevel || signalLevel === "off")
|
|
25
34
|
return;
|
|
26
|
-
requestDurationHistogram.record(
|
|
35
|
+
requestDurationHistogram.record((performance.now() - start) / 1000, attrs);
|
|
36
|
+
};
|
|
37
|
+
// FUTURE: record unsuccessful calls
|
|
38
|
+
export const recordTimePerOutputToken = (start, tokenAttrs, metricAttrs, signalLevel) => {
|
|
39
|
+
if (!signalLevel || (signalLevel !== "recommended" && signalLevel !== "full"))
|
|
40
|
+
return;
|
|
41
|
+
const outputTokens = tokenAttrs["gen_ai.usage.output_tokens"];
|
|
42
|
+
if (typeof outputTokens !== "number" || outputTokens <= 0)
|
|
43
|
+
return;
|
|
44
|
+
timePerOutputTokenHistogram.record((performance.now() - start) / 1000 / outputTokens, metricAttrs);
|
|
27
45
|
};
|
|
28
46
|
// FUTURE: record unsuccessful calls
|
|
29
47
|
export const recordTokenUsage = (tokenAttrs, metricAttrs, signalLevel) => {
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { metrics } from "@opentelemetry/api";
|
|
2
|
+
const meter = metrics.getMeter("@hebo/gateway");
|
|
3
|
+
const defaultHeapSpaceAttrs = { "v8js.heap.space.name": "total" };
|
|
4
|
+
const heapUsedCounter = meter.createUpDownCounter("v8js.memory.heap.used", {
|
|
5
|
+
description: "Used bytes in the V8 heap",
|
|
6
|
+
unit: "By",
|
|
7
|
+
});
|
|
8
|
+
const heapSpacePhysicalSizeCounter = meter.createUpDownCounter("v8js.memory.heap.space.physical_size", {
|
|
9
|
+
description: "Physical bytes allocated for the V8 heap space",
|
|
10
|
+
unit: "By",
|
|
11
|
+
});
|
|
12
|
+
const isEnabled = (level) => level === "recommended" || level === "full";
|
|
13
|
+
export const recordV8jsMemory = (level) => {
|
|
14
|
+
if (!isEnabled(level))
|
|
15
|
+
return;
|
|
16
|
+
let usage;
|
|
17
|
+
try {
|
|
18
|
+
usage = globalThis.process?.memoryUsage?.();
|
|
19
|
+
}
|
|
20
|
+
catch {
|
|
21
|
+
return;
|
|
22
|
+
}
|
|
23
|
+
if (!usage)
|
|
24
|
+
return;
|
|
25
|
+
heapUsedCounter.add(usage.heapUsed, defaultHeapSpaceAttrs);
|
|
26
|
+
heapSpacePhysicalSizeCounter.add(usage.rss, defaultHeapSpaceAttrs);
|
|
27
|
+
};
|
package/dist/telemetry/span.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { INVALID_SPAN_CONTEXT, SpanKind, SpanStatusCode, context, trace } from "@opentelemetry/api";
|
|
2
|
-
const DEFAULT_TRACER_NAME = "@hebo
|
|
2
|
+
const DEFAULT_TRACER_NAME = "@hebo/gateway";
|
|
3
3
|
let spanTracer;
|
|
4
4
|
let spanEventsEnabled = false;
|
|
5
5
|
const NOOP_SPAN = {
|
package/dist/telemetry/stream.js
CHANGED
|
@@ -1,43 +1,37 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
1
|
+
import { toOpenAIError } from "#/errors/openai";
|
|
2
|
+
const isErrorChunk = (v) => v instanceof Error || !!v?.error;
|
|
3
|
+
export const wrapStream = (src, hooks) => {
|
|
4
|
+
let finished = false;
|
|
5
|
+
const done = (reader, controller, status, reason) => {
|
|
6
|
+
if (!finished) {
|
|
7
|
+
finished = true;
|
|
8
|
+
hooks.onDone?.(status, reason);
|
|
9
|
+
}
|
|
10
|
+
reader.cancel(reason).catch(() => { });
|
|
11
|
+
controller.close();
|
|
9
12
|
};
|
|
10
13
|
return new ReadableStream({
|
|
11
14
|
async start(controller) {
|
|
12
15
|
const reader = src.getReader();
|
|
13
|
-
const close = (status, reason) => {
|
|
14
|
-
finish(status, reason);
|
|
15
|
-
reader.cancel(reason).catch(() => { });
|
|
16
|
-
controller.close();
|
|
17
|
-
};
|
|
18
16
|
try {
|
|
19
17
|
for (;;) {
|
|
20
|
-
if (signal?.aborted) {
|
|
21
|
-
close(499, signal.reason);
|
|
22
|
-
return;
|
|
23
|
-
}
|
|
24
18
|
// eslint-disable-next-line no-await-in-loop
|
|
25
|
-
const { value, done } = await reader.read();
|
|
26
|
-
if (
|
|
19
|
+
const { value, done: eof } = await reader.read();
|
|
20
|
+
if (eof)
|
|
27
21
|
break;
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
22
|
+
const out = isErrorChunk(value) ? toOpenAIError(value) : value;
|
|
23
|
+
controller.enqueue(out);
|
|
24
|
+
if (out !== value) {
|
|
25
|
+
const status = out.error?.type === "invalid_request_error" ? 422 : 502;
|
|
26
|
+
done(reader, controller, status, value);
|
|
32
27
|
return;
|
|
33
28
|
}
|
|
34
29
|
}
|
|
35
|
-
|
|
36
|
-
controller.close();
|
|
30
|
+
done(reader, controller, 200);
|
|
37
31
|
}
|
|
38
32
|
catch (err) {
|
|
39
|
-
|
|
40
|
-
|
|
33
|
+
controller.enqueue(toOpenAIError(err));
|
|
34
|
+
done(reader, controller, 502, err);
|
|
41
35
|
}
|
|
42
36
|
finally {
|
|
43
37
|
try {
|
|
@@ -47,7 +41,10 @@ export const wrapStream = (src, hooks, signal) => {
|
|
|
47
41
|
}
|
|
48
42
|
},
|
|
49
43
|
cancel(reason) {
|
|
50
|
-
|
|
44
|
+
if (!finished) {
|
|
45
|
+
finished = true;
|
|
46
|
+
hooks.onDone?.(499, reason);
|
|
47
|
+
}
|
|
51
48
|
src.cancel(reason).catch(() => { });
|
|
52
49
|
},
|
|
53
50
|
});
|
package/dist/types.d.ts
CHANGED
|
@@ -3,7 +3,6 @@ import type { Tracer } from "@opentelemetry/api";
|
|
|
3
3
|
import type { ChatCompletions, ChatCompletionsBody, ChatCompletionsChunk } from "./endpoints/chat-completions/schema";
|
|
4
4
|
import type { Embeddings, EmbeddingsBody } from "./endpoints/embeddings/schema";
|
|
5
5
|
import type { Model, ModelList } from "./endpoints/models";
|
|
6
|
-
import type { OpenAIError } from "./errors/openai";
|
|
7
6
|
import type { Logger, LoggerConfig } from "./logger";
|
|
8
7
|
import type { ModelCatalog, ModelId } from "./models/types";
|
|
9
8
|
import type { ProviderId, ProviderRegistry } from "./providers/types";
|
|
@@ -67,7 +66,7 @@ export type GatewayContext = {
|
|
|
67
66
|
/**
|
|
68
67
|
* Result returned by the handler (pre-response).
|
|
69
68
|
*/
|
|
70
|
-
result?: ChatCompletions | ReadableStream<ChatCompletionsChunk |
|
|
69
|
+
result?: ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings | Model | ModelList;
|
|
71
70
|
/**
|
|
72
71
|
* Response object returned by the handler.
|
|
73
72
|
*/
|
|
@@ -115,7 +114,7 @@ export type GatewayHooks = {
|
|
|
115
114
|
* Runs after the endpoint handler.
|
|
116
115
|
* @returns Result to replace, or undefined to keep original.
|
|
117
116
|
*/
|
|
118
|
-
after?: (ctx: AfterHookContext) => void | ChatCompletions | ReadableStream<ChatCompletionsChunk |
|
|
117
|
+
after?: (ctx: AfterHookContext) => void | ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings | Promise<void | ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings>;
|
|
119
118
|
/**
|
|
120
119
|
* Runs after the lifecycle has produced the final Response.
|
|
121
120
|
* @returns Replacement Response, or undefined to keep original.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hebo-ai/gateway",
|
|
3
|
-
"version": "0.4.0
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "AI gateway as a framework. For full control over models, routing & lifecycle. OpenAI-compatible /chat/completions, /embeddings & /models.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai",
|
|
@@ -168,6 +168,7 @@
|
|
|
168
168
|
"@ai-sdk/groq": "^3.0.19",
|
|
169
169
|
"@ai-sdk/openai": "^3.0.23",
|
|
170
170
|
"@aws-sdk/credential-providers": "^3.981.0",
|
|
171
|
+
"@langfuse/otel": "^4.6.1",
|
|
171
172
|
"@mjackson/node-fetch-server": "^0.7.0",
|
|
172
173
|
"@opentelemetry/api": "^1.9.0",
|
|
173
174
|
"@opentelemetry/context-async-hooks": "^2.5.1",
|
|
@@ -405,11 +405,12 @@ export function toChatCompletionsResponse(
|
|
|
405
405
|
return toResponse(toChatCompletions(result, model), responseInit);
|
|
406
406
|
}
|
|
407
407
|
|
|
408
|
-
export function toChatCompletionsStream(
|
|
408
|
+
export function toChatCompletionsStream<E extends boolean = false>(
|
|
409
409
|
result: StreamTextResult<ToolSet, Output.Output>,
|
|
410
410
|
model: string,
|
|
411
|
-
|
|
412
|
-
|
|
411
|
+
wrapErrors?: E,
|
|
412
|
+
): ReadableStream<ChatCompletionsChunk | (E extends true ? OpenAIError : Error)> {
|
|
413
|
+
return result.fullStream.pipeThrough(new ChatCompletionsStream(model, wrapErrors));
|
|
413
414
|
}
|
|
414
415
|
|
|
415
416
|
export function toChatCompletionsStreamResponse(
|
|
@@ -417,14 +418,14 @@ export function toChatCompletionsStreamResponse(
|
|
|
417
418
|
model: string,
|
|
418
419
|
responseInit?: ResponseInit,
|
|
419
420
|
): Response {
|
|
420
|
-
return toResponse(toChatCompletionsStream(result, model), responseInit);
|
|
421
|
+
return toResponse(toChatCompletionsStream(result, model, true), responseInit);
|
|
421
422
|
}
|
|
422
423
|
|
|
423
|
-
export class ChatCompletionsStream extends TransformStream<
|
|
424
|
+
export class ChatCompletionsStream<E extends boolean = false> extends TransformStream<
|
|
424
425
|
TextStreamPart<ToolSet>,
|
|
425
|
-
ChatCompletionsChunk | OpenAIError
|
|
426
|
+
ChatCompletionsChunk | (E extends true ? OpenAIError : Error)
|
|
426
427
|
> {
|
|
427
|
-
constructor(model: string) {
|
|
428
|
+
constructor(model: string, wrapErrors?: E) {
|
|
428
429
|
const streamId = `chatcmpl-${crypto.randomUUID()}`;
|
|
429
430
|
const creationTime = Math.floor(Date.now() / 1000);
|
|
430
431
|
let toolCallIndexCounter = 0;
|
|
@@ -535,9 +536,15 @@ export class ChatCompletionsStream extends TransformStream<
|
|
|
535
536
|
}
|
|
536
537
|
|
|
537
538
|
case "error": {
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
539
|
+
let err: Error | OpenAIError;
|
|
540
|
+
if (wrapErrors) {
|
|
541
|
+
err = toOpenAIError(part.error);
|
|
542
|
+
} else if (part.error instanceof Error) {
|
|
543
|
+
err = part.error;
|
|
544
|
+
} else {
|
|
545
|
+
err = new Error(String(part.error));
|
|
546
|
+
}
|
|
547
|
+
controller.enqueue(err as E extends true ? OpenAIError : Error);
|
|
541
548
|
}
|
|
542
549
|
}
|
|
543
550
|
},
|
|
@@ -23,7 +23,11 @@ import { winterCgHandler } from "../../lifecycle";
|
|
|
23
23
|
import { logger } from "../../logger";
|
|
24
24
|
import { modelMiddlewareMatcher } from "../../middleware/matcher";
|
|
25
25
|
import { resolveProvider } from "../../providers/registry";
|
|
26
|
-
import {
|
|
26
|
+
import {
|
|
27
|
+
recordRequestDuration,
|
|
28
|
+
recordTimePerOutputToken,
|
|
29
|
+
recordTokenUsage,
|
|
30
|
+
} from "../../telemetry/gen-ai";
|
|
27
31
|
import { addSpanEvent, setSpanAttributes } from "../../telemetry/span";
|
|
28
32
|
import { resolveRequestId } from "../../utils/headers";
|
|
29
33
|
import { prepareForwardHeaders } from "../../utils/request";
|
|
@@ -60,8 +64,8 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
|
|
|
60
64
|
|
|
61
65
|
const parsed = ChatCompletionsBodySchema.safeParse(ctx.body);
|
|
62
66
|
if (!parsed.success) {
|
|
63
|
-
// FUTURE:
|
|
64
|
-
throw new GatewayError(z.prettifyError(parsed.error), 400);
|
|
67
|
+
// FUTURE: consider adding body shape to metadata
|
|
68
|
+
throw new GatewayError(z.prettifyError(parsed.error), 400, undefined, parsed.error);
|
|
65
69
|
}
|
|
66
70
|
ctx.body = parsed.data;
|
|
67
71
|
addSpanEvent("hebo.request.parsed");
|
|
@@ -123,13 +127,12 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
|
|
|
123
127
|
const result = streamText({
|
|
124
128
|
model: languageModelWithMiddleware,
|
|
125
129
|
headers: prepareForwardHeaders(ctx.request),
|
|
126
|
-
|
|
127
|
-
// abortSignal: ctx.request.signal,
|
|
130
|
+
abortSignal: ctx.request.signal,
|
|
128
131
|
timeout: {
|
|
129
132
|
totalMs: 5 * 60 * 1000,
|
|
130
133
|
},
|
|
131
134
|
onAbort: () => {
|
|
132
|
-
throw new DOMException("
|
|
135
|
+
throw new DOMException("The operation was aborted.", "AbortError");
|
|
133
136
|
},
|
|
134
137
|
onError: () => {},
|
|
135
138
|
onFinish: (res) => {
|
|
@@ -143,7 +146,8 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
|
|
|
143
146
|
const genAiResponseAttrs = getChatResponseAttributes(streamResult, genAiSignalLevel);
|
|
144
147
|
setSpanAttributes(genAiResponseAttrs);
|
|
145
148
|
recordTokenUsage(genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
|
|
146
|
-
|
|
149
|
+
recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
|
|
150
|
+
recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
|
|
147
151
|
},
|
|
148
152
|
experimental_include: {
|
|
149
153
|
requestBody: false,
|
|
@@ -166,7 +170,6 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
|
|
|
166
170
|
const result = await generateText({
|
|
167
171
|
model: languageModelWithMiddleware,
|
|
168
172
|
headers: prepareForwardHeaders(ctx.request),
|
|
169
|
-
// FUTURE: currently can't tell whether upstream or downstream abort
|
|
170
173
|
abortSignal: ctx.request.signal,
|
|
171
174
|
timeout: 5 * 60 * 1000,
|
|
172
175
|
experimental_include: {
|
|
@@ -191,7 +194,8 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
|
|
|
191
194
|
addSpanEvent("hebo.hooks.after.completed");
|
|
192
195
|
}
|
|
193
196
|
|
|
194
|
-
|
|
197
|
+
recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
|
|
198
|
+
recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
|
|
195
199
|
return ctx.result;
|
|
196
200
|
};
|
|
197
201
|
|
|
@@ -54,6 +54,11 @@ const toMessageParts = (message: ChatCompletionsMessage): Record<string, unknown
|
|
|
54
54
|
return parts;
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
// FUTURE: remove once Langfuse supports gen_ai.system_instructions
|
|
58
|
+
if (message.role === "system") {
|
|
59
|
+
return [toTextPart(message.content)];
|
|
60
|
+
}
|
|
61
|
+
|
|
57
62
|
return [];
|
|
58
63
|
};
|
|
59
64
|
|
|
@@ -103,11 +108,13 @@ export const getChatRequestAttributes = (
|
|
|
103
108
|
|
|
104
109
|
if (signalLevel === "full") {
|
|
105
110
|
Object.assign(attrs, {
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
111
|
+
// FUTURE: move system instructions from messages to here
|
|
112
|
+
// blocker: https://github.com/langfuse/langfuse/issues/11607
|
|
113
|
+
// "gen_ai.system_instructions": inputs.messages
|
|
114
|
+
// .filter((m) => m.role === "system")
|
|
115
|
+
// .map((m) => JSON.stringify(toTextPart(m.content))),
|
|
109
116
|
"gen_ai.input.messages": inputs.messages
|
|
110
|
-
|
|
117
|
+
//.filter((m) => m.role !== "system")
|
|
111
118
|
.map((m) => JSON.stringify({ role: m.role, parts: toMessageParts(m) })),
|
|
112
119
|
"gen_ai.tool.definitions": JSON.stringify(inputs.tools),
|
|
113
120
|
});
|
|
@@ -16,7 +16,11 @@ import { winterCgHandler } from "../../lifecycle";
|
|
|
16
16
|
import { logger } from "../../logger";
|
|
17
17
|
import { modelMiddlewareMatcher } from "../../middleware/matcher";
|
|
18
18
|
import { resolveProvider } from "../../providers/registry";
|
|
19
|
-
import {
|
|
19
|
+
import {
|
|
20
|
+
recordRequestDuration,
|
|
21
|
+
recordTimePerOutputToken,
|
|
22
|
+
recordTokenUsage,
|
|
23
|
+
} from "../../telemetry/gen-ai";
|
|
20
24
|
import { addSpanEvent, setSpanAttributes } from "../../telemetry/span";
|
|
21
25
|
import { resolveRequestId } from "../../utils/headers";
|
|
22
26
|
import { prepareForwardHeaders } from "../../utils/request";
|
|
@@ -53,8 +57,8 @@ export const embeddings = (config: GatewayConfig): Endpoint => {
|
|
|
53
57
|
|
|
54
58
|
const parsed = EmbeddingsBodySchema.safeParse(ctx.body);
|
|
55
59
|
if (!parsed.success) {
|
|
56
|
-
// FUTURE:
|
|
57
|
-
throw new GatewayError(z.prettifyError(parsed.error), 400);
|
|
60
|
+
// FUTURE: consider adding body shape to metadata
|
|
61
|
+
throw new GatewayError(z.prettifyError(parsed.error), 400, undefined, parsed.error);
|
|
58
62
|
}
|
|
59
63
|
ctx.body = parsed.data;
|
|
60
64
|
addSpanEvent("hebo.request.parsed");
|
|
@@ -127,7 +131,8 @@ export const embeddings = (config: GatewayConfig): Endpoint => {
|
|
|
127
131
|
addSpanEvent("hebo.hooks.after.completed");
|
|
128
132
|
}
|
|
129
133
|
|
|
130
|
-
|
|
134
|
+
recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
|
|
135
|
+
recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
|
|
131
136
|
return ctx.result;
|
|
132
137
|
};
|
|
133
138
|
|
package/src/errors/gateway.ts
CHANGED
|
@@ -4,11 +4,12 @@ export class GatewayError extends Error {
|
|
|
4
4
|
readonly status: number;
|
|
5
5
|
readonly code: string;
|
|
6
6
|
|
|
7
|
-
constructor(error:
|
|
8
|
-
const
|
|
9
|
-
super(
|
|
7
|
+
constructor(error: unknown, status: number, code?: string, cause?: unknown) {
|
|
8
|
+
const isError = error instanceof Error;
|
|
9
|
+
super(isError ? error.message : String(error));
|
|
10
|
+
this.cause = cause ?? (isError ? error : undefined);
|
|
11
|
+
|
|
10
12
|
this.status = status;
|
|
11
13
|
this.code = code ?? STATUS_CODE(status);
|
|
12
|
-
this.cause = cause ?? (typeof error === "string" ? undefined : error);
|
|
13
14
|
}
|
|
14
15
|
}
|
package/src/errors/openai.ts
CHANGED
|
@@ -25,7 +25,8 @@ export class OpenAIError {
|
|
|
25
25
|
const mapType = (status: number) => (status < 500 ? "invalid_request_error" : "server_error");
|
|
26
26
|
|
|
27
27
|
const maybeMaskMessage = (meta: ReturnType<typeof getErrorMeta>, requestId?: string) => {
|
|
28
|
-
|
|
28
|
+
// FUTURE: consider masking all upstream errors, also 4xx
|
|
29
|
+
if (!(isProduction() && meta.status >= 500)) {
|
|
29
30
|
return meta.message;
|
|
30
31
|
}
|
|
31
32
|
// FUTURE: always attach requestId to errors (masked and unmasked)
|
package/src/errors/utils.ts
CHANGED
package/src/lifecycle.ts
CHANGED
|
@@ -6,11 +6,13 @@ import type {
|
|
|
6
6
|
} from "./types";
|
|
7
7
|
|
|
8
8
|
import { parseConfig } from "./config";
|
|
9
|
+
import { GatewayError } from "./errors/gateway";
|
|
9
10
|
import { toOpenAIErrorResponse } from "./errors/openai";
|
|
10
11
|
import { logger } from "./logger";
|
|
11
12
|
import { getBaggageAttributes } from "./telemetry/baggage";
|
|
12
13
|
import { initFetch } from "./telemetry/fetch";
|
|
13
14
|
import { getRequestAttributes, getResponseAttributes } from "./telemetry/http";
|
|
15
|
+
import { recordV8jsMemory } from "./telemetry/memory";
|
|
14
16
|
import { addSpanEvent, setSpanEventsEnabled, setSpanTracer, startSpan } from "./telemetry/span";
|
|
15
17
|
import { wrapStream } from "./telemetry/stream";
|
|
16
18
|
import { resolveRequestId } from "./utils/headers";
|
|
@@ -23,7 +25,7 @@ export const winterCgHandler = (
|
|
|
23
25
|
) => {
|
|
24
26
|
const parsedConfig = parseConfig(config);
|
|
25
27
|
|
|
26
|
-
if (parsedConfig.telemetry
|
|
28
|
+
if (parsedConfig.telemetry?.enabled) {
|
|
27
29
|
setSpanTracer(parsedConfig.telemetry?.tracer);
|
|
28
30
|
setSpanEventsEnabled(parsedConfig.telemetry?.signals?.hebo);
|
|
29
31
|
initFetch(parsedConfig.telemetry?.signals?.hebo);
|
|
@@ -58,18 +60,22 @@ export const winterCgHandler = (
|
|
|
58
60
|
);
|
|
59
61
|
}
|
|
60
62
|
|
|
61
|
-
|
|
63
|
+
let realStatus = status;
|
|
64
|
+
if (ctx.request.signal.aborted) realStatus = 499;
|
|
65
|
+
else if (status === 200 && ctx.response?.status) realStatus = ctx.response.status;
|
|
66
|
+
|
|
62
67
|
if (realStatus !== 200) {
|
|
63
|
-
// FUTURE: in-stream errors are redacted in prod
|
|
64
68
|
(realStatus >= 500 ? logger.error : logger.warn)({
|
|
65
69
|
requestId: resolveRequestId(ctx.request),
|
|
66
|
-
err: reason,
|
|
70
|
+
err: reason ?? ctx.request.signal.reason,
|
|
67
71
|
});
|
|
68
72
|
|
|
69
73
|
if (realStatus >= 500) span.recordError(reason);
|
|
70
74
|
}
|
|
71
75
|
span.setAttributes({ "http.response.status_code_effective": realStatus });
|
|
72
76
|
|
|
77
|
+
recordV8jsMemory(parsedConfig.telemetry?.signals?.hebo);
|
|
78
|
+
|
|
73
79
|
span.finish();
|
|
74
80
|
};
|
|
75
81
|
|
|
@@ -89,7 +95,7 @@ export const winterCgHandler = (
|
|
|
89
95
|
ctx.result = (await span.runWithContext(() => run(ctx))) as typeof ctx.result;
|
|
90
96
|
|
|
91
97
|
if (ctx.result instanceof ReadableStream) {
|
|
92
|
-
ctx.result = wrapStream(ctx.result, { onDone: finalize }
|
|
98
|
+
ctx.result = wrapStream(ctx.result, { onDone: finalize });
|
|
93
99
|
}
|
|
94
100
|
|
|
95
101
|
ctx.response = toResponse(ctx.result!, prepareResponseInit(ctx.request));
|
|
@@ -108,7 +114,12 @@ export const winterCgHandler = (
|
|
|
108
114
|
finalize(ctx.response.status);
|
|
109
115
|
}
|
|
110
116
|
} catch (error) {
|
|
111
|
-
ctx.response = toOpenAIErrorResponse(
|
|
117
|
+
ctx.response = toOpenAIErrorResponse(
|
|
118
|
+
ctx.request.signal.aborted
|
|
119
|
+
? new GatewayError(error ?? ctx.request.signal.reason, 499)
|
|
120
|
+
: error,
|
|
121
|
+
prepareResponseInit(ctx.request),
|
|
122
|
+
);
|
|
112
123
|
finalize(ctx.response.status, error);
|
|
113
124
|
}
|
|
114
125
|
|
package/src/telemetry/gen-ai.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { metrics, type Attributes } from "@opentelemetry/api";
|
|
|
2
2
|
|
|
3
3
|
import type { TelemetrySignalLevel } from "../types";
|
|
4
4
|
|
|
5
|
-
const meter = metrics.getMeter("@hebo
|
|
5
|
+
const meter = metrics.getMeter("@hebo/gateway");
|
|
6
6
|
|
|
7
7
|
const requestDurationHistogram = meter.createHistogram("gen_ai.server.request.duration", {
|
|
8
8
|
description: "End-to-end gateway request duration",
|
|
@@ -14,6 +14,16 @@ const requestDurationHistogram = meter.createHistogram("gen_ai.server.request.du
|
|
|
14
14
|
},
|
|
15
15
|
});
|
|
16
16
|
|
|
17
|
+
const timePerOutputTokenHistogram = meter.createHistogram("gen_ai.server.time_per_output_token", {
|
|
18
|
+
description: "End-to-end gateway request duration per output token",
|
|
19
|
+
unit: "s",
|
|
20
|
+
advice: {
|
|
21
|
+
explicitBucketBoundaries: [
|
|
22
|
+
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
|
|
23
|
+
],
|
|
24
|
+
},
|
|
25
|
+
});
|
|
26
|
+
|
|
17
27
|
const tokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
|
|
18
28
|
description: "Token usage reported by upstream model responses",
|
|
19
29
|
unit: "{token}",
|
|
@@ -27,13 +37,31 @@ const tokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
|
|
|
27
37
|
|
|
28
38
|
// FUTURE: record unsuccessful calls
|
|
29
39
|
export const recordRequestDuration = (
|
|
30
|
-
|
|
40
|
+
start: number,
|
|
31
41
|
attrs: Attributes,
|
|
32
42
|
signalLevel?: TelemetrySignalLevel,
|
|
33
43
|
) => {
|
|
34
44
|
if (!signalLevel || signalLevel === "off") return;
|
|
35
45
|
|
|
36
|
-
requestDurationHistogram.record(
|
|
46
|
+
requestDurationHistogram.record((performance.now() - start) / 1000, attrs);
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
// FUTURE: record unsuccessful calls
|
|
50
|
+
export const recordTimePerOutputToken = (
|
|
51
|
+
start: number,
|
|
52
|
+
tokenAttrs: Attributes,
|
|
53
|
+
metricAttrs: Attributes,
|
|
54
|
+
signalLevel?: TelemetrySignalLevel,
|
|
55
|
+
) => {
|
|
56
|
+
if (!signalLevel || (signalLevel !== "recommended" && signalLevel !== "full")) return;
|
|
57
|
+
|
|
58
|
+
const outputTokens = tokenAttrs["gen_ai.usage.output_tokens"];
|
|
59
|
+
if (typeof outputTokens !== "number" || outputTokens <= 0) return;
|
|
60
|
+
|
|
61
|
+
timePerOutputTokenHistogram.record(
|
|
62
|
+
(performance.now() - start) / 1000 / outputTokens,
|
|
63
|
+
metricAttrs,
|
|
64
|
+
);
|
|
37
65
|
};
|
|
38
66
|
|
|
39
67
|
// FUTURE: record unsuccessful calls
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { metrics } from "@opentelemetry/api";
|
|
2
|
+
|
|
3
|
+
import type { TelemetrySignalLevel } from "../types";
|
|
4
|
+
|
|
5
|
+
const meter = metrics.getMeter("@hebo/gateway");
|
|
6
|
+
const defaultHeapSpaceAttrs = { "v8js.heap.space.name": "total" } as const;
|
|
7
|
+
|
|
8
|
+
const heapUsedCounter = meter.createUpDownCounter("v8js.memory.heap.used", {
|
|
9
|
+
description: "Used bytes in the V8 heap",
|
|
10
|
+
unit: "By",
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
const heapSpacePhysicalSizeCounter = meter.createUpDownCounter(
|
|
14
|
+
"v8js.memory.heap.space.physical_size",
|
|
15
|
+
{
|
|
16
|
+
description: "Physical bytes allocated for the V8 heap space",
|
|
17
|
+
unit: "By",
|
|
18
|
+
},
|
|
19
|
+
);
|
|
20
|
+
|
|
21
|
+
const isEnabled = (level?: TelemetrySignalLevel) => level === "recommended" || level === "full";
|
|
22
|
+
|
|
23
|
+
export const recordV8jsMemory = (level?: TelemetrySignalLevel) => {
|
|
24
|
+
if (!isEnabled(level)) return;
|
|
25
|
+
|
|
26
|
+
let usage;
|
|
27
|
+
try {
|
|
28
|
+
usage = globalThis.process?.memoryUsage?.();
|
|
29
|
+
} catch {
|
|
30
|
+
return;
|
|
31
|
+
}
|
|
32
|
+
if (!usage) return;
|
|
33
|
+
|
|
34
|
+
heapUsedCounter.add(usage.heapUsed, defaultHeapSpaceAttrs);
|
|
35
|
+
heapSpacePhysicalSizeCounter.add(usage.rss, defaultHeapSpaceAttrs);
|
|
36
|
+
};
|
package/src/telemetry/span.ts
CHANGED
|
@@ -4,7 +4,7 @@ import { INVALID_SPAN_CONTEXT, SpanKind, SpanStatusCode, context, trace } from "
|
|
|
4
4
|
|
|
5
5
|
import type { TelemetrySignalLevel } from "../types";
|
|
6
6
|
|
|
7
|
-
const DEFAULT_TRACER_NAME = "@hebo
|
|
7
|
+
const DEFAULT_TRACER_NAME = "@hebo/gateway";
|
|
8
8
|
|
|
9
9
|
let spanTracer: Tracer | undefined;
|
|
10
10
|
let spanEventsEnabled = false;
|
package/src/telemetry/stream.ts
CHANGED
|
@@ -1,54 +1,51 @@
|
|
|
1
|
-
|
|
1
|
+
import { toOpenAIError } from "#/errors/openai";
|
|
2
|
+
|
|
3
|
+
const isErrorChunk = (v: unknown) => v instanceof Error || !!(v as any)?.error;
|
|
2
4
|
|
|
3
5
|
export const wrapStream = (
|
|
4
6
|
src: ReadableStream,
|
|
5
7
|
hooks: { onDone?: (status: number, reason: unknown) => void },
|
|
6
|
-
signal?: AbortSignal,
|
|
7
8
|
): ReadableStream => {
|
|
8
|
-
let
|
|
9
|
-
|
|
10
|
-
const finish = (status: number, reason?: unknown) => {
|
|
11
|
-
if (finishOnce) return;
|
|
12
|
-
finishOnce = true;
|
|
9
|
+
let finished = false;
|
|
13
10
|
|
|
14
|
-
|
|
11
|
+
const done = (
|
|
12
|
+
reader: ReadableStreamDefaultReader,
|
|
13
|
+
controller: ReadableStreamDefaultController,
|
|
14
|
+
status: number,
|
|
15
|
+
reason?: unknown,
|
|
16
|
+
) => {
|
|
17
|
+
if (!finished) {
|
|
18
|
+
finished = true;
|
|
19
|
+
hooks.onDone?.(status, reason);
|
|
20
|
+
}
|
|
21
|
+
reader.cancel(reason).catch(() => {});
|
|
22
|
+
controller.close();
|
|
15
23
|
};
|
|
16
24
|
|
|
17
25
|
return new ReadableStream({
|
|
18
26
|
async start(controller) {
|
|
19
27
|
const reader = src.getReader();
|
|
20
28
|
|
|
21
|
-
const close = (status: number, reason?: unknown) => {
|
|
22
|
-
finish(status, reason);
|
|
23
|
-
reader.cancel(reason).catch(() => {});
|
|
24
|
-
controller.close();
|
|
25
|
-
};
|
|
26
|
-
|
|
27
29
|
try {
|
|
28
30
|
for (;;) {
|
|
29
|
-
if (signal?.aborted) {
|
|
30
|
-
close(499, signal.reason);
|
|
31
|
-
return;
|
|
32
|
-
}
|
|
33
|
-
|
|
34
31
|
// eslint-disable-next-line no-await-in-loop
|
|
35
|
-
const { value, done } = await reader.read();
|
|
36
|
-
if (
|
|
32
|
+
const { value, done: eof } = await reader.read();
|
|
33
|
+
if (eof) break;
|
|
37
34
|
|
|
38
|
-
|
|
35
|
+
const out = isErrorChunk(value) ? toOpenAIError(value) : value;
|
|
36
|
+
controller.enqueue(out);
|
|
39
37
|
|
|
40
|
-
if (
|
|
41
|
-
const status =
|
|
42
|
-
|
|
38
|
+
if (out !== value) {
|
|
39
|
+
const status = out.error?.type === "invalid_request_error" ? 422 : 502;
|
|
40
|
+
done(reader, controller, status, value);
|
|
43
41
|
return;
|
|
44
42
|
}
|
|
45
43
|
}
|
|
46
44
|
|
|
47
|
-
|
|
48
|
-
controller.close();
|
|
45
|
+
done(reader, controller, 200);
|
|
49
46
|
} catch (err) {
|
|
50
|
-
|
|
51
|
-
|
|
47
|
+
controller.enqueue(toOpenAIError(err));
|
|
48
|
+
done(reader, controller, 502, err);
|
|
52
49
|
} finally {
|
|
53
50
|
try {
|
|
54
51
|
reader.releaseLock();
|
|
@@ -56,8 +53,11 @@ export const wrapStream = (
|
|
|
56
53
|
}
|
|
57
54
|
},
|
|
58
55
|
|
|
59
|
-
cancel(reason
|
|
60
|
-
|
|
56
|
+
cancel(reason) {
|
|
57
|
+
if (!finished) {
|
|
58
|
+
finished = true;
|
|
59
|
+
hooks.onDone?.(499, reason);
|
|
60
|
+
}
|
|
61
61
|
src.cancel(reason).catch(() => {});
|
|
62
62
|
},
|
|
63
63
|
});
|
package/src/types.ts
CHANGED
|
@@ -8,7 +8,6 @@ import type {
|
|
|
8
8
|
} from "./endpoints/chat-completions/schema";
|
|
9
9
|
import type { Embeddings, EmbeddingsBody } from "./endpoints/embeddings/schema";
|
|
10
10
|
import type { Model, ModelList } from "./endpoints/models";
|
|
11
|
-
import type { OpenAIError } from "./errors/openai";
|
|
12
11
|
import type { Logger, LoggerConfig } from "./logger";
|
|
13
12
|
import type { ModelCatalog, ModelId } from "./models/types";
|
|
14
13
|
import type { ProviderId, ProviderRegistry } from "./providers/types";
|
|
@@ -76,7 +75,7 @@ export type GatewayContext = {
|
|
|
76
75
|
*/
|
|
77
76
|
result?:
|
|
78
77
|
| ChatCompletions
|
|
79
|
-
| ReadableStream<ChatCompletionsChunk |
|
|
78
|
+
| ReadableStream<ChatCompletionsChunk | Error>
|
|
80
79
|
| Embeddings
|
|
81
80
|
| Model
|
|
82
81
|
| ModelList;
|
|
@@ -150,11 +149,9 @@ export type GatewayHooks = {
|
|
|
150
149
|
) =>
|
|
151
150
|
| void
|
|
152
151
|
| ChatCompletions
|
|
153
|
-
| ReadableStream<ChatCompletionsChunk |
|
|
152
|
+
| ReadableStream<ChatCompletionsChunk | Error>
|
|
154
153
|
| Embeddings
|
|
155
|
-
| Promise<
|
|
156
|
-
void | ChatCompletions | ReadableStream<ChatCompletionsChunk | OpenAIError> | Embeddings
|
|
157
|
-
>;
|
|
154
|
+
| Promise<void | ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings>;
|
|
158
155
|
/**
|
|
159
156
|
* Runs after the lifecycle has produced the final Response.
|
|
160
157
|
* @returns Replacement Response, or undefined to keep original.
|