@hebo-ai/gateway 0.4.0-beta.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -19,6 +19,7 @@ Learn more in our blog post: [Yet Another AI Gateway?](https://hebo.ai/blog/2601
19
19
  - 🗂️ Model catalog with extensible metadata capabilities.
20
20
  - 🪝 Hook system to customize routing, auth, rate limits, and shape responses.
21
21
  - 🧰 Low-level OpenAI-compatible schema, converters, and middleware helpers.
22
+ - 👁️ OpenTelemetry support for GenAI semantic conventions (Langfuse-compatible).
22
23
 
23
24
  ## 📦 Installation
24
25
 
@@ -641,6 +642,32 @@ https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/
641
642
 
642
643
  For an observability integration that is not OTel-compliant, you can disable built-in telemetry and manually instrument requests during `before` / `after` hooks.
643
644
 
645
+ #### Langfuse
646
+
647
+ Hebo telemetry spans are OpenTelemetry-compatible, so you can send them to Langfuse via `@langfuse/otel`.
648
+
649
+ ```ts
650
+ import { gateway } from "@hebo-ai/gateway";
651
+ import { LangfuseSpanProcessor } from "@langfuse/otel";
652
+ import { context } from "@opentelemetry/api";
653
+ import { AsyncLocalStorageContextManager } from "@opentelemetry/context-async-hooks";
654
+ import { BasicTracerProvider } from "@opentelemetry/sdk-trace-base";
655
+
656
+ context.setGlobalContextManager(new AsyncLocalStorageContextManager().enable());
657
+
658
+ const gw = gateway({
659
+ // ...
660
+ telemetry: {
661
+ enabled: true,
662
+ tracer: new BasicTracerProvider({
663
+ spanProcessors: [new LangfuseSpanProcessor()],
664
+ }).getTracer("hebo"),
665
+ },
666
+ });
667
+ ```
668
+
669
+ Langfuse credentials are read from environment variables by the Langfuse OTel SDK (`LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, `LANGFUSE_BASE_URL`).
670
+
644
671
  ### Passing Framework State to Hooks
645
672
 
646
673
  You can pass per-request info from your framework into the gateway via the second `state` argument on the handler, then read it in hooks through `ctx.state`.
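For context, a minimal sketch of the `state` mechanism the paragraph above describes. Only the second `state` argument and `ctx.state` are taken from the text; the `hooks` config key, the `before` hook shape, and `gw.handler` are assumptions, not verified against the full API.

```ts
import { gateway } from "@hebo-ai/gateway";

const gw = gateway({
  // ... models, providers, etc.
  hooks: {
    // Assumed hook shape: read what the framework passed in as `state`.
    before: async (ctx) => {
      const { userId } = (ctx.state ?? {}) as { userId?: string };
      if (!userId) throw new Error("missing user");
    },
  },
});

// In a framework route handler, forward per-request info as the second argument:
export async function POST(request: Request) {
  return gw.handler(request, { userId: "user_123" });
}
```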
@@ -25,10 +25,10 @@ export declare const convertToToolSet: (tools: ChatCompletionsTool[] | undefined
25
25
  export declare const convertToToolChoice: (toolChoice: ChatCompletionsToolChoice | undefined) => ToolChoice<ToolSet> | undefined;
26
26
  export declare function toChatCompletions(result: GenerateTextResult<ToolSet, Output.Output>, model: string): ChatCompletions;
27
27
  export declare function toChatCompletionsResponse(result: GenerateTextResult<ToolSet, Output.Output>, model: string, responseInit?: ResponseInit): Response;
28
- export declare function toChatCompletionsStream(result: StreamTextResult<ToolSet, Output.Output>, model: string): ReadableStream<ChatCompletionsChunk | OpenAIError>;
28
+ export declare function toChatCompletionsStream<E extends boolean = false>(result: StreamTextResult<ToolSet, Output.Output>, model: string, wrapErrors?: E): ReadableStream<ChatCompletionsChunk | (E extends true ? OpenAIError : Error)>;
29
29
  export declare function toChatCompletionsStreamResponse(result: StreamTextResult<ToolSet, Output.Output>, model: string, responseInit?: ResponseInit): Response;
30
- export declare class ChatCompletionsStream extends TransformStream<TextStreamPart<ToolSet>, ChatCompletionsChunk | OpenAIError> {
31
- constructor(model: string);
30
+ export declare class ChatCompletionsStream<E extends boolean = false> extends TransformStream<TextStreamPart<ToolSet>, ChatCompletionsChunk | (E extends true ? OpenAIError : Error)> {
31
+ constructor(model: string, wrapErrors?: E);
32
32
  }
33
33
  export declare const toChatCompletionsAssistantMessage: (result: GenerateTextResult<ToolSet, Output.Output>) => ChatCompletionsAssistantMessage;
34
34
  export declare function toReasoningDetail(reasoning: ReasoningOutput, id: string, index: number): ChatCompletionsReasoningDetail;
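A hedged sketch of what the new `wrapErrors` type parameter means for callers of these converters. Only the signatures above are taken from the diff; the import path and the placeholder `result` are assumptions.

```ts
// Import path is an assumption; the converters may be exposed under a subpath.
import { toChatCompletionsStream } from "@hebo-ai/gateway";

declare const result: Parameters<typeof toChatCompletionsStream>[0]; // placeholder StreamTextResult

// Default (wrapErrors omitted): error parts surface as plain `Error` chunks.
const plain = toChatCompletionsStream(result, "my-model");
// plain: ReadableStream<ChatCompletionsChunk | Error>

// wrapErrors = true: error parts are converted to `OpenAIError` chunks, which is
// what toChatCompletionsStreamResponse now passes internally.
const wrapped = toChatCompletionsStream(result, "my-model", true);
// wrapped: ReadableStream<ChatCompletionsChunk | OpenAIError>
```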
@@ -275,14 +275,14 @@ export function toChatCompletions(result, model) {
275
275
  export function toChatCompletionsResponse(result, model, responseInit) {
276
276
  return toResponse(toChatCompletions(result, model), responseInit);
277
277
  }
278
- export function toChatCompletionsStream(result, model) {
279
- return result.fullStream.pipeThrough(new ChatCompletionsStream(model));
278
+ export function toChatCompletionsStream(result, model, wrapErrors) {
279
+ return result.fullStream.pipeThrough(new ChatCompletionsStream(model, wrapErrors));
280
280
  }
281
281
  export function toChatCompletionsStreamResponse(result, model, responseInit) {
282
- return toResponse(toChatCompletionsStream(result, model), responseInit);
282
+ return toResponse(toChatCompletionsStream(result, model, true), responseInit);
283
283
  }
284
284
  export class ChatCompletionsStream extends TransformStream {
285
- constructor(model) {
285
+ constructor(model, wrapErrors) {
286
286
  const streamId = `chatcmpl-${crypto.randomUUID()}`;
287
287
  const creationTime = Math.floor(Date.now() / 1000);
288
288
  let toolCallIndexCounter = 0;
@@ -348,9 +348,17 @@ export class ChatCompletionsStream extends TransformStream {
348
348
  break;
349
349
  }
350
350
  case "error": {
351
- const error = part.error;
352
- controller.enqueue(toOpenAIError(error));
353
- controller.terminate();
351
+ let err;
352
+ if (wrapErrors) {
353
+ err = toOpenAIError(part.error);
354
+ }
355
+ else if (part.error instanceof Error) {
356
+ err = part.error;
357
+ }
358
+ else {
359
+ err = new Error(String(part.error));
360
+ }
361
+ controller.enqueue(err);
354
362
  }
355
363
  }
356
364
  },
@@ -5,7 +5,7 @@ import { winterCgHandler } from "../../lifecycle";
5
5
  import { logger } from "../../logger";
6
6
  import { modelMiddlewareMatcher } from "../../middleware/matcher";
7
7
  import { resolveProvider } from "../../providers/registry";
8
- import { recordRequestDuration, recordTokenUsage } from "../../telemetry/gen-ai";
8
+ import { recordRequestDuration, recordTimePerOutputToken, recordTokenUsage, } from "../../telemetry/gen-ai";
9
9
  import { addSpanEvent, setSpanAttributes } from "../../telemetry/span";
10
10
  import { resolveRequestId } from "../../utils/headers";
11
11
  import { prepareForwardHeaders } from "../../utils/request";
@@ -33,8 +33,8 @@ export const chatCompletions = (config) => {
33
33
  addSpanEvent("hebo.request.deserialized");
34
34
  const parsed = ChatCompletionsBodySchema.safeParse(ctx.body);
35
35
  if (!parsed.success) {
36
- // FUTURE: add body shape to error message
37
- throw new GatewayError(z.prettifyError(parsed.error), 400);
36
+ // FUTURE: consider adding body shape to metadata
37
+ throw new GatewayError(z.prettifyError(parsed.error), 400, undefined, parsed.error);
38
38
  }
39
39
  ctx.body = parsed.data;
40
40
  addSpanEvent("hebo.request.parsed");
@@ -84,13 +84,12 @@ export const chatCompletions = (config) => {
84
84
  const result = streamText({
85
85
  model: languageModelWithMiddleware,
86
86
  headers: prepareForwardHeaders(ctx.request),
87
- // No abort signal here, otherwise we can't detect upstream from client cancellations
88
- // abortSignal: ctx.request.signal,
87
+ abortSignal: ctx.request.signal,
89
88
  timeout: {
90
89
  totalMs: 5 * 60 * 1000,
91
90
  },
92
91
  onAbort: () => {
93
- throw new DOMException("Upstream failed", "AbortError");
92
+ throw new DOMException("The operation was aborted.", "AbortError");
94
93
  },
95
94
  onError: () => { },
96
95
  onFinish: (res) => {
@@ -100,7 +99,8 @@ export const chatCompletions = (config) => {
100
99
  const genAiResponseAttrs = getChatResponseAttributes(streamResult, genAiSignalLevel);
101
100
  setSpanAttributes(genAiResponseAttrs);
102
101
  recordTokenUsage(genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
103
- recordRequestDuration(performance.now() - start, genAiGeneralAttrs, genAiSignalLevel);
102
+ recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
103
+ recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
104
104
  },
105
105
  experimental_include: {
106
106
  requestBody: false,
@@ -119,7 +119,6 @@ export const chatCompletions = (config) => {
119
119
  const result = await generateText({
120
120
  model: languageModelWithMiddleware,
121
121
  headers: prepareForwardHeaders(ctx.request),
122
- // FUTURE: currently can't tell whether upstream or downstream abort
123
122
  abortSignal: ctx.request.signal,
124
123
  timeout: 5 * 60 * 1000,
125
124
  experimental_include: {
@@ -140,7 +139,8 @@ export const chatCompletions = (config) => {
140
139
  ctx.result = (await hooks.after(ctx)) ?? ctx.result;
141
140
  addSpanEvent("hebo.hooks.after.completed");
142
141
  }
143
- recordRequestDuration(performance.now() - start, genAiGeneralAttrs, genAiSignalLevel);
142
+ recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
143
+ recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
144
144
  return ctx.result;
145
145
  };
146
146
  return { handler: winterCgHandler(handler, config) };
@@ -44,6 +44,10 @@ const toMessageParts = (message) => {
44
44
  }
45
45
  return parts;
46
46
  }
47
+ // FUTURE: remove once Langfuse supports gen_ai.system_instructions
48
+ if (message.role === "system") {
49
+ return [toTextPart(message.content)];
50
+ }
47
51
  return [];
48
52
  };
49
53
  export const getChatGeneralAttributes = (ctx, signalLevel) => {
@@ -81,11 +85,13 @@ export const getChatRequestAttributes = (inputs, signalLevel) => {
81
85
  }
82
86
  if (signalLevel === "full") {
83
87
  Object.assign(attrs, {
84
- "gen_ai.system_instructions": inputs.messages
85
- .filter((m) => m.role === "system")
86
- .map((m) => JSON.stringify({ parts: [toTextPart(m.content)] })),
88
+ // FUTURE: move system instructions from messages to here
89
+ // blocker: https://github.com/langfuse/langfuse/issues/11607
90
+ // "gen_ai.system_instructions": inputs.messages
91
+ // .filter((m) => m.role === "system")
92
+ // .map((m) => JSON.stringify(toTextPart(m.content))),
87
93
  "gen_ai.input.messages": inputs.messages
88
- .filter((m) => m.role !== "system")
94
+ //.filter((m) => m.role !== "system")
89
95
  .map((m) => JSON.stringify({ role: m.role, parts: toMessageParts(m) })),
90
96
  "gen_ai.tool.definitions": JSON.stringify(inputs.tools),
91
97
  });
@@ -5,7 +5,7 @@ import { winterCgHandler } from "../../lifecycle";
5
5
  import { logger } from "../../logger";
6
6
  import { modelMiddlewareMatcher } from "../../middleware/matcher";
7
7
  import { resolveProvider } from "../../providers/registry";
8
- import { recordRequestDuration, recordTokenUsage } from "../../telemetry/gen-ai";
8
+ import { recordRequestDuration, recordTimePerOutputToken, recordTokenUsage, } from "../../telemetry/gen-ai";
9
9
  import { addSpanEvent, setSpanAttributes } from "../../telemetry/span";
10
10
  import { resolveRequestId } from "../../utils/headers";
11
11
  import { prepareForwardHeaders } from "../../utils/request";
@@ -33,8 +33,8 @@ export const embeddings = (config) => {
33
33
  addSpanEvent("hebo.request.deserialized");
34
34
  const parsed = EmbeddingsBodySchema.safeParse(ctx.body);
35
35
  if (!parsed.success) {
36
- // FUTURE: add body shape to error message
37
- throw new GatewayError(z.prettifyError(parsed.error), 400);
36
+ // FUTURE: consider adding body shape to metadata
37
+ throw new GatewayError(z.prettifyError(parsed.error), 400, undefined, parsed.error);
38
38
  }
39
39
  ctx.body = parsed.data;
40
40
  addSpanEvent("hebo.request.parsed");
@@ -95,7 +95,8 @@ export const embeddings = (config) => {
95
95
  ctx.result = (await hooks.after(ctx)) ?? ctx.result;
96
96
  addSpanEvent("hebo.hooks.after.completed");
97
97
  }
98
- recordRequestDuration(performance.now() - start, genAiGeneralAttrs, genAiSignalLevel);
98
+ recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
99
+ recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
99
100
  return ctx.result;
100
101
  };
101
102
  return { handler: winterCgHandler(handler, config) };
@@ -1,5 +1,5 @@
1
1
  export declare class GatewayError extends Error {
2
2
  readonly status: number;
3
3
  readonly code: string;
4
- constructor(error: string | Error, status: number, code?: string, cause?: unknown);
4
+ constructor(error: unknown, status: number, code?: string, cause?: unknown);
5
5
  }
@@ -3,10 +3,10 @@ export class GatewayError extends Error {
3
3
  status;
4
4
  code;
5
5
  constructor(error, status, code, cause) {
6
- const msg = typeof error === "string" ? error : error.message;
7
- super(msg);
6
+ const isError = error instanceof Error;
7
+ super(isError ? error.message : String(error));
8
+ this.cause = cause ?? (isError ? error : undefined);
8
9
  this.status = status;
9
10
  this.code = code ?? STATUS_CODE(status);
10
- this.cause = cause ?? (typeof error === "string" ? undefined : error);
11
11
  }
12
12
  }
@@ -19,7 +19,8 @@ export class OpenAIError {
19
19
  }
20
20
  const mapType = (status) => (status < 500 ? "invalid_request_error" : "server_error");
21
21
  const maybeMaskMessage = (meta, requestId) => {
22
- if (!(isProduction() && (meta.status >= 500 || meta.code.includes("UPSTREAM")))) {
22
+ // FUTURE: consider masking all upstream errors, also 4xx
23
+ if (!(isProduction() && meta.status >= 500)) {
23
24
  return meta.message;
24
25
  }
25
26
  // FUTURE: always attach requestId to errors (masked and unmasked)
@@ -8,12 +8,13 @@ export declare const STATUS_CODES: {
8
8
  readonly 409: "CONFLICT";
9
9
  readonly 422: "UNPROCESSABLE_ENTITY";
10
10
  readonly 429: "TOO_MANY_REQUESTS";
11
+ readonly 499: "CLIENT_CLOSED_REQUEST";
11
12
  readonly 500: "INTERNAL_SERVER_ERROR";
12
13
  readonly 502: "BAD_GATEWAY";
13
14
  readonly 503: "SERVICE_UNAVAILABLE";
14
15
  readonly 504: "GATEWAY_TIMEOUT";
15
16
  };
16
- export declare const STATUS_CODE: (status: number) => "BAD_REQUEST" | "UNAUTHORIZED" | "PAYMENT_REQUIRED" | "FORBIDDEN" | "NOT_FOUND" | "METHOD_NOT_ALLOWED" | "CONFLICT" | "UNPROCESSABLE_ENTITY" | "TOO_MANY_REQUESTS" | "INTERNAL_SERVER_ERROR" | "BAD_GATEWAY" | "SERVICE_UNAVAILABLE" | "GATEWAY_TIMEOUT";
17
+ export declare const STATUS_CODE: (status: number) => "BAD_REQUEST" | "UNAUTHORIZED" | "PAYMENT_REQUIRED" | "FORBIDDEN" | "NOT_FOUND" | "METHOD_NOT_ALLOWED" | "CONFLICT" | "UNPROCESSABLE_ENTITY" | "TOO_MANY_REQUESTS" | "CLIENT_CLOSED_REQUEST" | "INTERNAL_SERVER_ERROR" | "BAD_GATEWAY" | "SERVICE_UNAVAILABLE" | "GATEWAY_TIMEOUT";
17
18
  export declare function getErrorMeta(error: unknown): {
18
19
  status: number;
19
20
  code: string;
@@ -10,6 +10,7 @@ export const STATUS_CODES = {
10
10
  409: "CONFLICT",
11
11
  422: "UNPROCESSABLE_ENTITY",
12
12
  429: "TOO_MANY_REQUESTS",
13
+ 499: "CLIENT_CLOSED_REQUEST",
13
14
  500: "INTERNAL_SERVER_ERROR",
14
15
  502: "BAD_GATEWAY",
15
16
  503: "SERVICE_UNAVAILABLE",
package/dist/lifecycle.js CHANGED
@@ -1,9 +1,11 @@
1
1
  import { parseConfig } from "./config";
2
+ import { GatewayError } from "./errors/gateway";
2
3
  import { toOpenAIErrorResponse } from "./errors/openai";
3
4
  import { logger } from "./logger";
4
5
  import { getBaggageAttributes } from "./telemetry/baggage";
5
6
  import { initFetch } from "./telemetry/fetch";
6
7
  import { getRequestAttributes, getResponseAttributes } from "./telemetry/http";
8
+ import { recordV8jsMemory } from "./telemetry/memory";
7
9
  import { addSpanEvent, setSpanEventsEnabled, setSpanTracer, startSpan } from "./telemetry/span";
8
10
  import { wrapStream } from "./telemetry/stream";
9
11
  import { resolveRequestId } from "./utils/headers";
@@ -11,7 +13,7 @@ import { maybeApplyRequestPatch, prepareRequestHeaders } from "./utils/request";
11
13
  import { prepareResponseInit, toResponse } from "./utils/response";
12
14
  export const winterCgHandler = (run, config) => {
13
15
  const parsedConfig = parseConfig(config);
14
- if (parsedConfig.telemetry.enabled) {
16
+ if (parsedConfig.telemetry?.enabled) {
15
17
  setSpanTracer(parsedConfig.telemetry?.tracer);
16
18
  setSpanEventsEnabled(parsedConfig.telemetry?.signals?.hebo);
17
19
  initFetch(parsedConfig.telemetry?.signals?.hebo);
@@ -39,17 +41,21 @@ export const winterCgHandler = (run, config) => {
39
41
  // FUTURE add http.server.request.duration
40
42
  span.setAttributes(getResponseAttributes(ctx.response, parsedConfig.telemetry?.signals?.http));
41
43
  }
42
- const realStatus = status === 200 ? (ctx.response?.status ?? status) : status;
44
+ let realStatus = status;
45
+ if (ctx.request.signal.aborted)
46
+ realStatus = 499;
47
+ else if (status === 200 && ctx.response?.status)
48
+ realStatus = ctx.response.status;
43
49
  if (realStatus !== 200) {
44
- // FUTURE: in-stream errors are redacted in prod
45
50
  (realStatus >= 500 ? logger.error : logger.warn)({
46
51
  requestId: resolveRequestId(ctx.request),
47
- err: reason,
52
+ err: reason ?? ctx.request.signal.reason,
48
53
  });
49
54
  if (realStatus >= 500)
50
55
  span.recordError(reason);
51
56
  }
52
57
  span.setAttributes({ "http.response.status_code_effective": realStatus });
58
+ recordV8jsMemory(parsedConfig.telemetry?.signals?.hebo);
53
59
  span.finish();
54
60
  };
55
61
  try {
@@ -66,7 +72,7 @@ export const winterCgHandler = (run, config) => {
66
72
  if (!ctx.response) {
67
73
  ctx.result = (await span.runWithContext(() => run(ctx)));
68
74
  if (ctx.result instanceof ReadableStream) {
69
- ctx.result = wrapStream(ctx.result, { onDone: finalize }, ctx.request.signal);
75
+ ctx.result = wrapStream(ctx.result, { onDone: finalize });
70
76
  }
71
77
  ctx.response = toResponse(ctx.result, prepareResponseInit(ctx.request));
72
78
  }
@@ -83,7 +89,9 @@ export const winterCgHandler = (run, config) => {
83
89
  }
84
90
  }
85
91
  catch (error) {
86
- ctx.response = toOpenAIErrorResponse(error, prepareResponseInit(ctx.request));
92
+ ctx.response = toOpenAIErrorResponse(ctx.request.signal.aborted
93
+ ? new GatewayError(error ?? ctx.request.signal.reason, 499)
94
+ : error, prepareResponseInit(ctx.request));
87
95
  finalize(ctx.response.status, error);
88
96
  }
89
97
  return ctx.response ?? new Response("Internal Server Error", { status: 500 });
@@ -1,4 +1,5 @@
1
1
  import { type Attributes } from "@opentelemetry/api";
2
2
  import type { TelemetrySignalLevel } from "../types";
3
- export declare const recordRequestDuration: (duration: number, attrs: Attributes, signalLevel?: TelemetrySignalLevel) => void;
3
+ export declare const recordRequestDuration: (start: number, attrs: Attributes, signalLevel?: TelemetrySignalLevel) => void;
4
+ export declare const recordTimePerOutputToken: (start: number, tokenAttrs: Attributes, metricAttrs: Attributes, signalLevel?: TelemetrySignalLevel) => void;
4
5
  export declare const recordTokenUsage: (tokenAttrs: Attributes, metricAttrs: Attributes, signalLevel?: TelemetrySignalLevel) => void;
@@ -1,5 +1,5 @@
1
1
  import { metrics } from "@opentelemetry/api";
2
- const meter = metrics.getMeter("@hebo-ai/gateway");
2
+ const meter = metrics.getMeter("@hebo/gateway");
3
3
  const requestDurationHistogram = meter.createHistogram("gen_ai.server.request.duration", {
4
4
  description: "End-to-end gateway request duration",
5
5
  unit: "s",
@@ -9,6 +9,15 @@ const requestDurationHistogram = meter.createHistogram("gen_ai.server.request.du
9
9
  ],
10
10
  },
11
11
  });
12
+ const timePerOutputTokenHistogram = meter.createHistogram("gen_ai.server.time_per_output_token", {
13
+ description: "End-to-end gateway request duration per output token",
14
+ unit: "s",
15
+ advice: {
16
+ explicitBucketBoundaries: [
17
+ 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
18
+ ],
19
+ },
20
+ });
12
21
  const tokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
13
22
  description: "Token usage reported by upstream model responses",
14
23
  unit: "{token}",
@@ -20,10 +29,19 @@ const tokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
20
29
  },
21
30
  });
22
31
  // FUTURE: record unsuccessful calls
23
- export const recordRequestDuration = (duration, attrs, signalLevel) => {
32
+ export const recordRequestDuration = (start, attrs, signalLevel) => {
24
33
  if (!signalLevel || signalLevel === "off")
25
34
  return;
26
- requestDurationHistogram.record(duration / 1000, attrs);
35
+ requestDurationHistogram.record((performance.now() - start) / 1000, attrs);
36
+ };
37
+ // FUTURE: record unsuccessful calls
38
+ export const recordTimePerOutputToken = (start, tokenAttrs, metricAttrs, signalLevel) => {
39
+ if (!signalLevel || (signalLevel !== "recommended" && signalLevel !== "full"))
40
+ return;
41
+ const outputTokens = tokenAttrs["gen_ai.usage.output_tokens"];
42
+ if (typeof outputTokens !== "number" || outputTokens <= 0)
43
+ return;
44
+ timePerOutputTokenHistogram.record((performance.now() - start) / 1000 / outputTokens, metricAttrs);
27
45
  };
28
46
  // FUTURE: record unsuccessful calls
29
47
  export const recordTokenUsage = (tokenAttrs, metricAttrs, signalLevel) => {
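As a quick sanity check on the new `gen_ai.server.time_per_output_token` histogram: the recorded value is elapsed seconds divided by `gen_ai.usage.output_tokens`, so the bucket boundaries above span roughly 10 ms to 2.5 s per token. A worked example with illustrative numbers:

```ts
// Values are illustrative, mirroring the recording logic above.
const start = 0;          // performance.now() captured at request start
const now = 2_000;        // performance.now() at onFinish: 2 s elapsed
const outputTokens = 100; // gen_ai.usage.output_tokens from the response attributes

const secondsPerOutputToken = (now - start) / 1000 / outputTokens;
console.log(secondsPerOutputToken); // 0.02, which lands in the 0.01–0.025 bucket
```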
@@ -0,0 +1,2 @@
1
+ import type { TelemetrySignalLevel } from "../types";
2
+ export declare const recordV8jsMemory: (level?: TelemetrySignalLevel) => void;
@@ -0,0 +1,27 @@
1
+ import { metrics } from "@opentelemetry/api";
2
+ const meter = metrics.getMeter("@hebo/gateway");
3
+ const defaultHeapSpaceAttrs = { "v8js.heap.space.name": "total" };
4
+ const heapUsedCounter = meter.createUpDownCounter("v8js.memory.heap.used", {
5
+ description: "Used bytes in the V8 heap",
6
+ unit: "By",
7
+ });
8
+ const heapSpacePhysicalSizeCounter = meter.createUpDownCounter("v8js.memory.heap.space.physical_size", {
9
+ description: "Physical bytes allocated for the V8 heap space",
10
+ unit: "By",
11
+ });
12
+ const isEnabled = (level) => level === "recommended" || level === "full";
13
+ export const recordV8jsMemory = (level) => {
14
+ if (!isEnabled(level))
15
+ return;
16
+ let usage;
17
+ try {
18
+ usage = globalThis.process?.memoryUsage?.();
19
+ }
20
+ catch {
21
+ return;
22
+ }
23
+ if (!usage)
24
+ return;
25
+ heapUsedCounter.add(usage.heapUsed, defaultHeapSpaceAttrs);
26
+ heapSpacePhysicalSizeCounter.add(usage.rss, defaultHeapSpaceAttrs);
27
+ };
@@ -1,5 +1,5 @@
1
1
  import { INVALID_SPAN_CONTEXT, SpanKind, SpanStatusCode, context, trace } from "@opentelemetry/api";
2
- const DEFAULT_TRACER_NAME = "@hebo-ai/gateway";
2
+ const DEFAULT_TRACER_NAME = "@hebo/gateway";
3
3
  let spanTracer;
4
4
  let spanEventsEnabled = false;
5
5
  const NOOP_SPAN = {
@@ -1,3 +1,3 @@
1
1
  export declare const wrapStream: (src: ReadableStream, hooks: {
2
2
  onDone?: (status: number, reason: unknown) => void;
3
- }, signal?: AbortSignal) => ReadableStream;
3
+ }) => ReadableStream;
@@ -1,43 +1,37 @@
1
- const isErrorChunk = (v) => !!v?.error;
2
- export const wrapStream = (src, hooks, signal) => {
3
- let finishOnce = false;
4
- const finish = (status, reason) => {
5
- if (finishOnce)
6
- return;
7
- finishOnce = true;
8
- hooks.onDone?.(status, reason ?? signal?.reason);
1
+ import { toOpenAIError } from "#/errors/openai";
2
+ const isErrorChunk = (v) => v instanceof Error || !!v?.error;
3
+ export const wrapStream = (src, hooks) => {
4
+ let finished = false;
5
+ const done = (reader, controller, status, reason) => {
6
+ if (!finished) {
7
+ finished = true;
8
+ hooks.onDone?.(status, reason);
9
+ }
10
+ reader.cancel(reason).catch(() => { });
11
+ controller.close();
9
12
  };
10
13
  return new ReadableStream({
11
14
  async start(controller) {
12
15
  const reader = src.getReader();
13
- const close = (status, reason) => {
14
- finish(status, reason);
15
- reader.cancel(reason).catch(() => { });
16
- controller.close();
17
- };
18
16
  try {
19
17
  for (;;) {
20
- if (signal?.aborted) {
21
- close(499, signal.reason);
22
- return;
23
- }
24
18
  // eslint-disable-next-line no-await-in-loop
25
- const { value, done } = await reader.read();
26
- if (done)
19
+ const { value, done: eof } = await reader.read();
20
+ if (eof)
27
21
  break;
28
- controller.enqueue(value);
29
- if (isErrorChunk(value)) {
30
- const status = value.error.type === "invalid_request_error" ? 422 : 502;
31
- close(status, value.error.message);
22
+ const out = isErrorChunk(value) ? toOpenAIError(value) : value;
23
+ controller.enqueue(out);
24
+ if (out !== value) {
25
+ const status = out.error?.type === "invalid_request_error" ? 422 : 502;
26
+ done(reader, controller, status, value);
32
27
  return;
33
28
  }
34
29
  }
35
- finish(200);
36
- controller.close();
30
+ done(reader, controller, 200);
37
31
  }
38
32
  catch (err) {
39
- const status = signal?.aborted ? 499 : err?.name === "AbortError" ? 503 : 502;
40
- close(status, err);
33
+ controller.enqueue(toOpenAIError(err));
34
+ done(reader, controller, 502, err);
41
35
  }
42
36
  finally {
43
37
  try {
@@ -47,7 +41,10 @@ export const wrapStream = (src, hooks, signal) => {
47
41
  }
48
42
  },
49
43
  cancel(reason) {
50
- finish(499, reason);
44
+ if (!finished) {
45
+ finished = true;
46
+ hooks.onDone?.(499, reason);
47
+ }
51
48
  src.cancel(reason).catch(() => { });
52
49
  },
53
50
  });
package/dist/types.d.ts CHANGED
@@ -3,7 +3,6 @@ import type { Tracer } from "@opentelemetry/api";
3
3
  import type { ChatCompletions, ChatCompletionsBody, ChatCompletionsChunk } from "./endpoints/chat-completions/schema";
4
4
  import type { Embeddings, EmbeddingsBody } from "./endpoints/embeddings/schema";
5
5
  import type { Model, ModelList } from "./endpoints/models";
6
- import type { OpenAIError } from "./errors/openai";
7
6
  import type { Logger, LoggerConfig } from "./logger";
8
7
  import type { ModelCatalog, ModelId } from "./models/types";
9
8
  import type { ProviderId, ProviderRegistry } from "./providers/types";
@@ -67,7 +66,7 @@ export type GatewayContext = {
67
66
  /**
68
67
  * Result returned by the handler (pre-response).
69
68
  */
70
- result?: ChatCompletions | ReadableStream<ChatCompletionsChunk | OpenAIError> | Embeddings | Model | ModelList;
69
+ result?: ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings | Model | ModelList;
71
70
  /**
72
71
  * Response object returned by the handler.
73
72
  */
@@ -115,7 +114,7 @@ export type GatewayHooks = {
115
114
  * Runs after the endpoint handler.
116
115
  * @returns Result to replace, or undefined to keep original.
117
116
  */
118
- after?: (ctx: AfterHookContext) => void | ChatCompletions | ReadableStream<ChatCompletionsChunk | OpenAIError> | Embeddings | Promise<void | ChatCompletions | ReadableStream<ChatCompletionsChunk | OpenAIError> | Embeddings>;
117
+ after?: (ctx: AfterHookContext) => void | ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings | Promise<void | ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings>;
119
118
  /**
120
119
  * Runs after the lifecycle has produced the final Response.
121
120
  * @returns Replacement Response, or undefined to keep original.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hebo-ai/gateway",
3
- "version": "0.4.0-beta.4",
3
+ "version": "0.4.0",
4
4
  "description": "AI gateway as a framework. For full control over models, routing & lifecycle. OpenAI-compatible /chat/completions, /embeddings & /models.",
5
5
  "keywords": [
6
6
  "ai",
@@ -168,6 +168,7 @@
168
168
  "@ai-sdk/groq": "^3.0.19",
169
169
  "@ai-sdk/openai": "^3.0.23",
170
170
  "@aws-sdk/credential-providers": "^3.981.0",
171
+ "@langfuse/otel": "^4.6.1",
171
172
  "@mjackson/node-fetch-server": "^0.7.0",
172
173
  "@opentelemetry/api": "^1.9.0",
173
174
  "@opentelemetry/context-async-hooks": "^2.5.1",
@@ -405,11 +405,12 @@ export function toChatCompletionsResponse(
405
405
  return toResponse(toChatCompletions(result, model), responseInit);
406
406
  }
407
407
 
408
- export function toChatCompletionsStream(
408
+ export function toChatCompletionsStream<E extends boolean = false>(
409
409
  result: StreamTextResult<ToolSet, Output.Output>,
410
410
  model: string,
411
- ): ReadableStream<ChatCompletionsChunk | OpenAIError> {
412
- return result.fullStream.pipeThrough(new ChatCompletionsStream(model));
411
+ wrapErrors?: E,
412
+ ): ReadableStream<ChatCompletionsChunk | (E extends true ? OpenAIError : Error)> {
413
+ return result.fullStream.pipeThrough(new ChatCompletionsStream(model, wrapErrors));
413
414
  }
414
415
 
415
416
  export function toChatCompletionsStreamResponse(
@@ -417,14 +418,14 @@ export function toChatCompletionsStreamResponse(
417
418
  model: string,
418
419
  responseInit?: ResponseInit,
419
420
  ): Response {
420
- return toResponse(toChatCompletionsStream(result, model), responseInit);
421
+ return toResponse(toChatCompletionsStream(result, model, true), responseInit);
421
422
  }
422
423
 
423
- export class ChatCompletionsStream extends TransformStream<
424
+ export class ChatCompletionsStream<E extends boolean = false> extends TransformStream<
424
425
  TextStreamPart<ToolSet>,
425
- ChatCompletionsChunk | OpenAIError
426
+ ChatCompletionsChunk | (E extends true ? OpenAIError : Error)
426
427
  > {
427
- constructor(model: string) {
428
+ constructor(model: string, wrapErrors?: E) {
428
429
  const streamId = `chatcmpl-${crypto.randomUUID()}`;
429
430
  const creationTime = Math.floor(Date.now() / 1000);
430
431
  let toolCallIndexCounter = 0;
@@ -535,9 +536,15 @@ export class ChatCompletionsStream extends TransformStream<
535
536
  }
536
537
 
537
538
  case "error": {
538
- const error = part.error;
539
- controller.enqueue(toOpenAIError(error));
540
- controller.terminate();
539
+ let err: Error | OpenAIError;
540
+ if (wrapErrors) {
541
+ err = toOpenAIError(part.error);
542
+ } else if (part.error instanceof Error) {
543
+ err = part.error;
544
+ } else {
545
+ err = new Error(String(part.error));
546
+ }
547
+ controller.enqueue(err as E extends true ? OpenAIError : Error);
541
548
  }
542
549
  }
543
550
  },
@@ -23,7 +23,11 @@ import { winterCgHandler } from "../../lifecycle";
23
23
  import { logger } from "../../logger";
24
24
  import { modelMiddlewareMatcher } from "../../middleware/matcher";
25
25
  import { resolveProvider } from "../../providers/registry";
26
- import { recordRequestDuration, recordTokenUsage } from "../../telemetry/gen-ai";
26
+ import {
27
+ recordRequestDuration,
28
+ recordTimePerOutputToken,
29
+ recordTokenUsage,
30
+ } from "../../telemetry/gen-ai";
27
31
  import { addSpanEvent, setSpanAttributes } from "../../telemetry/span";
28
32
  import { resolveRequestId } from "../../utils/headers";
29
33
  import { prepareForwardHeaders } from "../../utils/request";
@@ -60,8 +64,8 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
60
64
 
61
65
  const parsed = ChatCompletionsBodySchema.safeParse(ctx.body);
62
66
  if (!parsed.success) {
63
- // FUTURE: add body shape to error message
64
- throw new GatewayError(z.prettifyError(parsed.error), 400);
67
+ // FUTURE: consider adding body shape to metadata
68
+ throw new GatewayError(z.prettifyError(parsed.error), 400, undefined, parsed.error);
65
69
  }
66
70
  ctx.body = parsed.data;
67
71
  addSpanEvent("hebo.request.parsed");
@@ -123,13 +127,12 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
123
127
  const result = streamText({
124
128
  model: languageModelWithMiddleware,
125
129
  headers: prepareForwardHeaders(ctx.request),
126
- // No abort signal here, otherwise we can't detect upstream from client cancellations
127
- // abortSignal: ctx.request.signal,
130
+ abortSignal: ctx.request.signal,
128
131
  timeout: {
129
132
  totalMs: 5 * 60 * 1000,
130
133
  },
131
134
  onAbort: () => {
132
- throw new DOMException("Upstream failed", "AbortError");
135
+ throw new DOMException("The operation was aborted.", "AbortError");
133
136
  },
134
137
  onError: () => {},
135
138
  onFinish: (res) => {
@@ -143,7 +146,8 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
143
146
  const genAiResponseAttrs = getChatResponseAttributes(streamResult, genAiSignalLevel);
144
147
  setSpanAttributes(genAiResponseAttrs);
145
148
  recordTokenUsage(genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
146
- recordRequestDuration(performance.now() - start, genAiGeneralAttrs, genAiSignalLevel);
149
+ recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
150
+ recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
147
151
  },
148
152
  experimental_include: {
149
153
  requestBody: false,
@@ -166,7 +170,6 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
166
170
  const result = await generateText({
167
171
  model: languageModelWithMiddleware,
168
172
  headers: prepareForwardHeaders(ctx.request),
169
- // FUTURE: currently can't tell whether upstream or downstream abort
170
173
  abortSignal: ctx.request.signal,
171
174
  timeout: 5 * 60 * 1000,
172
175
  experimental_include: {
@@ -191,7 +194,8 @@ export const chatCompletions = (config: GatewayConfig): Endpoint => {
191
194
  addSpanEvent("hebo.hooks.after.completed");
192
195
  }
193
196
 
194
- recordRequestDuration(performance.now() - start, genAiGeneralAttrs, genAiSignalLevel);
197
+ recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
198
+ recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
195
199
  return ctx.result;
196
200
  };
197
201
 
@@ -54,6 +54,11 @@ const toMessageParts = (message: ChatCompletionsMessage): Record<string, unknown
54
54
  return parts;
55
55
  }
56
56
 
57
+ // FUTURE: remove once Langfuse supports gen_ai.system_instructions
58
+ if (message.role === "system") {
59
+ return [toTextPart(message.content)];
60
+ }
61
+
57
62
  return [];
58
63
  };
59
64
 
@@ -103,11 +108,13 @@ export const getChatRequestAttributes = (
103
108
 
104
109
  if (signalLevel === "full") {
105
110
  Object.assign(attrs, {
106
- "gen_ai.system_instructions": inputs.messages
107
- .filter((m) => m.role === "system")
108
- .map((m) => JSON.stringify({ parts: [toTextPart(m.content)] })),
111
+ // FUTURE: move system instructions from messages to here
112
+ // blocker: https://github.com/langfuse/langfuse/issues/11607
113
+ // "gen_ai.system_instructions": inputs.messages
114
+ // .filter((m) => m.role === "system")
115
+ // .map((m) => JSON.stringify(toTextPart(m.content))),
109
116
  "gen_ai.input.messages": inputs.messages
110
- .filter((m) => m.role !== "system")
117
+ //.filter((m) => m.role !== "system")
111
118
  .map((m) => JSON.stringify({ role: m.role, parts: toMessageParts(m) })),
112
119
  "gen_ai.tool.definitions": JSON.stringify(inputs.tools),
113
120
  });
@@ -16,7 +16,11 @@ import { winterCgHandler } from "../../lifecycle";
16
16
  import { logger } from "../../logger";
17
17
  import { modelMiddlewareMatcher } from "../../middleware/matcher";
18
18
  import { resolveProvider } from "../../providers/registry";
19
- import { recordRequestDuration, recordTokenUsage } from "../../telemetry/gen-ai";
19
+ import {
20
+ recordRequestDuration,
21
+ recordTimePerOutputToken,
22
+ recordTokenUsage,
23
+ } from "../../telemetry/gen-ai";
20
24
  import { addSpanEvent, setSpanAttributes } from "../../telemetry/span";
21
25
  import { resolveRequestId } from "../../utils/headers";
22
26
  import { prepareForwardHeaders } from "../../utils/request";
@@ -53,8 +57,8 @@ export const embeddings = (config: GatewayConfig): Endpoint => {
53
57
 
54
58
  const parsed = EmbeddingsBodySchema.safeParse(ctx.body);
55
59
  if (!parsed.success) {
56
- // FUTURE: add body shape to error message
57
- throw new GatewayError(z.prettifyError(parsed.error), 400);
60
+ // FUTURE: consider adding body shape to metadata
61
+ throw new GatewayError(z.prettifyError(parsed.error), 400, undefined, parsed.error);
58
62
  }
59
63
  ctx.body = parsed.data;
60
64
  addSpanEvent("hebo.request.parsed");
@@ -127,7 +131,8 @@ export const embeddings = (config: GatewayConfig): Endpoint => {
127
131
  addSpanEvent("hebo.hooks.after.completed");
128
132
  }
129
133
 
130
- recordRequestDuration(performance.now() - start, genAiGeneralAttrs, genAiSignalLevel);
134
+ recordTimePerOutputToken(start, genAiResponseAttrs, genAiGeneralAttrs, genAiSignalLevel);
135
+ recordRequestDuration(start, genAiGeneralAttrs, genAiSignalLevel);
131
136
  return ctx.result;
132
137
  };
133
138
 
@@ -4,11 +4,12 @@ export class GatewayError extends Error {
4
4
  readonly status: number;
5
5
  readonly code: string;
6
6
 
7
- constructor(error: string | Error, status: number, code?: string, cause?: unknown) {
8
- const msg = typeof error === "string" ? error : error.message;
9
- super(msg);
7
+ constructor(error: unknown, status: number, code?: string, cause?: unknown) {
8
+ const isError = error instanceof Error;
9
+ super(isError ? error.message : String(error));
10
+ this.cause = cause ?? (isError ? error : undefined);
11
+
10
12
  this.status = status;
11
13
  this.code = code ?? STATUS_CODE(status);
12
- this.cause = cause ?? (typeof error === "string" ? undefined : error);
13
14
  }
14
15
  }
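A small illustration of the widened `GatewayError` constructor; the behavior follows directly from the code above, while the export path is an assumption.

```ts
import { GatewayError } from "@hebo-ai/gateway"; // export path assumed for illustration

// Error inputs: message and fallback cause come from the Error itself.
const fromError = new GatewayError(new Error("upstream timed out"), 502);
fromError.message; // "upstream timed out"
fromError.code;    // "BAD_GATEWAY"
fromError.cause;   // the original Error (no explicit cause was given)

// Non-Error inputs (now allowed by the `unknown` signature) are stringified.
const fromUnknown = new GatewayError("client closed request", 499);
fromUnknown.code;  // "CLIENT_CLOSED_REQUEST"
fromUnknown.cause; // undefined, since the input was not an Error
```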
@@ -25,7 +25,8 @@ export class OpenAIError {
25
25
  const mapType = (status: number) => (status < 500 ? "invalid_request_error" : "server_error");
26
26
 
27
27
  const maybeMaskMessage = (meta: ReturnType<typeof getErrorMeta>, requestId?: string) => {
28
- if (!(isProduction() && (meta.status >= 500 || meta.code.includes("UPSTREAM")))) {
28
+ // FUTURE: consider masking all upstream errors, also 4xx
29
+ if (!(isProduction() && meta.status >= 500)) {
29
30
  return meta.message;
30
31
  }
31
32
  // FUTURE: always attach requestId to errors (masked and unmasked)
@@ -11,6 +11,7 @@ export const STATUS_CODES = {
11
11
  409: "CONFLICT",
12
12
  422: "UNPROCESSABLE_ENTITY",
13
13
  429: "TOO_MANY_REQUESTS",
14
+ 499: "CLIENT_CLOSED_REQUEST",
14
15
  500: "INTERNAL_SERVER_ERROR",
15
16
  502: "BAD_GATEWAY",
16
17
  503: "SERVICE_UNAVAILABLE",
package/src/lifecycle.ts CHANGED
@@ -6,11 +6,13 @@ import type {
6
6
  } from "./types";
7
7
 
8
8
  import { parseConfig } from "./config";
9
+ import { GatewayError } from "./errors/gateway";
9
10
  import { toOpenAIErrorResponse } from "./errors/openai";
10
11
  import { logger } from "./logger";
11
12
  import { getBaggageAttributes } from "./telemetry/baggage";
12
13
  import { initFetch } from "./telemetry/fetch";
13
14
  import { getRequestAttributes, getResponseAttributes } from "./telemetry/http";
15
+ import { recordV8jsMemory } from "./telemetry/memory";
14
16
  import { addSpanEvent, setSpanEventsEnabled, setSpanTracer, startSpan } from "./telemetry/span";
15
17
  import { wrapStream } from "./telemetry/stream";
16
18
  import { resolveRequestId } from "./utils/headers";
@@ -23,7 +25,7 @@ export const winterCgHandler = (
23
25
  ) => {
24
26
  const parsedConfig = parseConfig(config);
25
27
 
26
- if (parsedConfig.telemetry!.enabled) {
28
+ if (parsedConfig.telemetry?.enabled) {
27
29
  setSpanTracer(parsedConfig.telemetry?.tracer);
28
30
  setSpanEventsEnabled(parsedConfig.telemetry?.signals?.hebo);
29
31
  initFetch(parsedConfig.telemetry?.signals?.hebo);
@@ -58,18 +60,22 @@ export const winterCgHandler = (
58
60
  );
59
61
  }
60
62
 
61
- const realStatus = status === 200 ? (ctx.response?.status ?? status) : status;
63
+ let realStatus = status;
64
+ if (ctx.request.signal.aborted) realStatus = 499;
65
+ else if (status === 200 && ctx.response?.status) realStatus = ctx.response.status;
66
+
62
67
  if (realStatus !== 200) {
63
- // FUTURE: in-stream errors are redacted in prod
64
68
  (realStatus >= 500 ? logger.error : logger.warn)({
65
69
  requestId: resolveRequestId(ctx.request),
66
- err: reason,
70
+ err: reason ?? ctx.request.signal.reason,
67
71
  });
68
72
 
69
73
  if (realStatus >= 500) span.recordError(reason);
70
74
  }
71
75
  span.setAttributes({ "http.response.status_code_effective": realStatus });
72
76
 
77
+ recordV8jsMemory(parsedConfig.telemetry?.signals?.hebo);
78
+
73
79
  span.finish();
74
80
  };
75
81
 
@@ -89,7 +95,7 @@ export const winterCgHandler = (
89
95
  ctx.result = (await span.runWithContext(() => run(ctx))) as typeof ctx.result;
90
96
 
91
97
  if (ctx.result instanceof ReadableStream) {
92
- ctx.result = wrapStream(ctx.result, { onDone: finalize }, ctx.request.signal);
98
+ ctx.result = wrapStream(ctx.result, { onDone: finalize });
93
99
  }
94
100
 
95
101
  ctx.response = toResponse(ctx.result!, prepareResponseInit(ctx.request));
@@ -108,7 +114,12 @@ export const winterCgHandler = (
108
114
  finalize(ctx.response.status);
109
115
  }
110
116
  } catch (error) {
111
- ctx.response = toOpenAIErrorResponse(error, prepareResponseInit(ctx.request));
117
+ ctx.response = toOpenAIErrorResponse(
118
+ ctx.request.signal.aborted
119
+ ? new GatewayError(error ?? ctx.request.signal.reason, 499)
120
+ : error,
121
+ prepareResponseInit(ctx.request),
122
+ );
112
123
  finalize(ctx.response.status, error);
113
124
  }
114
125
 
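To illustrate the reworked abort handling above: when the client disconnects, `ctx.request.signal.aborted` is true, the caught error is wrapped in a 499 `GatewayError`, and `http.response.status_code_effective` is recorded as 499. A hedged client-side sketch; the base URL is a placeholder and the `/chat/completions` route comes from the package description.

```ts
// Hypothetical client aborting an in-flight streaming request.
const ac = new AbortController();

const pending = fetch("http://localhost:3000/chat/completions", {
  method: "POST",
  headers: { "content-type": "application/json" },
  body: JSON.stringify({
    model: "my-model",
    stream: true,
    messages: [{ role: "user", content: "hello" }],
  }),
  signal: ac.signal,
});

// Cancel shortly after; the gateway now forwards ctx.request.signal upstream
// and records status 499 (CLIENT_CLOSED_REQUEST) instead of a 5xx.
setTimeout(() => ac.abort(), 100);

await pending.catch(() => {
  // The client-side fetch itself rejects with an AbortError.
});
```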
@@ -2,7 +2,7 @@ import { metrics, type Attributes } from "@opentelemetry/api";
2
2
 
3
3
  import type { TelemetrySignalLevel } from "../types";
4
4
 
5
- const meter = metrics.getMeter("@hebo-ai/gateway");
5
+ const meter = metrics.getMeter("@hebo/gateway");
6
6
 
7
7
  const requestDurationHistogram = meter.createHistogram("gen_ai.server.request.duration", {
8
8
  description: "End-to-end gateway request duration",
@@ -14,6 +14,16 @@ const requestDurationHistogram = meter.createHistogram("gen_ai.server.request.du
14
14
  },
15
15
  });
16
16
 
17
+ const timePerOutputTokenHistogram = meter.createHistogram("gen_ai.server.time_per_output_token", {
18
+ description: "End-to-end gateway request duration per output token",
19
+ unit: "s",
20
+ advice: {
21
+ explicitBucketBoundaries: [
22
+ 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
23
+ ],
24
+ },
25
+ });
26
+
17
27
  const tokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
18
28
  description: "Token usage reported by upstream model responses",
19
29
  unit: "{token}",
@@ -27,13 +37,31 @@ const tokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
27
37
 
28
38
  // FUTURE: record unsuccessful calls
29
39
  export const recordRequestDuration = (
30
- duration: number,
40
+ start: number,
31
41
  attrs: Attributes,
32
42
  signalLevel?: TelemetrySignalLevel,
33
43
  ) => {
34
44
  if (!signalLevel || signalLevel === "off") return;
35
45
 
36
- requestDurationHistogram.record(duration / 1000, attrs);
46
+ requestDurationHistogram.record((performance.now() - start) / 1000, attrs);
47
+ };
48
+
49
+ // FUTURE: record unsuccessful calls
50
+ export const recordTimePerOutputToken = (
51
+ start: number,
52
+ tokenAttrs: Attributes,
53
+ metricAttrs: Attributes,
54
+ signalLevel?: TelemetrySignalLevel,
55
+ ) => {
56
+ if (!signalLevel || (signalLevel !== "recommended" && signalLevel !== "full")) return;
57
+
58
+ const outputTokens = tokenAttrs["gen_ai.usage.output_tokens"];
59
+ if (typeof outputTokens !== "number" || outputTokens <= 0) return;
60
+
61
+ timePerOutputTokenHistogram.record(
62
+ (performance.now() - start) / 1000 / outputTokens,
63
+ metricAttrs,
64
+ );
37
65
  };
38
66
 
39
67
  // FUTURE: record unsuccessful calls
@@ -0,0 +1,36 @@
1
+ import { metrics } from "@opentelemetry/api";
2
+
3
+ import type { TelemetrySignalLevel } from "../types";
4
+
5
+ const meter = metrics.getMeter("@hebo/gateway");
6
+ const defaultHeapSpaceAttrs = { "v8js.heap.space.name": "total" } as const;
7
+
8
+ const heapUsedCounter = meter.createUpDownCounter("v8js.memory.heap.used", {
9
+ description: "Used bytes in the V8 heap",
10
+ unit: "By",
11
+ });
12
+
13
+ const heapSpacePhysicalSizeCounter = meter.createUpDownCounter(
14
+ "v8js.memory.heap.space.physical_size",
15
+ {
16
+ description: "Physical bytes allocated for the V8 heap space",
17
+ unit: "By",
18
+ },
19
+ );
20
+
21
+ const isEnabled = (level?: TelemetrySignalLevel) => level === "recommended" || level === "full";
22
+
23
+ export const recordV8jsMemory = (level?: TelemetrySignalLevel) => {
24
+ if (!isEnabled(level)) return;
25
+
26
+ let usage;
27
+ try {
28
+ usage = globalThis.process?.memoryUsage?.();
29
+ } catch {
30
+ return;
31
+ }
32
+ if (!usage) return;
33
+
34
+ heapUsedCounter.add(usage.heapUsed, defaultHeapSpaceAttrs);
35
+ heapSpacePhysicalSizeCounter.add(usage.rss, defaultHeapSpaceAttrs);
36
+ };
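The new memory instruments (like the gen-ai histograms) are created on the global `@opentelemetry/api` meter named `@hebo/gateway`, so they only export data if a global MeterProvider is registered. A minimal sketch, assuming the consumer installs `@opentelemetry/sdk-metrics` (it is not added by this diff):

```ts
import { metrics } from "@opentelemetry/api";
import {
  ConsoleMetricExporter,
  MeterProvider,
  PeriodicExportingMetricReader,
} from "@opentelemetry/sdk-metrics";

// Register a global MeterProvider before creating the gateway so that the
// gen_ai.* histograms and v8js.memory.* counters have somewhere to go.
metrics.setGlobalMeterProvider(
  new MeterProvider({
    readers: [
      new PeriodicExportingMetricReader({
        exporter: new ConsoleMetricExporter(), // swap for an OTLP exporter in production
        exportIntervalMillis: 60_000,
      }),
    ],
  }),
);
```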
@@ -4,7 +4,7 @@ import { INVALID_SPAN_CONTEXT, SpanKind, SpanStatusCode, context, trace } from "
4
4
 
5
5
  import type { TelemetrySignalLevel } from "../types";
6
6
 
7
- const DEFAULT_TRACER_NAME = "@hebo-ai/gateway";
7
+ const DEFAULT_TRACER_NAME = "@hebo/gateway";
8
8
 
9
9
  let spanTracer: Tracer | undefined;
10
10
  let spanEventsEnabled = false;
@@ -1,54 +1,51 @@
1
- const isErrorChunk = (v: unknown) => !!(v as any)?.error;
1
+ import { toOpenAIError } from "#/errors/openai";
2
+
3
+ const isErrorChunk = (v: unknown) => v instanceof Error || !!(v as any)?.error;
2
4
 
3
5
  export const wrapStream = (
4
6
  src: ReadableStream,
5
7
  hooks: { onDone?: (status: number, reason: unknown) => void },
6
- signal?: AbortSignal,
7
8
  ): ReadableStream => {
8
- let finishOnce = false;
9
-
10
- const finish = (status: number, reason?: unknown) => {
11
- if (finishOnce) return;
12
- finishOnce = true;
9
+ let finished = false;
13
10
 
14
- hooks.onDone?.(status, reason ?? signal?.reason);
11
+ const done = (
12
+ reader: ReadableStreamDefaultReader,
13
+ controller: ReadableStreamDefaultController,
14
+ status: number,
15
+ reason?: unknown,
16
+ ) => {
17
+ if (!finished) {
18
+ finished = true;
19
+ hooks.onDone?.(status, reason);
20
+ }
21
+ reader.cancel(reason).catch(() => {});
22
+ controller.close();
15
23
  };
16
24
 
17
25
  return new ReadableStream({
18
26
  async start(controller) {
19
27
  const reader = src.getReader();
20
28
 
21
- const close = (status: number, reason?: unknown) => {
22
- finish(status, reason);
23
- reader.cancel(reason).catch(() => {});
24
- controller.close();
25
- };
26
-
27
29
  try {
28
30
  for (;;) {
29
- if (signal?.aborted) {
30
- close(499, signal.reason);
31
- return;
32
- }
33
-
34
31
  // eslint-disable-next-line no-await-in-loop
35
- const { value, done } = await reader.read();
36
- if (done) break;
32
+ const { value, done: eof } = await reader.read();
33
+ if (eof) break;
37
34
 
38
- controller.enqueue(value);
35
+ const out = isErrorChunk(value) ? toOpenAIError(value) : value;
36
+ controller.enqueue(out);
39
37
 
40
- if (isErrorChunk(value)) {
41
- const status = value.error.type === "invalid_request_error" ? 422 : 502;
42
- close(status, value.error.message);
38
+ if (out !== value) {
39
+ const status = out.error?.type === "invalid_request_error" ? 422 : 502;
40
+ done(reader, controller, status, value);
43
41
  return;
44
42
  }
45
43
  }
46
44
 
47
- finish(200);
48
- controller.close();
45
+ done(reader, controller, 200);
49
46
  } catch (err) {
50
- const status = signal?.aborted ? 499 : (err as any)?.name === "AbortError" ? 503 : 502;
51
- close(status, err);
47
+ controller.enqueue(toOpenAIError(err));
48
+ done(reader, controller, 502, err);
52
49
  } finally {
53
50
  try {
54
51
  reader.releaseLock();
@@ -56,8 +53,11 @@ export const wrapStream = (
56
53
  }
57
54
  },
58
55
 
59
- cancel(reason?: unknown) {
60
- finish(499, reason);
56
+ cancel(reason) {
57
+ if (!finished) {
58
+ finished = true;
59
+ hooks.onDone?.(499, reason);
60
+ }
61
61
  src.cancel(reason).catch(() => {});
62
62
  },
63
63
  });
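For reference, a sketch of how the reworked `wrapStream` treats an in-stream error: the chunk is normalized via `toOpenAIError`, forwarded to the consumer, and the stream finishes with a 422/502 status. `wrapStream` is an internal module, so the import path here is illustrative only.

```ts
import { wrapStream } from "./telemetry/stream"; // internal module, path for illustration

// A source that emits one normal chunk and then an Error chunk.
const src = new ReadableStream({
  start(controller) {
    controller.enqueue({ id: "chatcmpl-1", choices: [] });
    controller.enqueue(new Error("upstream exploded"));
    controller.close();
  },
});

const wrapped = wrapStream(src, {
  // In-stream errors finish the stream with 422/502; client cancellation is now
  // reported only through cancel() as 499, since the signal argument is gone.
  onDone: (status, reason) => console.log("done", status, reason), // expect: 502, Error(...)
});

const reader = wrapped.getReader();
for (;;) {
  const { value, done } = await reader.read();
  if (done) break;
  console.log(value); // second chunk arrives as an OpenAIError produced by toOpenAIError()
}
```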
package/src/types.ts CHANGED
@@ -8,7 +8,6 @@ import type {
8
8
  } from "./endpoints/chat-completions/schema";
9
9
  import type { Embeddings, EmbeddingsBody } from "./endpoints/embeddings/schema";
10
10
  import type { Model, ModelList } from "./endpoints/models";
11
- import type { OpenAIError } from "./errors/openai";
12
11
  import type { Logger, LoggerConfig } from "./logger";
13
12
  import type { ModelCatalog, ModelId } from "./models/types";
14
13
  import type { ProviderId, ProviderRegistry } from "./providers/types";
@@ -76,7 +75,7 @@ export type GatewayContext = {
76
75
  */
77
76
  result?:
78
77
  | ChatCompletions
79
- | ReadableStream<ChatCompletionsChunk | OpenAIError>
78
+ | ReadableStream<ChatCompletionsChunk | Error>
80
79
  | Embeddings
81
80
  | Model
82
81
  | ModelList;
@@ -150,11 +149,9 @@ export type GatewayHooks = {
150
149
  ) =>
151
150
  | void
152
151
  | ChatCompletions
153
- | ReadableStream<ChatCompletionsChunk | OpenAIError>
152
+ | ReadableStream<ChatCompletionsChunk | Error>
154
153
  | Embeddings
155
- | Promise<
156
- void | ChatCompletions | ReadableStream<ChatCompletionsChunk | OpenAIError> | Embeddings
157
- >;
154
+ | Promise<void | ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings>;
158
155
  /**
159
156
  * Runs after the lifecycle has produced the final Response.
160
157
  * @returns Replacement Response, or undefined to keep original.