@hebo-ai/gateway 0.6.2-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/README.md +58 -8
  2. package/dist/config.js +28 -1
  3. package/dist/endpoints/chat-completions/converters.d.ts +5 -5
  4. package/dist/endpoints/chat-completions/converters.js +86 -49
  5. package/dist/endpoints/chat-completions/handler.js +4 -4
  6. package/dist/endpoints/chat-completions/otel.d.ts +1 -1
  7. package/dist/endpoints/chat-completions/otel.js +20 -18
  8. package/dist/endpoints/chat-completions/schema.d.ts +47 -23
  9. package/dist/endpoints/chat-completions/schema.js +24 -17
  10. package/dist/endpoints/embeddings/handler.js +2 -2
  11. package/dist/endpoints/embeddings/otel.d.ts +2 -2
  12. package/dist/endpoints/embeddings/otel.js +7 -2
  13. package/dist/endpoints/embeddings/schema.d.ts +6 -0
  14. package/dist/endpoints/embeddings/schema.js +4 -1
  15. package/dist/endpoints/models/handler.js +2 -2
  16. package/dist/errors/openai.d.ts +1 -6
  17. package/dist/lifecycle.d.ts +3 -2
  18. package/dist/lifecycle.js +4 -6
  19. package/dist/middleware/utils.js +0 -1
  20. package/dist/models/amazon/middleware.js +6 -5
  21. package/dist/models/anthropic/middleware.js +13 -13
  22. package/dist/models/cohere/middleware.js +7 -5
  23. package/dist/models/google/middleware.d.ts +1 -1
  24. package/dist/models/google/middleware.js +29 -25
  25. package/dist/models/google/presets.d.ts +28 -0
  26. package/dist/models/google/presets.js +7 -1
  27. package/dist/models/openai/middleware.js +7 -7
  28. package/dist/models/types.d.ts +1 -1
  29. package/dist/models/types.js +1 -0
  30. package/dist/models/voyage/middleware.js +2 -1
  31. package/dist/providers/bedrock/middleware.d.ts +1 -0
  32. package/dist/providers/bedrock/middleware.js +54 -23
  33. package/dist/providers/groq/index.d.ts +1 -0
  34. package/dist/providers/groq/index.js +1 -0
  35. package/dist/providers/groq/middleware.d.ts +2 -0
  36. package/dist/providers/groq/middleware.js +31 -0
  37. package/dist/providers/vertex/index.d.ts +1 -0
  38. package/dist/providers/vertex/index.js +1 -0
  39. package/dist/providers/vertex/middleware.d.ts +2 -0
  40. package/dist/providers/vertex/middleware.js +47 -0
  41. package/dist/types.d.ts +25 -4
  42. package/dist/types.js +1 -0
  43. package/dist/utils/response.d.ts +4 -1
  44. package/dist/utils/response.js +5 -20
  45. package/dist/utils/stream.d.ts +9 -0
  46. package/dist/utils/stream.js +100 -0
  47. package/package.json +5 -1
  48. package/dist/telemetry/stream.d.ts +0 -3
  49. package/dist/telemetry/stream.js +0 -58
package/README.md CHANGED
@@ -32,13 +32,13 @@ bun install @hebo-ai/gateway
32
32
  - Quickstart
33
33
  - [Setup A Gateway Instance](#setup-a-gateway-instance) | [Mount Route Handlers](#mount-route-handlers) | [Call the Gateway](#call-the-gateway)
34
34
  - Configuration Reference
35
- - [Providers](#providers) | [Models](#models) | [Hooks](#hooks) | [Logger](#logger-settings) | [Observability](#observability)
35
+ - [Providers](#providers) | [Models](#models) | [Hooks](#hooks) | [Logger](#logger-settings) | [Observability](#observability) | [Timeouts](#timeout-settings)
36
36
  - Framework Support
37
37
  - [ElysiaJS](#elysiajs) | [Hono](#hono) | [Next.js](#nextjs) | [TanStack Start](#tanstack-start)
38
38
  - Runtime Support
39
39
  - [Vercel Edge](#vercel-edge) | [Cloudflare Workers](#cloudflare-workers) | [Deno Deploy](#deno-deploy) | [AWS Lambda](#aws-lambda)
40
40
  - OpenAI Extensions
41
- - [Reasoning](#reasoning) | [Prompt Caching](#prompt-caching)
41
+ - [Reasoning](#reasoning) | [Service Tier](#service-tier) | [Prompt Caching](#prompt-caching)
42
42
  - Advanced Usage
43
43
  - [Passing Framework State to Hooks](#passing-framework-state-to-hooks) | [Selective Route Mounting](#selective-route-mounting) | [Low-level Schemas & Converters](#low-level-schemas--converters)
44
44
 
@@ -342,9 +342,9 @@ const gw = gateway({
342
342
  * @returns Modified result, or undefined to keep original.
343
343
  */
344
344
  after: async (ctx: {
345
- result: ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings;
345
+ result: ChatCompletions | ChatCompletionsStream | Embeddings;
346
346
  }): Promise<
347
- ChatCompletions | ReadableStream<ChatCompletionsChunk | Error> | Embeddings | void
347
+ ChatCompletions | ChatCompletionsStream | Embeddings | void
348
348
  > => {
349
349
  // Example Use Cases:
350
350
  // - Transform result
@@ -536,14 +536,14 @@ Normalization rules:
536
536
 
537
537
  - `enabled` -> fall-back to model default if none provided
538
538
  - `max_tokens`: fall-back to model default if model supports
539
- - `effort` supports: `none`, `minimal`, `low`, `medium`, `high`, `xhigh`, `max`
539
+ - `effort` supports: `none`, `minimal`, `low`, `medium`, `high`, `xhigh`
540
540
  - Generic `effort` -> budget = percentage of `max_tokens`
541
541
  - `none`: 0%
542
542
  - `minimal`: 10%
543
543
  - `low`: 20%
544
544
  - `medium`: 50% (default)
545
545
  - `high`: 80%
546
- - `xhigh` / `max`: 95%
546
+ - `xhigh`: 95%
547
547
 
548
548
  Reasoning output is surfaced as extension to the `completion` object.
549
549
 
@@ -561,6 +561,25 @@ Advanced models (like Anthropic Claude 3.7 or Gemini 3) surface structured reaso
561
561
 
562
562
  For **Gemini 3** models, returning the thought signature via `extra_content` is mandatory to resume the chain-of-thought; failing to do so may result in errors or degraded performance.
563
563
 
564
+ ### Service Tier
565
+
566
+ The chat completions endpoint accepts a provider-agnostic `service_tier` extension:
567
+
568
+ - `auto`, `default`, `flex`, `priority`, `scale`
569
+
570
+ Provider-specific mapping:
571
+
572
+ - **OpenAI**: forwards as OpenAI `serviceTier` (no middleware remap).
573
+ - **Groq**: maps to Groq `serviceTier` (`default` -> `on_demand`, `scale`/`priority` -> `performance`).
574
+ - **Google Vertex**: maps to request headers via middleware:
575
+ - `default` -> `x-vertex-ai-llm-request-type: shared`
576
+ - `flex` -> `x-vertex-ai-llm-request-type: shared` + `x-vertex-ai-llm-shared-request-type: flex`
577
+ - `priority` -> `x-vertex-ai-llm-request-type: shared` + `x-vertex-ai-llm-shared-request-type: priority`
578
+ - `scale` -> `x-vertex-ai-llm-request-type: dedicated`
579
+ - **Amazon Bedrock**: maps to Bedrock `serviceTier.type` (`default`, `flex`, `priority`, `reserved`; `scale` -> `reserved`, `auto` -> omitted/default).
580
+
581
+ When available, the resolved value is echoed back on the response as `service_tier`.
582
+
564
583
  ### Prompt Caching
565
584
 
566
585
  The chat completions endpoint supports both implicit (provider-managed) and explicit prompt caching across OpenAI-compatible providers.
@@ -665,7 +684,7 @@ https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/
665
684
 
666
685
  > [!TIP]
667
686
  > To populate custom span attributes, the inbound W3C `baggage` header is supported. Keys in the `hebo.` namespace are mapped to span attributes, with the namespace stripped. For example: `baggage: hebo.user_id=u-123` becomes span attribute `user_id=u-123`.
668
- > For `/chat/completions`, request `metadata` (`Record<string, string>`, key 1-64 chars, value up to 512 chars) is also forwarded to spans as `gen_ai.request.metadata.<key>`.
687
+ > For `/chat/completions` and `/embeddings`, request `metadata` (`Record<string, string>`, key 1-64 chars, value up to 512 chars) is also forwarded to spans as `gen_ai.request.metadata.<key>`.
669
688
 
670
689
  For observability integration that is not otel compliant, you can disable built-in telemetry and manually instrument requests during `before` / `after` hooks.
671
690
 
@@ -737,6 +756,37 @@ const gw = gateway({
737
756
 
738
757
  Langfuse credentials are read from environment variables by the Langfuse OTel SDK (`LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, `LANGFUSE_BASE_URL`).
739
758
 
759
+ ### Timeout Settings
760
+
761
+ You can configure request timeouts via the `timeouts` field:
762
+
763
+ ```ts
764
+ import { gateway } from "@hebo-ai/gateway";
765
+
766
+ const gw = gateway({
767
+ // ...
768
+ // default timeout is 300_000 (5 minutes).
769
+ // You can set a single base timeout (the flex tier then uses 3x this value)...
770
+ timeouts: 60_000,
771
+ // ...disable timeouts completely:
772
+ // timeouts: null,
773
+ // ...or split by service tier:
774
+ // - normal: all non-flex tiers (set null to disable)
775
+ // - flex: defaults to 3x normal when omitted (set null to disable)
776
+ // timeouts: { normal: 30_000, flex: null },
777
+ });
778
+ ```
779
+
780
+ > [!NOTE]
781
+ > **Runtime/engine timeout limits**
782
+ > Runtime-level `fetch()` clients may enforce their own timeouts. Configure those runtime/platform limits in addition to gateway `timeouts`.
783
+ >
784
+ > - Node.js runtimes use Undici: https://github.com/nodejs/undici/issues/1373 (Node.js, Vercel Serverless Functions, AWS Lambda)
785
+ > - Bun context: https://github.com/oven-sh/bun/issues/16682
786
+ >
787
+ > **Provider/service timeout limits**
788
+ > Serverless platforms (e.g. Cloudflare Workers, Vercel Edge/Serverless, AWS Lambda) also enforce their own platform time limits (roughly 25-100s on edge paths, about 300s for streaming, and up to about 900s where configurable).
789
+
740
790
  ### Passing Framework State to Hooks
741
791
 
742
792
  You can pass per-request info from your framework into the gateway via the second `state` argument on the handler, then read it in hooks through `ctx.state`.
@@ -838,7 +888,7 @@ export async function handler(req: Request): Promise<Response> {
838
888
  }
839
889
  ```
840
890
 
841
- Non-streaming versions are available via `createChatCompletionsResponse`. Equivalent schemas and helpers are available in the `embeddings` and `models` endpoints.
891
+ Non-streaming versions are available via `toChatCompletionsResponse`. Equivalent schemas and helpers are available in the `embeddings` and `models` endpoints.
842
892
 
843
893
  > [!TIP]
844
894
  > Since Zod v4.3 you can generate a JSON Schema from any zod object by calling `z.toJSONSchema(...)`. This is useful for producing OpenAPI documentation from the same source of truth.
package/dist/config.js CHANGED
@@ -1,7 +1,7 @@
1
1
  import { isLogger, logger, setLoggerInstance } from "./logger";
2
2
  import { createDefaultLogger } from "./logger/default";
3
3
  import { installAiSdkWarningLogger } from "./telemetry/ai-sdk";
4
- import { kParsed, } from "./types";
4
+ import { DEFAULT_CHAT_TIMEOUT_MS, kParsed, } from "./types";
5
5
  export const parseConfig = (config) => {
6
6
  // If it has been parsed before, just return.
7
7
  if (kParsed in config)
@@ -66,9 +66,36 @@ export const parseConfig = (config) => {
66
66
  hebo: "off",
67
67
  };
68
68
  installAiSdkWarningLogger(telemetrySignals.gen_ai);
69
+ // Default timeouts
70
+ let normal;
71
+ let flex;
72
+ const t = config.timeouts;
73
+ if (t === null) {
74
+ normal = flex = undefined;
75
+ }
76
+ else if (typeof t === "number") {
77
+ normal = t;
78
+ flex = t * 3;
79
+ }
80
+ else {
81
+ if (t?.normal === null)
82
+ normal = undefined;
83
+ else if (t?.normal === undefined)
84
+ normal = DEFAULT_CHAT_TIMEOUT_MS;
85
+ else
86
+ normal = t.normal;
87
+ if (t?.flex === null)
88
+ flex = undefined;
89
+ else if (t?.flex === undefined)
90
+ flex = normal === undefined ? undefined : normal * 3;
91
+ else
92
+ flex = t.flex;
93
+ }
94
+ const parsedTimeouts = { normal, flex };
69
95
  // Return parsed config.
70
96
  return {
71
97
  ...config,
98
+ timeouts: parsedTimeouts,
72
99
  telemetry: {
73
100
  ...config.telemetry,
74
101
  enabled: telemetryEnabled,
@@ -1,8 +1,8 @@
1
1
  import type { SharedV3ProviderOptions, SharedV3ProviderMetadata } from "@ai-sdk/provider";
2
2
  import type { GenerateTextResult, StreamTextResult, FinishReason, ToolChoice, ToolSet, ModelMessage, UserContent, LanguageModelUsage, TextStreamPart, ReasoningOutput, AssistantModelMessage, ToolModelMessage, UserModelMessage } from "ai";
3
3
  import { Output } from "ai";
4
- import type { ChatCompletionsToolCall, ChatCompletionsTool, ChatCompletionsToolChoice, ChatCompletionsContentPart, ChatCompletionsMessage, ChatCompletionsUserMessage, ChatCompletionsAssistantMessage, ChatCompletionsToolMessage, ChatCompletionsFinishReason, ChatCompletionsUsage, ChatCompletionsInputs, ChatCompletions, ChatCompletionsChunk, ChatCompletionsReasoningDetail } from "./schema";
5
- import { OpenAIError } from "../../errors/openai";
4
+ import type { ChatCompletionsToolCall, ChatCompletionsTool, ChatCompletionsToolChoice, ChatCompletionsStream, ChatCompletionsContentPart, ChatCompletionsMessage, ChatCompletionsUserMessage, ChatCompletionsAssistantMessage, ChatCompletionsToolMessage, ChatCompletionsFinishReason, ChatCompletionsUsage, ChatCompletionsInputs, ChatCompletions, ChatCompletionsChunk, ChatCompletionsReasoningDetail } from "./schema";
5
+ import type { SseErrorFrame, SseFrame } from "../../utils/stream";
6
6
  export type TextCallOptions = {
7
7
  messages: ModelMessage[];
8
8
  tools?: ToolSet;
@@ -31,10 +31,10 @@ export declare const convertToToolChoiceOptions: (toolChoice: ChatCompletionsToo
31
31
  };
32
32
  export declare function toChatCompletions(result: GenerateTextResult<ToolSet, Output.Output>, model: string): ChatCompletions;
33
33
  export declare function toChatCompletionsResponse(result: GenerateTextResult<ToolSet, Output.Output>, model: string, responseInit?: ResponseInit): Response;
34
- export declare function toChatCompletionsStream<E extends boolean = false>(result: StreamTextResult<ToolSet, Output.Output>, model: string, wrapErrors?: E): ReadableStream<ChatCompletionsChunk | (E extends true ? OpenAIError : Error)>;
34
+ export declare function toChatCompletionsStream(result: StreamTextResult<ToolSet, Output.Output>, model: string): ChatCompletionsStream;
35
35
  export declare function toChatCompletionsStreamResponse(result: StreamTextResult<ToolSet, Output.Output>, model: string, responseInit?: ResponseInit): Response;
36
- export declare class ChatCompletionsStream<E extends boolean = false> extends TransformStream<TextStreamPart<ToolSet>, ChatCompletionsChunk | (E extends true ? OpenAIError : Error)> {
37
- constructor(model: string, wrapErrors?: E);
36
+ export declare class ChatCompletionsTransformStream extends TransformStream<TextStreamPart<ToolSet>, SseFrame<ChatCompletionsChunk> | SseErrorFrame> {
37
+ constructor(model: string);
38
38
  }
39
39
  export declare const toChatCompletionsAssistantMessage: (result: GenerateTextResult<ToolSet, Output.Output>) => ChatCompletionsAssistantMessage;
40
40
  export declare function toReasoningDetail(reasoning: ReasoningOutput, id: string, index: number): ChatCompletionsReasoningDetail;
@@ -1,14 +1,18 @@
1
1
  import { Output, jsonSchema, tool } from "ai";
2
2
  import { z } from "zod";
3
3
  import { GatewayError } from "../../errors/gateway";
4
- import { OpenAIError, toOpenAIError } from "../../errors/openai";
5
4
  import { toResponse } from "../../utils/response";
6
5
  import { parseDataUrl } from "../../utils/url";
7
6
  // --- Request Flow ---
8
7
  export function convertToTextCallOptions(params) {
9
8
  const { messages, tools, tool_choice, temperature, max_tokens, max_completion_tokens, response_format, reasoning_effort, reasoning, prompt_cache_key, prompt_cache_retention, extra_body, cache_control, frequency_penalty, presence_penalty, seed, stop, top_p, ...rest } = params;
10
9
  Object.assign(rest, parseReasoningOptions(reasoning_effort, reasoning));
11
- Object.assign(rest, parsePromptCachingOptions(prompt_cache_key, prompt_cache_retention, extra_body?.google?.cached_content, cache_control));
10
+ Object.assign(rest, parsePromptCachingOptions(prompt_cache_key, prompt_cache_retention, cache_control));
11
+ if (extra_body) {
12
+ for (const v of Object.values(extra_body)) {
13
+ Object.assign(rest, v);
14
+ }
15
+ }
12
16
  const { toolChoice, activeTools } = convertToToolChoiceOptions(tool_choice);
13
17
  return {
14
18
  messages: convertToModelMessages(messages),
@@ -337,29 +341,25 @@ function parseReasoningOptions(reasoning_effort, reasoning) {
337
341
  }
338
342
  return out;
339
343
  }
340
- function parsePromptCachingOptions(prompt_cache_key, prompt_cache_retention, cached_content, cache_control) {
344
+ function parsePromptCachingOptions(prompt_cache_key, prompt_cache_retention, cache_control) {
341
345
  const out = {};
342
- const syncedCacheKey = prompt_cache_key ?? cached_content;
343
- const syncedCachedContent = cached_content ?? prompt_cache_key;
344
- let syncedCacheRetention = prompt_cache_retention;
345
- if (!syncedCacheRetention && cache_control?.ttl) {
346
- syncedCacheRetention = cache_control.ttl === "24h" ? "24h" : "in_memory";
347
- }
348
- let syncedCacheControl = cache_control;
349
- if (!syncedCacheControl && syncedCacheRetention) {
350
- syncedCacheControl = {
346
+ let retention = prompt_cache_retention;
347
+ if (!retention && cache_control?.ttl) {
348
+ retention = cache_control.ttl === "24h" ? "24h" : "in_memory";
349
+ }
350
+ let control = cache_control;
351
+ if (!control && retention) {
352
+ control = {
351
353
  type: "ephemeral",
352
- ttl: syncedCacheRetention === "24h" ? "24h" : "5m",
354
+ ttl: retention === "24h" ? "24h" : "5m",
353
355
  };
354
356
  }
355
- if (syncedCacheKey)
356
- out["prompt_cache_key"] = syncedCacheKey;
357
- if (syncedCacheRetention)
358
- out["prompt_cache_retention"] = syncedCacheRetention;
359
- if (syncedCachedContent)
360
- out["cached_content"] = syncedCachedContent;
361
- if (syncedCacheControl)
362
- out["cache_control"] = syncedCacheControl;
357
+ if (prompt_cache_key)
358
+ out["prompt_cache_key"] = prompt_cache_key;
359
+ if (retention)
360
+ out["prompt_cache_retention"] = retention;
361
+ if (control)
362
+ out["cache_control"] = control;
363
363
  return out;
364
364
  }
365
365
  // --- Response Flow ---
@@ -378,19 +378,20 @@ export function toChatCompletions(result, model) {
378
378
  ],
379
379
  usage: result.totalUsage ? toChatCompletionsUsage(result.totalUsage) : null,
380
380
  provider_metadata: result.providerMetadata,
381
+ service_tier: resolveResponseServiceTier(result.providerMetadata),
381
382
  };
382
383
  }
383
384
  export function toChatCompletionsResponse(result, model, responseInit) {
384
385
  return toResponse(toChatCompletions(result, model), responseInit);
385
386
  }
386
- export function toChatCompletionsStream(result, model, wrapErrors) {
387
- return result.fullStream.pipeThrough(new ChatCompletionsStream(model, wrapErrors));
387
+ export function toChatCompletionsStream(result, model) {
388
+ return result.fullStream.pipeThrough(new ChatCompletionsTransformStream(model));
388
389
  }
389
390
  export function toChatCompletionsStreamResponse(result, model, responseInit) {
390
- return toResponse(toChatCompletionsStream(result, model, true), responseInit);
391
+ return toResponse(toChatCompletionsStream(result, model), responseInit);
391
392
  }
392
- export class ChatCompletionsStream extends TransformStream {
393
- constructor(model, wrapErrors) {
393
+ export class ChatCompletionsTransformStream extends TransformStream {
394
+ constructor(model) {
394
395
  const streamId = `chatcmpl-${crypto.randomUUID()}`;
395
396
  const creationTime = Math.floor(Date.now() / 1000);
396
397
  let toolCallIndexCounter = 0;
@@ -401,18 +402,21 @@ export class ChatCompletionsStream extends TransformStream {
401
402
  delta.extra_content = provider_metadata;
402
403
  }
403
404
  return {
404
- id: streamId,
405
- object: "chat.completion.chunk",
406
- created: creationTime,
407
- model,
408
- choices: [
409
- {
410
- index: 0,
411
- delta,
412
- finish_reason: finish_reason ?? null,
413
- },
414
- ],
415
- usage: usage ?? null,
405
+ data: {
406
+ id: streamId,
407
+ object: "chat.completion.chunk",
408
+ created: creationTime,
409
+ model,
410
+ choices: [
411
+ {
412
+ index: 0,
413
+ delta,
414
+ finish_reason: finish_reason ?? null,
415
+ },
416
+ ],
417
+ usage: usage ?? null,
418
+ service_tier: resolveResponseServiceTier(provider_metadata),
419
+ },
416
420
  };
417
421
  };
418
422
  super({
@@ -458,23 +462,56 @@ export class ChatCompletionsStream extends TransformStream {
458
462
  break;
459
463
  }
460
464
  case "error": {
461
- let err;
462
- if (wrapErrors) {
463
- err = toOpenAIError(part.error);
464
- }
465
- else if (part.error instanceof Error) {
466
- err = part.error;
467
- }
468
- else {
469
- err = new Error(String(part.error));
470
- }
471
- controller.enqueue(err);
465
+ controller.enqueue({
466
+ data: part.error instanceof Error ? part.error : new Error(String(part.error)),
467
+ });
472
468
  }
473
469
  }
474
470
  },
475
471
  });
476
472
  }
477
473
  }
474
+ function resolveResponseServiceTier(providerMetadata) {
475
+ if (!providerMetadata)
476
+ return;
477
+ for (const metadata of Object.values(providerMetadata)) {
478
+ const tier = parseReturnedServiceTier(metadata["service_tier"] ??
479
+ metadata["usage_metadata"]?.["traffic_type"]);
480
+ if (tier)
481
+ return tier;
482
+ }
483
+ }
484
+ function parseReturnedServiceTier(value) {
485
+ if (typeof value !== "string")
486
+ return undefined;
487
+ const n = value.toLowerCase();
488
+ switch (n) {
489
+ case "traffic_type_unspecified":
490
+ case "auto":
491
+ return "auto";
492
+ case "default":
493
+ case "on_demand":
494
+ case "on-demand":
495
+ case "shared":
496
+ return "default";
497
+ case "on_demand_flex":
498
+ case "flex":
499
+ return "flex";
500
+ case "on_demand_priority":
501
+ case "priority":
502
+ case "performance":
503
+ return "priority";
504
+ case "provisioned_throughput":
505
+ case "scale":
506
+ case "reserved":
507
+ case "dedicated":
508
+ case "provisioned":
509
+ case "throughput":
510
+ return "scale";
511
+ default:
512
+ return undefined;
513
+ }
514
+ }
478
515
  export const toChatCompletionsAssistantMessage = (result) => {
479
516
  const message = {
480
517
  role: "assistant",
@@ -13,7 +13,7 @@ import { getChatRequestAttributes, getChatResponseAttributes } from "./otel";
13
13
  import { ChatCompletionsBodySchema } from "./schema";
14
14
  export const chatCompletions = (config) => {
15
15
  const hooks = config.hooks;
16
- const handler = async (ctx) => {
16
+ const handler = async (ctx, cfg) => {
17
17
  const start = performance.now();
18
18
  ctx.operation = "chat";
19
19
  addSpanEvent("hebo.handler.started");
@@ -63,7 +63,7 @@ export const chatCompletions = (config) => {
63
63
  ctx.resolvedProviderId = languageModel.provider;
64
64
  logger.debug(`[chat] using ${languageModel.provider} for ${ctx.resolvedModelId}`);
65
65
  addSpanEvent("hebo.provider.resolved");
66
- const genAiSignalLevel = config.telemetry?.signals?.gen_ai;
66
+ const genAiSignalLevel = cfg.telemetry?.signals?.gen_ai;
67
67
  const genAiGeneralAttrs = getGenAiGeneralAttributes(ctx, genAiSignalLevel);
68
68
  setSpanAttributes(genAiGeneralAttrs);
69
69
  // Convert inputs to AI SDK call options.
@@ -88,7 +88,7 @@ export const chatCompletions = (config) => {
88
88
  headers: prepareForwardHeaders(ctx.request),
89
89
  abortSignal: ctx.request.signal,
90
90
  timeout: {
91
- totalMs: 5 * 60 * 1000,
91
+ totalMs: ctx.body.service_tier === "flex" ? cfg.timeouts.flex : cfg.timeouts.normal,
92
92
  },
93
93
  onAbort: () => {
94
94
  throw new DOMException("The operation was aborted.", "AbortError");
@@ -122,7 +122,7 @@ export const chatCompletions = (config) => {
122
122
  model: languageModelWithMiddleware,
123
123
  headers: prepareForwardHeaders(ctx.request),
124
124
  abortSignal: ctx.request.signal,
125
- timeout: 5 * 60 * 1000,
125
+ timeout: ctx.body.service_tier === "flex" ? cfg.timeouts.flex : cfg.timeouts.normal,
126
126
  experimental_include: {
127
127
  requestBody: false,
128
128
  responseBody: false,
@@ -1,5 +1,5 @@
1
1
  import type { Attributes } from "@opentelemetry/api";
2
2
  import type { ChatCompletions, ChatCompletionsBody } from "./schema";
3
3
  import { type TelemetrySignalLevel } from "../../types";
4
- export declare const getChatRequestAttributes: (inputs: ChatCompletionsBody, signalLevel?: TelemetrySignalLevel) => Attributes;
4
+ export declare const getChatRequestAttributes: (body: ChatCompletionsBody, signalLevel?: TelemetrySignalLevel) => Attributes;
5
5
  export declare const getChatResponseAttributes: (completions: ChatCompletions, signalLevel?: TelemetrySignalLevel) => Attributes;
@@ -99,31 +99,32 @@ const toMessageParts = (message) => {
99
99
  throw new Error(`Unhandled content part type: ${message.role}`);
100
100
  }
101
101
  };
102
- export const getChatRequestAttributes = (inputs, signalLevel) => {
102
+ export const getChatRequestAttributes = (body, signalLevel) => {
103
103
  if (!signalLevel || signalLevel === "off")
104
104
  return {};
105
105
  const attrs = {};
106
- if (inputs.seed !== undefined) {
107
- Object.assign(attrs, { "gen_ai.request.seed": inputs.seed });
106
+ if (body.seed !== undefined) {
107
+ Object.assign(attrs, { "gen_ai.request.seed": body.seed });
108
108
  }
109
109
  if (signalLevel !== "required") {
110
110
  Object.assign(attrs, {
111
111
  // FUTURE: add reasoning info
112
- "gen_ai.request.stream": inputs.stream,
113
- "gen_ai.request.frequency_penalty": inputs.frequency_penalty,
114
- "gen_ai.request.max_tokens": inputs.max_completion_tokens,
115
- "gen_ai.request.presence_penalty": inputs.presence_penalty,
116
- "gen_ai.request.stop_sequences": inputs.stop
117
- ? Array.isArray(inputs.stop)
118
- ? inputs.stop
119
- : [inputs.stop]
112
+ "gen_ai.request.stream": body.stream,
113
+ "gen_ai.request.service_tier": body.service_tier,
114
+ "gen_ai.request.frequency_penalty": body.frequency_penalty,
115
+ "gen_ai.request.max_tokens": body.max_completion_tokens,
116
+ "gen_ai.request.presence_penalty": body.presence_penalty,
117
+ "gen_ai.request.stop_sequences": body.stop
118
+ ? Array.isArray(body.stop)
119
+ ? body.stop
120
+ : [body.stop]
120
121
  : undefined,
121
- "gen_ai.request.temperature": inputs.temperature,
122
- "gen_ai.request.top_p": inputs.top_p,
122
+ "gen_ai.request.temperature": body.temperature,
123
+ "gen_ai.request.top_p": body.top_p,
123
124
  });
124
- if (inputs.metadata) {
125
- for (const key in inputs.metadata) {
126
- attrs[`gen_ai.request.metadata.${key}`] = inputs.metadata[key];
125
+ if (body.metadata) {
126
+ for (const key in body.metadata) {
127
+ attrs[`gen_ai.request.metadata.${key}`] = body.metadata[key];
127
128
  }
128
129
  }
129
130
  }
@@ -134,10 +135,10 @@ export const getChatRequestAttributes = (inputs, signalLevel) => {
134
135
  // "gen_ai.system_instructions": inputs.messages
135
136
  // .filter((m) => m.role === "system")
136
137
  // .map((m) => JSON.stringify(toTextPart(m.content))),
137
- "gen_ai.input.messages": inputs.messages
138
+ "gen_ai.input.messages": body.messages
138
139
  //.filter((m) => m.role !== "system")
139
140
  .map((m) => JSON.stringify({ role: m.role, parts: toMessageParts(m) })),
140
- "gen_ai.tool.definitions": inputs.tools?.map((toolDefinition) => JSON.stringify(toolDefinition)),
141
+ "gen_ai.tool.definitions": body.tools?.map((toolDefinition) => JSON.stringify(toolDefinition)),
141
142
  });
142
143
  }
143
144
  return attrs;
@@ -151,6 +152,7 @@ export const getChatResponseAttributes = (completions, signalLevel) => {
151
152
  if (signalLevel !== "required") {
152
153
  Object.assign(attrs, {
153
154
  "gen_ai.response.finish_reasons": completions.choices?.map((c) => c.finish_reason),
155
+ "gen_ai.response.service_tier": completions.service_tier,
154
156
  "gen_ai.usage.total_tokens": completions.usage?.total_tokens,
155
157
  "gen_ai.usage.input_tokens": completions.usage?.prompt_tokens,
156
158
  "gen_ai.usage.cache_read.input_tokens": completions.usage?.prompt_tokens_details?.cached_tokens,