@hebo-ai/gateway 0.5.2 → 0.6.0-rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +32 -1
  2. package/dist/endpoints/chat-completions/converters.js +100 -18
  3. package/dist/endpoints/chat-completions/handler.js +6 -2
  4. package/dist/endpoints/chat-completions/index.d.ts +1 -0
  5. package/dist/endpoints/chat-completions/index.js +1 -0
  6. package/dist/endpoints/chat-completions/otel.js +1 -0
  7. package/dist/endpoints/chat-completions/schema.d.ts +244 -0
  8. package/dist/endpoints/chat-completions/schema.js +36 -4
  9. package/dist/endpoints/embeddings/handler.js +3 -1
  10. package/dist/endpoints/embeddings/index.d.ts +1 -0
  11. package/dist/endpoints/embeddings/index.js +1 -0
  12. package/dist/lifecycle.js +2 -2
  13. package/dist/middleware/debug.d.ts +3 -0
  14. package/dist/middleware/debug.js +27 -0
  15. package/dist/middleware/matcher.js +2 -0
  16. package/dist/models/anthropic/middleware.d.ts +1 -0
  17. package/dist/models/anthropic/middleware.js +17 -1
  18. package/dist/models/google/middleware.d.ts +1 -0
  19. package/dist/models/google/middleware.js +18 -1
  20. package/dist/models/openai/middleware.d.ts +1 -0
  21. package/dist/models/openai/middleware.js +23 -1
  22. package/dist/providers/bedrock/middleware.d.ts +1 -0
  23. package/dist/providers/bedrock/middleware.js +52 -1
  24. package/dist/telemetry/fetch.d.ts +1 -1
  25. package/dist/telemetry/fetch.js +23 -3
  26. package/dist/telemetry/index.d.ts +1 -0
  27. package/dist/telemetry/index.js +1 -0
  28. package/package.json +17 -12
  29. package/src/endpoints/chat-completions/converters.test.ts +85 -1
  30. package/src/endpoints/chat-completions/converters.ts +139 -18
  31. package/src/endpoints/chat-completions/handler.test.ts +2 -0
  32. package/src/endpoints/chat-completions/index.ts +1 -0
  33. package/src/endpoints/chat-completions/otel.ts +1 -0
  34. package/src/endpoints/chat-completions/schema.ts +38 -4
  35. package/src/endpoints/embeddings/index.ts +1 -0
  36. package/src/lifecycle.ts +2 -2
  37. package/src/models/anthropic/middleware.test.ts +45 -1
  38. package/src/models/anthropic/middleware.ts +21 -1
  39. package/src/models/google/middleware.test.ts +30 -1
  40. package/src/models/google/middleware.ts +20 -1
  41. package/src/models/openai/middleware.test.ts +32 -1
  42. package/src/models/openai/middleware.ts +25 -1
  43. package/src/providers/bedrock/middleware.test.ts +121 -1
  44. package/src/providers/bedrock/middleware.ts +61 -1
  45. package/src/telemetry/fetch.ts +31 -4
  46. package/src/telemetry/index.ts +1 -0
package/README.md CHANGED
@@ -38,7 +38,7 @@ bun install @hebo-ai/gateway
38
38
  - Runtime Support
39
39
  - [Vercel Edge](#vercel-edge) | [Cloudflare Workers](#cloudflare-workers) | [Deno Deploy](#deno-deploy) | [AWS Lambda](#aws-lambda)
40
40
  - OpenAI Extensions
41
- - [Reasoning](#reasoning)
41
+ - [Reasoning](#reasoning) | [Prompt Caching](#prompt-caching)
42
42
  - Advanced Usage
43
43
  - [Passing Framework State to Hooks](#passing-framework-state-to-hooks) | [Selective Route Mounting](#selective-route-mounting) | [Low-level Schemas & Converters](#low-level-schemas--converters)
44
44
 
@@ -565,6 +565,37 @@ Advanced models (like Anthropic Claude 3.7 or Gemini 3) surface structured reaso
565
565
 
566
566
  For **Gemini 3** models, returning the thought signature via `extra_content` is mandatory to resume the chain-of-thought; failing to do so may result in errors or degraded performance.
567
567
 
568
+ ### Prompt Caching
569
+
570
+ The chat completions endpoint supports both implicit (provider-managed) and explicit prompt caching across OpenAI-compatible providers.
571
+
572
+ Accepted request fields:
573
+
574
+ - `prompt_cache_key` + `prompt_cache_retention` (OpenAI style)
575
+ - `cache_control` (OpenRouter / Vercel / Claude style)
576
+ - `cached_content` (Gemini style)
577
+
578
+ ```json
579
+ {
580
+ "model": "anthropic/claude-sonnet-4.6",
581
+ "messages": [
582
+ {
583
+ "role": "system",
584
+ "content": "Reusable policy and instructions",
585
+ "cache_control": { "type": "ephemeral", "ttl": "1h" }
586
+ },
587
+ { "role": "user", "content": "Apply policy to this request." }
588
+ ]
589
+ }
590
+ ```
591
+
592
+ Provider behavior:
593
+
594
+ - **OpenAI-compatible**: forwards `prompt_cache_key` and `prompt_cache_retention` as native provider options.
595
+ - **Anthropic Claude**: maps top-level caching to Anthropic cache control, while message/part `cache_control` breakpoints are preserved.
596
+ - **Google Gemini**: maps `cached_content` to Gemini `cachedContent`.
597
+ - **Amazon Nova (Bedrock)**: maps `cache_control` to Bedrock `cachePoints` and inserts an automatic cache point on a stable prefix when none is provided.
598
+
568
599
  ## 🧪 Advanced Usage
569
600
 
570
601
  ### Logger Settings
@@ -5,8 +5,9 @@ import { OpenAIError, toOpenAIError } from "../../errors/openai";
5
5
  import { toResponse } from "../../utils/response";
6
6
  // --- Request Flow ---
7
7
  export function convertToTextCallOptions(params) {
8
- const { messages, tools, tool_choice, temperature, max_tokens, max_completion_tokens, response_format, reasoning_effort, reasoning, frequency_penalty, presence_penalty, seed, stop, top_p, ...rest } = params;
8
+ const { messages, tools, tool_choice, temperature, max_tokens, max_completion_tokens, response_format, reasoning_effort, reasoning, prompt_cache_key, prompt_cache_retention, cached_content, cache_control, frequency_penalty, presence_penalty, seed, stop, top_p, ...rest } = params;
9
9
  Object.assign(rest, parseReasoningOptions(reasoning_effort, reasoning));
10
+ Object.assign(rest, parsePromptCachingOptions(prompt_cache_key, prompt_cache_retention, cached_content, cache_control));
10
11
  const { toolChoice, activeTools } = convertToToolChoiceOptions(tool_choice);
11
12
  return {
12
13
  messages: convertToModelMessages(messages),
@@ -44,6 +45,11 @@ export function convertToModelMessages(messages) {
44
45
  if (message.role === "tool")
45
46
  continue;
46
47
  if (message.role === "system") {
48
+ if (message.cache_control) {
49
+ message.providerOptions = {
50
+ unknown: { cache_control: message.cache_control },
51
+ };
52
+ }
47
53
  modelMessages.push(message);
48
54
  continue;
49
55
  }
@@ -67,15 +73,21 @@ function indexToolMessages(messages) {
67
73
  return map;
68
74
  }
69
75
  export function fromChatCompletionsUserMessage(message) {
70
- return {
76
+ const out = {
71
77
  role: "user",
72
78
  content: Array.isArray(message.content)
73
79
  ? fromChatCompletionsContent(message.content)
74
80
  : message.content,
75
81
  };
82
+ if (message.cache_control) {
83
+ out.providerOptions = {
84
+ unknown: { cache_control: message.cache_control },
85
+ };
86
+ }
87
+ return out;
76
88
  }
77
89
  export function fromChatCompletionsAssistantMessage(message) {
78
- const { tool_calls, role, content, extra_content, reasoning_details } = message;
90
+ const { tool_calls, role, content, extra_content, reasoning_details, cache_control } = message;
79
91
  const parts = [];
80
92
  if (reasoning_details?.length) {
81
93
  for (const detail of reasoning_details) {
@@ -111,10 +123,16 @@ export function fromChatCompletionsAssistantMessage(message) {
111
123
  : content;
112
124
  for (const part of inputContent) {
113
125
  if (part.type === "text") {
114
- parts.push({
126
+ const textPart = {
115
127
  type: "text",
116
128
  text: part.text,
117
- });
129
+ };
130
+ if (part.cache_control) {
131
+ textPart.providerOptions = {
132
+ unknown: { cache_control: part.cache_control },
133
+ };
134
+ }
135
+ parts.push(textPart);
118
136
  }
119
137
  }
120
138
  }
@@ -141,6 +159,9 @@ export function fromChatCompletionsAssistantMessage(message) {
141
159
  if (extra_content) {
142
160
  out.providerOptions = extra_content;
143
161
  }
162
+ if (cache_control) {
163
+ ((out.providerOptions ??= { unknown: {} })["unknown"] ??= {})["cache_control"] = cache_control;
164
+ }
144
165
  return out;
145
166
  }
146
167
  export function fromChatCompletionsToolResultMessage(message, toolById) {
@@ -165,40 +186,68 @@ export function fromChatCompletionsContent(content) {
165
186
  return content.map((part) => {
166
187
  switch (part.type) {
167
188
  case "image_url":
168
- return fromImageUrlPart(part.image_url.url);
189
+ return fromImageUrlPart(part.image_url.url, part.cache_control);
169
190
  case "file":
170
- return fromFilePart(part.file.data, part.file.media_type, part.file.filename);
191
+ return fromFilePart(part.file.data, part.file.media_type, part.file.filename, part.cache_control);
171
192
  case "input_audio":
172
- return fromFilePart(part.input_audio.data, `audio/${part.input_audio.format}`);
173
- default:
174
- return part;
193
+ return fromFilePart(part.input_audio.data, `audio/${part.input_audio.format}`, undefined, part.cache_control);
194
+ default: {
195
+ const out = {
196
+ type: "text",
197
+ text: part.text,
198
+ };
199
+ if (part.cache_control) {
200
+ out.providerOptions = {
201
+ unknown: { cache_control: part.cache_control },
202
+ };
203
+ }
204
+ return out;
205
+ }
175
206
  }
176
207
  });
177
208
  }
178
- function fromImageUrlPart(url) {
209
+ function fromImageUrlPart(url, cacheControl) {
179
210
  if (url.startsWith("data:")) {
180
211
  const { mimeType, base64Data } = parseDataUrl(url);
181
- return fromFilePart(base64Data, mimeType);
212
+ return fromFilePart(base64Data, mimeType, undefined, cacheControl);
182
213
  }
183
- return {
214
+ const out = {
184
215
  type: "image",
185
216
  image: new URL(url),
186
217
  };
218
+ if (cacheControl) {
219
+ out.providerOptions = {
220
+ unknown: { cache_control: cacheControl },
221
+ };
222
+ }
223
+ return out;
187
224
  }
188
- function fromFilePart(base64Data, mediaType, filename) {
225
+ function fromFilePart(base64Data, mediaType, filename, cacheControl) {
189
226
  if (mediaType.startsWith("image/")) {
190
- return {
227
+ const out = {
191
228
  type: "image",
192
229
  image: z.util.base64ToUint8Array(base64Data),
193
230
  mediaType,
194
231
  };
232
+ if (cacheControl) {
233
+ out.providerOptions = {
234
+ unknown: { cache_control: cacheControl },
235
+ };
236
+ }
237
+ return out;
195
238
  }
196
- return {
239
+ const out = {
197
240
  type: "file",
198
241
  data: z.util.base64ToUint8Array(base64Data),
199
242
  filename,
200
243
  mediaType,
201
244
  };
245
+ if (cacheControl) {
246
+ out.providerOptions = {
247
+ unknown: { cache_control: cacheControl },
248
+ };
249
+ }
250
+ return out;
202
251
  }
203
252
  export const convertToToolSet = (tools) => {
204
253
  if (!tools) {
@@ -295,6 +344,31 @@ function parseReasoningOptions(reasoning_effort, reasoning) {
295
344
  }
296
345
  return out;
297
346
  }
347
+ function parsePromptCachingOptions(prompt_cache_key, prompt_cache_retention, cached_content, cache_control) {
348
+ const out = {};
349
+ const syncedCacheKey = prompt_cache_key ?? cached_content;
350
+ const syncedCachedContent = cached_content ?? prompt_cache_key;
351
+ let syncedCacheRetention = prompt_cache_retention;
352
+ if (!syncedCacheRetention && cache_control?.ttl) {
353
+ syncedCacheRetention = cache_control.ttl === "24h" ? "24h" : "in_memory";
354
+ }
355
+ let syncedCacheControl = cache_control;
356
+ if (!syncedCacheControl && syncedCacheRetention) {
357
+ syncedCacheControl = {
358
+ type: "ephemeral",
359
+ ttl: syncedCacheRetention === "24h" ? "24h" : "5m",
360
+ };
361
+ }
362
+ if (syncedCacheKey)
363
+ out["prompt_cache_key"] = syncedCacheKey;
364
+ if (syncedCacheRetention)
365
+ out["prompt_cache_retention"] = syncedCacheRetention;
366
+ if (syncedCachedContent)
367
+ out["cached_content"] = syncedCachedContent;
368
+ if (syncedCacheControl)
369
+ out["cache_control"] = syncedCacheControl;
370
+ return out;
371
+ }
298
372
  // --- Response Flow ---
299
373
  export function toChatCompletions(result, model) {
300
374
  return {
@@ -494,8 +568,16 @@ export function toChatCompletionsUsage(usage) {
494
568
  if (reasoning !== undefined)
495
569
  out.completion_tokens_details = { reasoning_tokens: reasoning };
496
570
  const cached = usage.inputTokenDetails?.cacheReadTokens;
497
- if (cached !== undefined)
498
- out.prompt_tokens_details = { cached_tokens: cached };
571
+ const cacheWrite = usage.inputTokenDetails?.cacheWriteTokens;
572
+ if (cached !== undefined || cacheWrite !== undefined) {
573
+ out.prompt_tokens_details = {};
574
+ if (cached !== undefined) {
575
+ out.prompt_tokens_details.cached_tokens = cached;
576
+ }
577
+ if (cacheWrite !== undefined) {
578
+ out.prompt_tokens_details.cache_write_tokens = cacheWrite;
579
+ }
580
+ }
499
581
  return out;
500
582
  }
501
583
  export function toChatCompletionsToolCall(id, name, args, providerMetadata) {
@@ -28,6 +28,7 @@ export const chatCompletions = (config) => {
28
28
  catch {
29
29
  throw new GatewayError("Invalid JSON", 400);
30
30
  }
31
+ logger.trace({ requestId: ctx.requestId, body: ctx.body }, "[chat] ChatCompletionsBody");
31
32
  addSpanEvent("hebo.request.deserialized");
32
33
  const parsed = ChatCompletionsBodySchema.safeParse(ctx.body);
33
34
  if (!parsed.success) {
@@ -37,7 +38,8 @@ export const chatCompletions = (config) => {
37
38
  ctx.body = parsed.data;
38
39
  addSpanEvent("hebo.request.parsed");
39
40
  if (hooks?.before) {
40
- ctx.body = (await hooks.before(ctx)) ?? ctx.body;
41
+ ctx.body =
42
+ (await hooks.before(ctx)) ?? ctx.body;
41
43
  addSpanEvent("hebo.hooks.before.completed");
42
44
  }
43
45
  // Resolve model + provider (hooks may override defaults).
@@ -70,7 +72,7 @@ export const chatCompletions = (config) => {
70
72
  options: textOptions,
71
73
  }, "[chat] AI SDK options");
72
74
  addSpanEvent("hebo.options.prepared");
73
- setSpanAttributes(getChatRequestAttributes(inputs, genAiSignalLevel));
75
+ setSpanAttributes(getChatRequestAttributes(ctx.body, genAiSignalLevel));
74
76
  // Build middleware chain (model -> forward params -> provider).
75
77
  const languageModelWithMiddleware = wrapLanguageModel({
76
78
  model: languageModel,
@@ -93,6 +95,7 @@ export const chatCompletions = (config) => {
93
95
  onFinish: (res) => {
94
96
  addSpanEvent("hebo.ai-sdk.completed");
95
97
  const streamResult = toChatCompletions(res, ctx.resolvedModelId);
98
+ logger.trace({ requestId: ctx.requestId, result: streamResult }, "[chat] ChatCompletions");
96
99
  addSpanEvent("hebo.result.transformed");
97
100
  const genAiResponseAttrs = getChatResponseAttributes(streamResult, genAiSignalLevel);
98
101
  setSpanAttributes(genAiResponseAttrs);
@@ -129,6 +132,7 @@ export const chatCompletions = (config) => {
129
132
  addSpanEvent("hebo.ai-sdk.completed");
130
133
  // Transform result.
131
134
  ctx.result = toChatCompletions(result, ctx.resolvedModelId);
135
+ logger.trace({ requestId: ctx.requestId, result: ctx.result }, "[chat] ChatCompletions");
132
136
  addSpanEvent("hebo.result.transformed");
133
137
  const genAiResponseAttrs = getChatResponseAttributes(ctx.result, genAiSignalLevel);
134
138
  setSpanAttributes(genAiResponseAttrs);
@@ -1,3 +1,4 @@
1
1
  export * from "./converters";
2
2
  export * from "./handler";
3
3
  export * from "./schema";
4
+ export * from "./otel";
@@ -1,3 +1,4 @@
1
1
  export * from "./converters";
2
2
  export * from "./handler";
3
3
  export * from "./schema";
4
+ export * from "./otel";
@@ -77,6 +77,7 @@ export const getChatRequestAttributes = (inputs, signalLevel) => {
77
77
  }
78
78
  if (signalLevel !== "required") {
79
79
  Object.assign(attrs, {
80
+ // FUTURE: add reasoning info
80
81
  "gen_ai.request.stream": inputs.stream,
81
82
  "gen_ai.request.frequency_penalty": inputs.frequency_penalty,
82
83
  "gen_ai.request.max_tokens": inputs.max_completion_tokens,