@vellumai/assistant 0.10.1-staging.1 → 0.10.1-staging.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,29 +13,23 @@
13
13
  * 2. Finds a vision-capable profile for captioning via `findVisionProfile`.
14
14
  * If none exists, images are replaced with a fail-open placeholder so the
15
15
  * model at least knows an image was present.
16
- * 3. Persists each image to the workspace attachments directory (content-hash
17
- * deduped) so the original image is accessible to future vision-capable
18
- * turns or subagents.
19
- * 4. Captions each `ImageContent` block through the `vision` call site (with
20
- * an in-memory content-hash cache to avoid re-captioning across turns), and
21
- * replaces the block with `[Image: <caption>] (saved to <path>)`.
16
+ * 3. Replaces each `ImageContent` block with a `[Image …]` text caption via
17
+ * {@link captionImageBlocks} (which also persists the original and caches
18
+ * captions across turns).
22
19
  *
23
- * Fail-open is the dominant error mode: a captioning failure leaves a
24
- * placeholder text block (with the saved image path) rather than the raw
25
- * image (which would cause a provider rejection on a text-only model) or
26
- * dropping the image entirely (which would lose information).
20
+ * The companion `post-tool-use` hook applies the same substitution to images a
21
+ * tool returns (e.g. a browser screenshot).
27
22
  */
28
23
 
29
24
  import {
30
25
  doesSupportVision,
31
26
  getModelProfiles,
32
- type ImageContent,
33
27
  type PluginHookFn,
34
28
  type UserPromptSubmitContext,
35
29
  } from "@vellumai/plugin-api";
36
30
 
37
- import { persistImage } from "../src/image-persist.js";
38
- import { captionImage, findVisionProfile } from "../src/vision-caption.js";
31
+ import { captionImageBlocks } from "../src/caption-blocks.js";
32
+ import { findVisionProfile } from "../src/vision-caption.js";
39
33
 
40
34
  const userPromptSubmit: PluginHookFn<UserPromptSubmitContext> = async (ctx) => {
41
35
  // Resolve the active profile from modelProfileKey, falling back to the
@@ -57,39 +51,11 @@ const userPromptSubmit: PluginHookFn<UserPromptSubmitContext> = async (ctx) => {
57
51
  // Scan all messages for image blocks and replace them with captions.
58
52
  let imageCount = 0;
59
53
  for (const message of ctx.latestMessages) {
60
- for (let i = 0; i < message.content.length; i++) {
61
- const block = message.content[i];
62
- if (block.type !== "image") continue;
63
-
64
- imageCount++;
65
- const image = block as ImageContent;
66
-
67
- // Persist the image to the workspace so it's accessible to future
68
- // vision-capable turns or subagents.
69
- const savedPath = persistImage(
70
- image.source.data,
71
- image.source.media_type,
72
- );
73
-
74
- if (visionProfileKey != null) {
75
- const caption = await captionImage(image, visionProfileKey, ctx.logger);
76
- const pathSuffix = savedPath != null ? ` (saved to ${savedPath})` : "";
77
- message.content[i] = {
78
- type: "text",
79
- text:
80
- caption != null
81
- ? `[Image: ${caption}]${pathSuffix}`
82
- : `[Image: captioning failed — unable to describe]${pathSuffix}`,
83
- };
84
- } else {
85
- // No vision profile configured at all — fail-open placeholder.
86
- const pathSuffix = savedPath != null ? ` (saved to ${savedPath})` : "";
87
- message.content[i] = {
88
- type: "text",
89
- text: `[Image: no vision-capable model configured to describe this image]${pathSuffix}`,
90
- };
91
- }
92
- }
54
+ imageCount += await captionImageBlocks(
55
+ message.content,
56
+ visionProfileKey,
57
+ ctx.logger,
58
+ );
93
59
  }
94
60
 
95
61
  if (imageCount > 0) {
@@ -0,0 +1,77 @@
1
+ /**
2
+ * Shared image→text substitution for the image-fallback plugin's hooks.
3
+ *
4
+ * Two hooks replace `image` content blocks with a text caption when the active
5
+ * model can't process images: `user-prompt-submit` handles user-attached
6
+ * images, and `post-tool-use` handles images a tool returns (e.g. a browser
7
+ * screenshot). This module holds the per-block substitution they share —
8
+ * persist the original image to a known location, caption it via a
9
+ * vision-capable profile, and swap in a `[Image …]` text block.
10
+ *
11
+ * The caption text states up front that the active model can't view images and
12
+ * the image was auto-described to text, so the model treats the block as a
13
+ * derived description rather than a verbatim transcript.
14
+ *
15
+ * Fail-open is the dominant error mode: a captioning failure leaves a
16
+ * placeholder text block rather than the raw image (which a text-only provider
17
+ * would reject) or nothing (which would lose information).
18
+ */
19
+
20
+ import type {
21
+ ContentBlock,
22
+ ImageContent,
23
+ PluginLogger,
24
+ } from "@vellumai/plugin-api";
25
+
26
+ import { persistImage } from "./image-persist.js";
27
+ import { captionImage } from "./vision-caption.js";
28
+
29
+ /**
30
+ * Replace every `image` block in `blocks` (in place) with a text caption so a
31
+ * text-only model can still reason about the image's content. Returns the
32
+ * number of image blocks replaced.
33
+ *
34
+ * @param blocks Content-block array to scan and mutate in place.
35
+ * @param visionProfileKey Key of a vision-capable profile for captioning, or
36
+ * `null` when none is configured (fail-open
37
+ * placeholder).
38
+ * @param logger Turn-scoped logger for attribution.
39
+ */
40
+ export async function captionImageBlocks(
41
+ blocks: ContentBlock[],
42
+ visionProfileKey: string | null,
43
+ logger: PluginLogger,
44
+ ): Promise<number> {
45
+ let imageCount = 0;
46
+
47
+ for (let i = 0; i < blocks.length; i++) {
48
+ const block = blocks[i];
49
+ if (block.type !== "image") continue;
50
+
51
+ imageCount++;
52
+ const image = block as ImageContent;
53
+
54
+ // Persist the original to a known, content-hash-deduped location so it
55
+ // survives the text substitution and stays findable on disk.
56
+ persistImage(image.source.data, image.source.media_type);
57
+
58
+ if (visionProfileKey != null) {
59
+ const caption = await captionImage(image, visionProfileKey, logger);
60
+ blocks[i] = {
61
+ type: "text",
62
+ text:
63
+ caption != null
64
+ ? `[Image auto-described for text-only model: ${caption}]`
65
+ : `[Image: auto-description failed (text-only model)]`,
66
+ };
67
+ } else {
68
+ // No vision profile configured at all — fail-open placeholder.
69
+ blocks[i] = {
70
+ type: "text",
71
+ text: `[Image: no vision-capable model configured to describe it]`,
72
+ };
73
+ }
74
+ }
75
+
76
+ return imageCount;
77
+ }
@@ -2,10 +2,10 @@
2
2
  * Persist image data to the workspace attachments directory.
3
3
  *
4
4
  * When the active model is text-only, the image-fallback plugin captions the
5
- * image and substitutes a text block. Saving the raw image to disk and
6
- * referencing the path in the caption text means a future turn with a
7
- * vision-capable model (or a subagent) could still access the original image
8
- * via file_read, and the user can find the image at a known location.
5
+ * image and substitutes a text block. Saving the raw image to a known,
6
+ * content-hash-deduped location means the original survives the text
7
+ * substitution and stays findable on disk for the user (or a subagent with a
8
+ * vision-capable model that reads it via file_read).
9
9
  */
10
10
 
11
11
  import { existsSync, mkdirSync, writeFileSync } from "node:fs";
@@ -36,10 +36,7 @@ function extensionForMediaType(mediaType: string): string {
36
36
  * Save an image's base64 data to the attachments dir if not already present.
37
37
  * Returns the absolute file path, or `null` when the write fails.
38
38
  */
39
- export function persistImage(
40
- data: string,
41
- mediaType: string,
42
- ): string | null {
39
+ export function persistImage(data: string, mediaType: string): string | null {
43
40
  try {
44
41
  mkdirSync(ATTACHMENTS_DIR, { recursive: true });
45
42
 
@@ -47,6 +47,7 @@ import historyRepairStop from "./history-repair/hooks/stop.js";
47
47
  import historyRepairUserPromptSubmit from "./history-repair/hooks/user-prompt-submit.js";
48
48
  import historyRepairPkg from "./history-repair/package.json" with { type: "json" };
49
49
  import { resetRepairStateStoreForTests } from "./history-repair/repair-state-store.js";
50
+ import imageFallbackPostToolUse from "./image-fallback/hooks/post-tool-use.js";
50
51
  import imageFallbackUserPromptSubmit from "./image-fallback/hooks/user-prompt-submit.js";
51
52
  import imageFallbackPkg from "./image-fallback/package.json" with { type: "json" };
52
53
  import { resetCaptionCacheForTests } from "./image-fallback/src/caption-cache.js";
@@ -81,12 +82,14 @@ import toolResultTruncatePostToolUse from "./tool-result-truncate/hooks/post-too
81
82
  import toolResultTruncatePkg from "./tool-result-truncate/package.json" with { type: "json" };
82
83
 
83
84
  /**
84
- * `image-fallback` — a `user-prompt-submit` hook that captions image blocks via
85
- * a vision-capable profile when the active model is text-only, substituting the
86
- * caption as a `[Image: <caption>]` text block so the model can still reason
87
- * about the image's content. Self-gates on `isNonInteractive`; fail-open with a
88
- * placeholder when no vision profile is configured or captioning fails. An
89
- * in-memory content-hash cache avoids re-captioning the same image across turns.
85
+ * `image-fallback` — captions image blocks via a vision-capable profile when
86
+ * the active model is text-only, substituting the caption as an `[Image …]`
87
+ * text block so the model can still reason about the image's content. The
88
+ * `user-prompt-submit` hook handles user-attached images; the `post-tool-use`
89
+ * hook handles images a tool returns (e.g. a browser screenshot) nested in the
90
+ * tool result's `contentBlocks`. Fail-open with a placeholder when no vision
91
+ * profile is configured or captioning fails. An in-memory content-hash cache
92
+ * avoids re-captioning the same image across turns.
90
93
  */
91
94
  export const defaultImageFallbackPlugin: Plugin = {
92
95
  manifest: {
@@ -95,6 +98,7 @@ export const defaultImageFallbackPlugin: Plugin = {
95
98
  },
96
99
  hooks: {
97
100
  "user-prompt-submit": imageFallbackUserPromptSubmit,
101
+ "post-tool-use": imageFallbackPostToolUse,
98
102
  },
99
103
  };
100
104
 
@@ -49,6 +49,7 @@ interface ProviderCall {
49
49
  options: SendMessageOptions | undefined;
50
50
  }
51
51
  const providerCalls: ProviderCall[] = [];
52
+ const warnCalls: Array<{ args: unknown[] }> = [];
52
53
 
53
54
  mock.module("../../../../providers/provider-send-message.js", () => ({
54
55
  getConfiguredProvider: async () => providerStub,
@@ -57,10 +58,12 @@ mock.module("../../../../providers/provider-send-message.js", () => ({
57
58
  }));
58
59
 
59
60
  mock.module("../../../../util/logger.js", () => ({
60
- getLogger: () =>
61
- new Proxy({} as Record<string, unknown>, {
62
- get: (_t, prop) => (prop === "child" ? () => ({}) : () => {}),
61
+ getLogger: () => ({
62
+ warn: (...args: unknown[]) => warnCalls.push({ args }),
63
+ child: () => ({
64
+ warn: (...args: unknown[]) => warnCalls.push({ args }),
63
65
  }),
66
+ }),
64
67
  }));
65
68
 
66
69
  const { selectPool, MemoryV3RetrievalUnavailableError } =
@@ -97,10 +100,21 @@ function noToolResponse(): ProviderResponse {
97
100
  model: "stub-model",
98
101
  stopReason: "end_turn",
99
102
  usage: { inputTokens: 0, outputTokens: 0 },
103
+ rawRequest: { model: "MiniMaxAI/MiniMax-M3" },
104
+ rawResponse: { model: "accounts/fireworks/models/minimax-m3" },
100
105
  content: [{ type: "text", text: "no tool call" }],
101
106
  };
102
107
  }
103
108
 
109
+ function wrongToolResponse(): ProviderResponse {
110
+ return {
111
+ model: "stub-model",
112
+ stopReason: "tool_use",
113
+ usage: { inputTokens: 0, outputTokens: 0 },
114
+ content: [{ type: "tool_use", id: "tu-1", name: "wrong_tool", input: {} }],
115
+ };
116
+ }
117
+
104
118
  /** Provider returning a different response per call (the i-th call returns
105
119
  * responses[i], or the last entry once exhausted). */
106
120
  function makeSequenceProvider(responses: ProviderResponse[]): Provider {
@@ -118,12 +132,12 @@ function makeSequenceProvider(responses: ProviderResponse[]): Provider {
118
132
 
119
133
  /** Provider that records each call and then throws — the throw-after-retries
120
134
  * path (the provider's own RetryProvider has already exhausted its backoff). */
121
- function makeThrowingProvider(): Provider {
135
+ function makeThrowingProvider(message = "boom"): Provider {
122
136
  return {
123
137
  name: "throwing",
124
138
  sendMessage: async (messages, options) => {
125
139
  providerCalls.push({ messages, options });
126
- throw new Error("boom");
140
+ throw new Error(message);
127
141
  },
128
142
  };
129
143
  }
@@ -167,9 +181,19 @@ function sentBlocks(callIndex = 0): RenderedBlock[] {
167
181
  .content as unknown as RenderedBlock[];
168
182
  }
169
183
 
184
+ function warnPayloads(): Array<Record<string, unknown>> {
185
+ return warnCalls
186
+ .map((call) => call.args[0])
187
+ .filter(
188
+ (payload): payload is Record<string, unknown> =>
189
+ payload !== null && typeof payload === "object",
190
+ );
191
+ }
192
+
170
193
  beforeEach(() => {
171
194
  providerStub = null;
172
195
  providerCalls.length = 0;
196
+ warnCalls.length = 0;
173
197
  });
174
198
 
175
199
  // ---------------------------------------------------------------------------
@@ -250,6 +274,62 @@ describe("selectPool — infrastructure failures throw", () => {
250
274
  MemoryV3RetrievalUnavailableError,
251
275
  );
252
276
  expect(providerCalls).toHaveLength(3);
277
+ const payloads = warnPayloads();
278
+ const attemptPayloads = payloads.filter(
279
+ (payload) => payload.reason === "missing_tool_use",
280
+ );
281
+ expect(attemptPayloads).toHaveLength(3);
282
+ expect(attemptPayloads[0]).toMatchObject({
283
+ attempt: 1,
284
+ reason: "missing_tool_use",
285
+ providerName: "stub",
286
+ candidateCount: 4,
287
+ stableCount: 2,
288
+ finderCount: 2,
289
+ response: {
290
+ model: "stub-model",
291
+ stopReason: "end_turn",
292
+ requestModel: "MiniMaxAI/MiniMax-M3",
293
+ responseModel: "accounts/fireworks/models/minimax-m3",
294
+ contentBlockTypes: ["text"],
295
+ toolUseNames: [],
296
+ },
297
+ });
298
+ const aggregatePayload = payloads.find((payload) =>
299
+ Array.isArray(payload.failures),
300
+ );
301
+ expect(aggregatePayload?.providerName).toBe("stub");
302
+ const failures = aggregatePayload?.failures as
303
+ | Array<Record<string, unknown>>
304
+ | undefined;
305
+ expect(failures?.[0]).toMatchObject({ reason: "missing_tool_use" });
306
+ });
307
+
308
+ test("wrong tool_use name logs the unexpected name before throwing", async () => {
309
+ providerStub = makeProvider(wrongToolResponse());
310
+ await expect(selectPool(makePool(), makeTurn("x"))).rejects.toThrow(
311
+ MemoryV3RetrievalUnavailableError,
312
+ );
313
+ expect(providerCalls).toHaveLength(3);
314
+ expect(
315
+ warnPayloads().filter(
316
+ (payload) => payload.reason === "unexpected_tool_name",
317
+ ),
318
+ ).toEqual([
319
+ expect.objectContaining({
320
+ attempt: 1,
321
+ reason: "unexpected_tool_name",
322
+ providerName: "stub",
323
+ toolName: "wrong_tool",
324
+ response: expect.objectContaining({
325
+ stopReason: "tool_use",
326
+ contentBlockTypes: ["tool_use"],
327
+ toolUseNames: ["wrong_tool"],
328
+ }),
329
+ }),
330
+ expect.objectContaining({ attempt: 2 }),
331
+ expect.objectContaining({ attempt: 3 }),
332
+ ]);
253
333
  });
254
334
 
255
335
  test("schema mismatch → throws after retrying", async () => {
@@ -258,6 +338,17 @@ describe("selectPool — infrastructure failures throw", () => {
258
338
  MemoryV3RetrievalUnavailableError,
259
339
  );
260
340
  expect(providerCalls).toHaveLength(3);
341
+ expect(
342
+ warnPayloads().filter((payload) => payload.reason === "schema_mismatch"),
343
+ ).toEqual([
344
+ expect.objectContaining({
345
+ attempt: 1,
346
+ reason: "schema_mismatch",
347
+ schemaIssues: [expect.objectContaining({ path: "ids" })],
348
+ }),
349
+ expect.objectContaining({ attempt: 2 }),
350
+ expect.objectContaining({ attempt: 3 }),
351
+ ]);
261
352
  });
262
353
 
263
354
  test("provider throw → throws after retrying", async () => {
@@ -266,6 +357,44 @@ describe("selectPool — infrastructure failures throw", () => {
266
357
  MemoryV3RetrievalUnavailableError,
267
358
  );
268
359
  expect(providerCalls).toHaveLength(3);
360
+ expect(
361
+ warnPayloads().filter((payload) => payload.reason === "provider_error"),
362
+ ).toEqual([
363
+ expect.objectContaining({
364
+ attempt: 1,
365
+ reason: "provider_error",
366
+ providerName: "throwing",
367
+ error: { name: "Error", message: "boom" },
368
+ }),
369
+ expect.objectContaining({ attempt: 2 }),
370
+ expect.objectContaining({ attempt: 3 }),
371
+ ]);
372
+ });
373
+
374
+ test("provider throw redacts sensitive message details in diagnostics", async () => {
375
+ const providerSecret = ["sk-proj-", "a".repeat(40)].join("");
376
+ const message = `provider rejected Authorization: Bearer ${providerSecret}`;
377
+ providerStub = makeThrowingProvider(message);
378
+
379
+ let thrown: unknown;
380
+ try {
381
+ await selectPool(makePool(), makeTurn("x"));
382
+ } catch (error) {
383
+ thrown = error;
384
+ }
385
+
386
+ expect(thrown).toBeInstanceOf(MemoryV3RetrievalUnavailableError);
387
+ expect((thrown as Error).message).not.toContain(providerSecret);
388
+ expect((thrown as Error).message).toContain("[REDACTED]");
389
+
390
+ const providerErrors = warnPayloads().filter(
391
+ (payload) => payload.reason === "provider_error",
392
+ );
393
+ const error = providerErrors[0]?.error as
394
+ | Record<string, unknown>
395
+ | undefined;
396
+ expect(error?.message).not.toContain(providerSecret);
397
+ expect(error?.message).toContain("[REDACTED]");
269
398
  });
270
399
 
271
400
  test("a malformed response that recovers on retry returns its pages", async () => {