@polpo-ai/tools 0.6.31 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/dist/__tests__/email-tools.test.d.ts +2 -0
  2. package/dist/__tests__/email-tools.test.d.ts.map +1 -0
  3. package/dist/__tests__/email-tools.test.js +705 -0
  4. package/dist/__tests__/email-tools.test.js.map +1 -0
  5. package/dist/__tests__/extended-tools.test.d.ts +2 -0
  6. package/dist/__tests__/extended-tools.test.d.ts.map +1 -0
  7. package/dist/__tests__/extended-tools.test.js +743 -0
  8. package/dist/__tests__/extended-tools.test.js.map +1 -0
  9. package/dist/__tests__/external-api-tools.test.d.ts +2 -0
  10. package/dist/__tests__/external-api-tools.test.d.ts.map +1 -0
  11. package/dist/__tests__/external-api-tools.test.js +1731 -0
  12. package/dist/__tests__/external-api-tools.test.js.map +1 -0
  13. package/dist/__tests__/memory-tools.test.d.ts +2 -0
  14. package/dist/__tests__/memory-tools.test.d.ts.map +1 -0
  15. package/dist/__tests__/memory-tools.test.js +0 -0
  16. package/dist/__tests__/memory-tools.test.js.map +1 -0
  17. package/dist/__tests__/system-tools.test.d.ts +2 -0
  18. package/dist/__tests__/system-tools.test.d.ts.map +1 -0
  19. package/dist/__tests__/system-tools.test.js +417 -0
  20. package/dist/__tests__/system-tools.test.js.map +1 -0
  21. package/dist/adapters/node-shell.d.ts +9 -0
  22. package/dist/adapters/node-shell.d.ts.map +1 -1
  23. package/dist/adapters/node-shell.js +40 -9
  24. package/dist/adapters/node-shell.js.map +1 -1
  25. package/dist/audio-tools.d.ts +25 -27
  26. package/dist/audio-tools.d.ts.map +1 -1
  27. package/dist/audio-tools.js +156 -438
  28. package/dist/audio-tools.js.map +1 -1
  29. package/dist/browser-tools.d.ts.map +1 -1
  30. package/dist/browser-tools.js +5 -1
  31. package/dist/browser-tools.js.map +1 -1
  32. package/dist/email-tools.d.ts.map +1 -1
  33. package/dist/email-tools.js +11 -3
  34. package/dist/email-tools.js.map +1 -1
  35. package/dist/image-tools.d.ts +27 -25
  36. package/dist/image-tools.d.ts.map +1 -1
  37. package/dist/image-tools.js +151 -332
  38. package/dist/image-tools.js.map +1 -1
  39. package/dist/index.d.ts +1 -2
  40. package/dist/index.d.ts.map +1 -1
  41. package/dist/index.js +3 -2
  42. package/dist/index.js.map +1 -1
  43. package/dist/lib/edge-speech-model.d.ts +61 -0
  44. package/dist/lib/edge-speech-model.d.ts.map +1 -0
  45. package/dist/lib/edge-speech-model.js +144 -0
  46. package/dist/lib/edge-speech-model.js.map +1 -0
  47. package/dist/lib/exa-search-provider.d.ts +27 -0
  48. package/dist/lib/exa-search-provider.d.ts.map +1 -0
  49. package/dist/lib/exa-search-provider.js +109 -0
  50. package/dist/lib/exa-search-provider.js.map +1 -0
  51. package/dist/lib/provider-resolver.d.ts +54 -0
  52. package/dist/lib/provider-resolver.d.ts.map +1 -0
  53. package/dist/lib/provider-resolver.js +115 -0
  54. package/dist/lib/provider-resolver.js.map +1 -0
  55. package/dist/search-tools.d.ts +10 -13
  56. package/dist/search-tools.d.ts.map +1 -1
  57. package/dist/search-tools.js +63 -140
  58. package/dist/search-tools.js.map +1 -1
  59. package/dist/system-tools.d.ts +19 -5
  60. package/dist/system-tools.d.ts.map +1 -1
  61. package/dist/system-tools.js +48 -31
  62. package/dist/system-tools.js.map +1 -1
  63. package/package.json +16 -4
  64. package/dist/phone-tools.d.ts +0 -27
  65. package/dist/phone-tools.d.ts.map +0 -1
  66. package/dist/phone-tools.js +0 -577
  67. package/dist/phone-tools.js.map +0 -1
@@ -1,49 +1,50 @@
1
1
  /**
2
2
  * Image & video tools for generation and vision/analysis.
3
3
  *
4
- * Provides agent capabilities to:
5
- * - Generate images from text prompts (image_generate) — via fal.ai
6
- * - Generate videos from text prompts (video_generate) — via fal.ai
7
- * - Analyze/describe images using vision models (image_analyze) via OpenAI/Anthropic
4
+ * Architecture: thin wrappers over the Vercel AI SDK v6.
5
+ * - image_generate → `generateImage` against a configurable provider
6
+ * - video_generate → `experimental_generateVideo` against a configurable provider
7
+ * - image_analyze → `generateText` (multimodal) against a configurable provider
8
8
  *
9
- * Architecture: direct fetch() to provider REST APIs zero vendor SDK dependencies.
9
+ * Model selection: each tool picks its model in this order:
10
+ * 1. per-call `model` input parameter (`<provider>/<model>` string),
11
+ * 2. agent-config default passed to the factory (image/video/vision),
12
+ * 3. hardcoded fallback constant from @polpo-ai/core.
10
13
  *
11
- * Providers:
12
- * Image generation: fal.ai (FLUX models fal-ai/flux/dev default)
13
- * Video generation: fal.ai (Wan 2.2 fal-ai/wan/v2.2-1.3b/text-to-video default)
14
- * Vision/analysis: openai (gpt-4.1-mini), anthropic (Claude)
15
- *
16
- * Credential resolution order (same as email tools):
17
- * 1. Agent vault (per-agent credentials — e.g. service "fal" with key "key")
18
- * 2. Environment variables (global fallback)
19
- *
20
- * Environment variables (fallback):
21
- * FAL_KEY — fal.ai image/video generation
22
- * OPENAI_API_KEY — openai vision provider
23
- * ANTHROPIC_API_KEY — anthropic vision provider
14
+ * Provider names are not in the input schema anymore — they ride along
15
+ * with the model string. Every supported provider has a vault key
16
+ * convention (fal-ai, openai, anthropic) with an env-var fallback.
24
17
  */
25
18
  import { resolve, dirname, extname } from "node:path";
26
19
  import { Type } from "@sinclair/typebox";
20
+ import { parseModelString, DEFAULT_IMAGE_MODEL, DEFAULT_VIDEO_MODEL, DEFAULT_VISION_MODEL, } from "@polpo-ai/core";
27
21
  import { NodeFileSystem } from "./adapters/node-filesystem.js";
28
22
  import { resolveAllowedPaths, assertPathAllowed } from "./path-sandbox.js";
29
- // ─── Constants ───
23
+ import { resolveImageProvider, resolveVideoProvider, resolveVisionProvider, } from "./lib/provider-resolver.js";
30
24
  const MAX_IMAGE_SIZE = 20 * 1024 * 1024; // 20 MB
31
- const DEFAULT_TIMEOUT = 120_000; // 2 min for image generation
32
- const VIDEO_TIMEOUT = 300_000; // 5 min for video generation
33
- const FAL_QUEUE_POLL_INTERVAL = 3_000; // 3 sec polling for async queue
34
- // ─── Helpers ───
35
25
  function requireEnv(key) {
36
26
  const val = process.env[key];
37
27
  if (!val)
38
28
  throw new Error(`Missing environment variable: ${key}. Set it before using this tool.`);
39
29
  return val;
40
30
  }
41
- /** Resolve fal.ai API key: vault (service "fal-ai", key "key") > FAL_KEY env var. */
42
- function resolveFalKey(vault) {
43
- const fromVault = vault?.getKey("fal-ai", "key");
44
- if (fromVault)
45
- return fromVault;
46
- return requireEnv("FAL_KEY");
31
+ /** Resolve which model to actually use, in priority order. */
32
+ function resolveEffectiveModel(override, configured, fallback) {
33
+ return parseModelString(override ?? configured ?? fallback);
34
+ }
35
+ /** Vault-key resolution per provider. Throws with a clear message
36
+ * when neither vault nor env var has the credential. */
37
+ function resolveProviderKey(provider, vault) {
38
+ switch (provider) {
39
+ case "fal":
40
+ return vault?.getKey("fal-ai", "key") ?? requireEnv("FAL_KEY");
41
+ case "openai":
42
+ return vault?.getKey("openai", "key") ?? requireEnv("OPENAI_API_KEY");
43
+ case "anthropic":
44
+ return vault?.getKey("anthropic", "key") ?? requireEnv("ANTHROPIC_API_KEY");
45
+ default:
46
+ throw new Error(`Unknown provider '${provider}': no credential lookup defined`);
47
+ }
47
48
  }
48
49
  function imageMime(ext) {
49
50
  const map = {
@@ -58,76 +59,13 @@ function imageMime(ext) {
58
59
  };
59
60
  return map[ext.toLowerCase()] ?? "image/png";
60
61
  }
61
- /**
62
- * Submit a request to fal.ai queue and poll until completion.
63
- * Uses the queue endpoint (POST https://queue.fal.run/<model>) for reliability,
64
- * then polls the status endpoint until the result is ready.
65
- */
66
- async function falQueueRequest(modelId, input, apiKey, timeout, signal) {
67
- const controller = new AbortController();
68
- const timer = setTimeout(() => controller.abort(), timeout);
69
- if (signal)
70
- signal.addEventListener("abort", () => controller.abort(), { once: true });
71
- try {
72
- // Submit to queue
73
- const submitResp = await fetch(`https://queue.fal.run/${modelId}`, {
74
- method: "POST",
75
- headers: {
76
- Authorization: `Key ${apiKey}`,
77
- "Content-Type": "application/json",
78
- },
79
- body: JSON.stringify(input),
80
- signal: controller.signal,
81
- });
82
- if (!submitResp.ok) {
83
- const errText = await submitResp.text();
84
- throw new Error(`fal.ai queue submit ${submitResp.status}: ${errText}`);
85
- }
86
- const queueData = await submitResp.json();
87
- const requestId = queueData.request_id;
88
- const statusUrl = queueData.status_url ?? `https://queue.fal.run/${modelId}/requests/${requestId}/status`;
89
- const responseUrl = queueData.response_url ?? `https://queue.fal.run/${modelId}/requests/${requestId}`;
90
- // Poll for completion
91
- while (true) {
92
- await new Promise(r => setTimeout(r, FAL_QUEUE_POLL_INTERVAL));
93
- const statusResp = await fetch(statusUrl, {
94
- headers: { Authorization: `Key ${apiKey}` },
95
- signal: controller.signal,
96
- });
97
- if (!statusResp.ok) {
98
- throw new Error(`fal.ai status poll ${statusResp.status}`);
99
- }
100
- const status = await statusResp.json();
101
- if (status.status === "COMPLETED") {
102
- break;
103
- }
104
- if (status.status === "FAILED") {
105
- throw new Error(`fal.ai request failed: ${status.error ?? "unknown error"}`);
106
- }
107
- // IN_QUEUE or IN_PROGRESS — keep polling
108
- }
109
- // Fetch result
110
- const resultResp = await fetch(responseUrl, {
111
- headers: { Authorization: `Key ${apiKey}` },
112
- signal: controller.signal,
113
- });
114
- if (!resultResp.ok) {
115
- const errText = await resultResp.text();
116
- throw new Error(`fal.ai result fetch ${resultResp.status}: ${errText}`);
117
- }
118
- return await resultResp.json();
119
- }
120
- finally {
121
- clearTimeout(timer);
122
- }
123
- }
124
62
  // ─── Tool: image_generate ───
125
63
  const ImageGenerateSchema = Type.Object({
126
64
  prompt: Type.String({ description: "Text prompt describing the image to generate" }),
127
65
  path: Type.String({ description: "Output file path (e.g. 'output.png'). Format inferred from extension." }),
128
66
  model: Type.Optional(Type.String({
129
- description: "fal.ai model ID. Default: 'fal-ai/flux/dev'. " +
130
- "Other options: 'fal-ai/flux-pro/v1.1' (higher quality), 'fal-ai/flux/schnell' (faster).",
67
+ description: "Override the agent's image_model for this call. Format: '<provider>/<model>' " +
68
+ "(e.g. 'fal/fal-ai/flux/dev', 'fal/fal-ai/flux-pro/v1.1'). When omitted, uses the agent's configured image_model.",
131
69
  })),
132
70
  size: Type.Optional(Type.String({
133
71
  description: "Image size as 'WIDTHxHEIGHT' (e.g. '1024x1024', '1024x768', '768x1024'). Default: '1024x1024'.",
@@ -142,20 +80,21 @@ const ImageGenerateSchema = Type.Object({
142
80
  description: "Random seed for reproducible results. Omit for random.",
143
81
  })),
144
82
  });
145
- function createGenerateTool(cwd, sandbox, fs, vault) {
83
+ function createGenerateTool(cwd, sandbox, fs, configuredModel, vault) {
146
84
  return {
147
85
  name: "image_generate",
148
86
  label: "Generate Image",
149
- description: "Generate an image from a text prompt using fal.ai (FLUX models). " +
87
+ description: "Generate an image from a text prompt. " +
150
88
  "Output format inferred from file extension (png, jpg, webp). " +
151
- "Models: fal-ai/flux/dev (default, balanced), fal-ai/flux-pro/v1.1 (best quality), " +
152
- "fal-ai/flux/schnell (fastest). Credentials resolved from: agent vault > FAL_KEY env var.",
89
+ "Model is configured at agent level (image_model) — pass `model` here only to override per-call. " +
90
+ "Default: fal/fal-ai/flux/dev. Currently supports fal as image provider.",
153
91
  parameters: ImageGenerateSchema,
154
92
  async execute(_id, params, signal) {
155
93
  const filePath = resolve(cwd, params.path);
156
94
  assertPathAllowed(filePath, sandbox, "image_generate");
157
95
  try {
158
- return await generateFal(filePath, params, fs, vault, signal);
96
+ const parsed = resolveEffectiveModel(params.model, configuredModel, DEFAULT_IMAGE_MODEL);
97
+ return await generateImageWithSdk(filePath, parsed, params, fs, vault, signal);
159
98
  }
160
99
  catch (err) {
161
100
  return {
@@ -166,59 +105,51 @@ function createGenerateTool(cwd, sandbox, fs, vault) {
166
105
  },
167
106
  };
168
107
  }
169
- async function generateFal(filePath, params, fs, vault, signal) {
170
- const apiKey = resolveFalKey(vault);
171
- const model = params.model ?? "fal-ai/flux/dev";
172
- // Parse size into width/height
173
- let width = 1024, height = 1024;
174
- if (params.size) {
175
- const parts = params.size.split("x").map(Number);
176
- if (parts.length === 2 && parts[0] > 0 && parts[1] > 0) {
177
- width = parts[0];
178
- height = parts[1];
179
- }
180
- }
181
- const input = {
182
- prompt: params.prompt,
183
- image_size: { width, height },
184
- num_images: 1,
185
- };
108
+ async function generateImageWithSdk(filePath, parsed, params, fs, vault, signal) {
109
+ const { generateImage } = await import("ai");
110
+ const apiKey = resolveProviderKey(parsed.provider, vault);
111
+ const provider = await resolveImageProvider(parsed.provider, apiKey);
112
+ // fal-specific knobs go through providerOptions; the SDK passes them
113
+ // through to the model's input untouched.
114
+ const falOptions = {};
186
115
  if (params.num_inference_steps != null)
187
- input.num_inference_steps = params.num_inference_steps;
116
+ falOptions.num_inference_steps = params.num_inference_steps;
188
117
  if (params.guidance_scale != null)
189
- input.guidance_scale = params.guidance_scale;
190
- if (params.seed != null)
191
- input.seed = params.seed;
192
- const result = await falQueueRequest(model, input, apiKey, DEFAULT_TIMEOUT, signal);
193
- // fal.ai FLUX response: { images: [{ url, width, height, content_type }], ... }
194
- const images = result.images;
195
- if (!images || images.length === 0) {
196
- throw new Error("No images in fal.ai response");
118
+ falOptions.guidance_scale = params.guidance_scale;
119
+ const result = await generateImage({
120
+ model: provider.image(parsed.model),
121
+ prompt: params.prompt,
122
+ size: params.size,
123
+ seed: params.seed,
124
+ providerOptions: parsed.provider === "fal" && Object.keys(falOptions).length
125
+ ? { fal: falOptions }
126
+ : undefined,
127
+ abortSignal: signal,
128
+ });
129
+ const bytes = result.image.uint8Array;
130
+ if (!bytes || bytes.byteLength === 0) {
131
+ throw new Error("No image bytes in SDK response");
197
132
  }
198
- const imageUrl = images[0].url;
199
- const imgResp = await fetch(imageUrl);
200
- if (!imgResp.ok)
201
- throw new Error(`Failed to download generated image: ${imgResp.status}`);
202
- const buffer = Buffer.from(await imgResp.arrayBuffer());
203
133
  if (!fs.writeFileBuffer) {
204
134
  throw new Error("FileSystem implementation does not support writeFileBuffer (required for binary writes).");
205
135
  }
206
136
  await fs.mkdir(dirname(filePath));
207
- await fs.writeFileBuffer(filePath, new Uint8Array(buffer));
137
+ await fs.writeFileBuffer(filePath, bytes);
208
138
  const info = [
209
139
  `Image saved: ${filePath}`,
210
- `Size: ${(buffer.byteLength / 1024).toFixed(1)} KB`,
211
- `Model: ${model}`,
212
- `Dimensions: ${images[0].width}x${images[0].height}`,
140
+ `Size: ${(bytes.byteLength / 1024).toFixed(1)} KB`,
141
+ `Model: ${parsed.provider}/${parsed.model}`,
213
142
  ];
143
+ if (params.size)
144
+ info.push(`Dimensions: ${params.size}`);
214
145
  return {
215
146
  content: [{ type: "text", text: info.join("\n") }],
216
147
  details: {
217
- provider: "fal",
218
- model,
219
- size: `${images[0].width}x${images[0].height}`,
148
+ provider: parsed.provider,
149
+ model: parsed.model,
150
+ size: params.size,
220
151
  path: filePath,
221
- bytes: buffer.byteLength,
152
+ bytes: bytes.byteLength,
222
153
  },
223
154
  };
224
155
  }
@@ -227,40 +158,39 @@ const VideoGenerateSchema = Type.Object({
227
158
  prompt: Type.String({ description: "Text prompt describing the video to generate" }),
228
159
  path: Type.String({ description: "Output file path (e.g. 'output.mp4')." }),
229
160
  model: Type.Optional(Type.String({
230
- description: "fal.ai video model ID. Default: 'fal-ai/wan/v2.2-1.3b/text-to-video'. " +
231
- "Other options: 'fal-ai/wan/v2.2-a14b/text-to-video' (higher quality, slower).",
161
+ description: "Override the agent's video_model for this call. Format: '<provider>/<model>' " +
162
+ "(e.g. 'fal/luma-ray-2-flash', 'fal/luma-ray-2', 'fal/hunyuan-video'). When omitted, uses the agent's configured video_model.",
232
163
  })),
233
- num_frames: Type.Optional(Type.Number({
234
- description: "Number of frames to generate. Default: 81 (~5 seconds at 16fps).",
164
+ aspect_ratio: Type.Optional(Type.String({
165
+ description: "Aspect ratio as 'WIDTH:HEIGHT' (e.g. '16:9', '9:16', '1:1').",
235
166
  })),
236
167
  resolution: Type.Optional(Type.String({
237
- description: "Video resolution as 'WIDTHxHEIGHT' (e.g. '854x480', '1280x720'). Default: '854x480' (480p).",
168
+ description: "Resolution as 'WIDTHxHEIGHT' (e.g. '1280x720'). Provider-dependent.",
238
169
  })),
239
- num_inference_steps: Type.Optional(Type.Number({
240
- description: "Number of inference steps (higher = better quality, slower). Default: 30.",
170
+ duration: Type.Optional(Type.Number({
171
+ description: "Video duration in seconds. Provider-dependent typical range 4-10.",
241
172
  })),
242
- guidance_scale: Type.Optional(Type.Number({
243
- description: "Guidance scale — how closely to follow the prompt. Default: 5.0.",
173
+ fps: Type.Optional(Type.Number({
174
+ description: "Frames per second. Provider-dependent.",
244
175
  })),
245
176
  seed: Type.Optional(Type.Number({
246
177
  description: "Random seed for reproducible results. Omit for random.",
247
178
  })),
248
179
  });
249
- function createVideoGenerateTool(cwd, sandbox, fs, vault) {
180
+ function createVideoGenerateTool(cwd, sandbox, fs, configuredModel, vault) {
250
181
  return {
251
182
  name: "video_generate",
252
183
  label: "Generate Video",
253
- description: "Generate a video from a text prompt using fal.ai (Wan 2.2 models). " +
254
- "Output saved as MP4. Models: fal-ai/wan/v2.2-1.3b/text-to-video (default, faster), " +
255
- "fal-ai/wan/v2.2-a14b/text-to-video (best quality). " +
256
- "Video generation takes 1-5 minutes depending on model and resolution. " +
257
- "Credentials resolved from: agent vault > FAL_KEY env var.",
184
+ description: "Generate a video from a text prompt. " +
185
+ "Output saved as MP4. Model is configured at agent level (video_model) — pass `model` here only to override " +
186
+ "per-call. Default: fal/luma-ray-2-flash. Currently supports fal as video provider.",
258
187
  parameters: VideoGenerateSchema,
259
188
  async execute(_id, params, signal) {
260
189
  const filePath = resolve(cwd, params.path);
261
190
  assertPathAllowed(filePath, sandbox, "video_generate");
262
191
  try {
263
- return await generateVideo(filePath, params, fs, vault, signal);
192
+ const parsed = resolveEffectiveModel(params.model, configuredModel, DEFAULT_VIDEO_MODEL);
193
+ return await generateVideoWithSdk(filePath, parsed, params, fs, vault, signal);
264
194
  }
265
195
  catch (err) {
266
196
  return {
@@ -271,55 +201,42 @@ function createVideoGenerateTool(cwd, sandbox, fs, vault) {
271
201
  },
272
202
  };
273
203
  }
274
- async function generateVideo(filePath, params, fs, vault, signal) {
275
- const apiKey = resolveFalKey(vault);
276
- const model = params.model ?? "fal-ai/wan/v2.2-1.3b/text-to-video";
277
- const input = {
204
+ async function generateVideoWithSdk(filePath, parsed, params, fs, vault, signal) {
205
+ const { experimental_generateVideo } = await import("ai");
206
+ const apiKey = resolveProviderKey(parsed.provider, vault);
207
+ const provider = await resolveVideoProvider(parsed.provider, apiKey);
208
+ const result = await experimental_generateVideo({
209
+ model: provider.video(parsed.model),
278
210
  prompt: params.prompt,
279
- };
280
- if (params.num_frames != null)
281
- input.num_frames = params.num_frames;
282
- if (params.num_inference_steps != null)
283
- input.num_inference_steps = params.num_inference_steps;
284
- if (params.guidance_scale != null)
285
- input.guidance_scale = params.guidance_scale;
286
- if (params.seed != null)
287
- input.seed = params.seed;
288
- // Parse resolution
289
- if (params.resolution) {
290
- const parts = params.resolution.split("x").map(Number);
291
- if (parts.length === 2 && parts[0] > 0 && parts[1] > 0) {
292
- input.resolution = { width: parts[0], height: parts[1] };
293
- }
294
- }
295
- const result = await falQueueRequest(model, input, apiKey, VIDEO_TIMEOUT, signal);
296
- // fal.ai video response: { video: { url, content_type, file_name, file_size } }
297
- const video = result.video;
298
- if (!video?.url) {
299
- throw new Error("No video in fal.ai response");
211
+ aspectRatio: params.aspect_ratio,
212
+ resolution: params.resolution,
213
+ duration: params.duration,
214
+ fps: params.fps,
215
+ seed: params.seed,
216
+ abortSignal: signal,
217
+ });
218
+ const bytes = result.video?.uint8Array;
219
+ if (!bytes || bytes.byteLength === 0) {
220
+ throw new Error("No video bytes in SDK response");
300
221
  }
301
- const videoResp = await fetch(video.url);
302
- if (!videoResp.ok)
303
- throw new Error(`Failed to download generated video: ${videoResp.status}`);
304
- const buffer = Buffer.from(await videoResp.arrayBuffer());
305
222
  if (!fs.writeFileBuffer) {
306
223
  throw new Error("FileSystem implementation does not support writeFileBuffer (required for binary writes).");
307
224
  }
308
225
  await fs.mkdir(dirname(filePath));
309
- await fs.writeFileBuffer(filePath, new Uint8Array(buffer));
310
- const sizeMB = (buffer.byteLength / 1024 / 1024).toFixed(2);
226
+ await fs.writeFileBuffer(filePath, bytes);
227
+ const sizeMB = (bytes.byteLength / 1024 / 1024).toFixed(2);
311
228
  const info = [
312
229
  `Video saved: ${filePath}`,
313
230
  `Size: ${sizeMB} MB`,
314
- `Model: ${model}`,
231
+ `Model: ${parsed.provider}/${parsed.model}`,
315
232
  ];
316
233
  return {
317
234
  content: [{ type: "text", text: info.join("\n") }],
318
235
  details: {
319
- provider: "fal",
320
- model,
236
+ provider: parsed.provider,
237
+ model: parsed.model,
321
238
  path: filePath,
322
- bytes: buffer.byteLength,
239
+ bytes: bytes.byteLength,
323
240
  },
324
241
  };
325
242
  }
@@ -327,21 +244,20 @@ async function generateVideo(filePath, params, fs, vault, signal) {
327
244
  const ImageAnalyzeSchema = Type.Object({
328
245
  path: Type.String({ description: "Path to the image file to analyze" }),
329
246
  prompt: Type.Optional(Type.String({ description: "Question or instruction for the vision model (default: 'Describe this image in detail')" })),
330
- provider: Type.Optional(Type.Union([
331
- Type.Literal("openai"),
332
- Type.Literal("anthropic"),
333
- ], { description: "Vision provider (default: openai)" })),
334
- model: Type.Optional(Type.String({ description: "Model name. OpenAI: 'gpt-4.1-mini' (default). Anthropic: 'claude-sonnet-4-20250514' (default)." })),
247
+ model: Type.Optional(Type.String({
248
+ description: "Override the agent's vision_model for this call. Format: '<provider>/<model>' " +
249
+ "(e.g. 'openai/gpt-4o-mini', 'anthropic/claude-sonnet-4-20250514'). When omitted, uses the agent's configured vision_model.",
250
+ })),
335
251
  max_tokens: Type.Optional(Type.Number({ description: "Max tokens in response (default: 1024)" })),
336
252
  });
337
- function createAnalyzeTool(cwd, sandbox, fs, vault) {
253
+ function createAnalyzeTool(cwd, sandbox, fs, configuredModel, vault) {
338
254
  return {
339
255
  name: "image_analyze",
340
256
  label: "Analyze Image",
341
257
  description: "Analyze an image using AI vision models. Can describe contents, extract text (OCR), " +
342
258
  "answer questions about the image, identify objects, read charts, etc. " +
343
- "Providers: openai (GPT-4.1-mini, default), anthropic (Claude). " +
344
- "Credentials resolved from: agent vault > OPENAI_API_KEY or ANTHROPIC_API_KEY env var.",
259
+ "Model is configured at agent level (vision_model) — pass `model` here only to override per-call. " +
260
+ "Default: openai/gpt-4o-mini. Supported providers: openai, anthropic.",
345
261
  parameters: ImageAnalyzeSchema,
346
262
  async execute(_id, params, signal) {
347
263
  const filePath = resolve(cwd, params.path);
@@ -369,143 +285,47 @@ function createAnalyzeTool(cwd, sandbox, fs, vault) {
369
285
  details: { error: "file_too_large", size: fileBuffer.byteLength },
370
286
  };
371
287
  }
372
- const provider = params.provider ?? "openai";
373
288
  try {
374
- if (provider === "openai") {
375
- return await analyzeOpenAI(filePath, fileBuffer, params, vault, signal);
376
- }
377
- else {
378
- return await analyzeAnthropic(filePath, fileBuffer, params, vault, signal);
379
- }
289
+ const parsed = resolveEffectiveModel(params.model, configuredModel, DEFAULT_VISION_MODEL);
290
+ return await analyzeWithSdk(filePath, fileBuffer, parsed, params, vault, signal);
380
291
  }
381
292
  catch (err) {
382
293
  return {
383
- content: [{ type: "text", text: `Image analysis error (${provider}): ${err.message}` }],
384
- details: { provider, error: err.message },
294
+ content: [{ type: "text", text: `Image analysis error: ${err.message}` }],
295
+ details: { error: err.message },
385
296
  };
386
297
  }
387
298
  },
388
299
  };
389
300
  }
390
- async function analyzeOpenAI(filePath, fileBuffer, params, vault, signal) {
391
- const apiKey = vault?.getKey("openai", "key") ?? requireEnv("OPENAI_API_KEY");
392
- const model = params.model ?? "gpt-4.1-mini";
393
- const prompt = params.prompt ?? "Describe this image in detail.";
394
- const maxTokens = params.max_tokens ?? 1024;
395
- const ext = extname(filePath).toLowerCase();
396
- const mime = imageMime(ext);
397
- const base64 = fileBuffer.toString("base64");
398
- const dataUrl = `data:${mime};base64,${base64}`;
399
- const controller = new AbortController();
400
- const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
401
- if (signal)
402
- signal.addEventListener("abort", () => controller.abort(), { once: true });
403
- const response = await fetch("https://api.openai.com/v1/chat/completions", {
404
- method: "POST",
405
- headers: {
406
- Authorization: `Bearer ${apiKey}`,
407
- "Content-Type": "application/json",
408
- },
409
- body: JSON.stringify({
410
- model,
411
- max_tokens: maxTokens,
412
- messages: [
413
- {
414
- role: "user",
415
- content: [
416
- { type: "text", text: prompt },
417
- { type: "image_url", image_url: { url: dataUrl, detail: "auto" } },
418
- ],
419
- },
420
- ],
421
- }),
422
- signal: controller.signal,
423
- });
424
- clearTimeout(timer);
425
- if (!response.ok) {
426
- const errText = await response.text();
427
- throw new Error(`OpenAI Vision API ${response.status}: ${errText}`);
428
- }
429
- const data = await response.json();
430
- const analysis = data.choices[0]?.message?.content ?? "";
431
- const usage = data.usage;
432
- return {
433
- content: [{ type: "text", text: analysis }],
434
- details: {
435
- provider: "openai",
436
- model,
437
- path: filePath,
438
- imageSize: fileBuffer.byteLength,
439
- tokens: usage?.total_tokens,
440
- promptTokens: usage?.prompt_tokens,
441
- completionTokens: usage?.completion_tokens,
442
- },
443
- };
444
- }
445
- async function analyzeAnthropic(filePath, fileBuffer, params, vault, signal) {
446
- const apiKey = vault?.getKey("anthropic", "key") ?? requireEnv("ANTHROPIC_API_KEY");
447
- const model = params.model ?? "claude-sonnet-4-20250514";
448
- const prompt = params.prompt ?? "Describe this image in detail.";
449
- const maxTokens = params.max_tokens ?? 1024;
301
+ async function analyzeWithSdk(filePath, fileBuffer, parsed, params, vault, signal) {
302
+ const { generateText } = await import("ai");
303
+ const apiKey = resolveProviderKey(parsed.provider, vault);
304
+ const provider = await resolveVisionProvider(parsed.provider, apiKey);
450
305
  const ext = extname(filePath).toLowerCase();
451
- const mime = imageMime(ext);
452
- const base64 = fileBuffer.toString("base64");
453
- // Anthropic only supports specific media types
454
- const supportedTypes = ["image/jpeg", "image/png", "image/gif", "image/webp"];
455
- const mediaType = supportedTypes.includes(mime) ? mime : "image/png";
456
- const controller = new AbortController();
457
- const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
458
- if (signal)
459
- signal.addEventListener("abort", () => controller.abort(), { once: true });
460
- const response = await fetch("https://api.anthropic.com/v1/messages", {
461
- method: "POST",
462
- headers: {
463
- "x-api-key": apiKey,
464
- "anthropic-version": "2023-06-01",
465
- "Content-Type": "application/json",
466
- },
467
- body: JSON.stringify({
468
- model,
469
- max_tokens: maxTokens,
470
- messages: [
471
- {
472
- role: "user",
473
- content: [
474
- {
475
- type: "image",
476
- source: {
477
- type: "base64",
478
- media_type: mediaType,
479
- data: base64,
480
- },
481
- },
482
- { type: "text", text: prompt },
483
- ],
484
- },
485
- ],
486
- }),
487
- signal: controller.signal,
306
+ const mediaType = imageMime(ext);
307
+ const result = await generateText({
308
+ model: provider(parsed.model),
309
+ maxOutputTokens: params.max_tokens ?? 1024,
310
+ messages: [{
311
+ role: "user",
312
+ content: [
313
+ { type: "text", text: params.prompt ?? "Describe this image in detail." },
314
+ { type: "image", image: new Uint8Array(fileBuffer), mediaType },
315
+ ],
316
+ }],
317
+ abortSignal: signal,
488
318
  });
489
- clearTimeout(timer);
490
- if (!response.ok) {
491
- const errText = await response.text();
492
- throw new Error(`Anthropic Vision API ${response.status}: ${errText}`);
493
- }
494
- const data = await response.json();
495
- const analysis = data.content
496
- .filter(b => b.type === "text" && b.text)
497
- .map(b => b.text)
498
- .join("\n");
499
- const usage = data.usage;
500
319
  return {
501
- content: [{ type: "text", text: analysis }],
320
+ content: [{ type: "text", text: result.text }],
502
321
  details: {
503
- provider: "anthropic",
504
- model,
322
+ provider: parsed.provider,
323
+ model: parsed.model,
505
324
  path: filePath,
506
325
  imageSize: fileBuffer.byteLength,
507
- inputTokens: usage?.input_tokens,
508
- outputTokens: usage?.output_tokens,
326
+ tokens: result.usage?.totalTokens,
327
+ promptTokens: result.usage?.inputTokens,
328
+ completionTokens: result.usage?.outputTokens,
509
329
  },
510
330
  };
511
331
  }
@@ -513,23 +333,22 @@ export const ALL_IMAGE_TOOL_NAMES = ["image_generate", "image_analyze", "video_g
513
333
  /**
514
334
  * Create image & video tools for generation, vision analysis, and video creation.
515
335
  *
516
- * @param cwd - Working directory for resolving file paths
517
- * @param allowedPaths - Sandbox paths for file validation
518
- * @param allowedTools - Optional filter — only include tools whose names appear here.
519
- * Supports wildcards expanded upstream (e.g. "image_*", "video_*").
520
- * @param vault - Resolved vault for credential resolution (fal-ai, openai, anthropic).
521
- * Credentials are resolved as: vault > environment variable.
336
+ * The 6-arg positional signature is preserved for back-compat. Prefer the
337
+ * options-object form (`{ cwd, vault, imageModel, ... }`) for new callers.
522
338
  */
523
339
  export function createImageTools(cwd, allowedPaths, allowedTools, vault, fs) {
524
- const sandbox = resolveAllowedPaths(cwd, allowedPaths);
525
- const _fs = fs ?? new NodeFileSystem();
340
+ const opts = typeof cwd === "string"
341
+ ? { cwd, allowedPaths, allowedTools, vault, fs }
342
+ : cwd;
343
+ const sandbox = resolveAllowedPaths(opts.cwd, opts.allowedPaths);
344
+ const _fs = opts.fs ?? new NodeFileSystem();
526
345
  const factories = {
527
- image_generate: () => createGenerateTool(cwd, sandbox, _fs, vault),
528
- image_analyze: () => createAnalyzeTool(cwd, sandbox, _fs, vault),
529
- video_generate: () => createVideoGenerateTool(cwd, sandbox, _fs, vault),
346
+ image_generate: () => createGenerateTool(opts.cwd, sandbox, _fs, opts.imageModel, opts.vault),
347
+ image_analyze: () => createAnalyzeTool(opts.cwd, sandbox, _fs, opts.visionModel, opts.vault),
348
+ video_generate: () => createVideoGenerateTool(opts.cwd, sandbox, _fs, opts.videoModel, opts.vault),
530
349
  };
531
- const names = allowedTools
532
- ? ALL_IMAGE_TOOL_NAMES.filter(n => allowedTools.some(a => a.toLowerCase() === n))
350
+ const names = opts.allowedTools
351
+ ? ALL_IMAGE_TOOL_NAMES.filter(n => opts.allowedTools.some(a => a.toLowerCase() === n))
533
352
  : ALL_IMAGE_TOOL_NAMES;
534
353
  return names.map(n => factories[n]());
535
354
  }