@blockrun/franklin 3.21.8 → 3.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,14 +13,49 @@ import { recordUsage } from '../stats/tracker.js';
13
13
  import { findModel, estimateCostUsd } from '../gateway-models.js';
14
14
  import { logger } from '../logger.js';
15
15
  /**
16
- * Models that accept a reference image via /v1/images/image2image. Currently
17
- * limited to OpenAI's edit endpoint — Gemini Nano Banana Pro and Grok Imagine
18
- * Image Pro need gateway-side support before they can be wired in here.
16
+ * Models that accept a reference image via /v1/images/image2image. Mirrors the
17
+ * gateway's EDIT_SUPPORTED_MODELS (src/app/api/v1/images/image2image/route.ts):
18
+ * both OpenAI gpt-image-* and Google Nano Banana support image-to-image edits.
19
19
  */
20
20
  export const EDIT_SUPPORTED_MODELS = new Set([
21
21
  'openai/gpt-image-1',
22
22
  'openai/gpt-image-2',
23
+ 'google/nano-banana',
24
+ 'google/nano-banana-pro',
23
25
  ]);
26
+ /**
27
+ * Mask-based inpainting is OpenAI-only. Gemini (Nano Banana) does prompt-based
28
+ * edits with no mask concept. Mirrors the gateway's MASK_SUPPORTED_MODELS.
29
+ */
30
+ export const MASK_SUPPORTED_MODELS = new Set([
31
+ 'openai/gpt-image-1',
32
+ 'openai/gpt-image-2',
33
+ ]);
34
+ /**
35
+ * Per-provider multi-image (fusion) cap. Mirrors the gateway's
36
+ * MAX_IMAGES_BY_PREFIX: OpenAI fuses up to 4 anchors, Gemini up to 3.
37
+ */
38
+ const MAX_IMAGES_BY_PREFIX = {
39
+ 'openai/': 4,
40
+ 'google/': 3,
41
+ };
42
+ /**
43
+ * Output-image count ceiling. The gateway has no hard max but price scales with
44
+ * n, so cap client-side to keep a typo from draining the wallet.
45
+ */
46
+ export const MAX_OUTPUT_IMAGES = 4;
47
+ /**
48
+ * Valid sizes per known image model, mirroring the gateway's IMAGE_MODELS.sizes
49
+ * (src/lib/models.ts). Used to fail cheaply before paying when a caller or the
50
+ * media router picks a size the model rejects. Models absent from this table
51
+ * (custom / future gateway models) skip validation and let the gateway decide.
52
+ */
53
+ export const IMAGE_MODEL_SIZES = {
54
+ 'openai/gpt-image-1': ['1024x1024', '1536x1024', '1024x1536'],
55
+ 'openai/gpt-image-2': ['1024x1024', '1536x1024', '1024x1536'],
56
+ 'google/nano-banana': ['1024x1024'],
57
+ 'google/nano-banana-pro': ['1024x1024', '2048x2048', '4096x4096'],
58
+ };
24
59
  export const REFERENCE_IMAGE_MAX_BYTES = 4_000_000;
25
60
  /**
26
61
  * Normalize a reference image into a base64 data URI for the gateway. The
@@ -77,21 +112,33 @@ export async function resolveReferenceImage(input, workingDir) {
77
112
  function buildExecute(deps) {
78
113
  return async function execute(input, ctx) {
79
114
  const rawInput = input;
80
- const { output_path, size, model, contentId, image_url } = rawInput;
115
+ const { output_path, size, model, contentId, image_url, mask } = rawInput;
81
116
  if (!rawInput.prompt) {
82
117
  return { output: 'Error: prompt is required', isError: true };
83
118
  }
84
- // Resolve the reference image (if any) before any paid call so we fail
85
- // cheaply on bad paths / oversize attachments. Holds the resolved data URI
86
- // / http URL that gets posted to /v1/images/image2image.
87
- let referenceImage;
88
- if (image_url) {
89
- try {
90
- referenceImage = await resolveReferenceImage(image_url, ctx.workingDir);
91
- }
92
- catch (err) {
93
- return { output: `Error: ${err.message}`, isError: true };
94
- }
119
+ // Collect reference images: image_url (single, back-compat) + images[]
120
+ // (fusion), in that order. Edit mode is active whenever at least one
121
+ // reference image is present — the call then routes to image2image.
122
+ const referenceInputs = [
123
+ ...(image_url ? [image_url] : []),
124
+ ...(Array.isArray(rawInput.images) ? rawInput.images.filter(Boolean) : []),
125
+ ];
126
+ const editMode = referenceInputs.length > 0;
127
+ // Output count: 1–4. Reject out-of-range up front so a typo can't blow the
128
+ // wallet (price scales with n) or get silently clamped.
129
+ const n = rawInput.n ?? 1;
130
+ if (!Number.isInteger(n) || n < 1 || n > MAX_OUTPUT_IMAGES) {
131
+ return {
132
+ output: `Error: n must be an integer between 1 and ${MAX_OUTPUT_IMAGES} (got ${rawInput.n}).`,
133
+ isError: true,
134
+ };
135
+ }
136
+ // A mask only makes sense as an inpainting directive on a source image.
137
+ if (mask && !editMode) {
138
+ return {
139
+ output: 'Error: mask requires a source image. Pass image_url (or images) alongside mask.',
140
+ isError: true,
141
+ };
95
142
  }
96
143
  // One-shot refinement opt-out: leading `///` tells Franklin "don't
97
144
  // refine this prompt, I wrote it the way I want it." Strip the prefix
@@ -111,30 +158,60 @@ function buildExecute(deps) {
111
158
  // Reference-image mode forces an edit-capable model. If the caller named
112
159
  // an unsupported one, fail loudly so we don't silently downgrade their
113
160
  // request to text-only generation.
114
- if (referenceImage && model && !EDIT_SUPPORTED_MODELS.has(model)) {
161
+ if (editMode && model && !EDIT_SUPPORTED_MODELS.has(model)) {
115
162
  return {
116
163
  output: `Error: model ${model} does not support reference images. ` +
117
164
  `Use one of: ${[...EDIT_SUPPORTED_MODELS].join(', ')}.`,
118
165
  isError: true,
119
166
  };
120
167
  }
121
- let imageModel = model || (referenceImage ? 'openai/gpt-image-2' : 'openai/gpt-image-1');
168
+ let imageModel = model || (editMode ? 'openai/gpt-image-2' : 'openai/gpt-image-1');
122
169
  let imageSize = size || '1024x1024';
123
170
  let chosenPrompt = prompt;
171
+ // ── Edit-mode constraint checks (mirror the gateway, fail before paying) ──
172
+ if (editMode) {
173
+ // Mask inpainting is OpenAI-only.
174
+ if (mask && !MASK_SUPPORTED_MODELS.has(imageModel)) {
175
+ return {
176
+ output: `Error: model ${imageModel} does not support mask-based editing. ` +
177
+ `Mask inpainting is available on: ${[...MASK_SUPPORTED_MODELS].join(', ')}. ` +
178
+ `Omit mask to edit with ${imageModel}.`,
179
+ isError: true,
180
+ };
181
+ }
182
+ // A mask targets a single region — it has no meaning across multiple
183
+ // source images.
184
+ if (mask && referenceInputs.length > 1) {
185
+ return {
186
+ output: 'Error: mask cannot be combined with multiple source images. ' +
187
+ 'Send a single image with a mask, or multiple images without a mask.',
188
+ isError: true,
189
+ };
190
+ }
191
+ // Per-provider fusion cap.
192
+ const maxImages = MAX_IMAGES_BY_PREFIX[`${imageModel.split('/')[0]}/`] ?? 1;
193
+ if (referenceInputs.length > maxImages) {
194
+ return {
195
+ output: `Error: model ${imageModel} accepts at most ${maxImages} source ` +
196
+ `image${maxImages > 1 ? 's' : ''} per edit (got ${referenceInputs.length}).`,
197
+ isError: true,
198
+ };
199
+ }
200
+ }
124
201
  // Skip the proposal flow when a reference image is set: the media router
125
202
  // doesn't know which models support image-to-image, so its suggestions
126
203
  // would frequently be unusable (text-only models). Default to gpt-image-2
127
204
  // for now; a future router upgrade can pick between the four edit-capable
128
205
  // models based on the prompt.
129
206
  const autoApprove = process.env.FRANKLIN_MEDIA_AUTO_APPROVE_ALL === '1';
130
- if (!model && !autoApprove && ctx.onAskUser && !referenceImage) {
207
+ if (!model && !autoApprove && ctx.onAskUser && !editMode) {
131
208
  try {
132
209
  const chain = loadChain();
133
210
  const client = new ModelClient({ apiUrl: API_URLS[chain], chain });
134
211
  const proposal = await analyzeMediaRequest({
135
212
  kind: 'image',
136
213
  prompt,
137
- quantity: 1,
214
+ quantity: n,
138
215
  client,
139
216
  signal: ctx.abortSignal,
140
217
  skipRefine,
@@ -178,8 +255,20 @@ function buildExecute(deps) {
178
255
  if (imageModel === 'openai/gpt-image-2' && imageSize !== '1024x1024') {
179
256
  imageSize = '1024x1024';
180
257
  }
258
+ // Validate the size against the model's supported set before paying. The
259
+ // gateway rejects unsupported sizes with a 400, so catching it here saves
260
+ // a wasted round-trip (and historically a wasted x402 retry). Models not
261
+ // in the table (custom / future gateway models) skip this check.
262
+ const supportedSizes = IMAGE_MODEL_SIZES[imageModel];
263
+ if (supportedSizes && !supportedSizes.includes(imageSize)) {
264
+ return {
265
+ output: `Error: invalid size ${imageSize} for ${imageModel}. ` +
266
+ `Supported sizes: ${supportedSizes.join(', ')}.`,
267
+ isError: true,
268
+ };
269
+ }
181
270
  if (contentId && deps.library) {
182
- const decision = checkImageBudget(deps.library, contentId, imageModel, imageSize);
271
+ const decision = checkImageBudget(deps.library, contentId, imageModel, imageSize, n);
183
272
  if (!decision.ok) {
184
273
  // Normal text output, not isError — the agent should adapt (smaller
185
274
  // size, different model, raise budget) rather than trigger retry.
@@ -191,29 +280,49 @@ function buildExecute(deps) {
191
280
  };
192
281
  }
193
282
  }
283
+ // Resolve all reference images + the mask into base64 data URIs now, right
284
+ // before the paid call. Done after the cheap validations so bad paths /
285
+ // oversize attachments / unsupported combinations fail without any network
286
+ // or filesystem cost beyond what's necessary.
287
+ let referenceImages = [];
288
+ let resolvedMask;
289
+ if (editMode) {
290
+ try {
291
+ referenceImages = await Promise.all(referenceInputs.map(r => resolveReferenceImage(r, ctx.workingDir)));
292
+ if (mask)
293
+ resolvedMask = await resolveReferenceImage(mask, ctx.workingDir);
294
+ }
295
+ catch (err) {
296
+ return { output: `Error: ${err.message}`, isError: true };
297
+ }
298
+ }
194
299
  const chain = loadChain();
195
300
  const apiUrl = API_URLS[chain];
196
301
  // Reference-image mode hits the dedicated /v1/images/image2image endpoint;
197
302
  // otherwise stay on text-to-image generations.
198
- const endpoint = referenceImage
303
+ const endpoint = editMode
199
304
  ? `${apiUrl}/v1/images/image2image`
200
305
  : `${apiUrl}/v1/images/generations`;
201
306
  // Default output path
202
307
  const outPath = output_path
203
308
  ? (path.isAbsolute(output_path) ? output_path : path.resolve(ctx.workingDir, output_path))
204
309
  : path.resolve(ctx.workingDir, `generated-${Date.now()}.png`);
205
- const body = JSON.stringify(referenceImage
310
+ const body = JSON.stringify(editMode
206
311
  ? {
207
312
  model: imageModel,
208
313
  prompt: chosenPrompt,
209
- image: referenceImage,
314
+ // Gateway accepts a string (single) or array (fusion) for `image`.
315
+ // Send a string for the single-image case to keep that path byte-
316
+ // identical to before.
317
+ image: referenceImages.length === 1 ? referenceImages[0] : referenceImages,
318
+ ...(resolvedMask ? { mask: resolvedMask } : {}),
210
319
  size: imageSize,
211
- n: 1,
320
+ n,
212
321
  }
213
322
  : {
214
323
  model: imageModel,
215
324
  prompt: chosenPrompt,
216
- n: 1,
325
+ n,
217
326
  size: imageSize,
218
327
  response_format: 'b64_json',
219
328
  });
@@ -228,7 +337,7 @@ function buildExecute(deps) {
228
337
  // both x402 retry attempts plus the actual generation, which made
229
338
  // image-to-image effectively always time out. Image-to-image gets 3
230
339
  // minutes; text-to-image keeps the original 60s.
231
- const timeoutMs = referenceImage ? 180_000 : 60_000;
340
+ const timeoutMs = editMode ? 180_000 : 60_000;
232
341
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
233
342
  // Wall-clock start of the paid call, including 402 retry + (optional)
234
343
  // 202 polling. Used by recordUsage below so franklin-stats.json
@@ -303,8 +412,8 @@ function buildExecute(deps) {
303
412
  }
304
413
  result = outcome.body;
305
414
  }
306
- const imageData = result.data?.[0];
307
- if (!imageData) {
415
+ const items = (result.data ?? []).filter((d) => !!d && (!!d.b64_json || !!d.url));
416
+ if (items.length === 0) {
308
417
  // Some gateways return 200 with an `error` / `message` field for
309
418
  // moderation, quota, or upstream-model failures instead of using
310
419
  // HTTP error codes. Without surfacing those, the agent sees only
@@ -329,39 +438,22 @@ function buildExecute(deps) {
329
438
  const detail = bits.length > 0 ? ` — ${bits.join('; ')}` : '';
330
439
  return { output: `No image data returned from API${detail}`, isError: true };
331
440
  }
332
- // Save image. The /v1/images/image2image endpoint returns Gemini results
333
- // as a data URI in `url`, so decode those locally instead of going through
334
- // fetch — saves a network round-trip and avoids data:-URI fetch quirks.
335
- if (imageData.b64_json) {
336
- const buffer = Buffer.from(imageData.b64_json, 'base64');
337
- fs.mkdirSync(path.dirname(outPath), { recursive: true });
338
- fs.writeFileSync(outPath, buffer);
339
- }
340
- else if (imageData.url && imageData.url.startsWith('data:')) {
341
- const match = imageData.url.match(/^data:[^;]+;base64,(.+)$/);
342
- if (!match) {
343
- return { output: 'Malformed data URI in response', isError: true };
441
+ // Output paths: one image keeps the requested path verbatim; multiple
442
+ // images get a -1/-2/... suffix before the extension so nothing clobbers.
443
+ const targetPaths = items.length === 1 ? [outPath] : items.map((_, i) => withIndexSuffix(outPath, i + 1));
444
+ // Save each returned image. The /v1/images/image2image endpoint returns
445
+ // Gemini results as a data URI in `url`, so decode those locally instead
446
+ // of going through fetch — saves a round-trip and avoids data:-URI quirks.
447
+ const savedPaths = [];
448
+ for (let i = 0; i < items.length; i++) {
449
+ try {
450
+ await saveImageDataToFile(items[i], targetPaths[i]);
344
451
  }
345
- const buffer = Buffer.from(match[1], 'base64');
346
- fs.mkdirSync(path.dirname(outPath), { recursive: true });
347
- fs.writeFileSync(outPath, buffer);
348
- }
349
- else if (imageData.url) {
350
- // Download from URL (with 30s timeout)
351
- const dlCtrl = new AbortController();
352
- const dlTimeout = setTimeout(() => dlCtrl.abort(), 30_000);
353
- const imgResp = await fetch(imageData.url, { signal: dlCtrl.signal });
354
- clearTimeout(dlTimeout);
355
- const buffer = Buffer.from(await imgResp.arrayBuffer());
356
- fs.mkdirSync(path.dirname(outPath), { recursive: true });
357
- fs.writeFileSync(outPath, buffer);
358
- }
359
- else {
360
- return { output: 'No image data (b64_json or url) in response', isError: true };
452
+ catch (err) {
453
+ return { output: `Error saving image ${i + 1}: ${err.message}`, isError: true };
454
+ }
455
+ savedPaths.push(targetPaths[i]);
361
456
  }
362
- const fileSize = fs.statSync(outPath).size;
363
- const sizeKB = (fileSize / 1024).toFixed(1);
364
- const revisedPrompt = imageData.revised_prompt ? `\nRevised prompt: ${imageData.revised_prompt}` : '';
365
457
  // Stats: record this generation so it shows up in `franklin insights`
366
458
  // alongside chat spend. Before this, media generations bypassed
367
459
  // recordUsage entirely (only LLM chat calls were tracked), so the
@@ -372,26 +464,41 @@ function buildExecute(deps) {
372
464
  void (async () => {
373
465
  try {
374
466
  const m = await findModel(imageModel);
375
- const estCost = m ? estimateCostUsd(m, { quantity: 1 }) : 0;
467
+ const estCost = m ? estimateCostUsd(m, { quantity: items.length }) : 0;
376
468
  recordUsage(imageModel, 0, 0, estCost, latencyMs);
377
469
  }
378
470
  catch { /* ignore stats errors */ }
379
471
  })();
380
472
  let contentSummary = '';
381
473
  if (contentId && deps.library) {
382
- const rec = recordImageAsset(deps.library, {
383
- contentId,
384
- imagePath: outPath,
385
- model: imageModel,
386
- size: imageSize,
387
- });
388
- if (rec.ok) {
474
+ // Record each saved image as its own asset so the content's budget
475
+ // counts every paid output, not just the first.
476
+ let attached = 0;
477
+ let totalCost = 0;
478
+ let lastReason = '';
479
+ for (const p of savedPaths) {
480
+ const rec = recordImageAsset(deps.library, {
481
+ contentId,
482
+ imagePath: p,
483
+ model: imageModel,
484
+ size: imageSize,
485
+ });
486
+ if (rec.ok) {
487
+ attached++;
488
+ totalCost += rec.costUsd;
489
+ }
490
+ else {
491
+ lastReason = rec.reason;
492
+ }
493
+ }
494
+ if (attached > 0) {
389
495
  if (deps.onContentChange)
390
496
  await deps.onContentChange();
391
497
  const c = deps.library.get(contentId);
392
498
  contentSummary =
393
499
  `\n\n## Content updated\n` +
394
- `- Attached to \`${contentId}\` at est. $${rec.costUsd.toFixed(2)}\n` +
500
+ `- Attached ${attached} image${attached > 1 ? 's' : ''} to ` +
501
+ `\`${contentId}\` at est. $${totalCost.toFixed(2)}\n` +
395
502
  (c
396
503
  ? `- Spent: $${c.spentUsd.toFixed(2)} / $${c.budgetUsd.toFixed(2)} cap ` +
397
504
  `(remaining $${(c.budgetUsd - c.spentUsd).toFixed(2)})`
@@ -402,20 +509,33 @@ function buildExecute(deps) {
402
509
  // after a successful paid generation is rare (TOCTOU) but possible.
403
510
  contentSummary =
404
511
  `\n\n## Content NOT updated\n` +
405
- `- ${rec.reason}\n` +
406
- `- The image was generated and saved locally; cost was NOT recorded ` +
407
- `against the content budget.`;
512
+ `- ${lastReason}\n` +
513
+ `- The image${savedPaths.length > 1 ? 's were' : ' was'} generated and ` +
514
+ `saved locally; cost was NOT recorded against the content budget.`;
408
515
  }
409
516
  }
517
+ const revisedPrompt = items[0]?.revised_prompt
518
+ ? `\nRevised prompt: ${items[0].revised_prompt}`
519
+ : '';
520
+ const summaryLines = savedPaths.map(p => {
521
+ const kb = (fs.statSync(p).size / 1024).toFixed(1);
522
+ return `- ${p} (${kb}KB, ${imageSize})`;
523
+ });
524
+ const header = savedPaths.length === 1
525
+ ? `Image saved to ${savedPaths[0]} (${(fs.statSync(savedPaths[0]).size / 1024).toFixed(1)}KB, ${imageSize})`
526
+ : `${savedPaths.length} images saved:\n${summaryLines.join('\n')}`;
527
+ const openHint = savedPaths.length === 1
528
+ ? `\n\nOpen with: open ${savedPaths[0]}`
529
+ : `\n\nOpen with: open ${savedPaths.join(' ')}`;
410
530
  return {
411
- output: `Image saved to ${outPath} (${sizeKB}KB, ${imageSize})${revisedPrompt}\n\nOpen with: open ${outPath}${contentSummary}`,
531
+ output: `${header}${revisedPrompt}${openHint}${contentSummary}`,
412
532
  };
413
533
  }
414
534
  catch (err) {
415
535
  const msg = err.message || '';
416
536
  if (msg.includes('abort')) {
417
537
  return {
418
- output: referenceImage
538
+ output: editMode
419
539
  ? 'Image-to-image timed out (180s limit). The reference image may be too large or the model under load — try a smaller image or simpler prompt.'
420
540
  : 'Image generation timed out (60s limit). Try a simpler prompt.',
421
541
  isError: true,
@@ -428,6 +548,44 @@ function buildExecute(deps) {
428
548
  }
429
549
  };
430
550
  }
551
+ /** Insert a `-{idx}` suffix before the file extension: a.png → a-2.png. */
552
+ export function withIndexSuffix(p, idx) {
553
+ const ext = path.extname(p);
554
+ const base = ext ? p.slice(0, p.length - ext.length) : p;
555
+ return `${base}-${idx}${ext}`;
556
+ }
557
+ /**
558
+ * Save one gateway image item to disk. Handles b64_json, data-URI `url`
559
+ * (Gemini), and remote `url` (downloaded with a 30s timeout). Throws on a
560
+ * malformed or empty item.
561
+ */
562
+ async function saveImageDataToFile(imageData, destPath) {
563
+ fs.mkdirSync(path.dirname(destPath), { recursive: true });
564
+ if (imageData.b64_json) {
565
+ fs.writeFileSync(destPath, Buffer.from(imageData.b64_json, 'base64'));
566
+ return;
567
+ }
568
+ if (imageData.url && imageData.url.startsWith('data:')) {
569
+ const match = imageData.url.match(/^data:[^;]+;base64,(.+)$/);
570
+ if (!match)
571
+ throw new Error('Malformed data URI in response');
572
+ fs.writeFileSync(destPath, Buffer.from(match[1], 'base64'));
573
+ return;
574
+ }
575
+ if (imageData.url) {
576
+ const dlCtrl = new AbortController();
577
+ const dlTimeout = setTimeout(() => dlCtrl.abort(), 30_000);
578
+ try {
579
+ const imgResp = await fetch(imageData.url, { signal: dlCtrl.signal });
580
+ fs.writeFileSync(destPath, Buffer.from(await imgResp.arrayBuffer()));
581
+ }
582
+ finally {
583
+ clearTimeout(dlTimeout);
584
+ }
585
+ return;
586
+ }
587
+ throw new Error('No image data (b64_json or url) in response');
588
+ }
431
589
  // ─── Payment ───────────────────────────────────────────────────────────────
432
590
  async function signPayment(response, chain, endpoint) {
433
591
  try {
@@ -489,24 +647,30 @@ export function createImageGenCapability(deps = {}) {
489
647
  return {
490
648
  spec: {
491
649
  name: 'ImageGen',
492
- description: "Generate an image from a text prompt optionally with a reference " +
493
- "image for style transfer / character consistency / edits. Costs USDC " +
494
- "from the user's wallet — confirm before generating. Saves to a local " +
495
- "file. Default size: 1024x1024. Do NOT call repeatedly to iterate on " +
496
- "style ask the user first. Pass contentId to attach the result to " +
497
- "an existing Content piece: the content's budget is checked BEFORE " +
498
- "paying, and on success the image is recorded as an asset with its " +
499
- "estimated cost. Skipping contentId generates a one-off image with no " +
500
- "budget tracking. When image_url is set, only edit-capable models " +
501
- "(openai/gpt-image-1, openai/gpt-image-2) are accepted.",
650
+ description: "Generate or edit an image. Text-to-image from a prompt, or " +
651
+ "image-to-image when you pass a reference image (style transfer, " +
652
+ "character consistency, edits). Supports mask-based inpainting and " +
653
+ "multi-image fusion. Costs USDC from the user's wallet confirm " +
654
+ "before generating. Saves to local file(s). Default size: 1024x1024. " +
655
+ "Do NOT call repeatedly to iterate on style ask the user first. " +
656
+ "Pass contentId to attach the result to an existing Content piece: " +
657
+ "the content's budget is checked BEFORE paying, and on success each " +
658
+ "image is recorded as an asset with its estimated cost. Skipping " +
659
+ "contentId generates one-off images with no budget tracking. " +
660
+ "Edit-capable models: openai/gpt-image-1, openai/gpt-image-2, " +
661
+ "google/nano-banana, google/nano-banana-pro. Mask inpainting is " +
662
+ "OpenAI-only; multi-image fusion is capped at 4 (OpenAI) / 3 (Google).",
502
663
  input_schema: {
503
664
  type: 'object',
504
665
  properties: {
505
- prompt: { type: 'string', description: 'Text description of the image to generate' },
506
- output_path: { type: 'string', description: 'Where to save the image. Default: generated-<timestamp>.png in working directory' },
507
- size: { type: 'string', description: 'Image size: 1024x1024, 1792x1024, or 1024x1792. Default: 1024x1024. Note: openai/gpt-image-2 is forced to 1024x1024 (other sizes time out at the gateway).' },
508
- model: { type: 'string', description: 'Image model to use. Default: openai/gpt-image-1' },
509
- image_url: { type: 'string', description: 'Optional reference image (image-to-image / style transfer). Accepts an http(s) URL, a data URI, or a local file path. Only works with edit-capable models.' },
666
+ prompt: { type: 'string', description: 'Text description of the image to generate, or edit instructions when a reference image is provided' },
667
+ output_path: { type: 'string', description: 'Where to save the image. Default: generated-<timestamp>.png in working directory. With n>1, a -1/-2/... suffix is appended before the extension.' },
668
+ size: { type: 'string', description: 'Image size. gpt-image-1/2: 1024x1024, 1536x1024, 1024x1536. google/nano-banana: 1024x1024. google/nano-banana-pro: 1024x1024, 2048x2048, 4096x4096. Default: 1024x1024. Note: openai/gpt-image-2 is forced to 1024x1024 (other sizes time out at the gateway).' },
669
+ model: { type: 'string', description: 'Image model to use. Default: openai/gpt-image-1 (text-to-image) / openai/gpt-image-2 (image-to-image).' },
670
+ image_url: { type: 'string', description: 'Optional reference image (image-to-image / style transfer). Accepts an http(s) URL, a data URI, or a local file path. Only edit-capable models are accepted.' },
671
+ images: { type: 'array', items: { type: 'string' }, description: 'Optional list of reference images for multi-image fusion (e.g. subject + logo). Same forms as image_url. Merged with image_url. Cap: OpenAI 4, Google 3. Cannot combine with mask.' },
672
+ mask: { type: 'string', description: 'Optional mask for inpainting — transparent pixels mark the editable region. Same forms as image_url. OpenAI edit models only; cannot combine with multiple source images.' },
673
+ n: { type: 'number', description: 'Number of images to generate, 1-4. Default 1. Cost scales with n.' },
510
674
  contentId: { type: 'string', description: 'Optional Content id to attach this generation to. Pre-flight budget check + auto-record on success.' },
511
675
  },
512
676
  required: ['prompt'],
@@ -33,6 +33,8 @@ import { defiLlamaProtocolsCapability, defiLlamaProtocolCapability, defiLlamaCha
33
33
  import { predictionMarketCapability } from './prediction.js';
34
34
  import { modalCapabilities } from './modal.js';
35
35
  import { blockrunCapability } from './blockrun.js';
36
+ import { surfCapabilities } from './surf.js';
37
+ import { realFaceCapability } from './realface.js';
36
38
  import { listPhoneNumbersCapability, buyPhoneNumberCapability, renewPhoneNumberCapability, releasePhoneNumberCapability, phoneLookupCapability, phoneFraudCheckCapability, } from './phone.js';
37
39
  import { voiceCallCapability, voiceStatusCapability } from './voice.js';
38
40
  import { createTradingCapabilities } from './trading-execute.js';
@@ -166,7 +168,8 @@ export const allCapabilities = [
166
168
  defiLlamaYieldsCapability,
167
169
  defiLlamaPriceCapability,
168
170
  predictionMarketCapability, // Polymarket / Kalshi / matching / smart money via Predexon
169
- blockrunCapability, // Generic x402-paid gateway primitive — Surf, future partners (see /surf-* skills)
171
+ blockrunCapability, // Generic x402-paid gateway primitive — future partners + long-tail Surf paths
172
+ ...surfCapabilities, // SurfMarket / SurfChain / SurfSocial — endpoint-enum function tools (no path guessing, auto x402)
170
173
  // Phone & Voice — typed surface so the agent pattern-matches on the user
171
174
  // intent ("buy a number", "make a call") without needing to consult the
172
175
  // BlockRun primitive or the .well-known/x402 manifest. All wrap the same
@@ -179,6 +182,7 @@ export const allCapabilities = [
179
182
  phoneFraudCheckCapability, // PhoneFraudCheck — $0.05
180
183
  voiceCallCapability, // VoiceCall — $0.54 / call (Bland.ai)
181
184
  voiceStatusCapability, // VoiceStatus — free (poll)
185
+ realFaceCapability, // RealFace — init/status/enroll/list; $0.01 enroll → ta_xxx avatar for VideoGen
182
186
  // Modal GPU sandbox tools — registered but hidden by default (not in
183
187
  // CORE_TOOL_NAMES). Agent must `ActivateTool({names:["ModalCreate",...]})`
184
188
  // before they appear in its tool inventory. High-cost ($0.40/H100 create)
@@ -0,0 +1,29 @@
1
+ /**
2
+ * RealFace — enroll a real person's face as a reusable video avatar.
3
+ *
4
+ * Wraps BlockRun's /v1/realface/* flow so the agent never hand-rolls paths or
5
+ * x402. Enrollment is a three-step, human-in-the-loop flow because the upstream
6
+ * provider (Token360 / BytePlus) requires a live liveness check on a phone:
7
+ *
8
+ * 1. action="init" (FREE) → creates a REAL_FACE group, returns an `h5_link`.
9
+ * Show it to the user as a QR / URL. They scan it
10
+ * on their phone and do a ~1-minute liveness check
11
+ * (nod + blink). The link expires in 120s; call
12
+ * init again with the same group_id to refresh.
13
+ * 2. action="status" (FREE) → poll the group until status === "active" (the
14
+ * person finished the phone liveness). Bounded
15
+ * poll (~24s) so a quick scan resolves in one call.
16
+ * 3. action="enroll" ($0.01)→ uploads a face photo (public https URL), waits
17
+ * for the biometric match, returns the `ta_xxx`
18
+ * asset id. Pre-flights group-active (425 if not);
19
+ * no charge if the upload/match fails.
20
+ * action="list" (FREE) → lists the wallet's enrolled RealFace assets.
21
+ *
22
+ * Use the returned `ta_xxx` as `real_face_asset_id` on a VideoGen call with a
23
+ * Seedance 2.0 model for cross-frame character consistency.
24
+ *
25
+ * x402 signing mirrors src/tools/videogen.ts / blockrun.ts (kept as copy-paste
26
+ * per the same rationale documented there — a shared module is out of scope).
27
+ */
28
+ import type { CapabilityHandler } from '../agent/types.js';
29
+ export declare const realFaceCapability: CapabilityHandler;