@blockrun/franklin 3.21.8 → 3.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -0
- package/dist/agent/context.js +5 -3
- package/dist/content/image-pricing.d.ts +7 -1
- package/dist/content/image-pricing.js +40 -17
- package/dist/content/record-image.d.ts +1 -1
- package/dist/content/record-image.js +2 -2
- package/dist/tools/blockrun.js +13 -4
- package/dist/tools/imagegen.d.ts +22 -3
- package/dist/tools/imagegen.js +252 -88
- package/dist/tools/index.js +5 -1
- package/dist/tools/realface.d.ts +29 -0
- package/dist/tools/realface.js +263 -0
- package/dist/tools/surf.d.ts +22 -0
- package/dist/tools/surf.js +281 -0
- package/dist/tools/tool-categories.js +7 -0
- package/dist/tools/videogen.js +44 -1
- package/dist/tools/voice.d.ts +1 -0
- package/dist/tools/voice.js +40 -21
- package/package.json +1 -1
package/dist/tools/imagegen.js
CHANGED
|
@@ -13,14 +13,49 @@ import { recordUsage } from '../stats/tracker.js';
|
|
|
13
13
|
import { findModel, estimateCostUsd } from '../gateway-models.js';
|
|
14
14
|
import { logger } from '../logger.js';
|
|
15
15
|
/**
 * Model ids that may be sent to /v1/images/image2image with a reference
 * image. Kept in step with the gateway's EDIT_SUPPORTED_MODELS
 * (src/app/api/v1/images/image2image/route.ts): OpenAI gpt-image-* and
 * Google Nano Banana both do image-to-image edits. Insertion order is the
 * order the ids appear in user-facing error messages.
 */
export const EDIT_SUPPORTED_MODELS = new Set(['openai/gpt-image-1', 'openai/gpt-image-2', 'google/nano-banana', 'google/nano-banana-pro']);

/**
 * Model ids that accept a `mask` for inpainting. OpenAI only — Gemini
 * (Nano Banana) edits are prompt-driven and have no mask concept. Kept in
 * step with the gateway's MASK_SUPPORTED_MODELS.
 */
export const MASK_SUPPORTED_MODELS = new Set(['openai/gpt-image-1', 'openai/gpt-image-2']);

/**
 * Maximum number of source images per edit (fusion), keyed by the provider
 * prefix of the model id including the trailing slash. Kept in step with the
 * gateway's MAX_IMAGES_BY_PREFIX: OpenAI fuses up to 4 anchors, Gemini up to 3.
 */
const MAX_IMAGES_BY_PREFIX = { 'openai/': 4, 'google/': 3 };

/**
 * Client-side ceiling on how many output images one call may request. The
 * gateway has no hard maximum, but price scales with n, so this cap keeps a
 * typo from draining the wallet.
 */
export const MAX_OUTPUT_IMAGES = 4;

/**
 * Accepted `size` values for each known image model, mirroring the gateway's
 * IMAGE_MODELS.sizes (src/lib/models.ts). Lets a bad size fail cheaply before
 * any payment. A model id missing from this table (custom / future gateway
 * models) is not validated here — the gateway decides.
 */
export const IMAGE_MODEL_SIZES = {
    'openai/gpt-image-1': ['1024x1024', '1536x1024', '1024x1536'],
    'openai/gpt-image-2': ['1024x1024', '1536x1024', '1024x1536'],
    'google/nano-banana': ['1024x1024'],
    'google/nano-banana-pro': ['1024x1024', '2048x2048', '4096x4096'],
};
|
|
24
59
|
export const REFERENCE_IMAGE_MAX_BYTES = 4_000_000;
|
|
25
60
|
/**
|
|
26
61
|
* Normalize a reference image into a base64 data URI for the gateway. The
|
|
@@ -77,21 +112,33 @@ export async function resolveReferenceImage(input, workingDir) {
|
|
|
77
112
|
function buildExecute(deps) {
|
|
78
113
|
return async function execute(input, ctx) {
|
|
79
114
|
const rawInput = input;
|
|
80
|
-
const { output_path, size, model, contentId, image_url } = rawInput;
|
|
115
|
+
const { output_path, size, model, contentId, image_url, mask } = rawInput;
|
|
81
116
|
if (!rawInput.prompt) {
|
|
82
117
|
return { output: 'Error: prompt is required', isError: true };
|
|
83
118
|
}
|
|
84
|
-
//
|
|
85
|
-
//
|
|
86
|
-
//
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
119
|
+
// Collect reference images: image_url (single, back-compat) + images[]
|
|
120
|
+
// (fusion), in that order. Edit mode is active whenever at least one
|
|
121
|
+
// reference image is present — the call then routes to image2image.
|
|
122
|
+
const referenceInputs = [
|
|
123
|
+
...(image_url ? [image_url] : []),
|
|
124
|
+
...(Array.isArray(rawInput.images) ? rawInput.images.filter(Boolean) : []),
|
|
125
|
+
];
|
|
126
|
+
const editMode = referenceInputs.length > 0;
|
|
127
|
+
// Output count: 1–4. Reject out-of-range up front so a typo can't blow the
|
|
128
|
+
// wallet (price scales with n) or get silently clamped.
|
|
129
|
+
const n = rawInput.n ?? 1;
|
|
130
|
+
if (!Number.isInteger(n) || n < 1 || n > MAX_OUTPUT_IMAGES) {
|
|
131
|
+
return {
|
|
132
|
+
output: `Error: n must be an integer between 1 and ${MAX_OUTPUT_IMAGES} (got ${rawInput.n}).`,
|
|
133
|
+
isError: true,
|
|
134
|
+
};
|
|
135
|
+
}
|
|
136
|
+
// A mask only makes sense as an inpainting directive on a source image.
|
|
137
|
+
if (mask && !editMode) {
|
|
138
|
+
return {
|
|
139
|
+
output: 'Error: mask requires a source image. Pass image_url (or images) alongside mask.',
|
|
140
|
+
isError: true,
|
|
141
|
+
};
|
|
95
142
|
}
|
|
96
143
|
// One-shot refinement opt-out: leading `///` tells Franklin "don't
|
|
97
144
|
// refine this prompt, I wrote it the way I want it." Strip the prefix
|
|
@@ -111,30 +158,60 @@ function buildExecute(deps) {
|
|
|
111
158
|
// Reference-image mode forces an edit-capable model. If the caller named
|
|
112
159
|
// an unsupported one, fail loudly so we don't silently downgrade their
|
|
113
160
|
// request to text-only generation.
|
|
114
|
-
if (
|
|
161
|
+
if (editMode && model && !EDIT_SUPPORTED_MODELS.has(model)) {
|
|
115
162
|
return {
|
|
116
163
|
output: `Error: model ${model} does not support reference images. ` +
|
|
117
164
|
`Use one of: ${[...EDIT_SUPPORTED_MODELS].join(', ')}.`,
|
|
118
165
|
isError: true,
|
|
119
166
|
};
|
|
120
167
|
}
|
|
121
|
-
let imageModel = model || (
|
|
168
|
+
let imageModel = model || (editMode ? 'openai/gpt-image-2' : 'openai/gpt-image-1');
|
|
122
169
|
let imageSize = size || '1024x1024';
|
|
123
170
|
let chosenPrompt = prompt;
|
|
171
|
+
// ── Edit-mode constraint checks (mirror the gateway, fail before paying) ──
|
|
172
|
+
if (editMode) {
|
|
173
|
+
// Mask inpainting is OpenAI-only.
|
|
174
|
+
if (mask && !MASK_SUPPORTED_MODELS.has(imageModel)) {
|
|
175
|
+
return {
|
|
176
|
+
output: `Error: model ${imageModel} does not support mask-based editing. ` +
|
|
177
|
+
`Mask inpainting is available on: ${[...MASK_SUPPORTED_MODELS].join(', ')}. ` +
|
|
178
|
+
`Omit mask to edit with ${imageModel}.`,
|
|
179
|
+
isError: true,
|
|
180
|
+
};
|
|
181
|
+
}
|
|
182
|
+
// A mask targets a single region — it has no meaning across multiple
|
|
183
|
+
// source images.
|
|
184
|
+
if (mask && referenceInputs.length > 1) {
|
|
185
|
+
return {
|
|
186
|
+
output: 'Error: mask cannot be combined with multiple source images. ' +
|
|
187
|
+
'Send a single image with a mask, or multiple images without a mask.',
|
|
188
|
+
isError: true,
|
|
189
|
+
};
|
|
190
|
+
}
|
|
191
|
+
// Per-provider fusion cap.
|
|
192
|
+
const maxImages = MAX_IMAGES_BY_PREFIX[`${imageModel.split('/')[0]}/`] ?? 1;
|
|
193
|
+
if (referenceInputs.length > maxImages) {
|
|
194
|
+
return {
|
|
195
|
+
output: `Error: model ${imageModel} accepts at most ${maxImages} source ` +
|
|
196
|
+
`image${maxImages > 1 ? 's' : ''} per edit (got ${referenceInputs.length}).`,
|
|
197
|
+
isError: true,
|
|
198
|
+
};
|
|
199
|
+
}
|
|
200
|
+
}
|
|
124
201
|
// Skip the proposal flow when a reference image is set: the media router
|
|
125
202
|
// doesn't know which models support image-to-image, so its suggestions
|
|
126
203
|
// would frequently be unusable (text-only models). Default to gpt-image-2
|
|
127
204
|
// for now; a future router upgrade can pick between the four edit-capable
|
|
128
205
|
// models based on the prompt.
|
|
129
206
|
const autoApprove = process.env.FRANKLIN_MEDIA_AUTO_APPROVE_ALL === '1';
|
|
130
|
-
if (!model && !autoApprove && ctx.onAskUser && !
|
|
207
|
+
if (!model && !autoApprove && ctx.onAskUser && !editMode) {
|
|
131
208
|
try {
|
|
132
209
|
const chain = loadChain();
|
|
133
210
|
const client = new ModelClient({ apiUrl: API_URLS[chain], chain });
|
|
134
211
|
const proposal = await analyzeMediaRequest({
|
|
135
212
|
kind: 'image',
|
|
136
213
|
prompt,
|
|
137
|
-
quantity:
|
|
214
|
+
quantity: n,
|
|
138
215
|
client,
|
|
139
216
|
signal: ctx.abortSignal,
|
|
140
217
|
skipRefine,
|
|
@@ -178,8 +255,20 @@ function buildExecute(deps) {
|
|
|
178
255
|
if (imageModel === 'openai/gpt-image-2' && imageSize !== '1024x1024') {
|
|
179
256
|
imageSize = '1024x1024';
|
|
180
257
|
}
|
|
258
|
+
// Validate the size against the model's supported set before paying. The
|
|
259
|
+
// gateway rejects unsupported sizes with a 400, so catching it here saves
|
|
260
|
+
// a wasted round-trip (and historically a wasted x402 retry). Models not
|
|
261
|
+
// in the table (custom / future gateway models) skip this check.
|
|
262
|
+
const supportedSizes = IMAGE_MODEL_SIZES[imageModel];
|
|
263
|
+
if (supportedSizes && !supportedSizes.includes(imageSize)) {
|
|
264
|
+
return {
|
|
265
|
+
output: `Error: invalid size ${imageSize} for ${imageModel}. ` +
|
|
266
|
+
`Supported sizes: ${supportedSizes.join(', ')}.`,
|
|
267
|
+
isError: true,
|
|
268
|
+
};
|
|
269
|
+
}
|
|
181
270
|
if (contentId && deps.library) {
|
|
182
|
-
const decision = checkImageBudget(deps.library, contentId, imageModel, imageSize);
|
|
271
|
+
const decision = checkImageBudget(deps.library, contentId, imageModel, imageSize, n);
|
|
183
272
|
if (!decision.ok) {
|
|
184
273
|
// Normal text output, not isError — the agent should adapt (smaller
|
|
185
274
|
// size, different model, raise budget) rather than trigger retry.
|
|
@@ -191,29 +280,49 @@ function buildExecute(deps) {
|
|
|
191
280
|
};
|
|
192
281
|
}
|
|
193
282
|
}
|
|
283
|
+
// Resolve all reference images + the mask into base64 data URIs now, right
|
|
284
|
+
// before the paid call. Done after the cheap validations so bad paths /
|
|
285
|
+
// oversize attachments / unsupported combinations fail without any network
|
|
286
|
+
// or filesystem cost beyond what's necessary.
|
|
287
|
+
let referenceImages = [];
|
|
288
|
+
let resolvedMask;
|
|
289
|
+
if (editMode) {
|
|
290
|
+
try {
|
|
291
|
+
referenceImages = await Promise.all(referenceInputs.map(r => resolveReferenceImage(r, ctx.workingDir)));
|
|
292
|
+
if (mask)
|
|
293
|
+
resolvedMask = await resolveReferenceImage(mask, ctx.workingDir);
|
|
294
|
+
}
|
|
295
|
+
catch (err) {
|
|
296
|
+
return { output: `Error: ${err.message}`, isError: true };
|
|
297
|
+
}
|
|
298
|
+
}
|
|
194
299
|
const chain = loadChain();
|
|
195
300
|
const apiUrl = API_URLS[chain];
|
|
196
301
|
// Reference-image mode hits the dedicated /v1/images/image2image endpoint;
|
|
197
302
|
// otherwise stay on text-to-image generations.
|
|
198
|
-
const endpoint =
|
|
303
|
+
const endpoint = editMode
|
|
199
304
|
? `${apiUrl}/v1/images/image2image`
|
|
200
305
|
: `${apiUrl}/v1/images/generations`;
|
|
201
306
|
// Default output path
|
|
202
307
|
const outPath = output_path
|
|
203
308
|
? (path.isAbsolute(output_path) ? output_path : path.resolve(ctx.workingDir, output_path))
|
|
204
309
|
: path.resolve(ctx.workingDir, `generated-${Date.now()}.png`);
|
|
205
|
-
const body = JSON.stringify(
|
|
310
|
+
const body = JSON.stringify(editMode
|
|
206
311
|
? {
|
|
207
312
|
model: imageModel,
|
|
208
313
|
prompt: chosenPrompt,
|
|
209
|
-
image
|
|
314
|
+
// Gateway accepts a string (single) or array (fusion) for `image`.
|
|
315
|
+
// Send a string for the single-image case to keep that path byte-
|
|
316
|
+
// identical to before.
|
|
317
|
+
image: referenceImages.length === 1 ? referenceImages[0] : referenceImages,
|
|
318
|
+
...(resolvedMask ? { mask: resolvedMask } : {}),
|
|
210
319
|
size: imageSize,
|
|
211
|
-
n
|
|
320
|
+
n,
|
|
212
321
|
}
|
|
213
322
|
: {
|
|
214
323
|
model: imageModel,
|
|
215
324
|
prompt: chosenPrompt,
|
|
216
|
-
n
|
|
325
|
+
n,
|
|
217
326
|
size: imageSize,
|
|
218
327
|
response_format: 'b64_json',
|
|
219
328
|
});
|
|
@@ -228,7 +337,7 @@ function buildExecute(deps) {
|
|
|
228
337
|
// both x402 retry attempts plus the actual generation, which made
|
|
229
338
|
// image-to-image effectively always time out. Image-to-image gets 3
|
|
230
339
|
// minutes; text-to-image keeps the original 60s.
|
|
231
|
-
const timeoutMs =
|
|
340
|
+
const timeoutMs = editMode ? 180_000 : 60_000;
|
|
232
341
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
|
233
342
|
// Wall-clock start of the paid call, including 402 retry + (optional)
|
|
234
343
|
// 202 polling. Used by recordUsage below so franklin-stats.json
|
|
@@ -303,8 +412,8 @@ function buildExecute(deps) {
|
|
|
303
412
|
}
|
|
304
413
|
result = outcome.body;
|
|
305
414
|
}
|
|
306
|
-
const
|
|
307
|
-
if (
|
|
415
|
+
const items = (result.data ?? []).filter((d) => !!d && (!!d.b64_json || !!d.url));
|
|
416
|
+
if (items.length === 0) {
|
|
308
417
|
// Some gateways return 200 with an `error` / `message` field for
|
|
309
418
|
// moderation, quota, or upstream-model failures instead of using
|
|
310
419
|
// HTTP error codes. Without surfacing those, the agent sees only
|
|
@@ -329,39 +438,22 @@ function buildExecute(deps) {
|
|
|
329
438
|
const detail = bits.length > 0 ? ` — ${bits.join('; ')}` : '';
|
|
330
439
|
return { output: `No image data returned from API${detail}`, isError: true };
|
|
331
440
|
}
|
|
332
|
-
//
|
|
333
|
-
//
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
if (!match) {
|
|
343
|
-
return { output: 'Malformed data URI in response', isError: true };
|
|
441
|
+
// Output paths: one image keeps the requested path verbatim; multiple
|
|
442
|
+
// images get a -1/-2/... suffix before the extension so nothing clobbers.
|
|
443
|
+
const targetPaths = items.length === 1 ? [outPath] : items.map((_, i) => withIndexSuffix(outPath, i + 1));
|
|
444
|
+
// Save each returned image. The /v1/images/image2image endpoint returns
|
|
445
|
+
// Gemini results as a data URI in `url`, so decode those locally instead
|
|
446
|
+
// of going through fetch — saves a round-trip and avoids data:-URI quirks.
|
|
447
|
+
const savedPaths = [];
|
|
448
|
+
for (let i = 0; i < items.length; i++) {
|
|
449
|
+
try {
|
|
450
|
+
await saveImageDataToFile(items[i], targetPaths[i]);
|
|
344
451
|
}
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
else if (imageData.url) {
|
|
350
|
-
// Download from URL (with 30s timeout)
|
|
351
|
-
const dlCtrl = new AbortController();
|
|
352
|
-
const dlTimeout = setTimeout(() => dlCtrl.abort(), 30_000);
|
|
353
|
-
const imgResp = await fetch(imageData.url, { signal: dlCtrl.signal });
|
|
354
|
-
clearTimeout(dlTimeout);
|
|
355
|
-
const buffer = Buffer.from(await imgResp.arrayBuffer());
|
|
356
|
-
fs.mkdirSync(path.dirname(outPath), { recursive: true });
|
|
357
|
-
fs.writeFileSync(outPath, buffer);
|
|
358
|
-
}
|
|
359
|
-
else {
|
|
360
|
-
return { output: 'No image data (b64_json or url) in response', isError: true };
|
|
452
|
+
catch (err) {
|
|
453
|
+
return { output: `Error saving image ${i + 1}: ${err.message}`, isError: true };
|
|
454
|
+
}
|
|
455
|
+
savedPaths.push(targetPaths[i]);
|
|
361
456
|
}
|
|
362
|
-
const fileSize = fs.statSync(outPath).size;
|
|
363
|
-
const sizeKB = (fileSize / 1024).toFixed(1);
|
|
364
|
-
const revisedPrompt = imageData.revised_prompt ? `\nRevised prompt: ${imageData.revised_prompt}` : '';
|
|
365
457
|
// Stats: record this generation so it shows up in `franklin insights`
|
|
366
458
|
// alongside chat spend. Before this, media generations bypassed
|
|
367
459
|
// recordUsage entirely (only LLM chat calls were tracked), so the
|
|
@@ -372,26 +464,41 @@ function buildExecute(deps) {
|
|
|
372
464
|
void (async () => {
|
|
373
465
|
try {
|
|
374
466
|
const m = await findModel(imageModel);
|
|
375
|
-
const estCost = m ? estimateCostUsd(m, { quantity:
|
|
467
|
+
const estCost = m ? estimateCostUsd(m, { quantity: items.length }) : 0;
|
|
376
468
|
recordUsage(imageModel, 0, 0, estCost, latencyMs);
|
|
377
469
|
}
|
|
378
470
|
catch { /* ignore stats errors */ }
|
|
379
471
|
})();
|
|
380
472
|
let contentSummary = '';
|
|
381
473
|
if (contentId && deps.library) {
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
474
|
+
// Record each saved image as its own asset so the content's budget
|
|
475
|
+
// counts every paid output, not just the first.
|
|
476
|
+
let attached = 0;
|
|
477
|
+
let totalCost = 0;
|
|
478
|
+
let lastReason = '';
|
|
479
|
+
for (const p of savedPaths) {
|
|
480
|
+
const rec = recordImageAsset(deps.library, {
|
|
481
|
+
contentId,
|
|
482
|
+
imagePath: p,
|
|
483
|
+
model: imageModel,
|
|
484
|
+
size: imageSize,
|
|
485
|
+
});
|
|
486
|
+
if (rec.ok) {
|
|
487
|
+
attached++;
|
|
488
|
+
totalCost += rec.costUsd;
|
|
489
|
+
}
|
|
490
|
+
else {
|
|
491
|
+
lastReason = rec.reason;
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
if (attached > 0) {
|
|
389
495
|
if (deps.onContentChange)
|
|
390
496
|
await deps.onContentChange();
|
|
391
497
|
const c = deps.library.get(contentId);
|
|
392
498
|
contentSummary =
|
|
393
499
|
`\n\n## Content updated\n` +
|
|
394
|
-
`- Attached
|
|
500
|
+
`- Attached ${attached} image${attached > 1 ? 's' : ''} to ` +
|
|
501
|
+
`\`${contentId}\` at est. $${totalCost.toFixed(2)}\n` +
|
|
395
502
|
(c
|
|
396
503
|
? `- Spent: $${c.spentUsd.toFixed(2)} / $${c.budgetUsd.toFixed(2)} cap ` +
|
|
397
504
|
`(remaining $${(c.budgetUsd - c.spentUsd).toFixed(2)})`
|
|
@@ -402,20 +509,33 @@ function buildExecute(deps) {
|
|
|
402
509
|
// after a successful paid generation is rare (TOCTOU) but possible.
|
|
403
510
|
contentSummary =
|
|
404
511
|
`\n\n## Content NOT updated\n` +
|
|
405
|
-
`- ${
|
|
406
|
-
`- The image
|
|
407
|
-
`against the content budget.`;
|
|
512
|
+
`- ${lastReason}\n` +
|
|
513
|
+
`- The image${savedPaths.length > 1 ? 's were' : ' was'} generated and ` +
|
|
514
|
+
`saved locally; cost was NOT recorded against the content budget.`;
|
|
408
515
|
}
|
|
409
516
|
}
|
|
517
|
+
const revisedPrompt = items[0]?.revised_prompt
|
|
518
|
+
? `\nRevised prompt: ${items[0].revised_prompt}`
|
|
519
|
+
: '';
|
|
520
|
+
const summaryLines = savedPaths.map(p => {
|
|
521
|
+
const kb = (fs.statSync(p).size / 1024).toFixed(1);
|
|
522
|
+
return `- ${p} (${kb}KB, ${imageSize})`;
|
|
523
|
+
});
|
|
524
|
+
const header = savedPaths.length === 1
|
|
525
|
+
? `Image saved to ${savedPaths[0]} (${(fs.statSync(savedPaths[0]).size / 1024).toFixed(1)}KB, ${imageSize})`
|
|
526
|
+
: `${savedPaths.length} images saved:\n${summaryLines.join('\n')}`;
|
|
527
|
+
const openHint = savedPaths.length === 1
|
|
528
|
+
? `\n\nOpen with: open ${savedPaths[0]}`
|
|
529
|
+
: `\n\nOpen with: open ${savedPaths.join(' ')}`;
|
|
410
530
|
return {
|
|
411
|
-
output:
|
|
531
|
+
output: `${header}${revisedPrompt}${openHint}${contentSummary}`,
|
|
412
532
|
};
|
|
413
533
|
}
|
|
414
534
|
catch (err) {
|
|
415
535
|
const msg = err.message || '';
|
|
416
536
|
if (msg.includes('abort')) {
|
|
417
537
|
return {
|
|
418
|
-
output:
|
|
538
|
+
output: editMode
|
|
419
539
|
? 'Image-to-image timed out (180s limit). The reference image may be too large or the model under load — try a smaller image or simpler prompt.'
|
|
420
540
|
: 'Image generation timed out (60s limit). Try a simpler prompt.',
|
|
421
541
|
isError: true,
|
|
@@ -428,6 +548,44 @@ function buildExecute(deps) {
|
|
|
428
548
|
}
|
|
429
549
|
};
|
|
430
550
|
}
|
|
551
|
+
/**
 * Build a sibling path carrying a numeric suffix: the text `-{idx}` goes
 * immediately before the file extension (a.png → a-2.png). When the path has
 * no extension the suffix is simply appended.
 */
export function withIndexSuffix(p, idx) {
    const extension = path.extname(p);
    // Everything up to (but excluding) the extension; the whole path if none.
    const stem = extension.length > 0 ? p.slice(0, -extension.length) : p;
    return stem + '-' + idx + extension;
}
|
|
557
|
+
/**
|
|
558
|
+
* Save one gateway image item to disk. Handles b64_json, data-URI `url`
|
|
559
|
+
* (Gemini), and remote `url` (downloaded with a 30s timeout). Throws on a
|
|
560
|
+
* malformed or empty item.
|
|
561
|
+
*/
|
|
562
|
+
async function saveImageDataToFile(imageData, destPath) {
|
|
563
|
+
fs.mkdirSync(path.dirname(destPath), { recursive: true });
|
|
564
|
+
if (imageData.b64_json) {
|
|
565
|
+
fs.writeFileSync(destPath, Buffer.from(imageData.b64_json, 'base64'));
|
|
566
|
+
return;
|
|
567
|
+
}
|
|
568
|
+
if (imageData.url && imageData.url.startsWith('data:')) {
|
|
569
|
+
const match = imageData.url.match(/^data:[^;]+;base64,(.+)$/);
|
|
570
|
+
if (!match)
|
|
571
|
+
throw new Error('Malformed data URI in response');
|
|
572
|
+
fs.writeFileSync(destPath, Buffer.from(match[1], 'base64'));
|
|
573
|
+
return;
|
|
574
|
+
}
|
|
575
|
+
if (imageData.url) {
|
|
576
|
+
const dlCtrl = new AbortController();
|
|
577
|
+
const dlTimeout = setTimeout(() => dlCtrl.abort(), 30_000);
|
|
578
|
+
try {
|
|
579
|
+
const imgResp = await fetch(imageData.url, { signal: dlCtrl.signal });
|
|
580
|
+
fs.writeFileSync(destPath, Buffer.from(await imgResp.arrayBuffer()));
|
|
581
|
+
}
|
|
582
|
+
finally {
|
|
583
|
+
clearTimeout(dlTimeout);
|
|
584
|
+
}
|
|
585
|
+
return;
|
|
586
|
+
}
|
|
587
|
+
throw new Error('No image data (b64_json or url) in response');
|
|
588
|
+
}
|
|
431
589
|
// ─── Payment ───────────────────────────────────────────────────────────────
|
|
432
590
|
async function signPayment(response, chain, endpoint) {
|
|
433
591
|
try {
|
|
@@ -489,24 +647,30 @@ export function createImageGenCapability(deps = {}) {
|
|
|
489
647
|
return {
|
|
490
648
|
spec: {
|
|
491
649
|
name: 'ImageGen',
|
|
492
|
-
description: "Generate an image from a
|
|
493
|
-
"image
|
|
494
|
-
"
|
|
495
|
-
"
|
|
496
|
-
"
|
|
497
|
-
"
|
|
498
|
-
"
|
|
499
|
-
"
|
|
500
|
-
"
|
|
501
|
-
"
|
|
650
|
+
description: "Generate or edit an image. Text-to-image from a prompt, or " +
|
|
651
|
+
"image-to-image when you pass a reference image (style transfer, " +
|
|
652
|
+
"character consistency, edits). Supports mask-based inpainting and " +
|
|
653
|
+
"multi-image fusion. Costs USDC from the user's wallet — confirm " +
|
|
654
|
+
"before generating. Saves to local file(s). Default size: 1024x1024. " +
|
|
655
|
+
"Do NOT call repeatedly to iterate on style — ask the user first. " +
|
|
656
|
+
"Pass contentId to attach the result to an existing Content piece: " +
|
|
657
|
+
"the content's budget is checked BEFORE paying, and on success each " +
|
|
658
|
+
"image is recorded as an asset with its estimated cost. Skipping " +
|
|
659
|
+
"contentId generates one-off images with no budget tracking. " +
|
|
660
|
+
"Edit-capable models: openai/gpt-image-1, openai/gpt-image-2, " +
|
|
661
|
+
"google/nano-banana, google/nano-banana-pro. Mask inpainting is " +
|
|
662
|
+
"OpenAI-only; multi-image fusion is capped at 4 (OpenAI) / 3 (Google).",
|
|
502
663
|
input_schema: {
|
|
503
664
|
type: 'object',
|
|
504
665
|
properties: {
|
|
505
|
-
prompt: { type: 'string', description: 'Text description of the image to generate' },
|
|
506
|
-
output_path: { type: 'string', description: 'Where to save the image. Default: generated-<timestamp>.png in working directory' },
|
|
507
|
-
size: { type: 'string', description: 'Image size: 1024x1024,
|
|
508
|
-
model: { type: 'string', description: 'Image model to use. Default: openai/gpt-image-1' },
|
|
509
|
-
image_url: { type: 'string', description: 'Optional reference image (image-to-image / style transfer). Accepts an http(s) URL, a data URI, or a local file path. Only
|
|
666
|
+
prompt: { type: 'string', description: 'Text description of the image to generate, or edit instructions when a reference image is provided' },
|
|
667
|
+
output_path: { type: 'string', description: 'Where to save the image. Default: generated-<timestamp>.png in working directory. With n>1, a -1/-2/... suffix is appended before the extension.' },
|
|
668
|
+
size: { type: 'string', description: 'Image size. gpt-image-1/2: 1024x1024, 1536x1024, 1024x1536. google/nano-banana: 1024x1024. google/nano-banana-pro: 1024x1024, 2048x2048, 4096x4096. Default: 1024x1024. Note: openai/gpt-image-2 is forced to 1024x1024 (other sizes time out at the gateway).' },
|
|
669
|
+
model: { type: 'string', description: 'Image model to use. Default: openai/gpt-image-1 (text-to-image) / openai/gpt-image-2 (image-to-image).' },
|
|
670
|
+
image_url: { type: 'string', description: 'Optional reference image (image-to-image / style transfer). Accepts an http(s) URL, a data URI, or a local file path. Only edit-capable models are accepted.' },
|
|
671
|
+
images: { type: 'array', items: { type: 'string' }, description: 'Optional list of reference images for multi-image fusion (e.g. subject + logo). Same forms as image_url. Merged with image_url. Cap: OpenAI 4, Google 3. Cannot combine with mask.' },
|
|
672
|
+
mask: { type: 'string', description: 'Optional mask for inpainting — transparent pixels mark the editable region. Same forms as image_url. OpenAI edit models only; cannot combine with multiple source images.' },
|
|
673
|
+
n: { type: 'number', description: 'Number of images to generate, 1-4. Default 1. Cost scales with n.' },
|
|
510
674
|
contentId: { type: 'string', description: 'Optional Content id to attach this generation to. Pre-flight budget check + auto-record on success.' },
|
|
511
675
|
},
|
|
512
676
|
required: ['prompt'],
|
package/dist/tools/index.js
CHANGED
|
@@ -33,6 +33,8 @@ import { defiLlamaProtocolsCapability, defiLlamaProtocolCapability, defiLlamaCha
|
|
|
33
33
|
import { predictionMarketCapability } from './prediction.js';
|
|
34
34
|
import { modalCapabilities } from './modal.js';
|
|
35
35
|
import { blockrunCapability } from './blockrun.js';
|
|
36
|
+
import { surfCapabilities } from './surf.js';
|
|
37
|
+
import { realFaceCapability } from './realface.js';
|
|
36
38
|
import { listPhoneNumbersCapability, buyPhoneNumberCapability, renewPhoneNumberCapability, releasePhoneNumberCapability, phoneLookupCapability, phoneFraudCheckCapability, } from './phone.js';
|
|
37
39
|
import { voiceCallCapability, voiceStatusCapability } from './voice.js';
|
|
38
40
|
import { createTradingCapabilities } from './trading-execute.js';
|
|
@@ -166,7 +168,8 @@ export const allCapabilities = [
|
|
|
166
168
|
defiLlamaYieldsCapability,
|
|
167
169
|
defiLlamaPriceCapability,
|
|
168
170
|
predictionMarketCapability, // Polymarket / Kalshi / matching / smart money via Predexon
|
|
169
|
-
blockrunCapability, // Generic x402-paid gateway primitive —
|
|
171
|
+
blockrunCapability, // Generic x402-paid gateway primitive — future partners + long-tail Surf paths
|
|
172
|
+
...surfCapabilities, // SurfMarket / SurfChain / SurfSocial — endpoint-enum function tools (no path guessing, auto x402)
|
|
170
173
|
// Phone & Voice — typed surface so the agent pattern-matches on the user
|
|
171
174
|
// intent ("buy a number", "make a call") without needing to consult the
|
|
172
175
|
// BlockRun primitive or the .well-known/x402 manifest. All wrap the same
|
|
@@ -179,6 +182,7 @@ export const allCapabilities = [
|
|
|
179
182
|
phoneFraudCheckCapability, // PhoneFraudCheck — $0.05
|
|
180
183
|
voiceCallCapability, // VoiceCall — $0.54 / call (Bland.ai)
|
|
181
184
|
voiceStatusCapability, // VoiceStatus — free (poll)
|
|
185
|
+
realFaceCapability, // RealFace — init/status/enroll/list; $0.01 enroll → ta_xxx avatar for VideoGen
|
|
182
186
|
// Modal GPU sandbox tools — registered but hidden by default (not in
|
|
183
187
|
// CORE_TOOL_NAMES). Agent must `ActivateTool({names:["ModalCreate",...]})`
|
|
184
188
|
// before they appear in its tool inventory. High-cost ($0.40/H100 create)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RealFace — enroll a real person's face as a reusable video avatar.
|
|
3
|
+
*
|
|
4
|
+
* Wraps BlockRun's /v1/realface/* flow so the agent never hand-rolls paths or
|
|
5
|
+
* x402. Enrollment is a three-step, human-in-the-loop flow because the upstream
|
|
6
|
+
* provider (Token360 / BytePlus) requires a live liveness check on a phone:
|
|
7
|
+
*
|
|
8
|
+
* 1. action="init" (FREE) → creates a REAL_FACE group, returns an `h5_link`.
|
|
9
|
+
* Show it to the user as a QR / URL. They scan it
|
|
10
|
+
* on their phone and do a ~1-minute liveness check
|
|
11
|
+
* (nod + blink). The link expires in 120s; call
|
|
12
|
+
* init again with the same group_id to refresh.
|
|
13
|
+
* 2. action="status" (FREE) → poll the group until status === "active" (the
|
|
14
|
+
* person finished the phone liveness). Bounded
|
|
15
|
+
* poll (~24s) so a quick scan resolves in one call.
|
|
16
|
+
* 3. action="enroll" ($0.01)→ uploads a face photo (public https URL), waits
|
|
17
|
+
* for the biometric match, returns the `ta_xxx`
|
|
18
|
+
* asset id. Pre-flights group-active (425 if not);
|
|
19
|
+
* no charge if the upload/match fails.
|
|
20
|
+
* action="list" (FREE) → lists the wallet's enrolled RealFace assets.
|
|
21
|
+
*
|
|
22
|
+
* Use the returned `ta_xxx` as `real_face_asset_id` on a VideoGen call with a
|
|
23
|
+
* Seedance 2.0 model for cross-frame character consistency.
|
|
24
|
+
*
|
|
25
|
+
* x402 signing mirrors src/tools/videogen.ts / blockrun.ts (kept as copy-paste
|
|
26
|
+
* per the same rationale documented there — a shared module is out of scope).
|
|
27
|
+
*/
|
|
28
|
+
import type { CapabilityHandler } from '../agent/types.js';
|
|
29
|
+
export declare const realFaceCapability: CapabilityHandler;
|