ima2-gen 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +150 -0
- package/README.md +12 -12
- package/bin/commands/backfillThumbs.js +24 -0
- package/bin/commands/edit.js +7 -6
- package/bin/commands/gen.js +13 -6
- package/bin/commands/multimode.js +5 -4
- package/bin/commands/node.js +4 -4
- package/bin/ima2.js +21 -11
- package/bin/lib/config-store.js +1 -1
- package/docs/API.md +184 -10
- package/docs/CLI.md +11 -4
- package/docs/FAQ.ko.md +16 -0
- package/docs/FAQ.md +30 -0
- package/docs/PROMPT_STUDIO.md +3 -1
- package/docs/README.ko.md +7 -3
- package/docs/migration/runtime-test-inventory.md +17 -1
- package/lib/agentImageVideoGen.js +261 -0
- package/lib/agentRuntime.js +11 -260
- package/lib/agentSettings.js +1 -1
- package/lib/agyImageAdapter.js +259 -0
- package/lib/capabilities.js +2 -1
- package/lib/configKeys.js +1 -1
- package/lib/errorClassify.js +8 -7
- package/lib/eventBus.js +71 -0
- package/lib/geminiApiImageAdapter.js +179 -0
- package/lib/generationErrors.js +3 -1
- package/lib/grokImageAdapter.js +74 -128
- package/lib/grokImageCore.js +153 -0
- package/lib/grokMultimodeAdapter.js +7 -4
- package/lib/grokRuntime.js +3 -0
- package/lib/grokSizeMapper.js +13 -1
- package/lib/grokVideoAdapter.js +14 -7
- package/lib/grokVideoCanvas.js +13 -0
- package/lib/grokVideoPlannerPrompt.js +53 -6
- package/lib/historyList.js +19 -2
- package/lib/imageModels.js +15 -0
- package/lib/imageThumb.js +38 -0
- package/lib/inflight.js +54 -17
- package/lib/multimodeHelpers.js +10 -0
- package/lib/nodeHelpers.js +59 -0
- package/lib/oauthProxy/prompts.js +30 -36
- package/lib/promptBuilder/systemPrompt.js +2 -5
- package/lib/promptSafetyPolicy.js +1 -5
- package/lib/providerOptions.js +36 -1
- package/lib/responsesFallback.js +53 -44
- package/lib/routeHelpers.js +44 -0
- package/lib/runtimeContext.js +27 -0
- package/lib/ssePublish.js +12 -0
- package/lib/storageMigration.js +1 -1
- package/lib/storyboardPrefix.js +28 -0
- package/lib/thumbBackfill.js +70 -0
- package/lib/vertexAuth.js +44 -0
- package/lib/videoThumb.js +60 -0
- package/package.json +7 -2
- package/routes/agy.js +44 -0
- package/routes/auth.js +242 -0
- package/routes/edit.js +48 -8
- package/routes/events.js +78 -0
- package/routes/generate.js +135 -135
- package/routes/history.js +13 -0
- package/routes/index.js +8 -0
- package/routes/keys.js +254 -0
- package/routes/multimode.js +138 -62
- package/routes/nodes.js +107 -129
- package/routes/quota.js +58 -7
- package/routes/video.js +107 -20
- package/server.js +123 -0
- package/skills/ima2/SKILL.md +98 -21
- package/ui/dist/.vite/manifest.json +12 -12
- package/ui/dist/assets/AgentWorkspace-Dth6YijN.js +3 -0
- package/ui/dist/assets/{CardNewsWorkspace-BN-ga1lG.js → CardNewsWorkspace-Dav3K5CT.js} +2 -2
- package/ui/dist/assets/{NodeCanvas-BbMa4IhI.js → NodeCanvas-C4ifFzB1.js} +2 -2
- package/ui/dist/assets/{PromptBuilderPanel-DRwBJRDQ.js → PromptBuilderPanel-CEcyU9PL.js} +1 -1
- package/ui/dist/assets/{PromptImportDialog-Dp85kHCq.js → PromptImportDialog-CgQ94Gth.js} +2 -2
- package/ui/dist/assets/{PromptImportDiscoverySection-BE8Q8MLD.js → PromptImportDiscoverySection-CuzyzbNI.js} +1 -1
- package/ui/dist/assets/{PromptImportFolderSection-PtH5x0sc.js → PromptImportFolderSection-DHLGlO6l.js} +1 -1
- package/ui/dist/assets/{PromptLibraryPanel-FnM9tHI9.js → PromptLibraryPanel-BOe18we8.js} +2 -2
- package/ui/dist/assets/SettingsWorkspace-Cdgnm4Wa.js +1 -0
- package/ui/dist/assets/index-C5PSahkr.js +1 -0
- package/ui/dist/assets/index-Dn2AhL6d.css +1 -0
- package/ui/dist/assets/index-Tjqx6wUV.js +23 -0
- package/ui/dist/index.html +2 -2
- package/ui/dist/assets/AgentWorkspace-C21zqdTZ.js +0 -3
- package/ui/dist/assets/SettingsWorkspace-MARPGyBL.js +0 -1
- package/ui/dist/assets/index-BAFI6htx.js +0 -42
- package/ui/dist/assets/index-BSXxr_Bt.js +0 -1
- package/ui/dist/assets/index-DS-ADE7U.css +0 -1
package/routes/video.js
CHANGED
|
@@ -2,7 +2,11 @@ import { mkdir, readFile, unlink, writeFile } from "fs/promises";
|
|
|
2
2
|
import { atomicWriteJson } from "../lib/atomicWrite.js";
|
|
3
3
|
import { join } from "path";
|
|
4
4
|
import { randomBytes } from "crypto";
|
|
5
|
-
import {
|
|
5
|
+
import { execFile } from "child_process";
|
|
6
|
+
import { tmpdir } from "os";
|
|
7
|
+
import { promisify } from "util";
|
|
8
|
+
const execFileAsync = promisify(execFile);
|
|
9
|
+
import { startJob, finishJob, registerJobAbortController, isJobCanceled, isStartJobFailure, setJobPhase, INFLIGHT_RETRY_AFTER_SECONDS } from "../lib/inflight.js";
|
|
6
10
|
import { isGenerationCanceledError, makeGenerationCanceledError } from "../lib/generationCancel.js";
|
|
7
11
|
import { logEvent, logError } from "../lib/logger.js";
|
|
8
12
|
import { invalidateHistoryIndex } from "../lib/historyIndex.js";
|
|
@@ -13,10 +17,23 @@ import { extractGeneratedVideoFrameB64 } from "../lib/videoFrameExtract.js";
|
|
|
13
17
|
import { normalizeGrokVideoModel, normalizeVideoResolution, normalizeVideoAspectRatio, normalizeVideoDuration, deriveVideoMode, clampVideoDuration, MAX_REF2V_REFERENCES, } from "../lib/imageModels.js";
|
|
14
18
|
import { errInfo } from "../lib/errInfo.js";
|
|
15
19
|
import { requireRuntimeContext } from "../lib/runtimeContext.js";
|
|
20
|
+
import { generateVideoThumbnail } from "../lib/videoThumb.js";
|
|
21
|
+
import { publish } from "../lib/eventBus.js";
|
|
22
|
+
import { publishJobEvent } from "../lib/ssePublish.js";
|
|
16
23
|
/**
 * Writes a single Server-Sent Events frame to a response stream.
 *
 * The frame is emitted as two writes: the `event:` line first, then the
 * JSON-serialized `data:` line followed by the blank line that ends the frame.
 *
 * @param {{ write: Function }} res - Response-like object; only `write` is used.
 * @param {string} event - Value placed in the `event:` field.
 * @param {*} data - Payload passed through `JSON.stringify` for the `data:` field.
 */
function sendSse(res, event, data) {
    const eventLine = `event: ${event}\n`;
    res.write(eventLine);
    // Serialize only after the event line has been written, so a payload that
    // cannot be serialized fails at the same point it always did.
    const serialized = JSON.stringify(data);
    res.write(`data: ${serialized}\n\n`);
}
|
|
27
|
+
/**
 * Delivers a video job event over two channels.
 *
 * 1. If the HTTP response has not been ended, the event is written to it as an
 *    SSE frame via `sendSse`.
 * 2. The event is then always handed to the event bus: `"done"` events go
 *    through `publishJobEvent`, every other event goes through `publish`.
 *
 * @param {object} res - HTTP response; `writableEnded` decides whether to stream.
 * @param {string} requestId - Job identifier passed to the publisher.
 * @param {string} event - Event name.
 * @param {*} data - Event payload, forwarded unchanged to both channels.
 */
function dualEmitVideo(res, requestId, event, data) {
    const streamStillOpen = !res.writableEnded;
    if (streamStillOpen) {
        sendSse(res, event, data);
    }
    const emit = event === "done" ? publishJobEvent : publish;
    emit(requestId, event, data);
}
|
|
20
37
|
/**
 * Normalizes a value to an array.
 *
 * @param {*} v - Any value.
 * @returns {Array} `v` itself (same reference) when it is an array, otherwise
 *   a new empty array. Non-array values are discarded, not wrapped.
 */
function toArray(v) {
    if (Array.isArray(v)) {
        return v;
    }
    return [];
}
|
|
@@ -50,6 +67,32 @@ async function resolveSourceImage(ctx, sourceImage, sourceFilename) {
|
|
|
50
67
|
}
|
|
51
68
|
return { b64: null, filename: null };
|
|
52
69
|
}
|
|
70
|
+
// Length of the lead-in removed from the start of storyboard clips, in seconds.
// Kept as a string because it is passed verbatim to ffmpeg's `-ss` option.
const STORYBOARD_TRIM_SECONDS = "1.0";
/**
 * Removes the first STORYBOARD_TRIM_SECONDS of a storyboard clip.
 *
 * The video bytes are written to a temp file, re-encoded by the `ffmpeg`
 * executable (libx264 video, AAC audio) with the lead-in skipped, and the
 * result is read back. Both temp files are removed whether or not this succeeds.
 *
 * @param {Buffer} buffer - Encoded source video; written to disk unchanged.
 * @param {string} requestId - Used for log correlation and as a readable part
 *   of the temp-file names. Non-string values are stringified.
 * @returns {Promise<Buffer>} The trimmed, re-encoded video.
 * @throws Rethrows any failure from writing the input, running ffmpeg
 *   (including the 60 s timeout) or reading the output, after logging it as
 *   `storyboard:trim-exec-error`. Also throws when ffmpeg exits successfully
 *   but leaves an empty output file.
 */
async function trimStoryboardLeadIn(buffer, requestId) {
    // The sanitized id keeps temp files traceable to a request. The random
    // suffix keeps two requests whose ids sanitize to the same string from
    // overwriting each other, and makes the path unpredictable inside the
    // shared temp directory. `randomBytes` is already imported by this module.
    const safeId = String(requestId).replace(/[^a-zA-Z0-9_-]/g, "_");
    const uniqueTag = `${safeId}_${randomBytes(8).toString("hex")}`;
    const tmpIn = join(tmpdir(), `ima2_sb_trim_in_${uniqueTag}.mp4`);
    const tmpOut = join(tmpdir(), `ima2_sb_trim_out_${uniqueTag}.mp4`);
    try {
        // "wx" refuses to write through a pre-existing file or symlink.
        await writeFile(tmpIn, buffer, { flag: "wx", mode: 0o600 });
        logEvent("video", "storyboard:trim-start", { requestId, inputBytes: buffer.length, trimSeconds: STORYBOARD_TRIM_SECONDS });
        await execFileAsync("ffmpeg", [
            // Silence the banner and progress chatter so stderr holds only
            // real errors and stays far below execFile's output buffer limit.
            "-hide_banner", "-loglevel", "error",
            "-y", "-ss", STORYBOARD_TRIM_SECONDS, "-i", tmpIn,
            "-c:v", "libx264", "-preset", "fast", "-crf", "18",
            "-c:a", "aac", "-b:a", "128k",
            "-avoid_negative_ts", "make_zero", tmpOut,
        ], { timeout: 60_000 });
        const trimmed = await readFile(tmpOut);
        if (trimmed.length === 0) {
            // Fail so the caller does not treat an empty file as a valid trim.
            throw new Error("ffmpeg produced an empty trimmed video");
        }
        logEvent("video", "storyboard:trimmed", { requestId, originalBytes: buffer.length, trimmedBytes: trimmed.length, trimSeconds: STORYBOARD_TRIM_SECONDS });
        return trimmed;
    }
    catch (trimError) {
        // ffmpeg reports the actual failure at the end of stderr, so log the tail.
        logEvent("video", "storyboard:trim-exec-error", { requestId, error: trimError.message, stderr: trimError.stderr?.slice?.(-500) });
        throw trimError;
    }
    finally {
        // Best-effort cleanup; a file that was never created is not an error.
        await Promise.allSettled([unlink(tmpIn), unlink(tmpOut)]);
    }
}
|
|
53
96
|
export function registerVideoRoutes(app, ctxRaw) {
|
|
54
97
|
const ctx = requireRuntimeContext(ctxRaw);
|
|
55
98
|
app.post("/api/video/generate", async (req, res) => {
|
|
@@ -58,30 +101,39 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
58
101
|
: typeof req.body?.clientRequestId === "string"
|
|
59
102
|
? req.body.clientRequestId
|
|
60
103
|
: req.id;
|
|
104
|
+
const asyncMode = req.body?.async === true;
|
|
61
105
|
let finishStatus = "completed";
|
|
62
106
|
let finishHttpStatus = 200;
|
|
63
107
|
let finishErrorCode;
|
|
64
108
|
let finishMeta = {};
|
|
65
109
|
let finishCanceled = false;
|
|
66
110
|
const cancelController = new AbortController();
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
111
|
+
if (!asyncMode) {
|
|
112
|
+
res.setHeader("Content-Type", "text/event-stream; charset=utf-8");
|
|
113
|
+
res.setHeader("Cache-Control", "no-cache, no-transform");
|
|
114
|
+
res.setHeader("Connection", "keep-alive");
|
|
115
|
+
res.flushHeaders?.();
|
|
116
|
+
}
|
|
71
117
|
const fail = (status, code, error, extra = {}) => {
|
|
72
118
|
const httpStatus = status ?? 500;
|
|
73
119
|
finishStatus = "error";
|
|
74
120
|
finishHttpStatus = httpStatus;
|
|
75
121
|
finishErrorCode = code;
|
|
76
|
-
|
|
122
|
+
const payload = { error, code, status: httpStatus, requestId, ...extra };
|
|
123
|
+
publish(requestId, "error", payload);
|
|
124
|
+
if (asyncMode && !res.headersSent) {
|
|
125
|
+
return res.status(httpStatus).json(payload);
|
|
126
|
+
}
|
|
127
|
+
if (!res.writableEnded)
|
|
128
|
+
sendSse(res, "error", payload);
|
|
77
129
|
};
|
|
78
130
|
try {
|
|
79
131
|
const { prompt, provider = "grok", model: rawModel } = req.body || {};
|
|
80
132
|
const sessionId = typeof req.body?.sessionId === "string" ? req.body.sessionId : null;
|
|
81
133
|
const clientNodeId = typeof req.body?.clientNodeId === "string" ? req.body.clientNodeId : null;
|
|
82
134
|
const topic = typeof req.body?.topic === "string" ? req.body.topic.trim() : "";
|
|
83
|
-
if (provider !== "grok")
|
|
84
|
-
return fail(400, "VIDEO_PROVIDER_UNSUPPORTED", "video generation requires provider 'grok'");
|
|
135
|
+
if (provider !== "grok" && provider !== "grok-api")
|
|
136
|
+
return fail(400, provider === "agy" ? "AGY_VIDEO_UNSUPPORTED" : "VIDEO_PROVIDER_UNSUPPORTED", provider === "agy" ? "Gemini (agy) does not support video generation" : "video generation requires provider 'grok' or 'grok-api'");
|
|
85
137
|
const storyboardActive = req.body?.storyboard === true;
|
|
86
138
|
const storyboardPrefix = storyboardActive
|
|
87
139
|
? [
|
|
@@ -98,6 +150,14 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
98
150
|
"- Lock lighting direction, color palette, environment, and style.",
|
|
99
151
|
"- Describe ONLY what changes: action, camera movement, dialogue, sound.",
|
|
100
152
|
"",
|
|
153
|
+
"STORYBOARD IMAGE SOURCE RULE (HIGHEST PRIORITY — OVERRIDES ALL OTHER RULES):",
|
|
154
|
+
"- The source image is a 3x3 storyboard grid. Panel 1 (top-left) is a BLACK LEAD-IN FRAME — it contains no scene content.",
|
|
155
|
+
"- The video starts from black (Panel 1), then transitions into the action scene from Panel 2.",
|
|
156
|
+
"- Panels 2-9 contain the action sequence. Describe and animate only Panels 2-9.",
|
|
157
|
+
"- Start your rewritten prompt with: 'Fading in from black into the full-screen scene of [Panel 2 description],' — the server auto-trims the black lead-in.",
|
|
158
|
+
"- The storyboard grid must NEVER appear as a visible grid in any frame. The output is a single continuous cinematic clip.",
|
|
159
|
+
"- Do NOT reference Panel 1 in the action description — it is only a technical black frame.",
|
|
160
|
+
"",
|
|
101
161
|
"PROMPT STRUCTURE (layered caption format):",
|
|
102
162
|
"- Shot foundation: type + camera motion (dolly, pan, tracking, crane, static).",
|
|
103
163
|
"- Subject: action with intensity modifiers (crashes violently, drifts gently).",
|
|
@@ -164,25 +224,36 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
164
224
|
}
|
|
165
225
|
if (resolved.length > MAX_REF2V_REFERENCES)
|
|
166
226
|
return fail(400, "GROK_VIDEO_REF_TOO_MANY", `at most ${MAX_REF2V_REFERENCES} reference images`);
|
|
167
|
-
const
|
|
227
|
+
const incomingProviderUrl = typeof req.body?.providerUrl === "string" && req.body.providerUrl.startsWith("http") ? req.body.providerUrl : null;
|
|
228
|
+
const mode = incomingProviderUrl ? "image-to-video" : deriveVideoMode(resolved.length);
|
|
168
229
|
const duration = clampVideoDuration(durationCheck.duration, mode);
|
|
169
230
|
const referenceImages = mode === "reference-to-video" ? resolved.map((r) => r.b64) : undefined;
|
|
170
|
-
const sourceB64 = mode === "image-to-video" ? resolved[0]?.b64 : undefined;
|
|
231
|
+
const sourceB64 = incomingProviderUrl || (mode === "image-to-video" ? resolved[0]?.b64 : undefined);
|
|
171
232
|
const sourceFilename = resolved[0]?.filename ?? null;
|
|
172
|
-
startJob({
|
|
233
|
+
const started = startJob({
|
|
173
234
|
requestId,
|
|
174
235
|
kind: "video",
|
|
175
236
|
prompt: activePrompt,
|
|
176
237
|
meta: { kind: "video", sessionId, clientNodeId, model: modelCheck.model, mode, duration, resolution: resolutionCheck.resolution },
|
|
177
238
|
});
|
|
239
|
+
if (started && isStartJobFailure(started)) {
|
|
240
|
+
if (started.code === "TOO_MANY_JOBS") {
|
|
241
|
+
res.setHeader("Retry-After", String(INFLIGHT_RETRY_AFTER_SECONDS));
|
|
242
|
+
}
|
|
243
|
+
return fail(started.code === "TOO_MANY_JOBS" ? 429 : 409, started.code, started.code === "TOO_MANY_JOBS"
|
|
244
|
+
? "Too many concurrent generation jobs"
|
|
245
|
+
: "Request ID already in use");
|
|
246
|
+
}
|
|
178
247
|
registerJobAbortController(requestId, cancelController);
|
|
248
|
+
if (asyncMode)
|
|
249
|
+
res.status(202).json({ requestId });
|
|
179
250
|
await mkdir(ctx.config.storage.generatedDir, { recursive: true });
|
|
180
251
|
logEvent("video", "request", { requestId, mode, duration, resolution: resolutionCheck.resolution, aspectRatio: aspectCheck.aspectRatio });
|
|
181
252
|
const startTime = Date.now();
|
|
182
253
|
const onEvent = (ev) => {
|
|
183
254
|
if (ev.phase === "submitted") {
|
|
184
255
|
setJobPhase(requestId, "streaming");
|
|
185
|
-
|
|
256
|
+
dualEmitVideo(res, requestId, "submitted", {
|
|
186
257
|
requestId,
|
|
187
258
|
xaiVideoRequestId: ev.xaiVideoRequestId,
|
|
188
259
|
requestedModel: ev.requestedModel,
|
|
@@ -191,11 +262,11 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
191
262
|
});
|
|
192
263
|
}
|
|
193
264
|
else if (ev.phase === "progress") {
|
|
194
|
-
|
|
265
|
+
dualEmitVideo(res, requestId, "progress", { requestId, progress: typeof ev.progress === "number" ? ev.progress / 100 : null, stalled: Boolean(ev.stalled) });
|
|
195
266
|
}
|
|
196
267
|
else {
|
|
197
268
|
setJobPhase(requestId, "planning");
|
|
198
|
-
|
|
269
|
+
dualEmitVideo(res, requestId, "planning", { requestId });
|
|
199
270
|
}
|
|
200
271
|
};
|
|
201
272
|
// Build prompt with series chain context
|
|
@@ -205,6 +276,7 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
205
276
|
: activePrompt;
|
|
206
277
|
const effectivePrompt = storyboardPrefix + basePrompt;
|
|
207
278
|
const plannerModel = typeof req.body?.plannerModel === "string" ? req.body.plannerModel.trim() : undefined;
|
|
279
|
+
const directApiKey = provider === "grok-api" ? ctx.xaiApiKey : undefined;
|
|
208
280
|
const result = await generateVideoViaGrok(effectivePrompt, ctx, {
|
|
209
281
|
model: modelCheck.model,
|
|
210
282
|
mode,
|
|
@@ -217,7 +289,9 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
217
289
|
requestId,
|
|
218
290
|
continuityLineage: parentLineage,
|
|
219
291
|
plannerModel: plannerModel || undefined,
|
|
292
|
+
directApiKey,
|
|
220
293
|
onEvent,
|
|
294
|
+
storyboardActive,
|
|
221
295
|
});
|
|
222
296
|
const rand = randomBytes(ctx.config.ids.generatedHexBytes).toString("hex");
|
|
223
297
|
const filename = `${Date.now()}_${rand}.mp4`;
|
|
@@ -231,13 +305,14 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
231
305
|
const meta = {
|
|
232
306
|
kind: "video",
|
|
233
307
|
mediaType: "video",
|
|
308
|
+
providerUrl: result.url,
|
|
234
309
|
requestId,
|
|
235
310
|
sessionId,
|
|
236
311
|
clientNodeId,
|
|
237
312
|
prompt: activePrompt,
|
|
238
313
|
userPrompt: activePrompt,
|
|
239
314
|
revisedPrompt: result.revisedPrompt,
|
|
240
|
-
provider
|
|
315
|
+
provider,
|
|
241
316
|
model: result.effectiveModel,
|
|
242
317
|
requestedModel: result.requestedModel,
|
|
243
318
|
effectiveModel: result.effectiveModel,
|
|
@@ -260,14 +335,25 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
260
335
|
...(topic ? { videoSeries: { topic, chainIndex: chain.length } } : {}),
|
|
261
336
|
...(storyboardActive ? { storyboard: true } : {}),
|
|
262
337
|
};
|
|
263
|
-
|
|
338
|
+
let finalBuffer = result.videoBuffer;
|
|
339
|
+
if (storyboardActive) {
|
|
340
|
+
try {
|
|
341
|
+
finalBuffer = await trimStoryboardLeadIn(result.videoBuffer, requestId);
|
|
342
|
+
}
|
|
343
|
+
catch (trimErr) {
|
|
344
|
+
logEvent("video", "storyboard:trim-failed", { requestId, error: trimErr.message });
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
await saveGeneratedVideoArtifact(ctx, filename, finalBuffer, meta);
|
|
348
|
+
generateVideoThumbnail(join(ctx.config.storage.generatedDir, filename)).catch(() => { });
|
|
264
349
|
invalidateHistoryIndex();
|
|
265
350
|
finishMeta = { filename, xaiVideoRequestId: result.xaiVideoRequestId };
|
|
266
351
|
logEvent("video", "saved", { requestId, filename, bytes: result.videoBuffer.length, elapsedMs: Date.now() - startTime });
|
|
267
|
-
|
|
352
|
+
dualEmitVideo(res, requestId, "done", {
|
|
268
353
|
requestId,
|
|
269
354
|
filename,
|
|
270
355
|
url: `/generated/${encodeURIComponent(filename)}`,
|
|
356
|
+
providerUrl: result.url,
|
|
271
357
|
mediaType: "video",
|
|
272
358
|
revisedPrompt: result.revisedPrompt,
|
|
273
359
|
elapsed,
|
|
@@ -287,19 +373,20 @@ export function registerVideoRoutes(app, ctxRaw) {
|
|
|
287
373
|
finishCanceled = true;
|
|
288
374
|
finishHttpStatus = canceled.status;
|
|
289
375
|
finishErrorCode = canceled.code;
|
|
290
|
-
|
|
376
|
+
dualEmitVideo(res, requestId, "error", { error: canceled.message, code: canceled.code, status: canceled.status, requestId });
|
|
291
377
|
}
|
|
292
378
|
else {
|
|
293
379
|
finishStatus = "error";
|
|
294
380
|
finishHttpStatus = err.status || 500;
|
|
295
381
|
finishErrorCode = err.code || "GROK_VIDEO_FAILED";
|
|
296
382
|
logError("video", "error", err.raw, { requestId, code: finishErrorCode });
|
|
297
|
-
|
|
383
|
+
dualEmitVideo(res, requestId, "error", { error: err.message, code: finishErrorCode, status: finishHttpStatus, requestId });
|
|
298
384
|
}
|
|
299
385
|
}
|
|
300
386
|
finally {
|
|
301
387
|
finishJob(requestId, { canceled: finishCanceled, status: finishStatus, httpStatus: finishHttpStatus, errorCode: finishErrorCode, meta: finishMeta });
|
|
302
|
-
res.
|
|
388
|
+
if (!res.writableEnded)
|
|
389
|
+
res.end();
|
|
303
390
|
}
|
|
304
391
|
});
|
|
305
392
|
}
|
package/server.js
CHANGED
|
@@ -42,6 +42,92 @@ async function loadApiKey() {
|
|
|
42
42
|
}
|
|
43
43
|
return { apiKey: null, apiKeySource: "none" };
|
|
44
44
|
}
|
|
45
|
+
// Remembers which config problems have already been reported, so a single
// broken file does not produce one warning per loader.
const reportedConfigProblems = new Set();
/**
 * Reports an unusable config file once per (path, reason) pair.
 *
 * Only the path and the error's code/name are printed. The error message is
 * deliberately left out: JSON parse errors can quote the offending text, and
 * these files hold API keys and service-account credentials.
 *
 * @param {string} cfgPath - Config file that could not be used.
 * @param {unknown} err - The caught error.
 */
function reportConfigProblem(cfgPath, err) {
    const reason = err?.code ?? err?.name ?? "unknown error";
    const key = `${cfgPath}\u0000${reason}`;
    if (reportedConfigProblems.has(key))
        return;
    reportedConfigProblems.add(key);
    console.warn(`[config] ignoring ${cfgPath}: ${reason}`);
}
/**
 * Searches the candidate config files, in priority order, for a value.
 *
 * Candidates are `config.storage.configFile` and then
 * `<rootDir>/.ima2/config.json`. Missing files are skipped silently. Files
 * that cannot be read or parsed, or for which `pick` throws, are skipped with
 * a warning.
 *
 * @param {(cfg: any) => any} pick - Receives the parsed config and returns the
 *   wanted value, or `undefined` to keep searching.
 * @returns {Promise<any>} The first non-`undefined` result of `pick`, or
 *   `undefined` when no candidate provides one.
 */
async function findInConfigFiles(pick) {
    const candidates = [
        config.storage.configFile,
        join(rootDir, ".ima2", "config.json"),
    ];
    for (const cfgPath of candidates) {
        if (!existsSync(cfgPath))
            continue;
        try {
            const cfg = JSON.parse(await readFile(cfgPath, "utf-8"));
            const found = pick(cfg);
            if (found !== undefined)
                return found;
        }
        catch (err) {
            // The file may vanish between the existence check and the read.
            if (err?.code !== "ENOENT")
                reportConfigProblem(cfgPath, err);
        }
    }
    return undefined;
}
/**
 * Resolves an API key from an environment variable, falling back to a field
 * of the config files.
 *
 * @param {string} envVar - Environment variable checked first.
 * @param {string} configField - Config property checked when the variable is unset or empty.
 * @returns {Promise<{ apiKey: string | null, apiKeySource: "env" | "config" | "none" }>}
 */
async function loadProviderApiKey(envVar, configField) {
    const fromEnv = process.env[envVar];
    if (fromEnv) {
        return { apiKey: fromEnv, apiKeySource: "env" };
    }
    const fromConfig = await findInConfigFiles((cfg) => cfg[configField] || undefined);
    if (fromConfig) {
        return { apiKey: fromConfig, apiKeySource: "config" };
    }
    return { apiKey: null, apiKeySource: "none" };
}
/**
 * Loads the xAI API key from `XAI_API_KEY`, then from `xaiApiKey` in the config files.
 *
 * @returns {Promise<{ apiKey: string | null, apiKeySource: "env" | "config" | "none" }>}
 */
async function loadXaiApiKey() {
    return loadProviderApiKey("XAI_API_KEY", "xaiApiKey");
}
/**
 * Loads the Gemini API key from `GEMINI_API_KEY`, then from `geminiApiKey` in the config files.
 *
 * @returns {Promise<{ apiKey: string | null, apiKeySource: "env" | "config" | "none" }>}
 */
async function loadGeminiApiKey() {
    return loadProviderApiKey("GEMINI_API_KEY", "geminiApiKey");
}
/**
 * Loads the Vertex service-account JSON and the project id it contains.
 *
 * `VERTEX_SERVICE_ACCOUNT_JSON` wins when set. If it is set but does not parse,
 * the result is "none" and the config files are NOT consulted (existing
 * behavior, kept as is). Otherwise the first config file whose
 * `vertexServiceAccountJson` string parses as JSON is used.
 *
 * @returns {Promise<{ json: string | null, projectId: string | null, source: "env" | "config" | "none" }>}
 *   `json` is the raw, unparsed service-account text.
 */
async function loadVertexKey() {
    const envJson = process.env.VERTEX_SERVICE_ACCOUNT_JSON;
    if (envJson) {
        try {
            const parsed = JSON.parse(envJson);
            return { json: envJson, projectId: parsed.project_id || null, source: "env" };
        }
        catch {
            // Say why Vertex is unavailable, without echoing any credential text.
            console.warn("[config] VERTEX_SERVICE_ACCOUNT_JSON is set but is not a valid service-account JSON object; ignoring it");
            return { json: null, projectId: null, source: "none" };
        }
    }
    const fromConfig = await findInConfigFiles((cfg) => {
        if (!cfg.vertexServiceAccountJson)
            return undefined;
        const parsed = JSON.parse(cfg.vertexServiceAccountJson);
        return { json: cfg.vertexServiceAccountJson, projectId: parsed.project_id || null, source: "config" };
    });
    return fromConfig ?? { json: null, projectId: null, source: "none" };
}
/**
 * Reads the preferred Gemini auth mode from the config files.
 *
 * @returns {Promise<"vertex" | "apikey" | undefined>} The first `geminiAuthMode`
 *   that is exactly "vertex" or "apikey"; `undefined` when none is configured.
 */
async function loadGeminiAuthMode() {
    return findInConfigFiles((cfg) => {
        const mode = cfg.geminiAuthMode;
        return mode === "vertex" || mode === "apikey" ? mode : undefined;
    });
}
|
|
45
131
|
async function createOpenAI(apiKey) {
|
|
46
132
|
if (!apiKey)
|
|
47
133
|
return null;
|
|
@@ -143,6 +229,10 @@ export async function createRuntimeContext(overrides = {}) {
|
|
|
143
229
|
apiKeySource: overrides.apiKeySource ?? (overrides.apiKey ? "env" : "none"),
|
|
144
230
|
}
|
|
145
231
|
: await loadApiKey();
|
|
232
|
+
const loadedXaiKey = await loadXaiApiKey();
|
|
233
|
+
const loadedGeminiKey = await loadGeminiApiKey();
|
|
234
|
+
const loadedVertexKey = await loadVertexKey();
|
|
235
|
+
const geminiAuthMode = await loadGeminiAuthMode();
|
|
146
236
|
const apiKey = loadedKey.apiKey;
|
|
147
237
|
const openai = overrides.openai ?? await createOpenAI(apiKey);
|
|
148
238
|
const oauthPort = config.oauth.proxyPort;
|
|
@@ -170,6 +260,16 @@ export async function createRuntimeContext(overrides = {}) {
|
|
|
170
260
|
openai,
|
|
171
261
|
startedAt: overrides.startedAt ?? Date.now(),
|
|
172
262
|
packageVersion: overrides.packageVersion ?? readPackageVersion(),
|
|
263
|
+
xaiApiKey: loadedXaiKey.apiKey ?? undefined,
|
|
264
|
+
xaiApiKeySource: loadedXaiKey.apiKeySource,
|
|
265
|
+
hasXaiApiKey: !!loadedXaiKey.apiKey,
|
|
266
|
+
geminiApiKey: loadedGeminiKey.apiKey ?? undefined,
|
|
267
|
+
geminiApiKeySource: loadedGeminiKey.apiKeySource,
|
|
268
|
+
hasGeminiApiKey: !!loadedGeminiKey.apiKey,
|
|
269
|
+
vertexServiceAccountJson: loadedVertexKey.json ?? undefined,
|
|
270
|
+
vertexProjectId: loadedVertexKey.projectId ?? undefined,
|
|
271
|
+
hasVertexKey: !!loadedVertexKey.json,
|
|
272
|
+
geminiAuthMode,
|
|
173
273
|
oauthReadyPromise: oauthReadyPromise,
|
|
174
274
|
markGrokProxyPort: ({ url, port } = {}) => {
|
|
175
275
|
if (port)
|
|
@@ -194,6 +294,13 @@ export async function createRuntimeContext(overrides = {}) {
|
|
|
194
294
|
};
|
|
195
295
|
if (!config.oauth.autoStart)
|
|
196
296
|
ctx.markOAuthReady({ url: ctx.oauthUrl, port: ctx.oauthPort });
|
|
297
|
+
if (loadedVertexKey.json) {
|
|
298
|
+
try {
|
|
299
|
+
const { initVertexAuth } = await import("./lib/vertexAuth.js");
|
|
300
|
+
initVertexAuth(loadedVertexKey.json);
|
|
301
|
+
}
|
|
302
|
+
catch { /* vertex init failure is non-fatal */ }
|
|
303
|
+
}
|
|
197
304
|
return ctx;
|
|
198
305
|
}
|
|
199
306
|
export async function startServer(overrides = {}) {
|
|
@@ -283,6 +390,22 @@ export async function startServer(overrides = {}) {
|
|
|
283
390
|
const err = errInfo(e);
|
|
284
391
|
console.error("[db] bootstrap failed:", err.message);
|
|
285
392
|
}
|
|
393
|
+
// Background thumbnail backfill for updated users (recursive — covers video
|
|
394
|
+
// series subdirectories like continuous_*/clip_NN.mp4, not just top level).
|
|
395
|
+
(async () => {
|
|
396
|
+
try {
|
|
397
|
+
const { backfillThumbnails } = await import("./lib/thumbBackfill.js");
|
|
398
|
+
const r = await backfillThumbnails(ctx.config.storage.generatedDir);
|
|
399
|
+
if (r.created > 0) {
|
|
400
|
+
console.log(`[thumbs] backfill: ${r.created} created, ${r.skipped} skipped, ${r.failed} failed (${r.total} media files)`);
|
|
401
|
+
const { invalidateHistoryIndex } = await import("./lib/historyIndex.js");
|
|
402
|
+
invalidateHistoryIndex();
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
catch (e) {
|
|
406
|
+
console.warn("[thumbs] backfill failed:", e instanceof Error ? e.message : e);
|
|
407
|
+
}
|
|
408
|
+
})();
|
|
286
409
|
server.on("error", (err) => {
|
|
287
410
|
console.error("[server] Failed to start:", err?.message || err);
|
|
288
411
|
process.exit(1);
|
package/skills/ima2/SKILL.md
CHANGED
|
@@ -143,13 +143,22 @@ Do not use positional edit prompts. `ima2 edit` requires `--prompt`.
|
|
|
143
143
|
|
|
144
144
|
## Parallel Generation
|
|
145
145
|
|
|
146
|
-
There is no `--parallel` flag. For
|
|
146
|
+
There is no `--parallel` flag. For multiple candidates from the same prompt,
|
|
147
|
+
prefer one server-side batch request:
|
|
147
148
|
|
|
148
149
|
```bash
|
|
149
|
-
ima2 gen "
|
|
150
|
-
ima2
|
|
151
|
-
|
|
152
|
-
|
|
150
|
+
ima2 gen "four poster candidates" -n 4 -d ./out --quality high
|
|
151
|
+
ima2 multimode "four different poster directions" --max-images 4
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
For truly different prompts, independent CLI jobs can run concurrently against
|
|
155
|
+
the same server. Capture request IDs with JSON output, then monitor or cancel:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
ima2 gen "variation 1" --quality high --json
|
|
159
|
+
ima2 gen "variation 2" --quality high --json
|
|
160
|
+
ima2 ps --json
|
|
161
|
+
ima2 cancel <requestId>
|
|
153
162
|
```
|
|
154
163
|
|
|
155
164
|
Treat `capabilities.limits.maxParallel` as advisory client-side queue guidance only.
|
|
@@ -402,6 +411,12 @@ Generate a high-quality still image first, then animate it. This produces better
|
|
|
402
411
|
|
|
403
412
|
**Critical rule for i2v**: Compose ALL characters and the environment together in ONE image. Do NOT use individual portrait refs for i2v — the video model needs a single composed scene to animate from.
|
|
404
413
|
|
|
414
|
+
**Keyframe image provider rule (MANDATORY)**:
|
|
415
|
+
- **Primary**: GPT Image 2 (OpenAI, `provider: oauth`) with `quality: high`, maximum resolution matching the target video aspect ratio. For 16:9 video use `1792x1024`. For 1:1 use `1024x1024`. For 9:16 use `1024x1792`.
|
|
416
|
+
- **Fallback**: Grok (`provider: grok`, model `grok-imagine-image-quality`). Only the aspect ratio must match — resolution does not matter because i2v accepts a source image of any resolution and rescales it internally.
|
|
417
|
+
- GPT Image 2 produces superior keyframes: better lighting coherence, character consistency, and fine detail that survives i2v animation. Always try GPT first.
|
|
418
|
+
- The i2v model internally rescales the source image to its native resolution regardless of input size, so there is no benefit to upscaling a Grok fallback image.
|
|
419
|
+
|
|
405
420
|
**ref2v vs i2v decision**:
|
|
406
421
|
|
|
407
422
|
| Scenario | Use | Why |
|
|
@@ -412,8 +427,12 @@ Generate a high-quality still image first, then animate it. This produces better
|
|
|
412
427
|
|
|
413
428
|
```bash
|
|
414
429
|
# Multi-character scene: compose BOTH characters in one image first
|
|
430
|
+
# Primary: GPT Image 2 at high quality, max resolution, aspect ratio matching 16:9 video
|
|
415
431
|
ima2 gen "cinematic wide shot of Bruce Lee in yellow tracksuit facing Elon Musk in dark gi, underground fight arena, dramatic lighting, 16:9" --quality high --size 1792x1024 -o scene.png
|
|
416
432
|
|
|
433
|
+
# Fallback if GPT fails: Grok quality model, match aspect ratio only
|
|
434
|
+
# ima2 gen "same prompt" --provider grok --model grok-imagine-image-quality --size 1824x1024 -o scene.png
|
|
435
|
+
|
|
417
436
|
# Then animate from the composed scene
|
|
418
437
|
ima2 video "Bruce throws a rapid jeet kune do combination" --ref scene.png --duration 10 --resolution 720p --aspect-ratio 16:9
|
|
419
438
|
```
|
|
@@ -438,30 +457,88 @@ ima2 video "close-up of rain drops on a neon sign reflection" \
|
|
|
438
457
|
|
|
439
458
|
The planner receives previous prompts from the same topic as continuity context. This is best-effort prompt guidance, not a guarantee that subjects, palette, or style will remain identical. For branch-local continuation, use `ima2 video continue` instead.
|
|
440
459
|
|
|
441
|
-
#### Storyboard-to-Video Chaining (
|
|
460
|
+
#### Storyboard-to-Video Chaining (9-panel storyboard → i2v loop)
|
|
461
|
+
|
|
462
|
+
The highest-quality video production workflow. Since Grok i2v accepts only **one image input**, pack the entire action sequence into a single 3×3 (9-panel) storyboard grid image. The i2v model reads the panels as a visual script and animates the progression.
|
|
463
|
+
|
|
464
|
+
**Full workflow**:
|
|
465
|
+
|
|
466
|
+
```
|
|
467
|
+
keyframe image (GPT high)
|
|
468
|
+
→ GPT i2i with reference → 9-panel storyboard grid
|
|
469
|
+
→ Grok i2v (reads panels, animates sequence)
|
|
470
|
+
→ extract last frame
|
|
471
|
+
→ GPT i2i with last frame → next 9-panel storyboard
|
|
472
|
+
→ Grok i2v
|
|
473
|
+
→ repeat
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
**Step 1 — Opening keyframe** (GPT Image 2, `quality: high`, max resolution matching target aspect ratio):
|
|
477
|
+
|
|
478
|
+
```bash
|
|
479
|
+
ima2 gen "cinematic wide shot of two fighters in a dojo, dramatic lighting" \
|
|
480
|
+
--quality high --size 1792x1024 --storyboard
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
Fallback: Grok `grok-imagine-image-quality`, match aspect ratio only — resolution does not matter because i2v internally rescales.
|
|
442
484
|
|
|
443
|
-
|
|
485
|
+
**Step 2 — 9-panel storyboard grid** (GPT Image 2 with keyframe as reference):
|
|
444
486
|
|
|
445
487
|
```bash
|
|
446
|
-
#
|
|
447
|
-
ima2 gen "
|
|
488
|
+
# Use the keyframe as reference, prompt describes 9 sequential panels
|
|
489
|
+
ima2 gen "Using this scene as reference, create a 3x3 storyboard grid (9 panels, thin black borders) showing a 15-second action sequence. Panel 1 (0s): ... Panel 2 (2s): ... Panel 9 (15s): ... Maintain identical character designs across all panels." \
|
|
490
|
+
--ref keyframe.png --quality high --size 1024x1024
|
|
491
|
+
```
|
|
448
492
|
|
|
449
|
-
|
|
450
|
-
|
|
493
|
+
**9-panel storyboard rules**:
|
|
494
|
+
- Grid layout: 3×3, thin black borders between panels
|
|
495
|
+
- Read order: left-to-right, top-to-bottom (panels 1-9)
|
|
496
|
+
- **Panel 1 (top-left) MUST be solid black** — this is a lead-in frame, not content. The i2v model starts from Panel 1's pixels; a black frame ensures the video begins with a clean fade-in instead of showing the grid. The 1-second black lead-in is auto-trimmed by the server.
|
|
497
|
+
- Panels 2-9 carry the action sequence (8 key moments with timestamps)
|
|
498
|
+
- Character designs MUST be identical across all panels
|
|
499
|
+
- Vary camera angle per panel for dynamic energy
|
|
500
|
+
- Each panel should look like a film still, not a sketch
|
|
501
|
+
- Do NOT add timestamp labels or text to panels — they burn into the video
|
|
502
|
+
- Square format (1024×1024) works best — i2v rescales internally
|
|
451
503
|
|
|
452
|
-
|
|
453
|
-
CLIP1=$(ima2 ls -n 1 --json | jq -r '.items[0].filename')
|
|
454
|
-
ima2 video continue "Elon counterattacks with haymaker" --video "$CLIP1" --duration 10
|
|
504
|
+
**Step 3 — Animate storyboard via i2v**:
|
|
455
505
|
|
|
456
|
-
|
|
506
|
+
```bash
|
|
507
|
+
ima2 video "This is a 9-panel storyboard. Animate the full sequence as one continuous 15-second clip following panels left-to-right, top-to-bottom. Panel 1: ... Panel 9: ... Sound: [describe music, SFX, dialogue]. Camera: [describe movement per beat]." \
|
|
508
|
+
--ref storyboard.png --duration 15 --resolution 720p --model grok-imagine-video-1.5-preview
|
|
457
509
|
```
|
|
458
510
|
|
|
459
|
-
**
|
|
460
|
-
-
|
|
461
|
-
-
|
|
462
|
-
-
|
|
463
|
-
-
|
|
464
|
-
-
|
|
511
|
+
**i2v prompt rules for storyboard input**:
|
|
512
|
+
- Explicitly state "This is a 9-panel storyboard" at the start
|
|
513
|
+
- Reference each panel by number with its action description
|
|
514
|
+
- Always include Sound/Music direction — never leave audio undefined
|
|
515
|
+
- Include Camera direction per beat (wide, close-up, tracking, handheld, slow-mo)
|
|
516
|
+
- Describe the end frame explicitly for continuation
|
|
517
|
+
|
|
518
|
+
**Step 4 — Extract last frame and repeat**:
|
|
519
|
+
|
|
520
|
+
```bash
|
|
521
|
+
# Extract last frame via ffmpeg
|
|
522
|
+
ffmpeg -sseof -0.1 -i clip.mp4 -frames:v 1 -q:v 2 -update 1 lastframe.jpg -y
|
|
523
|
+
|
|
524
|
+
# Generate next storyboard using last frame as reference
|
|
525
|
+
ima2 gen "Using this fight scene last frame as reference, create a 3x3 storyboard grid..." \
|
|
526
|
+
--ref lastframe.jpg --quality high --size 1024x1024
|
|
527
|
+
|
|
528
|
+
# Animate next storyboard
|
|
529
|
+
ima2 video "This is a 9-panel storyboard..." --ref storyboard2.png --duration 15
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
**Fallback: continueFromVideo** — If a storyboard image triggers content moderation (common with intense action/fight scenes), fall back to `video continue` with a detailed text prompt instead:
|
|
533
|
+
|
|
534
|
+
```bash
|
|
535
|
+
ima2 video continue "detailed action description with sound and camera direction" \
|
|
536
|
+
--video "$PREV_CLIP" --duration 15
|
|
537
|
+
```
|
|
538
|
+
|
|
539
|
+
**Clip duration is flexible** — use 15s for action-dense sequences with many beats, 10s for transitions, 5s for quick cuts. The 9-panel storyboard works best with 15s clips (each panel ≈ 1.5-2s of screen time).
|
|
540
|
+
|
|
541
|
+
**Music and sound are MANDATORY** in i2v prompts — describe the score (orchestral, percussion, taiko drums), sound effects (impacts, whooshes, crashes), dialogue lines, and audio transitions. "No music" or undefined audio produces flat, lifeless output.
|
|
465
542
|
|
|
466
543
|
#### Video Continuation (extend/sequel)
|
|
467
544
|
|