ima2-gen 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. package/CHANGELOG.md +150 -0
  2. package/README.md +12 -12
  3. package/bin/commands/backfillThumbs.js +24 -0
  4. package/bin/commands/edit.js +7 -6
  5. package/bin/commands/gen.js +13 -6
  6. package/bin/commands/multimode.js +5 -4
  7. package/bin/commands/node.js +4 -4
  8. package/bin/ima2.js +21 -11
  9. package/bin/lib/config-store.js +1 -1
  10. package/docs/API.md +184 -10
  11. package/docs/CLI.md +11 -4
  12. package/docs/FAQ.ko.md +16 -0
  13. package/docs/FAQ.md +30 -0
  14. package/docs/PROMPT_STUDIO.md +3 -1
  15. package/docs/README.ko.md +7 -3
  16. package/docs/migration/runtime-test-inventory.md +17 -1
  17. package/lib/agentImageVideoGen.js +261 -0
  18. package/lib/agentRuntime.js +11 -260
  19. package/lib/agentSettings.js +1 -1
  20. package/lib/agyImageAdapter.js +259 -0
  21. package/lib/capabilities.js +2 -1
  22. package/lib/configKeys.js +1 -1
  23. package/lib/errorClassify.js +8 -7
  24. package/lib/eventBus.js +71 -0
  25. package/lib/geminiApiImageAdapter.js +179 -0
  26. package/lib/generationErrors.js +3 -1
  27. package/lib/grokImageAdapter.js +74 -128
  28. package/lib/grokImageCore.js +153 -0
  29. package/lib/grokMultimodeAdapter.js +7 -4
  30. package/lib/grokRuntime.js +3 -0
  31. package/lib/grokSizeMapper.js +13 -1
  32. package/lib/grokVideoAdapter.js +14 -7
  33. package/lib/grokVideoCanvas.js +13 -0
  34. package/lib/grokVideoPlannerPrompt.js +53 -6
  35. package/lib/historyList.js +19 -2
  36. package/lib/imageModels.js +15 -0
  37. package/lib/imageThumb.js +38 -0
  38. package/lib/inflight.js +54 -17
  39. package/lib/multimodeHelpers.js +10 -0
  40. package/lib/nodeHelpers.js +59 -0
  41. package/lib/oauthProxy/prompts.js +30 -36
  42. package/lib/promptBuilder/systemPrompt.js +2 -5
  43. package/lib/promptSafetyPolicy.js +1 -5
  44. package/lib/providerOptions.js +36 -1
  45. package/lib/responsesFallback.js +53 -44
  46. package/lib/routeHelpers.js +44 -0
  47. package/lib/runtimeContext.js +27 -0
  48. package/lib/ssePublish.js +12 -0
  49. package/lib/storageMigration.js +1 -1
  50. package/lib/storyboardPrefix.js +28 -0
  51. package/lib/thumbBackfill.js +70 -0
  52. package/lib/vertexAuth.js +44 -0
  53. package/lib/videoThumb.js +60 -0
  54. package/package.json +7 -2
  55. package/routes/agy.js +44 -0
  56. package/routes/auth.js +242 -0
  57. package/routes/edit.js +48 -8
  58. package/routes/events.js +78 -0
  59. package/routes/generate.js +135 -135
  60. package/routes/history.js +13 -0
  61. package/routes/index.js +8 -0
  62. package/routes/keys.js +254 -0
  63. package/routes/multimode.js +138 -62
  64. package/routes/nodes.js +107 -129
  65. package/routes/quota.js +58 -7
  66. package/routes/video.js +107 -20
  67. package/server.js +123 -0
  68. package/skills/ima2/SKILL.md +98 -21
  69. package/ui/dist/.vite/manifest.json +12 -12
  70. package/ui/dist/assets/AgentWorkspace-Dth6YijN.js +3 -0
  71. package/ui/dist/assets/{CardNewsWorkspace-BN-ga1lG.js → CardNewsWorkspace-Dav3K5CT.js} +2 -2
  72. package/ui/dist/assets/{NodeCanvas-BbMa4IhI.js → NodeCanvas-C4ifFzB1.js} +2 -2
  73. package/ui/dist/assets/{PromptBuilderPanel-DRwBJRDQ.js → PromptBuilderPanel-CEcyU9PL.js} +1 -1
  74. package/ui/dist/assets/{PromptImportDialog-Dp85kHCq.js → PromptImportDialog-CgQ94Gth.js} +2 -2
  75. package/ui/dist/assets/{PromptImportDiscoverySection-BE8Q8MLD.js → PromptImportDiscoverySection-CuzyzbNI.js} +1 -1
  76. package/ui/dist/assets/{PromptImportFolderSection-PtH5x0sc.js → PromptImportFolderSection-DHLGlO6l.js} +1 -1
  77. package/ui/dist/assets/{PromptLibraryPanel-FnM9tHI9.js → PromptLibraryPanel-BOe18we8.js} +2 -2
  78. package/ui/dist/assets/SettingsWorkspace-Cdgnm4Wa.js +1 -0
  79. package/ui/dist/assets/index-C5PSahkr.js +1 -0
  80. package/ui/dist/assets/index-Dn2AhL6d.css +1 -0
  81. package/ui/dist/assets/index-Tjqx6wUV.js +23 -0
  82. package/ui/dist/index.html +2 -2
  83. package/ui/dist/assets/AgentWorkspace-C21zqdTZ.js +0 -3
  84. package/ui/dist/assets/SettingsWorkspace-MARPGyBL.js +0 -1
  85. package/ui/dist/assets/index-BAFI6htx.js +0 -42
  86. package/ui/dist/assets/index-BSXxr_Bt.js +0 -1
  87. package/ui/dist/assets/index-DS-ADE7U.css +0 -1
package/routes/video.js CHANGED
@@ -2,7 +2,11 @@ import { mkdir, readFile, unlink, writeFile } from "fs/promises";
2
2
  import { atomicWriteJson } from "../lib/atomicWrite.js";
3
3
  import { join } from "path";
4
4
  import { randomBytes } from "crypto";
5
- import { startJob, finishJob, registerJobAbortController, isJobCanceled, setJobPhase } from "../lib/inflight.js";
5
+ import { execFile } from "child_process";
6
+ import { tmpdir } from "os";
7
+ import { promisify } from "util";
8
+ const execFileAsync = promisify(execFile);
9
+ import { startJob, finishJob, registerJobAbortController, isJobCanceled, isStartJobFailure, setJobPhase, INFLIGHT_RETRY_AFTER_SECONDS } from "../lib/inflight.js";
6
10
  import { isGenerationCanceledError, makeGenerationCanceledError } from "../lib/generationCancel.js";
7
11
  import { logEvent, logError } from "../lib/logger.js";
8
12
  import { invalidateHistoryIndex } from "../lib/historyIndex.js";
@@ -13,10 +17,23 @@ import { extractGeneratedVideoFrameB64 } from "../lib/videoFrameExtract.js";
13
17
  import { normalizeGrokVideoModel, normalizeVideoResolution, normalizeVideoAspectRatio, normalizeVideoDuration, deriveVideoMode, clampVideoDuration, MAX_REF2V_REFERENCES, } from "../lib/imageModels.js";
14
18
  import { errInfo } from "../lib/errInfo.js";
15
19
  import { requireRuntimeContext } from "../lib/runtimeContext.js";
20
+ import { generateVideoThumbnail } from "../lib/videoThumb.js";
21
+ import { publish } from "../lib/eventBus.js";
22
+ import { publishJobEvent } from "../lib/ssePublish.js";
16
23
  function sendSse(res, event, data) {
17
24
  res.write(`event: ${event}\n`);
18
25
  res.write(`data: ${JSON.stringify(data)}\n\n`);
19
26
  }
27
+ function dualEmitVideo(res, requestId, event, data) {
28
+ if (!res.writableEnded)
29
+ sendSse(res, event, data);
30
+ if (event === "done") {
31
+ publishJobEvent(requestId, event, data);
32
+ }
33
+ else {
34
+ publish(requestId, event, data);
35
+ }
36
+ }
20
37
  function toArray(v) {
21
38
  return Array.isArray(v) ? v : [];
22
39
  }
@@ -50,6 +67,32 @@ async function resolveSourceImage(ctx, sourceImage, sourceFilename) {
50
67
  }
51
68
  return { b64: null, filename: null };
52
69
  }
70
+ const STORYBOARD_TRIM_SECONDS = "1.0";
71
+ async function trimStoryboardLeadIn(buffer, requestId) {
72
+ const tmpIn = join(tmpdir(), `ima2_sb_trim_in_${requestId.replace(/[^a-zA-Z0-9_-]/g, "_")}.mp4`);
73
+ const tmpOut = join(tmpdir(), `ima2_sb_trim_out_${requestId.replace(/[^a-zA-Z0-9_-]/g, "_")}.mp4`);
74
+ try {
75
+ await writeFile(tmpIn, buffer);
76
+ logEvent("video", "storyboard:trim-start", { requestId, inputBytes: buffer.length, trimSeconds: STORYBOARD_TRIM_SECONDS });
77
+ await execFileAsync("ffmpeg", [
78
+ "-y", "-ss", STORYBOARD_TRIM_SECONDS, "-i", tmpIn,
79
+ "-c:v", "libx264", "-preset", "fast", "-crf", "18",
80
+ "-c:a", "aac", "-b:a", "128k",
81
+ "-avoid_negative_ts", "make_zero", tmpOut,
82
+ ], { timeout: 60_000 });
83
+ const trimmed = await readFile(tmpOut);
84
+ logEvent("video", "storyboard:trimmed", { requestId, originalBytes: buffer.length, trimmedBytes: trimmed.length, trimSeconds: STORYBOARD_TRIM_SECONDS });
85
+ return trimmed;
86
+ }
87
+ catch (trimError) {
88
+ logEvent("video", "storyboard:trim-exec-error", { requestId, error: trimError.message, stderr: trimError.stderr?.slice?.(0, 500) });
89
+ throw trimError;
90
+ }
91
+ finally {
92
+ await unlink(tmpIn).catch(() => { });
93
+ await unlink(tmpOut).catch(() => { });
94
+ }
95
+ }
53
96
  export function registerVideoRoutes(app, ctxRaw) {
54
97
  const ctx = requireRuntimeContext(ctxRaw);
55
98
  app.post("/api/video/generate", async (req, res) => {
@@ -58,30 +101,39 @@ export function registerVideoRoutes(app, ctxRaw) {
58
101
  : typeof req.body?.clientRequestId === "string"
59
102
  ? req.body.clientRequestId
60
103
  : req.id;
104
+ const asyncMode = req.body?.async === true;
61
105
  let finishStatus = "completed";
62
106
  let finishHttpStatus = 200;
63
107
  let finishErrorCode;
64
108
  let finishMeta = {};
65
109
  let finishCanceled = false;
66
110
  const cancelController = new AbortController();
67
- res.setHeader("Content-Type", "text/event-stream; charset=utf-8");
68
- res.setHeader("Cache-Control", "no-cache, no-transform");
69
- res.setHeader("Connection", "keep-alive");
70
- res.flushHeaders?.();
111
+ if (!asyncMode) {
112
+ res.setHeader("Content-Type", "text/event-stream; charset=utf-8");
113
+ res.setHeader("Cache-Control", "no-cache, no-transform");
114
+ res.setHeader("Connection", "keep-alive");
115
+ res.flushHeaders?.();
116
+ }
71
117
  const fail = (status, code, error, extra = {}) => {
72
118
  const httpStatus = status ?? 500;
73
119
  finishStatus = "error";
74
120
  finishHttpStatus = httpStatus;
75
121
  finishErrorCode = code;
76
- sendSse(res, "error", { error, code, status: httpStatus, requestId, ...extra });
122
+ const payload = { error, code, status: httpStatus, requestId, ...extra };
123
+ publish(requestId, "error", payload);
124
+ if (asyncMode && !res.headersSent) {
125
+ return res.status(httpStatus).json(payload);
126
+ }
127
+ if (!res.writableEnded)
128
+ sendSse(res, "error", payload);
77
129
  };
78
130
  try {
79
131
  const { prompt, provider = "grok", model: rawModel } = req.body || {};
80
132
  const sessionId = typeof req.body?.sessionId === "string" ? req.body.sessionId : null;
81
133
  const clientNodeId = typeof req.body?.clientNodeId === "string" ? req.body.clientNodeId : null;
82
134
  const topic = typeof req.body?.topic === "string" ? req.body.topic.trim() : "";
83
- if (provider !== "grok")
84
- return fail(400, "VIDEO_PROVIDER_UNSUPPORTED", "video generation requires provider 'grok'");
135
+ if (provider !== "grok" && provider !== "grok-api")
136
+ return fail(400, provider === "agy" ? "AGY_VIDEO_UNSUPPORTED" : "VIDEO_PROVIDER_UNSUPPORTED", provider === "agy" ? "Gemini (agy) does not support video generation" : "video generation requires provider 'grok' or 'grok-api'");
85
137
  const storyboardActive = req.body?.storyboard === true;
86
138
  const storyboardPrefix = storyboardActive
87
139
  ? [
@@ -98,6 +150,14 @@ export function registerVideoRoutes(app, ctxRaw) {
98
150
  "- Lock lighting direction, color palette, environment, and style.",
99
151
  "- Describe ONLY what changes: action, camera movement, dialogue, sound.",
100
152
  "",
153
+ "STORYBOARD IMAGE SOURCE RULE (HIGHEST PRIORITY — OVERRIDES ALL OTHER RULES):",
154
+ "- The source image is a 3x3 storyboard grid. Panel 1 (top-left) is a BLACK LEAD-IN FRAME — it contains no scene content.",
155
+ "- The video starts from black (Panel 1), then transitions into the action scene from Panel 2.",
156
+ "- Panels 2-9 contain the action sequence. Describe and animate only Panels 2-9.",
157
+ "- Start your rewritten prompt with: 'Fading in from black into the full-screen scene of [Panel 2 description],' — the server auto-trims the black lead-in.",
158
+ "- The storyboard grid must NEVER appear as a visible grid in any frame. The output is a single continuous cinematic clip.",
159
+ "- Do NOT reference Panel 1 in the action description — it is only a technical black frame.",
160
+ "",
101
161
  "PROMPT STRUCTURE (layered caption format):",
102
162
  "- Shot foundation: type + camera motion (dolly, pan, tracking, crane, static).",
103
163
  "- Subject: action with intensity modifiers (crashes violently, drifts gently).",
@@ -164,25 +224,36 @@ export function registerVideoRoutes(app, ctxRaw) {
164
224
  }
165
225
  if (resolved.length > MAX_REF2V_REFERENCES)
166
226
  return fail(400, "GROK_VIDEO_REF_TOO_MANY", `at most ${MAX_REF2V_REFERENCES} reference images`);
167
- const mode = deriveVideoMode(resolved.length);
227
+ const incomingProviderUrl = typeof req.body?.providerUrl === "string" && req.body.providerUrl.startsWith("http") ? req.body.providerUrl : null;
228
+ const mode = incomingProviderUrl ? "image-to-video" : deriveVideoMode(resolved.length);
168
229
  const duration = clampVideoDuration(durationCheck.duration, mode);
169
230
  const referenceImages = mode === "reference-to-video" ? resolved.map((r) => r.b64) : undefined;
170
- const sourceB64 = mode === "image-to-video" ? resolved[0]?.b64 : undefined;
231
+ const sourceB64 = incomingProviderUrl || (mode === "image-to-video" ? resolved[0]?.b64 : undefined);
171
232
  const sourceFilename = resolved[0]?.filename ?? null;
172
- startJob({
233
+ const started = startJob({
173
234
  requestId,
174
235
  kind: "video",
175
236
  prompt: activePrompt,
176
237
  meta: { kind: "video", sessionId, clientNodeId, model: modelCheck.model, mode, duration, resolution: resolutionCheck.resolution },
177
238
  });
239
+ if (started && isStartJobFailure(started)) {
240
+ if (started.code === "TOO_MANY_JOBS") {
241
+ res.setHeader("Retry-After", String(INFLIGHT_RETRY_AFTER_SECONDS));
242
+ }
243
+ return fail(started.code === "TOO_MANY_JOBS" ? 429 : 409, started.code, started.code === "TOO_MANY_JOBS"
244
+ ? "Too many concurrent generation jobs"
245
+ : "Request ID already in use");
246
+ }
178
247
  registerJobAbortController(requestId, cancelController);
248
+ if (asyncMode)
249
+ res.status(202).json({ requestId });
179
250
  await mkdir(ctx.config.storage.generatedDir, { recursive: true });
180
251
  logEvent("video", "request", { requestId, mode, duration, resolution: resolutionCheck.resolution, aspectRatio: aspectCheck.aspectRatio });
181
252
  const startTime = Date.now();
182
253
  const onEvent = (ev) => {
183
254
  if (ev.phase === "submitted") {
184
255
  setJobPhase(requestId, "streaming");
185
- sendSse(res, "submitted", {
256
+ dualEmitVideo(res, requestId, "submitted", {
186
257
  requestId,
187
258
  xaiVideoRequestId: ev.xaiVideoRequestId,
188
259
  requestedModel: ev.requestedModel,
@@ -191,11 +262,11 @@ export function registerVideoRoutes(app, ctxRaw) {
191
262
  });
192
263
  }
193
264
  else if (ev.phase === "progress") {
194
- sendSse(res, "progress", { requestId, progress: typeof ev.progress === "number" ? ev.progress / 100 : null, stalled: Boolean(ev.stalled) });
265
+ dualEmitVideo(res, requestId, "progress", { requestId, progress: typeof ev.progress === "number" ? ev.progress / 100 : null, stalled: Boolean(ev.stalled) });
195
266
  }
196
267
  else {
197
268
  setJobPhase(requestId, "planning");
198
- sendSse(res, "planning", { requestId });
269
+ dualEmitVideo(res, requestId, "planning", { requestId });
199
270
  }
200
271
  };
201
272
  // Build prompt with series chain context
@@ -205,6 +276,7 @@ export function registerVideoRoutes(app, ctxRaw) {
205
276
  : activePrompt;
206
277
  const effectivePrompt = storyboardPrefix + basePrompt;
207
278
  const plannerModel = typeof req.body?.plannerModel === "string" ? req.body.plannerModel.trim() : undefined;
279
+ const directApiKey = provider === "grok-api" ? ctx.xaiApiKey : undefined;
208
280
  const result = await generateVideoViaGrok(effectivePrompt, ctx, {
209
281
  model: modelCheck.model,
210
282
  mode,
@@ -217,7 +289,9 @@ export function registerVideoRoutes(app, ctxRaw) {
217
289
  requestId,
218
290
  continuityLineage: parentLineage,
219
291
  plannerModel: plannerModel || undefined,
292
+ directApiKey,
220
293
  onEvent,
294
+ storyboardActive,
221
295
  });
222
296
  const rand = randomBytes(ctx.config.ids.generatedHexBytes).toString("hex");
223
297
  const filename = `${Date.now()}_${rand}.mp4`;
@@ -231,13 +305,14 @@ export function registerVideoRoutes(app, ctxRaw) {
231
305
  const meta = {
232
306
  kind: "video",
233
307
  mediaType: "video",
308
+ providerUrl: result.url,
234
309
  requestId,
235
310
  sessionId,
236
311
  clientNodeId,
237
312
  prompt: activePrompt,
238
313
  userPrompt: activePrompt,
239
314
  revisedPrompt: result.revisedPrompt,
240
- provider: "grok",
315
+ provider,
241
316
  model: result.effectiveModel,
242
317
  requestedModel: result.requestedModel,
243
318
  effectiveModel: result.effectiveModel,
@@ -260,14 +335,25 @@ export function registerVideoRoutes(app, ctxRaw) {
260
335
  ...(topic ? { videoSeries: { topic, chainIndex: chain.length } } : {}),
261
336
  ...(storyboardActive ? { storyboard: true } : {}),
262
337
  };
263
- await saveGeneratedVideoArtifact(ctx, filename, result.videoBuffer, meta);
338
+ let finalBuffer = result.videoBuffer;
339
+ if (storyboardActive) {
340
+ try {
341
+ finalBuffer = await trimStoryboardLeadIn(result.videoBuffer, requestId);
342
+ }
343
+ catch (trimErr) {
344
+ logEvent("video", "storyboard:trim-failed", { requestId, error: trimErr.message });
345
+ }
346
+ }
347
+ await saveGeneratedVideoArtifact(ctx, filename, finalBuffer, meta);
348
+ generateVideoThumbnail(join(ctx.config.storage.generatedDir, filename)).catch(() => { });
264
349
  invalidateHistoryIndex();
265
350
  finishMeta = { filename, xaiVideoRequestId: result.xaiVideoRequestId };
266
351
  logEvent("video", "saved", { requestId, filename, bytes: result.videoBuffer.length, elapsedMs: Date.now() - startTime });
267
- sendSse(res, "done", {
352
+ dualEmitVideo(res, requestId, "done", {
268
353
  requestId,
269
354
  filename,
270
355
  url: `/generated/${encodeURIComponent(filename)}`,
356
+ providerUrl: result.url,
271
357
  mediaType: "video",
272
358
  revisedPrompt: result.revisedPrompt,
273
359
  elapsed,
@@ -287,19 +373,20 @@ export function registerVideoRoutes(app, ctxRaw) {
287
373
  finishCanceled = true;
288
374
  finishHttpStatus = canceled.status;
289
375
  finishErrorCode = canceled.code;
290
- sendSse(res, "error", { error: canceled.message, code: canceled.code, status: canceled.status, requestId });
376
+ dualEmitVideo(res, requestId, "error", { error: canceled.message, code: canceled.code, status: canceled.status, requestId });
291
377
  }
292
378
  else {
293
379
  finishStatus = "error";
294
380
  finishHttpStatus = err.status || 500;
295
381
  finishErrorCode = err.code || "GROK_VIDEO_FAILED";
296
382
  logError("video", "error", err.raw, { requestId, code: finishErrorCode });
297
- sendSse(res, "error", { error: err.message, code: finishErrorCode, status: finishHttpStatus, requestId });
383
+ dualEmitVideo(res, requestId, "error", { error: err.message, code: finishErrorCode, status: finishHttpStatus, requestId });
298
384
  }
299
385
  }
300
386
  finally {
301
387
  finishJob(requestId, { canceled: finishCanceled, status: finishStatus, httpStatus: finishHttpStatus, errorCode: finishErrorCode, meta: finishMeta });
302
- res.end();
388
+ if (!res.writableEnded)
389
+ res.end();
303
390
  }
304
391
  });
305
392
  }
package/server.js CHANGED
@@ -42,6 +42,92 @@ async function loadApiKey() {
42
42
  }
43
43
  return { apiKey: null, apiKeySource: "none" };
44
44
  }
45
+ async function loadXaiApiKey() {
46
+ if (process.env.XAI_API_KEY) {
47
+ return { apiKey: process.env.XAI_API_KEY, apiKeySource: "env" };
48
+ }
49
+ const candidates = [
50
+ config.storage.configFile,
51
+ join(rootDir, ".ima2", "config.json"),
52
+ ];
53
+ for (const cfgPath of candidates) {
54
+ if (!existsSync(cfgPath))
55
+ continue;
56
+ try {
57
+ const cfg = JSON.parse(await readFile(cfgPath, "utf-8"));
58
+ if (cfg.xaiApiKey)
59
+ return { apiKey: cfg.xaiApiKey, apiKeySource: "config" };
60
+ }
61
+ catch { }
62
+ }
63
+ return { apiKey: null, apiKeySource: "none" };
64
+ }
65
+ async function loadGeminiApiKey() {
66
+ if (process.env.GEMINI_API_KEY) {
67
+ return { apiKey: process.env.GEMINI_API_KEY, apiKeySource: "env" };
68
+ }
69
+ const candidates = [
70
+ config.storage.configFile,
71
+ join(rootDir, ".ima2", "config.json"),
72
+ ];
73
+ for (const cfgPath of candidates) {
74
+ if (!existsSync(cfgPath))
75
+ continue;
76
+ try {
77
+ const cfg = JSON.parse(await readFile(cfgPath, "utf-8"));
78
+ if (cfg.geminiApiKey)
79
+ return { apiKey: cfg.geminiApiKey, apiKeySource: "config" };
80
+ }
81
+ catch { }
82
+ }
83
+ return { apiKey: null, apiKeySource: "none" };
84
+ }
85
+ async function loadVertexKey() {
86
+ const envJson = process.env.VERTEX_SERVICE_ACCOUNT_JSON;
87
+ if (envJson) {
88
+ try {
89
+ const parsed = JSON.parse(envJson);
90
+ return { json: envJson, projectId: parsed.project_id || null, source: "env" };
91
+ }
92
+ catch {
93
+ return { json: null, projectId: null, source: "none" };
94
+ }
95
+ }
96
+ const candidates = [
97
+ config.storage.configFile,
98
+ join(rootDir, ".ima2", "config.json"),
99
+ ];
100
+ for (const cfgPath of candidates) {
101
+ if (!existsSync(cfgPath))
102
+ continue;
103
+ try {
104
+ const cfg = JSON.parse(await readFile(cfgPath, "utf-8"));
105
+ if (cfg.vertexServiceAccountJson) {
106
+ const parsed = JSON.parse(cfg.vertexServiceAccountJson);
107
+ return { json: cfg.vertexServiceAccountJson, projectId: parsed.project_id || null, source: "config" };
108
+ }
109
+ }
110
+ catch { }
111
+ }
112
+ return { json: null, projectId: null, source: "none" };
113
+ }
114
+ async function loadGeminiAuthMode() {
115
+ const candidates = [
116
+ config.storage.configFile,
117
+ join(rootDir, ".ima2", "config.json"),
118
+ ];
119
+ for (const cfgPath of candidates) {
120
+ if (!existsSync(cfgPath))
121
+ continue;
122
+ try {
123
+ const cfg = JSON.parse(await readFile(cfgPath, "utf-8"));
124
+ if (cfg.geminiAuthMode === "vertex" || cfg.geminiAuthMode === "apikey")
125
+ return cfg.geminiAuthMode;
126
+ }
127
+ catch { }
128
+ }
129
+ return undefined;
130
+ }
45
131
  async function createOpenAI(apiKey) {
46
132
  if (!apiKey)
47
133
  return null;
@@ -143,6 +229,10 @@ export async function createRuntimeContext(overrides = {}) {
143
229
  apiKeySource: overrides.apiKeySource ?? (overrides.apiKey ? "env" : "none"),
144
230
  }
145
231
  : await loadApiKey();
232
+ const loadedXaiKey = await loadXaiApiKey();
233
+ const loadedGeminiKey = await loadGeminiApiKey();
234
+ const loadedVertexKey = await loadVertexKey();
235
+ const geminiAuthMode = await loadGeminiAuthMode();
146
236
  const apiKey = loadedKey.apiKey;
147
237
  const openai = overrides.openai ?? await createOpenAI(apiKey);
148
238
  const oauthPort = config.oauth.proxyPort;
@@ -170,6 +260,16 @@ export async function createRuntimeContext(overrides = {}) {
170
260
  openai,
171
261
  startedAt: overrides.startedAt ?? Date.now(),
172
262
  packageVersion: overrides.packageVersion ?? readPackageVersion(),
263
+ xaiApiKey: loadedXaiKey.apiKey ?? undefined,
264
+ xaiApiKeySource: loadedXaiKey.apiKeySource,
265
+ hasXaiApiKey: !!loadedXaiKey.apiKey,
266
+ geminiApiKey: loadedGeminiKey.apiKey ?? undefined,
267
+ geminiApiKeySource: loadedGeminiKey.apiKeySource,
268
+ hasGeminiApiKey: !!loadedGeminiKey.apiKey,
269
+ vertexServiceAccountJson: loadedVertexKey.json ?? undefined,
270
+ vertexProjectId: loadedVertexKey.projectId ?? undefined,
271
+ hasVertexKey: !!loadedVertexKey.json,
272
+ geminiAuthMode,
173
273
  oauthReadyPromise: oauthReadyPromise,
174
274
  markGrokProxyPort: ({ url, port } = {}) => {
175
275
  if (port)
@@ -194,6 +294,13 @@ export async function createRuntimeContext(overrides = {}) {
194
294
  };
195
295
  if (!config.oauth.autoStart)
196
296
  ctx.markOAuthReady({ url: ctx.oauthUrl, port: ctx.oauthPort });
297
+ if (loadedVertexKey.json) {
298
+ try {
299
+ const { initVertexAuth } = await import("./lib/vertexAuth.js");
300
+ initVertexAuth(loadedVertexKey.json);
301
+ }
302
+ catch { /* vertex init failure is non-fatal */ }
303
+ }
197
304
  return ctx;
198
305
  }
199
306
  export async function startServer(overrides = {}) {
@@ -283,6 +390,22 @@ export async function startServer(overrides = {}) {
283
390
  const err = errInfo(e);
284
391
  console.error("[db] bootstrap failed:", err.message);
285
392
  }
393
+ // Background thumbnail backfill for updated users (recursive — covers video
394
+ // series subdirectories like continuous_*/clip_NN.mp4, not just top level).
395
+ (async () => {
396
+ try {
397
+ const { backfillThumbnails } = await import("./lib/thumbBackfill.js");
398
+ const r = await backfillThumbnails(ctx.config.storage.generatedDir);
399
+ if (r.created > 0) {
400
+ console.log(`[thumbs] backfill: ${r.created} created, ${r.skipped} skipped, ${r.failed} failed (${r.total} media files)`);
401
+ const { invalidateHistoryIndex } = await import("./lib/historyIndex.js");
402
+ invalidateHistoryIndex();
403
+ }
404
+ }
405
+ catch (e) {
406
+ console.warn("[thumbs] backfill failed:", e instanceof Error ? e.message : e);
407
+ }
408
+ })();
286
409
  server.on("error", (err) => {
287
410
  console.error("[server] Failed to start:", err?.message || err);
288
411
  process.exit(1);
@@ -143,13 +143,22 @@ Do not use positional edit prompts. `ima2 edit` requires `--prompt`.
143
143
 
144
144
  ## Parallel Generation
145
145
 
146
- There is no `--parallel` flag. For CLI-controlled parallel work, start several normal jobs:
146
+ There is no `--parallel` flag. For multiple candidates from the same prompt,
147
+ prefer one server-side batch request:
147
148
 
148
149
  ```bash
149
- ima2 gen "variation 1" --quality high
150
- ima2 gen "variation 2" --quality high
151
- ima2 gen "variation 3" --quality high
152
- ima2 gen "variation 4" --quality high
150
+ ima2 gen "four poster candidates" -n 4 -d ./out --quality high
151
+ ima2 multimode "four different poster directions" --max-images 4
152
+ ```
153
+
154
+ For truly different prompts, independent CLI jobs can run concurrently against
155
+ the same server. Capture request IDs with JSON output, then monitor or cancel:
156
+
157
+ ```bash
158
+ ima2 gen "variation 1" --quality high --json
159
+ ima2 gen "variation 2" --quality high --json
160
+ ima2 ps --json
161
+ ima2 cancel <requestId>
153
162
  ```
154
163
 
155
164
  Treat `capabilities.limits.maxParallel` as advisory client-side queue guidance only.
@@ -402,6 +411,12 @@ Generate a high-quality still image first, then animate it. This produces better
402
411
 
403
412
  **Critical rule for i2v**: Compose ALL characters and the environment together in ONE image. Do NOT use individual portrait refs for i2v — the video model needs a single composed scene to animate from.
404
413
 
414
+ **Keyframe image provider rule (MANDATORY)**:
415
+ - **Primary**: GPT Image 2 (OpenAI, `provider: oauth`) with `quality: high`, maximum resolution matching the target video aspect ratio. For 16:9 video use `1792x1024`. For 1:1 use `1024x1024`. For 9:16 use `1024x1792`.
416
+ - **Fallback**: Grok (`provider: grok`, model `grok-imagine-image-quality`). Only aspect ratio must match — resolution does not matter because i2v accepts any resolution source image and internally rescales.
417
+ - GPT Image 2 produces superior keyframes: better lighting coherence, character consistency, and fine detail that survives i2v animation. Always try GPT first.
418
+ - The i2v model internally rescales the source image to its native resolution regardless of input size, so there is no benefit to upscaling a Grok fallback image.
419
+
405
420
  **ref2v vs i2v decision**:
406
421
 
407
422
  | Scenario | Use | Why |
@@ -412,8 +427,12 @@ Generate a high-quality still image first, then animate it. This produces better
412
427
 
413
428
  ```bash
414
429
  # Multi-character scene: compose BOTH characters in one image first
430
+ # Primary: GPT Image 2 at high quality, max resolution, aspect ratio matching 16:9 video
415
431
  ima2 gen "cinematic wide shot of Bruce Lee in yellow tracksuit facing Elon Musk in dark gi, underground fight arena, dramatic lighting, 16:9" --quality high --size 1792x1024 -o scene.png
416
432
 
433
+ # Fallback if GPT fails: Grok quality model, match aspect ratio only
434
+ # ima2 gen "same prompt" --provider grok --model grok-imagine-image-quality --size 1824x1024 -o scene.png
435
+
417
436
  # Then animate from the composed scene
418
437
  ima2 video "Bruce throws a rapid jeet kune do combination" --ref scene.png --duration 10 --resolution 720p --aspect-ratio 16:9
419
438
  ```
@@ -438,30 +457,88 @@ ima2 video "close-up of rain drops on a neon sign reflection" \
438
457
 
439
458
  The planner receives previous prompts from the same topic as continuity context. This is best-effort prompt guidance, not a guarantee that subjects, palette, or style will remain identical. For branch-local continuation, use `ima2 video continue` instead.
440
459
 
441
- #### Storyboard-to-Video Chaining (imagevideo→lastframe loop)
460
+ #### Storyboard-to-Video Chaining (9-panel storyboard i2v loop)
461
+
462
+ The highest-quality video production workflow. Since Grok i2v accepts only **one image input**, pack the entire action sequence into a single 3×3 (9-panel) storyboard grid image. The i2v model reads the panels as a visual script and animates the progression.
463
+
464
+ **Full workflow**:
465
+
466
+ ```
467
+ keyframe image (GPT high)
468
+ → GPT i2i with reference → 9-panel storyboard grid
469
+ → Grok i2v (reads panels, animates sequence)
470
+ → extract last frame
471
+ → GPT i2i with last frame → next 9-panel storyboard
472
+ → Grok i2v
473
+ → repeat
474
+ ```
475
+
476
+ **Step 1 — Opening keyframe** (GPT Image 2, `quality: high`, max resolution matching target aspect ratio):
477
+
478
+ ```bash
479
+ ima2 gen "cinematic wide shot of two fighters in a dojo, dramatic lighting" \
480
+ --quality high --size 1792x1024 --storyboard
481
+ ```
482
+
483
+ Fallback: Grok `grok-imagine-image-quality`, match aspect ratio only — resolution does not matter because i2v internally rescales.
442
484
 
443
- For maximum control, generate each keyframe as a GPT Image 2 still, animate it, extract the last frame, and use it as the anchor for the next keyframe:
485
+ **Step 2 — 9-panel storyboard grid** (GPT Image 2 with keyframe as reference):
444
486
 
445
487
  ```bash
446
- # Step 1: Generate composed keyframe
447
- ima2 gen "Bruce and Elon face off in underground arena, dramatic lighting" --quality high --size 1792x1024 -o frame1.png
488
+ # Use the keyframe as reference, prompt describes 9 sequential panels
489
+ ima2 gen "Using this scene as reference, create a 3x3 storyboard grid (9 panels, thin black borders) showing a 15-second action sequence. Panel 1 (0s): ... Panel 2 (2s): ... Panel 9 (15s): ... Maintain identical character designs across all panels." \
490
+ --ref keyframe.png --quality high --size 1024x1024
491
+ ```
448
492
 
449
- # Step 2: Animate (i2v, 10s clip)
450
- ima2 video "Bruce throws JKD combination" --ref frame1.png --duration 10 --resolution 720p
493
+ **9-panel storyboard rules**:
494
+ - Grid layout: 3×3, thin black borders between panels
495
+ - Read order: left-to-right, top-to-bottom (panels 1-9)
496
+ - **Panel 1 (top-left) MUST be solid black** — this is a lead-in frame, not content. The i2v model starts from Panel 1's pixels; a black frame ensures the video begins with a clean fade-in instead of showing the grid. The 1-second black lead-in is auto-trimmed by the server.
497
+ - Panels 2-9 carry the action sequence (8 key moments with timestamps)
498
+ - Character designs MUST be identical across all panels
499
+ - Vary camera angle per panel for dynamic energy
500
+ - Each panel should look like a film still, not a sketch
501
+ - Do NOT add timestamp labels or text to panels — they burn into the video
502
+ - Square format (1024×1024) works best — i2v rescales internally
451
503
 
452
- # Step 3: Continue from last frame (sequential, not parallel)
453
- CLIP1=$(ima2 ls -n 1 --json | jq -r '.items[0].filename')
454
- ima2 video continue "Elon counterattacks with haymaker" --video "$CLIP1" --duration 10
504
+ **Step 3 — Animate storyboard via i2v**:
455
505
 
456
- # Repeat: each clip's last frame seeds the next
506
+ ```bash
507
+ ima2 video "This is a 9-panel storyboard. Animate the full sequence as one continuous 15-second clip following panels left-to-right, top-to-bottom. Panel 1: ... Panel 9: ... Sound: [describe music, SFX, dialogue]. Camera: [describe movement per beat]." \
508
+ --ref storyboard.png --duration 15 --resolution 720p --model grok-imagine-video-1.5-preview
457
509
  ```
458
510
 
459
- **GPT Image 2 storyboard prompting rules** (from production research):
460
- - Copy character visual descriptions **verbatim** across all frame prompts — do not paraphrase
461
- - First frame is the **anchor**: all subsequent frames inherit its composition, lighting, and character designs
462
- - Change **one variable per step**: shot scale, action, or camera — keep everything else constant
463
- - Use the `images.edit` API with `image[]` array or Responses API `input_image` content blocks for multi-ref
464
- - ChatGPT Thinking mode (not API) can produce up to 8 consistent frames from one prompt; API users should generate frames sequentially with shared character descriptions
511
+ **i2v prompt rules for storyboard input**:
512
+ - Explicitly state "This is a 9-panel storyboard" at the start
513
+ - Reference each panel by number with its action description
514
+ - Always include Sound/Music direction — never leave audio undefined
515
+ - Include Camera direction per beat (wide, close-up, tracking, handheld, slow-mo)
516
+ - Describe the end frame explicitly for continuation
517
+
518
+ **Step 4 — Extract last frame and repeat**:
519
+
520
+ ```bash
521
+ # Extract last frame via ffmpeg
522
+ ffmpeg -sseof -0.1 -i clip.mp4 -frames:v 1 -q:v 2 -update 1 lastframe.jpg -y
523
+
524
+ # Generate next storyboard using last frame as reference
525
+ ima2 gen "Using this fight scene last frame as reference, create a 3x3 storyboard grid..." \
526
+ --ref lastframe.jpg --quality high --size 1024x1024
527
+
528
+ # Animate next storyboard
529
+ ima2 video "This is a 9-panel storyboard..." --ref storyboard2.png --duration 15
530
+ ```
531
+
532
+ **Fallback: continueFromVideo** — If a storyboard image triggers content moderation (common with intense action/fight scenes), fall back to `video continue` with a detailed text prompt instead:
533
+
534
+ ```bash
535
+ ima2 video continue "detailed action description with sound and camera direction" \
536
+ --video "$PREV_CLIP" --duration 15
537
+ ```
538
+
539
+ **Clip duration is flexible** — use 15s for action-dense sequences with many beats, 10s for transitions, 5s for quick cuts. The 9-panel storyboard works best with 15s clips (each panel ≈ 1.5-2s of screen time).
540
+
541
+ **Music and sound are MANDATORY** in i2v prompts — describe the score (orchestral, percussion, taiko drums), sound effects (impacts, whooshes, crashes), dialogue lines, and audio transitions. "No music" or undefined audio produces flat, lifeless output.
465
542
 
466
543
  #### Video Continuation (extend/sequel)
467
544