ima2-gen 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +2 -11
  2. package/bin/commands/backfillThumbs.js +18 -0
  3. package/bin/commands/edit.js +7 -6
  4. package/bin/commands/gen.js +7 -6
  5. package/bin/commands/multimode.js +5 -4
  6. package/bin/commands/node.js +4 -4
  7. package/bin/ima2.js +7 -1
  8. package/bin/lib/config-store.js +1 -1
  9. package/docs/API.md +55 -4
  10. package/docs/CLI.md +9 -3
  11. package/docs/PROMPT_STUDIO.md +3 -1
  12. package/docs/migration/runtime-test-inventory.md +3 -1
  13. package/lib/agentRuntime.js +22 -16
  14. package/lib/agentSettings.js +1 -1
  15. package/lib/agyImageAdapter.js +232 -0
  16. package/lib/capabilities.js +2 -1
  17. package/lib/configKeys.js +1 -1
  18. package/lib/geminiApiImageAdapter.js +183 -0
  19. package/lib/grokImageAdapter.js +16 -9
  20. package/lib/grokMultimodeAdapter.js +2 -1
  21. package/lib/grokRuntime.js +3 -0
  22. package/lib/grokSizeMapper.js +13 -1
  23. package/lib/grokVideoAdapter.js +14 -7
  24. package/lib/historyList.js +18 -2
  25. package/lib/imageModels.js +15 -0
  26. package/lib/imageThumb.js +38 -0
  27. package/lib/providerOptions.js +36 -1
  28. package/lib/responsesFallback.js +52 -44
  29. package/lib/runtimeContext.js +27 -0
  30. package/lib/storageMigration.js +1 -1
  31. package/lib/thumbBackfill.js +59 -0
  32. package/lib/vertexAuth.js +44 -0
  33. package/lib/videoThumb.js +60 -0
  34. package/package.json +4 -2
  35. package/routes/auth.js +238 -0
  36. package/routes/edit.js +41 -7
  37. package/routes/generate.js +40 -12
  38. package/routes/history.js +13 -0
  39. package/routes/index.js +4 -0
  40. package/routes/keys.js +254 -0
  41. package/routes/multimode.js +39 -6
  42. package/routes/nodes.js +57 -35
  43. package/routes/quota.js +58 -7
  44. package/routes/video.js +7 -3
  45. package/server.js +123 -0
  46. package/ui/dist/.vite/manifest.json +12 -12
  47. package/ui/dist/assets/AgentWorkspace-CYv84Rus.js +3 -0
  48. package/ui/dist/assets/{CardNewsWorkspace-BN-ga1lG.js → CardNewsWorkspace-Dqyc1WZ1.js} +2 -2
  49. package/ui/dist/assets/{NodeCanvas-BbMa4IhI.js → NodeCanvas-ChEXzQbb.js} +2 -2
  50. package/ui/dist/assets/{PromptBuilderPanel-DRwBJRDQ.js → PromptBuilderPanel-B95ZufnR.js} +1 -1
  51. package/ui/dist/assets/{PromptImportDialog-Dp85kHCq.js → PromptImportDialog-DGOwFQET.js} +2 -2
  52. package/ui/dist/assets/{PromptImportDiscoverySection-BE8Q8MLD.js → PromptImportDiscoverySection-CgvdnR49.js} +1 -1
  53. package/ui/dist/assets/{PromptImportFolderSection-PtH5x0sc.js → PromptImportFolderSection-CfUye9J8.js} +1 -1
  54. package/ui/dist/assets/{PromptLibraryPanel-FnM9tHI9.js → PromptLibraryPanel-B9kndPw1.js} +2 -2
  55. package/ui/dist/assets/SettingsWorkspace-B3tgLrmF.js +1 -0
  56. package/ui/dist/assets/index-BhcvL0g-.js +1 -0
  57. package/ui/dist/assets/index-BtK3YhJc.js +39 -0
  58. package/ui/dist/assets/index-ClOLOjnA.css +1 -0
  59. package/ui/dist/index.html +2 -2
  60. package/ui/dist/assets/AgentWorkspace-C21zqdTZ.js +0 -3
  61. package/ui/dist/assets/SettingsWorkspace-MARPGyBL.js +0 -1
  62. package/ui/dist/assets/index-BAFI6htx.js +0 -42
  63. package/ui/dist/assets/index-BSXxr_Bt.js +0 -1
  64. package/ui/dist/assets/index-DS-ADE7U.css +0 -1
package/README.md CHANGED
@@ -83,16 +83,6 @@ npm install -g ima2-gen@latest
83
83
 
84
84
  Ctrl+C now performs a clean shutdown — closing the database, stopping child processes, and releasing file locks. On older versions (< 1.1.22) or if you see `EBUSY` on Windows, use the install script which handles stale process cleanup automatically.
85
85
 
86
- ## What's New in v1.1.22
87
-
88
- - **Storyboard mode**: composer toggle for maintaining character/scene continuity across sequential frames. Works in both image and video pipelines.
89
- - **Planner model selection**: choose the Grok planner model (grok-4.3 default) from video settings or via `--planner-model` CLI flag.
90
- - **Video frame copy**: First/Mid/Last frame extraction buttons on video results for easy keyframe copying.
91
- - **Multi-character dialogue**: video/image planners now identify characters by visual appearance (clothing + physique + props) instead of names, improving dialogue attribution.
92
- - **Graceful shutdown**: Ctrl+C now properly closes DB, server sockets, and child processes — fixes Windows EBUSY on npm update.
93
- - **Cross-platform install scripts**: one-click install for macOS, Windows, and Linux (auto-detects nvm/fnm/brew/winget).
94
- - **Atomic sidecar writes**: metadata files now use temp+rename to prevent corruption on crash.
95
-
96
86
  ## What It Does
97
87
 
98
88
  - **Classic mode**: generate, edit, reuse the current image, paste references, and continue from history.
@@ -109,11 +99,12 @@ Ctrl+C now performs a clean shutdown — closing the database, stopping child pr
109
99
 
110
100
  ## Provider Paths
111
101
 
112
- Image generation can run through the local Codex/ChatGPT OAuth path, a configured OpenAI API key, or the bundled Grok provider.
102
+ Image generation can run through the local Codex/ChatGPT OAuth path, a configured OpenAI API key, the bundled Grok provider, or the Gemini provider via Antigravity CLI.
113
103
 
114
104
  - `provider: "oauth"` uses the local Codex OAuth proxy.
115
105
  - `provider: "api"` calls the OpenAI Responses API with the hosted `image_generation` tool.
116
106
  - `provider: "grok"` starts bundled `progrok` on `127.0.0.1:18645`, runs mandatory xAI Web Search plus a planner pass (default: `grok-4.3`, configurable in settings or via `--planner-model`), then calls xAI Images API through the local proxy.
107
+ - `provider: "agy"` spawns the Antigravity CLI (`agy -p`) to generate images via Google Gemini's `default_api:generate_image` tool (model: `nano-banana-2`). Output is fixed at 1024×1024 JPEG, max 3 reference images. No web search, quality, or size controls.
117
108
  - API-key generation supports classic generate, edit, mask-guided edit, multimode, and node generation.
118
109
  - Grok generation supports Classic, Node, and Agent flows. If a Classic reference, Node parent image, or Agent current image is present, ima2 switches the final Grok call to xAI image edit so image-to-image context is preserved.
119
110
 
@@ -0,0 +1,18 @@
1
+ import { config } from "../../config.js";
2
+ import { backfillThumbnails } from "../../lib/thumbBackfill.js";
3
+ import { invalidateHistoryIndex } from "../../lib/historyIndex.js";
4
+ export async function backfillThumbs() {
5
+ const dir = config.storage.generatedDir;
6
+ console.log(`[thumbs] Scanning ${dir} (recursive) for missing thumbnails...`);
7
+ let r;
8
+ try {
9
+ r = await backfillThumbnails(dir);
10
+ }
11
+ catch (e) {
12
+ console.error("[thumbs] Backfill failed:", e instanceof Error ? e.message : e);
13
+ return;
14
+ }
15
+ if (r.created > 0)
16
+ invalidateHistoryIndex();
17
+ console.log(`[thumbs] Done: ${r.created} created, ${r.skipped} skipped (already exist), ${r.failed} failed out of ${r.total} media files.`);
18
+ }
@@ -7,8 +7,8 @@ import { createCliRequestId, recoverGeneratedOutputs, formatRecoveryHint } from
7
7
  import { errInfo } from "../../lib/errInfo.js";
8
8
  const VALID_MODES = new Set(["auto", "direct"]);
9
9
  const VALID_MODERATION = new Set(["auto", "low"]);
10
- const VALID_PROVIDERS = new Set(["auto", "oauth", "api", "grok"]);
11
- const KNOWN_IMAGE_MODELS = new Set(["gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.3-codex-spark", "grok-imagine-image", "grok-imagine-image-quality"]);
10
+ const VALID_PROVIDERS = new Set(["auto", "oauth", "api", "grok", "grok-api", "agy", "gemini-api"]);
11
+ const KNOWN_IMAGE_MODELS = new Set(["gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.3-codex-spark", "grok-imagine-image", "grok-imagine-image-quality", "nano-banana-2", "nano-banana-pro"]);
12
12
  const SPEC = {
13
13
  flags: {
14
14
  prompt: { short: "p", type: "string" },
@@ -40,8 +40,9 @@ const HELP = `
40
40
  -s, --size <WxH>
41
41
  -o, --out <file>
42
42
  --json
43
- --model <gpt-5.5|gpt-5.4|gpt-5.4-mini|grok-imagine-image|grok-imagine-image-quality>
44
- --provider <auto|oauth|api|grok> Provider (oauth = GPT OAuth; grok = xAI Grok)
43
+ --model <gpt-5.5|gpt-5.4|gpt-5.4-mini|grok-imagine-image|grok-imagine-image-quality|nano-banana-2|nano-banana-pro>
44
+ --provider <auto|oauth|api|grok|grok-api|agy|gemini-api>
45
+ Provider (oauth = GPT OAuth; grok = xAI Grok; agy/gemini-api = Gemini)
45
46
  --mode <auto|direct> Prompt handling mode. Default: auto
46
47
  --moderation <auto|low> Default: low
47
48
  --session <id> Apply session style sheet if enabled
@@ -64,10 +65,10 @@ export default async function editCmd(argv) {
64
65
  if (!VALID_MODERATION.has(String(args.moderation)))
65
66
  die(2, "--moderation must be one of: auto, low");
66
67
  if (args.provider && !VALID_PROVIDERS.has(String(args.provider))) {
67
- die(2, "--provider must be one of: auto, oauth, api, grok");
68
+ die(2, "--provider must be one of: auto, oauth, api, grok, grok-api, agy, gemini-api");
68
69
  }
69
70
  if (args.model && !KNOWN_IMAGE_MODELS.has(String(args.model))) {
70
- die(2, "--model must be one of: gpt-5.5, gpt-5.4, gpt-5.4-mini, gpt-5.3-codex-spark, grok-imagine-image, grok-imagine-image-quality");
71
+ die(2, "--model must be one of: gpt-5.5, gpt-5.4, gpt-5.4-mini, gpt-5.3-codex-spark, grok-imagine-image, grok-imagine-image-quality, nano-banana-2, nano-banana-pro");
71
72
  }
72
73
  const VALID_REASONING = new Set(["none", "low", "medium", "high", "xhigh"]);
73
74
  if (args["reasoning-effort"] && !VALID_REASONING.has(String(args["reasoning-effort"]))) {
@@ -7,8 +7,8 @@ import { createCliRequestId, recoverGeneratedOutputs, formatRecoveryHint } from
7
7
  import { errInfo } from "../../lib/errInfo.js";
8
8
  const VALID_MODES = new Set(["auto", "direct"]);
9
9
  const VALID_MODERATION = new Set(["auto", "low"]);
10
- const VALID_PROVIDERS = new Set(["auto", "oauth", "api", "grok"]);
11
- const KNOWN_IMAGE_MODELS = new Set(["gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.3-codex-spark", "grok-imagine-image", "grok-imagine-image-quality"]);
10
+ const VALID_PROVIDERS = new Set(["auto", "oauth", "api", "grok", "grok-api", "agy", "gemini-api"]);
11
+ const KNOWN_IMAGE_MODELS = new Set(["gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.3-codex-spark", "grok-imagine-image", "grok-imagine-image-quality", "nano-banana-2", "nano-banana-pro"]);
12
12
  const SPEC = {
13
13
  flags: {
14
14
  quality: { short: "q", type: "string", default: "low" },
@@ -51,8 +51,9 @@ const HELP = `
51
51
  --stdin Read prompt from stdin
52
52
  --timeout <sec> Default: 180
53
53
  --server <url> Override server URL
54
- --model <gpt-5.5|gpt-5.4|gpt-5.4-mini|grok-imagine-image|grok-imagine-image-quality>
55
- --provider <auto|oauth|api|grok> Provider (oauth = GPT OAuth; grok = xAI Grok)
54
+ --model <gpt-5.5|gpt-5.4|gpt-5.4-mini|grok-imagine-image|grok-imagine-image-quality|nano-banana-2|nano-banana-pro>
55
+ --provider <auto|oauth|api|grok|grok-api|agy|gemini-api>
56
+ Provider (oauth = GPT OAuth; grok = xAI Grok; agy/gemini-api = Gemini)
56
57
  --mode <auto|direct> Prompt handling mode. Default: auto
57
58
  --moderation <auto|low> Default: low
58
59
  --session <id> Apply session style sheet if enabled
@@ -88,10 +89,10 @@ export default async function genCmd(argv) {
88
89
  if (!VALID_MODERATION.has(String(args.moderation)))
89
90
  die(2, "--moderation must be one of: auto, low");
90
91
  if (args.provider && !VALID_PROVIDERS.has(String(args.provider))) {
91
- die(2, "--provider must be one of: auto, oauth, api, grok");
92
+ die(2, "--provider must be one of: auto, oauth, api, grok, grok-api, agy, gemini-api");
92
93
  }
93
94
  if (args.model && !KNOWN_IMAGE_MODELS.has(String(args.model))) {
94
- die(2, "--model must be one of: gpt-5.5, gpt-5.4, gpt-5.4-mini, gpt-5.3-codex-spark, grok-imagine-image, grok-imagine-image-quality");
95
+ die(2, "--model must be one of: gpt-5.5, gpt-5.4, gpt-5.4-mini, gpt-5.3-codex-spark, grok-imagine-image, grok-imagine-image-quality, nano-banana-2, nano-banana-pro");
95
96
  }
96
97
  const VALID_REASONING = new Set(["none", "low", "medium", "high", "xhigh"]);
97
98
  if (args["reasoning-effort"] && !VALID_REASONING.has(String(args["reasoning-effort"]))) {
@@ -40,8 +40,9 @@ const HELP = `
40
40
  -o, --out <file> First image (implies --max-images 1)
41
41
  -d, --out-dir <dir> Output dir for multiple images
42
42
  --json
43
- --model <gpt-5.5|gpt-5.4|gpt-5.4-mini>
44
- --provider <auto|oauth|api|grok> Provider (oauth = GPT OAuth; grok = xAI Grok)
43
+ --model <gpt-5.5|gpt-5.4|gpt-5.4-mini|grok-imagine-image|grok-imagine-image-quality|nano-banana-2|nano-banana-pro>
44
+ --provider <auto|oauth|api|grok|grok-api|agy|gemini-api>
45
+ Provider (oauth = GPT OAuth; grok = xAI Grok; agy/gemini-api = Gemini)
45
46
  --mode <auto|direct> Prompt handling mode. Default: auto
46
47
  --ref <file> Attach reference image (repeatable, max 5)
47
48
  --reasoning-effort <none|low|medium|high|xhigh>
@@ -60,11 +61,11 @@ export default async function multimodeCmd(argv) {
60
61
  const prompt = args.positional.join(" ");
61
62
  if (!prompt)
62
63
  die(2, "prompt required");
63
- const VALID_PROVIDERS = new Set(["auto", "oauth", "api", "grok"]);
64
+ const VALID_PROVIDERS = new Set(["auto", "oauth", "api", "grok", "grok-api", "agy", "gemini-api"]);
64
65
  const VALID_MODES = new Set(["auto", "direct"]);
65
66
  const VALID_REASONING = new Set(["none", "low", "medium", "high", "xhigh"]);
66
67
  if (args.provider && !VALID_PROVIDERS.has(String(args.provider))) {
67
- die(2, "--provider must be one of: auto, oauth, api, grok");
68
+ die(2, "--provider must be one of: auto, oauth, api, grok, grok-api, agy, gemini-api");
68
69
  }
69
70
  if (!VALID_MODES.has(String(args.mode)))
70
71
  die(2, "--mode must be one of: auto, direct");
@@ -8,11 +8,11 @@ const HELP = `
8
8
  ima2 node <subcommand> [options]
9
9
 
10
10
  Subcommands:
11
- generate <prompt...> [--parent <nodeId>] [--ref <file>...] [--provider <auto|oauth|api|grok>] [--no-stream] [...gen-style flags]
11
+ generate <prompt...> [--parent <nodeId>] [--ref <file>...] [--provider <auto|oauth|api|grok|grok-api|agy|gemini-api>] [--no-stream] [...gen-style flags]
12
12
  show <nodeId> [--json]
13
13
 
14
14
  Generate options:
15
- --provider <auto|oauth|api|grok> Provider for this request; grok uses progrok proxy
15
+ --provider <auto|oauth|api|grok|grok-api|agy|gemini-api> Provider for this request
16
16
  `;
17
17
  const GEN_FLAGS = {
18
18
  quality: { short: "q", type: "string", default: "low" },
@@ -58,10 +58,10 @@ async function generateSub(argv) {
58
58
  if (!prompt)
59
59
  die(2, "prompt required");
60
60
  const refs = (Array.isArray(args.ref) ? args.ref : []);
61
- const VALID_PROVIDERS = new Set(["auto", "oauth", "api", "grok"]);
61
+ const VALID_PROVIDERS = new Set(["auto", "oauth", "api", "grok", "grok-api", "agy", "gemini-api"]);
62
62
  const VALID_REASONING = new Set(["none", "low", "medium", "high", "xhigh"]);
63
63
  if (args.provider && !VALID_PROVIDERS.has(String(args.provider))) {
64
- die(2, "--provider must be one of: auto, oauth, api, grok");
64
+ die(2, "--provider must be one of: auto, oauth, api, grok, grok-api, agy, gemini-api");
65
65
  }
66
66
  if (args["reasoning-effort"] && !VALID_REASONING.has(String(args["reasoning-effort"]))) {
67
67
  die(2, "--reasoning-effort must be one of: none, low, medium, high, xhigh");
package/bin/ima2.js CHANGED
@@ -288,6 +288,7 @@ function showHelp() {
288
288
  cancel <id> Mark an in-flight job canceled (ima2 cancel --help)
289
289
  inflight <sub> Inflight jobs (ls / rm) (ima2 inflight --help)
290
290
  storage <sub> Storage status / open-dir (ima2 storage --help)
291
+ backfill-thumbs Generate missing thumbnails for gallery performance
291
292
  billing API usage / quota
292
293
  providers Configured providers
293
294
  oauth <sub> GPT OAuth proxy status (ima2 oauth --help)
@@ -332,7 +333,7 @@ if (args.includes("-v") || args.includes("--version")) {
332
333
  process.exit(0);
333
334
  }
334
335
  if ((!command || args.includes("-h") || args.includes("--help"))
335
- && !["doctor", "gen", "video", "edit", "ls", "show", "ps", "cancel", "session", "history", "prompt", "multimode", "node", "annotate", "canvas-versions", "metadata", "comfy", "cardnews", "inflight", "storage", "billing", "providers", "oauth", "grok", "config", "defaults", "capabilities", "skill", "ping"].includes(command)) {
336
+ && !["doctor", "gen", "video", "edit", "ls", "show", "ps", "cancel", "session", "history", "prompt", "multimode", "node", "annotate", "canvas-versions", "metadata", "comfy", "cardnews", "inflight", "storage", "billing", "providers", "oauth", "grok", "config", "defaults", "capabilities", "skill", "ping", "backfill-thumbs"].includes(command)) {
336
337
  showHelp();
337
338
  process.exit(command ? 0 : 1);
338
339
  }
@@ -406,6 +407,11 @@ switch (command) {
406
407
  await mod.default(args.slice(1));
407
408
  break;
408
409
  }
410
+ case "backfill-thumbs": {
411
+ const { backfillThumbs } = await import("./commands/backfillThumbs.js");
412
+ await backfillThumbs();
413
+ break;
414
+ }
409
415
  case "storage":
410
416
  case "billing":
411
417
  case "providers":
@@ -102,7 +102,7 @@ export function envOverrideForKey(key) {
102
102
  return { envVar, value: String(process.env[envVar]) };
103
103
  }
104
104
  export function displayPath(p) {
105
- const home = process.env.HOME || "";
105
+ const home = process.env.HOME || process.env.USERPROFILE || "";
106
106
  return home && p.startsWith(home) ? p.replace(home, "~") : p;
107
107
  }
108
108
  export function restartNotice() {
package/docs/API.md CHANGED
@@ -10,11 +10,14 @@ http://localhost:3333
10
10
 
11
11
  ## Provider Policy
12
12
 
13
- Image generation supports OAuth, API-key, and Grok providers.
13
+ Image generation supports OAuth, API-key, Grok (`grok` proxy or direct `grok-api`), and Gemini (`agy` CLI or direct `gemini-api`) providers.
14
14
 
15
15
  - `provider: "oauth"` uses the local Codex OAuth proxy.
16
16
  - `provider: "api"` uses the OpenAI Responses API with the hosted `image_generation` tool.
17
17
  - `provider: "grok"` uses the bundled progrok xAI proxy. Classic, Node, and Agent generation run mandatory xAI Web Search through `/v1/responses`, then run a `grok-4.3` planner call with a forced local `generate_image` function, then ima2 executes xAI `/v1/images/generations`. If reference images, a Node parent image, or an Agent current image are attached, the final step switches to xAI `/v1/images/edits` so image-to-image context is preserved.
18
+ - `provider: "agy"` spawns the Antigravity CLI (`agy -p`) to generate images via Google Gemini's `default_api:generate_image` tool. Model is `nano-banana-2`. Output is fixed at 1024×1024 JPEG. Max 3 reference images (i2i). No web search, quality, size, or mask controls. Multimode returns a single image. Video is unsupported (`AGY_VIDEO_UNSUPPORTED`).
19
+ - `provider: "grok-api"` uses a direct xAI API key instead of the bundled progrok OAuth proxy. Same pipeline as `grok` (Web Search → planner → `/v1/images/generations`), same aspect ratio and resolution options. Requires an xAI API key configured via the web UI key management or `XAI_API_KEY` env var. Also supports video generation.
20
+ - `provider: "gemini-api"` calls the Google Generative Language API directly (or Vertex AI with a service account JSON). Supports models `nano-banana-2` (Gemini 3.1 Flash Image) and `nano-banana-pro` (Gemini 3 Pro Image). Supports variable aspect ratios and resolutions (512px–4K). Requires a Gemini API key (configured via the web UI key management or the `GEMINI_API_KEY` env var) or a Vertex AI service account JSON. No web search or mask controls.
18
21
  - API-key generation covers classic generate, edit, mask-guided edit, multimode, and node generation.
19
22
  - If `provider: "api"` is requested without an API key, routes fail before upstream with `401` and `API_KEY_REQUIRED`.
20
23
  - Grok generation maps `size` to xAI `aspect_ratio` and `resolution`; it does not send an OpenAI-style `size` field upstream. Grok edit uses xAI `/v1/images/edits`; Grok mask edit remains unsupported and returns `GROK_MASK_UNSUPPORTED`.
@@ -100,7 +103,8 @@ Text-to-image and reference-guided root generation.
100
103
  "provider": "oauth",
101
104
  "model": "gpt-5.4",
102
105
  "references": [],
103
- "requestId": "optional-client-id"
106
+ "requestId": "optional-client-id",
107
+ "storyboard": false
104
108
  }
105
109
  ```
106
110
 
@@ -108,6 +112,9 @@ Supported quality values: `low`, `medium`, `high`.
108
112
 
109
113
  Supported moderation values: `auto`, `low`.
110
114
 
115
+ When `storyboard` is `true`, the server prepends storyboard keyframe instructions so image
116
+ generations maintain character and scene continuity for multi-shot video production.
117
+
111
118
  Recommended model: `gpt-5.4`. Current app default: `gpt-5.4-mini`. `gpt-5.5` is the strongest quality option when supported, but callers should expect higher quota pressure and possible Codex CLI/backend capability requirements.
112
119
 
113
120
  When `provider` is `"grok"`, supported models are `grok-imagine-image` and
@@ -267,13 +274,17 @@ Generate a video via the Grok video provider. Returns Server-Sent Events.
267
274
  | `referenceFilenames` | string[] | — | Existing generated files for reference-to-video |
268
275
  | `continueFromVideo` | string | — | Generated `.mp4` parent; server extracts its last frame and rebuilds lineage from sidecar |
269
276
  | `continuityLineage` | object | — | Optional client hint; used only when `continueFromVideo` is absent |
277
+ | `plannerModel` | string | `grok-4.3` | Grok video planner model override (also via settings UI or `IMA2_GROK_PLANNER_MODEL`) |
278
+ | `storyboard` | boolean | `false` | Enable storyboard mode — maintains character/scene continuity across sequential clips |
270
279
 
271
280
  Blank prompts return `PROMPT_REQUIRED` with a `guidance` string. The active
272
281
  prompt should describe visual flow, motion flow, sound/music/no-music,
273
282
  dialogue/no-dialogue, ending frame, and duration pacing. The video planner uses
274
283
  the selected duration as the full clip runtime and expands short requests into a
275
284
  production-level sequence with opening composition, connected motion/emotion
276
- change, and a stable ending frame suitable for continuation.
285
+ change, and a stable ending frame suitable for continuation. For multi-character
286
+ scenes, the planner identifies speakers by visual appearance (clothing, physique,
287
+ position, props) rather than names, and attributes each dialogue line accordingly.
277
288
 
278
289
  When `continueFromVideo` is present, the server treats the generated `.mp4`
279
290
  sidecar as authoritative. Client `continuityLineage` cannot override it. The
@@ -313,7 +324,7 @@ Grok prompt surfaces used by video APIs:
313
324
 
314
325
  | Surface | Model | Responsibility |
315
326
  |---|---|---|
316
- | Video planner | `grok-4.3` | Converts user prompt, search context, refs, and optional continuity lineage into the final English video prompt. It must structure core subject, action/motion, camera/composition, environment/style, dialogue/audio, ending-frame handoff, and constraints. |
327
+ | Video planner | `grok-4.3` (override via `plannerModel`) | Converts user prompt, search context, refs, and optional continuity lineage into the final English video prompt. It must structure core subject, action/motion, camera/composition, environment/style, dialogue/audio, ending-frame handoff, and constraints. Multi-character dialogue uses appearance-based speaker identification. |
317
328
  | Video generation | xAI video model | Receives the planner prompt plus `sourceImage` or `referenceImages` when present. |
318
329
  | Video analysis | `grok-4.3` | Reads first/last frame images from `/api/video/analyze` and returns recreation/continuation guidance. |
319
330
 
@@ -461,6 +472,44 @@ Style-sheet extraction can require an API key/openai client. Image generation al
461
472
  | `GROK_RATE_LIMITED` | xAI returned a rate-limit response through progrok |
462
473
  | `GROK_AUTH_FAILED` | progrok could not authenticate the xAI request |
463
474
  | `GROK_SEARCH_TIMEOUT` / `GROK_PLANNER_TIMEOUT` / `GROK_IMAGE_TIMEOUT` | The Grok search, planner, or image API step exceeded its timeout budget |
475
+ | `AGY_GENERATION_FAILED` | Gemini (agy) image generation failed |
476
+ | `AGY_TIMEOUT` | Agy CLI process exceeded its 360-second timeout |
477
+ | `AGY_PROCESS_ERROR` | Agy CLI binary failed to start or crashed |
478
+ | `AGY_QUOTA_EXHAUSTED` | Gemini API quota exhausted (rate limit) |
479
+ | `AGY_PARSE_FAILED` | Could not parse artifact path from agy output |
480
+ | `AGY_ARTIFACT_NOT_FOUND` | Agy reported an artifact path that does not exist |
481
+ | `AGY_PATH_REJECTED` | Agy artifact path was outside allowed directories |
482
+ | `AGY_VIDEO_UNSUPPORTED` | Video generation is not supported by the Gemini (agy) provider |
483
+ | `AGY_MASK_UNSUPPORTED` | Mask-based editing is not supported by the Gemini (agy) provider |
484
+ | `AGY_REF_TOO_MANY` | Too many reference images for agy (max 3) |
485
+ | `GEMINI_API_KEY_MISSING` | Gemini API key or Vertex AI credentials not configured |
486
+ | `GEMINI_API_RATE_LIMITED` | Gemini API rate limited (429) |
487
+ | `GEMINI_API_BAD_REQUEST` | Gemini API bad request (400/403) |
488
+ | `GEMINI_API_SAFETY_BLOCKED` | Gemini API generation blocked by safety filter |
489
+ | `GEMINI_API_NO_IMAGE` | Gemini API returned no image in response |
490
+ | `VIDEO_PROVIDER_UNSUPPORTED` | Video generation requires provider `"grok"` or `"grok-api"` |
491
+
492
+ ## Key Management
493
+
494
+ API key management endpoints for configuring provider credentials at runtime through the web UI or HTTP API.
495
+
496
+ | Endpoint | Method | Description |
497
+ |---|---|---|
498
+ | `/api/keys/status` | GET | Returns configured/valid/maskedKey status for all providers (openai, xai, gemini, vertex) |
499
+ | `/api/keys/:provider` | PUT | Save an API key. Body: `{ "apiKey": "..." }`. Validates the key format and checks the key against the upstream provider before saving to config.json. Provider: `openai`, `xai`, or `gemini`. |
500
+ | `/api/keys/:provider` | DELETE | Remove a config-sourced API key. Env-sourced keys cannot be removed (`ENV_KEY_IMMUTABLE`). |
501
+ | `/api/keys/vertex` | PUT | Save a Vertex AI service account JSON. Body: `{ "serviceAccountJson": "..." }`. Validates JSON structure (`type: "service_account"`, `project_id` required). |
502
+ | `/api/keys/vertex` | DELETE | Remove a config-sourced Vertex AI service account. |
503
+
504
+ Keys saved via PUT are stored in `config.json` and hot-updated in the runtime context (no server restart required). Keys loaded from environment variables (`OPENAI_API_KEY`, `XAI_API_KEY`, `GEMINI_API_KEY`, `VERTEX_SERVICE_ACCOUNT_JSON`) take precedence and are immutable through the API.
505
+
506
+ ## Thumbnail Backfill
507
+
508
+ | Endpoint | Method | Description |
509
+ |---|---|---|
510
+ | `/api/history/backfill-thumbnails` | POST | Generate missing `.thumb.jpg` thumbnails for all images and videos in the generated directory. Returns `{ ok, total, created, skipped, failed }`. Also available offline via `ima2 backfill-thumbs`. |
511
+
512
+ Thumbnails are also generated automatically on server startup for any media files that lack them.
464
513
 
465
514
  ## Endpoint → CLI Mapping
466
515
 
@@ -499,6 +548,8 @@ Most server routes under `/api/*` have a CLI wrapper. The exception is **Agent M
499
548
  | `GET /api/billing` / `GET /api/providers` / `GET /api/oauth/status` / `GET /api/grok/status` | `ima2 billing` / `ima2 providers` / `ima2 oauth status` / `ima2 grok status` |
500
549
  | `GET /api/health` | `ima2 ping` |
501
550
  | `GET /api/capabilities` | `ima2 capabilities` |
551
+ | `POST /api/history/backfill-thumbnails` | `ima2 backfill-thumbs` |
552
+ | `GET /api/keys/status`, `PUT/DELETE /api/keys/:provider`, `PUT/DELETE /api/keys/vertex` | Web UI only (Settings > API Keys) |
502
553
  | `GET/POST/PATCH/DELETE /api/agent/*` (sessions, turns, queue) | — (Agent Mode; web UI only, no CLI) |
503
554
  | `POST /api/prompt-builder/chat` | `ima2 prompt build` |
504
555
 
package/docs/CLI.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # CLI Reference
2
2
 
3
- Most server routes under `/api/*` have a CLI wrapper; Agent Mode (`/api/agent/*`) is web-UI-only and has no `ima2` subcommand. The prompt builder HTTP route (`POST /api/prompt-builder/chat`) is available through `ima2 prompt build`. The CLI is a thin shell over the local server, so most commands require a running `ima2 serve` (the few exceptions — `serve`, `setup`, `doctor`, `status`, `open`, `reset`, `config`, `grok`, `skill`, `capabilities`, and local `defaults` inspection — work without a live server).
3
+ Most server routes under `/api/*` have a CLI wrapper; Agent Mode (`/api/agent/*`) is web-UI-only and has no `ima2` subcommand. The prompt builder HTTP route (`POST /api/prompt-builder/chat`) is available through `ima2 prompt build`. The CLI is a thin shell over the local server, so most commands require a running `ima2 serve` (the few exceptions — `serve`, `setup`, `doctor`, `status`, `open`, `reset`, `config`, `grok`, `skill`, `capabilities`, `backfill-thumbs`, and local `defaults` inspection — work without a live server).
4
4
 
5
5
  For a quick start, see the [main README](../README.md). For endpoint mapping, see [API.md](API.md).
6
6
 
@@ -16,6 +16,7 @@ For a quick start, see the [main README](../README.md). For endpoint mapping, se
16
16
  | `ima2 open` | Open the web UI in a browser |
17
17
  | `ima2 grok login/status/models/proxy` | Manage the bundled progrok runtime used by the Grok provider |
18
18
  | `ima2 reset` | Remove saved config |
19
+ | `ima2 backfill-thumbs` | Generate missing gallery thumbnails for images and videos (offline, no running server needed) |
19
20
 
20
21
  ## Common flags
21
22
 
@@ -53,13 +54,14 @@ Agents should start from the packaged skill and capability commands instead of g
53
54
  | `ima2 node generate` | Node-mode generate (SSE; supports `--no-stream`) |
54
55
  | `ima2 node show <nodeId>` | Read node metadata |
55
56
 
56
- Generation flags include `--provider <auto|oauth|api|grok>`, `--reasoning-effort {none\|low\|medium\|high\|xhigh}`, `--web-search` / `--no-web-search`, `--model`, `--mode`, `--moderation`, `--ref <file>` (repeatable, up to 5 where supported), `-q low|medium|high`, `-n <count>`, `-o <file>`.
57
+ Generation flags include `--provider <auto|oauth|api|grok|grok-api|agy|gemini-api>`, `--reasoning-effort {none\|low\|medium\|high\|xhigh}`, `--web-search` / `--no-web-search`, `--model`, `--mode`, `--moderation`, `--ref <file>` (repeatable, up to 5 where supported), `-q low|medium|high`, `-n <count>`, `-o <file>`.
57
58
 
58
59
  Provider override semantics:
59
60
 
60
61
  - `api` forces the API-key Responses path and requires a configured API key.
61
62
  - `oauth` forces the local OAuth proxy path.
62
63
  - `grok` uses the bundled progrok xAI proxy (`127.0.0.1:18645`). Classic generation first runs mandatory xAI Web Search through Responses API, then asks `grok-4.3` to call ima2's local `generate_image` tool, then ima2 executes xAI `/v1/images/generations`. If `--ref` images are attached, the final step uses xAI `/v1/images/edits` instead so image-to-image/reference context is preserved. Models: `grok-imagine-image`, `grok-imagine-image-quality`. Size is mapped to xAI `aspect_ratio` and `resolution`; the UI web-search toggle is OpenAI-provider-only because Grok search is always on in this path.
64
+ - `agy` spawns the Antigravity CLI to generate via Google Gemini (`nano-banana-2`). Fixed 1024×1024 JPEG output, max 3 refs. No web search, quality, size, or mask controls.
63
65
  - `auto` preserves route default behavior and currently resolves to GPT OAuth unless server routing changes.
64
66
 
65
67
  `ima2 serve` starts the bundled Grok proxy automatically. No separate `progrok`
@@ -105,7 +107,7 @@ mockup`.
105
107
  For dense or critical text, keep the text large and explicit. Exact placement,
106
108
  small text, and pixel-perfect typography can still need iteration or post-editing.
107
109
 
108
- Multimode-specific flags include `--max-images <1..8>`, `--ref <file>` (repeatable, max 5), `--mode <auto|direct>`, `--provider <auto|oauth|api|grok>`, and `--show-partial`. `ima2 edit --mask` remains intentionally deferred to #31 because current mask plumbing is guided edit rather than guaranteed true masked/inpaint semantics.
110
+ Multimode-specific flags include `--max-images <1..8>`, `--ref <file>` (repeatable, max 5), `--mode <auto|direct>`, `--provider <auto|oauth|api|grok|grok-api|agy|gemini-api>`, and `--show-partial`. `ima2 edit --mask` remains intentionally deferred to #31 because current mask plumbing is guided edit rather than guaranteed true masked/inpaint semantics.
109
111
 
110
112
  ## Video
111
113
 
@@ -126,6 +128,8 @@ Video generate flags:
126
128
  | `--resolution <480p\|720p>` | Video resolution (default: 480p) |
127
129
  | `--aspect-ratio <ratio\|auto>` | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3, auto (default: auto) |
128
130
  | `--model <name>` | `grok-imagine-video` or `grok-imagine-video-1.5-preview` |
131
+ | `--planner-model <name>` | Grok planner model override (default: `grok-4.3`; also configurable in the settings UI and via `IMA2_GROK_PLANNER_MODEL`) |
132
+ | `--storyboard` | Enable storyboard mode — maintains character/scene continuity across sequential clips |
129
133
  | `--ref <file>` | Attach source/reference image (repeatable, max 7) |
130
134
  | `-o, --out <file>` | Output file path |
131
135
  | `-d, --out-dir <dir>` | Output directory |
@@ -160,6 +164,8 @@ Video continue flags:
160
164
  | `--aspect-ratio <ratio\|auto>` | New clip aspect ratio |
161
165
  | `--model <name>` | Optional video generation model |
162
166
 
167
+ Video continue also accepts `--planner-model` and `--storyboard`.
168
+
163
169
  Video mode is auto-detected from `--ref` count:
164
170
 
165
171
  | Refs | Mode |
@@ -12,10 +12,12 @@ you want a reproducible way to report a workspace issue.
12
12
  | Area | What it does | Notes |
13
13
  |---|---|---|
14
14
  | Composer | Holds the prompt for the next request. | Selecting an existing image is view-only. It should not overwrite the composer. |
15
+ | Storyboard | Maintains character and scene continuity across sequential frames. | Toggle in the composer. Works for image and video generation; image keyframes are composed for video production. |
15
16
  | Multimode | Starts several separate image requests from the current prompt. | Each slot is a candidate output, not a collage panel or a guaranteed scene sequence. |
16
17
  | 1:1 Direct | Sends the prompt through with less rewriting by the app. | Use it for exact wording, strict prompt experiments, or provider-side prompt syntax. |
17
18
  | Model quick menu | Changes the image model and reasoning effort from the sidebar header. | The full Settings workspace remains the detailed configuration page. |
18
- | Recent generations | Shows the visible Prompt Studio history domain. | Arrow keys move inside the same visible recent domain instead of hidden older rows. Video items render as video thumbnails. Drag any thumbnail to the composer to add it as a reference image. |
19
+ | Recent generations | Shows the visible Prompt Studio history domain. | Arrow keys move inside the same visible recent domain instead of hidden older rows. Video items render as video thumbnails. Drag any thumbnail to the composer to add it as a reference image. Video results expose First, Mid, and Last frame buttons to copy keyframes. |
20
+ | Video settings | Controls Grok video duration, resolution, aspect ratio, and planner model. | Default planner model is `grok-4.3`; override per request when needed. |
19
21
  | Gallery | Browses saved local images, All/Favorites tabs, and folders. | Favorite toggles should preserve the gallery viewport you were browsing. |
20
22
  | Prompt library | Imports saved prompt text into the composer intentionally. | Library insert/continue actions are explicit prompt imports; passive image selection is not. |
21
23
 
@@ -4,7 +4,7 @@ Generated by `npm run test:inventory` (script: `scripts/classify-tests.mjs`).
4
4
 
5
5
  _Tests considered "runtime-importing" if they import from `../lib/`, `../routes/`, `../bin/`, `../server`, or `../config`._
6
6
 
7
- Total: 175 (runtime: 60, contract: 115)
7
+ Total: 177 (runtime: 61, contract: 116)
8
8
 
9
9
  ## Runtime-importing tests
10
10
  - `tests/agent-mode-auto-planner-contract.test.ts`
@@ -64,6 +64,7 @@ Total: 175 (runtime: 60, contract: 115)
64
64
  - `tests/star-prompt.test.ts`
65
65
  - `tests/storage-migration.test.ts`
66
66
  - `tests/style-sheet.test.ts`
67
+ - `tests/thumb-backfill.test.ts`
67
68
  - `tests/videoContinuity.test.ts`
68
69
  - `tests/videoExtendedRoute.test.ts`
69
70
  - `tests/videoRoute.test.ts`
@@ -155,6 +156,7 @@ Total: 175 (runtime: 60, contract: 115)
155
156
  - `tests/node-layout-contract.test.js`
156
157
  - `tests/node-pending-recovery-contract.test.js`
157
158
  - `tests/node-regen-actions-contract.test.js`
159
+ - `tests/node-session-evaporation-contract.test.js`
158
160
  - `tests/node-ui-contract.test.js`
159
161
  - `tests/oauth-masked-edit-contract.test.js`
160
162
  - `tests/oauth-proxy-edit-mask-contract.test.js`
@@ -10,6 +10,7 @@ import { detectImageMimeFromB64 } from "./refs.js";
10
10
  import { resolveProviderOptions } from "./providerOptions.js";
11
11
  import { generateViaResponses } from "./responsesImageAdapter.js";
12
12
  import { generateViaGrok } from "./grokImageAdapter.js";
13
+ import { generateViaAgy } from "./agyImageAdapter.js";
13
14
  import { generateVideoViaGrok } from "./grokVideoAdapter.js";
14
15
  import { parseVideoParams } from "./agentGenerationPlanner.js";
15
16
  import { appendAgentTurn, buildImageContextManifest, getAgentImages, getAgentSession, importAgentImage, recordAgentWebFinding, restartAgentRuntimeSession, } from "./agentStore.js";
@@ -46,7 +47,7 @@ export async function runAgentGenerationPlan(ctx, sessionId, prompt, plan, optio
46
47
  const session = getAgentSession(sessionId);
47
48
  if (!session)
48
49
  throw notFound(sessionId);
49
- const webSearchEnabled = options.provider === "grok" ? true : options.webSearchEnabled ?? session.webSearchEnabled;
50
+ const webSearchEnabled = options.provider === "agy" ? false : options.provider === "grok" ? true : options.webSearchEnabled ?? session.webSearchEnabled;
50
51
  const enabledTools = webSearchEnabled
51
52
  ? [...AGENT_ALLOWED_TOOLS]
52
53
  : ["ima2.get_image_context", "ima2.generate_image", "ima2.generate_video"];
@@ -234,21 +235,26 @@ async function generateAgentImage(ctx, sessionId, prompt, manifest, webSearchEna
234
235
  const effectiveModel = activeProvider === "grok" && options.quality === "high"
235
236
  ? "grok-imagine-image-quality"
236
237
  : providerOptions.model;
237
- const response = activeProvider === "grok"
238
- ? await generateViaGrok(`${manifest}\n\nUser request:\n${prompt}`, ctx, {
239
- model: effectiveModel,
240
- size: providerOptions.size,
238
+ const response = activeProvider === "agy"
239
+ ? await generateViaAgy(`${manifest}\n\nUser request:\n${prompt}`, {
241
240
  requestId,
242
241
  signal: options.signal ?? undefined,
243
- references: await loadAgentCurrentImageReferences(ctx, sessionId),
244
242
  })
245
- : await generateViaResponses(activeProvider, `${manifest}\n\nUser request:\n${prompt}`, options.quality ?? "medium", providerOptions.size, options.moderation ?? "low", [], requestId, "auto", ctx, {
246
- model: providerOptions.model,
247
- reasoningEffort: providerOptions.reasoningEffort,
248
- webSearchEnabled,
249
- signal: options.signal,
250
- });
251
- const format = activeProvider === "grok"
243
+ : activeProvider === "grok"
244
+ ? await generateViaGrok(`${manifest}\n\nUser request:\n${prompt}`, ctx, {
245
+ model: effectiveModel,
246
+ size: providerOptions.size,
247
+ requestId,
248
+ signal: options.signal ?? undefined,
249
+ references: await loadAgentCurrentImageReferences(ctx, sessionId),
250
+ })
251
+ : await generateViaResponses(activeProvider, `${manifest}\n\nUser request:\n${prompt}`, options.quality ?? "medium", providerOptions.size, options.moderation ?? "low", [], requestId, "auto", ctx, {
252
+ model: providerOptions.model,
253
+ reasoningEffort: providerOptions.reasoningEffort,
254
+ webSearchEnabled,
255
+ signal: options.signal,
256
+ });
257
+ const format = activeProvider === "grok" || activeProvider === "agy"
252
258
  ? imageFormatFromMime(("mime" in response ? response.mime : undefined) || detectImageMimeFromB64(response.b64) || "image/jpeg")
253
259
  : options.format ?? "png";
254
260
  const image = await persistAgentImage(ctx, sessionId, prompt, format, requestId, response, {
@@ -430,13 +436,13 @@ async function persistAgentVideo(ctx, sessionId, prompt, requestId, result) {
430
436
  function recordSearchFindings(sessionId, prompt, count, provider) {
431
437
  if (!count)
432
438
  return [];
433
- const isGrok = provider === "grok";
439
+ const providerLabel = provider === "grok" ? "Grok" : provider === "agy" ? "Gemini" : "Responses";
434
440
  return [
435
441
  recordAgentWebFinding({
436
442
  sessionId,
437
443
  query: prompt,
438
- title: isGrok ? "Grok visual research" : "Responses web_search",
439
- snippet: `${isGrok ? "Grok" : "Responses"} reported ${count} web search call${count === 1 ? "" : "s"}.`,
444
+ title: `${providerLabel} visual research`,
445
+ snippet: `${providerLabel} reported ${count} web search call${count === 1 ? "" : "s"}.`,
440
446
  }),
441
447
  ];
442
448
  }
@@ -1,4 +1,4 @@
1
- const PROVIDERS = new Set(["oauth", "api", "grok"]);
1
+ const PROVIDERS = new Set(["oauth", "api", "grok", "grok-api", "agy", "gemini-api"]);
2
2
  const QUALITIES = new Set(["low", "medium", "high"]);
3
3
  const FORMATS = new Set(["png", "jpeg", "webp"]);
4
4
  const MODERATIONS = new Set(["auto", "low"]);