@sogni-ai/sogni-creative-agent-skill 3.3.2 → 3.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -6
- package/SKILL.md +22 -14
- package/generated/creative-agent-runtime.mjs +7 -7
- package/llm.txt +3 -3
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -2
- package/skill-package.json +1 -1
- package/sogni-agent.mjs +81 -26
- package/version.mjs +1 -1
package/README.md
CHANGED
|
@@ -361,7 +361,7 @@ Run `sogni-agent --help` for the full CLI. Below are the options and tables most
|
|
|
361
361
|
| `--workflow-max-cost <n>`, `--confirm-cost`, `--no-confirm-cost` | Set durable workflow capacity ceiling and explicit cost confirmation |
|
|
362
362
|
| `--storyboard-frames <n>` | Beat count for `--api-workflow storyboard-video` |
|
|
363
363
|
| `--video-prompt`, `--negative-prompt`, `--generate-audio`, `--expand-prompt` | Generated-keyframe durable workflow step controls |
|
|
364
|
-
| `--watch-workflow`, `--list-workflows`, `--get-workflow <id>`, `--workflow-events <id>`, `--stream-workflow <id>`, `--cancel-workflow <id>` | Manage durable workflows |
|
|
364
|
+
| `--watch-workflow`, `--list-workflows`, `--get-workflow <id>`, `--workflow-events <id>`, `--stream-workflow <id>`, `--cancel-workflow <id>`, `--resume-workflow <id>` | Manage durable workflows |
|
|
365
365
|
| `--api-tools <mode>`, `--no-api-tool-execution`, `--llm-model <id>`, `--task-profile <profile>`, `--max-tokens <n>`, `--thinking` / `--no-thinking`, `--api-base-url <url>` | Tune hosted API requests |
|
|
366
366
|
| `--list-api-models`, `--get-api-model <id>` | Inspect Sogni Intelligence LLM models |
|
|
367
367
|
| `--list-replays [n]`, `--get-replay <id>`, `--ingest-replay <json\|@path>` | Manage Sogni Intelligence replay records (use `@path` to load JSON from a file) |
|
|
@@ -415,6 +415,7 @@ Music generation uses `--music` and outputs `mp3` by default. `--audio` remains
|
|
|
415
415
|
- **WAN models** use dimensions divisible by 16, min 480 px, max 1536 px.
|
|
416
416
|
- **LTX family** (`ltx2-*`, `ltx23-*`) uses dimensions divisible by 64. The current wrapper caps non-WAN video dimensions at 2048 px on the long side.
|
|
417
417
|
- **Seedance** runs at fixed 24 fps and supports 4–15 s durations. Other default/WAN paths support up to 10 s; LTX and WAN animate workflows support up to 20 s.
|
|
418
|
+
- For spoken dialogue, budget roughly 3 words per second plus about 1 second for each meaningful acting beat or pause. Keep quoted speech under the model's hard per-clip word budget.
|
|
418
419
|
- The script auto-normalizes video sizes to satisfy these constraints.
|
|
419
420
|
- Use `--target-resolution <px>` for bare resolution requests like "720p" — it targets the short side and preserves the inherited aspect ratio.
|
|
420
421
|
- Natural-language aspect requests like "portrait", "square", "16:9", or "9:16" are inferred when width/height aren't explicitly set. Combined requests like "720p 9:16" keep the requested short side while applying the requested shape.
|
|
@@ -489,7 +490,7 @@ sogni-agent --persona-list
|
|
|
489
490
|
sogni-agent --persona-remove "Mark"
|
|
490
491
|
```
|
|
491
492
|
|
|
492
|
-
Stored at `~/.config/sogni/personas/`.
|
|
493
|
+
Stored at `~/.config/sogni/personas/`. Personas resolve by explicit saved name, id, or tag/alias; relationship phrases are not treated as persona identifiers.
|
|
493
494
|
|
|
494
495
|
### Memory (persistent preferences)
|
|
495
496
|
|
|
@@ -530,8 +531,8 @@ Hosted API modes require `SOGNI_API_KEY`.
|
|
|
530
531
|
- **`--api-workflow storyboard-video`** generates a storyline, creates a single GPT Image 2 storyboard sheet, then passes that artifact into Seedance as the video reference. The `-Q fast|hq|pro` preset maps to GPT Image 2 low/medium/high quality for that storyboard sheet.
|
|
531
532
|
- **Media references** from `-c`, `--ref`, `--ref-end`, `--ref-audio`, `--reference-audio-identity`, and `--ref-video` are forwarded as `media_references` metadata in hosted API requests. API chat also attaches image refs as vision inputs. Local file references are uploaded to Sogni media storage first, then forwarded as retrievable URLs so durable executors do not depend on `data:` URI support. Durable workflow JSON can bind those references into step arguments with `sourceStepId: "$input_media"`. Use direct CLI mode for private media that must not leave the local machine.
|
|
532
533
|
- **Cost controls** use `--workflow-max-cost <n>` to reject workflow starts above a capacity-unit ceiling, and `--confirm-cost` / `--no-confirm-cost` to forward explicit billing confirmation.
|
|
533
|
-
- Manage runs with `--watch-workflow`, `--workflow-events`, `--stream-workflow`, `--list-workflows`, `--get-workflow`, and `--
|
|
534
|
-
- **Replay records** use `/v1/replay/records`: `--list-replays [limit]`, `--get-replay <runId>`, and `--ingest-replay <json
|
|
534
|
+
- Manage runs with `--watch-workflow`, `--workflow-events`, `--stream-workflow`, `--list-workflows`, `--get-workflow`, `--cancel-workflow`, and `--resume-workflow`. Use `--workflow-input` to provide exact durable workflow JSON.
|
|
535
|
+
- **Replay records** use `/v1/replay/records`: `--list-replays [limit]`, `--get-replay <runId>`, and `--ingest-replay <json|@path>` expose redacted RunRecord storage for Sogni Intelligence replay/debug viewers.
|
|
535
536
|
|
|
536
537
|
Override the API origin with `--api-base-url`, `SOGNI_API_BASE_URL`, or `SOGNI_REST_ENDPOINT`.
|
|
537
538
|
Hosted API credentials are only sent to `https://api.sogni.ai` by default. Add trusted custom
|
|
@@ -557,17 +558,26 @@ sogni-agent -n 4 "a {cat|dog} in a {garden|kitchen}"
|
|
|
557
558
|
|
|
558
559
|
Options cycle sequentially per image. Without `{...}` syntax, `-n` produces multiple images with the same prompt.
|
|
559
560
|
|
|
561
|
+
For video, use the same pattern when every output shares the same source/end assets and settings and only the prompt text varies:
|
|
562
|
+
|
|
563
|
+
```bash
|
|
564
|
+
sogni-agent --video --ref hero.png -n 3 --duration 5 \
|
|
565
|
+
"{the subject smiles and waves|the subject turns toward the window|the subject raises a hand in greeting}"
|
|
566
|
+
```
|
|
567
|
+
|
|
568
|
+
If each clip needs different source images, end frames, durations, audio slices, or other per-output settings, keep those as separate per-clip workflow arguments instead of collapsing them into a Dynamic Prompt branch.
|
|
569
|
+
|
|
560
570
|
---
|
|
561
571
|
|
|
562
572
|
## Token Auto-Fallback
|
|
563
573
|
|
|
564
|
-
Use `--token-type auto` to retry with SOGNI tokens when SPARK is insufficient:
|
|
574
|
+
Use `--token-type auto` to retry native Sogni models with SOGNI tokens when SPARK is insufficient:
|
|
565
575
|
|
|
566
576
|
```bash
|
|
567
577
|
sogni-agent --token-type auto "a dragon eating tacos"
|
|
568
578
|
```
|
|
569
579
|
|
|
570
|
-
Tries SPARK first (free daily tokens), then falls back to SOGNI if the balance is too low.
|
|
580
|
+
Tries SPARK first (free daily tokens), then falls back to SOGNI if the balance is too low. Vendor models such as Seedance and GPT Image 2 require Premium Spark eligibility and never use SOGNI fallback.
|
|
571
581
|
|
|
572
582
|
---
|
|
573
583
|
|
package/SKILL.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: sogni-creative-agent-skill
|
|
3
|
-
description: "Sogni Creative Agent Skill: agent skill and CLI for image, video, and music generation using Sogni AI's decentralized GPU network. Supports personas (named people with saved reference photos and voice clips), persistent memories
|
|
3
|
+
description: "Sogni Creative Agent Skill: agent skill and CLI for image, video, and music generation using Sogni AI's decentralized GPU network. Supports personas (named people with saved reference photos and voice clips), persistent memories, custom personality, style transfer, angle synthesis, Seedance/LTX/WAN video, music/lyrics, hosted chat, durable workflows, replay records, and multi-step creative workflows. Ask the agent to \"draw\", \"generate\", \"create an image\", \"make a video/animate\", \"make music\", \"apply a style\", or \"generate me as a superhero\"."
|
|
4
4
|
metadata:
|
|
5
|
-
version: "3.3.2"
|
|
5
|
+
version: "3.3.4"
|
|
6
6
|
homepage: https://sogni.ai
|
|
7
7
|
clawdbot:
|
|
8
8
|
emoji: "🎨"
|
|
@@ -165,7 +165,11 @@ sogni-agent -Q pro "a cat wearing a hat" # flux2_dev, 40 steps, 1024x1024 (
|
|
|
165
165
|
sogni-agent -n 3 "a {red|blue|green} sports car"
|
|
166
166
|
# → generates "a red sports car", "a blue sports car", "a green sports car"
|
|
167
167
|
|
|
168
|
-
#
|
|
168
|
+
# Prompt-only video takes from the same source image
|
|
169
|
+
sogni-agent --video --ref hero.png -n 3 --duration 5 \
|
|
170
|
+
"{the subject smiles and waves|the subject turns toward the window|the subject raises a hand in greeting}"
|
|
171
|
+
|
|
172
|
+
# Token auto-fallback for native Sogni models (tries SPARK, falls back to SOGNI)
|
|
169
173
|
sogni-agent --token-type auto "a cat wearing a hat"
|
|
170
174
|
|
|
171
175
|
# Save to file
|
|
@@ -732,7 +736,7 @@ For **any transition video work**, always use the **Sogni skill/plugin** (not ra
|
|
|
732
736
|
|
|
733
737
|
### Insufficient Funds Handling
|
|
734
738
|
|
|
735
|
-
Use `--token-type auto` to automatically retry with SOGNI tokens when SPARK is insufficient.
|
|
739
|
+
Use `--token-type auto` to automatically retry native Sogni models with SOGNI tokens when SPARK is insufficient. Vendor models such as Seedance and GPT Image 2 require Premium Spark eligibility and never fall back to SOGNI.
|
|
736
740
|
|
|
737
741
|
When you see **"Debit Error: Insufficient funds"** even with auto-fallback, reply:
|
|
738
742
|
|
|
@@ -876,6 +880,7 @@ Whenever the chosen video model is `ltx23-22b-fp8_t2v_distilled`, do not pass th
|
|
|
876
880
|
- Keep people, clothing, props, and locations concrete and stable across the whole paragraph.
|
|
877
881
|
- Give the scene one main action thread from start to finish. Use connectors like `as`, `while`, and `then` so motion reads as a continuous filmed moment.
|
|
878
882
|
- If the user asks for dialogue, embed the spoken words inline as prose and identify who is speaking and how they deliver the line.
|
|
883
|
+
- Budget spoken dialogue at about 3 words per second, plus about 1 second for each meaningful acting beat or pause.
|
|
879
884
|
- Express emotion through visible physical cues such as posture, grip, jaw tension, breathing, or pacing. Ambient sound can be woven into the prose naturally.
|
|
880
885
|
- Use positive phrasing only. Do not add negative prompts, "no ..." clauses, on-screen text/logo requests, vague filler words like `beautiful` or `nice`, or structural markup such as `[DIALOGUE]`.
|
|
881
886
|
- Keep action density proportional to duration. For short clips, describe one main beat rather than several separate events.
|
|
@@ -951,7 +956,7 @@ sogni-agent -q --video --ref /path/to/image.png -m ltx23-22b-fp8_i2v_distilled -
|
|
|
951
956
|
# Photobooth: stylize a face photo
|
|
952
957
|
sogni-agent -q --photobooth --ref /path/to/face.jpg -o /tmp/stylized.png "80s fashion portrait"
|
|
953
958
|
|
|
954
|
-
# Token auto-fallback (tries SPARK first, retries with SOGNI on insufficient balance)
|
|
959
|
+
# Token auto-fallback for native Sogni models (tries SPARK first, retries with SOGNI on insufficient balance)
|
|
955
960
|
sogni-agent -q --token-type auto -o /tmp/generated.png "user's prompt"
|
|
956
961
|
|
|
957
962
|
# Check current SPARK/SOGNI balances (no prompt required)
|
|
@@ -989,6 +994,15 @@ sogni-agent -q -n 4 "a portrait in {oil painting|watercolor|pencil sketch|pop ar
|
|
|
989
994
|
|
|
990
995
|
Options cycle sequentially per image. Without `{...}` syntax, `-n` generates multiple images with the same prompt.
|
|
991
996
|
|
|
997
|
+
For video, use the same `{...}` + `-n` pattern when all outputs share the same source image, end image, duration, audio, and settings and only prompt text varies:
|
|
998
|
+
|
|
999
|
+
```bash
|
|
1000
|
+
sogni-agent --video --ref hero.png -n 3 --duration 5 \
|
|
1001
|
+
"{the subject smiles and waves|the subject turns toward the window|the subject raises a hand in greeting}"
|
|
1002
|
+
```
|
|
1003
|
+
|
|
1004
|
+
If clips need different source images, end frames, durations, audio windows, or other per-output settings, keep them as separate per-clip workflow arguments. Do not force those into a single Dynamic Prompt branch.
|
|
1005
|
+
|
|
992
1006
|
### Token Auto-Fallback
|
|
993
1007
|
|
|
994
1008
|
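Use `--token-type auto` when the user's SPARK balance might be low. For native Sogni models, it tries SPARK first (free daily tokens) and automatically retries with SOGNI if the SPARK balance is insufficient. Vendor models such as Seedance and GPT Image 2 require Premium Spark eligibility and never fall back to SOGNI.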
|
|
@@ -1086,7 +1100,7 @@ Balance check example (`--json --balance`):
|
|
|
1086
1100
|
|
|
1087
1101
|
## Cost
|
|
1088
1102
|
|
|
1089
|
-
Uses Spark tokens from your Sogni account. 512x512 images are most cost-efficient. Use `--token-type auto` to automatically fall back to SOGNI tokens when SPARK is insufficient.
|
|
1103
|
+
Uses Spark tokens from your Sogni account. 512x512 images are most cost-efficient. Use `--token-type auto` to automatically fall back to SOGNI tokens for native Sogni models when SPARK is insufficient. Seedance and GPT Image 2 are vendor models and require Premium Spark eligibility; they never use SOGNI fallback.
|
|
1090
1104
|
|
|
1091
1105
|
## Persona System
|
|
1092
1106
|
|
|
@@ -1116,19 +1130,13 @@ sogni-agent --persona-remove "Mark"
|
|
|
1116
1130
|
|
|
1117
1131
|
### Persona Pipeline Rules
|
|
1118
1132
|
|
|
1119
|
-
When a user mentions a persona
|
|
1133
|
+
When a user mentions a persona by explicit saved name, id, or tag/alias:
|
|
1120
1134
|
|
|
1121
1135
|
1. **For images:** Use `--persona "Name" "prompt"` which auto-injects the persona's reference photo as context and selects the Qwen editing model
|
|
1122
1136
|
2. **For video with voice cloning:** The persona's voice clip is used as `--reference-audio-identity` when `--video` is combined with `--persona`
|
|
1123
1137
|
3. **For video without voice clip:** Describe the voice in the prompt ("speaks in a warm alto with a British accent")
|
|
1124
1138
|
|
|
1125
|
-
**
|
|
1126
|
-
- "me" / "myself" / "I" → persona with `relationship: self`
|
|
1127
|
-
- "my wife" / "my husband" / "my partner" → persona with `relationship: partner`
|
|
1128
|
-
- "my son" / "my daughter" / "my kid" → persona with `relationship: child`
|
|
1129
|
-
- "my dog" / "my cat" / "my pet" → persona with `relationship: pet`
|
|
1130
|
-
|
|
1131
|
-
**Important:** User-uploaded photos are NOT personas. Only use `--persona` when referring to a saved persona by name or pronoun. For ad-hoc photos, use `-c` (context image) directly.
|
|
1139
|
+
**Important:** User-uploaded photos are NOT personas. Only use `--persona` when referring to a saved persona by explicit name, id, or tag/alias. For ad-hoc photos, use `-c` (context image) directly.
|
|
1132
1140
|
|
|
1133
1141
|
## Memory System
|
|
1134
1142
|
|
|
@@ -2147,7 +2147,7 @@ const PROMPT_CONTRACTS = [
|
|
|
2147
2147
|
"contractId": "orbit_video_v1",
|
|
2148
2148
|
"version": "1.0.0",
|
|
2149
2149
|
"toolName": "orbit_video",
|
|
2150
|
-
"baseDescription": "orbit_video is a self-contained pipeline that handles angle generation, video transitions,\nand stitching internally. If the user uploaded an image, call orbit_video directly — it uses\nthe upload as the front view. If no image exists yet, generate
|
|
2150
|
+
"baseDescription": "orbit_video is a self-contained pipeline that handles angle generation, video transitions,\nand stitching internally. If the user uploaded an image, call orbit_video directly — it uses\nthe upload as the front view. If no image exists yet, generate one front-view image first,\nthen call orbit_video. Avoid pre-generating multiple angles or variations for orbit_video\nunless the user explicitly asked to review custom source angles before orbiting.\n\nORBIT DIALOGUE: When the user wants spoken dialogue in an orbit video, use the\ndialogue parameter rather than prompt. Dialogue goes in the specified segment — put\nmotion/foley in prompt. If the user says \"only in the first segment\" or \"just at the start\",\nset dialogueSegment=0 (default). Never put dialogue text in the prompt parameter — it will\nbe duplicated across all segments.\n\nORBIT ANGLES: Do NOT send the angles parameter for standard 360° orbits — omit it entirely.\nThe default (right side view, back view, left side view at 90° increments) is correct for all\nnormal orbit requests. Only send angles when the user explicitly asks for specific azimuth\npositions (e.g. \"show me from the front-right and back-left only\") or a partial orbit.\n\nORBIT DIALOGUE UPDATE: For dialogue in multiple/every orbit segment, before every 90-degree\nturn, or with per-turn sequence numbers, use the dialogues array instead of the single dialogue\nparameter. Default 360-degree orbit has 4 transitions, so provide 4 short lines in order; leave\nprompt for subject, action, ambient audio, and foley only. Preserve the real names from the\nrequest/prior result; do not invent placeholder speaker tags. For a couple/persona request\nphrased as \"us\", \"we\", or \"my wife and I\", each per-turn line should make the named people\nspeak together. 
When the user picks a generated image by 1-based number (\"number 3\",\n\"use #3\"), pass sourceImageIndex as that number minus one (number 3 -> sourceImageIndex=2)\ninstead of omitting it.",
|
|
2151
2151
|
"parameterDocs": {
|
|
2152
2152
|
"dialogue": "Spoken dialogue for the first/default orbit segment. Do NOT put dialogue in prompt — it repeats across all segments.",
|
|
2153
2153
|
"dialogues": "Per-segment dialogue lines array. Use for multi-segment dialogue (4 lines for full 360° orbit).",
|
|
@@ -2160,12 +2160,12 @@ const PROMPT_CONTRACTS = [
|
|
|
2160
2160
|
"contractId": "animate_photo_v1",
|
|
2161
2161
|
"version": "1.0.0",
|
|
2162
2162
|
"toolName": "animate_photo",
|
|
2163
|
-
"baseDescription": "animate_photo produces video from one or more source images using LTX 2.3.\n\nVIDEO PROMPT QUOTING: In video prompts, ONLY use double quotes for spoken dialogue.\nSpeaker tags are allowed outside the quotes for screenplay-style dialogue, e.g.\nCHARACTER: \"We made it.\" Never put on-screen text, overlay text, titles, captions, signs,\nwatermarks, or any visual text in quotes — describe them without quotes (e.g. bold white text\nreading CONGRATULATIONS overlays the lower third). Quotes signal speech to the model;\nquoting non-speech text confuses audio generation.\n\nDIALOGUE DURATION: Spoken dialogue in video prompts must fit the clip duration. Estimate\nat 2.5 words per second for natural cinematic delivery, plus ~1 second per acting beat\n(pauses, gestures, glances between lines). If the user did NOT explicitly request a specific\nduration (using default 5s), extend the duration to fit the dialogue (max 20s). If the user\nexplicitly requested a specific duration, condense the dialogue to fit while preserving meaning.\nAlways check: total dialogue words ÷ 2.5 + beat count ≤ clip duration.\n\nLATEST GENERATED IMAGE FOLLOW-UP: When the newest user turn asks to animate, make a video,\nor make a clip from a generated image/result (for example \"the apple\", \"this one\",\n\"the latest image\"), use animate_photo with that latest generated image. Do not inherit an\nolder Seedance model, resolution, or duration from an unrelated prior turn unless the newest\nuser turn explicitly says Seedance or confirms an immediately suggested Seedance video stage.\nLTX supports exact 2-20s durations, so honor requests like 3s exactly.\n\nWORD BUDGET PER CLIP: The handler REJECTS clips whose spoken dialogue exceeds the budget\n— there is NO auto-trim, so plan dialogue lengths up-front. Hard maximum is 3.75 spoken\nwords per second. Ceilings: 5s = 18 words, 6s = 22 words, 8s = 30 words, 10s = 37 words,\n15s = 56 words, 20s = 75 words. Aim below these ceilings. 
If a scene's dialogue won't fit,\ntighten the lines, raise the per-clip duration, or split into two segments — do NOT submit\nand hope it works. Spoken words inside double quotes count toward the budget; speaker tags\nand visual/action prose are free.\n\nBATCH VIDEO PER-CLIP DURATION: For a multi-segment animate_photo batch\n(sourceImageIndices + prompts) when the user states a TOTAL video length but NO per-clip\nlength, target 15 seconds per clip when dialogue is involved, and pass that duration\nexplicitly. Example: 60s total → 4 segments × 15s, NOT 6×10s or 12×5s. There is NO 3-clip\nbatch cap: sourceImageIndices supports up to 16 clips, so never split one planned batch into\n\"first 3\" and \"remaining clips\" calls. Do NOT split a planned 15s dialogue scene into multiple\nshorter clips just because a retry complains about word budget; keep duration=15 and tighten\nthe line. Use 5s clips only for single short motion beats or one very short spoken phrase.\nIf the user explicitly specifies a per-clip duration, honor that instead.\n\nN-VERSIONS-OF-A-VIDEO PATTERN: NEVER call animate_photo N times sequentially — ALWAYS\nuse sourceImageIndices in ONE call so all N projects run in parallel. Two flavors:\n(A) SHARED CONTENT — one edit_image/generate_image call with numberOfVariations=N + {|}\nDynamic Prompts to make N distinct source images, then ONE animate_photo call with\nsourceImageIndices=[start..start+N-1] and a single shared prompt.\n(B) PER-CLIP CONTENT — when each clip has DIFFERENT dialogue, jokes, narration, or motion,\npass BOTH sourceImageIndices AND prompts (array of N strings, one per clip) in the SAME\nsingle animate_photo call. The top-level prompt is still required — pass a brief batch summary.\n\nCRITICAL: sourceImageIndices values MUST be read from the latest edit_image/generate_image\ntool result's startIndex field — if startIndex=3 and 4 images were generated, pass\nsourceImageIndices=[3,4,5,6], NOT [0,1,2,3]. 
Negative indices refer to uploaded images:\n-1 first upload, -2 second upload, -3 third upload. Use repeated -1 entries only when\nintentionally reusing the primary uploaded image. When prompts is supplied, prompts.length\nMUST equal sourceImageIndices.length.\n\nSEEDANCE UPLOADED STORYBOARD DEFAULT: If the user uploaded a storyboard, shot sheet,\nor visual trailer board and asks to make a trailer/video/movie/clip from it, do NOT use\nanimate_photo on the board image and do NOT split it into four LTX clips. Use generate_video\nwith Seedance referenceImageIndices for one continuous clip unless the user explicitly asks\nfor separate LTX clips or first-frame/last-frame animation.\n\nSCREENPLAY / STORYBOARD ANIMATE RULE: For full storyboard projects, use one\nanimate_photo batch with sourceImageIndices + prompts so each clip keeps its own exact\nscene text, stable cast anchors, and screenplay-style speaker-tagged dialogue, and all video\nclips render in parallel. Every speaking clip's video prompt must include that clip's actual\nquoted dialogue, not placeholders such as \"while speaking\", \"dialogue begins\", \"explaining\",\nor \"final line lands\". If each generated scene keyframe should be both the first and last frame\nof its own stitched segment, call animate_photo with sourceImageIndices=[start..end],\nframeRole=\"both\", prompts=[...], and OMIT endImageIndex/endImageIndices so the handler\nuses each source as its own end frame.\n\nUPLOADED REFERENCE LOOPED SKITS: When the user supplies one uploaded reference image and\nasks for several scripted/storyboard/dialogue segments to reuse that same image as BOTH the\nfirst frame and last frame of each segment before stitching, do it in ONE animate_photo call:\nsourceImageIndices=[-1,-1,...], frameRole=\"both\", endImageIndex=-1 (or matching\nendImageIndices=[-1,-1,...]), duration equal to the requested per-segment duration, and\nprompts=[one full scene prompt per segment]. 
Each prompt must preserve the exact screenplay\nspeaker tags and quoted dialogue from that scene, e.g. HOST: \"...\" GUEST: \"...\". Do not\ndrop speaker tags, convert them to generic narration, omit the last-frame contract, analyze\nthe image first, generate new keyframes first, or split the batch into serial calls. After\nthe single animate_photo batch completes, call stitch_video with the returned video indices.\n\nFor adjacent transition chains: N images create N-1 clips — call animate_photo with\nframeRole=\"both\", sourceImageIndices=[start..end-1], endImageIndices=[start+1..end],\nprompts=[one transition prompt per adjacent pair], then stitch_video. If 5 uploaded images\nare the keyframe sequence, use sourceImageIndices=[-1,-2,-3,-4],\nendImageIndices=[-2,-3,-4,-5], frameRole=\"both\", prompts length 4, then stitch_video.\nDo NOT set endImageIndex=-1 in generated-keyframe patterns — that means every clip ends\non the primary uploaded image.\n\nUPLOADED FIRST-FRAME/LAST-FRAME TRANSITION CHAINS: If the user uploads multiple images\nand asks for a video that transitions from image to image, changes country/version every\nN seconds, or says to use first-frame/last-frame for each pair, call animate_photo directly.\nDo not call edit_image, generate_image, analyze_image, or map_assets_for_model first — the\nuploaded images are already the keyframes. For N uploaded images, create N-1 adjacent clips\nunless the user explicitly asks for a loop back to the first image. Use per-clip duration\nfrom \"every N seconds\" when present; otherwise divide the requested total by the number of\nadjacent clips. After animate_photo returns the batch videos, always call stitch_video with\nthose video indices before finalizing.",
|
|
2163
|
+
"baseDescription": "animate_photo produces video from one or more source images using LTX 2.3.\n\nVIDEO PROMPT QUOTING: In video prompts, ONLY use double quotes for spoken dialogue.\nSpeaker tags are allowed outside the quotes for screenplay-style dialogue, e.g.\nCHARACTER: \"We made it.\" Never put on-screen text, overlay text, titles, captions, signs,\nwatermarks, or any visual text in quotes — describe them without quotes (e.g. bold white text\nreading CONGRATULATIONS overlays the lower third). Quotes signal speech to the model;\nquoting non-speech text confuses audio generation.\n\nDIALOGUE DURATION: Spoken dialogue in video prompts must fit the clip duration. Estimate\nat 3 words per second for natural cinematic delivery, plus ~1 second per acting beat\n(pauses, gestures, glances between lines). If the user did NOT explicitly request a specific\nduration (using default 5s), extend the duration to fit the dialogue (max 20s). If the user\nexplicitly requested a specific duration, condense the dialogue to fit while preserving meaning.\nCheck: total dialogue words ÷ 3 + beat count ≤ clip duration.\n\nLATEST GENERATED IMAGE FOLLOW-UP: When the newest user turn asks to animate, make a video,\nor make a clip from a generated image/result (for example \"the apple\", \"this one\",\n\"the latest image\"), use animate_photo with that latest generated image. Do not inherit an\nolder Seedance model, resolution, or duration from an unrelated prior turn unless the newest\nuser turn explicitly says Seedance or confirms an immediately suggested Seedance video stage.\nLTX supports exact 2-20s durations, so honor requests like 3s exactly.\n\nWORD BUDGET PER CLIP: The handler REJECTS clips whose spoken dialogue exceeds the budget\n— there is NO auto-trim, so plan dialogue lengths up-front. Hard maximum is 3.75 spoken\nwords per second. Ceilings: 5s = 18 words, 6s = 22 words, 8s = 30 words, 10s = 37 words,\n15s = 56 words, 20s = 75 words. Aim below these ceilings. 
If a scene's dialogue won't fit,\ntighten the lines, raise the per-clip duration, or split into two segments — do NOT submit\nand hope it works. Spoken words inside double quotes count toward the budget; speaker tags\nand visual/action prose are free.\n\nBATCH VIDEO PER-CLIP DURATION: For a multi-segment animate_photo batch\n(sourceImageIndices + prompts) when the user states a TOTAL video length but NO per-clip\nlength, target 15 seconds per clip when dialogue is involved, and pass that duration\nexplicitly. Example: 60s total → 4 segments × 15s, NOT 6×10s or 12×5s. There is NO 3-clip\nbatch cap: sourceImageIndices supports up to 16 clips, so keep one planned batch together\nunless the user explicitly wants isolated projects or per-output settings require it.\nDo NOT split a planned 15s dialogue scene into multiple\nshorter clips just because a retry complains about word budget; keep duration=15 and tighten\nthe line. Use 5s clips only for single short motion beats or one very short spoken phrase.\nIf the user explicitly specifies a per-clip duration, honor that instead.\n\nN-VERSIONS-OF-A-VIDEO PATTERN: Avoid sequential animate_photo calls for N outputs.\nPrefer one Dynamic Prompt project when only prompt text varies, and reserve\nsourceImageIndices multi-project fan-out for source/end asset differences, isolated retry\nlifecycle, or other per-output parameter differences. Flavors:\n(A0) SAME SOURCE / PROMPT-ONLY TAKES — use sourceImageIndex, numberOfVariations=N,\nand ONE Dynamic Prompt branch in prompt: \"{full prompt 1|full prompt 2|...}\". 
This\nsubmits shared settings and source assets once; Sogni socket creates N jobs in one project.\n(A) SHARED CONTENT — one edit_image/generate_image call with numberOfVariations=N + {|}\nDynamic Prompts to make N distinct source images, then ONE animate_photo call with\nsourceImageIndices=[start..start+N-1] and a single shared prompt.\n(B) PER-CLIP ASSET WIRING — when each clip has DIFFERENT source images, end frames,\naudio windows, durations, dimensions, or other non-prompt parameters,\npass BOTH sourceImageIndices AND prompts (array of N strings, one per clip) in the SAME\nsingle animate_photo call. The top-level prompt is still required — pass a brief batch summary.\nFor explicit last/end-frame-only batches, reuse the image through sourceImageIndices but set\nframeRole=\"end\" and omit endImageIndex/endImageIndices. This means each listed image is the\nlast frame for its corresponding clip and no first/start frame is supplied.\nIf the user explicitly asks for Dynamic Prompt / Dynamic Template syntax, prefer flavor A0\nwhenever every output uses the same source/end assets and shared settings, even if they also\nask to stitch the completed clips afterward.\n\nCRITICAL: sourceImageIndices values MUST be read from the latest edit_image/generate_image\ntool result's startIndex field — if startIndex=3 and 4 images were generated, pass\nsourceImageIndices=[3,4,5,6], NOT [0,1,2,3]. Negative indices refer to uploaded images:\n-1 first upload, -2 second upload, -3 third upload. Use repeated -1 entries only when\nintentionally reusing the primary uploaded image. When prompts is supplied, prompts.length\nMUST equal sourceImageIndices.length.\n\nSEEDANCE UPLOADED STORYBOARD DEFAULT: If the user uploaded a storyboard, shot sheet,\nor visual trailer board and asks to make a trailer/video/movie/clip from it, do NOT use\nanimate_photo on the board image and do NOT split it into four LTX clips. 
Use generate_video\nwith Seedance referenceImageIndices for one continuous clip unless the user explicitly asks\nfor separate LTX clips or first-frame/last-frame animation.\n\nSCREENPLAY / STORYBOARD ANIMATE RULE: For full storyboard projects, use one\nanimate_photo batch with sourceImageIndices + prompts so each clip keeps its own exact\nscene text, stable cast anchors, and screenplay-style speaker-tagged dialogue, and all video\nclips render in parallel. Every speaking clip's video prompt must include that clip's actual\nquoted dialogue, not placeholders such as \"while speaking\", \"dialogue begins\", \"explaining\",\nor \"final line lands\". If each generated scene keyframe should be both the first and last frame\nof its own stitched segment, call animate_photo with sourceImageIndices=[start..end],\nframeRole=\"both\", prompts=[...], and OMIT endImageIndex/endImageIndices so the handler\nuses each source as its own end frame.\n\nUPLOADED REFERENCE LOOPED PROMPT-ONLY TAKES: When the user supplies one uploaded reference\nimage and asks for N clips that all reuse that same image as BOTH the first frame and last\nframe, with only the action/prompt text varying, do it in ONE Dynamic Prompt project:\nsourceImageIndex=-1, frameRole=\"both\", endImageIndex=-1, numberOfVariations=N, and prompt\nas ONE branch: \"{full prompt 1|full prompt 2|...|full prompt N}\". Do NOT use\nsourceImageIndices=[-1,-1,...] or prompts=[...] for this prompt-only shape. After the\nsingle animate_photo project completes, call stitch_video with the returned video indices\nif the user requested a final stitched video.\nEach Dynamic Prompt option must be a complete natural-language motion prompt for the video\nmodel. 
Do not include orchestration labels such as \"clip 1 of N\", \"overall request context\",\n\"use the uploaded image as the source frame\", \"follow the user request\", or \"make this clip\ndistinct\"; the attached first/last-frame image is already wired through arguments.\nKeep explicit shared constraints outside the branch before \"{...}\" and honor them inside every\noption: locked/static camera, subtle motion, consistent flames/embers, and silent expression-only\nphysical performance when requested. Non-speaking expressions such as yawns, smiles, kisses, or shy\nmouth-covering gestures are acceptable when they are the requested physical performance; do not\nturn them into speech, singing, or dialogue-like lip motion.\nIf the user gives avoid/no/don't constraints for a Dynamic Prompt batch, the shared prefix before\nthe branch must include their positive equivalents, such as single-subject empty background, clean\nblank surfaces, crisp sharp focus, same room/layout continuity, silent expression-only physical\nperformance, consistent flame/ember motion, and natural anatomically consistent hands. Do not place\nthose constraints only in negativePrompt or omit them from prompt.\nFor videoModel=\"wan22\", write motion-only visual prompts. WAN 2.2 does not generate audio,\nso omit soundtrack, ambience, room tone, music, hums, sighs, spoken words, voice, and SFX cues.\nFor videoModel=\"wan22\" and \"ltx23\", the prompt field is the positive prompt. Translate user\navoid/no/don't constraints into affirmative production constraints instead of copying negative\nphrasing into prompt. Examples: \"no people in background\" -> single subject focus with an empty\nbackground; \"no text\" -> clean blank surfaces; \"don't make it blurry\" -> crisp sharp focus;\n\"no weird hands\" -> natural anatomically consistent hands; \"don't change the room\" -> the same\nroom and layout remain consistent. 
Preserve exact quoted visible text or dialogue when the user\nexplicitly requests it, and keep surrounding surfaces blank.\n\nUPLOADED REFERENCE LOOPED SCRIPTED SKITS: When the user supplies one uploaded reference image\nand asks for several scripted/storyboard/dialogue segments to reuse that same image as BOTH\nthe first frame and last frame of each segment before stitching, do it in ONE animate_photo call:\nsourceImageIndices=[-1,-1,...], frameRole=\"both\", endImageIndex=-1 (or matching\nendImageIndices=[-1,-1,...]), duration equal to the requested per-segment duration, and\nprompts=[one full scene prompt per segment]. Each prompt must preserve the exact screenplay\nspeaker tags and quoted dialogue from that scene, e.g. HOST: \"...\" GUEST: \"...\". Do not\ndrop speaker tags, convert them to generic narration, omit the last-frame contract, analyze\nthe image first, generate new keyframes first, or split the batch into serial calls. After\nthe single animate_photo batch completes, call stitch_video with the returned video indices.\n\nFor adjacent transition chains: N images create N-1 clips — call animate_photo with\nframeRole=\"both\", sourceImageIndices=[start..end-1], endImageIndices=[start+1..end],\nprompts=[one transition prompt per adjacent pair], then stitch_video. 
If 5 uploaded images\nare the keyframe sequence, use sourceImageIndices=[-1,-2,-3,-4],\nendImageIndices=[-2,-3,-4,-5], frameRole=\"both\", prompts length 4, then stitch_video.\nDo NOT set endImageIndex=-1 in generated-keyframe patterns — that means every clip ends\non the primary uploaded image.\n\nUPLOADED FIRST-FRAME/LAST-FRAME TRANSITION CHAINS: If the user uploads multiple images\nand asks for a video that transitions from image to image, changes country/version every\nN seconds, or says to use first-frame/last-frame for each pair, call animate_photo directly.\nDo not call edit_image, generate_image, analyze_image, or map_assets_for_model first — the\nuploaded images are already the keyframes. For N uploaded images, create N-1 adjacent clips\nunless the user explicitly asks for a loop back to the first image. Use per-clip duration\nfrom \"every N seconds\" when present; otherwise divide the requested total by the number of\nadjacent clips. After animate_photo returns the batch videos, call stitch_video with\nthose video indices before finalizing unless the user explicitly asked to keep separate clips only.",
|
|
2164
2164
|
"parameterDocs": {
|
|
2165
|
-
"sourceImageIndices": "Batch source image indices. Read startIndex from prior generate_image/edit_image result. Negative = uploaded images (-1 = first upload).",
|
|
2165
|
+
"sourceImageIndices": "Batch source image indices. Read startIndex from prior generate_image/edit_image result. Negative = uploaded images (-1 = first upload). May be paired with frameRole=\"end\" only for explicit last/end-frame-only fan-out.",
|
|
2166
2166
|
"prompts": "Per-clip prompt array. Length MUST equal sourceImageIndices.length when both are set.",
|
|
2167
2167
|
"duration": "Per-clip duration in seconds. Target 15s when dialogue is involved and total length is given without per-clip spec.",
|
|
2168
|
-
"frameRole": "Set to \"both\" for first+last frame transitions using sourceImageIndices + endImageIndices.",
|
|
2168
|
+
"frameRole": "Set to \"end\" for explicit last/end-frame-only fan-out; set to \"both\" for first+last frame transitions using sourceImageIndices + endImageIndices.",
|
|
2169
2169
|
"endImageIndices": "End frames for adjacent-chain transitions. N images → N-1 clips."
|
|
2170
2170
|
}
|
|
2171
2171
|
},
|
|
@@ -2184,7 +2184,7 @@ const PROMPT_CONTRACTS = [
|
|
|
2184
2184
|
"contractId": "generate_video_v1",
|
|
2185
2185
|
"version": "1.1.0",
|
|
2186
2186
|
"toolName": "generate_video",
|
|
2187
|
-
"baseDescription": "generate_video produces text-to-video clips and Seedance multimodal reference videos.\nUse for text-only video generation with no source image input. For Seedance, also use this\ntool when uploaded/generated images, videos, or audio are loose references. Use animate_photo\nonly when a non-Seedance source image must become the first frame of an LTX/WAN animation.\n\nSEEDANCE UPLOADED STORYBOARD DEFAULT: When the user uploads a storyboard, shot sheet,\nmood board, or trailer concept image and asks to make a movie trailer/video/clip from it,\ndefault to one Seedance generate_video call with referenceImageIndices=[-1]. Do not first\nextract panels with edit_image, do not generate replacement keyframes, and do not make four\nseparate LTX animate_photo clips unless the user explicitly asks for separate clips or LTX.\nUse seedance2 when premium Spark access is available; if premium access is unavailable,\nexplain the limitation or use the best non-Seedance fallback the user accepts.\n\nEXACT / INCLUDED VIDEO PROMPTS: If the user asks for a Seedance video using uploaded or\ngenerated references and says to use a prompt exactly, pass only that literal quoted prompt\nto generate_video and set skipPromptProcessing=true plus expandPrompt=false. Do not treat\nwords inside the literal prompt, such as storyboard, script, thumbnails, or panels, as a\nrequest to create a storyboard image. If the user includes a timecoded script inside a\nvideo request, keep it in the generate_video prompt. Explicit constraints like no storyboard\npanels, no subtitles, or no captions are constraints on the video render, not instructions\nto call edit_image or generate_image.\n\nSTORYTELLING / COMMERCIAL / TRAILER PROMPTS: For creative video requests, turn the brief\ninto timed, causally connected visual beats before writing the final prompt. Default social\nvideo is 15s 9:16 with a strong first 1-2s, visible escalation, payoff, and brand/CTA/final\nimage. 
Commercials should show audience desire/problem, transformation, proof/benefit, and\nCTA. Trailers should follow hook → world → disruption → escalation → reveal → title/CTA.\nEvery beat must be generatable: subject, setting, action, camera, lighting, audio, and text\nrole where relevant. Avoid vague \"cinematic\" filler, feature dumps, and beautiful images with\nno visible change.\n\nVIDEO PROMPT QUOTING: ONLY use double quotes for spoken dialogue in video prompts. Never\nquote on-screen text, titles, captions, or visual text elements — describe them without\nquotes. Quotes signal speech to the model and confuse audio generation.\n\nSTORYBOARD TEXT: Structural headings, section numbers, slide titles, panel titles, and\ncaptions in storyboard references may become short audio-only narration/VO or\nkey-message beats, but they are not subtitles, title cards, lower thirds, or visible\noverlays unless the user explicitly asks for visible text, on-screen text, a title\ncard, subtitle, lower third, signage, or CTA. Keep narration as separate brief phrases\nwith pauses; do not concatenate storyboard labels into run-on voiceover.\n\nDIALOGUE DURATION: Spoken dialogue must fit the clip. Estimate
|
|
2187
|
+
"baseDescription": "generate_video produces text-to-video clips and Seedance multimodal reference videos.\nUse for text-only video generation with no source image input. For Seedance, also use this\ntool when uploaded/generated images, videos, or audio are loose references. Use animate_photo\nonly when a non-Seedance source image must become the first frame of an LTX/WAN animation.\n\nSEEDANCE UPLOADED STORYBOARD DEFAULT: When the user uploads a storyboard, shot sheet,\nmood board, or trailer concept image and asks to make a movie trailer/video/clip from it,\ndefault to one Seedance generate_video call with referenceImageIndices=[-1]. Do not first\nextract panels with edit_image, do not generate replacement keyframes, and do not make four\nseparate LTX animate_photo clips unless the user explicitly asks for separate clips or LTX.\nUse seedance2 when premium Spark access is available; if premium access is unavailable,\nexplain the limitation or use the best non-Seedance fallback the user accepts.\n\nEXACT / INCLUDED VIDEO PROMPTS: If the user asks for a Seedance video using uploaded or\ngenerated references and says to use a prompt exactly, pass only that literal quoted prompt\nto generate_video and set skipPromptProcessing=true plus expandPrompt=false. Do not treat\nwords inside the literal prompt, such as storyboard, script, thumbnails, or panels, as a\nrequest to create a storyboard image. If the user includes a timecoded script inside a\nvideo request, keep it in the generate_video prompt. Explicit constraints like no storyboard\npanels, no subtitles, or no captions are constraints on the video render, not instructions\nto call edit_image or generate_image.\n\nSTORYTELLING / COMMERCIAL / TRAILER PROMPTS: For creative video requests, turn the brief\ninto timed, causally connected visual beats before writing the final prompt. Default social\nvideo is 15s 9:16 with a strong first 1-2s, visible escalation, payoff, and brand/CTA/final\nimage. 
Commercials should show audience desire/problem, transformation, proof/benefit, and\nCTA. Trailers should follow hook → world → disruption → escalation → reveal → title/CTA.\nEvery beat must be generatable: subject, setting, action, camera, lighting, audio, and text\nrole where relevant. Avoid vague \"cinematic\" filler, feature dumps, and beautiful images with\nno visible change.\n\nVIDEO PROMPT QUOTING: ONLY use double quotes for spoken dialogue in video prompts. Never\nquote on-screen text, titles, captions, or visual text elements — describe them without\nquotes. Quotes signal speech to the model and confuse audio generation.\n\nSTORYBOARD TEXT: Structural headings, section numbers, slide titles, panel titles, and\ncaptions in storyboard references may become short audio-only narration/VO or\nkey-message beats, but they are not subtitles, title cards, lower thirds, or visible\noverlays unless the user explicitly asks for visible text, on-screen text, a title\ncard, subtitle, lower third, signage, or CTA. Keep narration as separate brief phrases\nwith pauses; do not concatenate storyboard labels into run-on voiceover.\n\nDIALOGUE DURATION: Spoken dialogue must fit the clip. Estimate 3 words per second\nnatural delivery plus ~1s per acting beat. Hard maximum 3.75 words/second.\nCheck: dialogue words ÷ 3 + beats ≤ duration. Do not submit oversized dialogue.\n\nLATEST USER DURATION WINS: In follow-up turns, use the newest duration the user states,\neven if a previous assistant message mentioned a longer script/runtime. For example, if\nhistory says \"the full script is 66 seconds\" but the user now says \"do a 30 second version\",\ngenerate the 30 second version. Do not ask a clarification question just because history\ncontains another duration; treat the latest user request as the override.\n\nSEEDANCE SHORT-DURATION LIMIT: Seedance supports 4-15s clips. If the user explicitly asks\nfor Seedance below 4s, do not silently round up. 
Ask whether they prefer a 4s Seedance clip\nor an exact-duration LTX clip. If the user did not explicitly ask for Seedance, choose the\nmodel/tool that can satisfy the requested duration exactly.",
|
|
2188
2188
|
"parameterDocs": {
|
|
2189
2189
|
"prompt": "Video prompt. Use double quotes ONLY for spoken dialogue. Describe visual text without quotes.",
|
|
2190
2190
|
"duration": "Clip duration in seconds. Plan dialogue word count against the 3.75 words/second ceiling."
|
|
@@ -2194,7 +2194,7 @@ const PROMPT_CONTRACTS = [
|
|
|
2194
2194
|
"contractId": "edit_image_v1",
|
|
2195
2195
|
"version": "1.0.0",
|
|
2196
2196
|
"toolName": "edit_image",
|
|
2197
|
-
"baseDescription": "edit_image applies instruction-based edits to uploaded or generated images. Use when\nuploaded or reference images must guide identity or likeness.\n\nImage-to-Image prompt order: [IDENTITY LOCK] → [REQUESTED EDIT] → [REFERENCE ROLE\nMAPPING] → [POSE/COMPOSITION] → [STYLE] → [LIGHTING/REALISM] → [PRESERVE ALL\nUNMENTIONED DETAILS]. GOLDEN RULE: When editing a person,
|
|
2197
|
+
"baseDescription": "edit_image applies instruction-based edits to uploaded or generated images. Use when\nuploaded or reference images must guide identity or likeness.\n\nImage-to-Image prompt order: [IDENTITY LOCK] → [REQUESTED EDIT] → [REFERENCE ROLE\nMAPPING] → [POSE/COMPOSITION] → [STYLE] → [LIGHTING/REALISM] → [PRESERVE ALL\nUNMENTIONED DETAILS]. GOLDEN RULE: When editing a person, state which image owns\nidentity so it is not ambiguous. Describe only the DELTA — what changes. Don't\nrewrite the entire image; the base image already contains most of the truth. Default to minimal\nchange. For multi-image edits, assign one primary role per reference image (identity, pose,\noutfit, style, environment). Do not let a style/pose/clothing reference silently override the face.\nUse positive constraints — \"preserve exact facial likeness, face structure, eye shape, nose\nshape, mouth shape, jawline, skin tone, hairline, apparent age, and overall recognizability\"\n— not vague negatives like \"don't mess up the face\".\n\nUPLOADED IMAGE VARIANT SETS: When the user supplies a photo/portrait/reference image and\nasks for N distinct generated images deriving from that source while changing paired\nper-output details, prefer one edit_image call with sourceImageIndex=-1,\nnumberOfVariations=N, and ONE Dynamic Prompt branch with N complete options. Each option\nmust be a full concrete image prompt for one output, including the uploaded subject/reference\nanchor, requested pose or placement preservation, the specific changed appearance/style/role,\nclothing or surface details when relevant, setting/background, and any requested label text or\nvisual symbol. 
If one option is a remade original/preserved source and the rest are themed\nvariants, the original option should explicitly say to preserve the original clothing/wardrobe/outfit\nand background/setting, plus any requested added label, flag, logo, symbol, or prop.\nDo not call generate_image, analyze_image, or multiple serial edit_image calls first.\n\nSELECTION-GATED IMAGE STAGES: If the user asks for N image options and says they will pick\none before a later dance/video/animation, prefer one edit_image call with numberOfVariations=N\nand one Dynamic Prompt branch. After images are created, stop and ask the user to choose\nunless the user explicitly asked to run the later stage immediately.\n\nMULTI-PERSONA (COMBINED): When multiple personas must appear in the SAME scene, make\none edit_image call with all persona faces in one prompt and omit personaName.\nPer-persona splits (one call each with personaName set) are RARE — only when the user\nexplicitly asks for solo images of each person individually.\n\nSTORYBOARD IMAGE BATCH RULE: When rendering scene keyframes from a screenplay/storyboard,\nnumberOfVariations is only the count; the prompt should be one Dynamic Prompt branch with one\nfull keyframe prompt per scene:\n{scene 1 full keyframe prompt|scene 2 full keyframe prompt|...|scene N full keyframe prompt}.\nDo not set numberOfVariations=N with only the first scene prompt — that creates N versions of\nscene 1. 
For full project requests, one edit_image batch for all scene keyframes, then one\nanimate_photo batch for all video clips in parallel.\nException: if the storyboard/shot sheet is already uploaded and the user asks to make a\ntrailer/video/movie/clip from that uploaded board, do not extract panels or redraw keyframes.\nUse generate_video with Seedance references for one continuous clip unless the user explicitly\nasks for separate image keyframes or a storyboard sheet output.\n\nDIRECT UPLOADED GPT IMAGE 2 STORYBOARD SHEETS: If the user uploaded reference images and\nasks for one finished GPT Image 2 storyboard/keyframe sheet now, call edit_image directly\nwith sourceImageIndex=-1, model=\"gpt-image-2\", numberOfVariations=1, and the requested\ncanvas/aspect settings. If the user did not explicitly specify a storyboard page/canvas/sheet\nshape, default the GPT Image 2 storyboard sheet pixel dimensions to a balanced grid that hosts\nthe target cell aspect ratio natively (e.g., 12 cells with 9:16 portrait video target -> ~3:4\nportrait sheet around 1728x2304; 12 cells with 16:9 landscape video target -> ~4:3 landscape\nsheet around 2304x1728; 6 cells with 9:16 target -> ~27:32 portrait sheet around 1840x2176). Do\nNOT default the sheet to 2560x1440 landscape when cells are portrait — a landscape sheet with\na portrait-cell grid physically forces cells to ~4:3 landscape and the model will not render\n9:16 portrait rectangles inside it. Keep individual scene-cell/frame areas at the target video\naspect ratio. Do not call map_assets_for_model, analyze_image, generate_image, or a separate\nplanning tool first. 
The uploaded files are already available as references; describe their\nroles plainly in the edit_image prompt and generate the sheet in that call.\n\nDO NOT USE edit_image FOR UPLOADED REFERENCE LOOPED VIDEO SEGMENTS: If the user says the\nsame uploaded image/reference should be reused as the first frame and last frame of each\nscripted segment/scene/clip before stitching, they are explicitly asking to animate the\nuploaded image, not to generate new storyboard keyframes. Do not call edit_image for that\nrequest. Call animate_photo once with repeated uploaded source indices and per-scene prompts.",
|
|
2198
2198
|
"parameterDocs": {
|
|
2199
2199
|
"sourceImageIndex": "Index of uploaded/generated image. Use -1 for the first uploaded image.",
|
|
2200
2200
|
"numberOfVariations": "Number of output variants. When > 1, use a Dynamic Prompt branch with one complete prompt per output.",
|
|
@@ -2205,7 +2205,7 @@ const PROMPT_CONTRACTS = [
|
|
|
2205
2205
|
"contractId": "generate_image_v1",
|
|
2206
2206
|
"version": "1.1.0",
|
|
2207
2207
|
"toolName": "generate_image",
|
|
2208
|
-
"baseDescription": "generate_image creates images from text descriptions. Use for text-only image generation;\nuse edit_image when uploaded or reference images must guide identity/likeness.\nException: Z-image and Z-image Turbo image-to-image/enhancement requests use generate_image\nwith model=\"z-turbo\" or model=\"z-image\", sourceImageIndex=-1, and starting_image_strength;\ndo not route explicit Z-image Turbo uploaded-image enhancement to edit_image because\nedit_image does not expose Z-image models.\n\nBATCH FAN-OUT (
|
|
2208
|
+
"baseDescription": "generate_image creates images from text descriptions. Use for text-only image generation;\nuse edit_image when uploaded or reference images must guide identity/likeness.\nException: Z-image and Z-image Turbo image-to-image/enhancement requests use generate_image\nwith model=\"z-turbo\" or model=\"z-image\", sourceImageIndex=-1, and starting_image_strength;\ndo not route explicit Z-image Turbo uploaded-image enhancement to edit_image because\nedit_image does not expose Z-image models.\n\nBATCH FAN-OUT DEFAULT (READ BEFORE ANYTHING ELSE BELOW):\nWhen the user explicitly asks for N images in the CURRENT turn, set numberOfVariations=N\nin one call. Avoid multiple serial generate_image calls unless the user explicitly wants\nindependent projects, isolated approvals, or per-output settings that cannot share one project.\nDo not omit numberOfVariations and try to \"generate the next one after this finishes\".\nTrigger phrasings:\n\"draw N\", \"make N\", \"give me N\", \"show me N\", \"render N\", \"create N\", \"generate N\",\n\"N more\", \"another N\", \"N as separate\", \"N separate images\", \"N different images\",\n\"N options\", \"N takes\", \"N versions\", \"N variations\", \"N pictures of\",\n\"all at the same time\", \"in parallel\", \"side by side as separate\".\n\nTHE PRIOR TURN DOES NOT ANCHOR THE CURRENT TURN. If the prior assistant turn used\nnumberOfVariations=1 with a composite \"N subjects in one image\" prompt, and the user\nnow says \"draw N more as separate images\" / \"as separate\" / \"separately\", DO NOT carry\nover numberOfVariations=1 from the prior call. The user is correcting that interpretation;\nset numberOfVariations=N for THIS call with one self-contained prompt per image via {|}\nDynamic Prompt branches. 
The new turn's count + separation language always wins over the\nprevious turn's pattern.\n\nWHEN BATCH FAN-OUT DOES NOT APPLY: numberOfVariations=1 with multiple subjects packed into\nONE prompt is correct only when the user clearly wants a SINGLE composite image (e.g.\n\"draw 2 goats in a meadow\" with no separation language, or explicit \"in one image\" / \"one\npicture of N\" / \"single image\" / \"composite\" / \"sheet\" / \"side-by-side composition\").\n\nFLUX.2 PROMPT ORDER: [SUBJECT] → [ATTRIBUTES] → [ACTION/POSE] → [CAMERA/FRAMING]\n→ [ENVIRONMENT] → [LIGHTING] → [STYLE/MEDIUM] → [MATERIALS/TEXTURES] →\n[SECONDARY DETAILS]. By default, start with the main subject and concrete observable\nattributes; use mood or atmosphere first only when the user explicitly asks for that shape.\nUse concrete nouns and observable adjectives — \"soft overcast daylight\" not \"nice lighting\".\nGood defaults when user is underspecified: medium shot for portraits, wide shot for\nenvironments, eye-level angle, soft natural light for realism.\n\nDYNAMIC PROMPTS: When numberOfVariations > 1, use Dynamic Prompt syntax to make each\nvariation meaningfully different — not just seed-different. Syntax: {a|b|c} cycles\nsequentially, {@a|b|c} picks randomly, {~a|b} paired cycling across groups. Rules: (1) Vary\nONLY what the user left unspecified — lock in everything they specified. (2) Match option\ncount to numberOfVariations so every result is unique. (3) Briefly tell the user what you're\nvarying without exposing raw {|} syntax unless the user asks to inspect the prompt.\n(4) Skip when: user wants consistency, prompt is fully\nspecified, user typed their own {|} syntax, or iterating on a specific result. (5) Do not put\nthe count or the word \"versions\"/\"variations\" inside the prompt — the prompt always describes\na single image. 
The multiplicity comes ONLY from numberOfVariations + the {|} syntax.\nLINKED VARIANTS: when multiple attributes must stay paired per result, use ONE top-level\nDynamic Prompt branch with one complete self-contained prompt per output. Do NOT split\nlinked fields into separate Dynamic Prompt groups.\n\nSELECTION-GATED IMAGE STAGES: If the user asks for N image options and says they will pick\none before a later dance/video/animation, call generate_image once with numberOfVariations=N.\nAfter images are created, stop and ask the user to choose unless the user explicitly asked\nto run the later stage immediately.\n\nIMAGE→VIDEO DIMENSION RULE: When generating an image that will feed into a video tool\n(animate_photo, sound_to_video, etc.), the image MUST be generated at the SAME aspect\nratio and dimensions as the target video. Default video aspect ratio is 16:9 landscape —\npass aspectRatio=\"16:9\" (or the user's specified/reference ratio) so the source image\nmatches the video output. Do not generate a square image for a widescreen video. Exception:\na composite GPT Image 2 storyboard/keyframe sheet for a later Seedance video is a board,\nnot a single source frame; unless the user explicitly specifies a storyboard page/canvas/sheet\nshape, default the sheet image dimensions to a balanced grid that hosts the target\nscene-cell/frame aspect natively (portrait video target -> portrait or square sheet whose\ncolumns x rows grid produces ~9:16 cells; landscape video target -> landscape sheet whose\nrows x columns grid produces ~16:9 cells). 
Each scene-cell/frame area preserves the target\nvideo aspect ratio.\n\nSTORYBOARD IMAGE BATCH RULE: When rendering scene keyframes from a screenplay/storyboard,\nnumberOfVariations is only the count; the prompt should be one Dynamic Prompt branch with one\nfull keyframe prompt per scene:\n{scene 1 full keyframe prompt|scene 2 full keyframe prompt|...|scene N full keyframe prompt}.\nDo not set numberOfVariations=N with only the first scene prompt — that creates N versions of\nscene 1. For full project requests, one generate_image batch for all scene keyframes, then\none animate_photo batch for all video clips in parallel.\n\nSTORYTELLING / BRAND / SOCIAL IMAGE PROMPTS: If generating a storyboard, ad concept,\ntrailer sheet, meme, creator post, or provocative social concept, make the first frame or\npanel immediately legible. Preserve the user's requested tone and audience. Use concrete\ncomposition, persona, product/brand role, caption placement, readable required text, and a\nclear visual transformation or punchline. For provocative adult social content, keep subjects\nclearly adult and consensual, PG-13/non-explicit, and avoid minor-coded styling or school-coded\nsettings while still optimizing visual magnet, persona, caption bait, and replay/comment value.\n\nGPT IMAGE 2 STORYBOARD SHEET → SEEDANCE AUTO-PROCEED: If the user asks to run the whole\nGPT Image 2 storyboard/keyframe sheet plus Seedance workflow without approval, the FIRST\ngenerate_image call must create ONE composite storyboard/keyframe sheet, not loose concept\nart and not separate keyframes. Use model=\"gpt-image-2\", numberOfVariations=1, and a\ncompiled storyboard prompt that literally includes: \"Create exactly N sequential video\nstoryboard frames as one composite storyboard image\", \"Target final video aspect ratio: X\",\na `SCENES:` section, and exactly N concrete scene entries named `SCENE_01`, `SCENE_02`,\netc. 
Each scene entry must include `Visual/Action:`, `Camera/Motion:`, `Dialogue/VO:`\n(use `[no dialogue]` when silent), `Audio/SFX:`, and any reference/visible-text notes\nneeded for that scene. Do not send only a source brief, storyboard concept, or generic\nlayout instructions as the prompt; malformed compiled storyboard prompts are blocked by\nquality audit instead of being repaired at runtime. If the user does not explicitly specify\na frame count, choose N with the shared storyboard density default: at least one key visual\nbeat about every 2 seconds, rounded up and clamped to 6-16 total storyboard frames\n(for example, a 60 second commercial defaults to N=16). Unless the user explicitly specifies another\nstoryboard page/canvas/sheet shape, default the GPT Image 2 storyboard sheet pixel dimensions\nto a balanced grid that hosts the target cell aspect natively: for a 9:16 portrait video,\npick a portrait-leaning sheet whose columns x rows grid produces ~9:16 cells (e.g., 12 cells\n-> ~3:4 sheet around 1728x2304, 6 cells -> ~27:32 around 1840x2176, 9 cells -> ~9:16 around\n1504x2672); for a 16:9 landscape video, pick a landscape sheet whose rows x columns grid\nproduces ~16:9 cells (e.g., 12 cells -> ~4:3 sheet around 2304x1728). Do not force landscape\n2560x1440 when cells are portrait — a landscape sheet with a portrait-cell grid cannot host\n9:16 cells without crushing them. Preserve the requested final video aspect ratio for every\nframe area. 
After\nthat image completes, call generate_video once using the generated storyboard board as\n@Image1/referenceImageIndices=[0], with skipPromptProcessing=false only when the user\nexplicitly wants the storyboard text rewritten; otherwise preserve the compiled shot guide\nand use skipPromptProcessing=true, expandPrompt=false.\n\nDO NOT USE generate_image FOR UPLOADED REFERENCE LOOPED VIDEO SEGMENTS: If the user says\nthe same uploaded image/reference should be reused as the first frame and last frame of each\nscripted segment/scene/clip before stitching, they are explicitly asking to animate the\nuploaded image, not to generate new storyboard keyframes. Do not call generate_image for\nthat request. Call animate_photo once with repeated uploaded source indices and per-scene\nprompts.\n\nREUSING RESULTS: When the user asks to redo, retry, or revise (e.g., \"try a new version\",\n\"redo the video with X\"), reuse the existing source images — do NOT regenerate them unless\nthe user explicitly asks for new images or describes changes to the images themselves.\nReference the existing result indices from the prior generation. If unsure whether the user\nwants new images, ask — don't regenerate by default.",
|
|
2209
2209
|
"parameterDocs": {
|
|
2210
2210
|
"prompt": "Text description. Follow FLUX.2 prompt order: subject first. Use Dynamic Prompt syntax when numberOfVariations > 1.",
|
|
2211
2211
|
"numberOfVariations": "Number of distinct outputs. Use Dynamic Prompt {|} syntax to vary one attribute per image. Never put the count in the prompt itself.",
|
package/llm.txt
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# sogni-creative-agent-skill
|
|
2
2
|
|
|
3
|
-
Agent skill and CLI for Sogni AI image and
|
|
4
|
-
source for Claude Code, OpenClaw, Hermes Agent, Manus AI, and other
|
|
5
|
-
runtimes.
|
|
3
|
+
Agent skill and CLI for Sogni AI image, video, and music generation. Works as
|
|
4
|
+
a skill source for Claude Code, OpenClaw, Hermes Agent, Manus AI, and other
|
|
5
|
+
agent runtimes.
|
|
6
6
|
|
|
7
7
|
## Install (pick the integration that matches your environment)
|
|
8
8
|
|
package/openclaw.plugin.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"id": "sogni-creative-agent-skill",
|
|
3
3
|
"name": "Sogni Creative Agent Skill — Image, Video & Music Generation",
|
|
4
4
|
"description": "Agent skill and CLI for Sogni AI image, video, and music generation.",
|
|
5
|
-
"version": "3.3.2",
|
|
5
|
+
"version": "3.3.4",
|
|
6
6
|
"skills": [
|
|
7
7
|
"."
|
|
8
8
|
],
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sogni-ai/sogni-creative-agent-skill",
|
|
3
|
-
"version": "3.3.2",
|
|
3
|
+
"version": "3.3.4",
|
|
4
4
|
"description": "Sogni Creative Agent Skill: agent skill and CLI for Sogni AI image, video, and music generation.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "sogni-agent.mjs",
|
|
@@ -67,7 +67,7 @@
|
|
|
67
67
|
"sogni-agent.mjs"
|
|
68
68
|
],
|
|
69
69
|
"dependencies": {
|
|
70
|
-
"@sogni-ai/sogni-intelligence-client": "^
|
|
70
|
+
"@sogni-ai/sogni-intelligence-client": "^3.0.8",
|
|
71
71
|
"execa": "^9.6.1",
|
|
72
72
|
"json5": "^2.2.3",
|
|
73
73
|
"sharp": "^0.34.5"
|
package/skill-package.json
CHANGED
package/sogni-agent.mjs
CHANGED
|
@@ -721,6 +721,10 @@ function buildBalanceError(message, details) {
|
|
|
721
721
|
return err;
|
|
722
722
|
}
|
|
723
723
|
|
|
724
|
+
function isStructuredInsufficientBalanceError(error) {
|
|
725
|
+
return Boolean(error && typeof error === 'object' && error.code === 'INSUFFICIENT_BALANCE');
|
|
726
|
+
}
|
|
727
|
+
|
|
724
728
|
function gcdInt(a, b) {
|
|
725
729
|
let x = Math.abs(Math.trunc(a));
|
|
726
730
|
let y = Math.abs(Math.trunc(b));
|
|
@@ -4020,6 +4024,23 @@ async function imageDataUriFromPathOrUrl(pathOrUrl) {
|
|
|
4020
4024
|
return `data:${mimeType};base64,${buffer.toString('base64')}`;
|
|
4021
4025
|
}
|
|
4022
4026
|
|
|
4027
|
+
const DEFAULT_API_CHAT_SYSTEM_PROMPT = `ROLE: You are Sogni Agent, a practical creative production assistant for Sogni's media tools. Be direct, specific, inventive, and warm. Avoid generic text-only LLM framing and describe Sogni's real media capabilities when they are relevant.
|
|
4028
|
+
|
|
4029
|
+
V2 TURN ARCHITECTURE:
|
|
4030
|
+
- Hosted chat may run a classifier/planner before the assistant round. That stage proposes text/tool/workflow mode and the allowed tool surface; it does not call tools or spend credits.
|
|
4031
|
+
- In the assistant/execution round, use only the tools currently exposed to you. If the user asked Sogni to generate, edit, animate, render, analyze, or otherwise execute media and the matching tool is available, call it.
|
|
4032
|
+
- If the current round is text-only, answer the question completely in prose. Product, model, pricing, credit, capability, and "what can you do?" questions are usually text-only until the user asks you to start making media.
|
|
4033
|
+
- If required input is missing, ask a concise clarifying question. For underspecified creative taste, choose a reasonable default and proceed.
|
|
4034
|
+
- Do not narrate hidden planning, tool selection, JSON, function names, or internal architecture to the user.
|
|
4035
|
+
|
|
4036
|
+
SOGNI PRODUCT KNOWLEDGE:
|
|
4037
|
+
- Sogni can create and edit images, generate and transform videos, compose music/lyrics, restore photos, apply styles, analyze media, and use uploaded or generated assets as references.
|
|
4038
|
+
- GPT Image 2 in Sogni creates images from text prompts, edits/restyles uploaded or generated references, builds storyboard/keyframe sheets, character/reference boards, ad/product composites, and layout/text-heavy stills.
|
|
4039
|
+
- For action requests, use image generation for text-to-image and image editing when references guide identity, likeness, composition, style, objects, logos, or products. Paid renders show a preflight estimate before spending.
|
|
4040
|
+
- Featured workflow: GPT Image 2 storyboard/keyframes -> Seedance 2.0 for finished social videos such as ads, trailers, character intros, and storyboard-to-video flows.
|
|
4041
|
+
- For Sogni, model, GPT Image, Seedance, or creative capability questions, describe the media tools Sogni can use instead of falling back to generic text-only limitations.
|
|
4042
|
+
- For unknown product facts, state uncertainty and point to docs.sogni.ai or Discord.`;
|
|
4043
|
+
|
|
4023
4044
|
/**
|
|
4024
4045
|
* Build the persona/memory/personality dynamic-system-prompt suffix the
|
|
4025
4046
|
* skill injects into `/v1/chat/completions` (and durable
|
|
@@ -4054,8 +4075,7 @@ function buildSkillDynamicSystemPrompt() {
|
|
|
4054
4075
|
}
|
|
4055
4076
|
suffix += `\nUser's people: ${personaContext}.`;
|
|
4056
4077
|
suffix += '\n\nPERSONA RULES:'
|
|
4057
|
-
+ '\n-
|
|
4058
|
-
+ '\n- Match personas by explicit name, self-referencing pronouns, OR relationship phrases ("my wife", "my son", "my dog", etc.).'
|
|
4078
|
+
+ '\n- Match personas only by explicit listed name or tag/alias. Do not infer persona identity from relationship phrases alone.'
|
|
4059
4079
|
+ '\n- When creating images of personas, prefer image-editing with the persona\'s reference photo over generating from scratch.'
|
|
4060
4080
|
+ '\n- If the user mentions someone not listed, suggest adding them via `--persona-add`.';
|
|
4061
4081
|
}
|
|
@@ -4069,7 +4089,7 @@ function buildSkillDynamicSystemPrompt() {
|
|
|
4069
4089
|
const memories = loadMemories();
|
|
4070
4090
|
if (memories.length > 0) {
|
|
4071
4091
|
const memoryContext = memories.map((m) => `${m.key}: ${m.value}`).join('; ');
|
|
4072
|
-
suffix += `\nUser preferences (
|
|
4092
|
+
suffix += `\nUser preferences (apply unless the latest user request overrides them): ${memoryContext}`;
|
|
4073
4093
|
}
|
|
4074
4094
|
} catch {
|
|
4075
4095
|
// best-effort
|
|
@@ -4096,8 +4116,7 @@ async function buildApiChatMessages(apiMediaRefs, apiMediaReferences) {
|
|
|
4096
4116
|
// tokens, Wan numeric tokens). Wiring it through here keeps the public
|
|
4097
4117
|
// skill's --api-chat behavior aligned with sogni-chat and the
|
|
4098
4118
|
// /v1/chat/completions endpoint when references are present.
|
|
4099
|
-
const baseSystem = options.apiSystemPrompt ||
|
|
4100
|
-
'You are a concise creative production assistant. Use Sogni creative tools when they help produce concrete media.';
|
|
4119
|
+
const baseSystem = options.apiSystemPrompt || DEFAULT_API_CHAT_SYSTEM_PROMPT;
|
|
4101
4120
|
const dynamicSuffix = buildSkillDynamicSystemPrompt();
|
|
4102
4121
|
const systemWithDynamic = dynamicSuffix ? `${baseSystem}${dynamicSuffix}` : baseSystem;
|
|
4103
4122
|
const system = apiMediaRefs.length > 0
|
|
@@ -4129,6 +4148,45 @@ function apiChatTemplateKwargs() {
|
|
|
4129
4148
|
return { enable_thinking: options.apiThinking };
|
|
4130
4149
|
}
|
|
4131
4150
|
|
|
4151
|
+
function chatRunEventPayload(event) {
|
|
4152
|
+
if (!event || typeof event !== 'object') return event;
|
|
4153
|
+
return event.payload || event.data || event;
|
|
4154
|
+
}
|
|
4155
|
+
|
|
4156
|
+
function chatRunAssistantDelta(type, payload) {
|
|
4157
|
+
if (type === 'assistant_message_delta' && typeof payload?.content === 'string') {
|
|
4158
|
+
return payload.content;
|
|
4159
|
+
}
|
|
4160
|
+
if (
|
|
4161
|
+
chatRunTerminalStatus(type, payload)
|
|
4162
|
+
|| chatRunFailureStatus(type)
|
|
4163
|
+
|| chatRunWaitingStatus(type)
|
|
4164
|
+
|| type === 'tool_call_progress'
|
|
4165
|
+
) {
|
|
4166
|
+
return null;
|
|
4167
|
+
}
|
|
4168
|
+
return payload?.delta?.content
|
|
4169
|
+
|| payload?.choices?.[0]?.delta?.content
|
|
4170
|
+
|| (typeof payload?.content === 'string' ? payload.content : null);
|
|
4171
|
+
}
|
|
4172
|
+
|
|
4173
|
+
function chatRunTerminalStatus(type, payload) {
|
|
4174
|
+
if (type === 'run_completed' || type === 'run.completed' || type === 'completed' || type === 'done') {
|
|
4175
|
+
return payload?.status || 'completed';
|
|
4176
|
+
}
|
|
4177
|
+
if (type === 'run_partial_failure') return payload?.status || 'partial_failure';
|
|
4178
|
+
if (type === 'run_cancelled' || type === 'cancelled') return payload?.status || 'cancelled';
|
|
4179
|
+
return null;
|
|
4180
|
+
}
|
|
4181
|
+
|
|
4182
|
+
function chatRunFailureStatus(type) {
|
|
4183
|
+
return type === 'run_failed' || type === 'run.failed' || type === 'failed' || type === 'error';
|
|
4184
|
+
}
|
|
4185
|
+
|
|
4186
|
+
function chatRunWaitingStatus(type) {
|
|
4187
|
+
return type === 'run_waiting_for_user' || type === 'waiting_for_user';
|
|
4188
|
+
}
|
|
4189
|
+
|
|
4132
4190
|
async function runApiChat(log) {
|
|
4133
4191
|
const creds = loadCredentials();
|
|
4134
4192
|
const apiKey = requireApiKeyCredentials(creds, '--api-chat');
|
|
@@ -4293,12 +4351,9 @@ async function runApiChatDurable(log, { apiKey, body }) {
|
|
|
4293
4351
|
|
|
4294
4352
|
for await (const event of helpers.sdkChatRunsStreamEvents(client, runId, {})) {
|
|
4295
4353
|
const type = event?.type || event?.event || '';
|
|
4296
|
-
const payload = event
|
|
4354
|
+
const payload = chatRunEventPayload(event);
|
|
4297
4355
|
// Stream assistant message deltas as they arrive.
|
|
4298
|
-
const delta =
|
|
4299
|
-
payload?.delta?.content
|
|
4300
|
-
|| payload?.choices?.[0]?.delta?.content
|
|
4301
|
-
|| (typeof payload?.content === 'string' ? payload.content : null);
|
|
4356
|
+
const delta = chatRunAssistantDelta(type, payload);
|
|
4302
4357
|
if (typeof delta === 'string' && delta) {
|
|
4303
4358
|
assistantParts.push(delta);
|
|
4304
4359
|
if (!options.json) {
|
|
@@ -4364,16 +4419,25 @@ async function runApiChatDurable(log, { apiKey, body }) {
|
|
|
4364
4419
|
if (Array.isArray(eventWorkflows) && eventWorkflows.length > 0) {
|
|
4365
4420
|
workflows.push(...eventWorkflows);
|
|
4366
4421
|
}
|
|
4367
|
-
|
|
4368
|
-
|
|
4422
|
+
const terminalStatus = chatRunTerminalStatus(type, payload);
|
|
4423
|
+
if (terminalStatus) {
|
|
4424
|
+
finalStatus = terminalStatus;
|
|
4369
4425
|
break;
|
|
4370
4426
|
}
|
|
4371
|
-
if (type
|
|
4427
|
+
if (chatRunFailureStatus(type)) {
|
|
4372
4428
|
const error = new Error(payload?.error?.message || 'Durable chat run failed.');
|
|
4373
4429
|
error.code = payload?.error?.code || 'DURABLE_CHAT_RUN_FAILED';
|
|
4374
4430
|
error.details = { runId, payload };
|
|
4375
4431
|
throw error;
|
|
4376
4432
|
}
|
|
4433
|
+
if (chatRunWaitingStatus(type)) {
|
|
4434
|
+
finalStatus = payload?.status || 'waiting_for_user';
|
|
4435
|
+
if (!options.json) {
|
|
4436
|
+
const reason = payload?.reason || payload?.waiting?.reason || 'user input required';
|
|
4437
|
+
log(`Durable chat run is waiting for user input: ${reason}`);
|
|
4438
|
+
}
|
|
4439
|
+
break;
|
|
4440
|
+
}
|
|
4377
4441
|
}
|
|
4378
4442
|
},
|
|
4379
4443
|
);
|
|
@@ -5363,20 +5427,11 @@ function resolvePersonaByName(name) {
|
|
|
5363
5427
|
// Match by name (case-insensitive)
|
|
5364
5428
|
let match = personas.find(p => p.name.toLowerCase() === name.toLowerCase());
|
|
5365
5429
|
if (match) return match;
|
|
5430
|
+
// Match by stable id
|
|
5431
|
+
match = personas.find(p => typeof p.id === 'string' && p.id.toLowerCase() === name.toLowerCase());
|
|
5432
|
+
if (match) return match;
|
|
5366
5433
|
// Match by tag
|
|
5367
5434
|
match = personas.find(p => p.tags?.some(t => t.toLowerCase() === name.toLowerCase()));
|
|
5368
|
-
if (match) return match;
|
|
5369
|
-
// Match implicit pronouns
|
|
5370
|
-
const lower = name.toLowerCase();
|
|
5371
|
-
if (lower === 'me' || lower === 'myself' || lower === 'i') {
|
|
5372
|
-
match = personas.find(p => p.relationship === 'self');
|
|
5373
|
-
} else if (lower.includes('wife') || lower.includes('husband') || lower.includes('partner')) {
|
|
5374
|
-
match = personas.find(p => p.relationship === 'partner');
|
|
5375
|
-
} else if (lower.includes('son') || lower.includes('daughter') || lower.includes('kid') || lower.includes('child')) {
|
|
5376
|
-
match = personas.find(p => p.relationship === 'child');
|
|
5377
|
-
} else if (lower.includes('dog') || lower.includes('cat') || lower.includes('pet')) {
|
|
5378
|
-
match = personas.find(p => p.relationship === 'pet');
|
|
5379
|
-
}
|
|
5380
5435
|
return match || null;
|
|
5381
5436
|
}
|
|
5382
5437
|
|
|
@@ -7963,7 +8018,7 @@ async function main() {
|
|
|
7963
8018
|
|
|
7964
8019
|
} catch (error) {
|
|
7965
8020
|
// Token auto-fallback: if using auto mode and got insufficient balance, retry with the other token
|
|
7966
|
-
const isBalanceError =
|
|
8021
|
+
const isBalanceError = isStructuredInsufficientBalanceError(error);
|
|
7967
8022
|
if (_allowAutoTokenFallback && isBalanceError && options.tokenType === 'spark') {
|
|
7968
8023
|
log('Insufficient SPARK balance — retrying with SOGNI tokens...');
|
|
7969
8024
|
options.tokenType = 'sogni';
|
package/version.mjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export const PACKAGE_VERSION = '3.3.2';
|
|
1
|
+
export const PACKAGE_VERSION = '3.3.4';
|