@sogni-ai/sogni-intelligence-client 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +33 -0
- package/LICENSE +22 -0
- package/README.md +259 -0
- package/chatRun/index.d.ts +1 -0
- package/chatRun/index.js +2 -0
- package/context/index.d.ts +1 -0
- package/context/index.js +2 -0
- package/contracts/index.d.ts +1 -0
- package/contracts/index.js +2 -0
- package/dist/chatRun/costApproval.d.ts +10 -0
- package/dist/chatRun/costApproval.d.ts.map +1 -0
- package/dist/chatRun/costApproval.js +70 -0
- package/dist/chatRun/costApproval.js.map +1 -0
- package/dist/chatRun/index.d.ts +207 -0
- package/dist/chatRun/index.d.ts.map +1 -0
- package/dist/chatRun/index.js +350 -0
- package/dist/chatRun/index.js.map +1 -0
- package/dist/client/SogniClientWrapper.d.ts +86 -0
- package/dist/client/SogniClientWrapper.d.ts.map +1 -0
- package/dist/client/SogniClientWrapper.js +914 -0
- package/dist/client/SogniClientWrapper.js.map +1 -0
- package/dist/context/index.d.ts +56 -0
- package/dist/context/index.d.ts.map +1 -0
- package/dist/context/index.js +460 -0
- package/dist/context/index.js.map +1 -0
- package/dist/contracts/backboneDurableWorkflow.d.ts +82 -0
- package/dist/contracts/backboneDurableWorkflow.d.ts.map +1 -0
- package/dist/contracts/backboneDurableWorkflow.js +96 -0
- package/dist/contracts/backboneDurableWorkflow.js.map +1 -0
- package/dist/contracts/backboneToolCatalog.d.ts +7 -0
- package/dist/contracts/backboneToolCatalog.d.ts.map +1 -0
- package/dist/contracts/backboneToolCatalog.js +62 -0
- package/dist/contracts/backboneToolCatalog.js.map +1 -0
- package/dist/contracts/composeWorkflowTypes.d.ts +6 -0
- package/dist/contracts/composeWorkflowTypes.d.ts.map +1 -0
- package/dist/contracts/composeWorkflowTypes.js +3 -0
- package/dist/contracts/composeWorkflowTypes.js.map +1 -0
- package/dist/contracts/data/costEstimation.d.ts +5 -0
- package/dist/contracts/data/costEstimation.d.ts.map +1 -0
- package/dist/contracts/data/costEstimation.js +22 -0
- package/dist/contracts/data/costEstimation.js.map +1 -0
- package/dist/contracts/data/gatingPolicies.d.ts +6 -0
- package/dist/contracts/data/gatingPolicies.d.ts.map +1 -0
- package/dist/contracts/data/gatingPolicies.js +218 -0
- package/dist/contracts/data/gatingPolicies.js.map +1 -0
- package/dist/contracts/data/gatingPoliciesToolSurface.d.ts +7 -0
- package/dist/contracts/data/gatingPoliciesToolSurface.d.ts.map +1 -0
- package/dist/contracts/data/gatingPoliciesToolSurface.js +61 -0
- package/dist/contracts/data/gatingPoliciesToolSurface.js.map +1 -0
- package/dist/contracts/data/index.d.ts +14 -0
- package/dist/contracts/data/index.d.ts.map +1 -0
- package/dist/contracts/data/index.js +57 -0
- package/dist/contracts/data/index.js.map +1 -0
- package/dist/contracts/data/promptContracts.d.ts +5 -0
- package/dist/contracts/data/promptContracts.d.ts.map +1 -0
- package/dist/contracts/data/promptContracts.js +1286 -0
- package/dist/contracts/data/promptContracts.js.map +1 -0
- package/dist/contracts/data/repairRecipes.d.ts +5 -0
- package/dist/contracts/data/repairRecipes.d.ts.map +1 -0
- package/dist/contracts/data/repairRecipes.js +197 -0
- package/dist/contracts/data/repairRecipes.js.map +1 -0
- package/dist/contracts/data/toolCatalog.d.ts +31 -0
- package/dist/contracts/data/toolCatalog.d.ts.map +1 -0
- package/dist/contracts/data/toolCatalog.js +129 -0
- package/dist/contracts/data/toolCatalog.js.map +1 -0
- package/dist/contracts/data/toolCostMetadata.d.ts +16 -0
- package/dist/contracts/data/toolCostMetadata.d.ts.map +1 -0
- package/dist/contracts/data/toolCostMetadata.js +284 -0
- package/dist/contracts/data/toolCostMetadata.js.map +1 -0
- package/dist/contracts/data/toolPermissions.d.ts +27 -0
- package/dist/contracts/data/toolPermissions.d.ts.map +1 -0
- package/dist/contracts/data/toolPermissions.js +78 -0
- package/dist/contracts/data/toolPermissions.js.map +1 -0
- package/dist/contracts/evaluators.d.ts +94 -0
- package/dist/contracts/evaluators.d.ts.map +1 -0
- package/dist/contracts/evaluators.js +468 -0
- package/dist/contracts/evaluators.js.map +1 -0
- package/dist/contracts/hostedComposition.d.ts +85 -0
- package/dist/contracts/hostedComposition.d.ts.map +1 -0
- package/dist/contracts/hostedComposition.js +139 -0
- package/dist/contracts/hostedComposition.js.map +1 -0
- package/dist/contracts/hostedToolValidation.d.ts +47 -0
- package/dist/contracts/hostedToolValidation.d.ts.map +1 -0
- package/dist/contracts/hostedToolValidation.js +301 -0
- package/dist/contracts/hostedToolValidation.js.map +1 -0
- package/dist/contracts/idLoraPrompt.d.ts +13 -0
- package/dist/contracts/idLoraPrompt.d.ts.map +1 -0
- package/dist/contracts/idLoraPrompt.js +78 -0
- package/dist/contracts/idLoraPrompt.js.map +1 -0
- package/dist/contracts/imagePrompt.d.ts +16 -0
- package/dist/contracts/imagePrompt.d.ts.map +1 -0
- package/dist/contracts/imagePrompt.js +148 -0
- package/dist/contracts/imagePrompt.js.map +1 -0
- package/dist/contracts/index.d.ts +48 -0
- package/dist/contracts/index.d.ts.map +1 -0
- package/dist/contracts/index.js +156 -0
- package/dist/contracts/index.js.map +1 -0
- package/dist/contracts/musicComposition.d.ts +17 -0
- package/dist/contracts/musicComposition.d.ts.map +1 -0
- package/dist/contracts/musicComposition.js +188 -0
- package/dist/contracts/musicComposition.js.map +1 -0
- package/dist/contracts/promptContract.d.ts +11 -0
- package/dist/contracts/promptContract.d.ts.map +1 -0
- package/dist/contracts/promptContract.js +37 -0
- package/dist/contracts/promptContract.js.map +1 -0
- package/dist/contracts/promptOverrideMarker.d.ts +2 -0
- package/dist/contracts/promptOverrideMarker.d.ts.map +1 -0
- package/dist/contracts/promptOverrideMarker.js +5 -0
- package/dist/contracts/promptOverrideMarker.js.map +1 -0
- package/dist/contracts/randomThemes.d.ts +5 -0
- package/dist/contracts/randomThemes.d.ts.map +1 -0
- package/dist/contracts/randomThemes.js +159 -0
- package/dist/contracts/randomThemes.js.map +1 -0
- package/dist/contracts/registry.d.ts +29 -0
- package/dist/contracts/registry.d.ts.map +1 -0
- package/dist/contracts/registry.js +104 -0
- package/dist/contracts/registry.js.map +1 -0
- package/dist/contracts/repairRecipe.d.ts +14 -0
- package/dist/contracts/repairRecipe.d.ts.map +1 -0
- package/dist/contracts/repairRecipe.js +38 -0
- package/dist/contracts/repairRecipe.js.map +1 -0
- package/dist/contracts/storyboard.d.ts +113 -0
- package/dist/contracts/storyboard.d.ts.map +1 -0
- package/dist/contracts/storyboard.js +7 -0
- package/dist/contracts/storyboard.js.map +1 -0
- package/dist/contracts/telemetry.d.ts +57 -0
- package/dist/contracts/telemetry.d.ts.map +1 -0
- package/dist/contracts/telemetry.js +37 -0
- package/dist/contracts/telemetry.js.map +1 -0
- package/dist/contracts/toolGatingPolicy.d.ts +19 -0
- package/dist/contracts/toolGatingPolicy.d.ts.map +1 -0
- package/dist/contracts/toolGatingPolicy.js +63 -0
- package/dist/contracts/toolGatingPolicy.js.map +1 -0
- package/dist/contracts/toolPromptMarkers.d.ts +9 -0
- package/dist/contracts/toolPromptMarkers.d.ts.map +1 -0
- package/dist/contracts/toolPromptMarkers.js +13 -0
- package/dist/contracts/toolPromptMarkers.js.map +1 -0
- package/dist/contracts/toolSurface.d.ts +46 -0
- package/dist/contracts/toolSurface.d.ts.map +1 -0
- package/dist/contracts/toolSurface.js +119 -0
- package/dist/contracts/toolSurface.js.map +1 -0
- package/dist/contracts/turnPolicy.d.ts +22 -0
- package/dist/contracts/turnPolicy.d.ts.map +1 -0
- package/dist/contracts/turnPolicy.js +16 -0
- package/dist/contracts/turnPolicy.js.map +1 -0
- package/dist/contracts/videoComposition.d.ts +35 -0
- package/dist/contracts/videoComposition.d.ts.map +1 -0
- package/dist/contracts/videoComposition.js +224 -0
- package/dist/contracts/videoComposition.js.map +1 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +59 -0
- package/dist/index.js.map +1 -0
- package/dist/media/aspectRatio.d.ts +15 -0
- package/dist/media/aspectRatio.d.ts.map +1 -0
- package/dist/media/aspectRatio.js +72 -0
- package/dist/media/aspectRatio.js.map +1 -0
- package/dist/media/audioReference.d.ts +57 -0
- package/dist/media/audioReference.d.ts.map +1 -0
- package/dist/media/audioReference.js +194 -0
- package/dist/media/audioReference.js.map +1 -0
- package/dist/media/cameraAngle.d.ts +170 -0
- package/dist/media/cameraAngle.d.ts.map +1 -0
- package/dist/media/cameraAngle.js +48 -0
- package/dist/media/cameraAngle.js.map +1 -0
- package/dist/media/characterSheet.d.ts +9 -0
- package/dist/media/characterSheet.d.ts.map +1 -0
- package/dist/media/characterSheet.js +54 -0
- package/dist/media/characterSheet.js.map +1 -0
- package/dist/media/danceMontage.d.ts +18 -0
- package/dist/media/danceMontage.d.ts.map +1 -0
- package/dist/media/danceMontage.js +34 -0
- package/dist/media/danceMontage.js.map +1 -0
- package/dist/media/enhancementProfiles.d.ts +158 -0
- package/dist/media/enhancementProfiles.d.ts.map +1 -0
- package/dist/media/enhancementProfiles.js +224 -0
- package/dist/media/enhancementProfiles.js.map +1 -0
- package/dist/media/generationJob.d.ts +81 -0
- package/dist/media/generationJob.d.ts.map +1 -0
- package/dist/media/generationJob.js +91 -0
- package/dist/media/generationJob.js.map +1 -0
- package/dist/media/gptImage.d.ts +21 -0
- package/dist/media/gptImage.d.ts.map +1 -0
- package/dist/media/gptImage.js +162 -0
- package/dist/media/gptImage.js.map +1 -0
- package/dist/media/imageDimensions.d.ts +24 -0
- package/dist/media/imageDimensions.d.ts.map +1 -0
- package/dist/media/imageDimensions.js +64 -0
- package/dist/media/imageDimensions.js.map +1 -0
- package/dist/media/index.d.ts +16 -0
- package/dist/media/index.d.ts.map +1 -0
- package/dist/media/index.js +32 -0
- package/dist/media/index.js.map +1 -0
- package/dist/media/musicSettings.d.ts +86 -0
- package/dist/media/musicSettings.d.ts.map +1 -0
- package/dist/media/musicSettings.js +234 -0
- package/dist/media/musicSettings.js.map +1 -0
- package/dist/media/vendorModelPremium.d.ts +21 -0
- package/dist/media/vendorModelPremium.d.ts.map +1 -0
- package/dist/media/vendorModelPremium.js +89 -0
- package/dist/media/vendorModelPremium.js.map +1 -0
- package/dist/media/videoAppSettings.d.ts +41 -0
- package/dist/media/videoAppSettings.d.ts.map +1 -0
- package/dist/media/videoAppSettings.js +128 -0
- package/dist/media/videoAppSettings.js.map +1 -0
- package/dist/media/videoContentLimit.d.ts +15 -0
- package/dist/media/videoContentLimit.d.ts.map +1 -0
- package/dist/media/videoContentLimit.js +169 -0
- package/dist/media/videoContentLimit.js.map +1 -0
- package/dist/media/videoReference.d.ts +35 -0
- package/dist/media/videoReference.d.ts.map +1 -0
- package/dist/media/videoReference.js +77 -0
- package/dist/media/videoReference.js.map +1 -0
- package/dist/media/videoSettings.d.ts +50 -0
- package/dist/media/videoSettings.d.ts.map +1 -0
- package/dist/media/videoSettings.js +200 -0
- package/dist/media/videoSettings.js.map +1 -0
- package/dist/openai-tools/_manifests.generated.d.ts +4 -0
- package/dist/openai-tools/_manifests.generated.d.ts.map +1 -0
- package/dist/openai-tools/_manifests.generated.js +1792 -0
- package/dist/openai-tools/_manifests.generated.js.map +1 -0
- package/dist/openai-tools/app-tools.json +297 -0
- package/dist/openai-tools/composition-tools.json +228 -0
- package/dist/openai-tools/generation-tools.json +1263 -0
- package/dist/openai-tools/index.d.ts +17 -0
- package/dist/openai-tools/index.d.ts.map +1 -0
- package/dist/openai-tools/index.js +32 -0
- package/dist/openai-tools/index.js.map +1 -0
- package/dist/public-skill-runtime/index.d.ts +1222 -0
- package/dist/public-skill-runtime/index.d.ts.map +1 -0
- package/dist/public-skill-runtime/index.js +6492 -0
- package/dist/public-skill-runtime/index.js.map +1 -0
- package/dist/replay/index.d.ts +4 -0
- package/dist/replay/index.d.ts.map +1 -0
- package/dist/replay/index.js +12 -0
- package/dist/replay/index.js.map +1 -0
- package/dist/replay/redact.d.ts +7 -0
- package/dist/replay/redact.d.ts.map +1 -0
- package/dist/replay/redact.js +108 -0
- package/dist/replay/redact.js.map +1 -0
- package/dist/replay/types.d.ts +61 -0
- package/dist/replay/types.d.ts.map +1 -0
- package/dist/replay/types.js +24 -0
- package/dist/replay/types.js.map +1 -0
- package/dist/runtime/chatTypes.d.ts +47 -0
- package/dist/runtime/chatTypes.d.ts.map +1 -0
- package/dist/runtime/chatTypes.js +3 -0
- package/dist/runtime/chatTypes.js.map +1 -0
- package/dist/runtime/durableWorkflowClient.d.ts +80 -0
- package/dist/runtime/durableWorkflowClient.d.ts.map +1 -0
- package/dist/runtime/durableWorkflowClient.js +312 -0
- package/dist/runtime/durableWorkflowClient.js.map +1 -0
- package/dist/runtime/index.d.ts +3 -0
- package/dist/runtime/index.d.ts.map +1 -0
- package/dist/runtime/index.js +18 -0
- package/dist/runtime/index.js.map +1 -0
- package/dist/schemas/errors/error.schema.json +21 -0
- package/dist/schemas/errors/repair-control.schema.json +40 -0
- package/dist/schemas/events/artifact-reference.schema.json +22 -0
- package/dist/schemas/events/progress-event.schema.json +28 -0
- package/dist/schemas/events/workflow-event.schema.json +22 -0
- package/dist/schemas/storyboards/storyboard-planning-contract.schema.json +108 -0
- package/dist/schemas/tools/add_subtitles.schema.json +77 -0
- package/dist/schemas/tools/animate_photo.schema.json +104 -0
- package/dist/schemas/tools/apply_style.schema.json +37 -0
- package/dist/schemas/tools/change_angle.schema.json +30 -0
- package/dist/schemas/tools/compose_instrumental.schema.json +24 -0
- package/dist/schemas/tools/compose_lyrics.schema.json +28 -0
- package/dist/schemas/tools/compose_script.schema.json +68 -0
- package/dist/schemas/tools/compose_workflow.schema.json +67 -0
- package/dist/schemas/tools/compose_workflow_template.schema.json +156 -0
- package/dist/schemas/tools/dance_montage.schema.json +47 -0
- package/dist/schemas/tools/edit_image.schema.json +74 -0
- package/dist/schemas/tools/enhance_prompt.schema.json +76 -0
- package/dist/schemas/tools/extend_video.schema.json +42 -0
- package/dist/schemas/tools/generate_image.schema.json +104 -0
- package/dist/schemas/tools/generate_music.schema.json +62 -0
- package/dist/schemas/tools/generate_video.schema.json +97 -0
- package/dist/schemas/tools/orbit_video.schema.json +70 -0
- package/dist/schemas/tools/overlay_video.schema.json +126 -0
- package/dist/schemas/tools/refine_result.schema.json +43 -0
- package/dist/schemas/tools/replace_video_segment.schema.json +60 -0
- package/dist/schemas/tools/restore_photo.schema.json +47 -0
- package/dist/schemas/tools/sound_to_video.schema.json +70 -0
- package/dist/schemas/tools/stitch_video.schema.json +52 -0
- package/dist/schemas/tools/video_to_video.schema.json +77 -0
- package/dist/schemas/workflows/durable-workflow-run.schema.json +165 -0
- package/dist/schemas/workflows/durable-workflow-step.schema.json +141 -0
- package/dist/skill-runtime-source/crossSurfaceParity.d.ts +23 -0
- package/dist/skill-runtime-source/crossSurfaceParity.d.ts.map +1 -0
- package/dist/skill-runtime-source/crossSurfaceParity.js +472 -0
- package/dist/skill-runtime-source/crossSurfaceParity.js.map +1 -0
- package/dist/skill-runtime-source/index.d.ts +4 -0
- package/dist/skill-runtime-source/index.d.ts.map +1 -0
- package/dist/skill-runtime-source/index.js +20 -0
- package/dist/skill-runtime-source/index.js.map +1 -0
- package/dist/skill-runtime-source/seedanceAudioWindow.d.ts +8 -0
- package/dist/skill-runtime-source/seedanceAudioWindow.d.ts.map +1 -0
- package/dist/skill-runtime-source/seedanceAudioWindow.js +52 -0
- package/dist/skill-runtime-source/seedanceAudioWindow.js.map +1 -0
- package/dist/skill-runtime-source/workflowStatus.d.ts +17 -0
- package/dist/skill-runtime-source/workflowStatus.d.ts.map +1 -0
- package/dist/skill-runtime-source/workflowStatus.js +353 -0
- package/dist/skill-runtime-source/workflowStatus.js.map +1 -0
- package/dist/skills/asset_reference_management/index.d.ts +6 -0
- package/dist/skills/asset_reference_management/index.d.ts.map +1 -0
- package/dist/skills/asset_reference_management/index.js +18 -0
- package/dist/skills/asset_reference_management/index.js.map +1 -0
- package/dist/skills/asset_reference_management/manifest.d.ts +42 -0
- package/dist/skills/asset_reference_management/manifest.d.ts.map +1 -0
- package/dist/skills/asset_reference_management/manifest.js +237 -0
- package/dist/skills/asset_reference_management/manifest.js.map +1 -0
- package/dist/skills/asset_reference_management/modelRefRegistry.d.ts +24 -0
- package/dist/skills/asset_reference_management/modelRefRegistry.d.ts.map +1 -0
- package/dist/skills/asset_reference_management/modelRefRegistry.js +136 -0
- package/dist/skills/asset_reference_management/modelRefRegistry.js.map +1 -0
- package/dist/skills/asset_reference_management/types.d.ts +31 -0
- package/dist/skills/asset_reference_management/types.d.ts.map +1 -0
- package/dist/skills/asset_reference_management/types.js +3 -0
- package/dist/skills/asset_reference_management/types.js.map +1 -0
- package/dist/tools/definitions/add-subtitles/definition.d.ts +4 -0
- package/dist/tools/definitions/add-subtitles/definition.d.ts.map +1 -0
- package/dist/tools/definitions/add-subtitles/definition.js +83 -0
- package/dist/tools/definitions/add-subtitles/definition.js.map +1 -0
- package/dist/tools/definitions/animate-photo/definition.d.ts +3 -0
- package/dist/tools/definitions/animate-photo/definition.d.ts.map +1 -0
- package/dist/tools/definitions/animate-photo/definition.js +124 -0
- package/dist/tools/definitions/animate-photo/definition.js.map +1 -0
- package/dist/tools/definitions/apply-style/definition.d.ts +3 -0
- package/dist/tools/definitions/apply-style/definition.d.ts.map +1 -0
- package/dist/tools/definitions/apply-style/definition.js +50 -0
- package/dist/tools/definitions/apply-style/definition.js.map +1 -0
- package/dist/tools/definitions/change-angle/definition.d.ts +3 -0
- package/dist/tools/definitions/change-angle/definition.d.ts.map +1 -0
- package/dist/tools/definitions/change-angle/definition.js +49 -0
- package/dist/tools/definitions/change-angle/definition.js.map +1 -0
- package/dist/tools/definitions/dance-montage/dances.d.ts +11 -0
- package/dist/tools/definitions/dance-montage/dances.d.ts.map +1 -0
- package/dist/tools/definitions/dance-montage/dances.js +90 -0
- package/dist/tools/definitions/dance-montage/dances.js.map +1 -0
- package/dist/tools/definitions/dance-montage/definition.d.ts +3 -0
- package/dist/tools/definitions/dance-montage/definition.d.ts.map +1 -0
- package/dist/tools/definitions/dance-montage/definition.js +45 -0
- package/dist/tools/definitions/dance-montage/definition.js.map +1 -0
- package/dist/tools/definitions/edit-image/definition.d.ts +3 -0
- package/dist/tools/definitions/edit-image/definition.d.ts.map +1 -0
- package/dist/tools/definitions/edit-image/definition.js +128 -0
- package/dist/tools/definitions/edit-image/definition.js.map +1 -0
- package/dist/tools/definitions/extend-video/definition.d.ts +3 -0
- package/dist/tools/definitions/extend-video/definition.d.ts.map +1 -0
- package/dist/tools/definitions/extend-video/definition.js +51 -0
- package/dist/tools/definitions/extend-video/definition.js.map +1 -0
- package/dist/tools/definitions/generate-image/definition.d.ts +3 -0
- package/dist/tools/definitions/generate-image/definition.d.ts.map +1 -0
- package/dist/tools/definitions/generate-image/definition.js +107 -0
- package/dist/tools/definitions/generate-image/definition.js.map +1 -0
- package/dist/tools/definitions/generate-music/definition.d.ts +3 -0
- package/dist/tools/definitions/generate-music/definition.d.ts.map +1 -0
- package/dist/tools/definitions/generate-music/definition.js +75 -0
- package/dist/tools/definitions/generate-music/definition.js.map +1 -0
- package/dist/tools/definitions/generate-video/definition.d.ts +3 -0
- package/dist/tools/definitions/generate-video/definition.d.ts.map +1 -0
- package/dist/tools/definitions/generate-video/definition.js +120 -0
- package/dist/tools/definitions/generate-video/definition.js.map +1 -0
- package/dist/tools/definitions/index.d.ts +25 -0
- package/dist/tools/definitions/index.d.ts.map +1 -0
- package/dist/tools/definitions/index.js +66 -0
- package/dist/tools/definitions/index.js.map +1 -0
- package/dist/tools/definitions/orbit-video/definition.d.ts +3 -0
- package/dist/tools/definitions/orbit-video/definition.d.ts.map +1 -0
- package/dist/tools/definitions/orbit-video/definition.js +103 -0
- package/dist/tools/definitions/orbit-video/definition.js.map +1 -0
- package/dist/tools/definitions/overlay-video/definition.d.ts +4 -0
- package/dist/tools/definitions/overlay-video/definition.d.ts.map +1 -0
- package/dist/tools/definitions/overlay-video/definition.js +142 -0
- package/dist/tools/definitions/overlay-video/definition.js.map +1 -0
- package/dist/tools/definitions/refine-result/definition.d.ts +3 -0
- package/dist/tools/definitions/refine-result/definition.d.ts.map +1 -0
- package/dist/tools/definitions/refine-result/definition.js +56 -0
- package/dist/tools/definitions/refine-result/definition.js.map +1 -0
- package/dist/tools/definitions/replace-video-segment/definition.d.ts +3 -0
- package/dist/tools/definitions/replace-video-segment/definition.d.ts.map +1 -0
- package/dist/tools/definitions/replace-video-segment/definition.js +65 -0
- package/dist/tools/definitions/replace-video-segment/definition.js.map +1 -0
- package/dist/tools/definitions/restore-photo/definition.d.ts +3 -0
- package/dist/tools/definitions/restore-photo/definition.d.ts.map +1 -0
- package/dist/tools/definitions/restore-photo/definition.js +58 -0
- package/dist/tools/definitions/restore-photo/definition.js.map +1 -0
- package/dist/tools/definitions/sound-to-video/definition.d.ts +3 -0
- package/dist/tools/definitions/sound-to-video/definition.d.ts.map +1 -0
- package/dist/tools/definitions/sound-to-video/definition.js +91 -0
- package/dist/tools/definitions/sound-to-video/definition.js.map +1 -0
- package/dist/tools/definitions/stitch-video/definition.d.ts +4 -0
- package/dist/tools/definitions/stitch-video/definition.d.ts.map +1 -0
- package/dist/tools/definitions/stitch-video/definition.js +89 -0
- package/dist/tools/definitions/stitch-video/definition.js.map +1 -0
- package/dist/tools/definitions/types.d.ts +15 -0
- package/dist/tools/definitions/types.d.ts.map +1 -0
- package/dist/tools/definitions/types.js +3 -0
- package/dist/tools/definitions/types.js.map +1 -0
- package/dist/tools/definitions/video-to-video/definition.d.ts +3 -0
- package/dist/tools/definitions/video-to-video/definition.d.ts.map +1 -0
- package/dist/tools/definitions/video-to-video/definition.js +101 -0
- package/dist/tools/definitions/video-to-video/definition.js.map +1 -0
- package/dist/tools/index.d.ts +22 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +83 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/normalizeArgs.d.ts +2 -0
- package/dist/tools/normalizeArgs.d.ts.map +1 -0
- package/dist/tools/normalizeArgs.js +40 -0
- package/dist/tools/normalizeArgs.js.map +1 -0
- package/dist/tools/result.d.ts +47 -0
- package/dist/tools/result.d.ts.map +1 -0
- package/dist/tools/result.js +38 -0
- package/dist/tools/result.js.map +1 -0
- package/dist/tools/shared/downloadFilename.d.ts +18 -0
- package/dist/tools/shared/downloadFilename.d.ts.map +1 -0
- package/dist/tools/shared/downloadFilename.js +157 -0
- package/dist/tools/shared/downloadFilename.js.map +1 -0
- package/dist/tools/shared/dynamicPromptBranches.d.ts +20 -0
- package/dist/tools/shared/dynamicPromptBranches.d.ts.map +1 -0
- package/dist/tools/shared/dynamicPromptBranches.js +193 -0
- package/dist/tools/shared/dynamicPromptBranches.js.map +1 -0
- package/dist/tools/shared/errorClassification.d.ts +15 -0
- package/dist/tools/shared/errorClassification.d.ts.map +1 -0
- package/dist/tools/shared/errorClassification.js +78 -0
- package/dist/tools/shared/errorClassification.js.map +1 -0
- package/dist/tools/shared/imageEncoding.d.ts +2 -0
- package/dist/tools/shared/imageEncoding.d.ts.map +1 -0
- package/dist/tools/shared/imageEncoding.js +11 -0
- package/dist/tools/shared/imageEncoding.js.map +1 -0
- package/dist/tools/shared/llmHelpers.d.ts +14 -0
- package/dist/tools/shared/llmHelpers.d.ts.map +1 -0
- package/dist/tools/shared/llmHelpers.js +145 -0
- package/dist/tools/shared/llmHelpers.js.map +1 -0
- package/dist/tools/shared/modelRegistry.d.ts +10 -0
- package/dist/tools/shared/modelRegistry.d.ts.map +1 -0
- package/dist/tools/shared/modelRegistry.js +98 -0
- package/dist/tools/shared/modelRegistry.js.map +1 -0
- package/dist/tools/shared/multiImageIntent.d.ts +2 -0
- package/dist/tools/shared/multiImageIntent.d.ts.map +1 -0
- package/dist/tools/shared/multiImageIntent.js +13 -0
- package/dist/tools/shared/multiImageIntent.js.map +1 -0
- package/dist/tools/shared/numberOfVariationsAlignment.d.ts +2 -0
- package/dist/tools/shared/numberOfVariationsAlignment.d.ts.map +1 -0
- package/dist/tools/shared/numberOfVariationsAlignment.js +37 -0
- package/dist/tools/shared/numberOfVariationsAlignment.js.map +1 -0
- package/dist/tools/shared/policyChecks.d.ts +25 -0
- package/dist/tools/shared/policyChecks.d.ts.map +1 -0
- package/dist/tools/shared/policyChecks.js +79 -0
- package/dist/tools/shared/policyChecks.js.map +1 -0
- package/dist/tools/shared/promptRefinementCache.d.ts +10 -0
- package/dist/tools/shared/promptRefinementCache.d.ts.map +1 -0
- package/dist/tools/shared/promptRefinementCache.js +81 -0
- package/dist/tools/shared/promptRefinementCache.js.map +1 -0
- package/dist/tools/shared/promptSanitizer.d.ts +2 -0
- package/dist/tools/shared/promptSanitizer.d.ts.map +1 -0
- package/dist/tools/shared/promptSanitizer.js +73 -0
- package/dist/tools/shared/promptSanitizer.js.map +1 -0
- package/dist/tools/shared/seedancePolicyErrors.d.ts +33 -0
- package/dist/tools/shared/seedancePolicyErrors.d.ts.map +1 -0
- package/dist/tools/shared/seedancePolicyErrors.js +239 -0
- package/dist/tools/shared/seedancePolicyErrors.js.map +1 -0
- package/dist/tools/shared/slotFailureSummary.d.ts +7 -0
- package/dist/tools/shared/slotFailureSummary.d.ts.map +1 -0
- package/dist/tools/shared/slotFailureSummary.js +63 -0
- package/dist/tools/shared/slotFailureSummary.js.map +1 -0
- package/dist/tools/shared/visionDescriptionCache.d.ts +5 -0
- package/dist/tools/shared/visionDescriptionCache.d.ts.map +1 -0
- package/dist/tools/shared/visionDescriptionCache.js +35 -0
- package/dist/tools/shared/visionDescriptionCache.js.map +1 -0
- package/dist/types/index.d.ts +221 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +34 -0
- package/dist/types/index.js.map +1 -0
- package/dist/utils/errors.d.ts +38 -0
- package/dist/utils/errors.d.ts.map +1 -0
- package/dist/utils/errors.js +99 -0
- package/dist/utils/errors.js.map +1 -0
- package/dist/utils/helpers.d.ts +36 -0
- package/dist/utils/helpers.d.ts.map +1 -0
- package/dist/utils/helpers.js +445 -0
- package/dist/utils/helpers.js.map +1 -0
- package/dist/workflows/bindings.d.ts +23 -0
- package/dist/workflows/bindings.d.ts.map +1 -0
- package/dist/workflows/bindings.js +220 -0
- package/dist/workflows/bindings.js.map +1 -0
- package/dist/workflows/executor-ports.d.ts +11 -0
- package/dist/workflows/executor-ports.d.ts.map +1 -0
- package/dist/workflows/executor-ports.js +3 -0
- package/dist/workflows/executor-ports.js.map +1 -0
- package/dist/workflows/executor.d.ts +17 -0
- package/dist/workflows/executor.d.ts.map +1 -0
- package/dist/workflows/executor.js +526 -0
- package/dist/workflows/executor.js.map +1 -0
- package/dist/workflows/index.d.ts +11 -0
- package/dist/workflows/index.d.ts.map +1 -0
- package/dist/workflows/index.js +48 -0
- package/dist/workflows/index.js.map +1 -0
- package/dist/workflows/primitives/index.d.ts +3 -0
- package/dist/workflows/primitives/index.d.ts.map +1 -0
- package/dist/workflows/primitives/index.js +19 -0
- package/dist/workflows/primitives/index.js.map +1 -0
- package/dist/workflows/primitives/retryUntilCondition.d.ts +39 -0
- package/dist/workflows/primitives/retryUntilCondition.d.ts.map +1 -0
- package/dist/workflows/primitives/retryUntilCondition.js +102 -0
- package/dist/workflows/primitives/retryUntilCondition.js.map +1 -0
- package/dist/workflows/primitives/validateWithRubric.d.ts +28 -0
- package/dist/workflows/primitives/validateWithRubric.d.ts.map +1 -0
- package/dist/workflows/primitives/validateWithRubric.js +148 -0
- package/dist/workflows/primitives/validateWithRubric.js.map +1 -0
- package/dist/workflows/summarize.d.ts +7 -0
- package/dist/workflows/summarize.d.ts.map +1 -0
- package/dist/workflows/summarize.js +54 -0
- package/dist/workflows/summarize.js.map +1 -0
- package/dist/workflows/types.d.ts +302 -0
- package/dist/workflows/types.d.ts.map +1 -0
- package/dist/workflows/types.js +3 -0
- package/dist/workflows/types.js.map +1 -0
- package/dist/workflows/validation.d.ts +10 -0
- package/dist/workflows/validation.d.ts.map +1 -0
- package/dist/workflows/validation.js +340 -0
- package/dist/workflows/validation.js.map +1 -0
- package/dist-esm/chatRun/costApproval.js +64 -0
- package/dist-esm/chatRun/costApproval.js.map +1 -0
- package/dist-esm/chatRun/index.js +327 -0
- package/dist-esm/chatRun/index.js.map +1 -0
- package/dist-esm/client/SogniClientWrapper.js +877 -0
- package/dist-esm/client/SogniClientWrapper.js.map +1 -0
- package/dist-esm/context/index.js +453 -0
- package/dist-esm/context/index.js.map +1 -0
- package/dist-esm/contracts/backboneDurableWorkflow.js +90 -0
- package/dist-esm/contracts/backboneDurableWorkflow.js.map +1 -0
- package/dist-esm/contracts/backboneToolCatalog.js +59 -0
- package/dist-esm/contracts/backboneToolCatalog.js.map +1 -0
- package/dist-esm/contracts/composeWorkflowTypes.js +2 -0
- package/dist-esm/contracts/composeWorkflowTypes.js.map +1 -0
- package/dist-esm/contracts/data/costEstimation.js +18 -0
- package/dist-esm/contracts/data/costEstimation.js.map +1 -0
- package/dist-esm/contracts/data/gatingPolicies.js +214 -0
- package/dist-esm/contracts/data/gatingPolicies.js.map +1 -0
- package/dist-esm/contracts/data/gatingPoliciesToolSurface.js +57 -0
- package/dist-esm/contracts/data/gatingPoliciesToolSurface.js.map +1 -0
- package/dist-esm/contracts/data/index.js +23 -0
- package/dist-esm/contracts/data/index.js.map +1 -0
- package/dist-esm/contracts/data/promptContracts.js +1282 -0
- package/dist-esm/contracts/data/promptContracts.js.map +1 -0
- package/dist-esm/contracts/data/repairRecipes.js +193 -0
- package/dist-esm/contracts/data/repairRecipes.js.map +1 -0
- package/dist-esm/contracts/data/toolCatalog.js +122 -0
- package/dist-esm/contracts/data/toolCatalog.js.map +1 -0
- package/dist-esm/contracts/data/toolCostMetadata.js +277 -0
- package/dist-esm/contracts/data/toolCostMetadata.js.map +1 -0
- package/dist-esm/contracts/data/toolPermissions.js +70 -0
- package/dist-esm/contracts/data/toolPermissions.js.map +1 -0
- package/dist-esm/contracts/evaluators.js +463 -0
- package/dist-esm/contracts/evaluators.js.map +1 -0
- package/dist-esm/contracts/hostedComposition.js +128 -0
- package/dist-esm/contracts/hostedComposition.js.map +1 -0
- package/dist-esm/contracts/hostedToolValidation.js +296 -0
- package/dist-esm/contracts/hostedToolValidation.js.map +1 -0
- package/dist-esm/contracts/idLoraPrompt.js +72 -0
- package/dist-esm/contracts/idLoraPrompt.js.map +1 -0
- package/dist-esm/contracts/imagePrompt.js +143 -0
- package/dist-esm/contracts/imagePrompt.js.map +1 -0
- package/dist-esm/contracts/index.js +27 -0
- package/dist-esm/contracts/index.js.map +1 -0
- package/dist-esm/contracts/musicComposition.js +182 -0
- package/dist-esm/contracts/musicComposition.js.map +1 -0
- package/dist-esm/contracts/promptContract.js +34 -0
- package/dist-esm/contracts/promptContract.js.map +1 -0
- package/dist-esm/contracts/promptOverrideMarker.js +2 -0
- package/dist-esm/contracts/promptOverrideMarker.js.map +1 -0
- package/dist-esm/contracts/randomThemes.js +154 -0
- package/dist-esm/contracts/randomThemes.js.map +1 -0
- package/dist-esm/contracts/registry.js +100 -0
- package/dist-esm/contracts/registry.js.map +1 -0
- package/dist-esm/contracts/repairRecipe.js +35 -0
- package/dist-esm/contracts/repairRecipe.js.map +1 -0
- package/dist-esm/contracts/storyboard.js +4 -0
- package/dist-esm/contracts/storyboard.js.map +1 -0
- package/dist-esm/contracts/telemetry.js +33 -0
- package/dist-esm/contracts/telemetry.js.map +1 -0
- package/dist-esm/contracts/toolGatingPolicy.js +60 -0
- package/dist-esm/contracts/toolGatingPolicy.js.map +1 -0
- package/dist-esm/contracts/toolPromptMarkers.js +10 -0
- package/dist-esm/contracts/toolPromptMarkers.js.map +1 -0
- package/dist-esm/contracts/toolSurface.js +110 -0
- package/dist-esm/contracts/toolSurface.js.map +1 -0
- package/dist-esm/contracts/turnPolicy.js +13 -0
- package/dist-esm/contracts/turnPolicy.js.map +1 -0
- package/dist-esm/contracts/videoComposition.js +216 -0
- package/dist-esm/contracts/videoComposition.js.map +1 -0
- package/dist-esm/index.js +10 -0
- package/dist-esm/index.js.map +1 -0
- package/dist-esm/media/aspectRatio.js +65 -0
- package/dist-esm/media/aspectRatio.js.map +1 -0
- package/dist-esm/media/audioReference.js +186 -0
- package/dist-esm/media/audioReference.js.map +1 -0
- package/dist-esm/media/cameraAngle.js +41 -0
- package/dist-esm/media/cameraAngle.js.map +1 -0
- package/dist-esm/media/characterSheet.js +48 -0
- package/dist-esm/media/characterSheet.js.map +1 -0
- package/dist-esm/media/danceMontage.js +29 -0
- package/dist-esm/media/danceMontage.js.map +1 -0
- package/dist-esm/media/enhancementProfiles.js +219 -0
- package/dist-esm/media/enhancementProfiles.js.map +1 -0
- package/dist-esm/media/generationJob.js +87 -0
- package/dist-esm/media/generationJob.js.map +1 -0
- package/dist-esm/media/gptImage.js +150 -0
- package/dist-esm/media/gptImage.js.map +1 -0
- package/dist-esm/media/imageDimensions.js +59 -0
- package/dist-esm/media/imageDimensions.js.map +1 -0
- package/dist-esm/media/index.js +16 -0
- package/dist-esm/media/index.js.map +1 -0
- package/dist-esm/media/musicSettings.js +230 -0
- package/dist-esm/media/musicSettings.js.map +1 -0
- package/dist-esm/media/vendorModelPremium.js +81 -0
- package/dist-esm/media/vendorModelPremium.js.map +1 -0
- package/dist-esm/media/videoAppSettings.js +125 -0
- package/dist-esm/media/videoAppSettings.js.map +1 -0
- package/dist-esm/media/videoContentLimit.js +162 -0
- package/dist-esm/media/videoContentLimit.js.map +1 -0
- package/dist-esm/media/videoReference.js +72 -0
- package/dist-esm/media/videoReference.js.map +1 -0
- package/dist-esm/media/videoSettings.js +191 -0
- package/dist-esm/media/videoSettings.js.map +1 -0
- package/dist-esm/openai-tools/_manifests.generated.js +1789 -0
- package/dist-esm/openai-tools/_manifests.generated.js.map +1 -0
- package/dist-esm/openai-tools/app-tools.json +297 -0
- package/dist-esm/openai-tools/composition-tools.json +228 -0
- package/dist-esm/openai-tools/generation-tools.json +1263 -0
- package/dist-esm/openai-tools/index.js +27 -0
- package/dist-esm/openai-tools/index.js.map +1 -0
- package/dist-esm/package.json +3 -0
- package/dist-esm/public-skill-runtime/index.js +6390 -0
- package/dist-esm/public-skill-runtime/index.js.map +1 -0
- package/dist-esm/replay/index.js +3 -0
- package/dist-esm/replay/index.js.map +1 -0
- package/dist-esm/replay/redact.js +102 -0
- package/dist-esm/replay/redact.js.map +1 -0
- package/dist-esm/replay/types.js +20 -0
- package/dist-esm/replay/types.js.map +1 -0
- package/dist-esm/runtime/chatTypes.js +2 -0
- package/dist-esm/runtime/chatTypes.js.map +1 -0
- package/dist-esm/runtime/durableWorkflowClient.js +295 -0
- package/dist-esm/runtime/durableWorkflowClient.js.map +1 -0
- package/dist-esm/runtime/index.js +2 -0
- package/dist-esm/runtime/index.js.map +1 -0
- package/dist-esm/schemas/errors/error.schema.json +21 -0
- package/dist-esm/schemas/errors/repair-control.schema.json +40 -0
- package/dist-esm/schemas/events/artifact-reference.schema.json +22 -0
- package/dist-esm/schemas/events/progress-event.schema.json +28 -0
- package/dist-esm/schemas/events/workflow-event.schema.json +22 -0
- package/dist-esm/schemas/storyboards/storyboard-planning-contract.schema.json +108 -0
- package/dist-esm/schemas/tools/add_subtitles.schema.json +77 -0
- package/dist-esm/schemas/tools/animate_photo.schema.json +104 -0
- package/dist-esm/schemas/tools/apply_style.schema.json +37 -0
- package/dist-esm/schemas/tools/change_angle.schema.json +30 -0
- package/dist-esm/schemas/tools/compose_instrumental.schema.json +24 -0
- package/dist-esm/schemas/tools/compose_lyrics.schema.json +28 -0
- package/dist-esm/schemas/tools/compose_script.schema.json +68 -0
- package/dist-esm/schemas/tools/compose_workflow.schema.json +67 -0
- package/dist-esm/schemas/tools/compose_workflow_template.schema.json +156 -0
- package/dist-esm/schemas/tools/dance_montage.schema.json +47 -0
- package/dist-esm/schemas/tools/edit_image.schema.json +74 -0
- package/dist-esm/schemas/tools/enhance_prompt.schema.json +76 -0
- package/dist-esm/schemas/tools/extend_video.schema.json +42 -0
- package/dist-esm/schemas/tools/generate_image.schema.json +104 -0
- package/dist-esm/schemas/tools/generate_music.schema.json +62 -0
- package/dist-esm/schemas/tools/generate_video.schema.json +97 -0
- package/dist-esm/schemas/tools/orbit_video.schema.json +70 -0
- package/dist-esm/schemas/tools/overlay_video.schema.json +126 -0
- package/dist-esm/schemas/tools/refine_result.schema.json +43 -0
- package/dist-esm/schemas/tools/replace_video_segment.schema.json +60 -0
- package/dist-esm/schemas/tools/restore_photo.schema.json +47 -0
- package/dist-esm/schemas/tools/sound_to_video.schema.json +70 -0
- package/dist-esm/schemas/tools/stitch_video.schema.json +52 -0
- package/dist-esm/schemas/tools/video_to_video.schema.json +77 -0
- package/dist-esm/schemas/workflows/durable-workflow-run.schema.json +165 -0
- package/dist-esm/schemas/workflows/durable-workflow-step.schema.json +141 -0
- package/dist-esm/skill-runtime-source/crossSurfaceParity.js +469 -0
- package/dist-esm/skill-runtime-source/crossSurfaceParity.js.map +1 -0
- package/dist-esm/skill-runtime-source/index.js +4 -0
- package/dist-esm/skill-runtime-source/index.js.map +1 -0
- package/dist-esm/skill-runtime-source/seedanceAudioWindow.js +47 -0
- package/dist-esm/skill-runtime-source/seedanceAudioWindow.js.map +1 -0
- package/dist-esm/skill-runtime-source/workflowStatus.js +348 -0
- package/dist-esm/skill-runtime-source/workflowStatus.js.map +1 -0
- package/dist-esm/skills/asset_reference_management/index.js +3 -0
- package/dist-esm/skills/asset_reference_management/index.js.map +1 -0
- package/dist-esm/skills/asset_reference_management/manifest.js +228 -0
- package/dist-esm/skills/asset_reference_management/manifest.js.map +1 -0
- package/dist-esm/skills/asset_reference_management/modelRefRegistry.js +129 -0
- package/dist-esm/skills/asset_reference_management/modelRefRegistry.js.map +1 -0
- package/dist-esm/skills/asset_reference_management/types.js +2 -0
- package/dist-esm/skills/asset_reference_management/types.js.map +1 -0
- package/dist-esm/tools/definitions/add-subtitles/definition.js +80 -0
- package/dist-esm/tools/definitions/add-subtitles/definition.js.map +1 -0
- package/dist-esm/tools/definitions/animate-photo/definition.js +121 -0
- package/dist-esm/tools/definitions/animate-photo/definition.js.map +1 -0
- package/dist-esm/tools/definitions/apply-style/definition.js +47 -0
- package/dist-esm/tools/definitions/apply-style/definition.js.map +1 -0
- package/dist-esm/tools/definitions/change-angle/definition.js +46 -0
- package/dist-esm/tools/definitions/change-angle/definition.js.map +1 -0
- package/dist-esm/tools/definitions/dance-montage/dances.js +86 -0
- package/dist-esm/tools/definitions/dance-montage/dances.js.map +1 -0
- package/dist-esm/tools/definitions/dance-montage/definition.js +42 -0
- package/dist-esm/tools/definitions/dance-montage/definition.js.map +1 -0
- package/dist-esm/tools/definitions/edit-image/definition.js +125 -0
- package/dist-esm/tools/definitions/edit-image/definition.js.map +1 -0
- package/dist-esm/tools/definitions/extend-video/definition.js +48 -0
- package/dist-esm/tools/definitions/extend-video/definition.js.map +1 -0
- package/dist-esm/tools/definitions/generate-image/definition.js +104 -0
- package/dist-esm/tools/definitions/generate-image/definition.js.map +1 -0
- package/dist-esm/tools/definitions/generate-music/definition.js +72 -0
- package/dist-esm/tools/definitions/generate-music/definition.js.map +1 -0
- package/dist-esm/tools/definitions/generate-video/definition.js +117 -0
- package/dist-esm/tools/definitions/generate-video/definition.js.map +1 -0
- package/dist-esm/tools/definitions/index.js +41 -0
- package/dist-esm/tools/definitions/index.js.map +1 -0
- package/dist-esm/tools/definitions/orbit-video/definition.js +100 -0
- package/dist-esm/tools/definitions/orbit-video/definition.js.map +1 -0
- package/dist-esm/tools/definitions/overlay-video/definition.js +139 -0
- package/dist-esm/tools/definitions/overlay-video/definition.js.map +1 -0
- package/dist-esm/tools/definitions/refine-result/definition.js +53 -0
- package/dist-esm/tools/definitions/refine-result/definition.js.map +1 -0
- package/dist-esm/tools/definitions/replace-video-segment/definition.js +62 -0
- package/dist-esm/tools/definitions/replace-video-segment/definition.js.map +1 -0
- package/dist-esm/tools/definitions/restore-photo/definition.js +55 -0
- package/dist-esm/tools/definitions/restore-photo/definition.js.map +1 -0
- package/dist-esm/tools/definitions/sound-to-video/definition.js +88 -0
- package/dist-esm/tools/definitions/sound-to-video/definition.js.map +1 -0
- package/dist-esm/tools/definitions/stitch-video/definition.js +86 -0
- package/dist-esm/tools/definitions/stitch-video/definition.js.map +1 -0
- package/dist-esm/tools/definitions/types.js +2 -0
- package/dist-esm/tools/definitions/types.js.map +1 -0
- package/dist-esm/tools/definitions/video-to-video/definition.js +98 -0
- package/dist-esm/tools/definitions/video-to-video/definition.js.map +1 -0
- package/dist-esm/tools/index.js +18 -0
- package/dist-esm/tools/index.js.map +1 -0
- package/dist-esm/tools/normalizeArgs.js +37 -0
- package/dist-esm/tools/normalizeArgs.js.map +1 -0
- package/dist-esm/tools/result.js +31 -0
- package/dist-esm/tools/result.js.map +1 -0
- package/dist-esm/tools/shared/downloadFilename.js +147 -0
- package/dist-esm/tools/shared/downloadFilename.js.map +1 -0
- package/dist-esm/tools/shared/dynamicPromptBranches.js +183 -0
- package/dist-esm/tools/shared/dynamicPromptBranches.js.map +1 -0
- package/dist-esm/tools/shared/errorClassification.js +74 -0
- package/dist-esm/tools/shared/errorClassification.js.map +1 -0
- package/dist-esm/tools/shared/imageEncoding.js +8 -0
- package/dist-esm/tools/shared/imageEncoding.js.map +1 -0
- package/dist-esm/tools/shared/llmHelpers.js +137 -0
- package/dist-esm/tools/shared/llmHelpers.js.map +1 -0
- package/dist-esm/tools/shared/modelRegistry.js +91 -0
- package/dist-esm/tools/shared/modelRegistry.js.map +1 -0
- package/dist-esm/tools/shared/multiImageIntent.js +10 -0
- package/dist-esm/tools/shared/multiImageIntent.js.map +1 -0
- package/dist-esm/tools/shared/numberOfVariationsAlignment.js +34 -0
- package/dist-esm/tools/shared/numberOfVariationsAlignment.js.map +1 -0
- package/dist-esm/tools/shared/policyChecks.js +73 -0
- package/dist-esm/tools/shared/policyChecks.js.map +1 -0
- package/dist-esm/tools/shared/promptRefinementCache.js +70 -0
- package/dist-esm/tools/shared/promptRefinementCache.js.map +1 -0
- package/dist-esm/tools/shared/promptSanitizer.js +70 -0
- package/dist-esm/tools/shared/promptSanitizer.js.map +1 -0
- package/dist-esm/tools/shared/seedancePolicyErrors.js +232 -0
- package/dist-esm/tools/shared/seedancePolicyErrors.js.map +1 -0
- package/dist-esm/tools/shared/slotFailureSummary.js +60 -0
- package/dist-esm/tools/shared/slotFailureSummary.js.map +1 -0
- package/dist-esm/tools/shared/visionDescriptionCache.js +29 -0
- package/dist-esm/tools/shared/visionDescriptionCache.js.map +1 -0
- package/dist-esm/types/index.js +31 -0
- package/dist-esm/types/index.js.map +1 -0
- package/dist-esm/utils/errors.js +86 -0
- package/dist-esm/utils/errors.js.map +1 -0
- package/dist-esm/utils/helpers.js +419 -0
- package/dist-esm/utils/helpers.js.map +1 -0
- package/dist-esm/workflows/bindings.js +212 -0
- package/dist-esm/workflows/bindings.js.map +1 -0
- package/dist-esm/workflows/executor-ports.js +2 -0
- package/dist-esm/workflows/executor-ports.js.map +1 -0
- package/dist-esm/workflows/executor.js +522 -0
- package/dist-esm/workflows/executor.js.map +1 -0
- package/dist-esm/workflows/index.js +8 -0
- package/dist-esm/workflows/index.js.map +1 -0
- package/dist-esm/workflows/primitives/index.js +3 -0
- package/dist-esm/workflows/primitives/index.js.map +1 -0
- package/dist-esm/workflows/primitives/retryUntilCondition.js +95 -0
- package/dist-esm/workflows/primitives/retryUntilCondition.js.map +1 -0
- package/dist-esm/workflows/primitives/validateWithRubric.js +141 -0
- package/dist-esm/workflows/primitives/validateWithRubric.js.map +1 -0
- package/dist-esm/workflows/summarize.js +51 -0
- package/dist-esm/workflows/summarize.js.map +1 -0
- package/dist-esm/workflows/types.js +2 -0
- package/dist-esm/workflows/types.js.map +1 -0
- package/dist-esm/workflows/validation.js +330 -0
- package/dist-esm/workflows/validation.js.map +1 -0
- package/media/index.d.ts +1 -0
- package/media/index.js +2 -0
- package/package.json +213 -0
- package/public-skill-runtime/index.d.ts +1 -0
- package/public-skill-runtime/index.js +2 -0
- package/replay/index.d.ts +1 -0
- package/replay/index.js +2 -0
- package/runtime/index.d.ts +1 -0
- package/runtime/index.js +2 -0
- package/skills/asset_reference_management/index.d.ts +1 -0
- package/skills/asset_reference_management/index.js +2 -0
- package/src/skill-runtime-source/crossSurfaceParity.ts +525 -0
- package/src/skill-runtime-source/index.ts +10 -0
- package/src/skill-runtime-source/seedanceAudioWindow.ts +61 -0
- package/src/skill-runtime-source/workflowStatus.ts +375 -0
- package/tools/index.d.ts +1 -0
- package/tools/index.js +2 -0
- package/workflows/index.d.ts +1 -0
- package/workflows/index.js +2 -0
|
@@ -0,0 +1,1263 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "2026-04-27.1",
|
|
3
|
+
"source": "sogni-creative-agent/src/tools/definitions/*/definition.ts",
|
|
4
|
+
"schemaRefs": {
|
|
5
|
+
"generate_image": "../schemas/tools/generate_image.schema.json",
|
|
6
|
+
"generate_video": "../schemas/tools/generate_video.schema.json",
|
|
7
|
+
"generate_music": "../schemas/tools/generate_music.schema.json",
|
|
8
|
+
"edit_image": "../schemas/tools/edit_image.schema.json",
|
|
9
|
+
"apply_style": "../schemas/tools/apply_style.schema.json",
|
|
10
|
+
"restore_photo": "../schemas/tools/restore_photo.schema.json",
|
|
11
|
+
"refine_result": "../schemas/tools/refine_result.schema.json",
|
|
12
|
+
"animate_photo": "../schemas/tools/animate_photo.schema.json",
|
|
13
|
+
"change_angle": "../schemas/tools/change_angle.schema.json",
|
|
14
|
+
"video_to_video": "../schemas/tools/video_to_video.schema.json",
|
|
15
|
+
"stitch_video": "../schemas/tools/stitch_video.schema.json",
|
|
16
|
+
"orbit_video": "../schemas/tools/orbit_video.schema.json",
|
|
17
|
+
"dance_montage": "../schemas/tools/dance_montage.schema.json",
|
|
18
|
+
"sound_to_video": "../schemas/tools/sound_to_video.schema.json",
|
|
19
|
+
"extend_video": "../schemas/tools/extend_video.schema.json",
|
|
20
|
+
"replace_video_segment": "../schemas/tools/replace_video_segment.schema.json",
|
|
21
|
+
"overlay_video": "../schemas/tools/overlay_video.schema.json",
|
|
22
|
+
"add_subtitles": "../schemas/tools/add_subtitles.schema.json"
|
|
23
|
+
},
|
|
24
|
+
"tools": [
|
|
25
|
+
{
|
|
26
|
+
"type": "function",
|
|
27
|
+
"function": {
|
|
28
|
+
"name": "generate_image",
|
|
29
|
+
"description": "Generate a new image from a text description. Usually this is text-only: do NOT use this tool when the user expects an existing image to be reused or preserved in the result. That includes (a) people from My Personas, and (b) uploaded assets such as logos, brand marks, mascots, product shots, photos, screenshots, sketches, character designs, or other reference images they want carried through. Use edit_image with sourceImageIndex=-1 (or the appropriate generated index) instead. Exception: when the user explicitly requests Z-image, Z Image, or Z-image Turbo for an uploaded-image enhancement/image-to-image request, use this tool with model=\"z-turbo\" or model=\"z-image\", sourceImageIndex=-1, and starting_image_strength because edit_image does not expose Z-image models.",
|
|
30
|
+
"parameters": {
|
|
31
|
+
"type": "object",
|
|
32
|
+
"properties": {
|
|
33
|
+
"prompt": {
|
|
34
|
+
"type": "string",
|
|
35
|
+
"description": "Text description of the image (50-200 words). POSITIVE phrasing only. Be specific and vivid — reference real artists, franchises, and aesthetics by name.\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing.\n\nPROMPT ORDER (follow this structure): [SUBJECT] → [ATTRIBUTES] → [ACTION/POSE] → [CAMERA/FRAMING] → [ENVIRONMENT] → [LIGHTING] → [STYLE/MEDIUM] → [MATERIALS/TEXTURES] → [SECONDARY DETAILS]. Always lead with the main subject and its concrete, observable attributes — never start with mood or atmosphere. Put the most visually decisive details early.\n\nSPECIFICITY: Use concrete nouns and observable adjectives (\"weathered leather jacket\", not \"cool outfit\"). Specify framing (close-up, medium shot, full body, wide shot), angle (eye level, low angle, high angle, overhead), lighting type (\"soft overcast daylight\", \"warm golden-hour sunlight\", \"moody neon spill with deep shadows\"), and medium/style (\"photorealistic editorial photography\", \"cinematic still frame\", \"clean anime illustration\"). Include materials and textures when relevant (\"brushed aluminum\", \"wet asphalt reflections\", \"heavy wool texture\").\n\nDEFAULTS (fill in when user is underspecified): Framing: medium shot for portraits, wide shot for environments, full-body for fashion/outfits. Angle: eye level unless dramatic perspective requested. Lighting: soft natural light for realism, clean studio light for product shots. Style: photorealistic for realistic models, matching the model's native style for stylized models (e.g. anime illustration for pony/animagine). 
Reference real artists and franchises by name (\"in the style of Monet's Water Lilies\", \"Wes Anderson symmetrical pastel composition\", \"cyberpunk Blade Runner neon city\", \"shot on 85mm f/1.4 with shallow depth of field\").\n\nAVOID: Starting with abstract mood words alone. Burying the subject after a long style preamble. Stacking incompatible styles. Overloading with competing focal points. Vague phrases like \"very cool\" or \"epic vibes\".\n\nCHARACTER / MASCOT SHEETS: When the user asks for a character sheet, mascot sheet, model sheet, turnaround, expression sheet, or reusable character reference board, create ONE comprehensive professional reference-board image, not separate variations. Include a large hero pose, front / 3/4 / side / back turnaround views, an expression row, action/personality poses, accessories or props, color palette swatches, and compact notes such as personality, fun facts, or brand usage when appropriate. Preserve exact user-provided brand names, slogans, logo text, and requested copy verbatim; incidental tiny notes may be generated by the image model if the user did not provide exact wording. Keep the character consistent across every panel and use clean readable typography.\n\nBATCH VARIATIONS: When numberOfVariations > 1, the prompt must describe ONE subject in ONE scene — never mention counts, \"versions\", \"different\", or \"multiple\" in the prompt text. NEVER describe multiple copies or duplicates of the subject in a single image (no grids, collages, or side-by-side). Use Dynamic Prompt syntax to vary ONE dimension across separate images. Example: user asks \"4 cats in different spots\" → numberOfVariations=4, prompt=\"a black cat {lounging in a sunlit window|prowling through autumn leaves|sitting on a vintage bookshelf|curled up by a fireplace}\" — each output is ONE cat in ONE spot. Vary setting, style, lighting, expression, or composition — never override what the user specified. 
Preserve any requested orientation, aspect ratio, or exact pixel dimensions across every variation.\n\nSELECTION-GATED IMAGE STAGES: If the user asks for multiple image options/takes/versions and says they will pick one before a later dance, animation, or video, this tool call is still the first step. Generate the complete image batch now with the exact requested count, Dynamic Prompt options for each output, and the final video/image aspect ratio. Do not ask the user to choose before the images exist, and do not call video tools until after the user selects an image.\n\nLINKED VARIANTS: If multiple details must stay paired per output — visual style, outfit, label text, symbol, setting, character, prop, location, or before/after keyframe details — use ONE top-level Dynamic Prompt branch with one complete prompt per output. Do NOT use separate Dynamic Prompt groups for details that must stay together; unpaired groups can mix attributes. If the user asks for per-variant facial, identity, or appearance changes, repeat that guidance inside EVERY option. When the user names a subject or character, write that name or stable role inside every Dynamic Prompt option; a shared prefix outside the branch is not enough because each option must stand alone. Correct shape: \"{full prompt for variant 1 with all paired details|full prompt for variant 2 with all paired details|...}\".\n\nSCREENPLAY / STORYBOARD BATCHES: For multi-scene commercials, storyboards, or shot lists, numberOfVariations must equal the scene count and the prompt MUST be a SINGLE top-level dynamic branch containing one full scene prompt per option, e.g. \"{scene 1 full prompt|scene 2 full prompt|scene 3 full prompt}\". This is the required way to batch scenes with materially different content while still rendering one image per scene. 
If recurring characters appear, use stable character names and repeat the same visual anchors in every scene option where they appear (age range, build, hairstyle, outfit silhouette, color palette, signature prop/accessory, posture). Do not rename, merge, redesign, or drift characters between scene keyframes unless the user asks. Include speaker-tagged dialogue details when dialogue affects the keyframe, e.g. CHARACTER: \"We made it.\" NEVER set numberOfVariations=N with only scene 1's prompt — that creates N duplicate versions of scene 1, not N scenes. If the scene count is 16 or fewer, do it in ONE call. Do NOT generate scene 1 first or split into smaller batches unless the user explicitly asks.\n\nCOMPOSITE GPT IMAGE 2 STORYBOARD SHEETS: When numberOfVariations=1 and the user asks for one composite video storyboard/keyframe sheet, the prompt must be a compiled storyboard prompt, not a concept summary. Include a SCENES: section with exactly the requested number of concrete entries named SCENE_01, SCENE_02, etc. Every scene entry must include Visual/Action, Camera/Motion, Dialogue/VO (or [no dialogue]), Audio/SFX, and any visible text or reference usage for that scene. Do not provide only the source brief or generic layout instructions; malformed compiled storyboard prompts are blocked by quality audit.\n\nVIDEO KEYFRAMES: When generating images intended as first+last frames for video (animate_photo with frameRole=\"both\"), use numberOfVariations=2 with Dynamic Prompts to create both frames in one call. Make each frame a distinct scene that creates a compelling transition. The video handler will inspect both generated frames and build a scene-aware transition prompt, so focus this image prompt on producing strong start/end visuals. Example: \"a serene lake {at dawn with mist rising and soft pink sky|at dusk with fireflies and deep blue twilight}\"."
|
|
36
|
+
},
|
|
37
|
+
"model": {
|
|
38
|
+
"type": "string",
|
|
39
|
+
"enum": [
|
|
40
|
+
"gpt-image-2",
|
|
41
|
+
"z-turbo",
|
|
42
|
+
"z-image",
|
|
43
|
+
"chroma-v46-flash",
|
|
44
|
+
"chroma-detail",
|
|
45
|
+
"flux1-krea",
|
|
46
|
+
"flux2",
|
|
47
|
+
"pony-v7",
|
|
48
|
+
"qwen-2512",
|
|
49
|
+
"qwen-2512-lightning",
|
|
50
|
+
"albedo-xl",
|
|
51
|
+
"animagine-xl",
|
|
52
|
+
"anima-pencil-xl",
|
|
53
|
+
"art-universe-xl",
|
|
54
|
+
"hyphoria-real",
|
|
55
|
+
"analog-madness-xl",
|
|
56
|
+
"cyberrealistic-xl",
|
|
57
|
+
"real-dream-xl",
|
|
58
|
+
"faetastic-xl",
|
|
59
|
+
"zavychroma-xl",
|
|
60
|
+
"pony-faetality",
|
|
61
|
+
"dreamshaper-xl"
|
|
62
|
+
],
|
|
63
|
+
"description": "DO NOT SET THIS PARAMETER unless the user names a specific model, asks for a very complex image render, asks for a video storyboard/storyboard sheet/contact sheet/panel layout image, or explicitly asks for Z-image/Z-image Turbo image-to-image. The app auto-selects based on quality settings. Set \"gpt-image-2\" when the user asks for a ChatGPT, OpenAI, GPT, GPT-2, GPT Image, or gpt-image-2 image/model, when they explicitly request very strong text rendering, or by default for complex single-image renders that need dense labels, crisp typography, multi-panel composition, timing notes, foley notes, professional storyboard-sheet layout, or a comprehensive character/mascot/model sheet with turnarounds, expressions, accessories, palette swatches, and brand notes. Set \"z-turbo\" when the user asks for Z-image Turbo; set \"z-image\" when they ask for Z-image without Turbo. If the user names another image model, honor that requested model instead. A model preference usually does not change which tool to use; the Z-image image-to-image exception uses sourceImageIndex plus starting_image_strength on this tool. NSFW rule: \"gpt-image-2\"/\"flux2\"/\"flux1-krea\" CANNOT do nudity — use \"pony-v7\", \"chroma-detail\", \"chroma-v46-flash\", or \"z-turbo\" instead."
|
|
64
|
+
},
|
|
65
|
+
"width": {
|
|
66
|
+
"type": "number",
|
|
67
|
+
"description": "Output image width in pixels. Default: 1024. Supported range is 256-2560 for default Z/Qwen/Flux.2 image models and 256-2048 for legacy/specialized image models. For gpt-image-2, dimensions are flexible up to 3840px on either edge with max 3:1 aspect ratio and a total pixel budget from 655,360 to 8,294,400; the renderer snaps to the nearest valid multiple-of-16 size. Set when the user specifies a width, exact pixel dimensions, or a named resolution (e.g., \"1280 wide\", \"1280x720\", \"720p\", \"1080x1920\", \"3840x2160\"). If the user gives only one dimension, set only that dimension and preserve/infer the sensible aspect ratio. User-requested dimensions override the default media quality, including Pro. Non-multiple-of-16 values are accepted when in bounds, so do not ask the user to adjust by a few pixels."
|
|
68
|
+
},
|
|
69
|
+
"height": {
|
|
70
|
+
"type": "number",
|
|
71
|
+
"description": "Output image height in pixels. Default: 1024. Supported range is 256-2560 for default Z/Qwen/Flux.2 image models and 256-2048 for legacy/specialized image models. For gpt-image-2, dimensions are flexible up to 3840px on either edge with max 3:1 aspect ratio and a total pixel budget from 655,360 to 8,294,400; the renderer snaps to the nearest valid multiple-of-16 size. Set when the user specifies a height, exact pixel dimensions, or a named resolution (e.g., \"720 high\", \"1280x720\", \"720p\", \"1080x1920\", \"2160x3840\"). If the user gives only one dimension, set only that dimension and preserve/infer the sensible aspect ratio. User-requested dimensions override the default media quality, including Pro. Non-multiple-of-16 values are accepted when in bounds, so do not ask the user to adjust by a few pixels."
|
|
72
|
+
},
|
|
73
|
+
"numberOfVariations": {
|
|
74
|
+
"type": "number",
|
|
75
|
+
"description": "Number of variations (1-16). Use the user's exact requested count in one call whenever they ask for multiple images/options/takes/versions, including images that will feed a later video after the user picks one. For screenplay/storyboard batches, this must equal the scene count AND the prompt must contain one Dynamic Prompt branch with one full scene prompt per scene; never set numberOfVariations=N with only one scene prompt. Default: 1.",
|
|
76
|
+
"minimum": 1,
|
|
77
|
+
"maximum": 16
|
|
78
|
+
},
|
|
79
|
+
"negativePrompt": {
|
|
80
|
+
"type": "string",
|
|
81
|
+
"description": "Things to avoid in the generated image. Only set when the user explicitly mentions what to avoid. E.g., \"no watermarks, no text, no blurry edges\"."
|
|
82
|
+
},
|
|
83
|
+
"starting_image_strength": {
|
|
84
|
+
"type": "number",
|
|
85
|
+
"description": "Image-to-image strength (0.0-1.0). Only used when a source image is available and model supports img2img. Higher values = more deviation from the source image. 0.35 = conservative enhancement, 0.5 = balanced, 0.8 = creative. Set this with sourceImageIndex when the user explicitly requests Z-image/Z-image Turbo enhancement or any supported img2img starting-image workflow."
|
|
86
|
+
},
|
|
87
|
+
"sourceImageIndex": {
|
|
88
|
+
"type": "number",
|
|
89
|
+
"description": "Which result image to use as starting image for img2img (0-based index). -1 = original upload. Omit to auto-select latest result. Only relevant when starting_image_strength is set."
|
|
90
|
+
},
|
|
91
|
+
"seed": {
|
|
92
|
+
"type": "integer",
|
|
93
|
+
"description": "Random seed for reproducibility. Use -1 for random (default). Set a specific seed when the user wants to reproduce a previous result."
|
|
94
|
+
},
|
|
95
|
+
"guidance": {
|
|
96
|
+
"type": "number",
|
|
97
|
+
"description": "Guidance scale override. Higher values = more prompt adherence. Model-specific defaults are used if omitted. Only set when the user explicitly requests a guidance value."
|
|
98
|
+
},
|
|
99
|
+
"gptImageQuality": {
|
|
100
|
+
"type": "string",
|
|
101
|
+
"enum": [
|
|
102
|
+
"low",
|
|
103
|
+
"medium",
|
|
104
|
+
"high",
|
|
105
|
+
"auto"
|
|
106
|
+
],
|
|
107
|
+
"description": "Optional GPT Image 2 rendering quality. Only set with model=\"gpt-image-2\" when the user explicitly asks for low/fast, medium/balanced, high/final, or auto quality. Otherwise omit it and let the host app media quality setting map Fast to low, HQ to medium, and Pro to high."
|
|
108
|
+
},
|
|
109
|
+
"outputFormat": {
|
|
110
|
+
"type": "string",
|
|
111
|
+
"enum": [
|
|
112
|
+
"png",
|
|
113
|
+
"jpg",
|
|
114
|
+
"jpeg",
|
|
115
|
+
"webp"
|
|
116
|
+
],
|
|
117
|
+
"description": "Optional output file format for generated images. Set only when the user explicitly requests PNG, JPG/JPEG, or WebP. Hosts should normalize \"jpeg\" to the Sogni project format \"jpg\"."
|
|
118
|
+
},
|
|
119
|
+
"aspectRatio": {
|
|
120
|
+
"type": "string",
|
|
121
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". 
Never set for generic requests like \"make an image\" or \"make a video\".\n\nSet this whenever the user specifies an image or downstream video orientation/aspect ratio such as 9:16, 16:9, portrait, vertical, landscape, widescreen, TikTok/Reels/Shorts, or exact pixels. This includes selection-gated image batches that will feed a later video or dance after the user picks one. For GPT Image 2 exact size requests, preserve exact pixel intent when possible and prefer popular GPT sizes such as 1536x1024, 1024x1536, 2048x1152, 3840x2160, and 2160x3840. GPT Image 2 does not support transparent-background output; do not promise a transparent result for this model."
|
|
122
|
+
}
|
|
123
|
+
},
|
|
124
|
+
"required": [
|
|
125
|
+
"prompt"
|
|
126
|
+
]
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
},
|
|
130
|
+
{
|
|
131
|
+
"type": "function",
|
|
132
|
+
"function": {
|
|
133
|
+
"name": "generate_video",
|
|
134
|
+
"description": "Generate a video from text or Seedance multimodal references. LTX 2.3 generates audio natively (dialogue, sounds, ambient music) — describe audio in the prompt. If the user provides exact speech, include it in double quotes; if they only imply speech, describe the performance and voice without inventing quoted words. Never use placeholders such as \"while speaking\", \"dialogue begins\", \"explaining\", or \"final line lands\". PERSONA VOICE: Only when the user explicitly asks to use/clone a registered persona voice clip, call resolve_personas first, then set voicePersonaName to select which persona's voice clip to use. Do not set voicePersonaName for ordinary character dialogue or inferred voices; describe those voices in the prompt for native LTX audio. For cross-persona narration (e.g. David narrates a video of Aleyna), resolve both personas and set voicePersonaName to the narrator only if that registered voice was requested. Persona voice requires ltx23 (WAN 2.2 does not support voice identity). For non-Seedance syncing to a specific song or audio track, use sound_to_video instead. For non-Seedance animation from a locked source photo, use animate_photo. Do NOT use for My Personas unless generating a Seedance reference-based video — standard persona videos use resolve_personas → edit_image → animate_photo. SEEDANCE DEFAULT: For seedance2 or seedance2-fast, default to exactly one 4-15s video unless the user explicitly asks for multiple separate outputs. Multiple beats, shots, or scene descriptions in one up-to-15s Seedance prompt are still one video. If the user requests one continuous Seedance video longer than 15s, preserve the requested total duration in the prompt/context; chat orchestration should split it into supported segment renders and stitch them instead of clamping it to a 15s excerpt. 
Uploaded/generated storyboard, shot-sheet, or trailer-concept images used as Seedance references should become one Seedance generate_video call by default; do not extract panels with edit_image and do not animate the storyboard sheet with LTX unless the user explicitly asks for separate non-Seedance clips. Seedance loose image, video, and audio references go through this tool; do not use animate_photo sourceImageIndex/frameRole/endImageIndex for Seedance. If an uploaded video is the source clip to transform, upscale, enhance, restyle, or remaster, use video_to_video with controlMode=\"seedance-v2v\" instead of generate_video referenceVideoIndices. If the uploaded audio is the primary sync target, lip-sync target, or requested as sound-to-video/audio-sync, use sound_to_video with videoModel=\"seedance2\" instead of this tool. Use referenceAudioIndices here only when audio is a loose reference under an image/video-anchored Seedance shot. For Seedance, every image — first frame, last frame, or loose reference — is passed through referenceImageIndices (auto-uploaded as referenceImageUrls). Anchor frame intent in the prompt with @Image tags such as \"Use @Image1 as the opening shot reference. Begin the video with a composition, subject placement, lighting, mood, and camera framing that closely match @Image1.\" (or @Image2 as the final shot reference). For seamless-loop or \"first frame and last frame identical\" requests with a single uploaded image, anchor it explicitly as both: \"Use @Image1 as both the first frame and last frame so the video loops cleanly back to the opening composition.\" Assign each useful @Image/@Video/@Audio tag a role. APPROVED STORYBOARD PRODUCTION: When the user asks for a production workflow from an approved storyboard, the chat orchestrator should use the durable CampaignStoryboard contract: render the composite board, audit it, generate per-scene GPT Image 2 keyframes, then render Seedance scene clips and stitch them. 
Do not replace that with a generic storyboard-reference video unless the user asks for a fast draft. PARTIAL VIDEO EDITS: Do NOT call generate_video to re-render an existing rendered/uploaded video just to change part of it (the bumper, the intro, the end card, a single scene, the last few seconds, etc.). Use replace_video_segment for that — it preserves the unchanged portion, keeps the original audio outside the replaced window, and costs far less. Likewise use extend_video to add new time to the end without rewriting the rest. If the request is vague, ask about vision/mood/style first. Only call once you have clear creative intent.",
|
|
135
|
+
"parameters": {
|
|
136
|
+
"type": "object",
|
|
137
|
+
"properties": {
|
|
138
|
+
"prompt": {
|
|
139
|
+
"type": "string",
|
|
140
|
+
"description": "Write one flowing paragraph like a cinematographer describing a shot. Present tense, specific natural language. Longer clips need longer prompts; close-ups need more detail than wide shots.\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing. Set skipPromptProcessing=true; for Seedance also set expandPrompt=false.\n\nSTRUCTURE: shot/style → subject (age, clothing, hairstyle, distinguishing details) → environment, lighting, atmosphere → action beat by beat → camera movement → audio and dialogue.\n\nCAST CONTINUITY: For screenplay, script, storyboard, commercial, series, or other longer-form video tasks with recurring characters, use stable character names and repeat the same visual anchors every time they appear (age range, build, hairstyle, outfit silhouette, color palette, signature prop/accessory, posture, voice). Do not rename, merge, redesign, or drift characters between scenes unless the user asks.\n\nMOTION PACING: Scale complexity to duration. <=6s: 1 main action beat + 1 simple camera move. Around 10s: 2-3 clear action beats + 1 camera move. >10s: up to 4 action beats in clear sequence. Prefer fewer readable beats over dense micro-actions, especially in short clips.\n\nBLOCKING: Direct the layout like scene blocking. State left/right placement, foreground/background, facing toward/away, and relative distance when multiple subjects or important objects are involved.\n\nACTION: Drive motion with concrete verbs. Specify who moves, what moves, how it moves, and what the camera does. Avoid generic phrases like \"comes alive.\"\n\nDIALOGUE: Put user-provided spoken lines in double quotes. For screenplay-style or longer-form tasks, prefix each spoken line with a stable speaker tag outside the quotes, e.g. 
CHARACTER: \"We made it.\" Break long speech into short quoted phrases with acting beats between them (gestures, pauses, glances). If the user asks for speech but provides no exact words, describe the visible delivery, voice quality, and emotion without inventing quoted dialogue; ask only when exact wording is the point of the request. Never write placeholders such as \"while speaking\", \"dialogue begins\", \"explaining\", or \"final line lands\". Show emotion through visible behavior — not \"she is sad\", instead \"she looks down, pauses, and her voice cracks\". QUOTING RULE: ONLY use double quotes for spoken dialogue. Never quote on-screen text, overlay text, titles, captions, signs, or any visual text — describe them without quotes.\n\nSTORYBOARD TEXT: For storyboard references, structural headings, section numbers, slide titles, panel titles, and captions may become short audio-only narration/voiceover or key-message beats, but they are not subtitles, title cards, lower thirds, or visible overlays unless the user explicitly asks for visible text/on-screen text/title card/subtitle/lower third/signage/CTA. Do not concatenate storyboard labels into run-on voiceover; use separate brief phrases with pauses.\n\nAUDIO: Prompt sound intentionally — voice quality, volume, room tone, ambience, music, weather, footsteps. Include language or accent if relevant. Useful voice/volume anchors: whisper, mutter, shout, scream, energetic announcer, resonant voice with gravitas, distorted radio-style, robotic monotone, childlike curiosity.\n\nCAMERA: Cinematic terms — close-up, tracking shot, dolly in, handheld, slow arc, static frame. Describe movement relative to subject.\n\nFor specific characters (movies, TV): describe visual appearance — don't rely on names alone.\n\nFor complex/creative scenes (characters, dialogue, skits): capture the full creative intent. 
The system auto-expands into a detailed prompt.\n\nAVOID: Vague prompts, too many characters at once, conflicting lighting logic, readable text or logos, abstract emotions with no visible behavior, rigid numeric constraints (exact angles, counts, speeds).\n\nBATCH VARIATIONS: When numberOfVariations > 1, use Dynamic Prompt syntax. Lock in any camera/subject/style the user specified, vary the rest. Example: \"slow dolly in on a city street {at dawn with golden light|during a rainstorm|at night with neon reflections}\"."
|
|
141
|
+
},
|
|
142
|
+
"expandPrompt": {
|
|
143
|
+
"type": "boolean",
|
|
144
|
+
"description": "Seedance only. Whether to run the shared Seedance prompt shaper before dispatch. Defaults to true; set false only when the user explicitly asks to submit the compact prompt directly or not modify the prompt."
|
|
145
|
+
},
|
|
146
|
+
"skipPromptProcessing": {
|
|
147
|
+
"type": "boolean",
|
|
148
|
+
"description": "Bypass automatic prompt shaping/refinement and voice-identity prompt formatting so the prompt text is sent unchanged to the video model. Set true ONLY when the user explicitly says not to modify/rewrite/enhance/expand/change/improve the prompt, or to use/send it exactly, verbatim, or as-is, AND the provided prompt already satisfies the tool requirements. Continue to set non-prompt parameters such as model, duration, count, aspect ratio, and seed. For Seedance literal prompt requests, also set expandPrompt=false. Do not set for ordinary underspecified requests."
|
|
149
|
+
},
|
|
150
|
+
"duration": {
|
|
151
|
+
"type": "number",
|
|
152
|
+
"description": "Video duration in seconds. Default: 5. Range: 2-20. Use when the user explicitly requests a specific length.",
|
|
153
|
+
"minimum": 2,
|
|
154
|
+
"maximum": 20
|
|
155
|
+
},
|
|
156
|
+
"negativePrompt": {
|
|
157
|
+
"type": "string",
|
|
158
|
+
"description": "Non-Seedance only. Optional negative prompt for video models that expose a separate negative-prompt field. Do not set for seedance2 or seedance2-fast; rewrite user-provided Seedance avoid/ban/no-X requests as positive visual instructions in prompt."
|
|
159
|
+
},
|
|
160
|
+
"videoModel": {
|
|
161
|
+
"type": "string",
|
|
162
|
+
"enum": [
|
|
163
|
+
"ltx23",
|
|
164
|
+
"wan22",
|
|
165
|
+
"seedance2",
|
|
166
|
+
"seedance2-fast"
|
|
167
|
+
],
|
|
168
|
+
"description": "Video model. \"ltx23\" (default): LTX 2.3 with native audio; Fast/HQ use the distilled 8-step worker and Default Media Quality Pro uses the non-distilled dev worker. \"wan22\": Fast 4-step, simple motion, no audio. Default: \"ltx23\". For ordinary Seedance 2.0 video requests, use seedance2-fast by default: 720p unless Default Media Quality is Fast, which should use 480p. Use seedance2 when the user explicitly asks for 1080p / the non-standard full-quality version, or whenever a generated/uploaded video storyboard image is the Seedance reference unless the user explicitly asks for a draft or the Seedance fast model/version. Storyboard-reference Seedance requests default to High Quality: set targetResolution 720 even when Default Media Quality is Fast, unless the user explicitly asks for another named resolution such as 480p. Default Media Quality Pro alone is not an explicit 1080p request; keep default Seedance resolution at 720p unless the user asks for 1080p or another resolution. Seedance supports multimodal loose reference assets: images (up to 9), videos (up to 3), and audios (up to 3), with no more than 12 asset files total. Use @Image1/@Video1/@Audio1 style references in creative briefs when assigning roles. Assign every useful reference asset a role and prefer positive preservation constraints. If an uploaded video is the source clip to transform, upscale, enhance, restyle, or remaster, use video_to_video with controlMode=\"seedance-v2v\" instead of generate_video referenceVideoIndices."
|
|
169
|
+
},
|
|
170
|
+
"generateAudio": {
|
|
171
|
+
"type": "boolean",
|
|
172
|
+
"description": "Seedance only. Whether Seedance should generate a native audio track. Omit by default; set false only when the user explicitly asks for silent output or no audio."
|
|
173
|
+
},
|
|
174
|
+
"referenceImageIndices": {
|
|
175
|
+
"type": "array",
|
|
176
|
+
"items": {
|
|
177
|
+
"type": "number"
|
|
178
|
+
},
|
|
179
|
+
"description": "Seedance only. Image references for @Image tags. Use negative indices for uploaded images (-1 first upload, -2 second upload) and non-negative indices for generated image results. Omit by default: uploaded images are auto-forwarded as @Image references. Anchor frame intent in the prompt with @Image tags: \"Use @Image1 as the opening shot reference. Begin the video with a composition, subject placement, lighting, mood, and camera framing that closely match @Image1.\" (or @Image2 as the final shot reference). For seamless-loop or \"first frame and last frame identical\" requests with a single uploaded image, anchor it explicitly as both: \"Use @Image1 as both the first frame and last frame so the video loops cleanly back to the opening composition.\" Do not use animate_photo sourceImageIndex/frameRole/endImageIndex for Seedance."
|
|
180
|
+
},
|
|
181
|
+
"referenceVideoIndices": {
|
|
182
|
+
"type": "array",
|
|
183
|
+
"items": {
|
|
184
|
+
"type": "number"
|
|
185
|
+
},
|
|
186
|
+
"description": "Seedance only. Optional loose video references. Use negative indices for uploaded videos (-1 first uploaded video, -2 second uploaded video) and non-negative indices for generated video results. Omit by default: uploaded videos are auto-forwarded as @Video references. Set to choose a subset or include previously generated video URLs. Do not use this for uploaded source-video transforms, upscales, enhancements, restyles, or remasters; use video_to_video with controlMode=\"seedance-v2v\" instead."
|
|
187
|
+
},
|
|
188
|
+
"referenceAudioIndices": {
|
|
189
|
+
"type": "array",
|
|
190
|
+
"items": {
|
|
191
|
+
"type": "number"
|
|
192
|
+
},
|
|
193
|
+
"description": "Seedance only. Optional loose audio references. Use negative indices for uploaded audio files (-1 first uploaded audio, -2 second uploaded audio) and non-negative indices for generated audio results. Omit by default: uploaded audio is auto-forwarded as @Audio references when the Seedance request also has an image or video reference. Use this only for loose background, mood, timing, or style references under an image/video-anchored Seedance shot. If the uploaded audio is the primary sync target, lip-sync target, or requested as sound-to-video/audio-sync, use sound_to_video with videoModel=\"seedance2\" instead. Audio-only Seedance requests are unsupported; use sound_to_video for uploaded-audio-only workflows."
|
|
194
|
+
},
|
|
195
|
+
"width": {
|
|
196
|
+
"type": "number",
|
|
197
|
+
"description": "Video width in pixels. LTX 2.3: 640-3840. WAN: 480-1536. Default resolution depends on model and quality tier: LTX Fast about 720p and High/Pro about 1080p; WAN Fast uses 480p short side and High/Pro uses 720p short side. Set width only when the user specifies an exact width or orientation-qualified exact pixels. A bare named resolution like \"720p resolution\" is a short-side target, not an instruction to make landscape 1280x720. If the user gives only one exact dimension, set only that dimension and preserve/infer the sensible aspect ratio. User-requested exact dimensions override the default media quality. Mappings when orientation is explicit: 480p landscape=854x480, 480p portrait=480x854, 720p landscape=1280x720, 720p portrait=720x1280, 1080p landscape=1920x1080, 1080p portrait=1080x1920, 4K landscape=3840x2160. Non-step values are accepted when in bounds; LTX snaps to the nearest 64px step and WAN snaps to the nearest 16px step internally, so do not ask the user to adjust by a few pixels."
|
|
198
|
+
},
|
|
199
|
+
"height": {
|
|
200
|
+
"type": "number",
|
|
201
|
+
"description": "Video height in pixels. LTX 2.3: 640-3840. WAN: 480-1536. Set height only when the user specifies an exact height or orientation-qualified exact pixels. A bare named resolution like \"720p resolution\" is a short-side target; do not convert it to landscape dimensions unless the user says landscape/horizontal/widescreen. If the user gives only one exact dimension, set only that dimension and preserve/infer the sensible aspect ratio. User-requested exact dimensions override Default Media Quality, including Pro. Non-step values are accepted when in bounds; LTX snaps to the nearest 64px step and WAN snaps to the nearest 16px step internally, so do not ask the user to adjust by a few pixels."
|
|
202
|
+
},
|
|
203
|
+
"targetResolution": {
|
|
204
|
+
"type": "number",
|
|
205
|
+
"description": "Short-side video resolution target in pixels. Use when the user asks for a bare named resolution such as \"480p\", \"720p\", or \"1080p\" without exact pixels or an output orientation. Also set for default Seedance 2.0 requests: 480 when Default Media Quality is Fast, otherwise 720, including Pro unless the user explicitly asks for 1080p or another resolution. For generated/uploaded storyboard images used as Seedance references, default to 720 even when Default Media Quality is Fast; only use 480 when the user explicitly asks for 480p. This preserves/inherits the current video shape instead of forcing landscape. Do NOT set width, height, or exact-pixel aspectRatio for bare named resolution requests. If the user says \"720p portrait\" or \"720p landscape\", use exact width/height/aspectRatio instead."
|
|
206
|
+
},
|
|
207
|
+
"numberOfVariations": {
|
|
208
|
+
"type": "number",
|
|
209
|
+
"description": "Number of variations (1-16). Use 1 unless user explicitly requests multiple separate video outputs. For Seedance, default to 1 because each variation is an expensive separate render.",
|
|
210
|
+
"minimum": 1,
|
|
211
|
+
"maximum": 16
|
|
212
|
+
},
|
|
213
|
+
"aspectRatio": {
|
|
214
|
+
"type": "string",
|
|
215
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". Never set for generic requests like \"make a video\"."
|
|
216
|
+
},
|
|
217
|
+
"voicePersonaName": {
|
|
218
|
+
"type": "string",
|
|
219
|
+
"description": "ONLY when the user explicitly requests a registered/reference persona voice clip. Name of the persona whose voice clip to use as referenceAudioIdentity. Set this when the narrator/speaker is a different persona than the one described in the video (e.g. \"David\" narrates a scene featuring Aleyna), or to explicitly select a requested voice when multiple personas with voice clips are resolved. Do NOT set this for ordinary character dialogue, inferred voices, or personas without a voice clip — LTX 2.3 generates voice natively from the text prompt instead. Requires ltx23."
|
|
220
|
+
}
|
|
221
|
+
},
|
|
222
|
+
"required": [
|
|
223
|
+
"prompt"
|
|
224
|
+
]
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
},
|
|
228
|
+
{
|
|
229
|
+
"type": "function",
|
|
230
|
+
"function": {
|
|
231
|
+
"name": "generate_music",
|
|
232
|
+
"description": "Generate music from a text description. Creates original songs with optional lyrics, BPM, key signature, and duration control. Use when the user wants to create music, a song, a beat, a melody, background music, or any audio content.",
|
|
233
|
+
"parameters": {
|
|
234
|
+
"type": "object",
|
|
235
|
+
"properties": {
|
|
236
|
+
"prompt": {
|
|
237
|
+
"type": "string",
|
|
238
|
+
"description": "Genre, mood, and style description for the music. Be specific about musical characteristics.\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing.\n\nExamples:\n- \"upbeat electronic dance music with driving bass and synth arpeggios\"\n- \"mellow jazz ballad with soft piano, brushed drums, and walking bass\"\n- \"epic orchestral soundtrack with soaring strings and powerful brass\"\n- \"lo-fi hip hop beat with vinyl crackle, muted keys, and chill vibes\"\n- \"acoustic folk song with fingerpicked guitar and warm harmonies\"\n\nInclude:\n- Genre (rock, jazz, electronic, classical, hip-hop, etc.)\n- Mood (happy, melancholic, energetic, relaxing, epic, etc.)\n- Instruments (piano, guitar, drums, synth, strings, etc.)\n- Style descriptors (driving, mellow, atmospheric, punchy, etc.)\n\nBATCH VARIATIONS: When numberOfVariations > 1, use Dynamic Prompt syntax to vary ONE dimension across separate tracks. Lock in any genre/mood/instruments the user specified, vary the rest. Example: \"{lo-fi hip hop beat with muted keys|jazz piano trio with brushed drums|ambient electronic with soft pads} with warm reverb and vinyl texture\"."
|
|
239
|
+
},
|
|
240
|
+
"duration": {
|
|
241
|
+
"type": "number",
|
|
242
|
+
"description": "Duration in seconds. Default: 30. Range: 10-600 (10 seconds to 10 minutes). Short clips: 10-30s. Standard songs: 120-300s.",
|
|
243
|
+
"minimum": 10,
|
|
244
|
+
"maximum": 600
|
|
245
|
+
},
|
|
246
|
+
"bpm": {
|
|
247
|
+
"type": "number",
|
|
248
|
+
"description": "Beats per minute / tempo. Default: 120. Range: 30-300. Slow ballad: 60-80. Mid-tempo: 90-120. Upbeat: 120-140. Fast dance: 140-180. Very fast: 180+.",
|
|
249
|
+
"minimum": 30,
|
|
250
|
+
"maximum": 300
|
|
251
|
+
},
|
|
252
|
+
"keyscale": {
|
|
253
|
+
"type": "string",
|
|
254
|
+
"description": "Musical key and scale. E.g., \"C major\", \"A minor\", \"F# minor\", \"Bb major\". Default: \"C major\". Only set when the user specifies a key or when a particular mood calls for it (minor keys for sad/dark, major for happy/bright)."
|
|
255
|
+
},
|
|
256
|
+
"lyrics": {
|
|
257
|
+
"type": "string",
|
|
258
|
+
"description": "Song lyrics. Optional — omit for instrumental music. Format: write lyrics naturally with line breaks. The model will attempt to sing these lyrics with the generated music. Works best with clear, rhythmic phrasing that matches the BPM."
|
|
259
|
+
},
|
|
260
|
+
"model": {
|
|
261
|
+
"type": "string",
|
|
262
|
+
"enum": [
|
|
263
|
+
"turbo",
|
|
264
|
+
"sft"
|
|
265
|
+
],
|
|
266
|
+
"description": "ACE-Step model variant. \"turbo\" (default): Higher quality audio generation with 4-16 steps and half the cost. Always use turbo unless the user explicitly requests the SFT model. \"sft\": Experimental model with lower audio quality but very strong lyric handling. 10-200 steps, full cost. Only use when the user specifically asks for SFT. Default: \"turbo\"."
|
|
267
|
+
},
|
|
268
|
+
"timesig": {
|
|
269
|
+
"type": "number",
|
|
270
|
+
"enum": [
|
|
271
|
+
2,
|
|
272
|
+
3,
|
|
273
|
+
4,
|
|
274
|
+
6
|
|
275
|
+
],
|
|
276
|
+
"description": "Time signature (beats per measure). 4 = 4/4 time (default, most common). 3 = 3/4 time (waltz). 2 = 2/4 time (march). 6 = 6/8 time (compound). Default: 4."
|
|
277
|
+
},
|
|
278
|
+
"numberOfVariations": {
|
|
279
|
+
"type": "number",
|
|
280
|
+
"description": "Number of variations (1-16). Use 1 unless user requests multiple. Default: 1.",
|
|
281
|
+
"minimum": 1,
|
|
282
|
+
"maximum": 16
|
|
283
|
+
}
|
|
284
|
+
},
|
|
285
|
+
"required": [
|
|
286
|
+
"prompt"
|
|
287
|
+
]
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"type": "function",
|
|
293
|
+
"function": {
|
|
294
|
+
"name": "edit_image",
|
|
295
|
+
"description": "Generate images guided by reference photos. Supports GPT Image 2 up to 16 images, Flux.2 up to 6 images, and Qwen up to 3 images. Best for style-guided generation, combining elements from multiple images, ANY persona image creation, and any uploaded brand asset reuse — logos, brand marks, mascots, product shots, photos, screenshots, sketches, or character designs the user expects to appear in or guide the result. ALWAYS use this (never generate_image) when persona photos OR uploaded image assets meant for reuse are in context — even if a specific model is requested. Exception: explicit Z-image/Z-image Turbo uploaded-image enhancement uses generate_image with sourceImageIndex and starting_image_strength because edit_image does not expose Z-image models. If a previous edit_image attempt did not preserve the uploaded asset well, stay on edit_image and tighten the prompt or switch model — do not fall back to generate_image, which has no access to the upload at all. For direct edits (remove objects, enhance), use restore_photo or refine_result unless the user explicitly requested Z-image.",
|
|
296
|
+
"parameters": {
|
|
297
|
+
"type": "object",
|
|
298
|
+
"properties": {
|
|
299
|
+
"prompt": {
|
|
300
|
+
"type": "string",
|
|
301
|
+
"description": "Edit instruction describing what to generate using the reference images as guidance. 50-200 words recommended.\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing.\n\nPROMPT CONSTRUCTION ORDER — build the prompt in this sequence:\n1. IDENTITY LOCK — state which picture owns the person's identity (GOLDEN RULE: never leave identity ambiguous when editing a person)\n2. REQUESTED EDIT — describe only what CHANGES (the delta), not the whole image\n3. REFERENCE ROLE MAPPING — assign each picture ONE primary role: base_identity (face/person), pose_reference, outfit_reference, style_reference, background_reference, or color_reference\n4. POSE / COMPOSITION — pose, framing, camera angle (omit if unchanged)\n5. STYLE — artistic style, genre, era (omit if unchanged)\n6. LIGHTING / REALISM — \"maintain realistic anatomy, perspective, and lighting integration\"\n7. PRESERVE clause — always end with \"preserve all unmentioned details\"\n\nIDENTITY LOCK (required when a person is in any reference image):\n\"Preserve the exact facial likeness from picture N — face structure, eye shape, nose shape, mouth shape, jawline, skin tone, hairline, apparent age, and overall recognizability.\"\nNever let a style, pose, or clothing reference silently override the face. If multiple images are provided, explicitly state \"identity comes only from picture N — do not borrow identity from other pictures.\"\n\nMINIMAL-CHANGE PRINCIPLE: The base image already contains the subject, composition, camera angle, expression, lighting, and background. Describe only the delta. Use positive constraints (\"preserve exact facial likeness\") not negative ones (\"don't change the face\").\n\nSINGLE-IMAGE PATTERN:\n\"Preserve the exact facial likeness and recognizability of the person from picture 1. 
[Describe only the requested change]. Keep the same pose, framing, camera angle, and expression unless the user specifically requests changes to these. Preserve all unmentioned details.\"\n\nMULTI-IMAGE PATTERN:\n\"Use the person from picture 1 as the final subject and preserve their exact facial likeness. [Requested edit]. Identity comes only from picture 1. Pose from picture 2. Outfit from picture 3. Do not borrow identity from pictures 2 or 3. Maintain realistic anatomy, perspective, and lighting integration. Preserve all unmentioned details.\"\n\nCREATIVE TRANSFORMATIONS — be vivid and reference-specific, name the artist, franchise, or era, but always anchor identity first:\n - \"Preserve the exact facial likeness from picture 1. Transform them into a Renaissance oil painting in the style of Vermeer — rich warm tones, dramatic chiaroscuro lighting, ornate period clothing. Maintain realistic anatomy. Preserve all unmentioned details.\"\n - \"Preserve the exact facial likeness from picture 1. Reimagine them as a Marvel superhero — cinematic dramatic lighting, heroic pose, detailed costume with cape, glowing energy effects. Preserve all unmentioned details.\"\n - \"Preserve the exact facial likeness from picture 1. Transform them into a Studio Ghibli anime character — soft watercolor backgrounds, gentle Ghibli-style rendering, whimsical atmosphere. Preserve all unmentioned details.\"\n - \"Preserve the exact facial likeness from picture 1. Place them into a Star Wars scene — Jedi robes, lightsaber glow, dramatic sci-fi backdrop. Preserve all unmentioned details.\"\n - \"Preserve the exact facial likeness from picture 1. Turn them into a GTA loading screen character — bold outlines, saturated colors, attitude-filled pose, urban backdrop. 
Preserve all unmentioned details.\"\n\nFAILURE MODES TO AVOID:\n- Face drift: identity source not specified, or style/pose reference overrides the face\n- Over-editing: for simple edits, prompt rewrites the entire image instead of describing the delta (creative transformations may intentionally change more)\n- Reference confusion: multiple images provided without explicit role mapping\n\nCHARACTER / MASCOT SHEETS: When the user asks for a character sheet, mascot sheet, model sheet, turnaround, expression sheet, or reusable character reference board using uploaded references, create ONE comprehensive professional reference-board image, not separate variations. Map reference roles clearly first (for example: picture 1 = character identity/style reference, picture 2 = logo/brand asset) and keep the character identity consistent across every panel. Include a large hero pose, front / 3/4 / side / back turnaround views, an expression row, action/personality poses, accessories or props, color palette swatches, and compact notes such as personality, fun facts, or brand usage when appropriate. Preserve exact user-provided brand names, slogans, logo text, and requested copy verbatim; incidental tiny notes may be generated by the image model if the user did not provide exact wording. Use clean readable typography.\n\nBATCH VARIATIONS: When numberOfVariations > 1, the prompt must describe ONE subject in ONE scene — never mention counts, \"versions\", \"different\", or \"multiple\" in the prompt text. NEVER describe multiple copies or duplicates of the subject in a single image (no grids, collages, or side-by-side). Use Dynamic Prompt syntax to vary ONE dimension across separate images. For personas: vary scene, activity, expression, or environment — never vary identity. 
Example: user asks \"4 versions at the beach\" → numberOfVariations=4, prompt=\"[persona] at the beach {building a sandcastle|surfing a wave|reading under a palm tree|flying a kite}\" — each output is ONE person doing ONE activity. For direct edits: vary the approach, e.g., numberOfVariations=3, prompt=\"make the sky {a vibrant sunset|stormy and dramatic|clear blue}\". Preserve any requested orientation, aspect ratio, or exact pixel dimensions across every variation.\n\nSELECTION-GATED IMAGE STAGES: If the user asks for multiple reference-guided image options/takes/versions and says they will pick one before a later dance, animation, or video, this edit_image call is still the first step. Generate the complete image batch now with sourceImageIndex set to the relevant reference, the exact requested count, Dynamic Prompt options for each output, and the final video/image aspect ratio. Do not ask the user to choose before the images exist, and do not call video tools until after the user selects an image.\n\nLINKED VARIANTS: If multiple details must stay paired per output — visual style, identity cues, outfit, label text, symbols, setting, character, prop, location, or before/after keyframe details — use ONE top-level Dynamic Prompt branch with one complete prompt per output. Do NOT use separate Dynamic Prompt groups for details that must stay together; unpaired groups can mix attributes. If the user asks for per-variant facial, identity, or appearance changes, repeat that guidance inside EVERY option while also preserving recognizability. When the user names a subject or character, write that name or stable role inside every Dynamic Prompt option; a shared prefix outside the branch is not enough because each option must stand alone as a complete identity contract.\n\nEach option must be a fully concrete description — name the actual garment or styling, the actual setting, the actual accessories, and the literal text or symbol shown on screen when requested. 
Never use meta-placeholder phrasing such as \"style-specific outfit\", \"variant-specific background\", \"include the requested symbol\", \"include a humorous alternate name\", or \"bake the name and symbol into the image\" — those describe the task instead of the image.\n\nORIGINAL + VARIANT BATCHES: When one option is a remade/preserved original and the other options are themed variants, the original option still needs a concrete visual contract. Say to preserve the original clothing/wardrobe/outfit and original background/setting, then name any requested added text, label, flag, logo, symbol, or prop for that original option. Do not leave the original option as only \"unmodified original person\"; it must be as fully specified as every themed option.\n\nNEW SETTING PER OPTION: When the variant theme implies a new place, culture, era, or context, every option must name its own setting (location, props, lighting). Do NOT carry the source background forward, do NOT write \"in the same pose and placement as the original photo\" without also naming the new background, and do NOT rely on \"preserve all unmentioned details\" to handle the setting — the new setting IS a mentioned detail.\n\nRECOGNIZABILITY OVER FEATURE LOCK: For ethnic / age / character / art-style transformations, do NOT paste the strict IDENTITY LOCK feature list (\"face structure, eye shape, nose shape, mouth shape, jawline, skin tone, hairline\") inside each option — that list contradicts the requested face change and the source face will pass through unchanged. 
Anchor recognizability per option through apparent age, signature hair silhouette, build, posture, and expression, and explicitly allow skin tone, facial features, and proportions to shift toward the target.\n\nCorrect shape (each option self-contained, concrete, with a fresh setting and a recognizability anchor instead of a strict feature lock):\n\"{The subject wearing [specific garment, color, cut, and material], standing in [specific NEW setting with props and lighting — never the source background], bold text at the bottom reads [literal requested text], [specific requested visual symbol] appears as a sign or prop, [requested per-variant facial or appearance shift, e.g. \"skin tone, eye shape, and bone structure shift toward <target> features\"], recognizable through apparent age, signature hair silhouette, build, posture, and expression|The subject wearing [second specific garment, color, cut, and material], standing in [second specific NEW setting with props and lighting], bold text at the bottom reads [second literal requested text], [second requested visual symbol] appears as a sign or prop, [second requested facial or appearance shift], recognizable through apparent age, signature hair silhouette, build, posture, and expression|...}\"\n\nWrong shape (placeholder labels masquerading as prompts):\n\"{First variant with variant-specific facial features, placeholder wardrobe, alternate name, and requested symbol baked in|Second variant with different variant-specific facial features, placeholder wardrobe, alternate name, and requested symbol baked in|...}\"\n\nAlso wrong (strict feature lock + no new setting — the source face and source background pass through unchanged):\n\"{Preserve the exact facial likeness — face structure, eye shape, nose shape, mouth shape, jawline, skin tone, hairline. Reimagine as <variant>: [garment description], standing in the exact same pose and placement as the original photo. 
Preserve all unmentioned details.|Preserve the exact facial likeness — [same strict lock]. Reimagine as <other variant>: [other garment], standing in the exact same pose and placement as the original photo. Preserve all unmentioned details.|...}\"\n\nSCREENPLAY / STORYBOARD BATCHES: For multi-scene story, commercial, or longer-form video keyframes, use one Dynamic Prompt branch with one full scene prompt per option. Recurring characters must keep stable names and repeated visual anchors in every scene option where they appear: face/identity source if available, age range, build, hairstyle, outfit silhouette, color palette, signature prop/accessory, posture, and role. Do not let style, scene changes, or pose references alter identity. Include screenplay-style speaker tags when dialogue matters, e.g. CHARACTER: \"We made it.\"\n\nCOMPOSITE GPT IMAGE 2 STORYBOARD SHEETS: When numberOfVariations=1 and the user asks for one composite video storyboard/keyframe sheet using uploaded or generated references, the prompt must be a compiled storyboard prompt, not a concept summary. Include a SCENES: section with exactly the requested number of concrete entries named SCENE_01, SCENE_02, etc. Every scene entry must include Visual/Action, Camera/Motion, Dialogue/VO (or [no dialogue]), Audio/SFX, and any visible text or reference usage for that scene. Do not provide only the source brief or generic layout instructions; malformed compiled storyboard prompts are blocked by quality audit."
|
|
302
|
+
},
|
|
303
|
+
"model": {
|
|
304
|
+
"type": "string",
|
|
305
|
+
"enum": [
|
|
306
|
+
"gpt-image-2",
|
|
307
|
+
"qwen-lightning",
|
|
308
|
+
"qwen",
|
|
309
|
+
"flux2"
|
|
310
|
+
],
|
|
311
|
+
"description": "DO NOT SET THIS PARAMETER unless the user names a specific edit model, asks for a very complex reference-guided image render, or asks for a video storyboard/storyboard sheet/contact sheet/panel layout image using references. The app auto-selects based on quality settings. Set \"gpt-image-2\" when the user asks for a ChatGPT, OpenAI, GPT, GPT-2, GPT Image, or gpt-image-2 reference-guided image/edit/model, or by default for complex single-image renders that need dense labels, crisp typography, multi-panel composition, timing notes, foley notes, professional storyboard-sheet layout, or a comprehensive character/mascot/model sheet with turnarounds, expressions, accessories, palette swatches, and brand notes. Z-image and Z-image Turbo are not edit_image models; route those explicit uploaded-image enhancement requests to generate_image with sourceImageIndex and starting_image_strength. If the user names another edit/image model, honor that requested model instead. GPT Image 2 always processes input images at high fidelity; do not set input_fidelity."
|
|
312
|
+
},
|
|
313
|
+
"sourceImageIndex": {
|
|
314
|
+
"type": "number",
|
|
315
|
+
"description": "Index of the primary image to use as the main reference. For follow-up edits when generated image results already exist, use the 0-based generated image result index; for example, editing the latest generated storyboard/image should use that generated result index so the model modifies the existing image instead of redrawing from uploads. When no generated image results exist, use sourceImageIndex=-1 to use the uploaded image references. The primary image and any additional uploaded images are passed as context images to guide generation."
|
|
316
|
+
},
|
|
317
|
+
"numberOfVariations": {
|
|
318
|
+
"type": "number",
|
|
319
|
+
"description": "Number of variations (1-16). Pass the user's EXACT requested count in ONE call — never split into multiple calls and never call edit_image again to \"add more\". \"4 variations\" → numberOfVariations=4 in a single call. Use the exact requested count for reference-guided images that will feed a later video after the user picks one. For screenplay/storyboard batches, the prompt must contain one Dynamic Prompt branch with one full scene prompt per scene; never set numberOfVariations=N with only scene 1's prompt. Use 1 unless the user explicitly asks for multiple. Default: 1.",
|
|
320
|
+
"minimum": 1,
|
|
321
|
+
"maximum": 16
|
|
322
|
+
},
|
|
323
|
+
"width": {
|
|
324
|
+
"type": "number",
|
|
325
|
+
"description": "Output image width in pixels. Defaults to the context image width. Supported range is 256-2560 for Qwen/Flux.2 edit models. For gpt-image-2, dimensions are flexible up to 3840px on either edge with max 3:1 aspect ratio and a total pixel budget from 655,360 to 8,294,400; the renderer snaps to the nearest valid multiple-of-16 size. Set when the user specifies a width, exact pixel dimensions, or a named resolution (e.g., \"1280 wide\", \"1280x720\", \"720p\", \"3840x2160\"). If the user gives only one dimension, set only that dimension and preserve/infer the sensible aspect ratio. User-requested dimensions override the default media quality, including Pro. Non-multiple-of-16 values are accepted when in bounds; the renderer snaps to the nearest supported size internally, so do not ask the user to adjust by a few pixels."
|
|
326
|
+
},
|
|
327
|
+
"height": {
|
|
328
|
+
"type": "number",
|
|
329
|
+
"description": "Output image height in pixels. Defaults to the context image height. Supported range is 256-2560 for Qwen/Flux.2 edit models. For gpt-image-2, dimensions are flexible up to 3840px on either edge with max 3:1 aspect ratio and a total pixel budget from 655,360 to 8,294,400; the renderer snaps to the nearest valid multiple-of-16 size. Set when the user specifies a height, exact pixel dimensions, or a named resolution (e.g., \"720 high\", \"1280x720\", \"720p\", \"2160x3840\"). If the user gives only one dimension, set only that dimension and preserve/infer the sensible aspect ratio. User-requested dimensions override the default media quality, including Pro. Non-multiple-of-16 values are accepted when in bounds; the renderer snaps to the nearest supported size internally, so do not ask the user to adjust by a few pixels."
|
|
330
|
+
},
|
|
331
|
+
"aspectRatio": {
|
|
332
|
+
"type": "string",
|
|
333
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". 
Never set for generic requests like \"make a video\".\n\nSet this whenever the user specifies an image or downstream video orientation/aspect ratio such as 9:16, 16:9, portrait, vertical, landscape, widescreen, TikTok/Reels/Shorts, or exact pixels. This includes selection-gated reference-guided image batches that will feed a later video or dance after the user picks one. For GPT Image 2 exact size requests, preserve exact pixel intent when possible and prefer popular GPT sizes such as 1536x1024, 1024x1536, 2048x1152, 3840x2160, and 2160x3840. GPT Image 2 does not support transparent-background output; do not promise a transparent result for this model."
|
|
334
|
+
},
|
|
335
|
+
"gptImageQuality": {
|
|
336
|
+
"type": "string",
|
|
337
|
+
"enum": [
|
|
338
|
+
"low",
|
|
339
|
+
"medium",
|
|
340
|
+
"high",
|
|
341
|
+
"auto"
|
|
342
|
+
],
|
|
343
|
+
"description": "Optional GPT Image 2 rendering quality. Only set with model=\"gpt-image-2\" when the user explicitly asks for low/fast, medium/balanced, high/final, or auto quality. Otherwise omit it and let the host app media quality setting map Fast to low, HQ to medium, and Pro to high."
|
|
344
|
+
},
|
|
345
|
+
"outputFormat": {
|
|
346
|
+
"type": "string",
|
|
347
|
+
"enum": [
|
|
348
|
+
"png",
|
|
349
|
+
"jpg",
|
|
350
|
+
"jpeg",
|
|
351
|
+
"webp"
|
|
352
|
+
],
|
|
353
|
+
"description": "Optional output file format for generated images. Set only when the user explicitly requests PNG, JPG/JPEG, or WebP. Hosts should normalize \"jpeg\" to the Sogni project format \"jpg\"."
|
|
354
|
+
},
|
|
355
|
+
"personaName": {
|
|
356
|
+
"type": "string",
|
|
357
|
+
"description": "RARE — only set this when the user EXPLICITLY asks for solo images of one specific person (\"a portrait of just [name]\", \"4 solos of [name] alone\"). When set, the handler filters context to ONLY that persona's reference photo, so any other personas in your prompt will be missing their reference. DEFAULT for multi-persona requests is to OMIT this and put both faces in one combined call. Never set this for \"make us as X\", \"the two of us\", \"my wife and I\", or any phrasing that puts both personas in the same scene — that's a single combined call with no personaName."
|
|
358
|
+
}
|
|
359
|
+
},
|
|
360
|
+
"required": [
|
|
361
|
+
"prompt"
|
|
362
|
+
]
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
},
|
|
366
|
+
{
|
|
367
|
+
"type": "function",
|
|
368
|
+
"function": {
|
|
369
|
+
"name": "apply_style",
|
|
370
|
+
"description": "Apply an artistic style, era-specific look, or creative transformation to a photo. Use when the user wants to change the visual style (e.g., \"make it look like the 70s\", \"oil painting style\", \"vintage polaroid look\"). Can handle any creative transformation. One style per call. IMPORTANT: When previous results exist, this tool automatically uses the LATEST result image unless you set sourceImageIndex: pass a specific 0-based result index when the user names a particular image, or -1 when the user explicitly says \"original\". For ordinary follow-up requests, call it without sourceImageIndex.",
|
|
371
|
+
"parameters": {
|
|
372
|
+
"type": "object",
|
|
373
|
+
"properties": {
|
|
374
|
+
"prompt": {
|
|
375
|
+
"type": "string",
|
|
376
|
+
"description": "Style prompt for Qwen Image Edit 2511 (50-200 words, natural language sentences).\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing.\n\nPROMPT ORDER: [IDENTITY LOCK if people] → [STYLE TRANSFER INSTRUCTION] → [PRESERVE UNMENTIONED DETAILS]\n\nRules:\n- Use POSITIVE phrasing only. The model ignores negatives (\"preserve exact facial likeness\" NOT \"don't change the face\").\n- Transfer the visual STYLE ONLY, not the identity. Borrow palette, texture, contrast behavior, and stylistic treatment — NOT face structure.\n- Reference known art styles, artists, and franchises BY NAME to anchor the style — be specific, never generic.\n- Describe specific visual characteristics: brushstrokes, color palette, texture, composition approach, mood.\n- For era looks: describe the photographic qualities of that era (e.g., \"warm faded Kodachrome tones with soft vignette, typical of 1970s amateur photography\").\n- CRITICAL for photos with people: FRONT-LOAD identity preservation BEFORE the style instruction. Start with \"Preserve exact facial likeness, face structure, eye shape, nose shape, mouth shape, jawline, skin tone, hairline, apparent age, and overall recognizability.\" Then describe the style. End with \"Keep the subject recognizable as the same person. 
Maintain exact positioning, poses, and composition.\"\n- Go bold with pop culture and iconic styles: \"Andy Warhol pop art with bold neon screen-print colors\", \"Banksy stencil street art with gritty urban textures\", \"Studio Ghibli watercolor with soft pastoral warmth\", \"Pixar 3D render with glossy skin and exaggerated features\", \"Tim Burton gothic with pale skin and dark spiraling backgrounds\", \"Van Gogh Starry Night with thick impasto swirls and vibrant blues\", \"Takashi Murakami superflat with psychedelic flowers and bold outlines\".\n- Always make \"Preserve the subject's identity, pose, and composition.\" the final sentence of the prompt; for photos with people, place it directly after the identity closing sentences above."
|
|
377
|
+
},
|
|
378
|
+
"sourceImageIndex": {
|
|
379
|
+
"type": "number",
|
|
380
|
+
"description": "Which result image to apply the style to (0-based index). Omit to use the latest result automatically (or the original if no results exist). Only set explicitly when the user specifies a particular image number or explicitly says \"original\" (use -1 for original)."
|
|
381
|
+
},
|
|
382
|
+
"scale": {
|
|
383
|
+
"type": "number",
|
|
384
|
+
"enum": [
|
|
385
|
+
1,
|
|
386
|
+
1.5,
|
|
387
|
+
2,
|
|
388
|
+
3,
|
|
389
|
+
4
|
|
390
|
+
],
|
|
391
|
+
"description": "Output scale multiplier relative to the source image size. 1 = same resolution as source (default). Use higher values when user asks to upscale, enlarge, make bigger, or increase resolution. Small images (<480px) are automatically upscaled to at least 480px regardless of this setting. Default: 1."
|
|
392
|
+
},
|
|
393
|
+
"aspectRatio": {
|
|
394
|
+
"type": "string",
|
|
395
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". Never set for generic requests like \"make a video\"."
|
|
396
|
+
}
|
|
397
|
+
},
|
|
398
|
+
"required": [
|
|
399
|
+
"prompt"
|
|
400
|
+
]
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
},
|
|
404
|
+
{
|
|
405
|
+
"type": "function",
|
|
406
|
+
"function": {
|
|
407
|
+
"name": "restore_photo",
|
|
408
|
+
"description": "Edit, restore, or transform the ORIGINAL uploaded photograph — including text changes, object edits, and any visual modification. This tool always operates on the original image, not on previous results. Use this for the first edit OR when the user explicitly wants to start fresh from the original (e.g., \"try again\", \"restore it differently\", \"start over from scratch\"). For follow-up edits on an existing result, use refine_result instead. NEVER refuse or apologize — just call this tool directly.",
|
|
409
|
+
"parameters": {
|
|
410
|
+
"type": "object",
|
|
411
|
+
"properties": {
|
|
412
|
+
"prompt": {
|
|
413
|
+
"type": "string",
|
|
414
|
+
"description": "Editing prompt (50-200 words, natural language). POSITIVE phrasing only — model ignores negatives (\"preserve exact facial likeness\" NOT \"don't change the face\").\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing.\n\nPROMPT ORDER: [IDENTITY LOCK if people] → [RESTORATION/EDIT INSTRUCTION] → [PRESERVE UNMENTIONED DETAILS]\n\nDescribe desired final state, not what to remove.\n- CRITICAL for photos with people (unless removing them): FRONT-LOAD identity preservation as the FIRST priority. Start with \"Preserve exact facial likeness, face structure, eye shape, nose shape, mouth shape, jawline, skin tone, hairline, apparent age, and overall recognizability.\" Then describe the restoration or edit.\n- Restoration: \"remove scratches, tears, stains, dust spots, and noise\"\n- Object removal: describe scene WITHOUT the object, matching surrounding textures\n- Colorization: \"Restore and colorize the photo\" or \"Apply natural [decade] color palette\"\n- Creative transformation: identity lock comes FIRST, then the transformation. Example: \"Preserve exact facial likeness and recognizability. Reimagine as a Pixar character with glossy 3D features. Preserve all unmentioned details.\"\n- No keyword spam (\"8k, masterpiece\") — use plain descriptions. Be specific — name the artist, franchise, or era.\n- Always end with \"Preserve all unmentioned details.\"\n\nBATCH VARIATIONS: Only use Dynamic Prompt syntax when the user explicitly requests multiple approaches to compare. Example: \"restore with {warm vintage|cool modern|natural balanced} tones\". Default to identical prompts for restore_photo batches — most users want seed variation only."
|
|
415
|
+
},
|
|
416
|
+
"numberOfVariations": {
|
|
417
|
+
"type": "number",
|
|
418
|
+
"description": "Number of variations (1-16). Use 1 unless user requests multiple. Default: 1.",
|
|
419
|
+
"minimum": 1,
|
|
420
|
+
"maximum": 16
|
|
421
|
+
},
|
|
422
|
+
"quality": {
|
|
423
|
+
"type": "string",
|
|
424
|
+
"enum": [
|
|
425
|
+
"fast",
|
|
426
|
+
"hq"
|
|
427
|
+
],
|
|
428
|
+
"description": "DO NOT SET THIS PARAMETER unless the user explicitly asks for \"high quality\" or \"fast\". The app auto-selects based on quality settings."
|
|
429
|
+
},
|
|
430
|
+
"scale": {
|
|
431
|
+
"type": "number",
|
|
432
|
+
"enum": [
|
|
433
|
+
1,
|
|
434
|
+
1.5,
|
|
435
|
+
2,
|
|
436
|
+
3,
|
|
437
|
+
4
|
|
438
|
+
],
|
|
439
|
+
"description": "Output scale multiplier relative to the source image size. 1 = same resolution as source (default). Use higher values when user asks to upscale, enlarge, make bigger, or increase resolution. Small images (<480px) are automatically upscaled to at least 480px regardless of this setting. Default: 1."
|
|
440
|
+
},
|
|
441
|
+
"aspectRatio": {
|
|
442
|
+
"type": "string",
|
|
443
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". Never set for generic requests like \"make a video\"."
|
|
444
|
+
}
|
|
445
|
+
},
|
|
446
|
+
"required": [
|
|
447
|
+
"prompt"
|
|
448
|
+
]
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
},
|
|
452
|
+
{
|
|
453
|
+
"type": "function",
|
|
454
|
+
"function": {
|
|
455
|
+
"name": "refine_result",
|
|
456
|
+
"description": "Make ANY edit to an existing result image. This is the DEFAULT tool for follow-up requests after results exist. Use whenever the user wants to modify, adjust, or build upon a previous result — including brightness, color, sharpening, object removal, background changes, further restoration, or any other edit. If the user does not specify which image, use the most recent result (index 0 if only one result, or the last result the user referenced). Only use restore_photo instead if the user explicitly wants to start over from the original upload.",
|
|
457
|
+
"parameters": {
|
|
458
|
+
"type": "object",
|
|
459
|
+
"properties": {
|
|
460
|
+
"prompt": {
|
|
461
|
+
"type": "string",
|
|
462
|
+
"description": "Targeted refinement prompt for Qwen Image Edit 2511 (50-150 words, natural language sentences).\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing.\n\nPROMPT ORDER: [IDENTITY LOCK if people] → [SPECIFIC CHANGE] → [PRESERVE EVERYTHING ELSE]\n\nRules:\n- Use POSITIVE phrasing only. The model ignores negatives (\"preserve exact facial likeness\" NOT \"don't change the face\").\n- Describe ONLY what needs to change (the delta). The base image already contains most of the truth — do not rewrite the entire image.\n- Be specific about what to change: \"warmer skin tones\", \"cooler shadows\", \"sharper facial features\", \"more natural greens\".\n- For creative refinements: lean into specifics — \"add more dramatic Rembrandt lighting\", \"push the colors more toward Warhol neon pop\", \"make the anime eyes larger and more expressive\", \"add more superhero energy with glowing effects\".\n- CRITICAL for photos with people: FRONT-LOAD identity preservation before the edit. Start with \"Preserve exact facial likeness, face structure, eye shape, nose shape, mouth shape, jawline, skin tone, hairline, apparent age, and overall recognizability.\"\n- ALWAYS end with \"Preserve all unmentioned details\" to prevent unwanted changes.\n\nBATCH VARIATIONS: Only use Dynamic Prompt syntax when the user explicitly asks to explore different refinement directions. Example: \"refine with {more contrast|softer lighting|richer colors}\". Default to identical prompts for refine_result batches."
|
|
463
|
+
},
|
|
464
|
+
"sourceImageIndex": {
|
|
465
|
+
"type": "number",
|
|
466
|
+
"description": "Which result image to refine (0-based index). If the user specifies an image number, use that index. If omitted, the latest result is used automatically. When multiple results exist and the user previously referenced a specific one, use that one."
|
|
467
|
+
},
|
|
468
|
+
"numberOfVariations": {
|
|
469
|
+
"type": "number",
|
|
470
|
+
"description": "Number of variations (1-16). Use 1 unless user requests multiple. Default: 1.",
|
|
471
|
+
"minimum": 1,
|
|
472
|
+
"maximum": 16
|
|
473
|
+
},
|
|
474
|
+
"scale": {
|
|
475
|
+
"type": "number",
|
|
476
|
+
"enum": [
|
|
477
|
+
1,
|
|
478
|
+
1.5,
|
|
479
|
+
2,
|
|
480
|
+
3,
|
|
481
|
+
4
|
|
482
|
+
],
|
|
483
|
+
"description": "Output scale multiplier relative to the source image size. 1 = same resolution as source (default). Use higher values when user asks to upscale, enlarge, make bigger, or increase resolution. Small images (<480px) are automatically upscaled to at least 480px regardless of this setting. Default: 1."
|
|
484
|
+
},
|
|
485
|
+
"aspectRatio": {
|
|
486
|
+
"type": "string",
|
|
487
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". Never set for generic requests like \"make a video\"."
|
|
488
|
+
}
|
|
489
|
+
},
|
|
490
|
+
"required": [
|
|
491
|
+
"prompt"
|
|
492
|
+
]
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
},
|
|
496
|
+
{
|
|
497
|
+
"type": "function",
|
|
498
|
+
"function": {
|
|
499
|
+
"name": "animate_photo",
|
|
500
|
+
"description": "Animate a photo into video with motion, audio, and dialogue using LTX 2.3 or WAN 2.2. Do NOT use this tool for seedance2 or seedance2-fast. Seedance 2.0 media references must go through generate_video with referenceImageIndices/referenceVideoIndices/referenceAudioIndices and @Image/@Video/@Audio role text in the prompt; for seamless-loop Seedance requests with one uploaded image, the prompt should anchor it as both the first frame and last frame. LTX/WAN NOTE: uploaded audio files are not loose references for ltx23/wan22; use sound_to_video when uploaded audio is the primary sync target. DANCE REQUESTS (\"make them dance\", \"do the X dance\"): use dance_montage — NOT this tool. LTX 2.3 generates audio natively — describe dialogue and ambient sounds directly in the prompt (do NOT pre-generate audio for this tool). If the user provides exact speech, include it in double quotes; if they only imply speech, describe the performance and voice without inventing quoted words. Never use placeholders such as \"while speaking\", \"dialogue begins\", \"explaining\", or \"final line lands\". PERSONA VOICE: Only when the user explicitly asks to use/clone a registered persona voice clip, call resolve_personas first, then set voicePersonaName to select which persona's voice clip to use. Do not set voicePersonaName for ordinary character dialogue or inferred voices; describe those voices in the prompt for native LTX audio. For cross-persona narration (e.g. David narrates a video of Aleyna), resolve both personas and set voicePersonaName to the narrator only if that registered voice was requested. Persona voice requires ltx23 — always use ltx23 when persona voice is requested (WAN 2.2 does not support voice identity). PERSONA PIPELINE: For persona videos, ensure an image of the persona exists before calling animate_photo. The standard pipeline is: resolve_personas → edit_image → animate_photo. 
If a suitable persona image already exists (user uploaded one, a prior edit_image/generate_image result, OR the user explicitly says to use the Persona image/reference photo directly), skip edit_image and animate directly. After resolve_personas, this tool can animate the injected persona image directly when that explicit direct-use instruction is given. Auto-uses the latest result image (from any prior tool) unless sourceImageIndex is set. Supports start-frame (default), end-frame, and start+end interpolation modes for LTX/WAN — ask the user which frame role their image should play if they mention \"end frame\", \"last frame\", or provide two images. FIRST+LAST FRAME WORKFLOW: When the user wants a non-Seedance video using two different scenes as start and end frames, FIRST generate both images in a single generate_image/edit_image call with numberOfVariations=2 and Dynamic Prompts, THEN call animate_photo with frameRole=\"both\", sourceImageIndex=0, endImageIndex=1. Never generate the two frames in separate tool calls. In frameRole=\"both\", the handler automatically inspects both images and upgrades the base prompt into a scene-aware smooth transition prompt, so your prompt should state the desired transition style, action, dialogue, and audio rather than trying to list every visible object. If the request is vague, analyze the image first and suggest 2-3 specific animation ideas tailored to what you see. Only call once you have clear creative intent. N-VIDEOS PATTERN — ALWAYS BATCH IN ONE CALL: When the user wants N video versions or a multi-segment stitched non-Seedance video, NEVER call animate_photo N times. Always use sourceImageIndices in a single call so all N projects run in parallel. sourceImageIndices supports up to 16 entries; there is NO 3-clip cap, so do not split one planned batch into \"first 3\" and \"remaining\" calls. 
For a dialogue-heavy total-duration request with no explicit per-clip duration, prefer 15-second clips (30s total = 2 clips × 15s), not 5×6s or 6×5s. Two flavors: (A) SHARED CONTENT — when all N clips have the same dialogue/motion but different source visuals (different scenes, outfits, environments, persona looks), first generate N distinct images via ONE edit_image/generate_image call with numberOfVariations=N + Dynamic Prompts {|}, then call animate_photo with sourceImageIndices=[start..start+N-1] and a single shared `prompt`. If all segments intentionally reuse the primary uploaded image instead of generated source images, use sourceImageIndices=[-1,-1,...] with one -1 per segment. For a long or multi-segment video from a single supplied/uploaded image WITHOUT a requested image/keyframe/version generation stage, use sourceImageIndices=[-1,-1,...] and per-clip prompts. Only set frameRole=\"both\" and endImageIndex=-1 when the user explicitly says the same uploaded/source/original image should be both the first and last frame of every segment. If the user requests generated source images first, honor that image stage, then animate the generated result indices. When using generated scene keyframes and each clip should begin and end on its own scene image for stitching, call animate_photo with frameRole=\"both\" and sourceImageIndices=[start..end] but OMIT endImageIndex; do not set endImageIndex=-1 unless every source is the uploaded image. (B) PER-CLIP CONTENT — when each clip has DIFFERENT dialogue, jokes, narration, or motion (e.g. \"4 videos where each tells a different joke\"), pass BOTH sourceImageIndices AND `prompts` (an array of N strings, one per clip) in the same single call. Each prompt must independently anchor the visible characters, scene action, camera, audio, exact screenplay-style speaker tags, and exact quoted dialogue for that segment. 
If you just wrote or displayed a script/table, copy the exact dialogue lines into the corresponding per-clip prompts; do not summarize them as speech activity. If using named speaker tags with any multi-person reference image or generated scene keyframe, include one explicit cast map in each prompt that binds each name to visible position, clothing, and props/actions, e.g. SPEAKER_A = left person holding a prop; SPEAKER_B = center person with tablet; SPEAKER_C = right person near table. Do not also describe the same people again as generic man/boy/girl/woman/character subjects. For screenplay, storyboard, commercial, series, or other longer-form tasks with recurring characters, preserve the same character names and repeated visual anchors in every per-clip prompt where each character appears. The fan-out launches all N projects in parallel with their respective per-clip prompts. Use the standard single-source path (numberOfVariations only) when the user wants motion variety from a single fixed frame instead.",
|
|
501
|
+
"parameters": {
|
|
502
|
+
"type": "object",
|
|
503
|
+
"properties": {
|
|
504
|
+
"prompt": {
|
|
505
|
+
"type": "string",
|
|
506
|
+
"description": "I2V RULE: Do NOT re-describe what is visible in the input image. Focus on the transition from stillness — motion, expression changes, what happens next, camera movement, and sound.\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing. Set skipPromptProcessing=true; for Seedance also set expandPrompt=false.\n\nSTRUCTURE: \"[How the subject begins to move]. [What changes next]. [Camera behavior]. [Audio].\"\n\nMOTION PACING: Scale complexity to duration. <=6s: 1 main action beat + 1 simple camera move. Around 10s: 2-3 clear action beats + 1 camera move. >10s: up to 4 action beats in clear sequence. Prefer fewer readable beats over dense micro-actions, especially in short clips.\n\nBLOCKING: Use the image as the anchor and direct only meaningful layout changes. If the prompt introduces multiple moving subjects, state left/right placement, foreground/background, facing toward/away, and relative distance.\n\nACTION: One flowing paragraph. Describe motion beat by beat with temporal connectors (\"as\", \"then\", \"while\"). Specify who moves, what moves, how it moves, and what the camera does. One main thread — avoid too many actions at once or generic phrases like \"comes alive.\"\n\nDIALOGUE: Put user-provided spoken lines in double quotes. For screenplay-style or longer-form tasks, prefix each spoken line with a stable speaker tag outside the quotes, e.g. CHARACTER: \"We made it.\" Break long speech into short quoted phrases with acting beats between them (gestures, pauses, glances). If the user asks for speech but provides no exact words, describe the visible delivery, voice quality, and emotion without inventing quoted dialogue; ask only when exact wording is the point of the request. 
Never write placeholders such as \"while speaking\", \"dialogue begins\", \"explaining\", or \"final line lands\". Show emotion through visible behavior, not labels. LTX 2.3 generates audio natively. QUOTING RULE: ONLY use double quotes for spoken dialogue. Never quote on-screen text, overlay text, titles, captions, signs, or any visual text — describe them without quotes (e.g. bold white text reading CONGRATULATIONS overlays the lower third).\n\nAUDIO: Prompt sound intentionally — voice quality, volume, room tone, ambience, music, weather, footsteps. Include language or accent if relevant. Useful voice/volume anchors: whisper, mutter, shout, scream, energetic announcer, resonant voice with gravitas, distorted radio-style, robotic monotone, childlike curiosity.\n\nCAMERA: Cinematic terms — slow push-in, static tripod, handheld, slow arc, dolly in. Describe movement relative to subject.\n\nFor first+last-frame transitions (frameRole=\"both\"), write a concise base request for the transition style, action, dialogue, and audio. The handler will inspect both frames and expand it into a scene-aware prompt that maps visible objects and subjects between frames.\n\nFor specific characters (movies, TV): describe visual appearance — don't rely on names alone.\n\nFor complex/creative scenes (characters talking, skits), capture full creative intent — system auto-expands into detailed prompt.\n\nAVOID: Re-describing the image, vague prompts, too many actions at once, abstract emotions without visible behavior, rigid numeric constraints, readable text or logos.\n\nWAN 2.2 (\"wan22\"): 30-150 words, subtle natural movements.\n\nBATCH VARIATIONS: When numberOfVariations > 1, use Dynamic Prompt syntax to vary motion, camera, or atmosphere while preserving the user's specified elements. Example: \"{gentle sway with soft birdsong|dramatic zoom with rolling thunder|slow pan with ambient music}\"."
|
|
507
|
+
},
|
|
508
|
+
"expandPrompt": {
|
|
509
|
+
"type": "boolean",
|
|
510
|
+
"description": "Optional. Set false only for pipeline-authored prompts that should bypass model-specific prompt expansion."
|
|
511
|
+
},
|
|
512
|
+
"skipPromptProcessing": {
|
|
513
|
+
"type": "boolean",
|
|
514
|
+
"description": "Bypass automatic prompt shaping/refinement, image-description anchoring, transition-prompt rewriting, and voice-identity prompt formatting so the prompt text is sent unchanged to the video model. Set true ONLY when the user explicitly says not to modify/rewrite/enhance/expand/change/improve the prompt, or to use/send it exactly, verbatim, or as-is, AND the provided prompt already satisfies the tool requirements. Continue to set non-prompt parameters such as source indices, frameRole, model, duration, count, and aspect ratio. For Seedance literal prompt requests, also set expandPrompt=false. Do not set for ordinary underspecified requests."
|
|
515
|
+
},
|
|
516
|
+
"videoModel": {
|
|
517
|
+
"type": "string",
|
|
518
|
+
"enum": [
|
|
519
|
+
"ltx23",
|
|
520
|
+
"wan22"
|
|
521
|
+
],
|
|
522
|
+
"description": "Which video model to use. \"ltx23\" (default): LTX 2.3 with native audio; Fast/HQ use the distilled 8-step worker and Default Media Quality Pro uses the non-distilled dev worker. \"wan22\": Fast 4-step, simple motion, no audio. Use ltx23 for most requests. Use wan22 for quick simple motions without audio. Default: \"ltx23\". Do not set seedance2 or seedance2-fast here; use generate_video with referenceImageIndices and @Image role text for Seedance."
|
|
523
|
+
},
|
|
524
|
+
"negativePrompt": {
|
|
525
|
+
"type": "string",
|
|
526
|
+
"description": "Optional negative prompt for LTX/Wan image-to-video models. Use only when the user explicitly states what should be avoided; do not use this field for Seedance workflows."
|
|
527
|
+
},
|
|
528
|
+
"duration": {
|
|
529
|
+
"type": "number",
|
|
530
|
+
"description": "Video duration in seconds. Default: 5. Use when the user explicitly requests a specific length (e.g., \"make a 10 second video\"). Range: 2-20."
|
|
531
|
+
},
|
|
532
|
+
"targetResolution": {
|
|
533
|
+
"type": "number",
|
|
534
|
+
"description": "Short-side video resolution target in pixels. Use when the user asks for a bare named resolution such as \"480p\", \"720p\", or \"1080p\" without exact pixels or an output orientation. This preserves the source image aspect ratio. Do NOT set width, height, or exact-pixel aspectRatio for bare named resolution requests. If the user says \"720p portrait\" or \"720p landscape\", use exact-pixel aspectRatio instead."
|
|
535
|
+
},
|
|
536
|
+
"sourceImageIndex": {
|
|
537
|
+
"type": "number",
|
|
538
|
+
"description": "Which image to use as the START frame. Use 0-based non-negative indices for generated result images. Use negative indices for uploaded images: -1 = first/primary upload, -2 = second upload, -3 = third upload, etc. Omit to auto-select: uses the latest result for \"start\"/\"end\" modes, or the FIRST result for \"both\" mode. IMPORTANT: When frameRole is \"both\", set this to the start frame image index and endImageIndex to the end frame image index."
|
|
539
|
+
},
|
|
540
|
+
"sourceImageIndices": {
|
|
541
|
+
"type": "array",
|
|
542
|
+
"items": {
|
|
543
|
+
"type": "number"
|
|
544
|
+
},
|
|
545
|
+
"minItems": 1,
|
|
546
|
+
"maxItems": 16,
|
|
547
|
+
"description": "Array of source frame indices — one video is generated per entry as its own SDK project, all running in PARALLEL. Use 0-based non-negative result indices for generated images. Use negative indices for uploaded images: -1 = first/primary upload, -2 = second upload, -3 = third upload, etc. Repeating -1 is allowed and is REQUIRED for multi-segment videos that reuse the same uploaded image as every segment's start frame. By default all projects share the `prompt`/`voice`/`duration`, but you can pass `prompts` (array) to give each clip its own dialogue/motion. ALWAYS use this for non-Seedance \"N videos\" or multi-segment request — never call animate_photo N times sequentially. Do NOT combine with `numberOfVariations`, `sourceImageIndex`, or frameRole=\"end\". You MAY combine with frameRole=\"both\" when clips need end frames. For adjacent transition chains across generated images, use sourceImageIndices=[start..end-1] and endImageIndices=[start+1..end] so N images produce N-1 transition clips. If the uploaded/original image starts the chain and generated results are the remaining frames, use sourceImageIndices=[-1,start..end-1] and endImageIndices=[start..end]. If the user supplies multiple uploaded images as the actual keyframe sequence, use adjacent negative uploaded indices, e.g. 5 uploaded images become sourceImageIndices=[-1,-2,-3,-4], endImageIndices=[-2,-3,-4,-5], frameRole=\"both\", prompts length 4, then stitch_video. If the user specifies transition motion, camera behavior, actions, dialogue, or audio, copy those instructions into every corresponding per-clip prompt; only invent a generic smooth transition when the user does not specify one. If the user asks for a seamless loop or final transition from the last image back to the first, close the chain by including the last image as a source and the first image as the final end frame, e.g. 5 uploaded images become sourceImageIndices=[-1,-2,-3,-4,-5], endImageIndices=[-2,-3,-4,-5,-1]. 
For generated scene keyframes that should each loop to themselves, omit endImageIndex/endImageIndices so each source image is also its own end frame. Set endImageIndex=-1 only when every sourceImageIndices entry is also -1 and every segment reuses the first uploaded image. Range: 1–16 indices. For generated image batches, values MUST be read from the latest edit_image/generate_image tool result's `startIndex` field. If startIndex=3 and 4 images were generated in that batch, pass `[3,4,5,6]` (NOT `[0,1,2,3]`). Do NOT assume generated indices start at 0 — they don't if there are prior results in the conversation."
|
|
548
|
+
},
|
|
549
|
+
"prompts": {
|
|
550
|
+
"type": "array",
|
|
551
|
+
"items": {
|
|
552
|
+
"type": "string"
|
|
553
|
+
},
|
|
554
|
+
"minItems": 1,
|
|
555
|
+
"maxItems": 16,
|
|
556
|
+
"description": "Per-clip prompts for fan-out — use when the user wants DIFFERENT dialogue, jokes, narration, or motion in each video. MUST be paired with `sourceImageIndices` and have the SAME length. Each entry is the full prompt for the corresponding source image. If a clip has speech, include exact spoken words in double quotes with stable speaker tags; do NOT write placeholders like \"while speaking\", \"dialogue begins\", \"explaining\", or \"final line lands\". If you just wrote a script/table/storyboard, copy that clip's exact dialogue into this prompt. When named speakers appear in a multi-person reference image or generated keyframe, start each entry with one compact cast map that binds names to visible anchors before dialogue, e.g. Cast map: SPEAKER_A is the left person holding the prop; SPEAKER_B is the center person with the tablet; SPEAKER_C is the right person near the table. Then move directly into action/dialogue; do not describe those same people again as generic man/boy/girl/woman/character subjects. This prevents speaker tags from being assigned to the wrong visible character. When set, the top-level `prompt` parameter is ignored (still required by the schema — just pass any descriptive string, e.g. a brief summary of the batch). Example: 4 source images of a couple, \"make each video have a different joke\" → sourceImageIndices=[0,1,2,3], prompts=[\"Cast map: She is the left woman in the blue dress; He is the right man in the gray jacket. She says: \\\"Why did the scarecrow win an award?\\\" He grins.\", \"Cast map: He is the right man in the gray jacket; She is the left woman in the blue dress. He says: \\\"Because he was outstanding in his field!\\\" She laughs.\", \"...\", \"...\"]. Omit this when all clips share the same dialogue/motion (visual variety only) — fan-out will use the shared `prompt` for every clip."
|
|
557
|
+
},
|
|
558
|
+
"numberOfVariations": {
|
|
559
|
+
"type": "number",
|
|
560
|
+
"description": "Number of variations (1-16). Use 1 unless user explicitly requests multiple separate video outputs.",
|
|
561
|
+
"minimum": 1,
|
|
562
|
+
"maximum": 16
|
|
563
|
+
},
|
|
564
|
+
"aspectRatio": {
|
|
565
|
+
"type": "string",
|
|
566
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". Never set for generic requests like \"make a video\"."
|
|
567
|
+
},
|
|
568
|
+
"frameRole": {
|
|
569
|
+
"type": "string",
|
|
570
|
+
"enum": [
|
|
571
|
+
"start",
|
|
572
|
+
"end",
|
|
573
|
+
"both"
|
|
574
|
+
],
|
|
575
|
+
"description": "How to use the source image(s) for non-Seedance video generation. \"start\" (default): image is the first frame — video animates forward from it. \"end\": image is the last frame — video leads up to it. \"both\": two images provided — interpolates between start and end frames. For single clips using \"both\", set sourceImageIndex to the start frame and endImageIndex to the end frame. For sourceImageIndices fan-out using repeated -1, use frameRole=\"both\" and set endImageIndex=-1 when every segment must use the uploaded image as the shared last frame. For adjacent generated-image transitions, use frameRole=\"both\" with matching sourceImageIndices and endImageIndices arrays. Omit endImageIndex/endImageIndices only when each source image should also be its own end frame. The handler inspects different start/end frames and generates a detailed transition prompt automatically."
|
|
576
|
+
},
|
|
577
|
+
"endImageIndex": {
|
|
578
|
+
"type": "number",
|
|
579
|
+
"description": "Which image to use as the END frame. Use 0-based non-negative indices for generated results. Use negative indices for uploaded images: -1 = first/primary upload, -2 = second upload, -3 = third upload, etc. For a single frameRole=\"both\" transition between two different images, set this to the desired end frame. For sourceImageIndices fan-out where each generated keyframe should also be its own last frame, OMIT this field. Use a shared uploaded endImageIndex only when every sourceImageIndices entry is also an uploaded image; otherwise use endImageIndices for per-clip end frames."
|
|
580
|
+
},
|
|
581
|
+
"endImageIndices": {
|
|
582
|
+
"type": "array",
|
|
583
|
+
"items": {
|
|
584
|
+
"type": "number"
|
|
585
|
+
},
|
|
586
|
+
"minItems": 1,
|
|
587
|
+
"maxItems": 16,
|
|
588
|
+
"description": "Per-clip END frame indices for sourceImageIndices fan-out. Use ONLY with frameRole=\"both\". Length MUST exactly match sourceImageIndices. Use 0-based non-negative indices for generated results and negative indices for uploaded images (-1 first upload, -2 second upload, etc.). Use this for transition chains between generated images, e.g. 5 generated images at indices [0,1,2,3,4] should become 4 transition clips with sourceImageIndices=[0,1,2,3], endImageIndices=[1,2,3,4], prompts length 4, duration as requested, then stitch_video. If the chain starts on the uploaded image and continues through generated results [0,1,2,3], use sourceImageIndices=[-1,0,1,2] and endImageIndices=[0,1,2,3]. If the user supplies 5 uploaded images as the sequence, use sourceImageIndices=[-1,-2,-3,-4] and endImageIndices=[-2,-3,-4,-5]. If the user requests a seamless loop or final transition back to the first image, append that loop closure: sourceImageIndices=[-1,-2,-3,-4,-5], endImageIndices=[-2,-3,-4,-5,-1]. Do NOT also set endImageIndex when using this."
|
|
589
|
+
},
|
|
590
|
+
"voicePersonaName": {
|
|
591
|
+
"type": "string",
|
|
592
|
+
"description": "ONLY when the user explicitly requests a registered/reference persona voice clip. Name of the persona whose voice clip to use as referenceAudioIdentity. Set this when the narrator/speaker is a different persona than the one shown in the video (e.g. \"David\" narrates a video of Aleyna), or to explicitly select a requested voice when multiple personas with voice clips are resolved. Do NOT set this for ordinary character dialogue, inferred voices, or personas without a voice clip — LTX 2.3 generates voice natively from the text prompt instead. Requires ltx23."
|
|
593
|
+
}
|
|
594
|
+
},
|
|
595
|
+
"required": [
|
|
596
|
+
"prompt"
|
|
597
|
+
]
|
|
598
|
+
}
|
|
599
|
+
}
|
|
600
|
+
},
|
|
601
|
+
{
|
|
602
|
+
"type": "function",
|
|
603
|
+
"function": {
|
|
604
|
+
"name": "change_angle",
|
|
605
|
+
"description": "Generate the photo from a different camera angle or perspective. Uses AI to create a new view of the subject as if photographed from a different position. Use when the user wants to see the subject from another angle, generate a different view, create a portrait from a specific direction, or get a closeup/wide shot. Examples: \"show me from the left side\", \"generate a 3/4 portrait view\", \"closeup from slightly above\". IMPORTANT: When previous results exist, this tool automatically uses the LATEST result image unless you specify a different sourceImageIndex or the user explicitly says \"original\".",
|
|
606
|
+
"parameters": {
|
|
607
|
+
"type": "object",
|
|
608
|
+
"properties": {
|
|
609
|
+
"description": {
|
|
610
|
+
"type": "string",
|
|
611
|
+
"description": "EXACT camera angle string. You MUST construct this by concatenating exactly one value from each category below, separated by single spaces. No commas, no extra words.\n\nFormat: \"[azimuth] [elevation] [distance]\"\n\nAzimuth (pick one): \"front view\", \"front-right quarter view\", \"right side view\", \"back-right quarter view\", \"back view\", \"back-left quarter view\", \"left side view\", \"front-left quarter view\"\nElevation (pick one): \"low-angle shot\", \"eye-level shot\", \"elevated shot\", \"high-angle shot\"\nDistance (pick one): \"close-up\", \"medium shot\", \"wide shot\"\n\nExamples:\n- \"front-right quarter view eye-level shot medium shot\"\n- \"left side view eye-level shot close-up\"\n- \"front view low-angle shot wide shot\"\n- \"right side view elevated shot medium shot\"\n\nMap user requests: \"from the left\" → \"left side view\", \"looking up at\" → \"low-angle shot\", \"closeup\" → \"close-up\", \"3/4 view\" → \"front-right quarter view\" or \"front-left quarter view\", \"portrait\" → \"front-right quarter view eye-level shot medium shot\".\nDefault elevation to \"eye-level shot\" and distance to \"medium shot\" when not specified."
|
|
612
|
+
},
|
|
613
|
+
"sourceImageIndex": {
|
|
614
|
+
"type": "number",
|
|
615
|
+
"description": "Which result image to use as source (0-based index). Omit to use the latest result automatically (or the original if no results exist). Only set explicitly when the user specifies a particular image number or explicitly says \"original\" (use -1 for original)."
|
|
616
|
+
},
|
|
617
|
+
"loraStrength": {
|
|
618
|
+
"type": "number",
|
|
619
|
+
"description": "LoRA strength for angle generation (0.1-1.0). Default: 0.9. Lower values preserve more of the original appearance, higher values produce stronger angle changes. Only set when the user wants to control the transformation intensity."
|
|
620
|
+
},
|
|
621
|
+
"aspectRatio": {
|
|
622
|
+
"type": "string",
|
|
623
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". Never set for generic requests like \"make a video\"."
|
|
624
|
+
}
|
|
625
|
+
},
|
|
626
|
+
"required": [
|
|
627
|
+
"description"
|
|
628
|
+
]
|
|
629
|
+
}
|
|
630
|
+
}
|
|
631
|
+
},
|
|
632
|
+
{
|
|
633
|
+
"type": "function",
|
|
634
|
+
"function": {
|
|
635
|
+
"name": "video_to_video",
|
|
636
|
+
"description": "Transform an existing video using AI. Uses WAN 2.2 Animate (move/replace) with a reference image to animate a photo with the video's motion or swap the video's subject, LTX-2.3 V2V ControlNet (canny/pose/depth/detailer) for video-only transforms, or Seedance V2V when the user explicitly asks to transform, upscale, enhance, restyle, or remaster an uploaded video with Seedance. Requires an uploaded video file. Use when the user wants to animate a photo with video motion, replace subjects in a video, restyle an existing video, or enhance video quality.",
|
|
637
|
+
"parameters": {
|
|
638
|
+
"type": "object",
|
|
639
|
+
"properties": {
|
|
640
|
+
"prompt": {
|
|
641
|
+
"type": "string",
|
|
642
|
+
"description": "Describe the TARGET appearance (not the transformation process). 2-4 present-tense sentences.\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing. For Seedance, set expandPrompt=false.\n\nFor LTX-2.3 canny/depth/pose modes, the source video preserves composition, depth, or motion. Spend prompt detail on style, atmosphere, lighting, surface texture, color palette, scale, and pacing.\n\nExamples by mode:\n- animate-move (DEFAULT — WAN 2.2 Animate Move: applies camera/motion from source video to reference image): \"Smooth cinematic camera movement following the subject through the scene.\"\n- animate-replace (WAN 2.2 Animate Replace: replaces the subject in the source video with the reference image): \"The person from the reference photo performing the actions from the video.\"\n- canny (LTX-2.3 — edge-detection restyle): \"Hand-drawn watercolor anime style with soft ink edges, muted teal and coral palette, rain mist, neon reflections, warm rim light, preserving original silhouettes and composition.\"\n- pose (LTX-2.3 — tracks skeleton, replace person): \"A glossy cartoon robot with exaggerated proportions, brushed metal texture, glowing cyan joints, energetic stage lighting, preserving the original dance timing and pose.\"\n- depth (LTX-2.3 — depth-map restyle): \"A misty alpine valley at golden hour, expansive scale, volumetric haze, cool blue shadows, warm rim light, cinematic depth, lingering continuous shot.\"\n- detailer (LTX-2.3 — enhance quality): DESCRIBE THE SOURCE, do not request changes. Append quality qualifiers only. E.g. 
\"The same scene, ultra-sharp and clean, crisp high-resolution detail, preserving all original content, composition, and color.\" Avoid words like \"enhanced textures\", \"restyled\", or any new subjects/objects — they cause drift.\n- seedance-v2v (BytePlus Dreamina Seedance 2.0 V2V): \"Restyle the source clip in a watercolor look with soft ink edges, while preserving its motion and composition.\" Use natural prose; Seedance reads the reference video holistically rather than via control-net constraints, so describe target style/mood/dialogue rather than control strength.\n\nPresent tense. Positive phrasing. Concrete visual details.\n\nBATCH VARIATIONS: When numberOfVariations > 1, use Dynamic Prompt syntax to vary the artistic treatment while keeping control mode and structural intent consistent. Example: \"transform to {watercolor with soft edges|oil painting with bold strokes|anime with clean lines} style\"."
|
|
643
|
+
},
|
|
644
|
+
"expandPrompt": {
|
|
645
|
+
"type": "boolean",
|
|
646
|
+
"description": "Seedance only. Whether to run the shared Seedance prompt shaper before dispatch. Defaults to true; set false only when the user explicitly asks to submit the compact prompt directly or not modify the prompt."
|
|
647
|
+
},
|
|
648
|
+
"videoSourceIndex": {
|
|
649
|
+
"type": "number",
|
|
650
|
+
"description": "Which uploaded video to transform. OMIT this field when there is only one uploaded video — the tool auto-selects it. Only pass when you need to pick among multiple uploaded videos. Indexing: 0-based (0 = first uploaded video, 1 = second). Note: this differs from analyze_video which uses negative indices; this tool also tolerates the negative form (-1 = first uploaded) for convenience."
|
|
651
|
+
},
|
|
652
|
+
"controlMode": {
|
|
653
|
+
"type": "string",
|
|
654
|
+
"enum": [
|
|
655
|
+
"animate-move",
|
|
656
|
+
"animate-replace",
|
|
657
|
+
"canny",
|
|
658
|
+
"pose",
|
|
659
|
+
"depth",
|
|
660
|
+
"detailer",
|
|
661
|
+
"seedance-v2v"
|
|
662
|
+
],
|
|
663
|
+
"description": "How the source video and (optional) reference image interact. Pick by user intent:\n• \"animate-move\" (DEFAULT) — WAN 2.2 Animate Move. Applies camera movement and motion from the source video to the reference image, bringing a still photo to life. Requires sourceImageIndex.\n• \"animate-replace\" — WAN 2.2 Animate Replace. Replaces the subject in the source video with the person/character from the reference image, keeping the video's background and motion. Requires sourceImageIndex.\n• \"canny\" — LTX-2.3 edge-detection control. Best for restyling while preserving exact composition and silhouettes (e.g. \"make this footage look like anime / oil painting / watercolor\"). Use for subjects with crisp edges — people, objects, graphics. Video-only; no reference image needed.\n• \"pose\" — LTX-2.3 skeletal tracking. Best for replacing a person while keeping their motion (e.g. \"turn this dancer into a robot\"). Image optional — if provided, controls appearance; otherwise the prompt drives appearance. Requires person-centric motion.\n• \"depth\" — LTX-2.3 depth-map control. Best for restyling scenes with perspective, camera movement, or volumetric content (landscapes, interiors, camera pans). Preserves 3D spatial layout rather than 2D edges; more forgiving than canny when edges are noisy. Video-only.\n• \"detailer\" — LTX-2.3 quality enhancement. Sharpens detail and texture WITHOUT restyling. The prompt must DESCRIBE THE ORIGINAL scene with quality qualifiers (sharp, clean, high-resolution) — never request content changes, new textures, or a new look. Pick this when the user asks to \"improve quality\", \"enhance\", \"upscale\", or \"sharpen\" without a creative transformation.\n• \"seedance-v2v\" — BytePlus Dreamina Seedance 2.0 video-to-video. Use only when the user explicitly asks for Seedance on the uploaded source video, such as Seedance Fast upscale, enhance, remaster, restyle, or transform. 
High-fidelity quality, native audio, time-coded scene control. Seedance V2V reads @Video1 holistically. Use it for restyling, motion transfer, extension, subject replacement, or scene transformation, and assign @Video1 a clear role such as source clip, camera movement, action timing, edit rhythm, or continuation anchor. Distinct from canny/depth/pose which use control-net constraints — Seedance treats the reference video holistically.\nCanny vs depth: canny preserves silhouettes and fine outlines — pick it for subject-led scenes and graphic restyles. Depth preserves 3D structure — pick it for scenes where the camera moves or spatial layout matters more than edge fidelity. Default: \"animate-move\"."
|
|
664
|
+
},
|
|
665
|
+
"negativePrompt": {
|
|
666
|
+
"type": "string",
|
|
667
|
+
"description": "Non-Seedance only. Optional negative prompt for LTX/Wan video-to-video models. Do not set when controlMode is seedance-v2v or videoModel is seedance2/seedance2-fast; rewrite user-provided Seedance avoid/ban/no-X requests as positive prompt instructions."
|
|
668
|
+
},
|
|
669
|
+
"videoModel": {
|
|
670
|
+
"type": "string",
|
|
671
|
+
"enum": [
|
|
672
|
+
"ltx23-v2v",
|
|
673
|
+
"wan22-animate",
|
|
674
|
+
"seedance2",
|
|
675
|
+
"seedance2-fast"
|
|
676
|
+
],
|
|
677
|
+
"description": "Model selector for this video-to-video request. Usually omit; controlMode chooses the non-Seedance model. For controlMode=\"seedance-v2v\", use \"seedance2-fast\" by default, especially when the user asks for Seedance Fast/seedance-fast or 480p/720p. Use \"seedance2\" only when the user explicitly asks for the full/non-fast Seedance model or 1080p."
|
|
678
|
+
},
|
|
679
|
+
"generateAudio": {
|
|
680
|
+
"type": "boolean",
|
|
681
|
+
"description": "Seedance V2V only. Whether Seedance should generate/retain a native audio track. Omit by default; set false only when the user explicitly asks for silent output or no audio. Use only with controlMode=\"seedance-v2v\"."
|
|
682
|
+
},
|
|
683
|
+
"targetResolution": {
|
|
684
|
+
"type": "number",
|
|
685
|
+
"description": "Seedance V2V only. Short-side output resolution target in pixels. Use when the user asks for a bare named resolution such as \"480p\", \"720p\", or \"1080p\" without exact dimensions. For Seedance V2V fast, 480p and 720p are supported; preserve the source video shape instead of forcing landscape pixels."
|
|
686
|
+
},
|
|
687
|
+
"sourceImageIndex": {
|
|
688
|
+
"type": "number",
|
|
689
|
+
"description": "Optional index of a reference image (0-based). Required for \"animate-move\" and \"animate-replace\". Optional for \"pose\" (controls appearance if provided). Ignored by \"canny\", \"depth\", and \"detailer\"."
|
|
690
|
+
},
|
|
691
|
+
"duration": {
|
|
692
|
+
"type": "number",
|
|
693
|
+
"description": "Output video duration in seconds. Range: 2-20 for WAN/LTX modes and 4-15 for controlMode=\"seedance-v2v\". If omitted, the tool matches the uploaded source video duration when available (capped to the selected model range); otherwise it falls back to 10s for WAN Animate Move/Replace and 5s for LTX-2.3/Seedance modes. For long stitched/bulk WAN Animate Move/Replace work with no explicit per-clip length, prefer about 10s clips rather than 5s chunks. Only pass this when the user explicitly requests a different length.",
|
|
694
|
+
"minimum": 2,
|
|
695
|
+
"maximum": 20
|
|
696
|
+
},
|
|
697
|
+
"numberOfVariations": {
|
|
698
|
+
"type": "number",
|
|
699
|
+
"description": "Number of video variations to generate (1-16). Default: 1.",
|
|
700
|
+
"minimum": 1,
|
|
701
|
+
"maximum": 16
|
|
702
|
+
}
|
|
703
|
+
},
|
|
704
|
+
"required": [
|
|
705
|
+
"prompt"
|
|
706
|
+
]
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
},
|
|
710
|
+
{
|
|
711
|
+
"type": "function",
|
|
712
|
+
"function": {
|
|
713
|
+
"name": "stitch_video",
|
|
714
|
+
"description": "Combine multiple videos into a single continuous video. Sources can be previously generated clips (non-negative indices into the session video-result array, populated by animate_photo, generate_video, sound_to_video, video_to_video, dance_montage — use videoStartIndex from their results to find the indices) and/or uploaded videos (negative indices: -1 = first uploaded video, -2 = second, etc.). Mix and match in any playback order — for example, pass [0, -1] to play the first generated clip followed by the first uploaded video (a generated bumper followed by the user's existing footage). Use when the user wants to join, merge, concatenate, or combine clips, including when they ask to add a generated bumper / intro / outro / tag / sting to an uploaded video. When the user asks to stitch \"these\" or all uploaded videos and does not name a different playback order, use the current upload/UI order exactly: [-1, -2, ...]. If the user explicitly asks for a different order, honor that requested order. Requires at least 2 source videos in total. Never ask the user to re-upload videos that were already generated or that are already attached to the session. When the user generated music with generate_music in this same session and wants it on the stitch (or asked for a music video / soundtrack), pass a non-negative audioIndex to attach that generated track. When the user uploaded an audio file and wants it overlaid on the stitched video (e.g. \"stitch the audio after\", \"overlay the audio\", \"audio on top of the video\"), pass a negative audioIndex (-1 = first uploaded audio, -2 = second, etc.). In both cases the source clips' own audio is replaced by the chosen track. When the user asks for a fade, dissolve, wipe, or slide between clips, pass `transition`; omit `transition` for a hard cut (the default). Do not use this for alternating/interleaved time slices such as \"alternate 1 second from each video\"; this tool only concatenates whole clips end-to-end. 
Use repeated replace_video_segment calls with replacementVideoIndex and replacementStartSeconds/replacementEndSeconds for existing-video interleaving.",
|
|
715
|
+
"parameters": {
|
|
716
|
+
"type": "object",
|
|
717
|
+
"properties": {
|
|
718
|
+
"videoIndices": {
|
|
719
|
+
"type": "array",
|
|
720
|
+
"items": {
|
|
721
|
+
"type": "number"
|
|
722
|
+
},
|
|
723
|
+
"description": "Ordered list of source video indices, in the desired playback order. Non-negative values are 0-based indices into the session generated-video array (results from animate_photo, generate_video, sound_to_video, video_to_video, dance_montage in this conversation). Negative values reference uploaded videos: -1 = first uploaded video, -2 = second, etc. Indices may be mixed — for example, [0, -1] plays the first generated clip followed by the first uploaded video. For vague \"these clips\" / \"all uploaded videos\" requests, use current upload/UI order [-1, -2, ...] unless the user explicitly says to reverse or otherwise reorder them."
|
|
724
|
+
},
|
|
725
|
+
"audioIndex": {
|
|
726
|
+
"type": "number",
|
|
727
|
+
"description": "Optional index of the audio track to mux onto the stitched output. Non-negative values are 0-based indices into the session generated-audio array (results from generate_music). Negative values reference uploaded audio: -1 = first uploaded audio, -2 = second, etc. When set, the chosen track is muxed onto the stitched output and the source clips' own audio is dropped. Use a non-negative value when the user generated music in the same session or asked for a soundtrack / music video stitch; use a negative value when the user wants their uploaded audio overlaid on the stitched video (e.g. \"stitch the audio after\", \"overlay the audio\"). Omit for a silent or source-audio-preserving stitch."
|
|
728
|
+
},
|
|
729
|
+
"transition": {
|
|
730
|
+
"type": "object",
|
|
731
|
+
"description": "Optional crossfade between adjacent clips. Omit for a hard-cut concat. When set, every adjacent pair of clips is joined with the same transition type and duration.",
|
|
732
|
+
"properties": {
|
|
733
|
+
"type": {
|
|
734
|
+
"type": "string",
|
|
735
|
+
"enum": [
|
|
736
|
+
"fade",
|
|
737
|
+
"dissolve",
|
|
738
|
+
"wipeleft",
|
|
739
|
+
"wiperight",
|
|
740
|
+
"slideup",
|
|
741
|
+
"slidedown"
|
|
742
|
+
],
|
|
743
|
+
"description": "\"fade\" / \"dissolve\" = soft mix; \"wipeleft\" / \"wiperight\" = horizontal wipe; \"slideup\" / \"slidedown\" = vertical slide. Maps to ffmpeg xfade transition names."
|
|
744
|
+
},
|
|
745
|
+
"durationSeconds": {
|
|
746
|
+
"type": "number",
|
|
747
|
+
"minimum": 0.2,
|
|
748
|
+
"maximum": 2,
|
|
749
|
+
"description": "Length of the crossfade in seconds. Default 0.5. Capped at 2s."
|
|
750
|
+
}
|
|
751
|
+
},
|
|
752
|
+
"required": [
|
|
753
|
+
"type"
|
|
754
|
+
]
|
|
755
|
+
}
|
|
756
|
+
},
|
|
757
|
+
"required": [
|
|
758
|
+
"videoIndices"
|
|
759
|
+
]
|
|
760
|
+
}
|
|
761
|
+
}
|
|
762
|
+
},
|
|
763
|
+
{
|
|
764
|
+
"type": "function",
|
|
765
|
+
"function": {
|
|
766
|
+
"name": "orbit_video",
|
|
767
|
+
"description": "Create a 360-degree orbit video around a subject. This is a SELF-CONTAINED pipeline — it automatically generates angle views (via change_angle), creates transition video clips, and stitches them into one seamless looping video. You only need ONE source image as the front view — either an uploaded image or a previously generated result. If the user uploaded an image, call this tool directly without generating anything first. Do NOT pre-generate multiple angles or variations — this tool handles everything internally. Use when the user asks for a \"360 pan\", \"orbit\", \"rotate around\", \"spin around\", or \"turntable\" view.",
|
|
768
|
+
"parameters": {
|
|
769
|
+
"type": "object",
|
|
770
|
+
"properties": {
|
|
771
|
+
"elevation": {
|
|
772
|
+
"type": "string",
|
|
773
|
+
"enum": [
|
|
774
|
+
"low-angle shot",
|
|
775
|
+
"eye-level shot",
|
|
776
|
+
"elevated shot",
|
|
777
|
+
"high-angle shot"
|
|
778
|
+
],
|
|
779
|
+
"description": "Camera elevation for all angles. Default: \"eye-level shot\"."
|
|
780
|
+
},
|
|
781
|
+
"distance": {
|
|
782
|
+
"type": "string",
|
|
783
|
+
"enum": [
|
|
784
|
+
"close-up",
|
|
785
|
+
"medium shot",
|
|
786
|
+
"wide shot"
|
|
787
|
+
],
|
|
788
|
+
"description": "Camera distance for all angles. Default: \"medium shot\"."
|
|
789
|
+
},
|
|
790
|
+
"prompt": {
|
|
791
|
+
"type": "string",
|
|
792
|
+
"description": "Describe the SUBJECT and ambient environment (for example, a concise description of the visible subject, location, weather, props, and ambience). Do NOT describe camera motion, rotation, panning, orbiting, or 360-degree movement — camera motion is handled automatically. Do NOT put spoken dialogue here — use the dialogue parameter instead. Music is automatically suppressed — use generate_music separately."
|
|
793
|
+
},
|
|
794
|
+
"dialogue": {
|
|
795
|
+
"type": "string",
|
|
796
|
+
"description": "Spoken dialogue or narration for a SINGLE segment of the orbit video. This is applied ONLY to the segment specified by dialogueSegment (default: first segment). All other segments get foley/ambient audio only. Keep it brief — each segment is 2.5 seconds (~6 words max). If the user asks for dialogue in \"just the first segment\" or \"only at the start\", put the speech here and leave prompt for motion/foley only. If the user asks for dialogue in multiple/every segment, use dialogues instead."
|
|
797
|
+
},
|
|
798
|
+
"dialogues": {
|
|
799
|
+
"type": "array",
|
|
800
|
+
"items": {
|
|
801
|
+
"type": "string"
|
|
802
|
+
},
|
|
803
|
+
"description": "Per-segment spoken dialogue lines for multiple orbit transitions. Use this when the user asks for dialogue in multiple segments, every turn, or before each 90-degree turn. With the default standard 360° orbit there are 4 transitions, so provide exactly 4 short lines in order. Each line should be brief enough for a 2.5 second segment (~6 words max). Preserve real names from the request or prior generated image; never invent placeholder speakers. For \"us\"/\"we\"/couple requests, make the named people speak together. Omit entries or use an empty string for segments that should have foley/ambient audio only. Do NOT also put these dialogue lines in prompt."
|
|
804
|
+
},
|
|
805
|
+
"dialogueSegment": {
|
|
806
|
+
"type": "number",
|
|
807
|
+
"description": "Which transition segment receives the dialogue (0-based index into the transition sequence). 0 = first transition (default), last index = wrap-back to front. With default angles there are 4 transitions (0-3). With custom angles the count equals angles.length + 1. Only used when dialogue is provided."
|
|
808
|
+
},
|
|
809
|
+
"angles": {
|
|
810
|
+
"type": "array",
|
|
811
|
+
"items": {
|
|
812
|
+
"type": "string",
|
|
813
|
+
"enum": [
|
|
814
|
+
"front-right quarter view",
|
|
815
|
+
"right side view",
|
|
816
|
+
"back-right quarter view",
|
|
817
|
+
"back view",
|
|
818
|
+
"back-left quarter view",
|
|
819
|
+
"left side view",
|
|
820
|
+
"front-left quarter view"
|
|
821
|
+
]
|
|
822
|
+
},
|
|
823
|
+
"description": "OMIT THIS PARAMETER for standard 360° orbits — the default (3 angles at 90° increments: right, back, left + source as front = 4 transitions) works for nearly all requests. Only provide this when the user explicitly asks for specific angles, a partial orbit, or extra-smooth rotation. Each additional angle costs extra credits and generation time. Values are clockwise azimuths between the source (front) and wrap-back."
|
|
824
|
+
},
|
|
825
|
+
"sourceImageIndex": {
|
|
826
|
+
"type": "number",
|
|
827
|
+
"description": "Which result image to orbit around (0-based). If the user picked a 1-based image number, subtract 1 and set this explicitly (number 3 -> 2). Omit only when the user did not choose a specific prior result; then the tool uses the latest result or original upload."
|
|
828
|
+
}
|
|
829
|
+
},
|
|
830
|
+
"required": []
|
|
831
|
+
}
|
|
832
|
+
}
|
|
833
|
+
},
|
|
834
|
+
{
|
|
835
|
+
"type": "function",
|
|
836
|
+
"function": {
|
|
837
|
+
"name": "dance_montage",
|
|
838
|
+
"description": "REQUIRED for ALL dance video requests — do NOT use animate_photo or generate_video for dances. Uses real choreography reference videos to transfer dance motion onto a photo via WAN 2.2 Animate Move. Output is always 9:16 480p portrait. Do NOT use this for bare TikTok/Reels/Shorts/social-video requests unless the user explicitly asks for a dance, choreography, dance trend, or named dance preset. UPLOADED PHOTO: When the user asks for a dance \"using this photo\" or \"with this photo\", call dance_montage directly on the uploaded photo; do NOT call edit_image/generate_image first just to prepare, stylize, restyle, reframe, make full-body, or reinterpret the subject. Words that identify a dance preset or vibe, such as \"Barbie\", \"Metric\", \"Black Sheep\", \"Rasputin\", or \"TikTok dance trend\", are NOT requests for image prep. Only create image prep first when the user explicitly asks for a new look, outfit, variation set, multiple characters, or loaded persona identity preservation. IMAGE PREP: When generating images for dance (via edit_image or generate_image), ALWAYS use aspectRatio=\"9:16\". CRITICAL — IMAGE COUNT: Generate exactly 1 image (numberOfVariations=1) for dance requests UNLESS the user explicitly asks for variations, different looks, or multiple characters (e.g. \"4 different outfits\", \"alternate between a cat and a dog\"). A single consistent image is used for ALL video segments to ensure visual consistency in the final stitched dance video. When the user DOES request multiple variations, batch them into ONE tool call using numberOfVariations + Dynamic Prompts — never split into multiple batches. PERSONAS: When personas are loaded, ALWAYS generate images via edit_image FIRST (using the persona reference photos for identity preservation), then call dance_montage — it will automatically use all generated images. Never use imagePrompt for persona dance requests — edit_image with persona context photos produces far better likeness. 
USING GENERATED IMAGES: When images have already been generated earlier in the conversation, simply call dance_montage WITHOUT sourceImageIndex — all previously generated images are used automatically as alternating montage segments. Do NOT tell the user to \"upload\" images that were already generated. Requires at least one uploaded photo, previously generated image, or loaded personas. Best results with photos of people.",
|
|
839
|
+
"parameters": {
|
|
840
|
+
"type": "object",
|
|
841
|
+
"properties": {
|
|
842
|
+
"dance": {
|
|
843
|
+
"type": "string",
|
|
844
|
+
"enum": [
|
|
845
|
+
"rasputin",
|
|
846
|
+
"big-guy",
|
|
847
|
+
"keep-it-gangsta",
|
|
848
|
+
"this-is-america",
|
|
849
|
+
"chinese-new-year",
|
|
850
|
+
"spongebob",
|
|
851
|
+
"chanel",
|
|
852
|
+
"crystal-light-aerobics-1988",
|
|
853
|
+
"plastic-dream-sequence"
|
|
854
|
+
],
|
|
855
|
+
"description": "Which dance choreography to use. \"rasputin\": Boney M - Rasputin (Viral Russian TikTok Dance, max 32s). \"big-guy\": Ice Spice - Big Guy (From \"The SpongeBob Movie: Search for SquarePants\" movie, max 11s). \"keep-it-gangsta\": Nhale ft. Dezzy Hollow - Keep it Gangsta (Hip-hop gangsta dance, max 21s). \"this-is-america\": Childish Gambino - This Is America (Iconic choreography from the This Is America music video, max 22s). \"chinese-new-year\": 弥渡山歌 (Midu Echoing) - Dan Thy (Chinese New Year Dance, Chinese Military Dance Trend, max 18s). \"spongebob\": SpongeBob - Stadium Rave (Jellyfish Jam Dance from SpongeBob SquarePants, max 27s). \"chanel\": Tyla - Chanel (Put me in Chanel dance, max 14s). \"crystal-light-aerobics-1988\": Crystal Light National Aerobics Championship 1988 (80s aerobics dance from the 1988 Crystal Light National Aerobics Championship, max 52s). \"plastic-dream-sequence\": Metric - Black Sheep (Barbie plastic dream sequence dance, max 28s).."
|
|
856
|
+
},
|
|
857
|
+
"duration": {
|
|
858
|
+
"type": "number",
|
|
859
|
+
"description": "Total video duration in seconds. Range: 8-30. OMIT this parameter unless the user explicitly requests a specific length — the handler defaults to the chosen dance's reference video length (capped at 30s) so the full choreography plays through. Each dance has its own max based on its reference video; the handler caps automatically.",
|
|
860
|
+
"minimum": 8,
|
|
861
|
+
"maximum": 30
|
|
862
|
+
},
|
|
863
|
+
"sourceImageIndex": {
|
|
864
|
+
"type": "number",
|
|
865
|
+
"description": "Which previously generated result image to use (0-based index). Use -1 for the original uploaded image. When omitted, all previously generated images are used automatically as alternating montage segments."
|
|
866
|
+
},
|
|
867
|
+
"imagePrompt": {
|
|
868
|
+
"type": "string",
|
|
869
|
+
"description": "Creative style/look for auto-generated images when no pre-generated images are available and no personas are loaded. For persona requests, always generate images via edit_image first — it preserves identity far better. This is a fallback only. If omitted, uses a default full-body portrait style."
|
|
870
|
+
},
|
|
871
|
+
"singleClip": {
|
|
872
|
+
"type": "boolean",
|
|
873
|
+
"description": "When true, renders the entire dance as one continuous clip (no stitching). Only works for durations ≤ 20s. Use when the user explicitly asks for a single video or one unbroken clip. Default: false (splits into segments for faster concurrent rendering)."
|
|
874
|
+
}
|
|
875
|
+
},
|
|
876
|
+
"required": [
|
|
877
|
+
"dance"
|
|
878
|
+
]
|
|
879
|
+
}
|
|
880
|
+
}
|
|
881
|
+
},
|
|
882
|
+
{
|
|
883
|
+
"type": "function",
|
|
884
|
+
"function": {
|
|
885
|
+
"name": "sound_to_video",
|
|
886
|
+
"description": "Generate video synchronized to audio. Use when the user has uploaded an audio file (mp3, wav, m4a, flac) and the audio is the primary sync target, especially uploaded-audio-only workflows. Also use after generate_music (\"turn that song into a video\", \"make a music video from that\"). Auto-detects generated audio from generate_music if no audio file is uploaded. Seedance animate_photo/generate_video can also attach uploaded audio as a loose @Audio reference when an image or video reference anchors the request; use this tool instead when the soundtrack itself should drive the video. If the user provides a reference image, use ltx23-ia2v; for lip-sync with a face image, use wan-s2v; if no image, use ltx23-a2v. If the user wants dialogue/audio WITHOUT pre-existing audio, use animate_photo instead (LTX 2.3 generates audio natively). Note: Persona voice clips from resolve_personas are NOT used by this tool — for persona voice identity in video, use animate_photo or generate_video instead. LONG AUDIO ON SEEDANCE: Seedance caps each clip at 15s. When the user uploads audio longer than 15s and Seedance is selected (seedance2 or seedance2-fast), do NOT clamp to 15s and drop the rest — split the run into multiple sound_to_video calls in the same turn (one per 15s segment, so a 20s audio becomes two clips: audioStart=0 duration=15, then audioStart=15 duration=5) and finish with a single stitch_video call referencing the resulting clip indices in order with audioIndex pointing at the same uploaded audio so the stitched output carries the full original soundtrack. LTX/WAN models accept up to 20s per clip, so single-call is fine for them.",
|
|
887
|
+
"parameters": {
|
|
888
|
+
"type": "object",
|
|
889
|
+
"properties": {
|
|
890
|
+
"prompt": {
|
|
891
|
+
"type": "string",
|
|
892
|
+
"description": "Describe the video like a cinematographer. Let the audio define timing — use the prompt for visual interpretation. One flowing paragraph, present tense, specific natural language.\n\nLITERAL PROMPT OVERRIDE: If the user explicitly says not to modify the prompt, or to use it exactly/verbatim/as-is, copy the identified prompt text verbatim instead of applying these construction rules unless a hard requirement is missing. For Seedance, set expandPrompt=false.\n\nSTRUCTURE: shot/style and scale → subject → environment, lighting, color, texture, atmosphere → visual action synced to audio → camera movement. For LTX 2.3 image+audio mode, do not re-describe static details already visible in the reference image; focus on motion, action, camera, and how the image responds to the audio.\n\nMOTION PACING: Scale complexity to duration. <=6s: 1 main visual beat + 1 simple camera move. Around 10s: 2-3 clear beats + 1 camera move. >10s: up to 4 beats in clear sequence. Let the audio define timing, but avoid stacking subject, camera, and environment motion in short clips.\n\nBLOCKING: Direct layout when it affects the shot: left/right placement, foreground/background, facing direction, and relative distance between subjects.\n\nLIP-SYNC: Shot framing, speaker's appearance and setting, physical performance synced to audio — gestures, expressions, jaw movement between phrases. 
Include acting beats.\n\nMUSIC VISUALIZATION: Visual style, environment, and how elements react to rhythm and energy.\n\nAUDIO-REACTIVE: Motion and visual changes that correspond to sounds in the track.\n\nLTX VOCABULARY: camera (tracking, dolly, pan, tilt, handheld, static frame), lighting/atmosphere (golden hour, neon glow, dramatic shadows, fog, rain, smoke, reflections), scale/pacing (expansive, epic, intimate, claustrophobic, slow motion, time-lapse, lingering shot, continuous shot), style/genre (film noir, painterly, cyberpunk, stop-motion, claymation, 2D/3D animation, hand-drawn, fantasy, thriller, experimental film).\n\nAVOID: Vague prompts, too many competing visual elements, abstract descriptions without visible behavior, rigid numeric constraints, readable text or logos. QUOTING RULE: ONLY use double quotes for spoken dialogue. Never quote on-screen text, overlay text, titles, captions, signs, or any visual text — describe them without quotes.\n\nBATCH VARIATIONS: When numberOfVariations > 1, use Dynamic Prompt syntax to vary the visual interpretation while keeping audio sync intent consistent. Example: \"{abstract neon visualization|nature scene with swaying trees|urban street with rain} synced to the beat\"."
|
|
893
|
+
},
|
|
894
|
+
"expandPrompt": {
|
|
895
|
+
"type": "boolean",
|
|
896
|
+
"description": "Seedance only. Whether to run the shared Seedance prompt shaper before dispatch. Defaults to true; set false only when the user explicitly asks to submit the compact prompt directly or not modify the prompt."
|
|
897
|
+
},
|
|
898
|
+
"audioSourceIndex": {
|
|
899
|
+
"type": "number",
|
|
900
|
+
"description": "Index of the uploaded audio file to use (0-based, from uploaded files list). If only one audio file is uploaded, use 0. If no audio was uploaded but generate_music was used earlier, omit this — the tool will automatically find the generated audio."
|
|
901
|
+
},
|
|
902
|
+
"sourceImageIndex": {
|
|
903
|
+
"type": "number",
|
|
904
|
+
"description": "Optional index of an uploaded image to use as the starting frame (0-based). Required for lip-sync models (WAN S2V). For audio-only-to-video models (LTX 2.3 A2V), this is optional — omit it to generate video purely from text + audio."
|
|
905
|
+
},
|
|
906
|
+
"audioStart": {
|
|
907
|
+
"type": "number",
|
|
908
|
+
"description": "Start offset in seconds into the audio track. Use when the user says \"start 20 seconds in\", \"skip the intro\", \"use the chorus at 1:30\", etc. Default: 0 (beginning of audio). The video will be synced to the audio starting from this point.",
|
|
909
|
+
"minimum": 0
|
|
910
|
+
},
|
|
911
|
+
"duration": {
|
|
912
|
+
"type": "number",
|
|
913
|
+
"description": "Video duration in seconds. Default: 5. Range: 2-20. For music videos, use the MAXIMUM duration (20) since the audio is always longer than the video limit. Use when the user explicitly requests a specific length.",
|
|
914
|
+
"minimum": 2,
|
|
915
|
+
"maximum": 20
|
|
916
|
+
},
|
|
917
|
+
"videoModel": {
|
|
918
|
+
"type": "string",
|
|
919
|
+
"enum": [
|
|
920
|
+
"wan-s2v",
|
|
921
|
+
"seedance2",
|
|
922
|
+
"seedance2-fast",
|
|
923
|
+
"ltx23-ia2v",
|
|
924
|
+
"ltx23-a2v"
|
|
925
|
+
],
|
|
926
|
+
"description": "Video model. \"ltx23-ia2v\" (default when image available): LTX 2.3 image+audio to video, audio-reactive with a reference image; Fast/HQ use the distilled 8-step worker and Default Media Quality Pro uses the non-distilled dev worker. \"ltx23-a2v\" (default when no image): LTX 2.3 audio-only to video, no image needed, creates video purely from text prompt + audio with the same quality-tier routing. \"wan-s2v\": WAN 2.2 sound-to-video, best for lip-sync with a face image, fast 4-step. \"seedance2\": Seedance 2.0 audio-reference video, 4-15s; this tool supplies the audio plus a required reference image because Seedance text+audio without image/video is unsupported. \"seedance2-fast\": Seedance 2.0 Fast (720p cap) — pick this whenever the user says \"Seedance fast\", \"seedance-fast\", or asks for 480p/720p; pick \"seedance2\" only when they explicitly request 1080p or the full Seedance variant. For Seedance audio-reference prompts, preserve exact spoken dialogue when the user supplied it, and assign @Image1/@Audio1 roles. If the user asks for speech without words, describe the vocal performance without inventing quoted dialogue. Treat lip-sync, voice cloning, and real-human reference behavior as provider-sensitive rather than guaranteed. Omit to auto-select based on whether an image is present."
|
|
927
|
+
},
|
|
928
|
+
"generateAudio": {
|
|
929
|
+
"type": "boolean",
|
|
930
|
+
"description": "Seedance only. Whether Seedance should include a generated/native audio track in the final video. Omit by default so the reference audio drives the result; set false only for explicit silent output."
|
|
931
|
+
},
|
|
932
|
+
"numberOfVariations": {
|
|
933
|
+
"type": "number",
|
|
934
|
+
"description": "Number of video variations to generate (1-16). Default: 1.",
|
|
935
|
+
"minimum": 1,
|
|
936
|
+
"maximum": 16
|
|
937
|
+
},
|
|
938
|
+
"targetResolution": {
|
|
939
|
+
"type": "number",
|
|
940
|
+
"description": "Short-side video resolution target in pixels. Use ONLY when the user asks for a bare named resolution such as \"480p\", \"720p\", or \"1080p\" without exact pixels or an output orientation. This preserves the source/reference aspect ratio. Do NOT set exact-pixel aspectRatio for bare named resolution requests. If the user says \"720p portrait\" or \"720p landscape\", use exact-pixel aspectRatio instead."
|
|
941
|
+
},
|
|
942
|
+
"aspectRatio": {
|
|
943
|
+
"type": "string",
|
|
944
|
+
"description": "Do NOT set unless the user explicitly requests an aspect ratio, format, orientation, or exact pixel dimensions. When a reference/source image is used and the user did not ask to change its shape, omit this field so the handler preserves the selected source image's own ratio.\n\nFormats: \"16:9\", \"9:16\", \"4:5\", \"1:1\", \"4:3\", \"3:2\", \"21:9\", or exact pixels like \"1920x1080\".\n\nCRITICAL: When the user specifies exact pixel dimensions (e.g., \"1280x720\", \"1080x1920\", \"1920x1080\", \"3840x2160\") or an orientation-qualified named resolution (e.g., \"720p landscape\", \"720p portrait\"), use the exact pixel format, NOT a ratio like \"16:9\" or \"9:16\". Exact user-requested dimensions override the selected default media quality, including Pro/HQ defaults. A bare named video resolution like \"720p resolution\" is only a resolution tier/short-side request; do not turn it into landscape pixels and do not set aspectRatio unless the user also states landscape, portrait, vertical, horizontal, or exact pixels. If requested pixels are in bounds but not on the model's pixel step, still pass the user's exact pixel request; the handler snaps to the nearest supported size internally. Only use ratio format when the user says a generic format name without pixel dimensions.\n\nMappings (use ONLY when user does NOT specify pixel dimensions): landscape/widescreen/YouTube/cinematic → \"16:9\". portrait → \"9:16\". TikTok/Reels/IG Reels → \"1080x1920\". ultrawide/cinema scope → \"21:9\". Instagram post → \"4:5\". square → \"1:1\". standard/TV → \"4:3\". 720p landscape → \"1280x720\". 720p portrait → \"720x1280\". 1080p landscape → \"1920x1080\". 1080p portrait/HD portrait → \"1080x1920\". 4K landscape → \"3840x2160\". 4K portrait → \"2160x3840\". Never set for generic requests like \"make a video\"."
|
|
945
|
+
}
|
|
946
|
+
},
|
|
947
|
+
"required": [
|
|
948
|
+
"prompt"
|
|
949
|
+
]
|
|
950
|
+
}
|
|
951
|
+
}
|
|
952
|
+
},
|
|
953
|
+
{
|
|
954
|
+
"type": "function",
|
|
955
|
+
"function": {
|
|
956
|
+
"name": "extend_video",
|
|
957
|
+
"description": "Extend a video by adding new time to the end. Works on BOTH videos previously rendered in this session AND user-uploaded videos — set videoIndex to a negative number (e.g. -1) to target an uploaded video when no prior render exists. The base video is auto-selected from the most recent video in this session unless videoIndex is set. For LTX-2.3 base clips, the tool extracts the last frame and renders an image-to-video continuation. For Seedance base clips, the tool extracts a trailing reference segment and renders a video-to-video continuation. Returns both the standalone new segment and a spliced composite (base + new segment). Use when the user asks to \"make it longer\", \"extend the video\", \"add another N seconds\", \"continue the scene\", \"add an outro/bumper to the end\", etc. Prefer this over generate_image+animate_photo+stitch_video for \"add a bumper/outro to this video\" — extend_video preserves the original base bytes, audio, and timing instead of re-encoding them. Do not use this tool to render fresh videos from scratch — call generate_video or animate_photo for that. Output durations follow each model's native limits (LTX 2-20s, Seedance 4-15s) for the new segment alone.",
|
|
958
|
+
"parameters": {
|
|
959
|
+
"type": "object",
|
|
960
|
+
"properties": {
|
|
961
|
+
"prompt": {
|
|
962
|
+
"type": "string",
|
|
963
|
+
"description": "What should happen during the extension — describe motion, action, dialogue, and audio for the appended seconds, NOT the entire video. For LTX continuations, preserve user-provided spoken dialogue in double quotes; if speech is requested without exact words, describe the delivery without inventing quoted dialogue. If the user did not specify what should happen, write a brief continuation that preserves the existing tone (e.g. \"the scene continues with the same camera and pacing\")."
|
|
964
|
+
},
|
|
965
|
+
"duration": {
|
|
966
|
+
"type": "number",
|
|
967
|
+
"description": "Length in seconds of the new appended segment (NOT total final length). LTX 2-20, Seedance 4-15. This field is required; pass 5 when the user does not specify a length.",
|
|
968
|
+
"minimum": 2,
|
|
969
|
+
"maximum": 20
|
|
970
|
+
},
|
|
971
|
+
"videoIndex": {
|
|
972
|
+
"type": "number",
|
|
973
|
+
"description": "Which video result to extend. Default: -1 (most recent video in this session, falling back to the first uploaded video when no prior render exists). Use 0-based non-negative indices for prior tool result videos. Use negative indices for uploaded videos; note that -1 resolves to the most recent video result when one exists and to the first uploaded video only when no prior render exists."
|
|
974
|
+
},
|
|
975
|
+
"videoModel": {
|
|
976
|
+
"type": "string",
|
|
977
|
+
"enum": [
|
|
978
|
+
"auto",
|
|
979
|
+
"ltx23",
|
|
980
|
+
"seedance2",
|
|
981
|
+
"seedance2-fast"
|
|
982
|
+
],
|
|
983
|
+
"description": "Which model to use for the new segment. Default: \"auto\" — detect from the base video's producer (Seedance base → Seedance, otherwise LTX-2.3). Override only when the user explicitly requests a different model."
|
|
984
|
+
},
|
|
985
|
+
"keepOriginalAudio": {
|
|
986
|
+
"type": "boolean",
|
|
987
|
+
"description": "Has no effect for extend_video (the new segment is appended after the base, so the base audio is always preserved through the original portion and the new segment carries its own audio). Reserved for parity with replace_video_segment."
|
|
988
|
+
}
|
|
989
|
+
},
|
|
990
|
+
"required": [
|
|
991
|
+
"duration"
|
|
992
|
+
]
|
|
993
|
+
}
|
|
994
|
+
}
|
|
995
|
+
},
|
|
996
|
+
{
|
|
997
|
+
"type": "function",
|
|
998
|
+
"function": {
|
|
999
|
+
"name": "replace_video_segment",
|
|
1000
|
+
"description": "Regenerate a slice of a video and splice the new segment into the original at the same position. Works on BOTH videos previously rendered in this session AND user-uploaded videos — set videoIndex to a negative number (e.g. -1) to target an uploaded video when no prior render exists. Use when the user asks to modify a portion of an existing video while keeping the rest intact: \"regenerate from 5s to 10s\", \"redo the last 3 seconds\", \"swap out the middle of the video\", \"replace the bumper at the end\", \"swap the end card\", \"change the outro\", \"redo the intro\", \"change the ending\", \"replace the last clip\", etc. Also use this tool for pure ffmpeg splices where the replacement already exists as another uploaded or generated video: \"splice video 2 into video 1\", \"insert the second clip at 5s\", or \"replace 5s to 15s with uploaded clip 2\". In that case pass replacementVideoIndex, do not call generate_video/animate_photo/video_to_video, and set endSeconds=startSeconds for an insertion that should not remove time from the base video. For time-sliced edits such as \"alternate 1 second from each video\", pass replacementStartSeconds and replacementEndSeconds to cut the next source slice out of the replacement video before splicing it into the base. Repeat this call for each alternating window. Use replacement windows (endSeconds=startSeconds+sliceDuration), not insertion windows, unless the user explicitly asks to lengthen the output by inserting extra slices. replacementStartSeconds and replacementEndSeconds must be concrete non-negative seconds; never use -1 as an end-of-source sentinel. Do not use stitch_video for interleaving because stitch_video only concatenates whole clips end-to-end. STRONGLY PREFER this over re-running generate_video / animate_photo on the original prompt when the user only wants part of the video changed — re-rendering wastes credits, loses the unchanged sections, and breaks the original timing. 
If the user does not specify the exact start/end seconds (e.g. \"replace the bumper at the end\"), call analyze_video first to identify the correct window, OR derive it from the storyboard timing already in the conversation (e.g. last beat's time range). Do not guess wildly — pick a sensible bumper/end-card window such as the final 1-3 seconds when the storyboard says scene_07 is 14-15s. Returns both the standalone replacement clip and the spliced composite. For LTX-2.3 and Wan 2.2 base videos the tool locks both ends with first/last-frame keyframes for seamless edges. For Seedance base videos the tool uses the original window as a reference for video-to-video transformation. If a requested window is shorter than the selected model's native render minimum, the handler renders a slightly larger handled clip, trims the result back to the requested seconds, then splices exactly that requested range. By default the regenerated segment's audio replaces the original audio in the [startSeconds, endSeconds] window, so new motion stays in sync with new sound. Pass keepOriginalAudio=true only when the user explicitly asks to keep the existing audio — phrasings like \"keep the audio\", \"leave the original audio\", \"preserve the music/score/dialogue\", \"don't change the audio\". If the user uses an ambiguous phrasing such as \"with the audio\" (which could mean either \"with the original audio kept\" or \"with new audio\"), DO NOT call this tool yet — first ask the user whether to preserve or replace the original audio in the replaced window. When replacementVideoIndex is set, the existing replacement clip's own audio is used; pass keepOriginalAudio=true only when the user explicitly wants the base video audio to stay over the replacement window.",
|
|
1001
|
+
"parameters": {
|
|
1002
|
+
"type": "object",
|
|
1003
|
+
"properties": {
|
|
1004
|
+
"startSeconds": {
|
|
1005
|
+
"type": "number",
|
|
1006
|
+
"description": "Start of the window (in seconds) inside the base video that should be regenerated. Must be ≥ 0 (apart from the -1 sentinel described below) and < endSeconds; when replacementVideoIndex is set it may equal endSeconds to insert the replacement clip without removing base-video time. For \"the last N seconds\" requests, set startSeconds = max(0, baseDuration - N). When unsure of the exact base duration, you may pass a sentinel value of -1 to mean \"from the end of the base video\"; the handler will resolve it after probing."
|
|
1007
|
+
},
|
|
1008
|
+
"endSeconds": {
|
|
1009
|
+
"type": "number",
|
|
1010
|
+
"description": "End of the window (in seconds) inside the base video. For regenerated segments it must be > startSeconds and ≤ base video duration. When replacementVideoIndex is set, endSeconds may equal startSeconds to insert the replacement clip at that timestamp without removing any base-video time. For alternating/interleaved time-slice edits, use endSeconds=startSeconds+sliceDuration so the source slice replaces that base window; do not use insertion unless the user explicitly asks to lengthen the output. Pass -1 to mean \"until the end of the base video\". Windows shorter than the selected model's native render minimum are rendered with handles and trimmed before splicing; windows longer than the model maximum must be split."
|
|
1011
|
+
},
|
|
1012
|
+
"prompt": {
|
|
1013
|
+
"type": "string",
|
|
1014
|
+
"description": "What should happen in the replaced window — motion, action, dialogue, audio. For LTX include exact spoken words in double quotes when speech is requested. For Seedance V2V describe the transformation relative to the existing visuals (the original window is provided as a reference clip)."
|
|
1015
|
+
},
|
|
1016
|
+
"videoIndex": {
|
|
1017
|
+
"type": "number",
|
|
1018
|
+
"description": "Which video to edit. Default: -1 (most recent video in this session, falling back to the first uploaded video when no prior render exists). Use 0-based non-negative indices for prior tool result videos. For uploaded videos with no prior render, leave this absent or pass -1 — the handler will pick the uploaded base automatically."
|
|
1019
|
+
},
|
|
1020
|
+
"replacementVideoIndex": {
|
|
1021
|
+
"type": "number",
|
|
1022
|
+
"description": "Optional existing video clip to splice into the base video instead of regenerating a segment. Non-negative values reference prior generated videos; negative values reference uploaded videos (-1 = first uploaded video, -2 = second, etc.). Use for requests like \"splice video 2 into video 1\", \"replace 5s to 15s with uploaded clip 2\", or alternating/interleaved edits that pull timed slices from another existing clip. When this is set, the operation is pure ffmpeg post-production and keeps the replacement clip audio unless keepOriginalAudio=true."
|
|
1023
|
+
},
|
|
1024
|
+
"replacementStartSeconds": {
|
|
1025
|
+
"type": "number",
|
|
1026
|
+
"minimum": 0,
|
|
1027
|
+
"description": "Optional start time, in seconds, inside replacementVideoIndex. Must be a concrete non-negative source time; do not use -1 sentinels for replacement source windows. Use with replacementEndSeconds when only a slice of the replacement clip should be spliced. Example: alternating 1-second clips from video 1 and video 2 should replace base window 1..2 with the first replacement slice by setting replacementVideoIndex=-2, replacementStartSeconds=0, replacementEndSeconds=1."
|
|
1028
|
+
},
|
|
1029
|
+
"replacementEndSeconds": {
|
|
1030
|
+
"type": "number",
|
|
1031
|
+
"minimum": 0,
|
|
1032
|
+
"description": "Optional end time, in seconds, inside replacementVideoIndex. Must be a concrete non-negative source time greater than replacementStartSeconds. Do not pass -1 to mean \"end of replacement video\"; use the known uploaded/generated clip duration from metadata for routine time-sliced edits, or omit both replacementStartSeconds and replacementEndSeconds to use the whole replacement clip. Do not call analyze_video just to learn duration."
|
|
1033
|
+
},
|
|
1034
|
+
"videoModel": {
|
|
1035
|
+
"type": "string",
|
|
1036
|
+
"enum": [
|
|
1037
|
+
"auto",
|
|
1038
|
+
"ltx23",
|
|
1039
|
+
"wan22",
|
|
1040
|
+
"seedance2",
|
|
1041
|
+
"seedance2-fast"
|
|
1042
|
+
],
|
|
1043
|
+
"description": "Which model to use for the new segment. Default: \"auto\" — detect from the base video's producer (Seedance base → Seedance, Wan base → Wan 2.2, otherwise LTX-2.3). Override only when the user explicitly requests a different model."
|
|
1044
|
+
},
|
|
1045
|
+
"keepOriginalAudio": {
|
|
1046
|
+
"type": "boolean",
|
|
1047
|
+
"description": "When true, the audio from the original [startSeconds, endSeconds] window is muxed onto the regenerated visuals so the user keeps the original dialogue/score. When false (default), the new clip's own audio is used (LTX renders fresh audio; Seedance V2V depends on generateAudio). Default: false. Set true only when the user explicitly asks to preserve the existing audio; if the user uses an ambiguous phrasing like \"with the audio\", ask the user to clarify rather than guessing."
|
|
1048
|
+
}
|
|
1049
|
+
},
|
|
1050
|
+
"required": [
|
|
1051
|
+
"startSeconds",
|
|
1052
|
+
"endSeconds"
|
|
1053
|
+
]
|
|
1054
|
+
}
|
|
1055
|
+
}
|
|
1056
|
+
},
|
|
1057
|
+
{
|
|
1058
|
+
"type": "function",
|
|
1059
|
+
"function": {
|
|
1060
|
+
"name": "overlay_video",
|
|
1061
|
+
"description": "Burn text and/or logo/watermark image overlays onto a previously rendered or uploaded video. Use when the user asks to add a title, caption, label, watermark, brand logo, sponsor mark, lower-third, tagline, sticker, or any persistent text/graphic over the existing video frames. Multiple overlays can be supplied in one call (e.g. a corner logo plus a top-center title). Each overlay can optionally be limited to a [startSeconds, endSeconds] time range. When the user asks for an overlay to appear for a specific window (for example \"2 seconds in the middle\"), set startSeconds/endSeconds on the overlay item in the same call. Negative startSeconds/endSeconds are relative to the end of the base video, so startSeconds=-2 with omitted endSeconds means \"the last 2 seconds\". When replacing a video time window with an uploaded still image or screenshot, use an image overlay with widthPct=100 and fit=\"cover\" for that window. This is a pure ffmpeg post-production op — it does not regenerate the video. Do not use for generative intro/outro/bumper/end-card/start-card requests; those add or regenerate video time and should use extend_video or replace_video_segment. Do not call it again just to refine default size/placement after it succeeds; finalize and wait for user feedback. Do not use for animated typography, kinetic captions, or moving stickers; this lays down static overlays only.",
|
|
1062
|
+
"parameters": {
|
|
1063
|
+
"type": "object",
|
|
1064
|
+
"properties": {
|
|
1065
|
+
"sourceVideoIndex": {
|
|
1066
|
+
"type": "number",
|
|
1067
|
+
"description": "Which video to overlay onto. Omit to use the most recent generated video, or the first uploaded video when no generated video exists. Non-negative values are 0-based indices into prior generated video results. Negative values reference uploaded videos: -1 = first uploaded video, -2 = second, etc., falling back to the most recent generated video when no uploads exist."
|
|
1068
|
+
},
|
|
1069
|
+
"overlays": {
|
|
1070
|
+
"type": "array",
|
|
1071
|
+
"minItems": 1,
|
|
1072
|
+
"description": "Ordered list of overlays to burn in. Each overlay is rendered on top of all previous overlays. Each overlay is either kind=\"text\" (with `text` and styling) or kind=\"image\" (with `sourceImageIndex`).",
|
|
1073
|
+
"items": {
|
|
1074
|
+
"type": "object",
|
|
1075
|
+
"properties": {
|
|
1076
|
+
"kind": {
|
|
1077
|
+
"type": "string",
|
|
1078
|
+
"enum": [
|
|
1079
|
+
"text",
|
|
1080
|
+
"image"
|
|
1081
|
+
],
|
|
1082
|
+
"description": "Overlay kind. \"text\" renders drawtext; \"image\" composites an existing image asset."
|
|
1083
|
+
},
|
|
1084
|
+
"position": {
|
|
1085
|
+
"type": "string",
|
|
1086
|
+
"enum": [
|
|
1087
|
+
"top-left",
|
|
1088
|
+
"top-center",
|
|
1089
|
+
"top-right",
|
|
1090
|
+
"center",
|
|
1091
|
+
"bottom-left",
|
|
1092
|
+
"bottom-center",
|
|
1093
|
+
"bottom-right"
|
|
1094
|
+
],
|
|
1095
|
+
"description": "Anchor position on the frame. Pixel offsets nudge inward; the renderer pads each anchor by a small safe margin so overlays do not touch the frame edge."
|
|
1096
|
+
},
|
|
1097
|
+
"offsetX": {
|
|
1098
|
+
"type": "number",
|
|
1099
|
+
"description": "Optional horizontal offset in pixels. Positive = inward from the anchor edge."
|
|
1100
|
+
},
|
|
1101
|
+
"offsetY": {
|
|
1102
|
+
"type": "number",
|
|
1103
|
+
"description": "Optional vertical offset in pixels. Positive = inward from the anchor edge."
|
|
1104
|
+
},
|
|
1105
|
+
"startSeconds": {
|
|
1106
|
+
"type": "number",
|
|
1107
|
+
"description": "Show the overlay from this time. Default 0 (show from the start). Negative values are relative to the end of the base video; startSeconds=-2 means start 2 seconds before the end."
|
|
1108
|
+
},
|
|
1109
|
+
"endSeconds": {
|
|
1110
|
+
"type": "number",
|
|
1111
|
+
"description": "Hide the overlay at this time. Default = full video duration. Negative values are relative to the end of the base video."
|
|
1112
|
+
},
|
|
1113
|
+
"text": {
|
|
1114
|
+
"type": "string",
|
|
1115
|
+
"description": "Overlay text. Required when kind=\"text\". Use plain text; line breaks are honored."
|
|
1116
|
+
},
|
|
1117
|
+
"fontSizePct": {
|
|
1118
|
+
"type": "number",
|
|
1119
|
+
"minimum": 1,
|
|
1120
|
+
"maximum": 30,
|
|
1121
|
+
"description": "Font size as a percentage of the video height. Default: 6 (≈ 43px on a 720p frame). Only valid when kind=\"text\"."
|
|
1122
|
+
},
|
|
1123
|
+
"color": {
|
|
1124
|
+
"type": "string",
|
|
1125
|
+
"description": "Text fill color (CSS hex like \"#FFFFFF\" or named ffmpeg color). Default \"#FFFFFF\". Only valid when kind=\"text\"."
|
|
1126
|
+
},
|
|
1127
|
+
"outlineColor": {
|
|
1128
|
+
"type": "string",
|
|
1129
|
+
"description": "Text outline color. Default \"#000000\" with a thin stroke for legibility. Only valid when kind=\"text\"."
|
|
1130
|
+
},
|
|
1131
|
+
"backgroundColor": {
|
|
1132
|
+
"type": [
|
|
1133
|
+
"string",
|
|
1134
|
+
"null"
|
|
1135
|
+
],
|
|
1136
|
+
"description": "Optional rgba background pill behind the text (e.g. \"rgba(0,0,0,0.5)\"). null = no box. Only valid when kind=\"text\"."
|
|
1137
|
+
},
|
|
1138
|
+
"fontWeight": {
|
|
1139
|
+
"type": "string",
|
|
1140
|
+
"enum": [
|
|
1141
|
+
"normal",
|
|
1142
|
+
"bold"
|
|
1143
|
+
],
|
|
1144
|
+
"description": "Font weight of the overlay text. Default \"normal\". Only valid when kind=\"text\"."
|
|
1145
|
+
},
|
|
1146
|
+
"sourceImageIndex": {
|
|
1147
|
+
"type": "number",
|
|
1148
|
+
"description": "Which image to overlay. Required when kind=\"image\". Non-negative values are 0-based indices into prior generated image results. Negative values reference uploaded images in image-only order: -1 = first uploaded image, -2 = second, etc. If the user uploaded one video and one logo image, the logo is sourceImageIndex=-1."
|
|
1149
|
+
},
|
|
1150
|
+
"widthPct": {
|
|
1151
|
+
"type": "number",
|
|
1152
|
+
"minimum": 1,
|
|
1153
|
+
"maximum": 100,
|
|
1154
|
+
"description": "Image overlay width (e.g. a logo) as a percentage of the video width. Default: 15. Only valid when kind=\"image\"."
|
|
1155
|
+
},
|
|
1156
|
+
"opacity": {
|
|
1157
|
+
"type": "number",
|
|
1158
|
+
"minimum": 0,
|
|
1159
|
+
"maximum": 1,
|
|
1160
|
+
"description": "Image overlay opacity, 0..1. Default 1.0 (fully opaque). Only valid when kind=\"image\"."
|
|
1161
|
+
},
|
|
1162
|
+
"fit": {
|
|
1163
|
+
"type": "string",
|
|
1164
|
+
"enum": [
|
|
1165
|
+
"contain",
|
|
1166
|
+
"cover"
|
|
1167
|
+
],
|
|
1168
|
+
"description": "Image sizing mode. Default \"contain\" scales by widthPct and preserves the full overlay image. \"cover\" scales/crops the image to cover the full video frame; use with widthPct=100 for screenshot/still-frame replacement windows."
|
|
1169
|
+
}
|
|
1170
|
+
},
|
|
1171
|
+
"required": [
|
|
1172
|
+
"kind",
|
|
1173
|
+
"position"
|
|
1174
|
+
]
|
|
1175
|
+
}
|
|
1176
|
+
}
|
|
1177
|
+
},
|
|
1178
|
+
"required": [
|
|
1179
|
+
"overlays"
|
|
1180
|
+
]
|
|
1181
|
+
}
|
|
1182
|
+
}
|
|
1183
|
+
},
|
|
1184
|
+
{
|
|
1185
|
+
"type": "function",
|
|
1186
|
+
"function": {
|
|
1187
|
+
"name": "add_subtitles",
|
|
1188
|
+
"description": "Burn subtitles into a video from caller-supplied cues or an SRT/VTT string. Use when the user asks to add captions, subtitles, on-screen dialogue, or burned-in lyrics to a video. Either pass `cues` as an array of {startSeconds, endSeconds, text}, or pass a full `srt` string. Pace cues like real subtitles: split the script into multiple short cues (typically 1.5–4 seconds each, ~1–8 words per cue, roughly 15–20 characters per second of cue duration). Never burn a single cue that spans the entire clip — even a static image should get progressively revealed lines, not one paragraph held on screen the whole time. Auto-transcription (auto_transcribe=true) is not yet enabled and will return USER_INPUT_INCOMPLETE — when the user has not supplied lines, ask them for the cue text and timing instead of calling with auto_transcribe. If the user explicitly asks you to write, invent, improvise, or make up captions/subtitles, create a few short, generic cue lines yourself and call this tool with cues; do not ask a follow-up for exact wording in that case.",
|
|
1189
|
+
"parameters": {
|
|
1190
|
+
"type": "object",
|
|
1191
|
+
"properties": {
|
|
1192
|
+
"sourceVideoIndex": {
|
|
1193
|
+
"type": "number",
|
|
1194
|
+
"description": "Which video to subtitle. Default: -1 (most recent generated or uploaded video). Non-negative values are 0-based indices into prior generated video results. Negative values reference uploaded videos."
|
|
1195
|
+
},
|
|
1196
|
+
"cues": {
|
|
1197
|
+
"type": "array",
|
|
1198
|
+
"description": "Ordered subtitle cues. Each cue has startSeconds, endSeconds, and the line of text to display. Provide either `cues` or `srt`, not both. Aim for multiple short cues (1.5–4s each, ~1–8 words) rather than one long cue spanning the full clip.",
|
|
1199
|
+
"items": {
|
|
1200
|
+
"type": "object",
|
|
1201
|
+
"properties": {
|
|
1202
|
+
"startSeconds": {
|
|
1203
|
+
"type": "number",
|
|
1204
|
+
"minimum": 0
|
|
1205
|
+
},
|
|
1206
|
+
"endSeconds": {
|
|
1207
|
+
"type": "number",
|
|
1208
|
+
"minimum": 0
|
|
1209
|
+
},
|
|
1210
|
+
"text": {
|
|
1211
|
+
"type": "string"
|
|
1212
|
+
}
|
|
1213
|
+
},
|
|
1214
|
+
"required": [
|
|
1215
|
+
"startSeconds",
|
|
1216
|
+
"endSeconds",
|
|
1217
|
+
"text"
|
|
1218
|
+
]
|
|
1219
|
+
}
|
|
1220
|
+
},
|
|
1221
|
+
"srt": {
|
|
1222
|
+
"type": "string",
|
|
1223
|
+
"description": "Full SRT (or VTT) document as a string, used in place of `cues`. Useful when the user pastes a subtitle file directly. Provide either `cues` or `srt`, not both."
|
|
1224
|
+
},
|
|
1225
|
+
"auto_transcribe": {
|
|
1226
|
+
"type": "boolean",
|
|
1227
|
+
"description": "Reserved for future speech-to-text support. Currently returns USER_INPUT_INCOMPLETE so the LLM can ask the user to supply cues. Do not set this — gather cue text from the user instead."
|
|
1228
|
+
},
|
|
1229
|
+
"style": {
|
|
1230
|
+
"type": "object",
|
|
1231
|
+
"description": "Optional styling overrides for the burned subtitles.",
|
|
1232
|
+
"properties": {
|
|
1233
|
+
"fontSizePct": {
|
|
1234
|
+
"type": "number",
|
|
1235
|
+
"minimum": 1,
|
|
1236
|
+
"maximum": 30,
|
|
1237
|
+
"description": "Font size as a percentage of the video height. Default 6."
|
|
1238
|
+
},
|
|
1239
|
+
"color": {
|
|
1240
|
+
"type": "string",
|
|
1241
|
+
"description": "Subtitle fill color. Default \"#FFFFFF\"."
|
|
1242
|
+
},
|
|
1243
|
+
"outlineColor": {
|
|
1244
|
+
"type": "string",
|
|
1245
|
+
"description": "Subtitle outline color. Default \"#000000\"."
|
|
1246
|
+
},
|
|
1247
|
+
"position": {
|
|
1248
|
+
"type": "string",
|
|
1249
|
+
"enum": [
|
|
1250
|
+
"bottom",
|
|
1251
|
+
"top",
|
|
1252
|
+
"center"
|
|
1253
|
+
],
|
|
1254
|
+
"description": "Vertical placement of the subtitle line. Default \"bottom\"."
|
|
1255
|
+
}
|
|
1256
|
+
}
|
|
1257
|
+
}
|
|
1258
|
+
}
|
|
1259
|
+
}
|
|
1260
|
+
}
|
|
1261
|
+
}
|
|
1262
|
+
]
|
|
1263
|
+
}
|