vibe-aigc 0.4.0.tar.gz → 0.6.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vibe_aigc-0.4.0/vibe_aigc.egg-info → vibe_aigc-0.6.0}/PKG-INFO +1 -1
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/pyproject.toml +5 -4
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/__init__.py +117 -102
- vibe_aigc-0.6.0/vibe_aigc/audio.py +405 -0
- vibe_aigc-0.6.0/vibe_aigc/composer_general.py +453 -0
- vibe_aigc-0.6.0/vibe_aigc/discovery.py +416 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/knowledge.py +368 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/llm.py +38 -5
- vibe_aigc-0.6.0/vibe_aigc/model_registry.py +790 -0
- vibe_aigc-0.6.0/vibe_aigc/mv_pipeline.py +650 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/planner.py +169 -1
- vibe_aigc-0.6.0/vibe_aigc/vibe_backend.py +381 -0
- vibe_aigc-0.6.0/vibe_aigc/vlm_feedback.py +289 -0
- vibe_aigc-0.6.0/vibe_aigc/workflow_backend.py +318 -0
- vibe_aigc-0.6.0/vibe_aigc/workflow_composer.py +661 -0
- vibe_aigc-0.6.0/vibe_aigc/workflow_executor.py +530 -0
- vibe_aigc-0.6.0/vibe_aigc/workflow_registry.py +609 -0
- vibe_aigc-0.6.0/vibe_aigc/workflow_strategies.py +778 -0
- vibe_aigc-0.6.0/vibe_aigc/workflows.py +391 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0/vibe_aigc.egg-info}/PKG-INFO +1 -1
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/SOURCES.txt +13 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/LICENSE +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/README.md +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/setup.cfg +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_adaptive_replanning.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_agents.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_assets.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_auto_checkpoint.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_automatic_checkpoints.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_checkpoint_serialization.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_error_handling.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_executor.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_feedback_system.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_integration.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_knowledge_base.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_metaplanner_resume.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_metaplanner_visualization.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_models.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_parallel_execution.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_planner.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_progress_callbacks.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_tools.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_visualization.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_workflow_resume.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/agents.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/assets.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/character.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/cli.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/comfyui.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/executor.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/models.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/persistence.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/tools.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/tools_multimodal.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/video.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/visualization.py +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/dependency_links.txt +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/entry_points.txt +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/requires.txt +0 -0
- {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/top_level.txt +0 -0

{vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/pyproject.toml

```diff
@@ -8,7 +8,7 @@ exclude = ["tests*", "docs*", "examples*", "landing*"]
 
 [project]
 name = "vibe-aigc"
-version = "0.4.0"
+version = "0.6.0"
 description = "A New Paradigm for Content Generation via Agentic Orchestration"
 authors = [{name = "Vibe AIGC Contributors"}]
 license = "MIT"
@@ -66,6 +66,7 @@ python_version = "3.12"
 warn_return_any = true
 warn_unused_configs = true
 ignore_missing_imports = true
-
-
-
+
+
+
+
```
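
One practical note on the version bump: the `__init__.py` diff below still hard-codes `__version__ = "0.2.0"`, so the distribution metadata sourced from `pyproject.toml` is the reliable way to confirm which release is installed. A minimal sketch:

```python
# Read the release number from distribution metadata (sourced from
# pyproject.toml) rather than the module attribute, which lags in this diff.
from importlib.metadata import version

import vibe_aigc

print(version("vibe-aigc"))   # expected: 0.6.0
print(vibe_aigc.__version__)  # still "0.2.0" per the __init__.py shown below
```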

{vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/__init__.py

```diff
@@ -1,103 +1,118 @@
-"""Vibe AIGC: A New Paradigm for Content Generation via Agentic Orchestration.
-
-This package implements the Vibe AIGC paradigm from the paper:
-"Vibe AIGC: A New Paradigm for Content Generation via Agentic Orchestration"
-
-Architecture (Paper Section 5):
-- MetaPlanner: Decomposes Vibes into workflows (Section 5.2)
-- KnowledgeBase: Domain-specific expert knowledge (Section 5.3)
-- ToolRegistry: Atomic tool library for content generation (Section 5.4)
-- Agents: Specialized role-based agents (Section 4 examples)
-- AssetBank: Character and style consistency management
-"""
-
-from .models import Vibe, WorkflowPlan, WorkflowNode, WorkflowNodeType
-from .planner import MetaPlanner
-from .llm import LLMClient, LLMConfig
-from .executor import WorkflowExecutor, ExecutionStatus, ExecutionResult
-
-# Paper Section 5.3: Domain-Specific Expert Knowledge Base
-from .knowledge import (
-    KnowledgeBase,
-    DomainKnowledge,
-    create_knowledge_base
-)
-
-# Paper Section 5.4: Atomic Tool Library
-from .tools import (
-    ToolRegistry,
-    BaseTool,
-    ToolResult,
-    ToolSpec,
-    ToolCategory,
-    LLMTool,
-    TemplateTool,
-    CombineTool,
-    create_default_registry
-)
-
-# Multi-Modal Tools (Image, Video, Audio, Search)
-from .tools_multimodal import (
-    ImageGenerationTool,
-    VideoGenerationTool,
-    AudioGenerationTool,
-    TTSTool,
-    SearchTool,
-    ScrapeTool,
-    register_multimodal_tools,
-    create_full_registry
-)
-
-# Paper Section 4: Specialized Agents
-from .agents import (
-    BaseAgent,
-    AgentRole,
-    AgentContext,
-    AgentResult,
-    AgentRegistry,
-    WriterAgent,
-    ResearcherAgent,
-    EditorAgent,
-    DirectorAgent,
-    DesignerAgent,
-    ScreenwriterAgent,
-    ComposerAgent,
-    create_default_agents
-)
-
-# Asset Bank for Consistency
-from .assets import (
-    AssetBank,
-    Character,
-    StyleGuide,
-    Artifact,
-    create_asset_bank
-)
-
-__version__ = "0.2.0"
-__all__ = [
-    # Core models
-    "Vibe", "WorkflowPlan", "WorkflowNode", "WorkflowNodeType",
-    # MetaPlanner (Section 5.2)
-    "MetaPlanner", "LLMClient", "LLMConfig",
-    # Executor
-    "WorkflowExecutor", "ExecutionStatus", "ExecutionResult",
-    # Knowledge Base (Section 5.3)
-    "KnowledgeBase", "DomainKnowledge", "create_knowledge_base",
-    # Tool Registry (Section 5.4)
-    "ToolRegistry", "BaseTool", "ToolResult", "ToolSpec", "ToolCategory",
-    "LLMTool", "TemplateTool", "CombineTool", "create_default_registry",
-    # Multi-Modal Tools
-    "ImageGenerationTool", "VideoGenerationTool", "AudioGenerationTool",
-    "TTSTool", "SearchTool", "ScrapeTool",
-    "register_multimodal_tools", "create_full_registry",
-    # Agents (Section 4 examples)
-    "BaseAgent", "AgentRole", "AgentContext", "AgentResult", "AgentRegistry",
-    "WriterAgent", "ResearcherAgent", "EditorAgent", "DirectorAgent",
-    "DesignerAgent", "ScreenwriterAgent", "ComposerAgent",
-    "create_default_agents",
-    # Asset Bank
-    "AssetBank", "Character", "StyleGuide", "Artifact", "create_asset_bank"
-]
-#
+"""Vibe AIGC: A New Paradigm for Content Generation via Agentic Orchestration.
+
+This package implements the Vibe AIGC paradigm from the paper:
+"Vibe AIGC: A New Paradigm for Content Generation via Agentic Orchestration"
+
+Architecture (Paper Section 5):
+- MetaPlanner: Decomposes Vibes into workflows (Section 5.2)
+- KnowledgeBase: Domain-specific expert knowledge (Section 5.3)
+- ToolRegistry: Atomic tool library for content generation (Section 5.4)
+- Agents: Specialized role-based agents (Section 4 examples)
+- AssetBank: Character and style consistency management
+"""
+
+from .models import Vibe, WorkflowPlan, WorkflowNode, WorkflowNodeType
+from .planner import MetaPlanner
+from .llm import LLMClient, LLMConfig
+from .executor import WorkflowExecutor, ExecutionStatus, ExecutionResult
+
+# Paper Section 5.3: Domain-Specific Expert Knowledge Base
+from .knowledge import (
+    KnowledgeBase,
+    DomainKnowledge,
+    create_knowledge_base
+)
+
+# Paper Section 5.4: Atomic Tool Library
+from .tools import (
+    ToolRegistry,
+    BaseTool,
+    ToolResult,
+    ToolSpec,
+    ToolCategory,
+    LLMTool,
+    TemplateTool,
+    CombineTool,
+    create_default_registry
+)
+
+# Multi-Modal Tools (Image, Video, Audio, Search)
+from .tools_multimodal import (
+    ImageGenerationTool,
+    VideoGenerationTool,
+    AudioGenerationTool,
+    TTSTool,
+    SearchTool,
+    ScrapeTool,
+    register_multimodal_tools,
+    create_full_registry
+)
+
+# Paper Section 4: Specialized Agents
+from .agents import (
+    BaseAgent,
+    AgentRole,
+    AgentContext,
+    AgentResult,
+    AgentRegistry,
+    WriterAgent,
+    ResearcherAgent,
+    EditorAgent,
+    DirectorAgent,
+    DesignerAgent,
+    ScreenwriterAgent,
+    ComposerAgent,
+    create_default_agents
+)
+
+# Asset Bank for Consistency
+from .assets import (
+    AssetBank,
+    Character,
+    StyleGuide,
+    Artifact,
+    create_asset_bank
+)
+
+__version__ = "0.2.0"
+__all__ = [
+    # Core models
+    "Vibe", "WorkflowPlan", "WorkflowNode", "WorkflowNodeType",
+    # MetaPlanner (Section 5.2)
+    "MetaPlanner", "LLMClient", "LLMConfig",
+    # Executor
+    "WorkflowExecutor", "ExecutionStatus", "ExecutionResult",
+    # Knowledge Base (Section 5.3)
+    "KnowledgeBase", "DomainKnowledge", "create_knowledge_base",
+    # Tool Registry (Section 5.4)
+    "ToolRegistry", "BaseTool", "ToolResult", "ToolSpec", "ToolCategory",
+    "LLMTool", "TemplateTool", "CombineTool", "create_default_registry",
+    # Multi-Modal Tools
+    "ImageGenerationTool", "VideoGenerationTool", "AudioGenerationTool",
+    "TTSTool", "SearchTool", "ScrapeTool",
+    "register_multimodal_tools", "create_full_registry",
+    # Agents (Section 4 examples)
+    "BaseAgent", "AgentRole", "AgentContext", "AgentResult", "AgentRegistry",
+    "WriterAgent", "ResearcherAgent", "EditorAgent", "DirectorAgent",
+    "DesignerAgent", "ScreenwriterAgent", "ComposerAgent",
+    "create_default_agents",
+    # Asset Bank
+    "AssetBank", "Character", "StyleGuide", "Artifact", "create_asset_bank"
+]
+# Model Registry - Auto-detect available models
+from .model_registry import ModelRegistry, ModelCapability, ModelFamily, ModelSpec
+
+# VLM Feedback - Visual quality assessment
+from .vlm_feedback import VLMFeedback, FeedbackResult, MediaType, create_vlm_feedback
+
+# ComfyUI backend for actual image generation
 from .comfyui import ComfyUIBackend, ComfyUIConfig, ComfyUIImageTool, create_comfyui_registry
+
+# Workflow templates
+from .workflows import WorkflowLibrary, WorkflowTemplate, create_workflow_library
+
+# Audio generation
+from .audio import MusicGenBackend, RiffusionBackend, ElevenLabsBackend, MusicGenerationTool, TTSTool
+
+# MV Pipeline
+from .mv_pipeline import MVPipeline, Shot, Storyboard, create_mv
```
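
The 0.6.0 `__init__.py` otherwise matches 0.4.0 line for line; the substantive change is the new top-level surface: model discovery, VLM feedback, workflow templates, audio backends, and the MV pipeline. A minimal sketch of importing the new names follows; the import list is taken verbatim from the diff, but the zero-argument calls are assumptions, since the diff does not show constructor or factory signatures.

```python
# New top-level names in 0.6.0, per the added import lines above.
# The zero-argument calls below are assumptions; the diff does not
# show these signatures.
from vibe_aigc import (
    ModelRegistry,            # auto-detect available models
    create_vlm_feedback,      # VLM-based visual quality assessment
    create_workflow_library,  # workflow templates
    create_mv,                # MV pipeline entry point
)

registry = ModelRegistry()           # assumed default constructor
feedback = create_vlm_feedback()     # assumed factory defaults
library = create_workflow_library()  # assumed factory defaults
```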

vibe_aigc-0.6.0/vibe_aigc/audio.py

```diff
@@ -0,0 +1,405 @@
+"""Audio generation for music videos.
+
+Supports:
+- Music generation (Riffusion, MusicGen)
+- Voice/TTS (ElevenLabs, local TTS)
+- Sound effects
+"""
+
+import asyncio
+import aiohttp
+import base64
+from typing import Any, Dict, List, Optional
+from dataclasses import dataclass
+from pathlib import Path
+
+from .tools import BaseTool, ToolResult, ToolSpec, ToolCategory
+
+
+@dataclass
+class AudioConfig:
+    """Configuration for audio generation."""
+    provider: str = "riffusion"  # riffusion, musicgen, elevenlabs
+    api_key: Optional[str] = None
+    output_dir: str = "./audio_output"
+
+
+class RiffusionBackend:
+    """Music generation using Riffusion (via Replicate)."""
+
+    def __init__(self, api_token: Optional[str] = None):
+        self.api_token = api_token
+        self.base_url = "https://api.replicate.com/v1"
+
+    async def generate_music(
+        self,
+        prompt: str,
+        duration: float = 8.0,  # seconds
+        seed: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Generate music from a text prompt.
+
+        Args:
+            prompt: Description of the music (e.g., "upbeat electronic cyberpunk")
+            duration: Length in seconds
+            seed: Random seed for reproducibility
+
+        Returns:
+            Dict with audio URL or error
+        """
+        if not self.api_token:
+            return {"error": "No Replicate API token. Set REPLICATE_API_TOKEN."}
+
+        headers = {
+            "Authorization": f"Token {self.api_token}",
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "version": "8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
+            "input": {
+                "prompt_a": prompt,
+                "denoising": 0.75,
+                "prompt_b": prompt,  # Same prompt for consistency
+                "alpha": 0.5,
+                "num_inference_steps": 50,
+                "seed_image_id": "vibes"
+            }
+        }
+
+        if seed is not None:
+            payload["input"]["seed"] = seed
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                # Start prediction
+                async with session.post(
+                    f"{self.base_url}/predictions",
+                    headers=headers,
+                    json=payload
+                ) as resp:
+                    if resp.status != 201:
+                        error = await resp.text()
+                        return {"error": f"Failed to start: {error}"}
+                    result = await resp.json()
+                    prediction_id = result.get("id")
+
+                # Poll for completion
+                for _ in range(60):  # Max 60 seconds
+                    async with session.get(
+                        f"{self.base_url}/predictions/{prediction_id}",
+                        headers=headers
+                    ) as resp:
+                        result = await resp.json()
+                        status = result.get("status")
+
+                        if status == "succeeded":
+                            output = result.get("output", {})
+                            return {
+                                "audio_url": output.get("audio"),
+                                "spectrogram_url": output.get("spectrogram"),
+                                "prompt": prompt
+                            }
+                        elif status == "failed":
+                            return {"error": result.get("error", "Generation failed")}
+
+                    await asyncio.sleep(1)
+
+                return {"error": "Timeout waiting for generation"}
+
+        except Exception as e:
+            return {"error": str(e)}
+
+
+class MusicGenBackend:
+    """Music generation using Meta's MusicGen (via Replicate)."""
+
+    def __init__(self, api_token: Optional[str] = None):
+        self.api_token = api_token
+        self.base_url = "https://api.replicate.com/v1"
+
+    async def generate_music(
+        self,
+        prompt: str,
+        duration: int = 8,
+        model_version: str = "melody",  # small, medium, melody, large
+        continuation: bool = False,
+        input_audio: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Generate music using MusicGen.
+
+        Args:
+            prompt: Text description of desired music
+            duration: Length in seconds (max 30)
+            model_version: Model size/type
+            continuation: Whether to continue from input_audio
+            input_audio: URL of audio to continue from
+
+        Returns:
+            Dict with audio URL or error
+        """
+        if not self.api_token:
+            return {"error": "No Replicate API token"}
+
+        headers = {
+            "Authorization": f"Token {self.api_token}",
+            "Content-Type": "application/json"
+        }
+
+        # MusicGen model on Replicate
+        payload = {
+            "version": "b05b1dff1d8c6dc63d14b0cdb42135378dcb87f6373b0d3d341ede46e59e2b38",
+            "input": {
+                "prompt": prompt,
+                "duration": min(duration, 30),
+                "model_version": model_version,
+                "output_format": "mp3",
+                "normalization_strategy": "peak"
+            }
+        }
+
+        if continuation and input_audio:
+            payload["input"]["continuation"] = True
+            payload["input"]["input_audio"] = input_audio
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"{self.base_url}/predictions",
+                    headers=headers,
+                    json=payload
+                ) as resp:
+                    if resp.status != 201:
+                        return {"error": await resp.text()}
+                    result = await resp.json()
+                    prediction_id = result.get("id")
+
+                # Poll for completion
+                for _ in range(120):  # MusicGen can take longer
+                    async with session.get(
+                        f"{self.base_url}/predictions/{prediction_id}",
+                        headers=headers
+                    ) as resp:
+                        result = await resp.json()
+                        status = result.get("status")
+
+                        if status == "succeeded":
+                            return {
+                                "audio_url": result.get("output"),
+                                "prompt": prompt,
+                                "duration": duration
+                            }
+                        elif status == "failed":
+                            return {"error": result.get("error")}
+
+                    await asyncio.sleep(1)
+
+                return {"error": "Timeout"}
+
+        except Exception as e:
+            return {"error": str(e)}
+
+
+class ElevenLabsBackend:
+    """Voice and speech synthesis using ElevenLabs."""
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = api_key
+        self.base_url = "https://api.elevenlabs.io/v1"
+
+    async def text_to_speech(
+        self,
+        text: str,
+        voice_id: str = "21m00Tcm4TlvDq8ikWAM",  # Rachel (default)
+        model_id: str = "eleven_monolingual_v1",
+        stability: float = 0.5,
+        similarity_boost: float = 0.75
+    ) -> Dict[str, Any]:
+        """Generate speech from text.
+
+        Args:
+            text: Text to speak
+            voice_id: ElevenLabs voice ID
+            model_id: Model to use
+            stability: Voice stability (0-1)
+            similarity_boost: Voice similarity (0-1)
+
+        Returns:
+            Dict with audio data or error
+        """
+        if not self.api_key:
+            return {"error": "No ElevenLabs API key"}
+
+        headers = {
+            "xi-api-key": self.api_key,
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "text": text,
+            "model_id": model_id,
+            "voice_settings": {
+                "stability": stability,
+                "similarity_boost": similarity_boost
+            }
+        }
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"{self.base_url}/text-to-speech/{voice_id}",
+                    headers=headers,
+                    json=payload
+                ) as resp:
+                    if resp.status != 200:
+                        return {"error": await resp.text()}
+
+                    audio_data = await resp.read()
+                    return {
+                        "audio_data": base64.b64encode(audio_data).decode(),
+                        "format": "mp3",
+                        "text": text
+                    }
+
+        except Exception as e:
+            return {"error": str(e)}
+
+    async def list_voices(self) -> List[Dict[str, str]]:
+        """List available voices."""
+        if not self.api_key:
+            return []
+
+        headers = {"xi-api-key": self.api_key}
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    f"{self.base_url}/voices",
+                    headers=headers
+                ) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        return [
+                            {"id": v["voice_id"], "name": v["name"]}
+                            for v in data.get("voices", [])
+                        ]
+        except:
+            pass
+        return []
+
+
+class MusicGenerationTool(BaseTool):
+    """Tool for generating music."""
+
+    def __init__(self, api_token: Optional[str] = None, backend: str = "musicgen"):
+        self.api_token = api_token
+        self.backend_name = backend
+
+        if backend == "riffusion":
+            self.backend = RiffusionBackend(api_token)
+        else:
+            self.backend = MusicGenBackend(api_token)
+
+        self._spec = ToolSpec(
+            name="music_generation",
+            description="Generate music from text description",
+            category=ToolCategory.AUDIO,
+            input_schema={
+                "type": "object",
+                "required": ["prompt"],
+                "properties": {
+                    "prompt": {"type": "string", "description": "Music description"},
+                    "duration": {"type": "integer", "default": 8},
+                    "seed": {"type": "integer"}
+                }
+            },
+            output_schema={
+                "type": "object",
+                "properties": {
+                    "audio_url": {"type": "string"},
+                    "prompt": {"type": "string"}
+                }
+            }
+        )
+
+    @property
+    def spec(self) -> ToolSpec:
+        return self._spec
+
+    async def execute(
+        self,
+        inputs: Dict[str, Any],
+        context: Optional[Dict[str, Any]] = None
+    ) -> ToolResult:
+        prompt = inputs.get("prompt", "")
+        if not prompt:
+            return ToolResult(success=False, output=None, error="No prompt")
+
+        result = await self.backend.generate_music(
+            prompt=prompt,
+            duration=inputs.get("duration", 8),
+            seed=inputs.get("seed")
+        )
+
+        if "error" in result:
+            return ToolResult(success=False, output=None, error=result["error"])
+
+        return ToolResult(
+            success=True,
+            output=result,
+            metadata={"backend": self.backend_name}
+        )
+
+
+class TTSTool(BaseTool):
+    """Tool for text-to-speech."""
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.backend = ElevenLabsBackend(api_key)
+        self._spec = ToolSpec(
+            name="text_to_speech",
+            description="Convert text to speech audio",
+            category=ToolCategory.AUDIO,
+            input_schema={
+                "type": "object",
+                "required": ["text"],
+                "properties": {
+                    "text": {"type": "string"},
+                    "voice_id": {"type": "string"},
+                    "stability": {"type": "number", "default": 0.5},
+                    "similarity_boost": {"type": "number", "default": 0.75}
+                }
+            },
+            output_schema={
+                "type": "object",
+                "properties": {
+                    "audio_data": {"type": "string", "description": "Base64 encoded audio"},
+                    "format": {"type": "string"}
+                }
+            }
+        )
+
+    @property
+    def spec(self) -> ToolSpec:
+        return self._spec
+
+    async def execute(
+        self,
+        inputs: Dict[str, Any],
+        context: Optional[Dict[str, Any]] = None
+    ) -> ToolResult:
+        text = inputs.get("text", "")
+        if not text:
+            return ToolResult(success=False, output=None, error="No text")
+
+        result = await self.backend.text_to_speech(
+            text=text,
+            voice_id=inputs.get("voice_id", "21m00Tcm4TlvDq8ikWAM"),
+            stability=inputs.get("stability", 0.5),
+            similarity_boost=inputs.get("similarity_boost", 0.75)
+        )
+
+        if "error" in result:
+            return ToolResult(success=False, output=None, error=result["error"])
+
+        return ToolResult(success=True, output=result)
```