vibe-aigc 0.4.0.tar.gz → 0.6.0.tar.gz

This diff shows the changes between two publicly released package versions, as they appear in their public registry, and is provided for informational purposes only.
Files changed (60)
  1. {vibe_aigc-0.4.0/vibe_aigc.egg-info → vibe_aigc-0.6.0}/PKG-INFO +1 -1
  2. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/pyproject.toml +5 -4
  3. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/__init__.py +117 -102
  4. vibe_aigc-0.6.0/vibe_aigc/audio.py +405 -0
  5. vibe_aigc-0.6.0/vibe_aigc/composer_general.py +453 -0
  6. vibe_aigc-0.6.0/vibe_aigc/discovery.py +416 -0
  7. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/knowledge.py +368 -0
  8. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/llm.py +38 -5
  9. vibe_aigc-0.6.0/vibe_aigc/model_registry.py +790 -0
  10. vibe_aigc-0.6.0/vibe_aigc/mv_pipeline.py +650 -0
  11. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/planner.py +169 -1
  12. vibe_aigc-0.6.0/vibe_aigc/vibe_backend.py +381 -0
  13. vibe_aigc-0.6.0/vibe_aigc/vlm_feedback.py +289 -0
  14. vibe_aigc-0.6.0/vibe_aigc/workflow_backend.py +318 -0
  15. vibe_aigc-0.6.0/vibe_aigc/workflow_composer.py +661 -0
  16. vibe_aigc-0.6.0/vibe_aigc/workflow_executor.py +530 -0
  17. vibe_aigc-0.6.0/vibe_aigc/workflow_registry.py +609 -0
  18. vibe_aigc-0.6.0/vibe_aigc/workflow_strategies.py +778 -0
  19. vibe_aigc-0.6.0/vibe_aigc/workflows.py +391 -0
  20. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0/vibe_aigc.egg-info}/PKG-INFO +1 -1
  21. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/SOURCES.txt +13 -0
  22. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/LICENSE +0 -0
  23. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/README.md +0 -0
  24. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/setup.cfg +0 -0
  25. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_adaptive_replanning.py +0 -0
  26. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_agents.py +0 -0
  27. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_assets.py +0 -0
  28. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_auto_checkpoint.py +0 -0
  29. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_automatic_checkpoints.py +0 -0
  30. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_checkpoint_serialization.py +0 -0
  31. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_error_handling.py +0 -0
  32. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_executor.py +0 -0
  33. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_feedback_system.py +0 -0
  34. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_integration.py +0 -0
  35. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_knowledge_base.py +0 -0
  36. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_metaplanner_resume.py +0 -0
  37. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_metaplanner_visualization.py +0 -0
  38. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_models.py +0 -0
  39. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_parallel_execution.py +0 -0
  40. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_planner.py +0 -0
  41. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_progress_callbacks.py +0 -0
  42. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_tools.py +0 -0
  43. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_visualization.py +0 -0
  44. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/tests/test_workflow_resume.py +0 -0
  45. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/agents.py +0 -0
  46. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/assets.py +0 -0
  47. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/character.py +0 -0
  48. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/cli.py +0 -0
  49. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/comfyui.py +0 -0
  50. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/executor.py +0 -0
  51. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/models.py +0 -0
  52. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/persistence.py +0 -0
  53. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/tools.py +0 -0
  54. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/tools_multimodal.py +0 -0
  55. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/video.py +0 -0
  56. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc/visualization.py +0 -0
  57. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/dependency_links.txt +0 -0
  58. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/entry_points.txt +0 -0
  59. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/requires.txt +0 -0
  60. {vibe_aigc-0.4.0 → vibe_aigc-0.6.0}/vibe_aigc.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vibe-aigc
-Version: 0.4.0
+Version: 0.6.0
 Summary: A New Paradigm for Content Generation via Agentic Orchestration
 Author: Vibe AIGC Contributors
 License-Expression: MIT
@@ -8,7 +8,7 @@ exclude = ["tests*", "docs*", "examples*", "landing*"]
 
 [project]
 name = "vibe-aigc"
-version = "0.4.0"
+version = "0.6.0"
 description = "A New Paradigm for Content Generation via Agentic Orchestration"
 authors = [{name = "Vibe AIGC Contributors"}]
 license = "MIT"
@@ -66,6 +66,7 @@ python_version = "3.12"
 warn_return_any = true
 warn_unused_configs = true
 ignore_missing_imports = true
-
-
-
+
+
+
+
@@ -1,103 +1,118 @@
-"""Vibe AIGC: A New Paradigm for Content Generation via Agentic Orchestration.
-
-This package implements the Vibe AIGC paradigm from the paper:
-"Vibe AIGC: A New Paradigm for Content Generation via Agentic Orchestration"
-
-Architecture (Paper Section 5):
-- MetaPlanner: Decomposes Vibes into workflows (Section 5.2)
-- KnowledgeBase: Domain-specific expert knowledge (Section 5.3)
-- ToolRegistry: Atomic tool library for content generation (Section 5.4)
-- Agents: Specialized role-based agents (Section 4 examples)
-- AssetBank: Character and style consistency management
-"""
-
-from .models import Vibe, WorkflowPlan, WorkflowNode, WorkflowNodeType
-from .planner import MetaPlanner
-from .llm import LLMClient, LLMConfig
-from .executor import WorkflowExecutor, ExecutionStatus, ExecutionResult
-
-# Paper Section 5.3: Domain-Specific Expert Knowledge Base
-from .knowledge import (
-    KnowledgeBase,
-    DomainKnowledge,
-    create_knowledge_base
-)
-
-# Paper Section 5.4: Atomic Tool Library
-from .tools import (
-    ToolRegistry,
-    BaseTool,
-    ToolResult,
-    ToolSpec,
-    ToolCategory,
-    LLMTool,
-    TemplateTool,
-    CombineTool,
-    create_default_registry
-)
-
-# Multi-Modal Tools (Image, Video, Audio, Search)
-from .tools_multimodal import (
-    ImageGenerationTool,
-    VideoGenerationTool,
-    AudioGenerationTool,
-    TTSTool,
-    SearchTool,
-    ScrapeTool,
-    register_multimodal_tools,
-    create_full_registry
-)
-
-# Paper Section 4: Specialized Agents
-from .agents import (
-    BaseAgent,
-    AgentRole,
-    AgentContext,
-    AgentResult,
-    AgentRegistry,
-    WriterAgent,
-    ResearcherAgent,
-    EditorAgent,
-    DirectorAgent,
-    DesignerAgent,
-    ScreenwriterAgent,
-    ComposerAgent,
-    create_default_agents
-)
-
-# Asset Bank for Consistency
-from .assets import (
-    AssetBank,
-    Character,
-    StyleGuide,
-    Artifact,
-    create_asset_bank
-)
-
-__version__ = "0.2.0"
-__all__ = [
-    # Core models
-    "Vibe", "WorkflowPlan", "WorkflowNode", "WorkflowNodeType",
-    # MetaPlanner (Section 5.2)
-    "MetaPlanner", "LLMClient", "LLMConfig",
-    # Executor
-    "WorkflowExecutor", "ExecutionStatus", "ExecutionResult",
-    # Knowledge Base (Section 5.3)
-    "KnowledgeBase", "DomainKnowledge", "create_knowledge_base",
-    # Tool Registry (Section 5.4)
-    "ToolRegistry", "BaseTool", "ToolResult", "ToolSpec", "ToolCategory",
-    "LLMTool", "TemplateTool", "CombineTool", "create_default_registry",
-    # Multi-Modal Tools
-    "ImageGenerationTool", "VideoGenerationTool", "AudioGenerationTool",
-    "TTSTool", "SearchTool", "ScrapeTool",
-    "register_multimodal_tools", "create_full_registry",
-    # Agents (Section 4 examples)
-    "BaseAgent", "AgentRole", "AgentContext", "AgentResult", "AgentRegistry",
-    "WriterAgent", "ResearcherAgent", "EditorAgent", "DirectorAgent",
-    "DesignerAgent", "ScreenwriterAgent", "ComposerAgent",
-    "create_default_agents",
-    # Asset Bank
-    "AssetBank", "Character", "StyleGuide", "Artifact", "create_asset_bank"
-]
-# ComfyUI backend for actual image generation
+"""Vibe AIGC: A New Paradigm for Content Generation via Agentic Orchestration.
+
+This package implements the Vibe AIGC paradigm from the paper:
+"Vibe AIGC: A New Paradigm for Content Generation via Agentic Orchestration"
+
+Architecture (Paper Section 5):
+- MetaPlanner: Decomposes Vibes into workflows (Section 5.2)
+- KnowledgeBase: Domain-specific expert knowledge (Section 5.3)
+- ToolRegistry: Atomic tool library for content generation (Section 5.4)
+- Agents: Specialized role-based agents (Section 4 examples)
+- AssetBank: Character and style consistency management
+"""
+
+from .models import Vibe, WorkflowPlan, WorkflowNode, WorkflowNodeType
+from .planner import MetaPlanner
+from .llm import LLMClient, LLMConfig
+from .executor import WorkflowExecutor, ExecutionStatus, ExecutionResult
+
+# Paper Section 5.3: Domain-Specific Expert Knowledge Base
+from .knowledge import (
+    KnowledgeBase,
+    DomainKnowledge,
+    create_knowledge_base
+)
+
+# Paper Section 5.4: Atomic Tool Library
+from .tools import (
+    ToolRegistry,
+    BaseTool,
+    ToolResult,
+    ToolSpec,
+    ToolCategory,
+    LLMTool,
+    TemplateTool,
+    CombineTool,
+    create_default_registry
+)
+
+# Multi-Modal Tools (Image, Video, Audio, Search)
+from .tools_multimodal import (
+    ImageGenerationTool,
+    VideoGenerationTool,
+    AudioGenerationTool,
+    TTSTool,
+    SearchTool,
+    ScrapeTool,
+    register_multimodal_tools,
+    create_full_registry
+)
+
+# Paper Section 4: Specialized Agents
+from .agents import (
+    BaseAgent,
+    AgentRole,
+    AgentContext,
+    AgentResult,
+    AgentRegistry,
+    WriterAgent,
+    ResearcherAgent,
+    EditorAgent,
+    DirectorAgent,
+    DesignerAgent,
+    ScreenwriterAgent,
+    ComposerAgent,
+    create_default_agents
+)
+
+# Asset Bank for Consistency
+from .assets import (
+    AssetBank,
+    Character,
+    StyleGuide,
+    Artifact,
+    create_asset_bank
+)
+
+__version__ = "0.2.0"
+__all__ = [
+    # Core models
+    "Vibe", "WorkflowPlan", "WorkflowNode", "WorkflowNodeType",
+    # MetaPlanner (Section 5.2)
+    "MetaPlanner", "LLMClient", "LLMConfig",
+    # Executor
+    "WorkflowExecutor", "ExecutionStatus", "ExecutionResult",
+    # Knowledge Base (Section 5.3)
+    "KnowledgeBase", "DomainKnowledge", "create_knowledge_base",
+    # Tool Registry (Section 5.4)
+    "ToolRegistry", "BaseTool", "ToolResult", "ToolSpec", "ToolCategory",
+    "LLMTool", "TemplateTool", "CombineTool", "create_default_registry",
+    # Multi-Modal Tools
+    "ImageGenerationTool", "VideoGenerationTool", "AudioGenerationTool",
+    "TTSTool", "SearchTool", "ScrapeTool",
+    "register_multimodal_tools", "create_full_registry",
+    # Agents (Section 4 examples)
+    "BaseAgent", "AgentRole", "AgentContext", "AgentResult", "AgentRegistry",
+    "WriterAgent", "ResearcherAgent", "EditorAgent", "DirectorAgent",
+    "DesignerAgent", "ScreenwriterAgent", "ComposerAgent",
+    "create_default_agents",
+    # Asset Bank
+    "AssetBank", "Character", "StyleGuide", "Artifact", "create_asset_bank"
+]
+# Model Registry - Auto-detect available models
+from .model_registry import ModelRegistry, ModelCapability, ModelFamily, ModelSpec
+
+# VLM Feedback - Visual quality assessment
+from .vlm_feedback import VLMFeedback, FeedbackResult, MediaType, create_vlm_feedback
+
+# ComfyUI backend for actual image generation
 from .comfyui import ComfyUIBackend, ComfyUIConfig, ComfyUIImageTool, create_comfyui_registry
+
+# Workflow templates
+from .workflows import WorkflowLibrary, WorkflowTemplate, create_workflow_library
+
+# Audio generation
+from .audio import MusicGenBackend, RiffusionBackend, ElevenLabsBackend, MusicGenerationTool, TTSTool
+
+# MV Pipeline
+from .mv_pipeline import MVPipeline, Shot, Storyboard, create_mv
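
The 0.6.0 `__init__.py` now re-exports the new subsystems (model registry, VLM feedback, workflow templates, audio backends, MV pipeline). A minimal import sketch based only on the export names confirmed above; the zero-argument constructor call is an assumption, since no signatures appear in this diff:

# Names below are confirmed by the 0.6.0 __init__.py above; the zero-arg
# constructor is an assumption for illustration.
from vibe_aigc import (
    ModelRegistry,      # auto-detects available models (model_registry.py)
    VLMFeedback,        # visual quality assessment (vlm_feedback.py)
    WorkflowLibrary,    # workflow templates (workflows.py)
    MusicGenBackend,    # music generation (audio.py)
    MVPipeline,         # music-video pipeline (mv_pipeline.py)
)

registry = ModelRegistry()  # assumed zero-arg constructor

Note that `__all__` is unchanged, so these names are importable directly but are not included in `from vibe_aigc import *`.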
@@ -0,0 +1,405 @@
+"""Audio generation for music videos.
+
+Supports:
+- Music generation (Riffusion, MusicGen)
+- Voice/TTS (ElevenLabs, local TTS)
+- Sound effects
+"""
+
+import asyncio
+import aiohttp
+import base64
+from typing import Any, Dict, List, Optional
+from dataclasses import dataclass
+from pathlib import Path
+
+from .tools import BaseTool, ToolResult, ToolSpec, ToolCategory
+
+
+@dataclass
+class AudioConfig:
+    """Configuration for audio generation."""
+    provider: str = "riffusion"  # riffusion, musicgen, elevenlabs
+    api_key: Optional[str] = None
+    output_dir: str = "./audio_output"
+
+
+class RiffusionBackend:
+    """Music generation using Riffusion (via Replicate)."""
+
+    def __init__(self, api_token: Optional[str] = None):
+        self.api_token = api_token
+        self.base_url = "https://api.replicate.com/v1"
+
+    async def generate_music(
+        self,
+        prompt: str,
+        duration: float = 8.0,  # seconds
+        seed: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Generate music from a text prompt.
+
+        Args:
+            prompt: Description of the music (e.g., "upbeat electronic cyberpunk")
+            duration: Length in seconds
+            seed: Random seed for reproducibility
+
+        Returns:
+            Dict with audio URL or error
+        """
+        if not self.api_token:
+            return {"error": "No Replicate API token. Set REPLICATE_API_TOKEN."}
+
+        headers = {
+            "Authorization": f"Token {self.api_token}",
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "version": "8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
+            "input": {
+                "prompt_a": prompt,
+                "denoising": 0.75,
+                "prompt_b": prompt,  # Same prompt for consistency
+                "alpha": 0.5,
+                "num_inference_steps": 50,
+                "seed_image_id": "vibes"
+            }
+        }
+
+        if seed is not None:
+            payload["input"]["seed"] = seed
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                # Start prediction
+                async with session.post(
+                    f"{self.base_url}/predictions",
+                    headers=headers,
+                    json=payload
+                ) as resp:
+                    if resp.status != 201:
+                        error = await resp.text()
+                        return {"error": f"Failed to start: {error}"}
+                    result = await resp.json()
+                    prediction_id = result.get("id")
+
+                # Poll for completion
+                for _ in range(60):  # Max 60 seconds
+                    async with session.get(
+                        f"{self.base_url}/predictions/{prediction_id}",
+                        headers=headers
+                    ) as resp:
+                        result = await resp.json()
+                        status = result.get("status")
+
+                        if status == "succeeded":
+                            output = result.get("output", {})
+                            return {
+                                "audio_url": output.get("audio"),
+                                "spectrogram_url": output.get("spectrogram"),
+                                "prompt": prompt
+                            }
+                        elif status == "failed":
+                            return {"error": result.get("error", "Generation failed")}
+
+                    await asyncio.sleep(1)
+
+                return {"error": "Timeout waiting for generation"}
+
+        except Exception as e:
+            return {"error": str(e)}
+
+
+class MusicGenBackend:
+    """Music generation using Meta's MusicGen (via Replicate)."""
+
+    def __init__(self, api_token: Optional[str] = None):
+        self.api_token = api_token
+        self.base_url = "https://api.replicate.com/v1"
+
+    async def generate_music(
+        self,
+        prompt: str,
+        duration: int = 8,
+        model_version: str = "melody",  # small, medium, melody, large
+        continuation: bool = False,
+        input_audio: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Generate music using MusicGen.
+
+        Args:
+            prompt: Text description of desired music
+            duration: Length in seconds (max 30)
+            model_version: Model size/type
+            continuation: Whether to continue from input_audio
+            input_audio: URL of audio to continue from
+
+        Returns:
+            Dict with audio URL or error
+        """
+        if not self.api_token:
+            return {"error": "No Replicate API token"}
+
+        headers = {
+            "Authorization": f"Token {self.api_token}",
+            "Content-Type": "application/json"
+        }
+
+        # MusicGen model on Replicate
+        payload = {
+            "version": "b05b1dff1d8c6dc63d14b0cdb42135378dcb87f6373b0d3d341ede46e59e2b38",
+            "input": {
+                "prompt": prompt,
+                "duration": min(duration, 30),
+                "model_version": model_version,
+                "output_format": "mp3",
+                "normalization_strategy": "peak"
+            }
+        }
+
+        if continuation and input_audio:
+            payload["input"]["continuation"] = True
+            payload["input"]["input_audio"] = input_audio
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"{self.base_url}/predictions",
+                    headers=headers,
+                    json=payload
+                ) as resp:
+                    if resp.status != 201:
+                        return {"error": await resp.text()}
+                    result = await resp.json()
+                    prediction_id = result.get("id")
+
+                # Poll for completion
+                for _ in range(120):  # MusicGen can take longer
+                    async with session.get(
+                        f"{self.base_url}/predictions/{prediction_id}",
+                        headers=headers
+                    ) as resp:
+                        result = await resp.json()
+                        status = result.get("status")
+
+                        if status == "succeeded":
+                            return {
+                                "audio_url": result.get("output"),
+                                "prompt": prompt,
+                                "duration": duration
+                            }
+                        elif status == "failed":
+                            return {"error": result.get("error")}
+
+                    await asyncio.sleep(1)
+
+                return {"error": "Timeout"}
+
+        except Exception as e:
+            return {"error": str(e)}
+
+
+class ElevenLabsBackend:
+    """Voice and speech synthesis using ElevenLabs."""
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = api_key
+        self.base_url = "https://api.elevenlabs.io/v1"
+
+    async def text_to_speech(
+        self,
+        text: str,
+        voice_id: str = "21m00Tcm4TlvDq8ikWAM",  # Rachel (default)
+        model_id: str = "eleven_monolingual_v1",
+        stability: float = 0.5,
+        similarity_boost: float = 0.75
+    ) -> Dict[str, Any]:
+        """Generate speech from text.
+
+        Args:
+            text: Text to speak
+            voice_id: ElevenLabs voice ID
+            model_id: Model to use
+            stability: Voice stability (0-1)
+            similarity_boost: Voice similarity (0-1)
+
+        Returns:
+            Dict with audio data or error
+        """
+        if not self.api_key:
+            return {"error": "No ElevenLabs API key"}
+
+        headers = {
+            "xi-api-key": self.api_key,
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "text": text,
+            "model_id": model_id,
+            "voice_settings": {
+                "stability": stability,
+                "similarity_boost": similarity_boost
+            }
+        }
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"{self.base_url}/text-to-speech/{voice_id}",
+                    headers=headers,
+                    json=payload
+                ) as resp:
+                    if resp.status != 200:
+                        return {"error": await resp.text()}
+
+                    audio_data = await resp.read()
+                    return {
+                        "audio_data": base64.b64encode(audio_data).decode(),
+                        "format": "mp3",
+                        "text": text
+                    }
+
+        except Exception as e:
+            return {"error": str(e)}
+
+    async def list_voices(self) -> List[Dict[str, str]]:
+        """List available voices."""
+        if not self.api_key:
+            return []
+
+        headers = {"xi-api-key": self.api_key}
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    f"{self.base_url}/voices",
+                    headers=headers
+                ) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        return [
+                            {"id": v["voice_id"], "name": v["name"]}
+                            for v in data.get("voices", [])
+                        ]
+        except:
+            pass
+        return []
+
+
+class MusicGenerationTool(BaseTool):
+    """Tool for generating music."""
+
+    def __init__(self, api_token: Optional[str] = None, backend: str = "musicgen"):
+        self.api_token = api_token
+        self.backend_name = backend
+
+        if backend == "riffusion":
+            self.backend = RiffusionBackend(api_token)
+        else:
+            self.backend = MusicGenBackend(api_token)
+
+        self._spec = ToolSpec(
+            name="music_generation",
+            description="Generate music from text description",
+            category=ToolCategory.AUDIO,
+            input_schema={
+                "type": "object",
+                "required": ["prompt"],
+                "properties": {
+                    "prompt": {"type": "string", "description": "Music description"},
+                    "duration": {"type": "integer", "default": 8},
+                    "seed": {"type": "integer"}
+                }
+            },
+            output_schema={
+                "type": "object",
+                "properties": {
+                    "audio_url": {"type": "string"},
+                    "prompt": {"type": "string"}
+                }
+            }
+        )
+
+    @property
+    def spec(self) -> ToolSpec:
+        return self._spec
+
+    async def execute(
+        self,
+        inputs: Dict[str, Any],
+        context: Optional[Dict[str, Any]] = None
+    ) -> ToolResult:
+        prompt = inputs.get("prompt", "")
+        if not prompt:
+            return ToolResult(success=False, output=None, error="No prompt")
+
+        result = await self.backend.generate_music(
+            prompt=prompt,
+            duration=inputs.get("duration", 8),
+            seed=inputs.get("seed")
+        )
+
+        if "error" in result:
+            return ToolResult(success=False, output=None, error=result["error"])
+
+        return ToolResult(
+            success=True,
+            output=result,
+            metadata={"backend": self.backend_name}
+        )
+
+
+class TTSTool(BaseTool):
+    """Tool for text-to-speech."""
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.backend = ElevenLabsBackend(api_key)
+        self._spec = ToolSpec(
+            name="text_to_speech",
+            description="Convert text to speech audio",
+            category=ToolCategory.AUDIO,
+            input_schema={
+                "type": "object",
+                "required": ["text"],
+                "properties": {
+                    "text": {"type": "string"},
+                    "voice_id": {"type": "string"},
+                    "stability": {"type": "number", "default": 0.5},
+                    "similarity_boost": {"type": "number", "default": 0.75}
+                }
+            },
+            output_schema={
+                "type": "object",
+                "properties": {
+                    "audio_data": {"type": "string", "description": "Base64 encoded audio"},
+                    "format": {"type": "string"}
+                }
+            }
+        )
+
+    @property
+    def spec(self) -> ToolSpec:
+        return self._spec
+
+    async def execute(
+        self,
+        inputs: Dict[str, Any],
+        context: Optional[Dict[str, Any]] = None
+    ) -> ToolResult:
+        text = inputs.get("text", "")
+        if not text:
+            return ToolResult(success=False, output=None, error="No text")
+
+        result = await self.backend.text_to_speech(
+            text=text,
+            voice_id=inputs.get("voice_id", "21m00Tcm4TlvDq8ikWAM"),
+            stability=inputs.get("stability", 0.5),
+            similarity_boost=inputs.get("similarity_boost", 0.75)
+        )
+
+        if "error" in result:
+            return ToolResult(success=False, output=None, error=result["error"])
+
+        return ToolResult(success=True, output=result)
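
The two tool classes above wrap the backends behind the package's `BaseTool` interface. A minimal usage sketch, assuming `ToolResult` exposes `success`, `output`, and `error` as attributes (as its constructor keywords suggest) and that the relevant API keys are set; the riffusion backend is chosen because its `generate_music` accepts the `seed` keyword that `MusicGenerationTool.execute` forwards:

import asyncio
import base64
import os

from vibe_aigc.audio import MusicGenerationTool, TTSTool

async def main() -> None:
    # Music: Replicate-backed generation returns a URL to the rendered audio.
    music = MusicGenerationTool(
        api_token=os.environ.get("REPLICATE_API_TOKEN"),
        backend="riffusion",
    )
    result = await music.execute({"prompt": "upbeat electronic cyberpunk", "duration": 8})
    if result.success:
        print("music:", result.output["audio_url"])
    else:
        print("music failed:", result.error)

    # Speech: ElevenLabs TTS returns base64-encoded mp3 in audio_data.
    tts = TTSTool(api_key=os.environ.get("ELEVENLABS_API_KEY"))
    result = await tts.execute({"text": "Welcome to the show."})
    if result.success:
        with open("narration.mp3", "wb") as f:
            f.write(base64.b64decode(result.output["audio_data"]))
    else:
        print("tts failed:", result.error)

asyncio.run(main())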