dexto 1.1.2 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +270 -160
  2. package/agents/agent-registry.json +16 -2
  3. package/agents/agent-template.yml +6 -12
  4. package/agents/default-agent.yml +6 -16
  5. package/agents/nano-banana-agent/README.md +200 -0
  6. package/agents/nano-banana-agent/nano-banana-agent.yml +68 -0
  7. package/agents/podcast-agent/README.md +168 -0
  8. package/agents/podcast-agent/podcast-agent.yml +167 -0
  9. package/agents/triage-demo/billing-agent.yml +2 -2
  10. package/agents/triage-demo/product-info-agent.yml +2 -3
  11. package/agents/triage-demo/technical-support-agent.yml +2 -3
  12. package/dist/src/app/{chunk-OIBH674O.js → chunk-BKF5BGLX.js} +318 -35
  13. package/dist/src/app/{chunk-FVWAYUL4.js → chunk-N7FUUBGT.js} +1 -1
  14. package/dist/src/app/{chunk-UG5P4DIL.js → chunk-OONTQZRM.js} +2 -2
  15. package/dist/src/app/{cli-confirmation-handler-7BZ6BMSE.js → cli-confirmation-handler-7235V7GL.js} +1 -1
  16. package/dist/src/app/{errors-EYGUMLKB.js → errors-YCS63OK6.js} +1 -1
  17. package/dist/src/app/index.js +19 -8
  18. package/dist/src/app/{loader-LJJQ4NDP.js → loader-PVRMNHST.js} +2 -2
  19. package/dist/src/app/{path-O5L5AW7V.js → path-DJ5C7EUS.js} +1 -1
  20. package/dist/src/app/{registry-HIVAEL5E.js → registry-KOOLQYP4.js} +3 -3
  21. package/dist/src/app/{sqlite-backend-FK7U4D6Z.js → sqlite-backend-NCFS7FN6.js} +1 -1
  22. package/dist/src/app/webui/.next/standalone/.next/static/chunks/122-4d4c8aa883d114a2.js +1 -0
  23. package/dist/src/app/webui/.next/standalone/.next/static/chunks/216-f5dbf2145a48ae92.js +1 -0
  24. package/dist/src/app/webui/.next/standalone/.next/static/chunks/43-4f3d01c7feaf132f.js +1 -0
  25. package/dist/src/app/webui/.next/standalone/.next/static/chunks/app/{layout-43f98b6d34953fcf.js → layout-36c240720861a312.js} +1 -1
  26. package/dist/src/app/webui/.next/standalone/.next/static/chunks/app/page-ca08c66042cb54c8.js +1 -0
  27. package/dist/src/app/webui/.next/{static/chunks/app/playground/page-c51bb3cc58225dc3.js → standalone/.next/static/chunks/app/playground/page-07a79d22b26d37f4.js} +1 -1
  28. package/dist/src/app/webui/.next/standalone/.next/static/css/c1d26dc78adbeb53.css +3 -0
  29. package/dist/src/app/webui/.next/standalone/package.json +3 -1
  30. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/BUILD_ID +1 -1
  31. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/app-build-manifest.json +9 -9
  32. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/build-manifest.json +2 -2
  33. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/prerender-manifest.json +3 -3
  34. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/required-server-files.json +1 -1
  35. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
  36. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/app/page.js +3 -4
  37. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/app/page.js.nft.json +1 -1
  38. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/app/page_client-reference-manifest.js +1 -1
  39. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/app/playground/page.js +2 -2
  40. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/app/playground/page.js.nft.json +1 -1
  41. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/app/playground/page_client-reference-manifest.js +1 -1
  42. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/chunks/213.js +1 -1
  43. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/chunks/489.js +1 -0
  44. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/pages/500.html +1 -1
  45. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/server-reference-manifest.json +1 -1
  46. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/122-4d4c8aa883d114a2.js +1 -0
  47. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/216-f5dbf2145a48ae92.js +1 -0
  48. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/43-4f3d01c7feaf132f.js +1 -0
  49. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/app/{layout-43f98b6d34953fcf.js → layout-36c240720861a312.js} +1 -1
  50. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/app/page-ca08c66042cb54c8.js +1 -0
  51. package/dist/src/app/webui/.next/standalone/{.next/static/chunks/app/playground/page-c51bb3cc58225dc3.js → src/app/webui/.next/static/chunks/app/playground/page-07a79d22b26d37f4.js} +1 -1
  52. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/css/c1d26dc78adbeb53.css +3 -0
  53. package/dist/src/app/webui/.next/standalone/src/app/webui/server.js +1 -1
  54. package/dist/src/app/webui/.next/static/chunks/122-4d4c8aa883d114a2.js +1 -0
  55. package/dist/src/app/webui/.next/static/chunks/216-f5dbf2145a48ae92.js +1 -0
  56. package/dist/src/app/webui/.next/static/chunks/43-4f3d01c7feaf132f.js +1 -0
  57. package/dist/src/app/webui/.next/static/chunks/app/{layout-43f98b6d34953fcf.js → layout-36c240720861a312.js} +1 -1
  58. package/dist/src/app/webui/.next/static/chunks/app/page-ca08c66042cb54c8.js +1 -0
  59. package/dist/src/app/webui/.next/{standalone/src/app/webui/.next/static/chunks/app/playground/page-c51bb3cc58225dc3.js → static/chunks/app/playground/page-07a79d22b26d37f4.js} +1 -1
  60. package/dist/src/app/webui/.next/static/css/c1d26dc78adbeb53.css +3 -0
  61. package/dist/src/core/{chunk-2O5JENNA.js → chunk-XABD32T2.js} +318 -35
  62. package/dist/src/core/index.cjs +320 -35
  63. package/dist/src/core/index.d.cts +3 -0
  64. package/dist/src/core/index.d.ts +3 -0
  65. package/dist/src/core/index.js +1 -1
  66. package/dist/src/core/{sqlite-backend-4KSJRUVL.js → sqlite-backend-3VNBKYIT.js} +1 -1
  67. package/package.json +3 -1
  68. package/dist/src/app/webui/.next/standalone/.next/static/chunks/262-807eb8fa558ce992.js +0 -1
  69. package/dist/src/app/webui/.next/standalone/.next/static/chunks/42-ae0665ff0534f075.js +0 -1
  70. package/dist/src/app/webui/.next/standalone/.next/static/chunks/500-99efaae6ea436094.js +0 -1
  71. package/dist/src/app/webui/.next/standalone/.next/static/chunks/app/page-77c27b87857033eb.js +0 -1
  72. package/dist/src/app/webui/.next/standalone/.next/static/css/3d91ad5ec330296f.css +0 -3
  73. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/server/chunks/950.js +0 -1
  74. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/262-807eb8fa558ce992.js +0 -1
  75. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/42-ae0665ff0534f075.js +0 -1
  76. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/500-99efaae6ea436094.js +0 -1
  77. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/chunks/app/page-77c27b87857033eb.js +0 -1
  78. package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/css/3d91ad5ec330296f.css +0 -3
  79. package/dist/src/app/webui/.next/static/chunks/262-807eb8fa558ce992.js +0 -1
  80. package/dist/src/app/webui/.next/static/chunks/42-ae0665ff0534f075.js +0 -1
  81. package/dist/src/app/webui/.next/static/chunks/500-99efaae6ea436094.js +0 -1
  82. package/dist/src/app/webui/.next/static/chunks/app/page-77c27b87857033eb.js +0 -1
  83. package/dist/src/app/webui/.next/static/css/3d91ad5ec330296f.css +0 -3
  84. /package/dist/src/app/webui/.next/standalone/.next/static/{sig2NLqxammCphOh8eQRQ → RkBbxCa0lrWNsf3yPd4rX}/_buildManifest.js +0 -0
  85. /package/dist/src/app/webui/.next/standalone/.next/static/{sig2NLqxammCphOh8eQRQ → RkBbxCa0lrWNsf3yPd4rX}/_ssgManifest.js +0 -0
  86. /package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/{sig2NLqxammCphOh8eQRQ → RkBbxCa0lrWNsf3yPd4rX}/_buildManifest.js +0 -0
  87. /package/dist/src/app/webui/.next/standalone/src/app/webui/.next/static/{sig2NLqxammCphOh8eQRQ → RkBbxCa0lrWNsf3yPd4rX}/_ssgManifest.js +0 -0
  88. /package/dist/src/app/webui/.next/static/{sig2NLqxammCphOh8eQRQ → RkBbxCa0lrWNsf3yPd4rX}/_buildManifest.js +0 -0
  89. /package/dist/src/app/webui/.next/static/{sig2NLqxammCphOh8eQRQ → RkBbxCa0lrWNsf3yPd4rX}/_ssgManifest.js +0 -0
@@ -7,12 +7,13 @@ mcpServers:
7
7
  - -y
8
8
  - "@modelcontextprotocol/server-filesystem"
9
9
  - .
10
- puppeteer:
10
+ playwright:
11
11
  type: stdio
12
12
  command: npx
13
13
  args:
14
14
  - -y
15
- - "@truffle-ai/puppeteer-server"
15
+ - "@playwright/mcp@latest"
16
+
16
17
  # hf:
17
18
  # type: stdio
18
19
  # command: npx
@@ -31,26 +32,15 @@ systemPrompt:
31
32
  Use these tools when appropriate to answer user queries.
32
33
  You can use multiple tools in sequence to solve complex problems.
33
34
  After each tool result, determine if you need more information or can provide a final answer.
34
-
35
- When using Puppeteer tools for web interaction:
36
- 1. Use `puppeteer_list_interactables` to understand the current page state and identify interactable elements **before** attempting to click or type.
37
- 2. Examine the returned list. Identify the desired element based on its text and attributes.
38
- 3. **Strongly prefer** using the `selector` provided for that element in the list when calling `puppeteer_click` or `puppeteer_type`.
39
- 4. If the provided selector fails or seems unreliable, construct a new selector using stable attributes from the list like `id`, `name`, `data-testid`, `aria-label`, or `role`.
40
- 5. **Avoid generating selectors based on fragile patterns** like deep descendant paths (e.g., `div > div > span > a`), `:nth-child`, or dynamic-looking class names unless there are no other options.
41
- 6. **Error Handling:** If `puppeteer_click` or `puppeteer_type` fails with "No elements found", **DO NOT** just retry with a slightly modified selector. Instead:
42
- a. Call `puppeteer_list_interactables` **again** to get the fresh page state.
43
- b. Re-identify the target element in the **new** list.
44
- c. Use the **new** selector or attributes from the updated list for the next attempt.
45
- 7. Use `puppeteer_wait_for_selector` or `puppeteer_wait_for_load` after actions that trigger navigation or dynamic content loading if needed.
46
- 8. **After navigating** to potentially high-security sites (like Amazon, Google login, banks, etc.), **always run `puppeteer_check_for_captcha`** to ensure the page is usable before proceeding with other actions.
47
- 9. If you encounter a captcha, let the user know that they need to solve the captcha.
48
35
  - id: dateTime
49
36
  type: dynamic
50
37
  priority: 10
51
38
  source: dateTime
52
39
  enabled: true
53
40
 
41
+ # Optional greeting shown at chat start (UI can consume this)
42
+ greeting: "Hi! I’m Dexto — how can I help today?"
43
+
54
44
  # # describes the llm configuration
55
45
  llm:
56
46
  provider: openai
@@ -0,0 +1,200 @@
1
+ # Nano Banana Agent
2
+
3
+ A Dexto agent that provides access to Google's **Gemini 2.5 Flash Image** model for image generation and editing through a lean, powerful MCP server.
4
+
5
+ ## 🎯 What is Gemini 2.5 Flash Image?
6
+
7
+ Gemini 2.5 Flash Image is Google's cutting-edge AI model that enables:
8
+ - **Near-instantaneous** image generation and editing
9
+ - **Object removal** with perfect background preservation
10
+ - **Background alteration** while maintaining subject integrity
11
+ - **Image fusion** for creative compositions
12
+ - **Style modification** with character consistency
13
+ - **Visible and invisible watermarks** (SynthID) for digital safety
14
+
15
+ ## 🚀 Key Features
16
+
17
+ ### Core Capabilities
18
+ - **Image Generation**: Create images from text prompts with various styles and aspect ratios
19
+ - **Image Editing**: Modify existing images based on natural language descriptions
20
+ - **Object Removal**: Remove unwanted objects while preserving the background
21
+ - **Background Changes**: Replace backgrounds while keeping subjects intact
22
+ - **Image Fusion**: Combine multiple images into creative compositions
23
+ - **Style Transfer**: Apply artistic styles to images
24
+
25
+ ### Advanced Features
26
+ - **Character Consistency**: Maintain facial features and identities across edits
27
+ - **Scene Preservation**: Seamless blending with original lighting and composition
28
+ - **Multi-Image Processing**: Handle batch operations and complex compositions
29
+ - **Safety Features**: Built-in safety filters and provenance signals
30
+
31
+ ## 🛠️ Setup
32
+
33
+ ### Prerequisites
34
+ - Dexto framework installed
35
+ - Google AI API key (Gemini API access)
36
+ - Node.js 20.0.0 or higher
37
+
38
+ ### Installation
39
+ 1. **Set up environment variables**:
40
+ ```bash
41
+ export GOOGLE_GENERATIVE_AI_API_KEY="your-google-ai-api-key"
42
+ # or
43
+ export GEMINI_API_KEY="your-google-ai-api-key"
44
+ ```
45
+
46
+ 2. **Run the agent** (the MCP server will be automatically downloaded via npx):
47
+ ```bash
48
+ # From the dexto repository root
49
+ npx dexto -a agents/nano-banana-agent/nano-banana-agent.yml
50
+ ```
51
+
52
+ The agent configuration runs `npx -y @truffle-ai/nano-banana-server`, which automatically downloads and runs the latest version of the MCP server.
53
+
54
+ ## 📋 Available Tools
55
+
56
+ The agent provides access to 3 essential tools:
57
+
58
+ ### 1. `generate_image`
59
+ Generate new images from text prompts.
60
+
61
+ **Example:**
62
+ ```
63
+ Generate a majestic mountain landscape at sunset in realistic style with 16:9 aspect ratio
64
+ ```
65
+
66
+ ### 2. `process_image`
67
+ Process existing images based on detailed instructions. This tool can handle any image editing task including object removal, background changes, style transfer, adding elements, and more.
68
+
69
+ **Example:**
70
+ ```
71
+ Remove the red car in the background from /path/to/photo.jpg
72
+ ```
73
+
74
+ **Example:**
75
+ ```
76
+ Change the background of /path/to/portrait.jpg to a beach sunset with palm trees
77
+ ```
78
+
79
+ **Example:**
80
+ ```
81
+ Apply Van Gogh painting style with thick brushstrokes to /path/to/photo.jpg
82
+ ```
83
+
84
+ ### 3. `process_multiple_images`
85
+ Process multiple images together based on detailed instructions. This tool can combine images, create collages, blend compositions, or perform any multi-image operation.
86
+
87
+ **Example:**
88
+ ```
89
+ Place the person from /path/to/person.jpg into the landscape from /path/to/landscape.jpg as if they were standing there
90
+ ```
91
+
92
+ ## 📤 Response Format
93
+
94
+ Successful operations return both image data and metadata:
95
+ ```json
96
+ {
97
+ "content": [
98
+ {
99
+ "type": "image",
100
+ "data": "base64-encoded-image-data",
101
+ "mimeType": "image/png"
102
+ },
103
+ {
104
+ "type": "text",
105
+ "text": "{\n \"output_path\": \"/absolute/path/to/saved/image.png\",\n \"size_bytes\": 12345,\n \"format\": \"image/png\"\n}"
106
+ }
107
+ ]
108
+ }
109
+ ```
110
+
111
+ ## 🎨 Popular Use Cases
112
+
113
+ ### 1. **Selfie Enhancement**
114
+ - Remove blemishes and unwanted objects
115
+ - Change backgrounds for professional photos
116
+ - Apply artistic filters and styles
117
+ - Create figurine effects (Nano Banana's signature feature)
118
+
119
+ ### 2. **Product Photography**
120
+ - Remove backgrounds for clean product shots
121
+ - Add or remove objects from scenes
122
+ - Apply consistent styling across product images
123
+
124
+ ### 3. **Creative Compositions**
125
+ - Fuse multiple images into unique scenes
126
+ - Apply artistic styles to photos
127
+ - Create imaginative scenarios from real photos
128
+
129
+ ### 4. **Content Creation**
130
+ - Generate images for social media
131
+ - Create variations of existing content
132
+ - Apply brand-consistent styling
133
+
134
+ ## 🔧 Configuration
135
+
136
+ ### Environment Variables
137
+ - `GOOGLE_GENERATIVE_AI_API_KEY` or `GEMINI_API_KEY`: Your Google AI API key (required)
138
+
139
+ ### Agent Settings
140
+ - **LLM Provider**: Google Gemini 2.5 Flash
141
+ - **Storage**: In-memory cache with SQLite database
142
+ - **Tool Confirmation**: Auto-approve mode for a smoother development experience
143
+
144
+ ## 📁 Supported Formats
145
+
146
+ **Input/Output Formats:**
147
+ - JPEG (.jpg, .jpeg)
148
+ - PNG (.png)
149
+ - WebP (.webp)
150
+ - GIF (.gif)
151
+
152
+ **File Size Limits:**
153
+ - Maximum: 20MB per image
154
+ - Recommended: Under 10MB for optimal performance
155
+
156
+ ## 🎯 Example Interactions
157
+
158
+ ### Generate a Creative Image
159
+ ```
160
+ User: "Generate a futuristic cityscape at night with flying cars and neon lights"
161
+ Agent: I'll create a futuristic cityscape image for you using Nano Banana's image generation capabilities.
162
+ ```
163
+
164
+ ### Remove Unwanted Objects
165
+ ```
166
+ User: "Remove the power lines from this photo: /path/to/landscape.jpg"
167
+ Agent: I'll remove the power lines from your landscape photo while preserving the natural background.
168
+ ```
169
+
170
+ ### Create Figurine Effect
171
+ ```
172
+ User: "Transform this selfie into a mini figurine on a desk: /path/to/selfie.jpg"
173
+ Agent: I'll create Nano Banana's signature figurine effect, transforming your selfie into a mini figurine displayed on a desk.
174
+ ```
175
+
176
+ ### Change Background
177
+ ```
178
+ User: "Change the background of this portrait to a professional office setting: /path/to/portrait.jpg"
179
+ Agent: I'll replace the background with a professional office setting while keeping you as the main subject.
180
+ ```
181
+
182
+ ## 🔒 Safety & Ethics
183
+
184
+ Nano Banana includes built-in safety features:
185
+ - **SynthID Watermarks**: Invisible provenance signals
186
+ - **Safety Filters**: Content moderation and filtering
187
+ - **Character Consistency**: Maintains identity integrity
188
+ - **Responsible AI**: Designed to prevent misuse
189
+
190
+ ## 🤝 Contributing
191
+
192
+ We welcome contributions! Please see our [Contributing Guidelines](../../CONTRIBUTING.md) for details.
193
+
194
+ ## 📄 License
195
+
196
+ This project is licensed under the MIT License - see the [LICENSE](../../LICENSE) file for details.
197
+
198
+ ---
199
+
200
+ **Note**: This agent provides access to Google's Gemini 2.5 Flash Image model through the Model Context Protocol (MCP). The implementation returns both image content (base64-encoded) and text metadata, as defined by the MCP specification, allowing for direct image display in compatible clients. A valid Google AI API key is required, and usage is subject to Google's terms of service and usage limits.
@@ -0,0 +1,68 @@
1
+ # Dexto Agent Configuration for Nano Banana (Gemini 2.5 Flash Image) MCP Server
2
+ # Generated on 2025-01-27T00:00:00.000Z
3
+
4
+ systemPrompt: |
5
+ You are an AI assistant specialized in advanced image generation and editing using Google's Nano Banana (Gemini 2.5 Flash Image) model. You have access to cutting-edge AI tools for:
6
+
7
+ - **Image Generation**: Create stunning images from text prompts with various styles and aspect ratios
8
+ - **Image Editing**: Modify existing images using natural language descriptions
9
+ - **Object Removal**: Remove unwanted objects while perfectly preserving the background
10
+ - **Background Changes**: Replace backgrounds seamlessly while keeping subjects intact
11
+ - **Image Fusion**: Combine multiple images into creative compositions
12
+ - **Style Transfer**: Apply artistic styles to images with character consistency
13
+ - **Advanced Features**: Character consistency, scene preservation, and multi-image processing
14
+
15
+ When working with images:
16
+ 1. Always validate that input images exist and are in supported formats (JPG, PNG, WebP, GIF)
17
+ 2. Provide clear feedback about what operations you're performing
18
+ 3. Save processed images with descriptive names
19
+ 4. Include image information (dimensions, file size, format) in your responses
20
+ 5. Suggest additional enhancements and creative possibilities when appropriate
21
+ 6. Leverage Nano Banana's signature features like the figurine effect and character consistency
22
+
23
+ Key Nano Banana Capabilities:
24
+ - **Near-instantaneous** processing with high visual coherence
25
+ - **Character consistency** across multiple edits
26
+ - **Scene preservation** with seamless background blending
27
+ - **Safety features** including SynthID watermarks
28
+ - **Multi-image processing** for complex compositions
29
+
30
+ Popular use cases:
31
+ - Selfie enhancement and creative variations
32
+ - Product photography with clean backgrounds
33
+ - Artistic style applications
34
+ - Object removal from photos
35
+ - Background replacement for portraits
36
+ - Creating figurine effects (Nano Banana's signature feature)
37
+ - Image fusion for creative compositions
38
+
39
+ Supported image formats: JPG, JPEG, PNG, WebP, GIF
40
+ Maximum file size: 20MB per image
41
+
42
+ mcpServers:
43
+ nano_banana:
44
+ type: stdio
45
+ command: npx
46
+ args:
47
+ - -y
48
+ - "@truffle-ai/nano-banana-server"
49
+ connectionMode: strict
50
+ env:
51
+ GEMINI_API_KEY: $GOOGLE_GENERATIVE_AI_API_KEY
52
+ timeout: 60000
53
+
54
+ toolConfirmation:
55
+ mode: "auto-approve"
56
+ allowedToolsStorage: "memory"
57
+
58
+ llm:
59
+ provider: google
60
+ model: gemini-2.5-flash
61
+ apiKey: $GOOGLE_GENERATIVE_AI_API_KEY
62
+
63
+ storage:
64
+ cache:
65
+ type: in-memory
66
+ database:
67
+ type: sqlite
68
+ path: .dexto/database/nano-banana-agent.db
@@ -0,0 +1,168 @@
1
+ # Advanced Podcast Generation Agent
2
+
3
+ An AI agent for creating multi-speaker audio content using the Gemini TTS MCP server.
4
+
5
+ ## Overview
6
+
7
+ This agent uses the Gemini TTS MCP server to generate high-quality speech with advanced multi-speaker capabilities. It supports 30 prebuilt voices and natural language tone control, and it can generate entire conversations with multiple speakers in a single request. The server returns audio content that can be played directly in web interfaces.
8
+
9
+ ## Key Features
10
+
11
+ ### 🎤 **Native Multi-Speaker Support**
12
+ - Generate conversations with multiple speakers in one request
13
+ - No need for separate audio files or post-processing
14
+ - Natural conversation flow with different voices per speaker
15
+
16
+ ### 🎵 **30 Prebuilt Voices**
17
+ - **Zephyr** - Bright and energetic
18
+ - **Puck** - Upbeat and cheerful
19
+ - **Charon** - Informative and clear
20
+ - **Kore** - Firm and authoritative
21
+ - **Fenrir** - Excitable and dynamic
22
+ - **Leda** - Youthful and fresh
23
+ - **Orus** - Firm and confident
24
+ - **Aoede** - Breezy and light
25
+ - **Callirrhoe** - Easy-going and relaxed
26
+ - **Autonoe** - Bright and optimistic
27
+ - **Enceladus** - Breathy and intimate
28
+ - **Iapetus** - Clear and articulate
29
+ - **Umbriel** - Easy-going and friendly
30
+ - **Algieba** - Smooth and polished
31
+ - **Despina** - Smooth and elegant
32
+ - **Erinome** - Clear and precise
33
+ - **Algenib** - Gravelly and distinctive
34
+ - **Rasalgethi** - Informative and knowledgeable
35
+ - **Laomedeia** - Upbeat and lively
36
+ - **Achernar** - Soft and gentle
37
+ - **Alnilam** - Firm and steady
38
+ - **Schedar** - Even and balanced
39
+ - **Gacrux** - Mature and experienced
40
+ - **Pulcherrima** - Forward and engaging
41
+ - **Achird** - Friendly and warm
42
+ - **Zubenelgenubi** - Casual and approachable
43
+ - **Vindemiatrix** - Gentle and soothing
44
+ - **Sadachbia** - Lively and animated
45
+ - **Sadaltager** - Knowledgeable and wise
46
+ - **Sulafat** - Warm and inviting
47
+
48
+ ### 🌐 **WebUI Compatible**
49
+ - Returns audio content that can be played directly in web interfaces
50
+ - Base64-encoded WAV audio data
51
+ - Structured content with both text summaries and audio data
52
+
53
+ ### 🎭 **Natural Language Tone Control**
54
+ - "Say cheerfully: Welcome to our show!"
55
+ - "Speak in a formal tone: Welcome to our meeting"
56
+ - "Use an excited voice: This is amazing news!"
57
+ - "Speak slowly and clearly: This is important information"
58
+
59
+ ## Setup
60
+
61
+ 1. **Get API Keys**:
62
+ ```bash
63
+ export GOOGLE_GENERATIVE_AI_API_KEY="your-gemini-api-key"  # podcast-agent.yml passes this to the TTS server as GEMINI_API_KEY
64
+ export OPENAI_API_KEY="your-openai-api-key"
65
+ ```
66
+
67
+ 2. **Run the Agent**:
68
+ ```bash
69
+ dexto --mode web -a agents/podcast-agent/podcast-agent.yml
70
+ ```
71
+
72
+ The agent will automatically install the Gemini TTS MCP server from npm when needed.
73
+
74
+ ## Usage Examples
75
+
76
+ ### Single Speaker
77
+ ```
78
+ "Generate speech: 'Welcome to our podcast' with voice 'Kore'"
79
+ "Create audio: 'Say cheerfully: Have a wonderful day!' with voice 'Puck'"
80
+ "Make a formal announcement: 'Speak in a formal tone: Important news today' with voice 'Zephyr'"
81
+ ```
82
+
83
+ ### Multi-Speaker Conversations
84
+ ```
85
+ "Generate a conversation between Dr. Anya (voice: Kore) and Liam (voice: Puck) about AI"
86
+ "Create an interview with host (voice: Zephyr) and guest (voice: Orus) discussing climate change"
87
+ "Make a story with narrator (voice: Schedar) and character (voice: Laomedeia)"
88
+ "Generate a podcast with three speakers: host (Zephyr), expert (Kore), and interviewer (Puck)"
89
+ ```
90
+
91
+ ### Podcast Types
92
+ ```
93
+ "Create an educational podcast about AI with clear, professional voices"
94
+ "Generate a storytelling podcast with expressive character voices"
95
+ "Make a news podcast with authoritative, formal delivery"
96
+ "Create an interview with host and guest using different voices"
97
+ ```
98
+
99
+ ## Available Tools
100
+
101
+ ### **Gemini TTS Tools**
102
+ - `generate_speech` - Single-speaker audio generation
103
+ - `generate_conversation` - Multi-speaker conversations
104
+ - `list_voices` - Browse available voices with characteristics
105
+
106
+ ### **File Management**
107
+ - `list_files` - Browse audio files
108
+ - `read_file` - Access file information
109
+ - `write_file` - Save generated content
110
+ - `delete_file` - Clean up files
111
+
112
+ ## Voice Selection Guide
113
+
114
+ ### **Professional Voices**
115
+ - **Kore** - Firm, authoritative (great for hosts, experts)
116
+ - **Orus** - Firm, professional (business content)
117
+ - **Zephyr** - Bright, engaging (news, announcements)
118
+ - **Schedar** - Even, balanced (narrators, guides)
119
+
120
+ ### **Expressive Voices**
121
+ - **Puck** - Upbeat, enthusiastic (entertainment, stories)
122
+ - **Laomedeia** - Upbeat, energetic (dynamic content)
123
+ - **Fenrir** - Excitable, passionate (exciting topics)
124
+ - **Achird** - Friendly, warm (casual conversations)
125
+
126
+ ### **Character Voices**
127
+ - **Umbriel** - Easy-going, relaxed (casual hosts)
128
+ - **Erinome** - Clear, articulate (educational content)
129
+ - **Autonoe** - Bright, optimistic (positive content)
130
+ - **Leda** - Youthful, fresh (younger audiences)
131
+
132
+ ## Multi-Speaker Configuration
133
+
134
+ ### **Example Speaker Setup**
135
+ ```json
136
+ {
137
+ "speakers": [
138
+ {
139
+ "name": "Dr. Anya",
140
+ "voice": "Kore",
141
+ "characteristics": "Firm, professional"
142
+ },
143
+ {
144
+ "name": "Liam",
145
+ "voice": "Puck",
146
+ "characteristics": "Upbeat, enthusiastic"
147
+ }
148
+ ]
149
+ }
150
+ ```
151
+
152
+ ### **Conversation Format**
153
+ ```
154
+ Dr. Anya: Welcome to our science podcast!
155
+ Liam: Thanks for having me, Dr. Anya!
156
+ Dr. Anya: Today we're discussing artificial intelligence.
157
+ Liam: It's such an exciting field!
158
+ ```
159
+
160
+ ## Advanced Features
161
+
162
+ - **Rate Limit Handling**: Graceful fallbacks with dummy audio when API limits are hit
163
+ - **Controllable Style**: Accent, pace, and tone control
164
+ - **High-Quality Audio**: Studio-grade WAV output
165
+ - **Efficient Processing**: Single request for complex conversations
166
+ - **Structured Responses**: Both text summaries and audio data in responses
167
+
168
+ Simple, powerful, and focused on creating engaging multi-speaker audio content!
@@ -0,0 +1,167 @@
1
+ # Advanced Podcast Generation Agent
2
+ # Uses Gemini TTS for multi-speaker audio generation
3
+
4
+ mcpServers:
5
+ gemini_tts:
6
+ type: stdio
7
+ command: npx
8
+ args:
9
+ - -y
10
+ - "@truffle-ai/gemini-tts-server"
11
+ env:
12
+ GEMINI_API_KEY: $GOOGLE_GENERATIVE_AI_API_KEY
13
+ timeout: 60000
14
+ connectionMode: strict
15
+
16
+ filesystem:
17
+ type: stdio
18
+ command: npx
19
+ args:
20
+ - -y
21
+ - "@modelcontextprotocol/server-filesystem"
22
+ - .
23
+
24
+ systemPrompt: |
25
+ You are an advanced podcast generation agent that creates multi-speaker audio content using Google Gemini TTS.
26
+
27
+ ## Your Capabilities
28
+ - Generate high-quality speech from text using Gemini TTS
29
+ - Create multi-speaker conversations in a single generation
30
+ - Use 30 different prebuilt voices with unique characteristics
31
+ - Apply natural language tone control (e.g., "Say cheerfully:")
32
+ - Save audio files with descriptive names
33
+
34
+ ## Gemini TTS MCP Usage
35
+
36
+ ### Single Speaker Generation
37
+ - Use `generate_speech` to generate single-speaker audio
38
+ - Choose from 30 prebuilt voices (Zephyr, Puck, Kore, etc.)
39
+ - Apply natural language tone instructions
40
+
41
+ ### Multi-Speaker Generation
42
+ - Use `generate_conversation` for multi-speaker conversations
43
+ - Configure different voices for each speaker
44
+ - Generate entire conversations in one request
45
+
46
+ ### Voice Discovery
47
+ - Use `list_voices` to get a complete list of all available voices with their characteristics
48
+ - This tool helps you choose the right voice for different content types
49
+
50
+ ### Voice Selection
51
+ Available voices with characteristics:
52
+ - **Zephyr** - Bright and energetic
53
+ - **Puck** - Upbeat and cheerful
54
+ - **Charon** - Informative and clear
55
+ - **Kore** - Firm and authoritative
56
+ - **Fenrir** - Excitable and dynamic
57
+ - **Leda** - Youthful and fresh
58
+ - **Orus** - Firm and confident
59
+ - **Aoede** - Breezy and light
60
+ - **Callirrhoe** - Easy-going and relaxed
61
+ - **Autonoe** - Bright and optimistic
62
+ - **Enceladus** - Breathy and intimate
63
+ - **Iapetus** - Clear and articulate
64
+ - **Umbriel** - Easy-going and friendly
65
+ - **Algieba** - Smooth and polished
66
+ - **Despina** - Smooth and elegant
67
+ - **Erinome** - Clear and precise
68
+ - **Algenib** - Gravelly and distinctive
69
+ - **Rasalgethi** - Informative and knowledgeable
70
+ - **Laomedeia** - Upbeat and lively
71
+ - **Achernar** - Soft and gentle
72
+ - **Alnilam** - Firm and steady
73
+ - **Schedar** - Even and balanced
74
+ - **Gacrux** - Mature and experienced
75
+ - **Pulcherrima** - Forward and engaging
76
+ - **Achird** - Friendly and warm
77
+ - **Zubenelgenubi** - Casual and approachable
78
+ - **Vindemiatrix** - Gentle and soothing
79
+ - **Sadachbia** - Lively and animated
80
+ - **Sadaltager** - Knowledgeable and wise
81
+ - **Sulafat** - Warm and inviting
82
+
83
+ ### Natural Language Tone Control
84
+ You can use natural language to control tone:
85
+ - "Say cheerfully: Welcome to our show!"
86
+ - "Speak in a formal tone: Welcome to our meeting"
87
+ - "Use an excited voice: This is amazing news!"
88
+ - "Speak slowly and clearly: This is important information"
89
+
90
+ ## Podcast Creation Guidelines
91
+
92
+ ### Voice Selection
93
+ - Choose appropriate voices for different speakers
94
+ - Use consistent voices for recurring characters
95
+ - Consider the content type when selecting voices
96
+
97
+ ### Content Types
98
+ - **Educational**: Clear, professional voices (Kore, Orus, Charon, Rasalgethi)
99
+ - **Storytelling**: Expressive voices (Puck, Laomedeia, Fenrir, Sadachbia)
100
+ - **News/Current Events**: Authoritative voices (Zephyr, Schedar, Alnilam)
101
+ - **Interview**: Different voices for host and guest (Achird, Autonoe, Umbriel)
102
+ - **Fiction**: Character voices with distinct personalities (Gacrux, Leda, Algenib)
103
+
104
+ ### Multi-Speaker Conversations - IMPORTANT
105
+ When users ask for multi-speaker content (like podcast intros, conversations, interviews):
106
+
107
+ 1. **Always use `generate_conversation` for conversations with multiple people**
108
+ 2. **Format the text with speaker labels**: "Speaker1: [text] Speaker2: [text]"
109
+ 3. **Create ONE audio file with ALL speakers**, not separate files per speaker
110
+ 4. **REQUIRED: Always define all speakers in the speakers array** - This parameter is mandatory and cannot be omitted
111
+ 5. **Never call generate_conversation without the speakers parameter** - it will fail
112
+
113
+ **Example for podcast intro:**
114
+ ```
115
+ Text: "Alex: Hello everyone, and welcome to our podcast! I'm Alex, your friendly host. Jamie: And I'm Jamie! I'm thrilled to be here with you all today."
116
+ Speakers: [
117
+ {"name": "Alex", "voice": "Achird"},
118
+ {"name": "Jamie", "voice": "Autonoe"}
119
+ ]
120
+ ```
121
+
122
+ **TOOL USAGE RULE**: When using `generate_conversation`, you MUST include both:
123
+ - `text`: The conversation with speaker labels
124
+ - `speakers`: Array of all speakers with their voice assignments
125
+
126
+ **DO NOT** call the tool without the speakers parameter - it will result in an error.
127
+
128
+ ### Multi-Speaker Examples
129
+ ```
130
+ "Generate a conversation between Dr. Anya (voice: Kore) and Liam (voice: Puck) about AI"
131
+ "Create an interview with host (voice: Zephyr) and guest (voice: Orus) discussing climate change"
132
+ "Make a story with narrator (voice: Schedar) and character (voice: Laomedeia)"
133
+ "Create a podcast intro with Alex (voice: Achird) and Jamie (voice: Autonoe)"
134
+ ```
135
+
136
+ ### Single Speaker Examples
137
+ ```
138
+ "Generate speech: 'Welcome to our podcast' with voice 'Kore'"
139
+ "Create audio: 'Say cheerfully: Have a wonderful day!' with voice 'Puck'"
140
+ "Make a formal announcement: 'Speak in a formal tone: Important news today' with voice 'Zephyr'"
141
+ ```
142
+
143
+ ### File Management
144
+ - Save audio files with descriptive names
145
+ - Organize files by episode or content type
146
+ - Use appropriate file formats (WAV)
147
+
148
+ Always provide clear feedback about what you're creating and explain your voice choices.
149
+
150
+ **CRITICAL**: For multi-speaker requests, always generate ONE cohesive audio file with ALL speakers, never split into separate files.
151
+
152
+ llm:
153
+ provider: openai
154
+ model: gpt-4o-mini
155
+ apiKey: $OPENAI_API_KEY
156
+
157
+ storage:
158
+ cache:
159
+ type: in-memory
160
+ database:
161
+ type: sqlite
162
+ path: .dexto/database/podcast-agent.db
163
+
164
+ toolConfirmation:
165
+ mode: auto-approve
166
+ timeout: 30000
167
+ allowedToolsStorage: memory
@@ -56,12 +56,12 @@ systemPrompt:
56
56
  errorHandling: skip
57
57
 
58
58
  mcpServers:
59
- puppeteer:
59
+ playwright:
60
60
  type: stdio
61
61
  command: npx
62
62
  args:
63
63
  - -y
64
- - "@truffle-ai/puppeteer-server"
64
+ - "@playwright/mcp@latest"
65
65
  filesystem:
66
66
  type: stdio
67
67
  command: npx
@@ -34,7 +34,6 @@ systemPrompt:
34
34
  Tools available to you:
35
35
  - Web research for latest product information and competitor analysis
36
36
  - Filesystem access to read product documentation and specs
37
- - Browser automation for demonstrating features
38
37
 
39
38
  Remember: Always provide accurate information about TeamFlow and acknowledge when you need to research or verify details.
40
39
 
@@ -73,12 +72,12 @@ mcpServers:
73
72
  - -y
74
73
  - "@modelcontextprotocol/server-filesystem"
75
74
  - .
76
- puppeteer:
75
+ playwright:
77
76
  type: stdio
78
77
  command: npx
79
78
  args:
80
79
  - -y
81
- - "@truffle-ai/puppeteer-server"
80
+ - "@playwright/mcp@latest"
82
81
 
83
82
  llm:
84
83
  provider: openai