@heart-of-gold/toolkit 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +1 -1
- package/plugins/babel-fish/skills/audio/SKILL.md +279 -262
- package/src/utils/transform.ts +66 -1
package/README.md
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
bunx @heart-of-gold/toolkit install --to codex
|
|
18
18
|
```
|
|
19
19
|
|
|
20
|
-
The Codex target also applies Codex-specific wording transforms for flagship shared skills so interactive flows like `brainstorm` and `plan` more strongly encourage Codex's structured user-input UI instead of falling back to plain text when richer selection UX is available.
|
|
20
|
+
The Codex target also applies Codex-specific wording transforms for flagship shared skills so interactive flows like `brainstorm` and `plan` more strongly encourage Codex's structured user-input UI instead of falling back to plain text when richer selection UX is available. It also rewrites slash-command references in installed skill text — both plain (`/plan`, `/work`) and plugin-prefixed (`/deep-thought:review`, `/marvin:compound`) — to Codex-style `$...` skill invocations.
|
|
21
21
|
|
|
22
22
|
### OpenCode
|
|
23
23
|
```bash
|
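package/package.json
CHANGED
|
(hunk not captured in this extract; the file list above records +1 -1)
|
package/plugins/babel-fish/skills/audio/SKILL.md
CHANGED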
|
@@ -34,8 +34,11 @@ Includes ready-to-run Python scripts. Claude writes them to a temp file and exec
|
|
|
34
34
|
| Hardcode API key in script | Leaks credentials to git history | Security incident |
|
|
35
35
|
| Skip voice selection | Default voice may not match content tone | Wasted credits on re-gen |
|
|
36
36
|
| Generate full podcast without preview | Long audio = expensive; mistakes compound | Non-refundable credits |
|
|
37
|
-
| Use `eleven_v3` for everything | 5,000 char limit — wrong for long-form | Truncated audio |
|
|
38
37
|
| Import pydub for concatenation | Broken on Python 3.13+ (audioop removed) | Runtime crash |
|
|
38
|
+
| Use VoiceSettings with cloned voices | Custom settings destabilize cloned voices | Garbled/robotic audio |
|
|
39
|
+
| Use `...` for pauses | Causes hesitation/nervousness artifacts | Unnatural stuttering |
|
|
40
|
+
| Use large chunks for long content | Quality degrades in second half | Robotic pacing |
|
|
41
|
+
| Skip `language_code` with accented speakers | Model guesses language from accent | Chinese/French mid-narration |
|
|
39
42
|
|
|
40
43
|
## Phase 0: Environment Setup
|
|
41
44
|
|
|
@@ -63,11 +66,9 @@ import os
|
|
|
63
66
|
|
|
64
67
|
def get_api_key() -> str:
|
|
65
68
|
"""Resolve ElevenLabs API key from CLI store, env var, or fail."""
|
|
66
|
-
# 1. CLI stored key
|
|
67
69
|
key_file = os.path.expanduser("~/.elevenlabs/api_key")
|
|
68
70
|
if os.path.exists(key_file):
|
|
69
71
|
return open(key_file).read().strip()
|
|
70
|
-
# 2. Environment variable
|
|
71
72
|
key = os.environ.get("ELEVENLABS_API_KEY", "")
|
|
72
73
|
if key:
|
|
73
74
|
return key
|
|
@@ -80,17 +81,15 @@ def get_api_key() -> str:
|
|
|
80
81
|
### Step 2: Install SDK (if needed)
|
|
81
82
|
|
|
82
83
|
```bash
|
|
83
|
-
# Check if installed
|
|
84
84
|
python3 -c "import elevenlabs" 2>/dev/null || uv pip install --system --break-system-packages elevenlabs
|
|
85
85
|
```
|
|
86
86
|
|
|
87
87
|
**IMPORTANT:** Do NOT install pydub. It's broken on Python 3.13+ (audioop removed). The scripts
|
|
88
|
-
below use raw MP3 byte concatenation
|
|
88
|
+
below use raw MP3 byte concatenation — MP3 is a frame-based format and files can be
|
|
89
89
|
concatenated directly.
|
|
90
90
|
|
|
91
91
|
**IMPORTANT:** On Python 3.14+, `client.text_to_speech.convert()` returns a **generator**, not
|
|
92
|
-
bytes. All scripts below use a `to_bytes()` helper to normalize this.
|
|
93
|
-
directly — always wrap with `to_bytes(audio)` first.
|
|
92
|
+
bytes. All scripts below use a `to_bytes()` helper to normalize this.
|
|
94
93
|
|
|
95
94
|
### Step 3: Verify Connection
|
|
96
95
|
|
|
@@ -111,16 +110,13 @@ for v in voices.voices[:10]:
|
|
|
111
110
|
```
|
|
112
111
|
|
|
113
112
|
**CRITICAL:** Voice IDs are **account-specific**. Never hardcode voice IDs from examples or
|
|
114
|
-
documentation — always run Step 3 first to discover the actual IDs available
|
|
115
|
-
account. The same voice name (e.g., "Alice") may have a different ID across accounts.
|
|
113
|
+
documentation — always run Step 3 first to discover the actual IDs available.
|
|
116
114
|
|
|
117
115
|
**Exit:** Auth verified, SDK installed, voices listed.
|
|
118
116
|
|
|
119
117
|
## Phase 1: Quick Text-to-Speech
|
|
120
118
|
|
|
121
|
-
**Entry:** User wants a single audio file from text (<
|
|
122
|
-
|
|
123
|
-
Write this script to a temp file and execute:
|
|
119
|
+
**Entry:** User wants a single audio file from text (< 5,000 chars).
|
|
124
120
|
|
|
125
121
|
```python
|
|
126
122
|
#!/usr/bin/env python3
|
|
@@ -128,16 +124,12 @@ Write this script to a temp file and execute:
|
|
|
128
124
|
import os
|
|
129
125
|
from elevenlabs.client import ElevenLabs
|
|
130
126
|
|
|
131
|
-
# --- CONFIG (Claude fills these — run Phase 0 Step 3 to list voice IDs) ---
|
|
132
127
|
TEXT = """Your text here."""
|
|
133
|
-
VOICE_ID = "FILL_FROM_VOICE_LIST"
|
|
134
|
-
MODEL_ID = "eleven_multilingual_v2"
|
|
135
|
-
OUTPUT_FORMAT = "mp3_44100_128"
|
|
128
|
+
VOICE_ID = "FILL_FROM_VOICE_LIST"
|
|
129
|
+
MODEL_ID = "eleven_multilingual_v2"
|
|
136
130
|
OUTPUT_PATH = "output.mp3"
|
|
137
|
-
# --- END CONFIG ---
|
|
138
131
|
|
|
139
132
|
def to_bytes(audio) -> bytes:
|
|
140
|
-
"""Normalize convert() output — returns bytes on <3.14, generator on >=3.14."""
|
|
141
133
|
return audio if isinstance(audio, bytes) else b"".join(audio)
|
|
142
134
|
|
|
143
135
|
key_file = os.path.expanduser("~/.elevenlabs/api_key")
|
|
@@ -149,338 +141,365 @@ audio = to_bytes(client.text_to_speech.convert(
|
|
|
149
141
|
text=TEXT,
|
|
150
142
|
voice_id=VOICE_ID,
|
|
151
143
|
model_id=MODEL_ID,
|
|
152
|
-
output_format=
|
|
144
|
+
output_format="mp3_44100_128",
|
|
145
|
+
language_code="en", # ALWAYS set for cloned/accented voices
|
|
153
146
|
))
|
|
154
147
|
|
|
155
148
|
with open(OUTPUT_PATH, "wb") as f:
|
|
156
149
|
f.write(audio)
|
|
157
|
-
|
|
158
|
-
size_kb = os.path.getsize(OUTPUT_PATH) / 1024
|
|
159
|
-
print(f"Saved to {OUTPUT_PATH} ({size_kb:.0f} KB)")
|
|
150
|
+
print(f"Saved to {OUTPUT_PATH} ({os.path.getsize(OUTPUT_PATH) / 1024:.0f} KB)")
|
|
160
151
|
```
|
|
161
152
|
|
|
162
|
-
**Exit:** Audio file saved
|
|
153
|
+
**Exit:** Audio file saved.
|
|
163
154
|
|
|
164
|
-
## Phase 2:
|
|
155
|
+
## Phase 2: Long-Form Narration (Blog Posts, Articles)
|
|
165
156
|
|
|
166
|
-
**Entry:** User wants
|
|
157
|
+
**Entry:** User wants narration of long-form content (> 5,000 chars).
|
|
167
158
|
|
|
168
|
-
|
|
159
|
+
**THIS IS THE CRITICAL PHASE.** Long-form audio requires special handling to maintain
|
|
160
|
+
quality throughout. The approach below was battle-tested and is the only one that
|
|
161
|
+
produces consistent quality across 10+ minute narrations.
|
|
169
162
|
|
|
170
|
-
|
|
171
|
-
generates a short silent audio clip via the API once and reuses it.
|
|
163
|
+
### Step 1: Prepare Speech Text
|
|
172
164
|
|
|
173
|
-
|
|
165
|
+
Create a separate `speech-text.md` adapted for listening:
|
|
174
166
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
167
|
+
| Written form | Speech form | Why |
|
|
168
|
+
|-------------|-------------|-----|
|
|
169
|
+
| `90%` | `ninety percent` | TTS mispronounces digits |
|
|
170
|
+
| `1.7 times` | `one point seven times` | Same |
|
|
171
|
+
| `2 AM` | `two in the morning` | Natural speech |
|
|
172
|
+
| `Kačka` | `Kachka` | Phonetic for TTS |
|
|
173
|
+
| `Žaneta` | `Zhaneta` | Phonetic for TTS |
|
|
174
|
+
| `Aibility` | `Eigh-bility` | Phonetic — write directly in text |
|
|
175
|
+
| `**bold text**` | `bold text` | Strip all markdown |
|
|
176
|
+
| `---` | *(remove)* | Strip section breaks |
|
|
178
177
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
""
|
|
182
|
-
|
|
183
|
-
|
|
178
|
+
**Pause control:**
|
|
179
|
+
- `<break time="0.7s" />` — sub-section pause (v2 supports SSML break tags)
|
|
180
|
+
- `<break time="1.0s" />` — major section transition
|
|
181
|
+
- `<break time="1.2s" />` — thesis/key moment (max recommended)
|
|
182
|
+
- **NEVER use `...`** — causes hesitation/nervousness artifacts
|
|
183
|
+
- **NEVER use more than 5-6 break tags total** — too many cause instability
|
|
184
|
+
- Let paragraph breaks and short sentences create natural pacing
|
|
185
|
+
|
|
186
|
+
**What NOT to do:**
|
|
187
|
+
- Don't add verbal filler ("Hey", "So look", "OK so") — sounds like a podcast host
|
|
188
|
+
- Don't over-break sentences into fragments — the model handles natural sentence rhythm fine
|
|
189
|
+
- Don't use `<lexeme>` tags — they get read aloud as text
|
|
190
|
+
- Don't rely on pronunciation dictionaries — they silently fail with some model/voice combos.
|
|
191
|
+
Write pronunciation phonetically directly in the text instead.
|
|
192
|
+
|
|
193
|
+
### Step 2: Generate with Request Stitching
|
|
184
194
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
195
|
+
**Why this approach:** Large chunks (4000+ chars) degrade in quality — the model loses
|
|
196
|
+
emotional range and natural pacing in the second half. Small chunks (800-1200 chars)
|
|
197
|
+
stay high quality. Request stitching chains them together for continuity.
|
|
188
198
|
|
|
189
|
-
|
|
199
|
+
**CRITICAL for cloned voices:**
|
|
200
|
+
- **`language_code="en"` is mandatory** — without it, the model guesses language from
|
|
201
|
+
accent and can switch to Chinese/French mid-narration
|
|
202
|
+
- **Do NOT pass VoiceSettings** — default settings produce the best results with cloned
|
|
203
|
+
voices. Every custom setting tested made it worse (garbled, robotic, unnatural)
|
|
190
204
|
|
|
191
|
-
|
|
205
|
+
```python
|
|
206
|
+
#!/usr/bin/env python3
|
|
207
|
+
"""ElevenLabs long-form narration with request stitching.
|
|
208
|
+
|
|
209
|
+
Splits text into small chunks, chains via previous_request_ids for
|
|
210
|
+
continuity, uses httpx directly to access request-id headers.
|
|
192
211
|
"""
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
212
|
+
import os
|
|
213
|
+
import httpx
|
|
214
|
+
|
|
215
|
+
# --- CONFIG ---
|
|
216
|
+
SPEECH_TEXT_PATH = "speech-text.md"
|
|
217
|
+
VOICE_ID = "FILL_FROM_VOICE_LIST"
|
|
218
|
+
OUTPUT_PATH = "speech.mp3"
|
|
219
|
+
CHUNK_SIZE = 1000 # chars per chunk — keep 800-1200 for quality
|
|
220
|
+
LANGUAGE_CODE = "en" # ALWAYS set for cloned/accented voices
|
|
197
221
|
# --- END CONFIG ---
|
|
198
222
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
return audio if isinstance(audio, bytes) else b"".join(audio)
|
|
223
|
+
api_key_file = os.path.expanduser("~/.elevenlabs/api_key")
|
|
224
|
+
api_key = open(api_key_file).read().strip() if os.path.exists(api_key_file) else os.environ["ELEVENLABS_API_KEY"]
|
|
202
225
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
client = ElevenLabs(api_key=api_key)
|
|
226
|
+
with open(SPEECH_TEXT_PATH, "r") as f:
|
|
227
|
+
text = f.read()
|
|
206
228
|
|
|
207
|
-
# Split
|
|
208
|
-
paragraphs =
|
|
229
|
+
# Split into small chunks at paragraph boundaries
|
|
230
|
+
paragraphs = text.split("\n\n")
|
|
209
231
|
chunks, current = [], ""
|
|
210
|
-
for
|
|
211
|
-
if len(current) + len(
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
current = para
|
|
232
|
+
for p in paragraphs:
|
|
233
|
+
if len(current) + len(p) + 2 > CHUNK_SIZE and current.strip():
|
|
234
|
+
chunks.append(current.strip())
|
|
235
|
+
current = p
|
|
215
236
|
else:
|
|
216
|
-
current = f"{current}\n\n{
|
|
217
|
-
if current:
|
|
218
|
-
chunks.append(current)
|
|
237
|
+
current = f"{current}\n\n{p}" if current else p
|
|
238
|
+
if current.strip():
|
|
239
|
+
chunks.append(current.strip())
|
|
219
240
|
|
|
220
|
-
print(f"Script: {len(
|
|
241
|
+
print(f"Script: {len(text)} chars -> {len(chunks)} chunks")
|
|
242
|
+
for i, c in enumerate(chunks):
|
|
243
|
+
print(f" Chunk {i+1}: {len(c)} chars")
|
|
221
244
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
output_format="mp3_44100_128",
|
|
228
|
-
))
|
|
245
|
+
url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
|
|
246
|
+
headers = {"xi-api-key": api_key, "Content-Type": "application/json"}
|
|
247
|
+
|
|
248
|
+
all_audio = b""
|
|
249
|
+
prev_request_id = None
|
|
229
250
|
|
|
230
|
-
# Generate and concatenate
|
|
231
|
-
audio_parts = []
|
|
232
251
|
for i, chunk in enumerate(chunks):
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
252
|
+
data = {
|
|
253
|
+
"text": chunk,
|
|
254
|
+
"model_id": "eleven_multilingual_v2",
|
|
255
|
+
"output_format": "mp3_44100_128",
|
|
256
|
+
"language_code": LANGUAGE_CODE,
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
# Chain to previous chunk for prosody continuity
|
|
260
|
+
if prev_request_id:
|
|
261
|
+
data["previous_request_ids"] = [prev_request_id]
|
|
262
|
+
|
|
263
|
+
# Give forward context from next chunk
|
|
264
|
+
if i + 1 < len(chunks):
|
|
265
|
+
data["next_text"] = chunks[i + 1][:500]
|
|
266
|
+
|
|
267
|
+
print(f" [{i+1}/{len(chunks)}] {len(chunk)} chars...", end=" ", flush=True)
|
|
268
|
+
|
|
269
|
+
resp = httpx.post(url, json=data, headers=headers, timeout=60)
|
|
270
|
+
if resp.status_code != 200:
|
|
271
|
+
print(f"ERROR {resp.status_code}: {resp.text[:200]}")
|
|
272
|
+
break
|
|
273
|
+
|
|
274
|
+
prev_request_id = resp.headers.get("request-id")
|
|
275
|
+
all_audio += resp.content
|
|
276
|
+
print(f"done ({len(resp.content)//1024} KB)")
|
|
244
277
|
|
|
245
278
|
with open(OUTPUT_PATH, "wb") as f:
|
|
246
|
-
|
|
247
|
-
f.write(part)
|
|
279
|
+
f.write(all_audio)
|
|
248
280
|
|
|
249
281
|
size_mb = os.path.getsize(OUTPUT_PATH) / (1024 * 1024)
|
|
250
282
|
print(f"\nSaved to {OUTPUT_PATH} ({size_mb:.1f} MB)")
|
|
251
283
|
```
|
|
252
284
|
|
|
253
|
-
|
|
285
|
+
**Always test first:** Generate chunks 1-2 as a preview clip before committing to
|
|
286
|
+
the full generation. Credits are non-refundable.
|
|
287
|
+
|
|
288
|
+
### Step 3: Review and Iterate
|
|
289
|
+
|
|
290
|
+
Listen to the full audio. If specific sections sound off:
|
|
291
|
+
- Regenerate only that chunk using `previous_request_ids` (from the preceding chunk)
|
|
292
|
+
and `next_request_ids` (from the following chunk) to maintain flow
|
|
293
|
+
- Request IDs expire after 2 hours — regenerate within that window
|
|
294
|
+
|
|
295
|
+
**Exit:** Long-form narration audio saved.
|
|
296
|
+
|
|
297
|
+
## Phase 3: Voice Cloning
|
|
298
|
+
|
|
299
|
+
**Entry:** User wants a custom voice from their audio.
|
|
300
|
+
|
|
301
|
+
### Recording Requirements
|
|
302
|
+
|
|
303
|
+
| Requirement | Details |
|
|
304
|
+
|------------|---------|
|
|
305
|
+
| Duration | **1-2 minutes** (more than 3 min can be detrimental) |
|
|
306
|
+
| Content | Read your own writing — natural intonation matches best |
|
|
307
|
+
| Quality | Quiet room, no background noise, consistent distance from mic |
|
|
308
|
+
| Format | MP3 128kbps or higher, mono or stereo |
|
|
309
|
+
| Style | Consistent pace and tone — the clone replicates EVERYTHING |
|
|
310
|
+
| Avoid | Stumbles, "uhm"s, long pauses, whispers, shouting, music |
|
|
311
|
+
|
|
312
|
+
**CRITICAL:** Do NOT pre-process the recording with ffmpeg filters (silenceremove,
|
|
313
|
+
loudnorm, etc.). These strip voice characteristics the clone needs. The only
|
|
314
|
+
acceptable preprocessing is trimming to length.
|
|
315
|
+
|
|
316
|
+
### Instant Voice Clone
|
|
254
317
|
|
|
255
318
|
```python
|
|
256
|
-
|
|
257
|
-
"""ElevenLabs multi-voice podcast generator.
|
|
319
|
+
from elevenlabs import ElevenLabs
|
|
258
320
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
321
|
+
client = ElevenLabs(api_key=get_api_key())
|
|
322
|
+
|
|
323
|
+
voice = client.voices.ivc.create(
|
|
324
|
+
name="User Voice",
|
|
325
|
+
description="Natural speaking voice for narration",
|
|
326
|
+
files=[open("recording.mp3", "rb")],
|
|
327
|
+
remove_background_noise=False, # Preserve voice characteristics
|
|
328
|
+
)
|
|
329
|
+
print(f"Voice ID: {voice.voice_id}")
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
**After cloning, ALWAYS test with a short clip before generating long content:**
|
|
333
|
+
|
|
334
|
+
```python
|
|
335
|
+
audio_gen = client.text_to_speech.convert(
|
|
336
|
+
text="A short test sentence to verify the voice sounds right.",
|
|
337
|
+
voice_id=voice.voice_id,
|
|
338
|
+
model_id="eleven_multilingual_v2",
|
|
339
|
+
output_format="mp3_44100_128",
|
|
340
|
+
language_code="en",
|
|
341
|
+
# DO NOT pass voice_settings — defaults are best for clones
|
|
342
|
+
)
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### Model Compatibility with Cloned Voices
|
|
346
|
+
|
|
347
|
+
| Model | Works with clones? | Notes |
|
|
348
|
+
|-------|-------------------|-------|
|
|
349
|
+
| `eleven_multilingual_v2` | **YES** — use this | Best voice fidelity with clones |
|
|
350
|
+
| `eleven_v3` | **NO** | Smooth output but voice identity completely lost |
|
|
351
|
+
| `eleven_flash_v2_5` | Untested | May work, lower quality expected |
|
|
352
|
+
| `eleven_turbo_v2_5` | Untested | May work |
|
|
353
|
+
|
|
354
|
+
### Voice Settings with Clones
|
|
355
|
+
|
|
356
|
+
**Do NOT override VoiceSettings for cloned voices.** Default settings produce the
|
|
357
|
+
best results. Every combination tested (stability 0.3-0.8, similarity 0.5-1.0,
|
|
358
|
+
style 0.3-0.7, speaker boost on/off) made the output worse — garbled, robotic,
|
|
359
|
+
or unnatural pacing.
|
|
360
|
+
|
|
361
|
+
If you must tweak, test with a single sentence first and compare to the no-settings
|
|
362
|
+
version before committing to a full generation.
|
|
363
|
+
|
|
364
|
+
**Exit:** Custom voice created and tested.
|
|
365
|
+
|
|
366
|
+
## Phase 4: Single-Voice Podcast
|
|
367
|
+
|
|
368
|
+
**Entry:** User wants podcast-style audio (single voice, long content).
|
|
369
|
+
|
|
370
|
+
Use the **Phase 2 Long-Form Narration** approach with request stitching.
|
|
371
|
+
The old approach (4500-char chunks with `previous_text`) produces lower
|
|
372
|
+
quality than small chunks with `previous_request_ids`.
|
|
373
|
+
|
|
374
|
+
## Phase 5: Multi-Voice Podcast (Dialogue)
|
|
375
|
+
|
|
376
|
+
```python
|
|
377
|
+
#!/usr/bin/env python3
|
|
378
|
+
"""ElevenLabs multi-voice podcast generator."""
|
|
262
379
|
import os
|
|
263
380
|
from elevenlabs.client import ElevenLabs
|
|
264
381
|
|
|
265
|
-
# --- CONFIG (Claude fills these — run Phase 0 Step 3 to list voice IDs) ---
|
|
266
382
|
SEGMENTS = [
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
("VOICE_ID_HOST", "Welcome to the show. Today we're talking about..."),
|
|
270
|
-
("VOICE_ID_GUEST", "Thanks for having me. Let's dive into the science."),
|
|
271
|
-
("VOICE_ID_HOST", "So how does this actually work?"),
|
|
272
|
-
("VOICE_ID_GUEST", "Great question. It starts with..."),
|
|
383
|
+
("VOICE_ID_HOST", "Welcome to the show..."),
|
|
384
|
+
("VOICE_ID_GUEST", "Thanks for having me..."),
|
|
273
385
|
]
|
|
274
386
|
MODEL_ID = "eleven_multilingual_v2"
|
|
275
387
|
OUTPUT_PATH = "dialogue-podcast.mp3"
|
|
276
|
-
# --- END CONFIG ---
|
|
277
388
|
|
|
278
389
|
def to_bytes(audio) -> bytes:
|
|
279
|
-
"""Normalize convert() output — returns bytes on <3.14, generator on >=3.14."""
|
|
280
390
|
return audio if isinstance(audio, bytes) else b"".join(audio)
|
|
281
391
|
|
|
282
|
-
|
|
283
|
-
VOICE_NAMES = {}
|
|
284
|
-
|
|
285
|
-
key_file = os.path.expanduser("~/.elevenlabs/api_key")
|
|
286
|
-
api_key = open(key_file).read().strip() if os.path.exists(key_file) else os.environ["ELEVENLABS_API_KEY"]
|
|
287
|
-
client = ElevenLabs(api_key=api_key)
|
|
288
|
-
|
|
289
|
-
# Resolve voice names for logging
|
|
290
|
-
try:
|
|
291
|
-
voices = client.voices.get_all()
|
|
292
|
-
VOICE_NAMES = {v.voice_id: v.name for v in voices.voices}
|
|
293
|
-
except Exception:
|
|
294
|
-
pass
|
|
295
|
-
|
|
296
|
-
# Generate silence for pauses
|
|
297
|
-
silence = to_bytes(client.text_to_speech.convert(
|
|
298
|
-
text="...",
|
|
299
|
-
voice_id=SEGMENTS[0][0],
|
|
300
|
-
model_id=MODEL_ID,
|
|
301
|
-
output_format="mp3_44100_128",
|
|
302
|
-
))
|
|
303
|
-
|
|
304
|
-
print(f"Generating {len(SEGMENTS)} segments...")
|
|
392
|
+
client = ElevenLabs(api_key=get_api_key())
|
|
305
393
|
|
|
306
394
|
audio_parts = []
|
|
307
395
|
for i, (voice_id, text) in enumerate(SEGMENTS):
|
|
308
|
-
|
|
309
|
-
preview = text[:60].replace("\n", " ")
|
|
310
|
-
print(f" [{i+1}/{len(SEGMENTS)}] {name}: {preview}...")
|
|
311
|
-
|
|
396
|
+
print(f" [{i+1}/{len(SEGMENTS)}] {text[:50]}...")
|
|
312
397
|
audio_bytes = to_bytes(client.text_to_speech.convert(
|
|
313
398
|
text=text,
|
|
314
399
|
voice_id=voice_id,
|
|
315
400
|
model_id=MODEL_ID,
|
|
316
401
|
output_format="mp3_44100_128",
|
|
402
|
+
language_code="en",
|
|
317
403
|
))
|
|
318
404
|
audio_parts.append(audio_bytes)
|
|
319
|
-
if i < len(SEGMENTS) - 1:
|
|
320
|
-
audio_parts.append(silence)
|
|
321
405
|
|
|
322
406
|
with open(OUTPUT_PATH, "wb") as f:
|
|
323
407
|
for part in audio_parts:
|
|
324
408
|
f.write(part)
|
|
325
409
|
|
|
326
|
-
|
|
327
|
-
print(f"\nSaved to {OUTPUT_PATH} ({size_mb:.1f} MB)")
|
|
410
|
+
print(f"Saved to {OUTPUT_PATH}")
|
|
328
411
|
```
|
|
329
412
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
## Phase 3: Voice Cloning
|
|
333
|
-
|
|
334
|
-
**Entry:** User wants a custom voice from audio samples.
|
|
335
|
-
|
|
336
|
-
### Instant Voice Clone (1-5 min of audio)
|
|
337
|
-
|
|
338
|
-
```python
|
|
339
|
-
voice = client.clone(
|
|
340
|
-
name="My Custom Voice",
|
|
341
|
-
description="Professional male, mid-30s, neutral accent",
|
|
342
|
-
files=["sample1.mp3", "sample2.mp3"],
|
|
343
|
-
)
|
|
344
|
-
print(f"Cloned voice ID: {voice.voice_id}")
|
|
345
|
-
```
|
|
346
|
-
|
|
347
|
-
### Voice Design (Generate New Voice)
|
|
348
|
-
|
|
349
|
-
```python
|
|
350
|
-
audio = client.text_to_speech.convert(
|
|
351
|
-
text="Testing a designed voice.",
|
|
352
|
-
voice_id="custom",
|
|
353
|
-
model_id="eleven_multilingual_v2",
|
|
354
|
-
)
|
|
355
|
-
```
|
|
356
|
-
|
|
357
|
-
**Exit:** Custom voice created and tested.
|
|
358
|
-
|
|
359
|
-
## Phase 4: Sound Effects
|
|
413
|
+
## Phase 6: Sound Effects
|
|
360
414
|
|
|
361
415
|
```python
|
|
362
416
|
audio = client.text_to_sound_effects.convert(
|
|
363
417
|
text="Heavy rain on a tin roof with distant thunder",
|
|
364
418
|
duration_seconds=10.0,
|
|
365
419
|
)
|
|
366
|
-
|
|
367
420
|
with open("rain.mp3", "wb") as f:
|
|
368
|
-
f.write(audio)
|
|
421
|
+
f.write(to_bytes(audio))
|
|
369
422
|
```
|
|
370
423
|
|
|
371
|
-
Tips: be specific ("footsteps on gravel" > "walking sounds"), include environment
|
|
424
|
+
Tips: be specific ("footsteps on gravel" > "walking sounds"), include environment, specify duration.
|
|
372
425
|
|
|
373
|
-
## Phase
|
|
426
|
+
## Phase 7: Speech-to-Speech (Voice Transform)
|
|
374
427
|
|
|
375
428
|
```python
|
|
376
429
|
with open("input.mp3", "rb") as f:
|
|
377
430
|
input_audio = f.read()
|
|
378
431
|
|
|
379
|
-
transformed = client.speech_to_speech.convert(
|
|
432
|
+
transformed = to_bytes(client.speech_to_speech.convert(
|
|
380
433
|
audio=input_audio,
|
|
381
434
|
voice_id="target_voice_id",
|
|
382
435
|
model_id="eleven_english_sts_v2",
|
|
383
|
-
)
|
|
384
|
-
|
|
436
|
+
))
|
|
385
437
|
with open("transformed.mp3", "wb") as f:
|
|
386
438
|
f.write(transformed)
|
|
387
439
|
```
|
|
388
440
|
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
## Phase 6: Audio Isolation (Noise Removal)
|
|
441
|
+
## Phase 8: Audio Isolation (Noise Removal)
|
|
392
442
|
|
|
393
443
|
```python
|
|
394
444
|
with open("noisy.mp3", "rb") as f:
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
clean = client.audio_isolation.audio_isolation(audio=noisy_audio)
|
|
398
|
-
|
|
445
|
+
clean = to_bytes(client.audio_isolation.audio_isolation(audio=f.read()))
|
|
399
446
|
with open("clean.mp3", "wb") as f:
|
|
400
447
|
f.write(clean)
|
|
401
448
|
```
|
|
402
449
|
|
|
403
|
-
## Phase 7: Dubbing / Translation
|
|
404
|
-
|
|
405
|
-
```python
|
|
406
|
-
result = client.dubbing.dub_a_video_or_an_audio_file(
|
|
407
|
-
file=open("video.mp4", "rb"),
|
|
408
|
-
target_lang="es",
|
|
409
|
-
source_lang="en",
|
|
410
|
-
)
|
|
411
|
-
dubbing_id = result.dubbing_id
|
|
412
|
-
|
|
413
|
-
# Poll for completion
|
|
414
|
-
import time
|
|
415
|
-
while True:
|
|
416
|
-
status = client.dubbing.get_dubbing_project_metadata(dubbing_id)
|
|
417
|
-
if status.status == "dubbed":
|
|
418
|
-
break
|
|
419
|
-
print(f"Status: {status.status}...")
|
|
420
|
-
time.sleep(10)
|
|
421
|
-
|
|
422
|
-
dubbed = client.dubbing.get_dubbed_file(dubbing_id, target_lang="es")
|
|
423
|
-
with open("dubbed_es.mp4", "wb") as f:
|
|
424
|
-
f.write(dubbed)
|
|
425
|
-
```
|
|
426
|
-
|
|
427
450
|
## CLI Quick Reference
|
|
428
451
|
|
|
429
|
-
When the ElevenLabs CLI (`elevenlabs`) is installed and authenticated:
|
|
430
|
-
|
|
431
452
|
```bash
|
|
432
|
-
# Auth
|
|
433
453
|
elevenlabs auth login # Interactive API key setup
|
|
434
454
|
elevenlabs auth whoami --no-ui # Check status
|
|
435
455
|
elevenlabs auth logout # Remove stored key
|
|
436
|
-
|
|
437
|
-
# Agents (conversational AI)
|
|
438
|
-
elevenlabs agents init # Init project
|
|
439
|
-
elevenlabs agents add "My Agent" # Create agent
|
|
440
|
-
elevenlabs agents push # Deploy to ElevenLabs
|
|
441
|
-
elevenlabs agents list --no-ui # List agents
|
|
442
|
-
|
|
443
|
-
# The CLI is focused on agent management, NOT TTS.
|
|
444
|
-
# For TTS/podcast/audio generation, use the Python SDK (this skill).
|
|
445
456
|
```
|
|
446
457
|
|
|
458
|
+
The CLI is focused on agent management, NOT TTS. For TTS, use the Python SDK.
|
|
459
|
+
|
|
447
460
|
## Model Selection
|
|
448
461
|
|
|
449
|
-
| Model ID | Best For | Char Limit | Latency |
|
|
450
|
-
|
|
451
|
-
| `
|
|
452
|
-
| `
|
|
453
|
-
| `eleven_flash_v2_5` | Ultra-low latency | 40,000 | ~75ms |
|
|
454
|
-
| `eleven_turbo_v2_5` | Quality + speed | 40,000 | ~250ms |
|
|
462
|
+
| Model ID | Best For | Char Limit | Latency | Clone Support |
|
|
463
|
+
|----------|----------|------------|---------|---------------|
|
|
464
|
+
| `eleven_multilingual_v2` | **Long-form, cloned voices** | 10,000 | Standard | **YES** |
|
|
465
|
+
| `eleven_v3` | Dramatic, expressive (stock voices) | 5,000 | ~300ms | NO — loses identity |
|
|
466
|
+
| `eleven_flash_v2_5` | Ultra-low latency | 40,000 | ~75ms | Untested |
|
|
467
|
+
| `eleven_turbo_v2_5` | Quality + speed | 40,000 | ~250ms | Untested |
|
|
455
468
|
|
|
456
469
|
```
|
|
457
|
-
|
|
458
|
-
├─ Yes →
|
|
470
|
+
Using a cloned voice?
|
|
471
|
+
├─ Yes → eleven_multilingual_v2 (only reliable option)
|
|
459
472
|
└─ No → Content > 5,000 chars?
|
|
460
473
|
├─ Yes → eleven_multilingual_v2
|
|
461
474
|
└─ No → Need dramatic delivery?
|
|
462
475
|
├─ Yes → eleven_v3
|
|
463
|
-
└─ No →
|
|
476
|
+
└─ No → Need low latency?
|
|
477
|
+
├─ Yes → eleven_flash_v2_5
|
|
478
|
+
└─ No → eleven_turbo_v2_5
|
|
464
479
|
```
|
|
465
480
|
|
|
466
|
-
##
|
|
481
|
+
## Pause & Pronunciation Control
|
|
467
482
|
|
|
468
|
-
|
|
469
|
-
|--------|-----------|------------|----------|
|
|
470
|
-
| Stable narration | 0.8 | 0.75 | Podcasts, audiobooks |
|
|
471
|
-
| Expressive | 0.3 | 0.85 | Dramatic reading |
|
|
472
|
-
| Balanced | 0.5 | 0.5 | General purpose |
|
|
483
|
+
### Pauses
|
|
473
484
|
|
|
474
|
-
|
|
475
|
-
|
|
485
|
+
| Method | Works? | Notes |
|
|
486
|
+
|--------|--------|-------|
|
|
487
|
+
| `<break time="0.7s" />` | **YES** (v2 only) | SSML break tag, up to 3s. Use sparingly (max 5-6 per generation) |
|
|
488
|
+
| Paragraph breaks | **YES** | Natural, reliable, no cost |
|
|
489
|
+
| Short sentences | **YES** | Best method — rhythm from writing |
|
|
490
|
+
| `...` ellipsis | **NO** | Causes hesitation/nervousness artifacts |
|
|
491
|
+
| Multiple dashes `-- --` | Somewhat | Inconsistent |
|
|
476
492
|
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
493
|
+
### Pronunciation
|
|
494
|
+
|
|
495
|
+
| Method | Works? | Notes |
|
|
496
|
+
|--------|--------|-------|
|
|
497
|
+
| Phonetic spelling in text | **YES** | Most reliable: "Eigh-bility" instead of "Aibility" |
|
|
498
|
+
| Pronunciation dictionary API | **UNRELIABLE** | Silently ignored with some model/voice combos |
|
|
499
|
+
| `<lexeme>` tags in text | **NO** | Read aloud as text |
|
|
500
|
+
| `<phoneme>` SSML tags | v2: NO, Flash v2: YES | Only works with specific models |
|
|
501
|
+
|
|
502
|
+
**Rule: Always use phonetic spelling directly in the speech text.** Don't rely on dictionaries or SSML phoneme tags.
|
|
484
503
|
|
|
485
504
|
## Output Formats
|
|
486
505
|
|
|
@@ -488,47 +507,45 @@ audio = client.text_to_speech.convert(
|
|
|
488
507
|
|--------|---------|----------|
|
|
489
508
|
| `mp3_44100_128` | High | Default, general purpose |
|
|
490
509
|
| `mp3_44100_192` | Highest MP3 | Archival |
|
|
491
|
-
| `mp3_22050_32` | Low | Previews |
|
|
492
510
|
| `pcm_44100` | Lossless | Post-processing |
|
|
493
511
|
|
|
494
|
-
## Error Handling
|
|
495
|
-
|
|
496
|
-
```python
|
|
497
|
-
from elevenlabs.core import ApiError
|
|
498
|
-
|
|
499
|
-
try:
|
|
500
|
-
audio = client.text_to_speech.convert(...)
|
|
501
|
-
except ApiError as e:
|
|
502
|
-
if e.status_code == 401:
|
|
503
|
-
print("Bad API key. Run: elevenlabs auth login")
|
|
504
|
-
elif e.status_code == 429:
|
|
505
|
-
print("Rate limited. Wait and retry.")
|
|
506
|
-
elif e.status_code == 422:
|
|
507
|
-
print(f"Invalid params: {e.body}")
|
|
508
|
-
else:
|
|
509
|
-
raise
|
|
510
|
-
```
|
|
511
|
-
|
|
512
512
|
## Cost Awareness
|
|
513
513
|
|
|
514
514
|
- Characters are the billing unit — every API call costs characters
|
|
515
|
+
- **Small-chunk stitching uses ~1.5x the character count** (overhead per request)
|
|
515
516
|
- **Preview short clips first** before generating long content
|
|
516
517
|
- **Cache generated audio** — don't regenerate the same text
|
|
517
518
|
- `eleven_flash_v2_5` is 50% cheaper than other models
|
|
518
|
-
-
|
|
519
|
+
- Request IDs expire after 2 hours — regenerate within that window
|
|
520
|
+
|
|
521
|
+
## Error Handling
|
|
522
|
+
|
|
523
|
+
```python
|
|
524
|
+
# When using httpx directly (for request stitching):
|
|
525
|
+
resp = httpx.post(url, json=data, headers=headers, timeout=60)
|
|
526
|
+
if resp.status_code == 401:
|
|
527
|
+
print("Bad API key.")
|
|
528
|
+
elif resp.status_code == 400 and "quota_exceeded" in resp.text:
|
|
529
|
+
print("Out of credits.")
|
|
530
|
+
elif resp.status_code != 200:
|
|
531
|
+
print(f"Error {resp.status_code}: {resp.text[:200]}")
|
|
532
|
+
```
|
|
519
533
|
|
|
520
534
|
## Validate
|
|
521
535
|
|
|
522
536
|
- [ ] API key loaded from `~/.elevenlabs/api_key` or env var, never hardcoded
|
|
523
|
-
- [ ] Model selected matches
|
|
524
|
-
- [ ]
|
|
525
|
-
- [ ]
|
|
526
|
-
- [ ]
|
|
537
|
+
- [ ] Model selected matches voice type (v2 for clones, see model table)
|
|
538
|
+
- [ ] `language_code` set for cloned or accented voices
|
|
539
|
+
- [ ] No VoiceSettings overrides for cloned voices
|
|
540
|
+
- [ ] No `...` ellipses in speech text
|
|
541
|
+
- [ ] Speech text reviewed — numbers written out, names phonetic
|
|
542
|
+
- [ ] Test clip generated and approved before full generation
|
|
543
|
+
- [ ] For long-form: using request stitching with small chunks
|
|
527
544
|
- [ ] Audio file saved outside git-tracked directories
|
|
528
545
|
- [ ] File size and duration reported to user
|
|
529
546
|
|
|
530
547
|
## What Makes This babel-fish
|
|
531
548
|
|
|
532
|
-
- **
|
|
533
|
-
- **
|
|
534
|
-
- **
|
|
549
|
+
- **Battle-tested** — every recommendation comes from proven success or documented failure
|
|
550
|
+
- **Request stitching** — the key to consistent long-form quality
|
|
551
|
+
- **Clone-aware** — different rules for cloned vs stock voices, learned the hard way
|
package/src/utils/transform.ts
CHANGED
|
@@ -1,8 +1,73 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Transform SKILL.md content from Claude Code conventions to target tool conventions.
|
|
3
3
|
*/
|
|
4
|
+
const CODEX_COMMAND_ALIASES: Record<string, string> = {
|
|
5
|
+
"/architect": "$architect",
|
|
6
|
+
"/architecture-review": "$architecture-review",
|
|
7
|
+
"/audio": "$audio",
|
|
8
|
+
"/babel-fish:visualize": "$visualize",
|
|
9
|
+
"/brainstorm": "$brainstorm",
|
|
10
|
+
"/capture": "$capture",
|
|
11
|
+
"/coach": "$coach",
|
|
12
|
+
"/codex": "$codex",
|
|
13
|
+
"/compound": "$compound",
|
|
14
|
+
"/craft-skill": "$craft-skill",
|
|
15
|
+
"/cto": "$cto",
|
|
16
|
+
"/deep-thought:architect": "$architect",
|
|
17
|
+
"/deep-thought:architecture-review": "$architecture-review",
|
|
18
|
+
"/deep-thought:brainstorm": "$brainstorm",
|
|
19
|
+
"/deep-thought:craft-skill": "$craft-skill",
|
|
20
|
+
"/deep-thought:cto": "$cto",
|
|
21
|
+
"/deep-thought:investigate": "$investigate",
|
|
22
|
+
"/deep-thought:plan": "$plan",
|
|
23
|
+
"/deep-thought:review": "$review",
|
|
24
|
+
"/deep-thought:think": "$think",
|
|
25
|
+
"/gemini": "$gemini",
|
|
26
|
+
"/goal-checkin": "$goal-checkin",
|
|
27
|
+
"/goal-setting": "$goal-setting",
|
|
28
|
+
"/guide:capture": "$capture",
|
|
29
|
+
"/guide:codex": "$codex",
|
|
30
|
+
"/guide:gemini": "$gemini",
|
|
31
|
+
"/guide:pipeline": "$pipeline",
|
|
32
|
+
"/guide:setup": "$setup",
|
|
33
|
+
"/guide:write-post": "$write-post",
|
|
34
|
+
"/image": "$image",
|
|
35
|
+
"/investigate": "$investigate",
|
|
36
|
+
"/marvin:compound": "$compound",
|
|
37
|
+
"/marvin:quick-review": "$quick-review",
|
|
38
|
+
"/marvin:redteam": "$redteam",
|
|
39
|
+
"/marvin:review": "$review",
|
|
40
|
+
"/marvin:scaffold": "$scaffold",
|
|
41
|
+
"/marvin:test-writer": "$test-writer",
|
|
42
|
+
"/marvin:work": "$work",
|
|
43
|
+
"/pipeline": "$pipeline",
|
|
44
|
+
"/plan": "$plan",
|
|
45
|
+
"/quick-review": "$quick-review",
|
|
46
|
+
"/redteam": "$redteam",
|
|
47
|
+
"/reflect": "$reflect",
|
|
48
|
+
"/review": "$review",
|
|
49
|
+
"/scaffold": "$scaffold",
|
|
50
|
+
"/setup": "$setup",
|
|
51
|
+
"/test-writer": "$test-writer",
|
|
52
|
+
"/think": "$think",
|
|
53
|
+
"/visualize": "$visualize",
|
|
54
|
+
"/work": "$work",
|
|
55
|
+
"/write-post": "$write-post",
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
function replaceCodexCommandAliases(content: string): string {
|
|
59
|
+
let transformed = content;
|
|
60
|
+
for (const [source, target] of Object.entries(CODEX_COMMAND_ALIASES).sort(
|
|
61
|
+
([a], [b]) => b.length - a.length
|
|
62
|
+
)) {
|
|
63
|
+
const escaped = source.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
64
|
+
transformed = transformed.replace(new RegExp(`${escaped}\\b`, "g"), target);
|
|
65
|
+
}
|
|
66
|
+
return transformed;
|
|
67
|
+
}
|
|
68
|
+
|
|
4
69
|
export function transformContentForCodex(content: string): string {
|
|
5
|
-
return content
|
|
70
|
+
return replaceCodexCommandAliases(content)
|
|
6
71
|
.replace(/~\/\.claude\//g, "~/.codex/")
|
|
7
72
|
.replace(/\.claude\//g, ".codex/")
|
|
8
73
|
.replace(
|