@heart-of-gold/toolkit 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -17,7 +17,7 @@
17
17
  bunx @heart-of-gold/toolkit install --to codex
18
18
  ```
19
19
 
20
- The Codex target also applies Codex-specific wording transforms for flagship shared skills so interactive flows like `brainstorm` and `plan` more strongly encourage Codex's structured user-input UI instead of falling back to plain text when richer selection UX is available.
20
+ The Codex target also applies Codex-specific wording transforms for flagship shared skills so interactive flows like `brainstorm` and `plan` more strongly encourage Codex's structured user-input UI instead of falling back to plain text when richer selection UX is available. It also rewrites slash-command references in installed skill text — both plain (`/plan`, `/work`) and plugin-prefixed (`/deep-thought:review`, `/marvin:compound`) — to Codex-style `$...` skill invocations.
21
21
 
22
22
  ### OpenCode
23
23
  ```bash
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@heart-of-gold/toolkit",
3
- "version": "0.1.7",
3
+ "version": "0.1.8",
4
4
  "type": "module",
5
5
  "description": "Cross-platform installer for Heart of Gold skills — works with Codex, OpenCode, Pi, Claude Code, and more",
6
6
  "bin": {
@@ -34,8 +34,11 @@ Includes ready-to-run Python scripts. Claude writes them to a temp file and exec
34
34
  | Hardcode API key in script | Leaks credentials to git history | Security incident |
35
35
  | Skip voice selection | Default voice may not match content tone | Wasted credits on re-gen |
36
36
  | Generate full podcast without preview | Long audio = expensive; mistakes compound | Non-refundable credits |
37
- | Use `eleven_v3` for everything | 5,000 char limit — wrong for long-form | Truncated audio |
38
37
  | Import pydub for concatenation | Broken on Python 3.13+ (audioop removed) | Runtime crash |
38
+ | Use VoiceSettings with cloned voices | Custom settings destabilize cloned voices | Garbled/robotic audio |
39
+ | Use `...` for pauses | Causes hesitation/nervousness artifacts | Unnatural stuttering |
40
+ | Use large chunks for long content | Quality degrades in second half | Robotic pacing |
41
+ | Skip `language_code` with accented speakers | Model guesses language from accent | Chinese/French mid-narration |
39
42
 
40
43
  ## Phase 0: Environment Setup
41
44
 
@@ -63,11 +66,9 @@ import os
63
66
 
64
67
  def get_api_key() -> str:
65
68
  """Resolve ElevenLabs API key from CLI store, env var, or fail."""
66
- # 1. CLI stored key
67
69
  key_file = os.path.expanduser("~/.elevenlabs/api_key")
68
70
  if os.path.exists(key_file):
69
71
  return open(key_file).read().strip()
70
- # 2. Environment variable
71
72
  key = os.environ.get("ELEVENLABS_API_KEY", "")
72
73
  if key:
73
74
  return key
@@ -80,17 +81,15 @@ def get_api_key() -> str:
80
81
  ### Step 2: Install SDK (if needed)
81
82
 
82
83
  ```bash
83
- # Check if installed
84
84
  python3 -c "import elevenlabs" 2>/dev/null || uv pip install --system --break-system-packages elevenlabs
85
85
  ```
86
86
 
87
87
  **IMPORTANT:** Do NOT install pydub. It's broken on Python 3.13+ (audioop removed). The scripts
88
- below use raw MP3 byte concatenation instead — MP3 is a frame-based format and files can be
88
+ below use raw MP3 byte concatenation — MP3 is a frame-based format and files can be
89
89
  concatenated directly.
90
90
 
91
91
  **IMPORTANT:** On Python 3.14+, `client.text_to_speech.convert()` returns a **generator**, not
92
- bytes. All scripts below use a `to_bytes()` helper to normalize this. Never call `f.write(audio)`
93
- directly — always wrap with `to_bytes(audio)` first.
92
+ bytes. All scripts below use a `to_bytes()` helper to normalize this.
94
93
 
95
94
  ### Step 3: Verify Connection
96
95
 
@@ -111,16 +110,13 @@ for v in voices.voices[:10]:
111
110
  ```
112
111
 
113
112
  **CRITICAL:** Voice IDs are **account-specific**. Never hardcode voice IDs from examples or
114
- documentation — always run Step 3 first to discover the actual IDs available on the user's
115
- account. The same voice name (e.g., "Alice") may have a different ID across accounts.
113
+ documentation — always run Step 3 first to discover the actual IDs available.
116
114
 
117
115
  **Exit:** Auth verified, SDK installed, voices listed.
118
116
 
119
117
  ## Phase 1: Quick Text-to-Speech
120
118
 
121
- **Entry:** User wants a single audio file from text (< 10,000 chars).
122
-
123
- Write this script to a temp file and execute:
119
+ **Entry:** User wants a single audio file from text (< 5,000 chars).
124
120
 
125
121
  ```python
126
122
  #!/usr/bin/env python3
@@ -128,16 +124,12 @@ Write this script to a temp file and execute:
128
124
  import os
129
125
  from elevenlabs.client import ElevenLabs
130
126
 
131
- # --- CONFIG (Claude fills these — run Phase 0 Step 3 to list voice IDs) ---
132
127
  TEXT = """Your text here."""
133
- VOICE_ID = "FILL_FROM_VOICE_LIST" # Run voice list first!
134
- MODEL_ID = "eleven_multilingual_v2" # See model table below
135
- OUTPUT_FORMAT = "mp3_44100_128"
128
+ VOICE_ID = "FILL_FROM_VOICE_LIST"
129
+ MODEL_ID = "eleven_multilingual_v2"
136
130
  OUTPUT_PATH = "output.mp3"
137
- # --- END CONFIG ---
138
131
 
139
132
  def to_bytes(audio) -> bytes:
140
- """Normalize convert() output — returns bytes on <3.14, generator on >=3.14."""
141
133
  return audio if isinstance(audio, bytes) else b"".join(audio)
142
134
 
143
135
  key_file = os.path.expanduser("~/.elevenlabs/api_key")
@@ -149,338 +141,365 @@ audio = to_bytes(client.text_to_speech.convert(
149
141
  text=TEXT,
150
142
  voice_id=VOICE_ID,
151
143
  model_id=MODEL_ID,
152
- output_format=OUTPUT_FORMAT,
144
+ output_format="mp3_44100_128",
145
+ language_code="en", # ALWAYS set for cloned/accented voices
153
146
  ))
154
147
 
155
148
  with open(OUTPUT_PATH, "wb") as f:
156
149
  f.write(audio)
157
-
158
- size_kb = os.path.getsize(OUTPUT_PATH) / 1024
159
- print(f"Saved to {OUTPUT_PATH} ({size_kb:.0f} KB)")
150
+ print(f"Saved to {OUTPUT_PATH} ({os.path.getsize(OUTPUT_PATH) / 1024:.0f} KB)")
160
151
  ```
161
152
 
162
- **Exit:** Audio file saved, size reported.
153
+ **Exit:** Audio file saved.
163
154
 
164
- ## Phase 2: Podcast / Long-Form Audio
155
+ ## Phase 2: Long-Form Narration (Blog Posts, Articles)
165
156
 
166
- **Entry:** User wants podcast-style audio (single or multi-voice).
157
+ **Entry:** User wants narration of long-form content (> 5,000 chars).
167
158
 
168
- This is the main generator. Write it to a temp file, fill in the CONFIG section, execute.
159
+ **THIS IS THE CRITICAL PHASE.** Long-form audio requires special handling to maintain
160
+ quality throughout. The approach below was battle-tested and is the only one that
161
+ produces consistent quality across 10+ minute narrations.
169
162
 
170
- **IMPORTANT:** Uses raw MP3 byte concatenation (no pydub). For pauses between segments,
171
- generates a short silent audio clip via the API once and reuses it.
163
+ ### Step 1: Prepare Speech Text
172
164
 
173
- ### Single-Voice Podcast
165
+ Create a separate `speech-text.md` adapted for listening:
174
166
 
175
- ```python
176
- #!/usr/bin/env python3
177
- """ElevenLabs single-voice podcast generator.
167
+ | Written form | Speech form | Why |
168
+ |-------------|-------------|-----|
169
+ | `90%` | `ninety percent` | TTS mispronounces digits |
170
+ | `1.7 times` | `one point seven times` | Same |
171
+ | `2 AM` | `two in the morning` | Natural speech |
172
+ | `Kačka` | `Kachka` | Phonetic for TTS |
173
+ | `Žaneta` | `Zhaneta` | Phonetic for TTS |
174
+ | `Aibility` | `Eigh-bility` | Phonetic — write directly in text |
175
+ | `**bold text**` | `bold text` | Strip all markdown |
176
+ | `---` | *(remove)* | Strip section breaks |
178
177
 
179
- Splits long text on paragraph boundaries, generates per-chunk with
180
- previous_text continuity, concatenates MP3 bytes directly.
181
- """
182
- import os
183
- from elevenlabs.client import ElevenLabs
178
+ **Pause control:**
179
+ - `<break time="0.7s" />` — sub-section pause (v2 supports SSML break tags)
180
+ - `<break time="1.0s" />` — major section transition
181
+ - `<break time="1.2s" />` — thesis/key moment (max recommended)
182
+ - **NEVER use `...`** — causes hesitation/nervousness artifacts
183
+ - **NEVER use more than 5-6 break tags total** — too many cause instability
184
+ - Let paragraph breaks and short sentences create natural pacing
185
+
186
+ **What NOT to do:**
187
+ - Don't add verbal filler ("Hey", "So look", "OK so") — sounds like a podcast host
188
+ - Don't over-break sentences into fragments — the model handles natural sentence rhythm fine
189
+ - Don't use `<lexeme>` tags — they get read aloud as text
190
+ - Don't rely on pronunciation dictionaries — they silently fail with some model/voice combos.
191
+ Write pronunciation phonetically directly in the text instead.
192
+
193
+ ### Step 2: Generate with Request Stitching
184
194
 
185
- # --- CONFIG (Claude fills these — run Phase 0 Step 3 to list voice IDs) ---
186
- SCRIPT = """
187
- Your podcast script here.
195
+ **Why this approach:** Large chunks (4000+ chars) degrade in quality — the model loses
196
+ emotional range and natural pacing in the second half. Small chunks (800-1200 chars)
197
+ stay high quality. Request stitching chains them together for continuity.
188
198
 
189
- Split into paragraphs with blank lines.
199
+ **CRITICAL for cloned voices:**
200
+ - **`language_code="en"` is mandatory** — without it, the model guesses language from
201
+ accent and can switch to Chinese/French mid-narration
202
+ - **Do NOT pass VoiceSettings** — default settings produce the best results with cloned
203
+ voices. Every custom setting tested made it worse (garbled, robotic, unnatural)
190
204
 
191
- Each paragraph becomes natural speech.
205
+ ```python
206
+ #!/usr/bin/env python3
207
+ """ElevenLabs long-form narration with request stitching.
208
+
209
+ Splits text into small chunks, chains via previous_request_ids for
210
+ continuity, uses httpx directly to access request-id headers.
192
211
  """
193
- VOICE_ID = "FILL_FROM_VOICE_LIST" # Run voice list first!
194
- MODEL_ID = "eleven_multilingual_v2"
195
- OUTPUT_PATH = "podcast.mp3"
196
- CHUNK_SIZE = 4500 # chars per API call (leave margin under 5k/10k limit)
212
+ import os
213
+ import httpx
214
+
215
+ # --- CONFIG ---
216
+ SPEECH_TEXT_PATH = "speech-text.md"
217
+ VOICE_ID = "FILL_FROM_VOICE_LIST"
218
+ OUTPUT_PATH = "speech.mp3"
219
+ CHUNK_SIZE = 1000 # chars per chunk — keep 800-1200 for quality
220
+ LANGUAGE_CODE = "en" # ALWAYS set for cloned/accented voices
197
221
  # --- END CONFIG ---
198
222
 
199
- def to_bytes(audio) -> bytes:
200
- """Normalize convert() output — returns bytes on <3.14, generator on >=3.14."""
201
- return audio if isinstance(audio, bytes) else b"".join(audio)
223
+ api_key_file = os.path.expanduser("~/.elevenlabs/api_key")
224
+ api_key = open(api_key_file).read().strip() if os.path.exists(api_key_file) else os.environ["ELEVENLABS_API_KEY"]
202
225
 
203
- key_file = os.path.expanduser("~/.elevenlabs/api_key")
204
- api_key = open(key_file).read().strip() if os.path.exists(key_file) else os.environ["ELEVENLABS_API_KEY"]
205
- client = ElevenLabs(api_key=api_key)
226
+ with open(SPEECH_TEXT_PATH, "r") as f:
227
+ text = f.read()
206
228
 
207
- # Split on paragraph boundaries
208
- paragraphs = [p.strip() for p in SCRIPT.strip().split("\n\n") if p.strip()]
229
+ # Split into small chunks at paragraph boundaries
230
+ paragraphs = text.split("\n\n")
209
231
  chunks, current = [], ""
210
- for para in paragraphs:
211
- if len(current) + len(para) + 2 > CHUNK_SIZE:
212
- if current:
213
- chunks.append(current)
214
- current = para
232
+ for p in paragraphs:
233
+ if len(current) + len(p) + 2 > CHUNK_SIZE and current.strip():
234
+ chunks.append(current.strip())
235
+ current = p
215
236
  else:
216
- current = f"{current}\n\n{para}" if current else para
217
- if current:
218
- chunks.append(current)
237
+ current = f"{current}\n\n{p}" if current else p
238
+ if current.strip():
239
+ chunks.append(current.strip())
219
240
 
220
- print(f"Script: {len(SCRIPT)} chars -> {len(chunks)} chunks")
241
+ print(f"Script: {len(text)} chars -> {len(chunks)} chunks")
242
+ for i, c in enumerate(chunks):
243
+ print(f" Chunk {i+1}: {len(c)} chars")
221
244
 
222
- # Generate silence for pauses (one short phrase, reuse the bytes)
223
- silence = to_bytes(client.text_to_speech.convert(
224
- text="...",
225
- voice_id=VOICE_ID,
226
- model_id=MODEL_ID,
227
- output_format="mp3_44100_128",
228
- ))
245
+ url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
246
+ headers = {"xi-api-key": api_key, "Content-Type": "application/json"}
247
+
248
+ all_audio = b""
249
+ prev_request_id = None
229
250
 
230
- # Generate and concatenate
231
- audio_parts = []
232
251
  for i, chunk in enumerate(chunks):
233
- print(f" [{i+1}/{len(chunks)}] {len(chunk)} chars: {chunk[:50]}...")
234
- audio_bytes = to_bytes(client.text_to_speech.convert(
235
- text=chunk,
236
- voice_id=VOICE_ID,
237
- model_id=MODEL_ID,
238
- output_format="mp3_44100_128",
239
- previous_text=chunks[i - 1][-200:] if i > 0 else None,
240
- ))
241
- audio_parts.append(audio_bytes)
242
- if i < len(chunks) - 1:
243
- audio_parts.append(silence)
252
+ data = {
253
+ "text": chunk,
254
+ "model_id": "eleven_multilingual_v2",
255
+ "output_format": "mp3_44100_128",
256
+ "language_code": LANGUAGE_CODE,
257
+ }
258
+
259
+ # Chain to previous chunk for prosody continuity
260
+ if prev_request_id:
261
+ data["previous_request_ids"] = [prev_request_id]
262
+
263
+ # Give forward context from next chunk
264
+ if i + 1 < len(chunks):
265
+ data["next_text"] = chunks[i + 1][:500]
266
+
267
+ print(f" [{i+1}/{len(chunks)}] {len(chunk)} chars...", end=" ", flush=True)
268
+
269
+ resp = httpx.post(url, json=data, headers=headers, timeout=60)
270
+ if resp.status_code != 200:
271
+ print(f"ERROR {resp.status_code}: {resp.text[:200]}")
272
+ break
273
+
274
+ prev_request_id = resp.headers.get("request-id")
275
+ all_audio += resp.content
276
+ print(f"done ({len(resp.content)//1024} KB)")
244
277
 
245
278
  with open(OUTPUT_PATH, "wb") as f:
246
- for part in audio_parts:
247
- f.write(part)
279
+ f.write(all_audio)
248
280
 
249
281
  size_mb = os.path.getsize(OUTPUT_PATH) / (1024 * 1024)
250
282
  print(f"\nSaved to {OUTPUT_PATH} ({size_mb:.1f} MB)")
251
283
  ```
252
284
 
253
- ### Multi-Voice Podcast (Dialogue)
285
+ **Always test first:** Generate chunks 1-2 as a preview clip before committing to
286
+ the full generation. Credits are non-refundable.
287
+
288
+ ### Step 3: Review and Iterate
289
+
290
+ Listen to the full audio. If specific sections sound off:
291
+ - Regenerate only that chunk using `previous_request_ids` (from the preceding chunk)
292
+ and `next_request_ids` (from the following chunk) to maintain flow
293
+ - Request IDs expire after 2 hours — regenerate within that window
294
+
295
+ **Exit:** Long-form narration audio saved.
296
+
297
+ ## Phase 3: Voice Cloning
298
+
299
+ **Entry:** User wants a custom voice from their audio.
300
+
301
+ ### Recording Requirements
302
+
303
+ | Requirement | Details |
304
+ |------------|---------|
305
+ | Duration | **1-2 minutes** (more than 3 min can be detrimental) |
306
+ | Content | Read your own writing — natural intonation matches best |
307
+ | Quality | Quiet room, no background noise, consistent distance from mic |
308
+ | Format | MP3 128kbps or higher, mono or stereo |
309
+ | Style | Consistent pace and tone — the clone replicates EVERYTHING |
310
+ | Avoid | Stumbles, "uhm"s, long pauses, whispers, shouting, music |
311
+
312
+ **CRITICAL:** Do NOT pre-process the recording with ffmpeg filters (silenceremove,
313
+ loudnorm, etc.). These strip voice characteristics the clone needs. The only
314
+ acceptable preprocessing is trimming to length.
315
+
316
+ ### Instant Voice Clone
254
317
 
255
318
  ```python
256
- #!/usr/bin/env python3
257
- """ElevenLabs multi-voice podcast generator.
319
+ from elevenlabs import ElevenLabs
258
320
 
259
- Each segment has a voice_id and text. Generates per-segment,
260
- concatenates MP3 bytes with silence pauses between speakers.
261
- """
321
+ client = ElevenLabs(api_key=get_api_key())
322
+
323
+ voice = client.voices.ivc.create(
324
+ name="User Voice",
325
+ description="Natural speaking voice for narration",
326
+ files=[open("recording.mp3", "rb")],
327
+ remove_background_noise=False, # Preserve voice characteristics
328
+ )
329
+ print(f"Voice ID: {voice.voice_id}")
330
+ ```
331
+
332
+ **After cloning, ALWAYS test with a short clip before generating long content:**
333
+
334
+ ```python
335
+ audio_gen = client.text_to_speech.convert(
336
+ text="A short test sentence to verify the voice sounds right.",
337
+ voice_id=voice.voice_id,
338
+ model_id="eleven_multilingual_v2",
339
+ output_format="mp3_44100_128",
340
+ language_code="en",
341
+ # DO NOT pass voice_settings — defaults are best for clones
342
+ )
343
+ ```
344
+
345
+ ### Model Compatibility with Cloned Voices
346
+
347
+ | Model | Works with clones? | Notes |
348
+ |-------|-------------------|-------|
349
+ | `eleven_multilingual_v2` | **YES** — use this | Best voice fidelity with clones |
350
+ | `eleven_v3` | **NO** | Smooth output but voice identity completely lost |
351
+ | `eleven_flash_v2_5` | Untested | May work, lower quality expected |
352
+ | `eleven_turbo_v2_5` | Untested | May work |
353
+
354
+ ### Voice Settings with Clones
355
+
356
+ **Do NOT override VoiceSettings for cloned voices.** Default settings produce the
357
+ best results. Every combination tested (stability 0.3-0.8, similarity 0.5-1.0,
358
+ style 0.3-0.7, speaker boost on/off) made the output worse — garbled, robotic,
359
+ or unnatural pacing.
360
+
361
+ If you must tweak, test with a single sentence first and compare to the no-settings
362
+ version before committing to a full generation.
363
+
364
+ **Exit:** Custom voice created and tested.
365
+
366
+ ## Phase 4: Single-Voice Podcast
367
+
368
+ **Entry:** User wants podcast-style audio (single voice, long content).
369
+
370
+ Use the **Phase 2 Long-Form Narration** approach with request stitching.
371
+ The old approach (4500-char chunks with `previous_text`) produces lower
372
+ quality than small chunks with `previous_request_ids`.
373
+
374
+ ## Phase 5: Multi-Voice Podcast (Dialogue)
375
+
376
+ ```python
377
+ #!/usr/bin/env python3
378
+ """ElevenLabs multi-voice podcast generator."""
262
379
  import os
263
380
  from elevenlabs.client import ElevenLabs
264
381
 
265
- # --- CONFIG (Claude fills these — run Phase 0 Step 3 to list voice IDs) ---
266
382
  SEGMENTS = [
267
- # (voice_id, text)
268
- # Voice IDs are account-specific! Always run the voice list first.
269
- ("VOICE_ID_HOST", "Welcome to the show. Today we're talking about..."),
270
- ("VOICE_ID_GUEST", "Thanks for having me. Let's dive into the science."),
271
- ("VOICE_ID_HOST", "So how does this actually work?"),
272
- ("VOICE_ID_GUEST", "Great question. It starts with..."),
383
+ ("VOICE_ID_HOST", "Welcome to the show..."),
384
+ ("VOICE_ID_GUEST", "Thanks for having me..."),
273
385
  ]
274
386
  MODEL_ID = "eleven_multilingual_v2"
275
387
  OUTPUT_PATH = "dialogue-podcast.mp3"
276
- # --- END CONFIG ---
277
388
 
278
389
  def to_bytes(audio) -> bytes:
279
- """Normalize convert() output — returns bytes on <3.14, generator on >=3.14."""
280
390
  return audio if isinstance(audio, bytes) else b"".join(audio)
281
391
 
282
- # Voice name lookup for logging
283
- VOICE_NAMES = {}
284
-
285
- key_file = os.path.expanduser("~/.elevenlabs/api_key")
286
- api_key = open(key_file).read().strip() if os.path.exists(key_file) else os.environ["ELEVENLABS_API_KEY"]
287
- client = ElevenLabs(api_key=api_key)
288
-
289
- # Resolve voice names for logging
290
- try:
291
- voices = client.voices.get_all()
292
- VOICE_NAMES = {v.voice_id: v.name for v in voices.voices}
293
- except Exception:
294
- pass
295
-
296
- # Generate silence for pauses
297
- silence = to_bytes(client.text_to_speech.convert(
298
- text="...",
299
- voice_id=SEGMENTS[0][0],
300
- model_id=MODEL_ID,
301
- output_format="mp3_44100_128",
302
- ))
303
-
304
- print(f"Generating {len(SEGMENTS)} segments...")
392
+ client = ElevenLabs(api_key=get_api_key())
305
393
 
306
394
  audio_parts = []
307
395
  for i, (voice_id, text) in enumerate(SEGMENTS):
308
- name = VOICE_NAMES.get(voice_id, voice_id[:12])
309
- preview = text[:60].replace("\n", " ")
310
- print(f" [{i+1}/{len(SEGMENTS)}] {name}: {preview}...")
311
-
396
+ print(f" [{i+1}/{len(SEGMENTS)}] {text[:50]}...")
312
397
  audio_bytes = to_bytes(client.text_to_speech.convert(
313
398
  text=text,
314
399
  voice_id=voice_id,
315
400
  model_id=MODEL_ID,
316
401
  output_format="mp3_44100_128",
402
+ language_code="en",
317
403
  ))
318
404
  audio_parts.append(audio_bytes)
319
- if i < len(SEGMENTS) - 1:
320
- audio_parts.append(silence)
321
405
 
322
406
  with open(OUTPUT_PATH, "wb") as f:
323
407
  for part in audio_parts:
324
408
  f.write(part)
325
409
 
326
- size_mb = os.path.getsize(OUTPUT_PATH) / (1024 * 1024)
327
- print(f"\nSaved to {OUTPUT_PATH} ({size_mb:.1f} MB)")
410
+ print(f"Saved to {OUTPUT_PATH}")
328
411
  ```
329
412
 
330
- **Exit:** Podcast audio file saved.
331
-
332
- ## Phase 3: Voice Cloning
333
-
334
- **Entry:** User wants a custom voice from audio samples.
335
-
336
- ### Instant Voice Clone (1-5 min of audio)
337
-
338
- ```python
339
- voice = client.clone(
340
- name="My Custom Voice",
341
- description="Professional male, mid-30s, neutral accent",
342
- files=["sample1.mp3", "sample2.mp3"],
343
- )
344
- print(f"Cloned voice ID: {voice.voice_id}")
345
- ```
346
-
347
- ### Voice Design (Generate New Voice)
348
-
349
- ```python
350
- audio = client.text_to_speech.convert(
351
- text="Testing a designed voice.",
352
- voice_id="custom",
353
- model_id="eleven_multilingual_v2",
354
- )
355
- ```
356
-
357
- **Exit:** Custom voice created and tested.
358
-
359
- ## Phase 4: Sound Effects
413
+ ## Phase 6: Sound Effects
360
414
 
361
415
  ```python
362
416
  audio = client.text_to_sound_effects.convert(
363
417
  text="Heavy rain on a tin roof with distant thunder",
364
418
  duration_seconds=10.0,
365
419
  )
366
-
367
420
  with open("rain.mp3", "wb") as f:
368
- f.write(audio)
421
+ f.write(to_bytes(audio))
369
422
  ```
370
423
 
371
- Tips: be specific ("footsteps on gravel" > "walking sounds"), include environment ("in a cathedral"), specify duration.
424
+ Tips: be specific ("footsteps on gravel" > "walking sounds"), include environment, specify duration.
372
425
 
373
- ## Phase 5: Speech-to-Speech (Voice Transform)
426
+ ## Phase 7: Speech-to-Speech (Voice Transform)
374
427
 
375
428
  ```python
376
429
  with open("input.mp3", "rb") as f:
377
430
  input_audio = f.read()
378
431
 
379
- transformed = client.speech_to_speech.convert(
432
+ transformed = to_bytes(client.speech_to_speech.convert(
380
433
  audio=input_audio,
381
434
  voice_id="target_voice_id",
382
435
  model_id="eleven_english_sts_v2",
383
- )
384
-
436
+ ))
385
437
  with open("transformed.mp3", "wb") as f:
386
438
  f.write(transformed)
387
439
  ```
388
440
 
389
- Preserves timing, emotion, pacing. Changes voice identity.
390
-
391
- ## Phase 6: Audio Isolation (Noise Removal)
441
+ ## Phase 8: Audio Isolation (Noise Removal)
392
442
 
393
443
  ```python
394
444
  with open("noisy.mp3", "rb") as f:
395
- noisy_audio = f.read()
396
-
397
- clean = client.audio_isolation.audio_isolation(audio=noisy_audio)
398
-
445
+ clean = to_bytes(client.audio_isolation.audio_isolation(audio=f.read()))
399
446
  with open("clean.mp3", "wb") as f:
400
447
  f.write(clean)
401
448
  ```
402
449
 
403
- ## Phase 7: Dubbing / Translation
404
-
405
- ```python
406
- result = client.dubbing.dub_a_video_or_an_audio_file(
407
- file=open("video.mp4", "rb"),
408
- target_lang="es",
409
- source_lang="en",
410
- )
411
- dubbing_id = result.dubbing_id
412
-
413
- # Poll for completion
414
- import time
415
- while True:
416
- status = client.dubbing.get_dubbing_project_metadata(dubbing_id)
417
- if status.status == "dubbed":
418
- break
419
- print(f"Status: {status.status}...")
420
- time.sleep(10)
421
-
422
- dubbed = client.dubbing.get_dubbed_file(dubbing_id, target_lang="es")
423
- with open("dubbed_es.mp4", "wb") as f:
424
- f.write(dubbed)
425
- ```
426
-
427
450
  ## CLI Quick Reference
428
451
 
429
- When the ElevenLabs CLI (`elevenlabs`) is installed and authenticated:
430
-
431
452
  ```bash
432
- # Auth
433
453
  elevenlabs auth login # Interactive API key setup
434
454
  elevenlabs auth whoami --no-ui # Check status
435
455
  elevenlabs auth logout # Remove stored key
436
-
437
- # Agents (conversational AI)
438
- elevenlabs agents init # Init project
439
- elevenlabs agents add "My Agent" # Create agent
440
- elevenlabs agents push # Deploy to ElevenLabs
441
- elevenlabs agents list --no-ui # List agents
442
-
443
- # The CLI is focused on agent management, NOT TTS.
444
- # For TTS/podcast/audio generation, use the Python SDK (this skill).
445
456
  ```
446
457
 
458
+ The CLI is focused on agent management, NOT TTS. For TTS, use the Python SDK.
459
+
447
460
  ## Model Selection
448
461
 
449
- | Model ID | Best For | Char Limit | Latency | Languages | Cost |
450
- |----------|----------|------------|---------|-----------|------|
451
- | `eleven_v3` | Dramatic, expressive | 5,000 | ~300ms | 70+ | Standard |
452
- | `eleven_multilingual_v2` | Long-form, stable | 10,000 | Standard | 29 | Standard |
453
- | `eleven_flash_v2_5` | Ultra-low latency | 40,000 | ~75ms | 32 | 50% cheaper |
454
- | `eleven_turbo_v2_5` | Quality + speed | 40,000 | ~250ms | 32 | Standard |
462
+ | Model ID | Best For | Char Limit | Latency | Clone Support |
463
+ |----------|----------|------------|---------|---------------|
464
+ | `eleven_multilingual_v2` | **Long-form, cloned voices** | 10,000 | Standard | **YES** |
465
+ | `eleven_v3` | Dramatic, expressive (stock voices) | 5,000 | ~300ms | NO — loses identity |
466
+ | `eleven_flash_v2_5` | Ultra-low latency | 40,000 | ~75ms | Untested |
467
+ | `eleven_turbo_v2_5` | Quality + speed | 40,000 | ~250ms | Untested |
455
468
 
456
469
  ```
457
- Need < 75ms latency?
458
- ├─ Yes → eleven_flash_v2_5
470
+ Using a cloned voice?
471
+ ├─ Yes → eleven_multilingual_v2 (only reliable option)
459
472
  └─ No → Content > 5,000 chars?
460
473
  ├─ Yes → eleven_multilingual_v2
461
474
  └─ No → Need dramatic delivery?
462
475
  ├─ Yes → eleven_v3
463
- └─ No → eleven_turbo_v2_5
476
+ └─ No → Need low latency?
477
+ ├─ Yes → eleven_flash_v2_5
478
+ └─ No → eleven_turbo_v2_5
464
479
  ```
465
480
 
466
- ## Voice Settings
481
+ ## Pause & Pronunciation Control
467
482
 
468
- | Preset | Stability | Similarity | Use Case |
469
- |--------|-----------|------------|----------|
470
- | Stable narration | 0.8 | 0.75 | Podcasts, audiobooks |
471
- | Expressive | 0.3 | 0.85 | Dramatic reading |
472
- | Balanced | 0.5 | 0.5 | General purpose |
483
+ ### Pauses
473
484
 
474
- ```python
475
- from elevenlabs import VoiceSettings
485
+ | Method | Works? | Notes |
486
+ |--------|--------|-------|
487
+ | `<break time="0.7s" />` | **YES** (v2 only) | SSML break tag, up to 3s. Use sparingly (max 5-6 per generation) |
488
+ | Paragraph breaks | **YES** | Natural, reliable, no cost |
489
+ | Short sentences | **YES** | Best method — rhythm from writing |
490
+ | `...` ellipsis | **NO** | Causes hesitation/nervousness artifacts |
491
+ | Multiple dashes `-- --` | Somewhat | Inconsistent |
476
492
 
477
- audio = client.text_to_speech.convert(
478
- text="...",
479
- voice_id="...",
480
- model_id="eleven_multilingual_v2",
481
- voice_settings=VoiceSettings(stability=0.8, similarity_boost=0.75),
482
- )
483
- ```
493
+ ### Pronunciation
494
+
495
+ | Method | Works? | Notes |
496
+ |--------|--------|-------|
497
+ | Phonetic spelling in text | **YES** | Most reliable: "Eigh-bility" instead of "Aibility" |
498
+ | Pronunciation dictionary API | **UNRELIABLE** | Silently ignored with some model/voice combos |
499
+ | `<lexeme>` tags in text | **NO** | Read aloud as text |
500
+ | `<phoneme>` SSML tags | v2: NO, Flash v2: YES | Only works with specific models |
501
+
502
+ **Rule: Always use phonetic spelling directly in the speech text.** Don't rely on dictionaries or SSML phoneme tags.
484
503
 
485
504
  ## Output Formats
486
505
 
@@ -488,47 +507,45 @@ audio = client.text_to_speech.convert(
488
507
  |--------|---------|----------|
489
508
  | `mp3_44100_128` | High | Default, general purpose |
490
509
  | `mp3_44100_192` | Highest MP3 | Archival |
491
- | `mp3_22050_32` | Low | Previews |
492
510
  | `pcm_44100` | Lossless | Post-processing |
493
511
 
494
- ## Error Handling
495
-
496
- ```python
497
- from elevenlabs.core import ApiError
498
-
499
- try:
500
- audio = client.text_to_speech.convert(...)
501
- except ApiError as e:
502
- if e.status_code == 401:
503
- print("Bad API key. Run: elevenlabs auth login")
504
- elif e.status_code == 429:
505
- print("Rate limited. Wait and retry.")
506
- elif e.status_code == 422:
507
- print(f"Invalid params: {e.body}")
508
- else:
509
- raise
510
- ```
511
-
512
512
  ## Cost Awareness
513
513
 
514
514
  - Characters are the billing unit — every API call costs characters
515
+ - **Small-chunk stitching uses ~1.5x the character count** (overhead per request)
515
516
  - **Preview short clips first** before generating long content
516
517
  - **Cache generated audio** — don't regenerate the same text
517
518
  - `eleven_flash_v2_5` is 50% cheaper than other models
518
- - The silence-for-pauses trick costs ~3 characters per pause ("...")
519
+ - Request IDs expire after 2 hours — regenerate within that window
520
+
521
+ ## Error Handling
522
+
523
+ ```python
524
+ # When using httpx directly (for request stitching):
525
+ resp = httpx.post(url, json=data, headers=headers, timeout=60)
526
+ if resp.status_code == 401:
527
+ print("Bad API key.")
528
+ elif resp.status_code == 400 and "quota_exceeded" in resp.text:
529
+ print("Out of credits.")
530
+ elif resp.status_code != 200:
531
+ print(f"Error {resp.status_code}: {resp.text[:200]}")
532
+ ```
519
533
 
520
534
  ## Validate
521
535
 
522
536
  - [ ] API key loaded from `~/.elevenlabs/api_key` or env var, never hardcoded
523
- - [ ] Model selected matches content length (see model table)
524
- - [ ] Voice selected and approved by user before generation
525
- - [ ] For podcasts: script reviewed before generation (credits are non-refundable)
526
- - [ ] Output format matches downstream requirements
537
+ - [ ] Model selected matches voice type (v2 for clones, see model table)
538
+ - [ ] `language_code` set for cloned or accented voices
539
+ - [ ] No VoiceSettings overrides for cloned voices
540
+ - [ ] No `...` ellipses in speech text
541
+ - [ ] Speech text reviewed — numbers written out, names phonetic
542
+ - [ ] Test clip generated and approved before full generation
543
+ - [ ] For long-form: using request stitching with small chunks
527
544
  - [ ] Audio file saved outside git-tracked directories
528
545
  - [ ] File size and duration reported to user
529
546
 
530
547
  ## What Makes This babel-fish
531
548
 
532
- - **Heart of Gold** — the improbably good ship runs on infinite improbability; this skill turns text into voice with similarly improbable ease
533
- - **Multi-Format Production** — text content becomes audio content in one pass
534
- - **Creative Courage** — ship audio content that would have taken a recording studio
549
+ - **Battle-tested** — every recommendation comes from proven success or documented failure
550
+ - **Request stitching** — the key to consistent long-form quality
551
+ - **Clone-aware** — different rules for cloned vs stock voices, learned the hard way
@@ -1,8 +1,73 @@
1
1
  /**
2
2
  * Transform SKILL.md content from Claude Code conventions to target tool conventions.
3
3
  */
4
+ const CODEX_COMMAND_ALIASES: Record<string, string> = {
5
+ "/architect": "$architect",
6
+ "/architecture-review": "$architecture-review",
7
+ "/audio": "$audio",
8
+ "/babel-fish:visualize": "$visualize",
9
+ "/brainstorm": "$brainstorm",
10
+ "/capture": "$capture",
11
+ "/coach": "$coach",
12
+ "/codex": "$codex",
13
+ "/compound": "$compound",
14
+ "/craft-skill": "$craft-skill",
15
+ "/cto": "$cto",
16
+ "/deep-thought:architect": "$architect",
17
+ "/deep-thought:architecture-review": "$architecture-review",
18
+ "/deep-thought:brainstorm": "$brainstorm",
19
+ "/deep-thought:craft-skill": "$craft-skill",
20
+ "/deep-thought:cto": "$cto",
21
+ "/deep-thought:investigate": "$investigate",
22
+ "/deep-thought:plan": "$plan",
23
+ "/deep-thought:review": "$review",
24
+ "/deep-thought:think": "$think",
25
+ "/gemini": "$gemini",
26
+ "/goal-checkin": "$goal-checkin",
27
+ "/goal-setting": "$goal-setting",
28
+ "/guide:capture": "$capture",
29
+ "/guide:codex": "$codex",
30
+ "/guide:gemini": "$gemini",
31
+ "/guide:pipeline": "$pipeline",
32
+ "/guide:setup": "$setup",
33
+ "/guide:write-post": "$write-post",
34
+ "/image": "$image",
35
+ "/investigate": "$investigate",
36
+ "/marvin:compound": "$compound",
37
+ "/marvin:quick-review": "$quick-review",
38
+ "/marvin:redteam": "$redteam",
39
+ "/marvin:review": "$review",
40
+ "/marvin:scaffold": "$scaffold",
41
+ "/marvin:test-writer": "$test-writer",
42
+ "/marvin:work": "$work",
43
+ "/pipeline": "$pipeline",
44
+ "/plan": "$plan",
45
+ "/quick-review": "$quick-review",
46
+ "/redteam": "$redteam",
47
+ "/reflect": "$reflect",
48
+ "/review": "$review",
49
+ "/scaffold": "$scaffold",
50
+ "/setup": "$setup",
51
+ "/test-writer": "$test-writer",
52
+ "/think": "$think",
53
+ "/visualize": "$visualize",
54
+ "/work": "$work",
55
+ "/write-post": "$write-post",
56
+ };
57
+
58
+ function replaceCodexCommandAliases(content: string): string {
59
+ let transformed = content;
60
+ for (const [source, target] of Object.entries(CODEX_COMMAND_ALIASES).sort(
61
+ ([a], [b]) => b.length - a.length
62
+ )) {
63
+ const escaped = source.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
64
+ transformed = transformed.replace(new RegExp(`${escaped}\\b`, "g"), target);
65
+ }
66
+ return transformed;
67
+ }
68
+
4
69
  export function transformContentForCodex(content: string): string {
5
- return content
70
+ return replaceCodexCommandAliases(content)
6
71
  .replace(/~\/\.claude\//g, "~/.codex/")
7
72
  .replace(/\.claude\//g, ".codex/")
8
73
  .replace(