@biggora/claude-plugins 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +13 -0
- package/CLAUDE.md +55 -0
- package/LICENSE +1 -1
- package/README.md +208 -39
- package/bin/cli.js +39 -0
- package/package.json +30 -17
- package/registry/registry.json +166 -1
- package/registry/schema.json +10 -0
- package/src/commands/skills/add.js +194 -0
- package/src/commands/skills/list.js +52 -0
- package/src/commands/skills/remove.js +27 -0
- package/src/commands/skills/update.js +74 -0
- package/src/config.js +5 -0
- package/src/skills/codex-cli/SKILL.md +265 -0
- package/src/skills/commafeed-api/SKILL.md +1012 -0
- package/src/skills/gemini-cli/SKILL.md +379 -0
- package/src/skills/gemini-cli/references/commands.md +145 -0
- package/src/skills/gemini-cli/references/configuration.md +182 -0
- package/src/skills/gemini-cli/references/headless-and-scripting.md +181 -0
- package/src/skills/gemini-cli/references/mcp-and-extensions.md +254 -0
- package/src/skills/n8n-api/SKILL.md +623 -0
- package/src/skills/notebook-lm/SKILL.md +217 -0
- package/src/skills/notebook-lm/references/artifact-options.md +168 -0
- package/src/skills/notebook-lm/references/auth.md +58 -0
- package/src/skills/notebook-lm/references/workflows.md +144 -0
- package/src/skills/screen-recording/SKILL.md +309 -0
- package/src/skills/screen-recording/references/approach1-programmatic.md +311 -0
- package/src/skills/screen-recording/references/approach2-xvfb.md +232 -0
- package/src/skills/screen-recording/references/design-patterns.md +168 -0
- package/src/skills/test-mobile-app/SKILL.md +212 -0
- package/src/skills/test-mobile-app/references/report-template.md +95 -0
- package/src/skills/test-mobile-app/references/setup-appium.md +154 -0
- package/src/skills/test-mobile-app/scripts/analyze_apk.py +164 -0
- package/src/skills/test-mobile-app/scripts/check_environment.py +116 -0
- package/src/skills/test-mobile-app/scripts/generate_report.py +250 -0
- package/src/skills/test-mobile-app/scripts/run_tests.py +326 -0
- package/src/skills/test-web-ui/SKILL.md +232 -0
- package/src/skills/test-web-ui/references/test_case_schema.md +102 -0
- package/src/skills/test-web-ui/scripts/discover.py +176 -0
- package/src/skills/test-web-ui/scripts/generate_report.py +237 -0
- package/src/skills/test-web-ui/scripts/run_tests.py +296 -0
- package/src/skills/text-to-speech/SKILL.md +236 -0
- package/src/skills/text-to-speech/references/espeak-cli.md +277 -0
- package/src/skills/text-to-speech/references/kokoro-onnx.md +124 -0
- package/src/skills/text-to-speech/references/online-engines.md +128 -0
- package/src/skills/text-to-speech/references/pyttsx3-espeak.md +143 -0
- package/src/skills/tm-search/SKILL.md +240 -0
- package/src/skills/tm-search/references/field-guide.md +79 -0
- package/src/skills/tm-search/references/scraping-fallback.md +140 -0
- package/src/skills/tm-search/scripts/tm_search.py +375 -0
- package/src/skills/wp-rest-api/SKILL.md +114 -0
- package/src/skills/wp-rest-api/references/authentication.md +18 -0
- package/src/skills/wp-rest-api/references/custom-content-types.md +20 -0
- package/src/skills/wp-rest-api/references/discovery-and-params.md +20 -0
- package/src/skills/wp-rest-api/references/responses-and-fields.md +30 -0
- package/src/skills/wp-rest-api/references/routes-and-endpoints.md +36 -0
- package/src/skills/wp-rest-api/references/schema.md +22 -0
- package/src/skills/youtube-search/SKILL.md +412 -0
- package/src/skills/youtube-search/references/parsing-examples.md +159 -0
- package/src/skills/youtube-search/references/youtube-api-quota.md +85 -0
- package/src/skills/youtube-thumbnail/SKILL.md +1060 -0
- package/tests/commands/info.test.js +49 -0
- package/tests/commands/install.test.js +36 -0
- package/tests/commands/list.test.js +66 -0
- package/tests/commands/publish.test.js +182 -0
- package/tests/commands/search.test.js +45 -0
- package/tests/commands/uninstall.test.js +29 -0
- package/tests/commands/update.test.js +59 -0
- package/tests/functional/skills-lifecycle.test.js +293 -0
- package/tests/helpers/fixtures.js +63 -0
- package/tests/integration/cli.test.js +83 -0
- package/tests/skills/add.test.js +138 -0
- package/tests/skills/list.test.js +63 -0
- package/tests/skills/remove.test.js +38 -0
- package/tests/skills/update.test.js +60 -0
- package/tests/unit/config.test.js +31 -0
- package/tests/unit/registry.test.js +79 -0
- package/tests/unit/utils.test.js +150 -0
- package/tests/validation/registry-schema.test.js +112 -0
- package/tests/validation/skills-validation.test.js +96 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# espeak-ng CLI Reference
|
|
2
|
+
|
|
3
|
+
Direct command-line usage of espeak-ng. Useful when you need fine-grained control over
|
|
4
|
+
phonemes, prosody, SSML, or want to avoid the Python pyttsx3 overhead.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Basic Usage
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
# Speak via the default audio device (add --stdout to pipe raw WAV to ffmpeg)
|
|
12
|
+
espeak-ng "Hello world"
|
|
13
|
+
|
|
14
|
+
# Save to WAV file
|
|
15
|
+
espeak-ng -w /tmp/output.wav "Hello world"
|
|
16
|
+
|
|
17
|
+
# Read from file
|
|
18
|
+
espeak-ng -w /tmp/output.wav -f /tmp/script.txt
|
|
19
|
+
|
|
20
|
+
# Read from stdin
|
|
21
|
+
echo "Hello world" | espeak-ng -w /tmp/output.wav
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Key Flags
|
|
27
|
+
|
|
28
|
+
| Flag | Description | Example |
|
|
29
|
+
|------|-------------|---------|
|
|
30
|
+
| `-v <voice>` | Voice/language | `-v en-gb`, `-v ru`, `-v de` |
|
|
31
|
+
| `-s <rate>` | Speed in words/min (default 175) | `-s 140` |
|
|
32
|
+
| `-p <pitch>` | Pitch 0–99 (default 50) | `-p 55` |
|
|
33
|
+
| `-a <amplitude>` | Volume 0–200 (default 100) | `-a 120` |
|
|
34
|
+
| `-g <gap>` | Gap between words in 10ms units | `-g 5` |
|
|
35
|
+
| `-w <file>` | Write WAV output to file | `-w out.wav` |
|
|
36
|
+
| `-f <file>` | Read input from text file | `-f script.txt` |
|
|
37
|
+
| `--ipa` | Print IPA phonemes to stdout | `--ipa` |
|
|
38
|
+
| `-q` | Quiet — no audio, just phoneme output | `-q --ipa` |
|
|
39
|
+
| `--pho` | Output phoneme mnemonics | `--pho` |
|
|
40
|
+
| `-m` | Interpret input as SSML markup | `-m` |
|
|
41
|
+
| `-b 1` | Input is UTF-8 (default on Linux) | `-b 1` |
|
|
42
|
+
| `--punct` | Speak punctuation characters | `--punct` |
|
|
43
|
+
| `-z` | No final sentence pause | `-z` |
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Voice Selection
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# List all voices
|
|
51
|
+
espeak-ng --voices
|
|
52
|
+
|
|
53
|
+
# List voices for a language
|
|
54
|
+
espeak-ng --voices=en
|
|
55
|
+
espeak-ng --voices=ru
|
|
56
|
+
espeak-ng --voices=zh
|
|
57
|
+
|
|
58
|
+
# Key voice IDs
|
|
59
|
+
# English: en, en-gb, en-us, en-gb-scotland, en-gb-x-gbclan, en-gb-x-rp
|
|
60
|
+
# Russian: ru
|
|
61
|
+
# German: de
|
|
62
|
+
# French: fr
|
|
63
|
+
# Spanish: es, es-419 (Latin America)
|
|
64
|
+
# Chinese: cmn (Mandarin), yue (Cantonese)
|
|
65
|
+
# Japanese: ja
|
|
66
|
+
# Arabic: ar
|
|
67
|
+
# Hindi: hi
|
|
68
|
+
# Korean: ko
|
|
69
|
+
# Italian: it
|
|
70
|
+
# Dutch: nl
|
|
71
|
+
# Polish: pl
|
|
72
|
+
# Ukrainian: uk
|
|
73
|
+
# Turkish: tr
|
|
74
|
+
# Swedish: sv
|
|
75
|
+
# Portuguese: pt, pt-br
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## SSML Support
|
|
81
|
+
|
|
82
|
+
espeak-ng understands a subset of SSML. Pass `-m` flag to enable:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
espeak-ng -m -w /tmp/ssml_out.wav '<speak>
|
|
86
|
+
Hello, <break time="500ms"/> how are you?
|
|
87
|
+
<prosody rate="slow" pitch="+5st">This part is slower and higher.</prosody>
|
|
88
|
+
<emphasis level="strong">Important point here.</emphasis>
|
|
89
|
+
Back to normal speed now.
|
|
90
|
+
</speak>'
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Supported SSML Tags
|
|
94
|
+
|
|
95
|
+
```xml
|
|
96
|
+
<!-- Pause -->
|
|
97
|
+
<break time="300ms"/>
|
|
98
|
+
<break time="1s"/>
|
|
99
|
+
|
|
100
|
+
<!-- Prosody control -->
|
|
101
|
+
<prosody rate="slow">...</prosody> <!-- slow, medium, fast, x-slow, x-fast -->
|
|
102
|
+
<prosody rate="0.8">...</prosody> <!-- relative rate: 0.5–2.0 -->
|
|
103
|
+
<prosody pitch="+5st">...</prosody> <!-- semitones: -12st to +12st -->
|
|
104
|
+
<prosody pitch="high">...</prosody> <!-- x-low, low, medium, high, x-high -->
|
|
105
|
+
<prosody volume="loud">...</prosody> <!-- silent, x-soft, soft, medium, loud, x-loud -->
|
|
106
|
+
|
|
107
|
+
<!-- Emphasis -->
|
|
108
|
+
<emphasis level="strong">...</emphasis> <!-- none, reduced, moderate, strong -->
|
|
109
|
+
|
|
110
|
+
<!-- Say-as (number/date formatting) -->
|
|
111
|
+
<say-as interpret-as="cardinal">42</say-as>
|
|
112
|
+
<say-as interpret-as="ordinal">3</say-as>
|
|
113
|
+
<say-as interpret-as="characters">CPU</say-as>
|
|
114
|
+
<say-as interpret-as="date" format="ymd">2024-03-15</say-as>
|
|
115
|
+
<say-as interpret-as="time" format="hms24">14:30:00</say-as>
|
|
116
|
+
|
|
117
|
+
<!-- Phoneme (IPA or x-sampa) -->
|
|
118
|
+
<phoneme alphabet="ipa" ph="həˈloʊ">Hello</phoneme>
|
|
119
|
+
<phoneme alphabet="x-sampa" ph="h@'loU">Hello</phoneme>
|
|
120
|
+
|
|
121
|
+
<!-- Sub (spoken alias) -->
|
|
122
|
+
<sub alias="Artificial Intelligence">AI</sub>
|
|
123
|
+
|
|
124
|
+
<!-- Language switch -->
|
|
125
|
+
<voice xml:lang="fr">Bonjour</voice>
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Phoneme Control
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Get IPA phonemes for text
|
|
134
|
+
espeak-ng -q --ipa "Hello world"
|
|
135
|
+
# → həlˈəʊ wˈɜːld
|
|
136
|
+
|
|
137
|
+
# Get x-sampa phonemes
|
|
138
|
+
espeak-ng -q --pho "Hello world"
|
|
139
|
+
|
|
140
|
+
# Speak using IPA directly
|
|
141
|
+
espeak-ng -w /tmp/ipa.wav "<phoneme alphabet='ipa' ph='həˈloʊ'>Hello</phoneme>" -m
|
|
142
|
+
|
|
143
|
+
# Print phoneme list for a language
|
|
144
|
+
espeak-ng --voices=en --pho
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Python Subprocess Integration
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
import subprocess
|
|
153
|
+
|
|
154
|
+
def espeak_tts(text: str, output_wav: str,
               voice: str = "en-gb-scotland",
               speed: int = 145,
               pitch: int = 52,
               amplitude: int = 110,
               ssml: bool = False) -> str:
    """
    Synthesize speech with the espeak-ng CLI and write it to a WAV file.

    Args:
        text: Text to speak (SSML markup when ``ssml`` is True).
        output_wav: Destination path for the generated WAV audio.
        voice: espeak-ng voice identifier (e.g. "en-gb-scotland", "ru").
        speed: Speaking rate in words per minute.
        pitch: Pitch setting, 0-99.
        amplitude: Volume setting, 0-200.
        ssml: When True, pass -m so espeak-ng interprets SSML tags.

    Returns:
        The path to the written WAV file (same as ``output_wav``).

    Raises:
        subprocess.CalledProcessError: If espeak-ng exits non-zero.
    """
    args = [
        "espeak-ng",
        "-v", voice,
        "-s", str(speed),
        "-p", str(pitch),
        "-a", str(amplitude),
        "-w", output_wav,
    ]
    if ssml:
        args.append("-m")  # enable SSML interpretation
    args.append(text)

    subprocess.run(args, check=True, capture_output=True)
    return output_wav
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def espeak_tts_file(input_txt: str, output_wav: str,
                    voice: str = "en-gb-scotland",
                    speed: int = 145) -> str:
    """
    Synthesize an entire text file to WAV via the espeak-ng CLI.

    Args:
        input_txt: Path of the text file to read (-f flag).
        output_wav: Destination WAV path (-w flag).
        voice: espeak-ng voice identifier.
        speed: Speaking rate in words per minute.

    Returns:
        The path of the written WAV file.

    Raises:
        subprocess.CalledProcessError: If espeak-ng exits non-zero.
    """
    command = [
        "espeak-ng",
        "-v", voice,
        "-s", str(speed),
        "-f", input_txt,
        "-w", output_wav,
    ]
    subprocess.run(command, check=True, capture_output=True)
    return output_wav
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def get_ipa(text: str, lang: str = "en") -> str:
    """
    Return the IPA transcription espeak-ng produces for *text*.

    Runs espeak-ng with -q (no audio) and --ipa so phonemes are printed
    to stdout. Deliberately best-effort: a failing espeak-ng invocation
    yields an empty string rather than raising.
    """
    proc = subprocess.run(
        ["espeak-ng", "-v", lang, "-q", "--ipa", text],
        capture_output=True,
        text=True,
    )
    return proc.stdout.strip()
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## WAV → MP3 Pipeline
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
# Basic
|
|
206
|
+
ffmpeg -i /tmp/espeak.wav -c:a libmp3lame -b:a 192k /tmp/output.mp3 -y -loglevel quiet
|
|
207
|
+
|
|
208
|
+
# Enhanced speech quality (cleaner high-mids, reduced low rumble)
|
|
209
|
+
ffmpeg -i /tmp/espeak.wav \
|
|
210
|
+
-af "aresample=44100,equalizer=f=3000:t=o:w=1:g=3,equalizer=f=200:t=o:w=1:g=-2,loudnorm=I=-16:TP=-1.5:LRA=11" \
|
|
211
|
+
-c:a libmp3lame -b:a 192k /tmp/output.mp3 -y -loglevel quiet
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## Multi-voice Script (narrator + character)
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
import subprocess, os
|
|
220
|
+
|
|
221
|
+
def multi_voice_tts(lines: list[dict], output_mp3: str) -> str:
    """
    Render a multi-voice script (narrator + characters) to a single MP3.

    Each entry in ``lines`` is a dict with keys:
        text  (required) — the line to speak
        voice (optional) — espeak-ng voice id, default "en-gb-scotland"
        speed (optional) — words per minute, default 145
        pitch (optional) — 0-99, default 50

    Example:
        lines = [
            {"text": "Welcome.", "voice": "en-gb-scotland", "speed": 140},
            {"text": "Thank you.", "voice": "en-us", "speed": 160},
        ]

    Returns:
        The path to the concatenated MP3 (``output_mp3``).

    Raises:
        subprocess.CalledProcessError: If espeak-ng or ffmpeg fails.
    """
    wavs = []
    concat = "/tmp/mv_concat.txt"
    try:
        # Synthesize each line into its own WAV with per-line voice settings.
        for i, line in enumerate(lines):
            wav = f"/tmp/mv_{i}.wav"
            subprocess.run([
                "espeak-ng",
                "-v", line.get("voice", "en-gb-scotland"),
                "-s", str(line.get("speed", 145)),
                "-p", str(line.get("pitch", 50)),
                "-w", wav,
                line["text"]
            ], check=True, capture_output=True)
            wavs.append(wav)

        # ffmpeg's concat demuxer reads its inputs from a list file.
        with open(concat, "w") as f:
            for wav in wavs:
                f.write(f"file '{wav}'\n")

        subprocess.run([
            "ffmpeg", "-f", "concat", "-safe", "0", "-i", concat,
            "-c:a", "libmp3lame", "-b:a", "192k",
            output_mp3, "-y", "-loglevel", "quiet"
        ], check=True)
    finally:
        # FIX: clean up intermediates even when synthesis or encoding fails
        # (previously the chunk WAVs leaked on error and the concat list
        # file was never removed at all).
        for wav in wavs:
            if os.path.exists(wav):
                os.unlink(wav)
        if os.path.exists(concat):
            os.unlink(concat)

    return output_mp3
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Useful Tuning Combinations
|
|
261
|
+
|
|
262
|
+
```bash
|
|
263
|
+
# Natural male English (UK)
|
|
264
|
+
espeak-ng -v en-gb-scotland -s 145 -p 42 -a 110 -w out.wav "Text here"
|
|
265
|
+
|
|
266
|
+
# Natural female English (RP)
|
|
267
|
+
espeak-ng -v en-gb-x-rp -s 150 -p 62 -a 110 -w out.wav "Text here"
|
|
268
|
+
|
|
269
|
+
# Russian — clear and measured
|
|
270
|
+
espeak-ng -v ru -s 130 -p 50 -a 120 -w out.wav "Текст здесь"
|
|
271
|
+
|
|
272
|
+
# Fast technical narration (US English)
|
|
273
|
+
espeak-ng -v en-us -s 175 -p 48 -a 105 -w out.wav "Text here"
|
|
274
|
+
|
|
275
|
+
# Slow, deliberate presentation voice
|
|
276
|
+
espeak-ng -v en-gb-scotland -s 120 -p 50 -g 8 -a 115 -w out.wav "Text here"
|
|
277
|
+
```
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# Kokoro ONNX — High-Quality Neural TTS
|
|
2
|
+
|
|
3
|
+
Kokoro is a state-of-the-art offline neural TTS engine. Produces near-human quality audio.
|
|
4
|
+
Available in this environment (`kokoro-onnx` installed) but requires model files.
|
|
5
|
+
|
|
6
|
+
## Languages Supported
|
|
7
|
+
|
|
8
|
+
English (US/UK), Chinese, Japanese, Korean, French, Spanish, Hindi, Portuguese, Italian, Brazilian Portuguese
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install kokoro-onnx soundfile --break-system-packages
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Model Download (requires internet once)
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from huggingface_hub import hf_hub_download
|
|
20
|
+
|
|
21
|
+
# Download model files
|
|
22
|
+
model_path = hf_hub_download(repo_id="hexgrad/Kokoro-82M-ONNX", filename="kokoro-v1.0.onnx")
|
|
23
|
+
voices_path = hf_hub_download(repo_id="hexgrad/Kokoro-82M-ONNX", filename="voices-v1.0.bin")
|
|
24
|
+
|
|
25
|
+
print(f"Model: {model_path}")
|
|
26
|
+
print(f"Voices: {voices_path}")
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Or manually download and place in `/tmp/kokoro/`:
|
|
30
|
+
- `kokoro-v1.0.onnx`
|
|
31
|
+
- `voices-v1.0.bin`
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from kokoro_onnx import Kokoro
|
|
37
|
+
import soundfile as sf
|
|
38
|
+
import subprocess
|
|
39
|
+
|
|
40
|
+
def kokoro_tts(text: str, output_mp3: str, voice: str = 'af_heart', speed: float = 1.0,
               model_path: str = '/tmp/kokoro/kokoro-v1.0.onnx',
               voices_path: str = '/tmp/kokoro/voices-v1.0.bin',
               lang: str = 'en-us'):
    """
    Generate high-quality neural TTS with Kokoro and encode it to MP3.

    Args:
        text: Text to synthesize.
        output_mp3: Destination MP3 path (an intermediate WAV with the
            same stem is written alongside it).
        voice: Kokoro voice id — see the table below.
        speed: Speed multiplier (1.0 = normal).
        model_path: Path to the Kokoro ONNX model file.
        voices_path: Path to the Kokoro voices binary file.
        lang: Language code passed to Kokoro (e.g. 'en-us', 'ja', 'zh');
            must match the chosen voice's language.

    Voices:
        English (US): af_heart, af_bella, af_nicole, am_adam, am_michael
        English (UK): bf_emma, bf_isabella, bm_george, bm_lewis
        Japanese: jf_nezuko, jf_tsumugi, jm_kumo
        Chinese: zf_xiaobei, zf_xiaoni, zm_yunjian
        French: ff_siwis
        Korean: kf_alpha
        Spanish: es-419-af-dalia, es-419-am-diego
        Hindi: hf_alpha, hm_omega
        Italian: if_sara, im_nicola
        Brazilian PT: pf_dora, pm_alex
        Portuguese: ptf_edite

    Returns:
        The path to the encoded MP3 (``output_mp3``).
    """
    import os

    kokoro = Kokoro(model_path, voices_path)

    # FIX: lang was hard-coded to "en-us" even though the docstring
    # advertises Japanese/Chinese/etc. voices, silently mispronouncing
    # every non-English voice. It is now a backward-compatible parameter.
    samples, sample_rate = kokoro.create(text, voice=voice, speed=speed, lang=lang)

    # FIX: derive the intermediate WAV path robustly — str.replace('.mp3',
    # '.wav') broke when output_mp3 lacked the extension (the "WAV" would
    # have overwritten output_mp3 itself) or contained '.mp3' mid-path.
    wav_path = os.path.splitext(output_mp3)[0] + '.wav'
    sf.write(wav_path, samples, sample_rate)

    subprocess.run([
        'ffmpeg', '-i', wav_path,
        '-c:a', 'libmp3lame', '-b:a', '192k',
        output_mp3, '-y', '-loglevel', 'quiet'
    ], check=True)

    return output_mp3
|
|
73
|
+
|
|
74
|
+
# Example
|
|
75
|
+
kokoro_tts(
|
|
76
|
+
"Welcome to our product demo. This neural TTS produces natural speech.",
|
|
77
|
+
"/tmp/kokoro_output.mp3",
|
|
78
|
+
voice="af_heart"
|
|
79
|
+
)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Language Code Mapping
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
KOKORO_LANG = {
|
|
86
|
+
'en-us': 'en-us',
|
|
87
|
+
'en-gb': 'en-gb',
|
|
88
|
+
'ja': 'ja',
|
|
89
|
+
'zh': 'zh',
|
|
90
|
+
'fr': 'fr-fr',
|
|
91
|
+
'ko': 'ko',
|
|
92
|
+
'es': 'es',
|
|
93
|
+
'hi': 'hi',
|
|
94
|
+
'it': 'it',
|
|
95
|
+
'pt': 'pt-br',
|
|
96
|
+
'pt-pt': 'pt-pt',
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Caching Model Between Sessions
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
import os, shutil
|
|
104
|
+
from pathlib import Path
|
|
105
|
+
|
|
106
|
+
CACHE_DIR = Path('/home/claude/.kokoro_cache')
|
|
107
|
+
CACHE_DIR.mkdir(exist_ok=True)
|
|
108
|
+
|
|
109
|
+
MODEL_CACHED = CACHE_DIR / 'kokoro-v1.0.onnx'
|
|
110
|
+
VOICES_CACHED = CACHE_DIR / 'voices-v1.0.bin'
|
|
111
|
+
|
|
112
|
+
def get_or_download_kokoro():
    """
    Return (model_path, voices_path) for Kokoro, downloading on first use.

    If both files are already present in the cache directory they are
    reused; otherwise they are fetched from the Hugging Face hub and
    copied into the cache so later sessions can work offline.
    """
    if not (MODEL_CACHED.exists() and VOICES_CACHED.exists()):
        from huggingface_hub import hf_hub_download
        downloads = (
            ("kokoro-v1.0.onnx", MODEL_CACHED),
            ("voices-v1.0.bin", VOICES_CACHED),
        )
        for filename, destination in downloads:
            fetched = hf_hub_download("hexgrad/Kokoro-82M-ONNX", filename)
            shutil.copy(fetched, destination)

    return str(MODEL_CACHED), str(VOICES_CACHED)
|
|
124
|
+
```
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# Online TTS Engines Reference
|
|
2
|
+
|
|
3
|
+
Use when internet access is available. Superior quality to offline engines.
|
|
4
|
+
|
|
5
|
+
## Priority Order
|
|
6
|
+
|
|
7
|
+
1. **OpenAI TTS** — best quality, via Anthropic API artifacts
|
|
8
|
+
2. **edge-tts** (Microsoft Azure) — free, neural quality, 100+ voices
|
|
9
|
+
3. **gTTS** (Google) — free, good quality, 40+ languages
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## edge-tts (Microsoft Neural TTS — FREE)
|
|
14
|
+
|
|
15
|
+
Best free online option. 400+ voices, 100+ languages, neural quality.
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install edge-tts --break-system-packages
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
import asyncio
|
|
23
|
+
import edge_tts
|
|
24
|
+
|
|
25
|
+
async def edge_tts_generate(text: str, output_mp3: str, voice: str = "en-US-AriaNeural"):
    """Asynchronously synthesize *text* to an MP3 file via Microsoft edge-tts.

    Requires internet access. Returns the output path.
    """
    stream = edge_tts.Communicate(text, voice)
    await stream.save(output_mp3)
    return output_mp3
|
|
29
|
+
|
|
30
|
+
# Sync wrapper
|
|
31
|
+
def tts(text: str, output_mp3: str, voice: str = "en-US-AriaNeural"):
    """Blocking convenience wrapper around :func:`edge_tts_generate`."""
    coro = edge_tts_generate(text, output_mp3, voice)
    asyncio.run(coro)
|
|
33
|
+
|
|
34
|
+
# List voices
|
|
35
|
+
async def list_voices():
    """Print ShortName, Locale and Gender for every available edge-tts voice."""
    for voice in await edge_tts.list_voices():
        print(voice['ShortName'], voice['Locale'], voice['Gender'])
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Recommended Voices
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
EDGE_VOICES = {
|
|
45
|
+
'en-us-f': 'en-US-AriaNeural', # US English, female, natural
|
|
46
|
+
'en-us-m': 'en-US-GuyNeural', # US English, male
|
|
47
|
+
'en-gb-f': 'en-GB-SoniaNeural', # UK English, female
|
|
48
|
+
'ru-f': 'ru-RU-SvetlanaNeural', # Russian, female
|
|
49
|
+
'ru-m': 'ru-RU-DmitryNeural', # Russian, male
|
|
50
|
+
'de-f': 'de-DE-KatjaNeural', # German
|
|
51
|
+
'fr-f': 'fr-FR-DeniseNeural', # French
|
|
52
|
+
'es-f': 'es-ES-ElviraNeural', # Spanish
|
|
53
|
+
'zh-f': 'zh-CN-XiaoxiaoNeural', # Chinese
|
|
54
|
+
'ja-f': 'ja-JP-NanamiNeural', # Japanese
|
|
55
|
+
'ar-f': 'ar-EG-SalmaNeural', # Arabic
|
|
56
|
+
'hi-f': 'hi-IN-SwaraNeural', # Hindi
|
|
57
|
+
'ko-f': 'ko-KR-SunHiNeural', # Korean
|
|
58
|
+
'pt-f': 'pt-BR-FranciscaNeural', # Portuguese
|
|
59
|
+
'it-f': 'it-IT-ElsaNeural', # Italian
|
|
60
|
+
'nl-f': 'nl-NL-ColetteNeural', # Dutch
|
|
61
|
+
'pl-f': 'pl-PL-AgnieszkaNeural', # Polish
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## gTTS (Google Text-to-Speech — FREE)
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install gtts --break-system-packages
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from gtts import gTTS
|
|
75
|
+
import subprocess
|
|
76
|
+
|
|
77
|
+
def gtts_generate(text: str, output_mp3: str, lang: str = 'en', slow: bool = False):
    """Synthesize *text* with Google TTS (gTTS) and save it as an MP3.

    Requires internet access. ``lang`` takes gTTS language codes such as
    'en', 'ru', 'de', 'zh-CN'; ``slow=True`` selects the slower reading
    rate. Returns the output path.
    """
    speech = gTTS(text=text, lang=lang, slow=slow)
    speech.save(output_mp3)
    return output_mp3
|
|
81
|
+
|
|
82
|
+
# Language codes: 'en', 'ru', 'de', 'fr', 'es', 'zh-CN', 'ja', 'ko', 'ar', 'hi', etc.
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## OpenAI TTS (via API — best quality)
|
|
88
|
+
|
|
89
|
+
Use when building Claude-powered artifacts. Call from JavaScript in artifacts:
|
|
90
|
+
|
|
91
|
+
```javascript
|
|
92
|
+
// In artifact — OpenAI TTS via fetch
|
|
93
|
+
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
|
94
|
+
method: "POST",
|
|
95
|
+
headers: {
|
|
96
|
+
"Authorization": `Bearer ${OPENAI_KEY}`,
|
|
97
|
+
"Content-Type": "application/json"
|
|
98
|
+
},
|
|
99
|
+
body: JSON.stringify({
|
|
100
|
+
model: "tts-1-hd", // or "tts-1" for faster/cheaper
|
|
101
|
+
input: "Your text here",
|
|
102
|
+
voice: "alloy", // alloy, echo, fable, onyx, nova, shimmer
|
|
103
|
+
response_format: "mp3"
|
|
104
|
+
})
|
|
105
|
+
});
|
|
106
|
+
const audioBlob = await response.blob();
|
|
107
|
+
const url = URL.createObjectURL(audioBlob);
|
|
108
|
+
// play or download
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### OpenAI Voice Options
|
|
112
|
+
- `alloy` — neutral, versatile
|
|
113
|
+
- `echo` — male, calm
|
|
114
|
+
- `fable` — British male, expressive
|
|
115
|
+
- `onyx` — deep male
|
|
116
|
+
- `nova` — female, warm
|
|
117
|
+
- `shimmer` — female, clear
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Choosing Between Engines
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
Need best quality? → OpenAI TTS (if API key available)
|
|
125
|
+
Free + neural quality? → edge-tts (Microsoft Neural)
|
|
126
|
+
Simple + multilingual? → gTTS
|
|
127
|
+
No internet? → Kokoro ONNX (if models available) or pyttsx3
|
|
128
|
+
```
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# pyttsx3 + espeak-ng Reference
|
|
2
|
+
|
|
3
|
+
Primary TTS engine for this environment. Fully offline, 131+ languages.
|
|
4
|
+
|
|
5
|
+
## Full API
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
import pyttsx3
|
|
9
|
+
|
|
10
|
+
engine = pyttsx3.init()
|
|
11
|
+
|
|
12
|
+
# Properties
|
|
13
|
+
engine.setProperty('rate', 145) # words per minute (100–200, default 200)
|
|
14
|
+
engine.setProperty('volume', 1.0) # 0.0–1.0
|
|
15
|
+
# Note: 'pitch' property is accepted but has no effect in espeak backend
|
|
16
|
+
|
|
17
|
+
# List all voices
|
|
18
|
+
voices = engine.getProperty('voices')
|
|
19
|
+
for v in voices:
|
|
20
|
+
print(v.id, v.name, v.languages)
|
|
21
|
+
|
|
22
|
+
# Set a specific voice
|
|
23
|
+
engine.setProperty('voice', 'gmw/en-gb-scotland')
|
|
24
|
+
|
|
25
|
+
# Speak (blocking, uses system audio — not useful in agent)
|
|
26
|
+
# engine.say("Hello"); engine.runAndWait()
|
|
27
|
+
|
|
28
|
+
# Save to file (USE THIS in agent context)
|
|
29
|
+
engine.save_to_file("Text to speak", '/tmp/output.wav')
|
|
30
|
+
engine.runAndWait()
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Recommended Voice IDs per Language
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
VOICE_MAP = {
|
|
37
|
+
'en': 'gmw/en-gb-scotland', # clearest English
|
|
38
|
+
'en-us': 'gmw/en-us',
|
|
39
|
+
'ru': 'zle/ru',
|
|
40
|
+
'de': 'gmw/de',
|
|
41
|
+
'fr': 'roa/fr',
|
|
42
|
+
'es': 'roa/es',
|
|
43
|
+
'it': 'roa/it',
|
|
44
|
+
'pt': 'roa/pt-pt',
|
|
45
|
+
'nl': 'gmw/nl',
|
|
46
|
+
'pl': 'zls/pl',
|
|
47
|
+
'cs': 'zlw/cs',
|
|
48
|
+
'zh': 'sit/cmn',
|
|
49
|
+
'ja': 'jpn/ja',
|
|
50
|
+
'ko': 'ko',
|
|
51
|
+
'ar': 'sem/ar',
|
|
52
|
+
'hi': 'inc/hi',
|
|
53
|
+
'tr': 'trk/tr',
|
|
54
|
+
'sv': 'gmw/sv',
|
|
55
|
+
'da': 'gmw/da',
|
|
56
|
+
'fi': 'urj/fi',
|
|
57
|
+
'uk': 'zle/uk',
|
|
58
|
+
}
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quality Optimization
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
def high_quality_tts(text: str, out_wav: str, lang: str = 'en', rate: int = 145):
    """
    Generate speech with pyttsx3/espeak and post-process it with FFmpeg.

    The raw engine output is resampled to 44.1 kHz, EQ'd for speech
    clarity (boost ~3 kHz, cut ~200 Hz) and loudness-normalized.

    Returns the path of the enhanced WAV ('<out_wav stem>_enhanced.wav').
    """
    import pyttsx3, subprocess

    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', rate)
    tts_engine.setProperty('volume', 1.0)

    # Resolve the voice id from the language map, falling back to a
    # guessed 'gmw/<lang>' id when the language is unmapped. If no
    # installed voice matches, the engine default is kept.
    wanted = VOICE_MAP.get(lang, f'gmw/{lang}')
    for candidate in tts_engine.getProperty('voices'):
        if candidate.id == wanted:
            tts_engine.setProperty('voice', candidate.id)
            break

    tts_engine.save_to_file(text, out_wav)
    tts_engine.runAndWait()

    # Enhance with FFmpeg: resample + speech EQ + loudness normalization.
    enhanced = out_wav.replace('.wav', '_enhanced.wav')
    subprocess.run([
        'ffmpeg', '-i', out_wav,
        '-af', 'aresample=44100,equalizer=f=3000:t=o:w=1:g=3,equalizer=f=200:t=o:w=1:g=-2,loudnorm=I=-16:TP=-1.5:LRA=11',
        enhanced, '-y', '-loglevel', 'quiet'
    ], check=True)

    return enhanced
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Paragraph-by-Paragraph Generation
|
|
92
|
+
|
|
93
|
+
For long texts, generate in chunks to avoid engine timeouts:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
def tts_long_text(paragraphs: list[str], output_mp3: str, lang: str = 'en'):
    """
    Generate one MP3 from a list of paragraphs, chunk by chunk.

    Long texts are synthesized paragraph-by-paragraph to avoid pyttsx3
    engine timeouts, then concatenated with ffmpeg and encoded to MP3.
    Blank paragraphs are skipped.

    NOTE(review): ``lang`` is currently unused by the synthesis step
    (the engine's default voice is used); it is kept for interface
    stability. No explicit inter-paragraph silence is inserted — only
    the engine's natural end-of-sentence pause separates paragraphs.

    Returns:
        The path to ``output_mp3``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg fails.
    """
    import pyttsx3, subprocess, os

    chunks = []
    concat_list = '/tmp/tts_concat.txt'
    engine = pyttsx3.init()
    engine.setProperty('rate', 145)

    try:
        for i, para in enumerate(paragraphs):
            if not para.strip():
                continue  # skip empty paragraphs
            wav = f'/tmp/chunk_{i}.wav'
            engine.save_to_file(para, wav)
            engine.runAndWait()
            chunks.append(wav)

        # ffmpeg's concat demuxer reads its inputs from a list file.
        # FIX: the old comment claimed a 0.3s pause was added between
        # paragraphs — no pause was ever inserted; the comment was wrong.
        with open(concat_list, 'w') as f:
            for wav in chunks:
                f.write(f"file '{wav}'\n")

        subprocess.run([
            'ffmpeg', '-f', 'concat', '-safe', '0', '-i', concat_list,
            '-c:a', 'libmp3lame', '-b:a', '192k',
            output_mp3, '-y', '-loglevel', 'quiet'
        ], check=True)
    finally:
        # FIX: clean up intermediates even on failure (previously the
        # chunk WAVs leaked on error and the concat list file was never
        # removed at all).
        for wav in chunks:
            if os.path.exists(wav):
                os.unlink(wav)
        if os.path.exists(concat_list):
            os.unlink(concat_list)

    return output_mp3
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Getting Audio Duration
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
import subprocess, json
|
|
134
|
+
|
|
135
|
+
def get_audio_duration(path: str) -> float:
    """
    Return the duration of an audio (or any media) file in seconds.

    Uses ffprobe's JSON output (-show_format → format.duration).

    Raises:
        subprocess.CalledProcessError: If ffprobe fails (e.g. missing or
            unreadable file). FIX: the original omitted check=True, so a
            bad path produced empty stdout and surfaced as an opaque
            json.JSONDecodeError instead of a clear process error.
        KeyError: If ffprobe output lacks a format/duration entry.
    """
    result = subprocess.run([
        'ffprobe', '-v', 'quiet', '-print_format', 'json',
        '-show_format', path
    ], capture_output=True, text=True, check=True)
    data = json.loads(result.stdout)
    return float(data['format']['duration'])
|
|
143
|
+
```
|