smart-tts 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smart_tts-1.9.0/.env.example +13 -0
- smart_tts-1.9.0/.github/workflows/publish.yml +83 -0
- smart_tts-1.9.0/.gitignore +72 -0
- smart_tts-1.9.0/.vscode/launch.json +23 -0
- smart_tts-1.9.0/LICENSE +21 -0
- smart_tts-1.9.0/PKG-INFO +394 -0
- smart_tts-1.9.0/README.md +362 -0
- smart_tts-1.9.0/example.py +194 -0
- smart_tts-1.9.0/pyproject.toml +68 -0
- smart_tts-1.9.0/smart_tts/__init__.py +44 -0
- smart_tts-1.9.0/smart_tts/_version.py +24 -0
- smart_tts-1.9.0/smart_tts/async_tts.py +321 -0
- smart_tts-1.9.0/smart_tts/audio/__init__.py +4 -0
- smart_tts-1.9.0/smart_tts/audio/mixer.py +117 -0
- smart_tts-1.9.0/smart_tts/audio/probe.py +32 -0
- smart_tts-1.9.0/smart_tts/client/__init__.py +0 -0
- smart_tts-1.9.0/smart_tts/client/elevenlabs.py +159 -0
- smart_tts-1.9.0/smart_tts/client/fish.py +185 -0
- smart_tts-1.9.0/smart_tts/config.py +63 -0
- smart_tts-1.9.0/smart_tts/exceptions.py +39 -0
- smart_tts-1.9.0/smart_tts/models.py +136 -0
- smart_tts-1.9.0/smart_tts/script/__init__.py +3 -0
- smart_tts-1.9.0/smart_tts/script/breaks.py +44 -0
- smart_tts-1.9.0/smart_tts/templates.py +154 -0
- smart_tts-1.9.0/smart_tts/text.py +26 -0
- smart_tts-1.9.0/smart_tts/tts.py +331 -0
- smart_tts-1.9.0/smart_tts/voices/__init__.py +3 -0
- smart_tts-1.9.0/smart_tts/voices/registry.py +144 -0
- smart_tts-1.9.0/spec.md +435 -0
- smart_tts-1.9.0/templates/investigation.json +21 -0
- smart_tts-1.9.0/tests/conftest.py +40 -0
- smart_tts-1.9.0/tests/test_async_smart_tts.py +54 -0
- smart_tts-1.9.0/tests/test_breaks.py +14 -0
- smart_tts-1.9.0/tests/test_config.py +33 -0
- smart_tts-1.9.0/tests/test_fish_client.py +67 -0
- smart_tts-1.9.0/tests/test_smart_tts.py +72 -0
- smart_tts-1.9.0/tests/test_templates.py +126 -0
- smart_tts-1.9.0/uv.lock +583 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Required
|
|
2
|
+
FISH_API_KEY=your_fish_api_key
|
|
3
|
+
|
|
4
|
+
# Optional — for music/ambient generation and mixing
|
|
5
|
+
ELEVENLABS_API_KEY=your_elevenlabs_api_key
|
|
6
|
+
|
|
7
|
+
# Optional
|
|
8
|
+
# Use s2.1-pro-free if you have no paid Fish credits (skips 402 fallback)
|
|
9
|
+
FISH_DEFAULT_MODEL=s2.1-pro
|
|
10
|
+
FISH_DEFAULT_VOICE_ID=67d37d81cb7340b391e9461d6671de03
|
|
11
|
+
ELEVENLABS_CACHE_DIR=~/.cache/smart-tts
|
|
12
|
+
ELEVENLABS_DEFAULT_OUTPUT_FORMAT=mp3_44100_128
|
|
13
|
+
FISH_API_URL=https://api.fish.audio/v1/tts
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
inputs:
|
|
9
|
+
tag:
|
|
10
|
+
description: "Git tag to publish (e.g. v0.1.0)"
|
|
11
|
+
required: true
|
|
12
|
+
type: string
|
|
13
|
+
|
|
14
|
+
permissions:
|
|
15
|
+
contents: read
|
|
16
|
+
id-token: write
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
publish:
|
|
20
|
+
runs-on: ubuntu-latest
|
|
21
|
+
environment:
|
|
22
|
+
name: pypi
|
|
23
|
+
url: https://pypi.org/p/smart-tts
|
|
24
|
+
|
|
25
|
+
steps:
|
|
26
|
+
- name: Resolve release tag
|
|
27
|
+
id: release
|
|
28
|
+
run: |
|
|
29
|
+
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
|
30
|
+
TAG="${{ github.event.inputs.tag }}"
|
|
31
|
+
else
|
|
32
|
+
TAG="${GITHUB_REF_NAME}"
|
|
33
|
+
fi
|
|
34
|
+
if [[ ! "${TAG}" =~ ^v[0-9] ]]; then
|
|
35
|
+
echo "Tag must look like v0.1.0, got: ${TAG}" >&2
|
|
36
|
+
exit 1
|
|
37
|
+
fi
|
|
38
|
+
VERSION="${TAG#v}"
|
|
39
|
+
echo "tag=${TAG}" >> "$GITHUB_OUTPUT"
|
|
40
|
+
echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
|
|
41
|
+
echo "Publishing version: ${VERSION}"
|
|
42
|
+
|
|
43
|
+
- name: Checkout
|
|
44
|
+
uses: actions/checkout@v4
|
|
45
|
+
with:
|
|
46
|
+
fetch-depth: 0
|
|
47
|
+
ref: ${{ steps.release.outputs.tag }}
|
|
48
|
+
|
|
49
|
+
- name: Install uv
|
|
50
|
+
uses: astral-sh/setup-uv@v5
|
|
51
|
+
with:
|
|
52
|
+
enable-cache: true
|
|
53
|
+
|
|
54
|
+
- name: Set up Python
|
|
55
|
+
run: uv python install 3.11
|
|
56
|
+
|
|
57
|
+
- name: Build package
|
|
58
|
+
run: uv build
|
|
59
|
+
|
|
60
|
+
- name: Verify built version
|
|
61
|
+
run: |
|
|
62
|
+
BUILT_VERSION="$(uv run python - <<'PY'
|
|
63
|
+
import zipfile
|
|
64
|
+
from pathlib import Path
|
|
65
|
+
|
|
66
|
+
wheel = next(Path("dist").glob("*.whl"))
|
|
67
|
+
with zipfile.ZipFile(wheel) as archive:
|
|
68
|
+
for name in archive.namelist():
|
|
69
|
+
if name.endswith("METADATA"):
|
|
70
|
+
for line in archive.read(name).decode().splitlines():
|
|
71
|
+
if line.startswith("Version:"):
|
|
72
|
+
print(line.split(": ", 1)[1])
|
|
73
|
+
raise SystemExit(0)
|
|
74
|
+
raise SystemExit("Version not found in wheel metadata")
|
|
75
|
+
PY
|
|
76
|
+
)"
|
|
77
|
+
echo "Built wheel version: ${BUILT_VERSION}"
|
|
78
|
+
test "${BUILT_VERSION}" = "${{ steps.release.outputs.version }}"
|
|
79
|
+
|
|
80
|
+
- name: Publish to PyPI
|
|
81
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
82
|
+
with:
|
|
83
|
+
packages-dir: dist/
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Generated by hatch-vcs during build
|
|
2
|
+
smart_tts/_version.py
|
|
3
|
+
|
|
4
|
+
# Python
|
|
5
|
+
__pycache__/
|
|
6
|
+
*.py[cod]
|
|
7
|
+
*$py.class
|
|
8
|
+
*.so
|
|
9
|
+
*.egg
|
|
10
|
+
*.egg-info/
|
|
11
|
+
dist/
|
|
12
|
+
build/
|
|
13
|
+
*.manifest
|
|
14
|
+
*.spec
|
|
15
|
+
|
|
16
|
+
# Virtual environments
|
|
17
|
+
.venv/
|
|
18
|
+
venv/
|
|
19
|
+
env/
|
|
20
|
+
ENV/
|
|
21
|
+
|
|
22
|
+
# Packaging / lock tooling
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
.pip-cache/
|
|
25
|
+
|
|
26
|
+
# Tests & lint
|
|
27
|
+
.pytest_cache/
|
|
28
|
+
.ruff_cache/
|
|
29
|
+
.mypy_cache/
|
|
30
|
+
.coverage
|
|
31
|
+
.coverage.*
|
|
32
|
+
htmlcov/
|
|
33
|
+
.tox/
|
|
34
|
+
.nox/
|
|
35
|
+
|
|
36
|
+
# Secrets & local env
|
|
37
|
+
.env
|
|
38
|
+
.env.*
|
|
39
|
+
!.env.example
|
|
40
|
+
|
|
41
|
+
# Local diskcache / runtime data
|
|
42
|
+
.cache/
|
|
43
|
+
cache/
|
|
44
|
+
*.db
|
|
45
|
+
*.sqlite3
|
|
46
|
+
|
|
47
|
+
# Generated audio output
|
|
48
|
+
output/
|
|
49
|
+
output.*
|
|
50
|
+
*.mp3
|
|
51
|
+
*.wav
|
|
52
|
+
*.ogg
|
|
53
|
+
*.m4a
|
|
54
|
+
|
|
55
|
+
# IDE & editors
|
|
56
|
+
.vscode/*
|
|
57
|
+
!.vscode/launch.json
|
|
58
|
+
.idea/
|
|
59
|
+
*.swp
|
|
60
|
+
*.swo
|
|
61
|
+
*~
|
|
62
|
+
|
|
63
|
+
# OS
|
|
64
|
+
.DS_Store
|
|
65
|
+
Thumbs.db
|
|
66
|
+
desktop.ini
|
|
67
|
+
|
|
68
|
+
# Cursor
|
|
69
|
+
.cursor/
|
|
70
|
+
|
|
71
|
+
# Logs
|
|
72
|
+
*.log
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
// Use IntelliSense to learn about possible attributes.
|
|
3
|
+
// Hover to view descriptions of existing attributes.
|
|
4
|
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
|
5
|
+
"version": "0.2.0",
|
|
6
|
+
"configurations": [
|
|
7
|
+
|
|
8
|
+
{
|
|
9
|
+
"name": "Python Debugger: Sync",
|
|
10
|
+
"type": "debugpy",
|
|
11
|
+
"request": "launch",
|
|
12
|
+
"program": "example.py",
|
|
13
|
+
"console": "integratedTerminal"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "Python Debugger: Async",
|
|
17
|
+
"type": "debugpy",
|
|
18
|
+
"request": "launch",
|
|
19
|
+
"program": "example_async.py",
|
|
20
|
+
"console": "integratedTerminal"
|
|
21
|
+
}
|
|
22
|
+
]
|
|
23
|
+
}
|
smart_tts-1.9.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 bad.robot
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
smart_tts-1.9.0/PKG-INFO
ADDED
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: smart-tts
|
|
3
|
+
Version: 1.9.0
|
|
4
|
+
Summary: Smart TTS: Fish Audio speech, ElevenLabs music/ambient, ffmpeg mixing
|
|
5
|
+
Project-URL: Homepage, https://github.com/vpuhoff/smart-tts
|
|
6
|
+
Project-URL: Repository, https://github.com/vpuhoff/smart-tts
|
|
7
|
+
Project-URL: Issues, https://github.com/vpuhoff/smart-tts/issues
|
|
8
|
+
Author: vpuhoff
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: elevenlabs,fish-audio,smart-tts,text-to-speech,tts
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Requires-Dist: diskcache>=5.6
|
|
23
|
+
Requires-Dist: elevenlabs>=2.0
|
|
24
|
+
Requires-Dist: httpx>=0.27
|
|
25
|
+
Requires-Dist: python-dotenv>=1.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: respx>=0.21; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# smart-tts
|
|
34
|
+
|
|
35
|
+
High-level Python library for expressive speech production: [Fish Audio](https://fish.audio) TTS, [ElevenLabs](https://elevenlabs.io) music and ambient beds, and ffmpeg layer mixing.
|
|
36
|
+
|
|
37
|
+
Pass raw text (optionally with SSML `<break>` tags) — the library converts pauses to Fish Audio paralanguage, synthesizes speech in one continuous pass, and can mix music and ambient underneath.
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- **SmartTTS facade** — one pipeline from text to audio
|
|
42
|
+
- **Fish Audio speech** — single-pass synthesis via `s2.1-pro` (fallback to `s2.1-pro-free` on 402)
|
|
43
|
+
- **SSML breaks → Fish S2 tags** — `<break time="1.2s"/>` becomes `[long pause]`
|
|
44
|
+
- **ElevenLabs beds** — optional music (`music.compose`) and ambient (`text_to_sound_effects`)
|
|
45
|
+
- **ffmpeg mixing** — speech + music + ambient with volume weights
|
|
46
|
+
- **Sync & async API** — `SmartTTS` / `AsyncSmartTTS` with the same signatures
|
|
47
|
+
- **Voice registry** — local `diskcache` for registered Fish `reference_id` voices
|
|
48
|
+
|
|
49
|
+
## Requirements
|
|
50
|
+
|
|
51
|
+
- Python 3.11+
|
|
52
|
+
- `ffmpeg` and `ffprobe` available in `PATH` (required only when mixing layers)
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install smart-tts
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Or from source:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
git clone https://github.com/vpuhoff/smart-tts.git
|
|
64
|
+
cd smart-tts
|
|
65
|
+
uv sync --dev
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Quick start
|
|
69
|
+
|
|
70
|
+
1. Copy `.env.example` to `.env` and set your API keys:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
cp .env.example .env
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
2. Run synthesis:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from pathlib import Path
|
|
80
|
+
|
|
81
|
+
from smart_tts import SmartTTS, SynthesisTask, VoiceSettings
|
|
82
|
+
|
|
83
|
+
tts = SmartTTS.from_env()
|
|
84
|
+
tts.sync_voices()
|
|
85
|
+
|
|
86
|
+
result = tts.synthesize_to_file(
|
|
87
|
+
SynthesisTask(
|
|
88
|
+
text='Центр, <break time="1.2s" /> на связи резидентура.',
|
|
89
|
+
language="ru",
|
|
90
|
+
style="serious",
|
|
91
|
+
emotion="serious",
|
|
92
|
+
voice_id="67d37d81cb7340b391e9461d6671de03",
|
|
93
|
+
voice_settings=VoiceSettings(temperature=0.7, speed=1.0),
|
|
94
|
+
),
|
|
95
|
+
Path("output.mp3"),
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
print(result.enhanced_text)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
See [`example.py`](example.py) for a full demo: a detective radio report with speech variants, custom music, and remixing.
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
uv run python example.py
|
|
105
|
+
uv run python example.py --variants 2
|
|
106
|
+
uv run python example.py --remix-only --music back.mp3
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Synthesis with music and ambient
|
|
110
|
+
|
|
111
|
+
Pass bed prompts or file paths in `SynthesisTask` — `synthesize()` generates speech, then mixes layers automatically:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
result = tts.synthesize(
|
|
115
|
+
SynthesisTask(
|
|
116
|
+
text="...",
|
|
117
|
+
voice_id="your-fish-reference-id",
|
|
118
|
+
music_prompt="Melancholic noir piano, instrumental, no vocals",
|
|
119
|
+
ambient_prompt="Subtle radio hum, tape hiss, seamless loop",
|
|
120
|
+
music_volume=0.32,
|
|
121
|
+
ambient_volume=0.18,
|
|
122
|
+
bed_weight=0.68,
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Or provide pre-recorded files:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
SynthesisTask(
|
|
131
|
+
text="...",
|
|
132
|
+
music_path="back.mp3",
|
|
133
|
+
ambient_path="ambient.wav",
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
`ELEVENLABS_API_KEY` is required for API-generated beds. Custom files work without it.
|
|
138
|
+
|
|
139
|
+
## Task parameters
|
|
140
|
+
|
|
141
|
+
### Core fields
|
|
142
|
+
|
|
143
|
+
| Field | Description |
|
|
144
|
+
|-------|-------------|
|
|
145
|
+
| `text` | Source script; SSML `<break time="Xs"/>` tags are converted to Fish pauses when `enhance_text=True` |
|
|
146
|
+
| `voice_id` | Fish Audio `reference_id` |
|
|
147
|
+
| `language` | Language hint (metadata / emotion mapping) |
|
|
148
|
+
| `style`, `emotion`, `use_case` | Context hints; `emotion` adds a Fish paralanguage prefix |
|
|
149
|
+
| `enhance_text` | `True` — break conversion + emotion prefix; `False` — raw text |
|
|
150
|
+
| `voice_settings` | `temperature`, `speed`, `top_p`, `repetition_penalty` for Fish API |
|
|
151
|
+
| `model` | Fish model (see table below) |
|
|
152
|
+
|
|
153
|
+
### Mixing fields
|
|
154
|
+
|
|
155
|
+
| Field | Default | Description |
|
|
156
|
+
|-------|---------|-------------|
|
|
157
|
+
| `music_prompt` | — | ElevenLabs Music API prompt |
|
|
158
|
+
| `ambient_prompt` | — | ElevenLabs Sound Effects API prompt |
|
|
159
|
+
| `music_path` | — | Pre-recorded music file |
|
|
160
|
+
| `ambient_path` | — | Pre-recorded ambient file |
|
|
161
|
+
| `music_volume` | `0.32` | Music level in mix |
|
|
162
|
+
| `ambient_volume` | `0.18` | Ambient level in mix |
|
|
163
|
+
| `speech_volume` | `1.0` | Speech gain in mix (`1.0` = unchanged) |
|
|
164
|
+
| `bed_weight` | `0.68` | Background bed weight vs speech |
|
|
165
|
+
|
|
166
|
+
### SSML breaks
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
# Input
|
|
170
|
+
'Срочное донесение. <break time="1.2s" /> Обнаружена цель.'
|
|
171
|
+
|
|
172
|
+
# After enhance_text (Fish S2 [bracket] tags)
|
|
173
|
+
'Срочное донесение. [long pause] Обнаружена цель.'
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
| Pause | Fish markup |
|
|
177
|
+
|-------|-------------|
|
|
178
|
+
| ≥ 1.2 s | `[long pause]` |
|
|
179
|
+
| ≥ 0.75 s | `[pause]` |
|
|
180
|
+
| ≥ 0.4 s | `...` |
|
|
181
|
+
|
|
182
|
+
### Emotion tags
|
|
183
|
+
|
|
184
|
+
Fish Audio S2/S2.1 interprets `[bracket]` tags as delivery hints (not spoken text). Parenthesized prose like `(soft tone)` is **not** supported and may be read aloud.
|
|
185
|
+
|
|
186
|
+
| `emotion` | Tag added |
|
|
187
|
+
|-----------|-----------|
|
|
188
|
+
| `warm` | `[warm]` |
|
|
189
|
+
| `serious` | `[serious]` |
|
|
190
|
+
| `excited` | `[excited]` |
|
|
191
|
+
| `sad` | `[sad]` |
|
|
192
|
+
| `whisper` | `[whisper]` |
|
|
193
|
+
| `calm` | `[calm]` |
|
|
194
|
+
|
|
195
|
+
## Configuration
|
|
196
|
+
|
|
197
|
+
### Required
|
|
198
|
+
|
|
199
|
+
| Variable | Description |
|
|
200
|
+
|----------|-------------|
|
|
201
|
+
| `FISH_API_KEY` | Fish Audio API key |
|
|
202
|
+
|
|
203
|
+
### Optional
|
|
204
|
+
|
|
205
|
+
| Variable | Default | Description |
|
|
206
|
+
|----------|---------|-------------|
|
|
207
|
+
| `ELEVENLABS_API_KEY` | — | For music/ambient generation |
|
|
208
|
+
| `FISH_DEFAULT_MODEL` | `s2.1-pro` | Fish model (`s2.1-pro-free` if no paid credits) |
|
|
209
|
+
| `FISH_DEFAULT_VOICE_ID` | Kanevsky voice reference ID | Fallback `reference_id` |
|
|
210
|
+
| `FISH_API_URL` | `https://api.fish.audio/v1/tts` | Fish TTS endpoint |
|
|
211
|
+
| `ELEVENLABS_CACHE_DIR` | `~/.cache/smart-tts` | Voice registry cache |
|
|
212
|
+
| `ELEVENLABS_DEFAULT_OUTPUT_FORMAT` | `mp3_44100_128` | Output format |
|
|
213
|
+
|
|
214
|
+
Programmatic configuration:
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from smart_tts import SmartTTS, SmartTTSConfig, TTSModel
|
|
218
|
+
|
|
219
|
+
config = SmartTTSConfig(
|
|
220
|
+
fish_api_key="...",
|
|
221
|
+
elevenlabs_api_key="...", # optional
|
|
222
|
+
default_model=TTSModel.ELEVEN_V3,
|
|
223
|
+
default_voice_id="67d37d81cb7340b391e9461d6671de03",
|
|
224
|
+
)
|
|
225
|
+
tts = SmartTTS(config)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## Usage
|
|
229
|
+
|
|
230
|
+
### Synthesis pipeline
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from smart_tts import SmartTTS, SynthesisTask, TTSModel, VoiceSettings
|
|
234
|
+
|
|
235
|
+
with SmartTTS.from_env() as tts:
|
|
236
|
+
tts.sync_voices()
|
|
237
|
+
result = tts.synthesize(
|
|
238
|
+
SynthesisTask(
|
|
239
|
+
text="Срочное донесение.",
|
|
240
|
+
voice_id="67d37d81cb7340b391e9461d6671de03",
|
|
241
|
+
model=TTSModel.ELEVEN_V3,
|
|
242
|
+
emotion="serious",
|
|
243
|
+
voice_settings=VoiceSettings(temperature=0.7),
|
|
244
|
+
)
|
|
245
|
+
)
|
|
246
|
+
audio_bytes = result.audio
|
|
247
|
+
prepared_text = result.enhanced_text
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Generation templates
|
|
251
|
+
|
|
252
|
+
Use `GenerationTemplate` to bundle speech, background, and mix settings. Pass only the script text at synthesis time:
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
from pathlib import Path
|
|
256
|
+
|
|
257
|
+
from smart_tts import INVESTIGATION, GenerationTemplate, SmartTTS, synthesize_with_template
|
|
258
|
+
|
|
259
|
+
# Built-in preset
|
|
260
|
+
with SmartTTS.from_env() as tts:
|
|
261
|
+
speech = tts.synthesize_text(
|
|
262
|
+
'Срочное донесение. <break time="1.2s" /> Обнаружена цель.',
|
|
263
|
+
INVESTIGATION,
|
|
264
|
+
mix=False, # speech only
|
|
265
|
+
)
|
|
266
|
+
tts.synthesize_text_to_file(
|
|
267
|
+
"Конец связи.",
|
|
268
|
+
INVESTIGATION,
|
|
269
|
+
Path("output/speech.mp3"),
|
|
270
|
+
mix=False,
|
|
271
|
+
)
|
|
272
|
+
tts.remix_file(
|
|
273
|
+
Path("output/speech.mp3"),
|
|
274
|
+
Path("output/final.mp3"),
|
|
275
|
+
INVESTIGATION,
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
# Custom template or overrides
|
|
279
|
+
template = INVESTIGATION.with_overrides(
|
|
280
|
+
speech_volume=1.2,
|
|
281
|
+
music_path="back.mp3",
|
|
282
|
+
ambient_path=None,
|
|
283
|
+
)
|
|
284
|
+
result = synthesize_with_template("Привет!", template, mix=True)
|
|
285
|
+
|
|
286
|
+
# Load/save JSON recipes (see templates/investigation.json)
|
|
287
|
+
template = GenerationTemplate.from_json_file("templates/investigation.json")
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
| Method | Description |
|
|
291
|
+
|--------|-------------|
|
|
292
|
+
| `template.to_task(text, mix=True, **overrides)` | Build `SynthesisTask` |
|
|
293
|
+
| `template.with_overrides(**kwargs)` | Copy with changed fields |
|
|
294
|
+
| `GenerationTemplate.from_dict()` / `from_json_file()` | Deserialize |
|
|
295
|
+
| `template.save_json(path)` | Serialize to JSON |
|
|
296
|
+
| `get_template("investigation")` | Built-in preset lookup |
|
|
297
|
+
| `tts.synthesize_text(text, template)` | Synthesize with template |
|
|
298
|
+
| `tts.remix_file(speech, output, template)` | Mix speech + beds |
|
|
299
|
+
|
|
300
|
+
### Preview prepared text without TTS
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
prepared = tts.enhance_text_only(
|
|
304
|
+
SynthesisTask(
|
|
305
|
+
text='Центр, <break time="1.2s" /> на связи.',
|
|
306
|
+
emotion="warm",
|
|
307
|
+
)
|
|
308
|
+
)
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### One-liner
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
from smart_tts import synthesize
|
|
315
|
+
|
|
316
|
+
result = synthesize(
|
|
317
|
+
"Привет, мир!",
|
|
318
|
+
language="ru",
|
|
319
|
+
style="neutral",
|
|
320
|
+
)
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### Async API
|
|
324
|
+
|
|
325
|
+
```python
|
|
326
|
+
import asyncio
|
|
327
|
+
from pathlib import Path
|
|
328
|
+
|
|
329
|
+
from smart_tts import AsyncSmartTTS, SynthesisTask, asynthesize
|
|
330
|
+
|
|
331
|
+
async def main() -> None:
|
|
332
|
+
async with AsyncSmartTTS.from_env() as tts:
|
|
333
|
+
result = await tts.synthesize_to_file(
|
|
334
|
+
SynthesisTask(text="Привет!", language="ru"),
|
|
335
|
+
Path("output.mp3"),
|
|
336
|
+
)
|
|
337
|
+
print(result.enhanced_text)
|
|
338
|
+
|
|
339
|
+
asyncio.run(main())
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
### Voice registry
|
|
343
|
+
|
|
344
|
+
```python
|
|
345
|
+
voices = tts.list_voices()
|
|
346
|
+
voice = tts.get_voice("reference-id")
|
|
347
|
+
tts.sync_voices(force=True) # refresh default voice in cache
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
Fish voices are referenced by `reference_id` from the Fish Audio console. Register custom voices via `VoiceRegistry.register_voice()` or set `FISH_DEFAULT_VOICE_ID`.
|
|
351
|
+
|
|
352
|
+
## Models (`TTSModel`)
|
|
353
|
+
|
|
354
|
+
Legacy enum names map to Fish Audio models:
|
|
355
|
+
|
|
356
|
+
| Enum | Fish model | Notes |
|
|
357
|
+
|------|------------|-------|
|
|
358
|
+
| `TTSModel.ELEVEN_V3` | `s2.1-pro` | Default; auto-fallback to `s2.1-pro-free` on 402 |
|
|
359
|
+
| `TTSModel.ELEVEN_MULTILINGUAL_V2` | `s2-pro` | |
|
|
360
|
+
| `TTSModel.ELEVEN_FLASH_V2_5` | `s1` | |
|
|
361
|
+
|
|
362
|
+
## Package layout
|
|
363
|
+
|
|
364
|
+
```
|
|
365
|
+
smart_tts/
|
|
366
|
+
├── tts.py, async_tts.py # SmartTTS facade
|
|
367
|
+
├── templates.py # GenerationTemplate presets
|
|
368
|
+
├── config.py, models.py
|
|
369
|
+
├── client/
|
|
370
|
+
│ ├── fish.py # Fish Audio TTS
|
|
371
|
+
│ └── elevenlabs.py # Music + ambient beds
|
|
372
|
+
├── script/breaks.py # SSML → Fish paralanguage
|
|
373
|
+
├── audio/mixer.py # ffmpeg mix_tracks
|
|
374
|
+
├── text.py # prepare_text()
|
|
375
|
+
└── voices/registry.py # diskcache voice registry
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
## Development
|
|
379
|
+
|
|
380
|
+
```bash
|
|
381
|
+
uv sync --dev
|
|
382
|
+
uv run pytest
|
|
383
|
+
uv run ruff check .
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
## License
|
|
387
|
+
|
|
388
|
+
MIT — see [LICENSE](LICENSE).
|
|
389
|
+
|
|
390
|
+
## Links
|
|
391
|
+
|
|
392
|
+
- [GitHub repository](https://github.com/vpuhoff/smart-tts)
|
|
393
|
+
- [PyPI package](https://pypi.org/project/smart-tts/)
|
|
394
|
+
- [Design specification (Russian)](spec.md)
|