wyoming-piper 1.5.3__py3-none-any.whl → 2.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wyoming_piper/__init__.py +1 -0
- wyoming_piper/__main__.py +50 -24
- wyoming_piper/download.py +41 -38
- wyoming_piper/handler.py +206 -77
- wyoming_piper/voices.json +420 -0
- {wyoming_piper-1.5.3.dist-info → wyoming_piper-2.1.2.dist-info}/METADATA +16 -14
- wyoming_piper-2.1.2.dist-info/RECORD +13 -0
- {wyoming_piper-1.5.3.dist-info → wyoming_piper-2.1.2.dist-info}/WHEEL +1 -1
- wyoming_piper/process.py +0 -171
- wyoming_piper-1.5.3.dist-info/RECORD +0 -14
- {wyoming_piper-1.5.3.dist-info → wyoming_piper-2.1.2.dist-info}/entry_points.txt +0 -0
- {wyoming_piper-1.5.3.dist-info → wyoming_piper-2.1.2.dist-info}/licenses/LICENSE.md +0 -0
- {wyoming_piper-1.5.3.dist-info → wyoming_piper-2.1.2.dist-info}/top_level.txt +0 -0
wyoming_piper/__init__.py
CHANGED
wyoming_piper/__main__.py
CHANGED
|
@@ -8,12 +8,11 @@ from pathlib import Path
|
|
|
8
8
|
from typing import Any, Dict, Set
|
|
9
9
|
|
|
10
10
|
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
|
|
11
|
-
from wyoming.server import AsyncServer
|
|
11
|
+
from wyoming.server import AsyncServer, AsyncTcpServer
|
|
12
12
|
|
|
13
13
|
from . import __version__
|
|
14
|
-
from .download import find_voice, get_voices
|
|
14
|
+
from .download import ensure_voice_exists, find_voice, get_voices
|
|
15
15
|
from .handler import PiperEventHandler
|
|
16
|
-
from .process import PiperProcessManager
|
|
17
16
|
|
|
18
17
|
_LOGGER = logging.getLogger(__name__)
|
|
19
18
|
|
|
@@ -21,17 +20,20 @@ _LOGGER = logging.getLogger(__name__)
|
|
|
21
20
|
async def main() -> None:
|
|
22
21
|
"""Main entry point."""
|
|
23
22
|
parser = argparse.ArgumentParser()
|
|
24
|
-
parser.add_argument(
|
|
25
|
-
"--piper",
|
|
26
|
-
required=True,
|
|
27
|
-
help="Path to piper executable",
|
|
28
|
-
)
|
|
29
23
|
parser.add_argument(
|
|
30
24
|
"--voice",
|
|
31
25
|
required=True,
|
|
32
26
|
help="Default Piper voice to use (e.g., en_US-lessac-medium)",
|
|
33
27
|
)
|
|
34
28
|
parser.add_argument("--uri", default="stdio://", help="unix:// or tcp://")
|
|
29
|
+
#
|
|
30
|
+
parser.add_argument(
|
|
31
|
+
"--zeroconf",
|
|
32
|
+
nargs="?",
|
|
33
|
+
const="piper",
|
|
34
|
+
help="Enable discovery over zeroconf with optional name (default: piper)",
|
|
35
|
+
)
|
|
36
|
+
#
|
|
35
37
|
parser.add_argument(
|
|
36
38
|
"--data-dir",
|
|
37
39
|
required=True,
|
|
@@ -48,17 +50,18 @@ async def main() -> None:
|
|
|
48
50
|
)
|
|
49
51
|
parser.add_argument("--noise-scale", type=float, help="Generator noise")
|
|
50
52
|
parser.add_argument("--length-scale", type=float, help="Phoneme length")
|
|
51
|
-
parser.add_argument(
|
|
53
|
+
parser.add_argument(
|
|
54
|
+
"--noise-w-scale", "--noise-w", type=float, help="Phoneme width noise"
|
|
55
|
+
)
|
|
52
56
|
#
|
|
53
57
|
parser.add_argument(
|
|
54
58
|
"--auto-punctuation", default=".?!", help="Automatically add punctuation"
|
|
55
59
|
)
|
|
56
60
|
parser.add_argument("--samples-per-chunk", type=int, default=1024)
|
|
57
61
|
parser.add_argument(
|
|
58
|
-
"--
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
help="Maximum number of piper process to run simultaneously (default: 1)",
|
|
62
|
+
"--no-streaming",
|
|
63
|
+
action="store_true",
|
|
64
|
+
help="Disable audio streaming on sentence boundaries",
|
|
62
65
|
)
|
|
63
66
|
#
|
|
64
67
|
parser.add_argument(
|
|
@@ -67,6 +70,12 @@ async def main() -> None:
|
|
|
67
70
|
help="Download latest voices.json during startup",
|
|
68
71
|
)
|
|
69
72
|
#
|
|
73
|
+
parser.add_argument(
|
|
74
|
+
"--use-cuda",
|
|
75
|
+
action="store_true",
|
|
76
|
+
help="Use CUDA if available (requires onnxruntime-gpu)",
|
|
77
|
+
)
|
|
78
|
+
#
|
|
70
79
|
parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
|
|
71
80
|
parser.add_argument(
|
|
72
81
|
"--log-format", default=logging.BASIC_FORMAT, help="Format for log messages"
|
|
@@ -113,12 +122,14 @@ async def main() -> None:
|
|
|
113
122
|
voice_info.get("espeak", {}).get("voice", voice_name.split("_")[0]),
|
|
114
123
|
)
|
|
115
124
|
],
|
|
116
|
-
speakers=
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
125
|
+
speakers=(
|
|
126
|
+
[
|
|
127
|
+
TtsVoiceSpeaker(name=speaker_name)
|
|
128
|
+
for speaker_name in voice_info["speaker_id_map"]
|
|
129
|
+
]
|
|
130
|
+
if voice_info.get("speaker_id_map")
|
|
131
|
+
else None
|
|
132
|
+
),
|
|
122
133
|
)
|
|
123
134
|
for voice_name, voice_info in voices_info.items()
|
|
124
135
|
if not voice_info.get("_is_alias", False)
|
|
@@ -180,26 +191,41 @@ async def main() -> None:
|
|
|
180
191
|
installed=True,
|
|
181
192
|
voices=sorted(voices, key=lambda v: v.name),
|
|
182
193
|
version=__version__,
|
|
194
|
+
supports_synthesize_streaming=(not args.no_streaming),
|
|
183
195
|
)
|
|
184
196
|
],
|
|
185
197
|
)
|
|
186
198
|
|
|
187
|
-
|
|
199
|
+
# Ensure default voice is downloaded
|
|
200
|
+
voice_info = voices_info.get(args.voice, {})
|
|
201
|
+
voice_name = voice_info.get("key", args.voice)
|
|
202
|
+
assert voice_name is not None
|
|
188
203
|
|
|
189
|
-
|
|
190
|
-
# Other voices will be loaded on-demand.
|
|
191
|
-
await process_manager.get_process()
|
|
204
|
+
ensure_voice_exists(voice_name, args.data_dir, args.download_dir, voices_info)
|
|
192
205
|
|
|
193
206
|
# Start server
|
|
194
207
|
server = AsyncServer.from_uri(args.uri)
|
|
195
208
|
|
|
209
|
+
if args.zeroconf:
|
|
210
|
+
if not isinstance(server, AsyncTcpServer):
|
|
211
|
+
raise ValueError("Zeroconf requires tcp:// uri")
|
|
212
|
+
|
|
213
|
+
from wyoming.zeroconf import HomeAssistantZeroconf
|
|
214
|
+
|
|
215
|
+
tcp_server: AsyncTcpServer = server
|
|
216
|
+
hass_zeroconf = HomeAssistantZeroconf(
|
|
217
|
+
name=args.zeroconf, port=tcp_server.port, host=tcp_server.host
|
|
218
|
+
)
|
|
219
|
+
await hass_zeroconf.register_server()
|
|
220
|
+
_LOGGER.debug("Zeroconf discovery enabled")
|
|
221
|
+
|
|
196
222
|
_LOGGER.info("Ready")
|
|
197
223
|
await server.run(
|
|
198
224
|
partial(
|
|
199
225
|
PiperEventHandler,
|
|
200
226
|
wyoming_info,
|
|
201
227
|
args,
|
|
202
|
-
|
|
228
|
+
voices_info,
|
|
203
229
|
)
|
|
204
230
|
)
|
|
205
231
|
|
wyoming_piper/download.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Utility for downloading Piper voices."""
|
|
2
|
+
|
|
2
3
|
import json
|
|
3
4
|
import logging
|
|
4
5
|
import shutil
|
|
@@ -8,9 +9,7 @@ from urllib.error import URLError
|
|
|
8
9
|
from urllib.parse import quote, urlsplit, urlunsplit
|
|
9
10
|
from urllib.request import urlopen
|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"
|
|
12
|
+
URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/main/{file}"
|
|
14
13
|
|
|
15
14
|
_DIR = Path(__file__).parent
|
|
16
15
|
_LOGGER = logging.getLogger(__name__)
|
|
@@ -47,20 +46,21 @@ def get_voices(
|
|
|
47
46
|
except Exception:
|
|
48
47
|
_LOGGER.exception("Failed to update voices list")
|
|
49
48
|
|
|
49
|
+
voices_embedded = _DIR / "voices.json"
|
|
50
|
+
_LOGGER.debug("Loading %s", voices_embedded)
|
|
51
|
+
with open(voices_embedded, "r", encoding="utf-8") as voices_file:
|
|
52
|
+
voices = json.load(voices_file)
|
|
53
|
+
|
|
50
54
|
# Prefer downloaded file to embedded
|
|
51
55
|
if voices_download.exists():
|
|
52
56
|
try:
|
|
53
57
|
_LOGGER.debug("Loading %s", voices_download)
|
|
54
58
|
with open(voices_download, "r", encoding="utf-8") as voices_file:
|
|
55
|
-
|
|
59
|
+
voices.update(json.load(voices_file))
|
|
56
60
|
except Exception:
|
|
57
61
|
_LOGGER.exception("Failed to load %s", voices_download)
|
|
58
62
|
|
|
59
|
-
|
|
60
|
-
voices_embedded = _DIR / "voices.json"
|
|
61
|
-
_LOGGER.debug("Loading %s", voices_embedded)
|
|
62
|
-
with open(voices_embedded, "r", encoding="utf-8") as voices_file:
|
|
63
|
-
return json.load(voices_file)
|
|
63
|
+
return voices
|
|
64
64
|
|
|
65
65
|
|
|
66
66
|
def ensure_voice_exists(
|
|
@@ -87,8 +87,7 @@ def ensure_voice_exists(
|
|
|
87
87
|
for data_dir in data_dirs:
|
|
88
88
|
data_dir = Path(data_dir)
|
|
89
89
|
|
|
90
|
-
|
|
91
|
-
for file_path, file_info in voice_files.items():
|
|
90
|
+
for file_path, _file_info in voice_files.items():
|
|
92
91
|
if file_path in verified_files:
|
|
93
92
|
# Already verified this file in a different data directory
|
|
94
93
|
continue
|
|
@@ -99,34 +98,37 @@ def ensure_voice_exists(
|
|
|
99
98
|
|
|
100
99
|
data_file_path = data_dir / file_name
|
|
101
100
|
_LOGGER.debug("Checking %s", data_file_path)
|
|
102
|
-
if not data_file_path.exists():
|
|
101
|
+
if (not data_file_path.exists()) or (data_file_path.stat().st_size == 0):
|
|
103
102
|
_LOGGER.debug("Missing %s", data_file_path)
|
|
104
103
|
files_to_download.add(file_path)
|
|
105
104
|
continue
|
|
106
105
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
106
|
+
# Don't bother validating sizes or hashes.
|
|
107
|
+
# This causes more problems than it's worth.
|
|
108
|
+
#
|
|
109
|
+
# expected_size = file_info["size_bytes"]
|
|
110
|
+
# actual_size = data_file_path.stat().st_size
|
|
111
|
+
# if expected_size != actual_size:
|
|
112
|
+
# _LOGGER.warning(
|
|
113
|
+
# "Wrong size (expected=%s, actual=%s) for %s",
|
|
114
|
+
# expected_size,
|
|
115
|
+
# actual_size,
|
|
116
|
+
# data_file_path,
|
|
117
|
+
# )
|
|
118
|
+
# files_to_download.add(file_path)
|
|
119
|
+
# continue
|
|
120
|
+
|
|
121
|
+
# expected_hash = file_info["md5_digest"]
|
|
122
|
+
# actual_hash = get_file_hash(data_file_path)
|
|
123
|
+
# if expected_hash != actual_hash:
|
|
124
|
+
# _LOGGER.warning(
|
|
125
|
+
# "Wrong hash (expected=%s, actual=%s) for %s",
|
|
126
|
+
# expected_hash,
|
|
127
|
+
# actual_hash,
|
|
128
|
+
# data_file_path,
|
|
129
|
+
# )
|
|
130
|
+
# files_to_download.add(file_path)
|
|
131
|
+
# continue
|
|
130
132
|
|
|
131
133
|
# File exists and has been verified
|
|
132
134
|
verified_files.add(file_path)
|
|
@@ -149,9 +151,10 @@ def ensure_voice_exists(
|
|
|
149
151
|
download_file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
150
152
|
|
|
151
153
|
_LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
|
|
152
|
-
with
|
|
153
|
-
|
|
154
|
-
|
|
154
|
+
with (
|
|
155
|
+
urlopen(_quote_url(file_url)) as response,
|
|
156
|
+
open(download_file_path, "wb") as download_file,
|
|
157
|
+
):
|
|
155
158
|
shutil.copyfileobj(response, download_file)
|
|
156
159
|
|
|
157
160
|
_LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)
|
wyoming_piper/handler.py
CHANGED
|
@@ -1,30 +1,44 @@
|
|
|
1
1
|
"""Event handler for clients of the server."""
|
|
2
|
+
|
|
2
3
|
import argparse
|
|
3
|
-
import
|
|
4
|
+
import asyncio
|
|
4
5
|
import logging
|
|
5
6
|
import math
|
|
6
|
-
import
|
|
7
|
+
import tempfile
|
|
7
8
|
import wave
|
|
8
9
|
from typing import Any, Dict, Optional
|
|
9
10
|
|
|
11
|
+
from piper import PiperVoice, SynthesisConfig
|
|
12
|
+
from sentence_stream import SentenceBoundaryDetector
|
|
10
13
|
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
|
11
14
|
from wyoming.error import Error
|
|
12
15
|
from wyoming.event import Event
|
|
13
16
|
from wyoming.info import Describe, Info
|
|
14
17
|
from wyoming.server import AsyncEventHandler
|
|
15
|
-
from wyoming.tts import
|
|
18
|
+
from wyoming.tts import (
|
|
19
|
+
Synthesize,
|
|
20
|
+
SynthesizeChunk,
|
|
21
|
+
SynthesizeStart,
|
|
22
|
+
SynthesizeStop,
|
|
23
|
+
SynthesizeStopped,
|
|
24
|
+
)
|
|
16
25
|
|
|
17
|
-
from .
|
|
26
|
+
from .download import ensure_voice_exists, find_voice
|
|
18
27
|
|
|
19
28
|
_LOGGER = logging.getLogger(__name__)
|
|
20
29
|
|
|
30
|
+
# Keep the most recently used voice loaded
|
|
31
|
+
_VOICE: Optional[PiperVoice] = None
|
|
32
|
+
_VOICE_NAME: Optional[str] = None
|
|
33
|
+
_VOICE_LOCK = asyncio.Lock()
|
|
34
|
+
|
|
21
35
|
|
|
22
36
|
class PiperEventHandler(AsyncEventHandler):
|
|
23
37
|
def __init__(
|
|
24
38
|
self,
|
|
25
39
|
wyoming_info: Info,
|
|
26
40
|
cli_args: argparse.Namespace,
|
|
27
|
-
|
|
41
|
+
voices_info: Dict[str, Any],
|
|
28
42
|
*args,
|
|
29
43
|
**kwargs,
|
|
30
44
|
) -> None:
|
|
@@ -32,7 +46,10 @@ class PiperEventHandler(AsyncEventHandler):
|
|
|
32
46
|
|
|
33
47
|
self.cli_args = cli_args
|
|
34
48
|
self.wyoming_info_event = wyoming_info.event()
|
|
35
|
-
self.
|
|
49
|
+
self.voices_info = voices_info
|
|
50
|
+
self.is_streaming: Optional[bool] = None
|
|
51
|
+
self.sbd = SentenceBoundaryDetector()
|
|
52
|
+
self._synthesize: Optional[Synthesize] = None
|
|
36
53
|
|
|
37
54
|
async def handle_event(self, event: Event) -> bool:
|
|
38
55
|
if Describe.is_type(event.type):
|
|
@@ -40,20 +57,90 @@ class PiperEventHandler(AsyncEventHandler):
|
|
|
40
57
|
_LOGGER.debug("Sent info")
|
|
41
58
|
return True
|
|
42
59
|
|
|
43
|
-
if not Synthesize.is_type(event.type):
|
|
44
|
-
_LOGGER.warning("Unexpected event: %s", event)
|
|
45
|
-
return True
|
|
46
|
-
|
|
47
60
|
try:
|
|
48
|
-
|
|
61
|
+
if Synthesize.is_type(event.type):
|
|
62
|
+
if self.is_streaming:
|
|
63
|
+
# Ignore since this is only sent for compatibility reasons.
|
|
64
|
+
# For streaming, we expect:
|
|
65
|
+
# [synthesize-start] -> [synthesize-chunk]+ -> [synthesize]? -> [synthesize-stop]
|
|
66
|
+
return True
|
|
67
|
+
|
|
68
|
+
# Sent outside a stream, so we must process it
|
|
69
|
+
synthesize = Synthesize.from_event(event)
|
|
70
|
+
self._synthesize = Synthesize(text="", voice=synthesize.voice)
|
|
71
|
+
self.sbd = SentenceBoundaryDetector()
|
|
72
|
+
start_sent = False
|
|
73
|
+
for i, sentence in enumerate(self.sbd.add_chunk(synthesize.text)):
|
|
74
|
+
self._synthesize.text = sentence
|
|
75
|
+
await self._handle_synthesize(
|
|
76
|
+
self._synthesize, send_start=(i == 0), send_stop=False
|
|
77
|
+
)
|
|
78
|
+
start_sent = True
|
|
79
|
+
|
|
80
|
+
self._synthesize.text = self.sbd.finish()
|
|
81
|
+
if self._synthesize.text:
|
|
82
|
+
# Last sentence
|
|
83
|
+
await self._handle_synthesize(
|
|
84
|
+
self._synthesize, send_start=(not start_sent), send_stop=True
|
|
85
|
+
)
|
|
86
|
+
else:
|
|
87
|
+
# No final sentence
|
|
88
|
+
await self.write_event(AudioStop().event())
|
|
89
|
+
|
|
90
|
+
return True
|
|
91
|
+
|
|
92
|
+
if self.cli_args.no_streaming:
|
|
93
|
+
# Streaming is not enabled
|
|
94
|
+
return True
|
|
95
|
+
|
|
96
|
+
if SynthesizeStart.is_type(event.type):
|
|
97
|
+
# Start of a stream
|
|
98
|
+
stream_start = SynthesizeStart.from_event(event)
|
|
99
|
+
self.is_streaming = True
|
|
100
|
+
self.sbd = SentenceBoundaryDetector()
|
|
101
|
+
self._synthesize = Synthesize(text="", voice=stream_start.voice)
|
|
102
|
+
_LOGGER.debug("Text stream started: voice=%s", stream_start.voice)
|
|
103
|
+
return True
|
|
104
|
+
|
|
105
|
+
if SynthesizeChunk.is_type(event.type):
|
|
106
|
+
assert self._synthesize is not None
|
|
107
|
+
stream_chunk = SynthesizeChunk.from_event(event)
|
|
108
|
+
for sentence in self.sbd.add_chunk(stream_chunk.text):
|
|
109
|
+
_LOGGER.debug("Synthesizing stream sentence: %s", sentence)
|
|
110
|
+
self._synthesize.text = sentence
|
|
111
|
+
await self._handle_synthesize(self._synthesize)
|
|
112
|
+
|
|
113
|
+
return True
|
|
114
|
+
|
|
115
|
+
if SynthesizeStop.is_type(event.type):
|
|
116
|
+
assert self._synthesize is not None
|
|
117
|
+
self._synthesize.text = self.sbd.finish()
|
|
118
|
+
if self._synthesize.text:
|
|
119
|
+
# Final audio chunk(s)
|
|
120
|
+
await self._handle_synthesize(self._synthesize)
|
|
121
|
+
|
|
122
|
+
# End of audio
|
|
123
|
+
await self.write_event(SynthesizeStopped().event())
|
|
124
|
+
|
|
125
|
+
_LOGGER.debug("Text stream stopped")
|
|
126
|
+
return True
|
|
127
|
+
|
|
128
|
+
if not Synthesize.is_type(event.type):
|
|
129
|
+
return True
|
|
130
|
+
|
|
131
|
+
synthesize = Synthesize.from_event(event)
|
|
132
|
+
return await self._handle_synthesize(synthesize)
|
|
49
133
|
except Exception as err:
|
|
50
134
|
await self.write_event(
|
|
51
135
|
Error(text=str(err), code=err.__class__.__name__).event()
|
|
52
136
|
)
|
|
53
137
|
raise err
|
|
54
138
|
|
|
55
|
-
async def
|
|
56
|
-
synthesize =
|
|
139
|
+
async def _handle_synthesize(
|
|
140
|
+
self, synthesize: Synthesize, send_start: bool = True, send_stop: bool = True
|
|
141
|
+
) -> bool:
|
|
142
|
+
global _VOICE, _VOICE_NAME
|
|
143
|
+
|
|
57
144
|
_LOGGER.debug(synthesize)
|
|
58
145
|
|
|
59
146
|
raw_text = synthesize.text
|
|
@@ -72,75 +159,117 @@ class PiperEventHandler(AsyncEventHandler):
|
|
|
72
159
|
if not has_punctuation:
|
|
73
160
|
text = text + self.cli_args.auto_punctuation[0]
|
|
74
161
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
#
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
162
|
+
# Resolve voice
|
|
163
|
+
_LOGGER.debug("synthesize: raw_text=%s, text='%s'", raw_text, text)
|
|
164
|
+
voice_name: Optional[str] = None
|
|
165
|
+
voice_speaker: Optional[str] = None
|
|
166
|
+
if synthesize.voice is not None:
|
|
167
|
+
voice_name = synthesize.voice.name
|
|
168
|
+
voice_speaker = synthesize.voice.speaker
|
|
169
|
+
|
|
170
|
+
if voice_name is None:
|
|
171
|
+
# Default voice
|
|
172
|
+
voice_name = self.cli_args.voice
|
|
173
|
+
|
|
174
|
+
if voice_name == self.cli_args.voice:
|
|
175
|
+
# Default speaker
|
|
176
|
+
voice_speaker = voice_speaker or self.cli_args.speaker
|
|
177
|
+
|
|
178
|
+
assert voice_name is not None
|
|
179
|
+
|
|
180
|
+
# Resolve alias
|
|
181
|
+
voice_info = self.voices_info.get(voice_name, {})
|
|
182
|
+
voice_name = voice_info.get("key", voice_name)
|
|
183
|
+
assert voice_name is not None
|
|
184
|
+
|
|
185
|
+
with tempfile.NamedTemporaryFile(mode="wb+", suffix=".wav") as output_file:
|
|
186
|
+
async with _VOICE_LOCK:
|
|
187
|
+
if voice_name != _VOICE_NAME:
|
|
188
|
+
# Load new voice
|
|
189
|
+
_LOGGER.debug("Loading voice: %s", _VOICE_NAME)
|
|
190
|
+
ensure_voice_exists(
|
|
191
|
+
voice_name,
|
|
192
|
+
self.cli_args.data_dir,
|
|
193
|
+
self.cli_args.download_dir,
|
|
194
|
+
self.voices_info,
|
|
195
|
+
)
|
|
196
|
+
model_path, config_path = find_voice(
|
|
197
|
+
voice_name, self.cli_args.data_dir
|
|
97
198
|
)
|
|
199
|
+
_VOICE = PiperVoice.load(
|
|
200
|
+
model_path, config_path, use_cuda=self.cli_args.use_cuda
|
|
201
|
+
)
|
|
202
|
+
_VOICE_NAME = voice_name
|
|
98
203
|
|
|
99
|
-
|
|
100
|
-
piper_proc.proc.stdin.write(
|
|
101
|
-
(json.dumps(input_obj, ensure_ascii=False) + "\n").encode()
|
|
102
|
-
)
|
|
103
|
-
await piper_proc.proc.stdin.drain()
|
|
204
|
+
assert _VOICE is not None
|
|
104
205
|
|
|
105
|
-
|
|
106
|
-
|
|
206
|
+
syn_config = SynthesisConfig()
|
|
207
|
+
if voice_speaker is not None:
|
|
208
|
+
syn_config.speaker_id = _VOICE.config.speaker_id_map.get(
|
|
209
|
+
voice_speaker
|
|
210
|
+
)
|
|
211
|
+
if syn_config.speaker_id is None:
|
|
212
|
+
try:
|
|
213
|
+
# Try to interpret as an id
|
|
214
|
+
syn_config.speaker_id = int(voice_speaker)
|
|
215
|
+
except ValueError:
|
|
216
|
+
pass
|
|
107
217
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
channels = wav_file.getnchannels()
|
|
218
|
+
if syn_config.speaker_id is None:
|
|
219
|
+
_LOGGER.warning(
|
|
220
|
+
"No speaker '%s' for voice '%s'", voice_speaker, voice_name
|
|
221
|
+
)
|
|
113
222
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
223
|
+
if self.cli_args.length_scale is not None:
|
|
224
|
+
syn_config.length_scale = self.cli_args.length_scale
|
|
225
|
+
|
|
226
|
+
if self.cli_args.noise_scale is not None:
|
|
227
|
+
syn_config.noise_scale = self.cli_args.noise_scale
|
|
228
|
+
|
|
229
|
+
if self.cli_args.noise_w_scale is not None:
|
|
230
|
+
syn_config.noise_w_scale = self.cli_args.noise_w_scale
|
|
231
|
+
|
|
232
|
+
wav_writer: wave.Wave_write = wave.open(output_file, "wb")
|
|
233
|
+
with wav_writer:
|
|
234
|
+
_VOICE.synthesize_wav(text, wav_writer, syn_config)
|
|
235
|
+
|
|
236
|
+
output_file.seek(0)
|
|
237
|
+
|
|
238
|
+
wav_file: wave.Wave_read = wave.open(output_file, "rb")
|
|
239
|
+
with wav_file:
|
|
240
|
+
rate = wav_file.getframerate()
|
|
241
|
+
width = wav_file.getsampwidth()
|
|
242
|
+
channels = wav_file.getnchannels()
|
|
243
|
+
|
|
244
|
+
if send_start:
|
|
245
|
+
await self.write_event(
|
|
246
|
+
AudioStart(
|
|
247
|
+
rate=rate,
|
|
248
|
+
width=width,
|
|
249
|
+
channels=channels,
|
|
250
|
+
).event(),
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
# Audio
|
|
254
|
+
audio_bytes = wav_file.readframes(wav_file.getnframes())
|
|
255
|
+
bytes_per_sample = width * channels
|
|
256
|
+
bytes_per_chunk = bytes_per_sample * self.cli_args.samples_per_chunk
|
|
257
|
+
num_chunks = int(math.ceil(len(audio_bytes) / bytes_per_chunk))
|
|
258
|
+
|
|
259
|
+
# Split into chunks
|
|
260
|
+
for i in range(num_chunks):
|
|
261
|
+
offset = i * bytes_per_chunk
|
|
262
|
+
chunk = audio_bytes[offset : offset + bytes_per_chunk]
|
|
263
|
+
await self.write_event(
|
|
264
|
+
AudioChunk(
|
|
265
|
+
audio=chunk,
|
|
266
|
+
rate=rate,
|
|
267
|
+
width=width,
|
|
268
|
+
channels=channels,
|
|
269
|
+
).event(),
|
|
270
|
+
)
|
|
121
271
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
bytes_per_sample = width * channels
|
|
125
|
-
bytes_per_chunk = bytes_per_sample * self.cli_args.samples_per_chunk
|
|
126
|
-
num_chunks = int(math.ceil(len(audio_bytes) / bytes_per_chunk))
|
|
127
|
-
|
|
128
|
-
# Split into chunks
|
|
129
|
-
for i in range(num_chunks):
|
|
130
|
-
offset = i * bytes_per_chunk
|
|
131
|
-
chunk = audio_bytes[offset : offset + bytes_per_chunk]
|
|
132
|
-
await self.write_event(
|
|
133
|
-
AudioChunk(
|
|
134
|
-
audio=chunk,
|
|
135
|
-
rate=rate,
|
|
136
|
-
width=width,
|
|
137
|
-
channels=channels,
|
|
138
|
-
).event(),
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
await self.write_event(AudioStop().event())
|
|
142
|
-
_LOGGER.debug("Completed request")
|
|
143
|
-
|
|
144
|
-
os.unlink(output_path)
|
|
272
|
+
if send_stop:
|
|
273
|
+
await self.write_event(AudioStop().event())
|
|
145
274
|
|
|
146
275
|
return True
|