wyoming-piper 1.6.3__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wyoming_piper/__main__.py CHANGED
@@ -8,12 +8,11 @@ from pathlib import Path
8
8
  from typing import Any, Dict, Set
9
9
 
10
10
  from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
11
- from wyoming.server import AsyncServer
11
+ from wyoming.server import AsyncServer, AsyncTcpServer
12
12
 
13
13
  from . import __version__
14
- from .download import find_voice, get_voices
14
+ from .download import ensure_voice_exists, find_voice, get_voices
15
15
  from .handler import PiperEventHandler
16
- from .process import PiperProcessManager
17
16
 
18
17
  _LOGGER = logging.getLogger(__name__)
19
18
 
@@ -21,17 +20,20 @@ _LOGGER = logging.getLogger(__name__)
21
20
  async def main() -> None:
22
21
  """Main entry point."""
23
22
  parser = argparse.ArgumentParser()
24
- parser.add_argument(
25
- "--piper",
26
- required=True,
27
- help="Path to piper executable",
28
- )
29
23
  parser.add_argument(
30
24
  "--voice",
31
25
  required=True,
32
26
  help="Default Piper voice to use (e.g., en_US-lessac-medium)",
33
27
  )
34
28
  parser.add_argument("--uri", default="stdio://", help="unix:// or tcp://")
29
+ #
30
+ parser.add_argument(
31
+ "--zeroconf",
32
+ nargs="?",
33
+ const="piper",
34
+ help="Enable discovery over zeroconf with optional name (default: piper)",
35
+ )
36
+ #
35
37
  parser.add_argument(
36
38
  "--data-dir",
37
39
  required=True,
@@ -48,22 +50,20 @@ async def main() -> None:
48
50
  )
49
51
  parser.add_argument("--noise-scale", type=float, help="Generator noise")
50
52
  parser.add_argument("--length-scale", type=float, help="Phoneme length")
51
- parser.add_argument("--noise-w", type=float, help="Phoneme width noise")
52
- #
53
53
  parser.add_argument(
54
- "--auto-punctuation", default=".?!", help="Automatically add punctuation"
54
+ "--noise-w-scale", "--noise-w", type=float, help="Phoneme width noise"
55
55
  )
56
- parser.add_argument("--samples-per-chunk", type=int, default=1024)
56
+ #
57
57
  parser.add_argument(
58
- "--max-piper-procs",
59
- type=int,
60
- default=1,
61
- help="Maximum number of piper process to run simultaneously (default: 1)",
58
+ "--auto-punctuation",
59
+ default=".?!。?!.؟",
60
+ help="Automatically add punctuation",
62
61
  )
62
+ parser.add_argument("--samples-per-chunk", type=int, default=1024)
63
63
  parser.add_argument(
64
- "--streaming",
64
+ "--no-streaming",
65
65
  action="store_true",
66
- help="Enable audio streaming on sentence boundaries",
66
+ help="Disable audio streaming on sentence boundaries",
67
67
  )
68
68
  #
69
69
  parser.add_argument(
@@ -72,6 +72,12 @@ async def main() -> None:
72
72
  help="Download latest voices.json during startup",
73
73
  )
74
74
  #
75
+ parser.add_argument(
76
+ "--use-cuda",
77
+ action="store_true",
78
+ help="Use CUDA if available (requires onnxruntime-gpu)",
79
+ )
80
+ #
75
81
  parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
76
82
  parser.add_argument(
77
83
  "--log-format", default=logging.BASIC_FORMAT, help="Format for log messages"
@@ -187,27 +193,41 @@ async def main() -> None:
187
193
  installed=True,
188
194
  voices=sorted(voices, key=lambda v: v.name),
189
195
  version=__version__,
190
- supports_synthesize_streaming=args.streaming,
196
+ supports_synthesize_streaming=(not args.no_streaming),
191
197
  )
192
198
  ],
193
199
  )
194
200
 
195
- process_manager = PiperProcessManager(args, voices_info)
201
+ # Ensure default voice is downloaded
202
+ voice_info = voices_info.get(args.voice, {})
203
+ voice_name = voice_info.get("key", args.voice)
204
+ assert voice_name is not None
196
205
 
197
- # Make sure default voice is loaded.
198
- # Other voices will be loaded on-demand.
199
- await process_manager.get_process()
206
+ ensure_voice_exists(voice_name, args.data_dir, args.download_dir, voices_info)
200
207
 
201
208
  # Start server
202
209
  server = AsyncServer.from_uri(args.uri)
203
210
 
211
+ if args.zeroconf:
212
+ if not isinstance(server, AsyncTcpServer):
213
+ raise ValueError("Zeroconf requires tcp:// uri")
214
+
215
+ from wyoming.zeroconf import HomeAssistantZeroconf
216
+
217
+ tcp_server: AsyncTcpServer = server
218
+ hass_zeroconf = HomeAssistantZeroconf(
219
+ name=args.zeroconf, port=tcp_server.port, host=tcp_server.host
220
+ )
221
+ await hass_zeroconf.register_server()
222
+ _LOGGER.debug("Zeroconf discovery enabled")
223
+
204
224
  _LOGGER.info("Ready")
205
225
  await server.run(
206
226
  partial(
207
227
  PiperEventHandler,
208
228
  wyoming_info,
209
229
  args,
210
- process_manager,
230
+ voices_info,
211
231
  )
212
232
  )
213
233
 
wyoming_piper/download.py CHANGED
@@ -9,8 +9,6 @@ from urllib.error import URLError
9
9
  from urllib.parse import quote, urlsplit, urlunsplit
10
10
  from urllib.request import urlopen
11
11
 
12
- from .file_hash import get_file_hash
13
-
14
12
  URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/main/{file}"
15
13
 
16
14
  _DIR = Path(__file__).parent
@@ -89,8 +87,7 @@ def ensure_voice_exists(
89
87
  for data_dir in data_dirs:
90
88
  data_dir = Path(data_dir)
91
89
 
92
- # Check sizes/hashes
93
- for file_path, file_info in voice_files.items():
90
+ for file_path, _file_info in voice_files.items():
94
91
  if file_path in verified_files:
95
92
  # Already verified this file in a different data directory
96
93
  continue
@@ -101,34 +98,37 @@ def ensure_voice_exists(
101
98
 
102
99
  data_file_path = data_dir / file_name
103
100
  _LOGGER.debug("Checking %s", data_file_path)
104
- if not data_file_path.exists():
101
+ if (not data_file_path.exists()) or (data_file_path.stat().st_size == 0):
105
102
  _LOGGER.debug("Missing %s", data_file_path)
106
103
  files_to_download.add(file_path)
107
104
  continue
108
105
 
109
- expected_size = file_info["size_bytes"]
110
- actual_size = data_file_path.stat().st_size
111
- if expected_size != actual_size:
112
- _LOGGER.warning(
113
- "Wrong size (expected=%s, actual=%s) for %s",
114
- expected_size,
115
- actual_size,
116
- data_file_path,
117
- )
118
- files_to_download.add(file_path)
119
- continue
120
-
121
- expected_hash = file_info["md5_digest"]
122
- actual_hash = get_file_hash(data_file_path)
123
- if expected_hash != actual_hash:
124
- _LOGGER.warning(
125
- "Wrong hash (expected=%s, actual=%s) for %s",
126
- expected_hash,
127
- actual_hash,
128
- data_file_path,
129
- )
130
- files_to_download.add(file_path)
131
- continue
106
+ # Don't bother validating sizes or hashes.
107
+ # This causes more problems than it's worth.
108
+ #
109
+ # expected_size = file_info["size_bytes"]
110
+ # actual_size = data_file_path.stat().st_size
111
+ # if expected_size != actual_size:
112
+ # _LOGGER.warning(
113
+ # "Wrong size (expected=%s, actual=%s) for %s",
114
+ # expected_size,
115
+ # actual_size,
116
+ # data_file_path,
117
+ # )
118
+ # files_to_download.add(file_path)
119
+ # continue
120
+
121
+ # expected_hash = file_info["md5_digest"]
122
+ # actual_hash = get_file_hash(data_file_path)
123
+ # if expected_hash != actual_hash:
124
+ # _LOGGER.warning(
125
+ # "Wrong hash (expected=%s, actual=%s) for %s",
126
+ # expected_hash,
127
+ # actual_hash,
128
+ # data_file_path,
129
+ # )
130
+ # files_to_download.add(file_path)
131
+ # continue
132
132
 
133
133
  # File exists and has been verified
134
134
  verified_files.add(file_path)
@@ -151,9 +151,10 @@ def ensure_voice_exists(
151
151
  download_file_path.parent.mkdir(parents=True, exist_ok=True)
152
152
 
153
153
  _LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
154
- with urlopen(_quote_url(file_url)) as response, open(
155
- download_file_path, "wb"
156
- ) as download_file:
154
+ with (
155
+ urlopen(_quote_url(file_url)) as response,
156
+ open(download_file_path, "wb") as download_file,
157
+ ):
157
158
  shutil.copyfileobj(response, download_file)
158
159
 
159
160
  _LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)
wyoming_piper/handler.py CHANGED
@@ -1,13 +1,15 @@
1
1
  """Event handler for clients of the server."""
2
2
 
3
3
  import argparse
4
- import json
4
+ import asyncio
5
5
  import logging
6
6
  import math
7
- import os
7
+ import tempfile
8
8
  import wave
9
9
  from typing import Any, Dict, Optional
10
10
 
11
+ from piper import PiperVoice, SynthesisConfig
12
+ from sentence_stream import SentenceBoundaryDetector
11
13
  from wyoming.audio import AudioChunk, AudioStart, AudioStop
12
14
  from wyoming.error import Error
13
15
  from wyoming.event import Event
@@ -21,18 +23,22 @@ from wyoming.tts import (
21
23
  SynthesizeStopped,
22
24
  )
23
25
 
24
- from .process import PiperProcessManager
25
- from .sentence_boundary import SentenceBoundaryDetector, remove_asterisks
26
+ from .download import ensure_voice_exists, find_voice
26
27
 
27
28
  _LOGGER = logging.getLogger(__name__)
28
29
 
30
+ # Keep the most recently used voice loaded
31
+ _VOICE: Optional[PiperVoice] = None
32
+ _VOICE_NAME: Optional[str] = None
33
+ _VOICE_LOCK = asyncio.Lock()
34
+
29
35
 
30
36
  class PiperEventHandler(AsyncEventHandler):
31
37
  def __init__(
32
38
  self,
33
39
  wyoming_info: Info,
34
40
  cli_args: argparse.Namespace,
35
- process_manager: PiperProcessManager,
41
+ voices_info: Dict[str, Any],
36
42
  *args,
37
43
  **kwargs,
38
44
  ) -> None:
@@ -40,9 +46,9 @@ class PiperEventHandler(AsyncEventHandler):
40
46
 
41
47
  self.cli_args = cli_args
42
48
  self.wyoming_info_event = wyoming_info.event()
43
- self.process_manager = process_manager
44
- self.sbd = SentenceBoundaryDetector()
49
+ self.voices_info = voices_info
45
50
  self.is_streaming: Optional[bool] = None
51
+ self.sbd = SentenceBoundaryDetector()
46
52
  self._synthesize: Optional[Synthesize] = None
47
53
 
48
54
  async def handle_event(self, event: Event) -> bool:
@@ -61,10 +67,29 @@ class PiperEventHandler(AsyncEventHandler):
61
67
 
62
68
  # Sent outside a stream, so we must process it
63
69
  synthesize = Synthesize.from_event(event)
64
- synthesize.text = remove_asterisks(synthesize.text)
65
- return await self._handle_synthesize(synthesize)
70
+ self._synthesize = Synthesize(text="", voice=synthesize.voice)
71
+ self.sbd = SentenceBoundaryDetector()
72
+ start_sent = False
73
+ for i, sentence in enumerate(self.sbd.add_chunk(synthesize.text)):
74
+ self._synthesize.text = sentence
75
+ await self._handle_synthesize(
76
+ self._synthesize, send_start=(i == 0), send_stop=False
77
+ )
78
+ start_sent = True
79
+
80
+ self._synthesize.text = self.sbd.finish()
81
+ if self._synthesize.text:
82
+ # Last sentence
83
+ await self._handle_synthesize(
84
+ self._synthesize, send_start=(not start_sent), send_stop=True
85
+ )
86
+ else:
87
+ # No final sentence
88
+ await self.write_event(AudioStop().event())
89
+
90
+ return True
66
91
 
67
- if not self.cli_args.streaming:
92
+ if self.cli_args.no_streaming:
68
93
  # Streaming is not enabled
69
94
  return True
70
95
 
@@ -111,7 +136,11 @@ class PiperEventHandler(AsyncEventHandler):
111
136
  )
112
137
  raise err
113
138
 
114
- async def _handle_synthesize(self, synthesize: Synthesize) -> bool:
139
+ async def _handle_synthesize(
140
+ self, synthesize: Synthesize, send_start: bool = True, send_stop: bool = True
141
+ ) -> bool:
142
+ global _VOICE, _VOICE_NAME
143
+
115
144
  _LOGGER.debug(synthesize)
116
145
 
117
146
  raw_text = synthesize.text
@@ -130,75 +159,118 @@ class PiperEventHandler(AsyncEventHandler):
130
159
  if not has_punctuation:
131
160
  text = text + self.cli_args.auto_punctuation[0]
132
161
 
133
- async with self.process_manager.processes_lock:
134
- _LOGGER.debug("synthesize: raw_text=%s, text='%s'", raw_text, text)
135
- voice_name: Optional[str] = None
136
- voice_speaker: Optional[str] = None
137
- if synthesize.voice is not None:
138
- voice_name = synthesize.voice.name
139
- voice_speaker = synthesize.voice.speaker
140
-
141
- piper_proc = await self.process_manager.get_process(voice_name=voice_name)
142
-
143
- assert piper_proc.proc.stdin is not None
144
- assert piper_proc.proc.stdout is not None
145
-
146
- # JSON in, file path out
147
- input_obj: Dict[str, Any] = {"text": text}
148
- if voice_speaker is not None:
149
- speaker_id = piper_proc.get_speaker_id(voice_speaker)
150
- if speaker_id is not None:
151
- input_obj["speaker_id"] = speaker_id
152
- else:
153
- _LOGGER.warning(
154
- "No speaker '%s' for voice '%s'", voice_speaker, voice_name
162
+ # Resolve voice
163
+ _LOGGER.debug("synthesize: raw_text=%s, text='%s'", raw_text, text)
164
+ voice_name: Optional[str] = None
165
+ voice_speaker: Optional[str] = None
166
+ if synthesize.voice is not None:
167
+ voice_name = synthesize.voice.name
168
+ voice_speaker = synthesize.voice.speaker
169
+
170
+ if voice_name is None:
171
+ # Default voice
172
+ voice_name = self.cli_args.voice
173
+
174
+ if voice_name == self.cli_args.voice:
175
+ # Default speaker
176
+ voice_speaker = voice_speaker or self.cli_args.speaker
177
+
178
+ assert voice_name is not None
179
+
180
+ # Resolve alias
181
+ voice_info = self.voices_info.get(voice_name, {})
182
+ voice_name = voice_info.get("key", voice_name)
183
+ assert voice_name is not None
184
+
185
+ with tempfile.NamedTemporaryFile(mode="wb+", suffix=".wav") as output_file:
186
+ async with _VOICE_LOCK:
187
+ if voice_name != _VOICE_NAME:
188
+ # Load new voice
189
+ _LOGGER.debug("Loading voice: %s", _VOICE_NAME)
190
+ ensure_voice_exists(
191
+ voice_name,
192
+ self.cli_args.data_dir,
193
+ self.cli_args.download_dir,
194
+ self.voices_info,
155
195
  )
196
+ model_path, config_path = find_voice(
197
+ voice_name, self.cli_args.data_dir
198
+ )
199
+ _VOICE = PiperVoice.load(
200
+ model_path, config_path, use_cuda=self.cli_args.use_cuda
201
+ )
202
+ _VOICE_NAME = voice_name
156
203
 
157
- _LOGGER.debug("input: %s", input_obj)
158
- piper_proc.proc.stdin.write(
159
- (json.dumps(input_obj, ensure_ascii=False) + "\n").encode()
160
- )
161
- await piper_proc.proc.stdin.drain()
162
-
163
- output_path = (await piper_proc.proc.stdout.readline()).decode().strip()
164
- _LOGGER.debug(output_path)
204
+ assert _VOICE is not None
165
205
 
166
- wav_file: wave.Wave_read = wave.open(output_path, "rb")
167
- with wav_file:
168
- rate = wav_file.getframerate()
169
- width = wav_file.getsampwidth()
170
- channels = wav_file.getnchannels()
206
+ syn_config = SynthesisConfig()
207
+ if voice_speaker is not None:
208
+ syn_config.speaker_id = _VOICE.config.speaker_id_map.get(
209
+ voice_speaker
210
+ )
211
+ if syn_config.speaker_id is None:
212
+ try:
213
+ # Try to interpret as an id
214
+ syn_config.speaker_id = int(voice_speaker)
215
+ except ValueError:
216
+ pass
217
+
218
+ if syn_config.speaker_id is None:
219
+ _LOGGER.warning(
220
+ "No speaker '%s' for voice '%s'", voice_speaker, voice_name
221
+ )
222
+
223
+ if self.cli_args.length_scale is not None:
224
+ syn_config.length_scale = self.cli_args.length_scale
225
+
226
+ if self.cli_args.noise_scale is not None:
227
+ syn_config.noise_scale = self.cli_args.noise_scale
228
+
229
+ if self.cli_args.noise_w_scale is not None:
230
+ syn_config.noise_w_scale = self.cli_args.noise_w_scale
231
+
232
+ wav_writer: wave.Wave_write = wave.open(output_file, "wb")
233
+ with wav_writer:
234
+ _VOICE.synthesize_wav(text, wav_writer, syn_config)
235
+
236
+ output_file.seek(0)
237
+
238
+ wav_file: wave.Wave_read = wave.open(output_file, "rb")
239
+ with wav_file:
240
+ rate = wav_file.getframerate()
241
+ width = wav_file.getsampwidth()
242
+ channels = wav_file.getnchannels()
243
+
244
+ if send_start:
245
+ await self.write_event(
246
+ AudioStart(
247
+ rate=rate,
248
+ width=width,
249
+ channels=channels,
250
+ ).event(),
251
+ )
171
252
 
172
- await self.write_event(
173
- AudioStart(
174
- rate=rate,
175
- width=width,
176
- channels=channels,
177
- ).event(),
178
- )
253
+ # Audio
254
+ audio_bytes = wav_file.readframes(wav_file.getnframes())
255
+ bytes_per_sample = width * channels
256
+ bytes_per_chunk = bytes_per_sample * self.cli_args.samples_per_chunk
257
+ num_chunks = int(math.ceil(len(audio_bytes) / bytes_per_chunk))
258
+
259
+ # Split into chunks
260
+ for i in range(num_chunks):
261
+ offset = i * bytes_per_chunk
262
+ chunk = audio_bytes[offset : offset + bytes_per_chunk]
263
+
264
+ await self.write_event(
265
+ AudioChunk(
266
+ audio=chunk,
267
+ rate=rate,
268
+ width=width,
269
+ channels=channels,
270
+ ).event(),
271
+ )
179
272
 
180
- # Audio
181
- audio_bytes = wav_file.readframes(wav_file.getnframes())
182
- bytes_per_sample = width * channels
183
- bytes_per_chunk = bytes_per_sample * self.cli_args.samples_per_chunk
184
- num_chunks = int(math.ceil(len(audio_bytes) / bytes_per_chunk))
185
-
186
- # Split into chunks
187
- for i in range(num_chunks):
188
- offset = i * bytes_per_chunk
189
- chunk = audio_bytes[offset : offset + bytes_per_chunk]
190
- await self.write_event(
191
- AudioChunk(
192
- audio=chunk,
193
- rate=rate,
194
- width=width,
195
- channels=channels,
196
- ).event(),
197
- )
198
-
199
- await self.write_event(AudioStop().event())
200
- _LOGGER.debug("Completed request")
201
-
202
- os.unlink(output_path)
273
+ if send_stop:
274
+ await self.write_event(AudioStop().event())
203
275
 
204
276
  return True