piper-tts-plus 0.0.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ include requirements.txt
2
+ include piper/voices.json
@@ -0,0 +1,32 @@
1
+ Metadata-Version: 2.4
2
+ Name: piper-tts-plus
3
+ Version: 0.0.0.dev0
4
+ Summary: A fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4.
5
+ Home-page: https://github.com/ayutaz/piper-plus
6
+ Author: yousan
7
+ Author-email: rabbitcats77@gmail.com
8
+ License: MIT
9
+ Keywords: piper japanese and other languages tts
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Text Processing :: Linguistic
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.7
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Requires-Dist: piper-phonemize~=1.1.0
19
+ Requires-Dist: onnxruntime<2,>=1.11.0
20
+ Provides-Extra: gpu
21
+ Requires-Dist: onnxruntime-gpu<2,>=1.11.0; extra == "gpu"
22
+ Provides-Extra: http
23
+ Requires-Dist: flask<4,>=3; extra == "http"
24
+ Dynamic: author
25
+ Dynamic: author-email
26
+ Dynamic: classifier
27
+ Dynamic: home-page
28
+ Dynamic: keywords
29
+ Dynamic: license
30
+ Dynamic: provides-extra
31
+ Dynamic: requires-dist
32
+ Dynamic: summary
@@ -0,0 +1,5 @@
1
+ from .voice import PiperVoice
2
+
3
+ __all__ = [
4
+ "PiperVoice",
5
+ ]
@@ -0,0 +1,159 @@
1
+ import argparse
2
+ import logging
3
+ import sys
4
+ import time
5
+ import wave
6
+ from pathlib import Path
7
+ from typing import Any, Dict
8
+
9
+ from . import PiperVoice
10
+ from .download import ensure_voice_exists, find_voice, get_voices
11
+
12
+ _FILE = Path(__file__)
13
+ _DIR = _FILE.parent
14
+ _LOGGER = logging.getLogger(_FILE.stem)
15
+
16
+
17
def main() -> None:
    """Command-line entry point: read text from stdin and synthesize it.

    The voice given by ``--model`` is used directly when that path exists.
    Otherwise it is treated as a voice name: the voices index is loaded,
    aliases are expanded, missing files are downloaded and the model/config
    pair is located in the data directories.

    Output mode is chosen in this order:

    * ``--output-raw``: raw audio for each non-empty input line is streamed
      to stdout;
    * ``--output-dir``: one WAV file per non-empty input line;
    * otherwise: all of stdin is synthesized into a single WAV written to
      ``--output-file`` (stdout when omitted or ``-``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    cli.add_argument("-c", "--config", help="Path to model config file")
    cli.add_argument(
        "-f",
        "--output-file",
        "--output_file",
        help="Path to output WAV file (default: stdout)",
    )
    cli.add_argument(
        "-d",
        "--output-dir",
        "--output_dir",
        help="Path to output directory (default: cwd)",
    )
    cli.add_argument(
        "--output-raw",
        "--output_raw",
        action="store_true",
        help="Stream raw audio to stdout",
    )
    # Synthesis parameters
    cli.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    cli.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    cli.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    cli.add_argument("--noise-w", "--noise_w", type=float, help="Phoneme width noise")
    # Hardware
    cli.add_argument("--cuda", action="store_true", help="Use GPU")
    # Pauses
    cli.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    # Voice lookup / download locations
    cli.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    cli.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    cli.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    # Diagnostics
    cli.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = cli.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)
    _LOGGER.debug(args)

    # Voices are downloaded into the first data directory unless told otherwise
    args.download_dir = args.download_dir or args.data_dir[0]

    # A model path that does not exist is treated as a voice name to fetch
    if not Path(args.model).exists():
        catalog = get_voices(args.download_dir, update_voices=args.update_voices)

        # Old voice names stay usable: every alias maps to its voice's entry
        alias_entries: Dict[str, Any] = {
            alias: {"_is_alias": True, **entry}
            for entry in catalog.values()
            for alias in entry.get("aliases", [])
        }
        catalog.update(alias_entries)

        ensure_voice_exists(args.model, args.data_dir, args.download_dir, catalog)
        args.model, args.config = find_voice(args.model, args.data_dir)

    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = dict(
        speaker_id=args.speaker,
        length_scale=args.length_scale,
        noise_scale=args.noise_scale,
        noise_w=args.noise_w,
        sentence_silence=args.sentence_silence,
    )

    if args.output_raw:
        # One utterance per non-empty line; audio is written as it is produced
        for sentence in map(str.strip, sys.stdin):
            if not sentence:
                continue

            for chunk in voice.synthesize_stream_raw(sentence, **synthesize_args):
                sys.stdout.buffer.write(chunk)
                sys.stdout.buffer.flush()
        return

    if args.output_dir:
        target_dir = Path(args.output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # One WAV file per non-empty line, named after the monotonic clock
        for sentence in map(str.strip, sys.stdin):
            if not sentence:
                continue

            wav_path = target_dir / f"{time.monotonic_ns()}.wav"
            with wave.open(str(wav_path), "wb") as wav_file:
                voice.synthesize(sentence, wav_file, **synthesize_args)

            _LOGGER.info("Wrote %s", wav_path)
        return

    # Single WAV from the whole input, to stdout or to the requested file
    text = sys.stdin.read()
    to_stdout = (not args.output_file) or (args.output_file == "-")
    destination = sys.stdout.buffer if to_stdout else args.output_file
    with wave.open(destination, "wb") as wav_file:
        voice.synthesize(text, wav_file, **synthesize_args)
156
+
157
+
158
+ if __name__ == "__main__":
159
+ main()
@@ -0,0 +1,54 @@
1
+ """Piper configuration"""
2
+ from dataclasses import dataclass
3
+ from enum import Enum
4
+ from typing import Any, Dict, Mapping, Sequence
5
+
6
+
7
class PhonemeType(str, Enum):
    """Allowed values of a voice config's ``phoneme_type`` entry."""

    ESPEAK = "espeak"
    TEXT = "text"
    OPENJTALK = "openjtalk"


@dataclass
class PiperConfig:
    """Settings of a Piper voice, built from its parsed JSON config."""

    # Number of phonemes
    num_symbols: int

    # Number of speakers
    num_speakers: int

    # Sample rate of output audio
    sample_rate: int

    # Name of espeak-ng voice or alphabet
    espeak_voice: str

    # Default synthesis parameters (taken from the config's "inference" section)
    length_scale: float
    noise_scale: float
    noise_w: float

    # Phoneme -> [id,]
    phoneme_id_map: Mapping[str, Sequence[int]]

    # One of the PhonemeType members (espeak, text or openjtalk)
    phoneme_type: PhonemeType

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        """Build a PiperConfig from a parsed voice config dictionary.

        Required keys: ``num_symbols``, ``num_speakers``,
        ``audio.sample_rate``, ``espeak.voice`` and ``phoneme_id_map``
        (a missing one raises ``KeyError``). The ``inference`` section and
        ``phoneme_type`` are optional and fall back to the defaults below.
        """
        inference_section = config.get("inference", {})

        # Built in the same order the values are looked up, then expanded
        field_values = {
            "num_symbols": config["num_symbols"],
            "num_speakers": config["num_speakers"],
            "sample_rate": config["audio"]["sample_rate"],
            "noise_scale": inference_section.get("noise_scale", 0.667),
            "length_scale": inference_section.get("length_scale", 1.0),
            "noise_w": inference_section.get("noise_w", 0.8),
            "espeak_voice": config["espeak"]["voice"],
            "phoneme_id_map": config["phoneme_id_map"],
            "phoneme_type": PhonemeType(
                config.get("phoneme_type", PhonemeType.ESPEAK)
            ),
        }
        return PiperConfig(**field_values)
@@ -0,0 +1,5 @@
1
+ """Constants"""
2
+
3
# Special symbols, in order: padding (0), beginning of sentence, end of sentence
PAD, BOS, EOS = "_", "^", "$"
@@ -0,0 +1,139 @@
1
+ """Utility for downloading Piper voices."""
2
+ import json
3
+ import logging
4
+ import shutil
5
+ from pathlib import Path
6
+ from typing import Any, Dict, Iterable, Set, Tuple, Union
7
+ from urllib.request import urlopen
8
+
9
+ from .file_hash import get_file_hash
10
+
11
+ URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"
12
+
13
+ _DIR = Path(__file__).parent
14
+ _LOGGER = logging.getLogger(__name__)
15
+
16
+ _SKIP_FILES = {"MODEL_CARD"}
17
+
18
+
19
class VoiceNotFoundError(Exception):
    """Raised when a requested voice name is absent from the voices index."""
21
+
22
+
23
def get_voices(
    download_dir: Union[str, Path], update_voices: bool = False
) -> Dict[str, Any]:
    """Load the index of available voices from a downloaded or embedded JSON file.

    Args:
        download_dir: Directory that may hold a downloaded ``voices.json``.
        update_voices: When true, first download the latest ``voices.json``
            into ``download_dir`` (the directory is created if needed).

    Returns:
        The parsed contents of ``voices.json``. The copy in ``download_dir``
        is used when it exists; otherwise the one next to this module is used.
    """
    download_dir = Path(download_dir)
    voices_download = download_dir / "voices.json"

    if update_voices:
        # Download latest voices.json
        voices_url = URL_FORMAT.format(file="voices.json")

        # The download directory may not exist yet on a first run; without
        # this the open() below fails before anything is fetched.
        download_dir.mkdir(parents=True, exist_ok=True)

        _LOGGER.debug("Downloading %s to %s", voices_url, voices_download)
        with urlopen(voices_url) as response, open(
            voices_download, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

    # Prefer downloaded file to embedded
    voices_embedded = _DIR / "voices.json"
    voices_path = voices_download if voices_download.exists() else voices_embedded

    _LOGGER.debug("Loading %s", voices_path)
    with open(voices_path, "r", encoding="utf-8") as voices_file:
        return json.load(voices_file)
46
+
47
+
48
def ensure_voice_exists(
    name: str,
    data_dirs: Iterable[Union[str, Path]],
    download_dir: Union[str, Path],
    voices_info: Dict[str, Any],
):
    """Make sure the files of voice ``name`` are available, downloading if needed.

    Every data directory is checked for the voice's files (by file name,
    size and md5 digest). If one data directory holds a complete, verified
    set, nothing is downloaded. Otherwise every file that was missing or
    invalid in a checked directory is downloaded into ``download_dir``.

    Args:
        name: Key of the voice in ``voices_info``.
        data_dirs: Directories to search for already-downloaded files.
        download_dir: Directory that receives downloaded files.
        voices_info: Voices index; each entry has a ``files`` mapping of
            remote file path to ``size_bytes`` / ``md5_digest`` info.

    Raises:
        AssertionError: If ``data_dirs`` is falsy.
        VoiceNotFoundError: If ``name`` is not in ``voices_info``.
        ValueError: If the voice entry lists no files at all.
    """
    assert data_dirs, "No data dirs"
    if name not in voices_info:
        raise VoiceNotFoundError(name)

    voice_info = voices_info[name]
    voice_files = voice_info["files"]
    files_to_download: Set[str] = set()

    for data_dir in data_dirs:
        data_dir = Path(data_dir)
        dir_is_complete = True

        # Check sizes/hashes
        for file_path, file_info in voice_files.items():
            file_name = Path(file_path).name
            if file_name in _SKIP_FILES:
                continue

            if _needs_download(data_dir / file_name, file_info):
                dir_is_complete = False
                files_to_download.add(file_path)

        if dir_is_complete:
            # This directory already has every file verified, so files
            # missing from directories checked earlier (e.g. the default
            # current directory) must not trigger a download.
            files_to_download.clear()
            break

    if (not voice_files) and (not files_to_download):
        raise ValueError(f"Unable to find or download voice: {name}")

    # Download missing files
    download_dir = Path(download_dir)

    for file_path in files_to_download:
        file_name = Path(file_path).name
        if file_name in _SKIP_FILES:
            continue

        file_url = URL_FORMAT.format(file=file_path)
        download_file_path = download_dir / file_name
        download_file_path.parent.mkdir(parents=True, exist_ok=True)

        _LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
        with urlopen(file_url) as response, open(
            download_file_path, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

        _LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)


def _needs_download(data_file_path: Path, file_info: Dict[str, Any]) -> bool:
    """Return True when the local file is missing or fails its size/md5 check."""
    _LOGGER.debug("Checking %s", data_file_path)
    if not data_file_path.exists():
        _LOGGER.debug("Missing %s", data_file_path)
        return True

    # Size is compared first so the file is only hashed when the size matches
    expected_size = file_info["size_bytes"]
    actual_size = data_file_path.stat().st_size
    if expected_size != actual_size:
        _LOGGER.warning(
            "Wrong size (expected=%s, actual=%s) for %s",
            expected_size,
            actual_size,
            data_file_path,
        )
        return True

    expected_hash = file_info["md5_digest"]
    actual_hash = get_file_hash(data_file_path)
    if expected_hash != actual_hash:
        _LOGGER.warning(
            "Wrong hash (expected=%s, actual=%s) for %s",
            expected_hash,
            actual_hash,
            data_file_path,
        )
        return True

    return False
128
+
129
+
130
def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
    """Locate the model and config files of a voice.

    Returns the ``<name>.onnx`` / ``<name>.onnx.json`` pair from the first
    directory in ``data_dirs`` that contains both files.

    Raises:
        ValueError: If no directory contains both files.
    """
    model_file = f"{name}.onnx"
    config_file = f"{name}.onnx.json"

    candidates = (
        (Path(directory) / model_file, Path(directory) / config_file)
        for directory in data_dirs
    )
    for model_path, settings_path in candidates:
        if model_path.exists() and settings_path.exists():
            return model_path, settings_path

    raise ValueError(f"Missing files for voice {name}")
@@ -0,0 +1,46 @@
1
+ import argparse
2
+ import hashlib
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Union
7
+
8
+
9
def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
    """Return the hexadecimal md5 digest of a file, read chunk by chunk."""
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # read() yields b"" at end of file, which ends the iteration
        for block in iter(lambda: stream.read(bytes_per_chunk), b""):
            digest.update(block)

    return digest.hexdigest()
19
+
20
+
21
+ # -----------------------------------------------------------------------------
22
+
23
+
24
def main():
    """Print a JSON object mapping each given file to its md5 digest.

    With ``--dir``, the keys are the file paths relative to that directory;
    otherwise they are the paths as given on the command line.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("file", nargs="+")
    cli.add_argument("--dir", help="Parent directory")
    args = cli.parse_args()

    parent_dir = Path(args.dir) if args.dir else None

    digests = {}
    for raw_path in args.file:
        file_path = Path(raw_path)
        digest = get_file_hash(file_path)
        key = file_path if parent_dir is None else file_path.relative_to(parent_dir)
        digests[str(key)] = digest

    json.dump(digests, sys.stdout)
43
+
44
+
45
+ if __name__ == "__main__":
46
+ main()
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import io
4
+ import logging
5
+ import wave
6
+ from pathlib import Path
7
+ from typing import Any, Dict
8
+
9
+ from flask import Flask, request
10
+
11
+ from . import PiperVoice
12
+ from .download import ensure_voice_exists, find_voice, get_voices
13
+
14
+ _LOGGER = logging.getLogger()
15
+
16
+
17
def main() -> None:
    """Run a Flask HTTP server that synthesizes speech with a single Piper voice.

    The voice given by ``--model`` is used directly when that path exists.
    Otherwise it is treated as a voice name: the voices index is loaded,
    aliases are expanded, missing files are downloaded and the model/config
    pair is located in the data directories.

    The server exposes one route, ``/``. A POST request supplies the text as
    the UTF-8 request body; a GET request supplies it in the ``text`` query
    parameter. The response is the synthesized WAV audio, or HTTP 400 when
    no text was provided.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="0.0.0.0", help="HTTP server host")
    parser.add_argument("--port", type=int, default=5000, help="HTTP server port")
    # Voice selection
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    # Synthesis parameters
    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    parser.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    parser.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    parser.add_argument(
        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
    )
    # Hardware
    parser.add_argument("--cuda", action="store_true", help="Use GPU")
    # Pauses
    parser.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    # Voice lookup / download locations
    parser.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    parser.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    parser.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    # Diagnostics
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    if not args.download_dir:
        # Download to first data directory by default
        args.download_dir = args.data_dir[0]

    # Download voice if file doesn't exist
    model_path = Path(args.model)
    if not model_path.exists():
        # Load voice info
        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for voice_info in voices_info.values():
            for voice_alias in voice_info.get("aliases", []):
                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}

        voices_info.update(aliases_info)
        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)

    # Load voice
    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = {
        "speaker_id": args.speaker,
        "length_scale": args.length_scale,
        "noise_scale": args.noise_scale,
        "noise_w": args.noise_w,
        "sentence_silence": args.sentence_silence,
    }

    # Create web server
    app = Flask(__name__)

    @app.route("/", methods=["GET", "POST"])
    def app_synthesize():
        """Synthesize the request's text and return it as WAV audio."""
        if request.method == "POST":
            text = request.data.decode("utf-8")
        else:
            text = request.args.get("text", "")

        text = text.strip()
        if not text:
            # Missing text is a client mistake: answer 400 instead of raising,
            # which would surface as an internal server error.
            return "No text provided", 400

        _LOGGER.debug("Synthesizing text: %s", text)
        with io.BytesIO() as wav_io:
            with wave.open(wav_io, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)

            # Label the payload as WAV audio rather than relying on the
            # framework's default content type for a bytes body.
            return wav_io.getvalue(), 200, {"Content-Type": "audio/wav"}

    app.run(host=args.host, port=args.port)
124
+
125
+
126
+ if __name__ == "__main__":
127
+ main()
@@ -0,0 +1,12 @@
1
+ """Utilities"""
2
+ import numpy as np
3
+
4
+
5
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so that its largest absolute sample maps to
    ``max_wav_value``. The peak used for scaling is never smaller than 0.01,
    so near-silent input is not amplified without bound.

    Args:
        audio: Floating-point audio samples.
        max_wav_value: Magnitude the loudest sample is scaled to.

    Returns:
        The scaled samples clipped to ``[-max_wav_value, max_wav_value]`` and
        cast to int16. An empty input yields an empty int16 array.
    """
    if audio.size == 0:
        # np.max raises ValueError on a zero-size array; nothing to normalize
        return audio.astype("int16")

    peak = max(0.01, np.max(np.abs(audio)))
    audio_norm = audio * (max_wav_value / peak)
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    return audio_norm.astype("int16")