chichi-speech 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
File without changes
|
chichi_speech/client.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
|
|
2
|
+
import argparse
|
|
3
|
+
import requests
|
|
4
|
+
import sys
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
def main():
    """Send text to the TTS service and save the returned audio to a file.

    Command-line arguments:
        text            Text to synthesize (positional).
        -l/--language   Language sent to the service (default: "Chinese").
        -o/--output     Path of the WAV file to write (default: "output.wav").
        --url           Base URL of the service (default: http://localhost:9090).

    The request is a JSON POST of ``{"text", "language"}`` to
    ``<url>/synthesize``; the response body is streamed to the output file in
    8 KiB chunks.

    Exits with status 1, after printing a message to stderr, when the service
    cannot be reached, returns an HTTP error status, or any other error occurs.
    """
    parser = argparse.ArgumentParser(description="Client for Qwen3 TTS Service")
    parser.add_argument("text", help="Text to synthesize")
    parser.add_argument("-l", "--language", default="Chinese", help="Language code (default: Chinese)")
    parser.add_argument("-o", "--output", default="output.wav", help="Output WAV file path")
    parser.add_argument("--url", default="http://localhost:9090", help="Service URL (default: http://localhost:9090)")

    args = parser.parse_args()

    endpoint = f"{args.url.rstrip('/')}/synthesize"
    payload = {
        "text": args.text,
        "language": args.language,
    }

    print(f"Sending request to {endpoint}...")
    try:
        # Bound only the connect phase: an unreachable host should fail fast,
        # but synthesis itself may legitimately take a long time, so the read
        # timeout stays unlimited. A connect timeout is reported through the
        # ConnectionError branch below.
        # The context manager releases the streamed connection even when
        # writing the file fails part-way.
        with requests.post(endpoint, json=payload, stream=True, timeout=(10, None)) as response:
            response.raise_for_status()

            with open(args.output, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        print(f"Success! Audio saved to: {os.path.abspath(args.output)}")

    except requests.exceptions.ConnectionError:
        # Diagnostics go to stderr so stdout carries only normal progress output.
        print(f"Error: Could not connect to service at {args.url}. Is the server running?", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary of a CLI tool: report any failure and exit non-zero.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
39
|
+
|
|
40
|
+
if __name__ == "__main__":
|
|
41
|
+
main()
|
chichi_speech/server.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import uvicorn
|
|
2
|
+
from fastapi import FastAPI, HTTPException
|
|
3
|
+
from fastapi.responses import StreamingResponse
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
import torch
|
|
6
|
+
import soundfile as sf
|
|
7
|
+
import io
|
|
8
|
+
from qwen_tts import Qwen3TTSModel
|
|
9
|
+
|
|
10
|
+
# FastAPI application object; the route and startup handlers below register on it.
app = FastAPI(title="ChiChi Speech Service")

# Module-level state populated by the startup handler and read by the
# /synthesize handler. Both stay None until startup has completed.
model = None
VOICE_PROMPT = None

# Default reference audio/text used to build the voice-clone prompt.
# main() replaces these when --ref-audio / --ref-text are passed.
#
# Alternative reference sample (a remote URL and its transcript), kept for reference:
# REF_AUDIO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav"
# REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
#
# NOTE(review): the active default below is an absolute path on one developer's
# machine, so it will presumably not exist elsewhere; pass --ref-audio and
# --ref-text when starting the server on any other host.
REF_AUDIO = "/Users/stevenliu/Downloads/0126_ori_voice.WAV"
REF_TEXT = "你那日问我,我的眼里是如何看待我?元氏父兄我打你,他们试着训过,终点并非叛贼。你总说守护江阴义不容辞,守护江阴义不容辞,可我怕你守到这最后,你也住了那史书上殉国忠烈。"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@app.on_event("startup")
async def startup_event():
    """Load the TTS model and pre-compute the voice-clone prompt at startup.

    Populates the module-level ``model`` and ``VOICE_PROMPT`` globals that the
    ``/synthesize`` handler reads. The prompt is built once from ``REF_AUDIO``
    and ``REF_TEXT`` so individual requests do not have to re-process the
    reference audio.

    Any exception raised while loading the model or building the prompt
    propagates out of this handler.
    """
    global model, VOICE_PROMPT

    # Pick the best available backend rather than assuming Apple's MPS, so the
    # service also starts on CUDA or CPU-only machines. On hosts where MPS is
    # available the choice is unchanged.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    print("Loading Qwen3 TTS Model...")
    model = Qwen3TTSModel.from_pretrained(
        "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
        device_map=device,
        dtype=torch.float32,
    )

    print("Creating Voice Clone Prompt...")
    # x_vector_only_mode is deliberately not passed, so the library's own
    # default for that flag applies.
    VOICE_PROMPT = model.create_voice_clone_prompt(
        ref_audio=REF_AUDIO,
        ref_text=REF_TEXT,
    )
    print("Service Ready.")
|
|
47
|
+
|
|
48
|
+
class SynthesisRequest(BaseModel):
    """JSON request body accepted by the ``/synthesize`` endpoint."""

    # Text to convert to speech (required).
    text: str
    # Forwarded as the ``language`` argument of the model's generate call;
    # "Chinese" is used when the client omits it.
    language: str = "Chinese"
|
|
51
|
+
|
|
52
|
+
@app.post("/synthesize")
async def synthesize(request: SynthesisRequest):
    """Synthesize speech for ``request.text`` with the pre-loaded voice-clone prompt.

    Args:
        request: Body carrying the text to speak and the language to use.

    Returns:
        A streaming ``audio/wav`` response containing the generated audio
        encoded as a WAV file.

    Raises:
        HTTPException: 503 when the model or voice prompt has not been
            initialized yet; 500 (with the error text as detail) when
            generation or WAV encoding fails.

    NOTE(review): generation is called synchronously inside this coroutine, so
    the event loop is occupied until it returns and requests are effectively
    handled one at a time.
    """
    # Identity checks: "not loaded yet" means None, independent of whatever
    # truthiness the loaded objects happen to define.
    if model is None or VOICE_PROMPT is None:
        raise HTTPException(status_code=503, detail="Service not initialized")

    try:
        wavs, sr = model.generate_voice_clone(
            text=request.text,
            language=request.language,
            voice_clone_prompt=VOICE_PROMPT,
        )

        # The first returned item is the audio for this request. soundfile
        # needs array data, so convert it if it is tensor-like (has .cpu()).
        audio_data = wavs[0]
        if hasattr(audio_data, "cpu"):
            audio_data = audio_data.cpu().float().numpy()

        # Encode as WAV into an in-memory buffer.
        buffer = io.BytesIO()
        sf.write(buffer, audio_data, sr, format='WAV')

    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Send the encoded bytes as one chunk. Iterating the BytesIO object itself
    # would split the binary payload at every b"\n" byte, producing many
    # arbitrarily sized fragments.
    return StreamingResponse(iter([buffer.getvalue()]), media_type="audio/wav")
|
|
85
|
+
|
|
86
|
+
def main():
    """Parse server options, apply reference overrides, and run the HTTP service.

    Command-line arguments:
        --port        Port to listen on (default: 9090).
        --host        Interface to bind to (default: 0.0.0.0).
        --ref-audio   One or more reference audio paths for voice cloning.
        --ref-text    Transcript(s) matching the reference audio.

    A ``PORT`` environment variable, when set, takes precedence over
    ``--port``. The reference overrides replace the module-level ``REF_AUDIO``
    and ``REF_TEXT`` before the server starts, so the startup handler builds
    the voice prompt from them.

    Exits with a usage error (status 2) when ``PORT`` is not an integer or when
    ``--ref-audio`` and ``--ref-text`` are given different numbers of values.
    """
    import argparse
    import os

    global REF_AUDIO, REF_TEXT

    parser = argparse.ArgumentParser(description="Qwen3 TTS Service")
    parser.add_argument("--port", type=int, default=9090, help="Service port (default: 9090)")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Service host (default: 0.0.0.0)")
    parser.add_argument("--ref-audio", type=str, nargs="+", help="Path(s) to reference audio file(s) for voice cloning")
    parser.add_argument("--ref-text", type=str, nargs="+", help="Reference text content(s) corresponding to the audio")

    args = parser.parse_args()

    # Environment variable overrides
    if "PORT" in os.environ:
        try:
            args.port = int(os.environ["PORT"])
        except ValueError:
            # Report a bad value as a usage error instead of a raw traceback.
            parser.error(f"PORT environment variable must be an integer, got {os.environ['PORT']!r}")

    # Each reference clip needs its own transcript.
    if args.ref_audio and args.ref_text and len(args.ref_audio) != len(args.ref_text):
        parser.error("--ref-audio and --ref-text must be given the same number of values")

    # Update global configuration if arguments are specified. nargs="+" always
    # yields a list; unwrap a single value so the override has the same type
    # (str) as the built-in default it replaces.
    if args.ref_audio:
        REF_AUDIO = args.ref_audio[0] if len(args.ref_audio) == 1 else args.ref_audio
    if args.ref_text:
        REF_TEXT = args.ref_text[0] if len(args.ref_text) == 1 else args.ref_text

    print(f"Starting server on {args.host}:{args.port}")
    if args.ref_audio:
        print(f"Overriding reference audio: {args.ref_audio}")
        if not args.ref_text:
            # The built-in transcript belongs to the built-in audio clip.
            print("Warning: --ref-audio was given without --ref-text; "
                  "the default reference text will be used and may not match the audio.")

    # Run the server
    uvicorn.run(app, host=args.host, port=args.port)
|
|
115
|
+
|
|
116
|
+
if __name__ == "__main__":
|
|
117
|
+
main()
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chichi-speech
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Qwen3 TTS Service with Voice Cloning
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: fastapi
|
|
7
|
+
Requires-Dist: numba>=0.59.0
|
|
8
|
+
Requires-Dist: pydantic
|
|
9
|
+
Requires-Dist: qwen-tts
|
|
10
|
+
Requires-Dist: requests
|
|
11
|
+
Requires-Dist: soundfile
|
|
12
|
+
Requires-Dist: torch
|
|
13
|
+
Requires-Dist: uvicorn
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: build; extra == 'dev'
|
|
16
|
+
Requires-Dist: httpx; extra == 'dev'
|
|
17
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# Chichi Speech
|
|
21
|
+
|
|
22
|
+
**A high-quality, voice-cloning TTS service powered by Qwen3.**
|
|
23
|
+
|
|
24
|
+
Chichi Speech provides a robust REST API and CLI tools for text-to-speech synthesis, featuring efficient voice cloning capabilities. It is designed to be easily deployed or integrated into other AI agents and workflows.
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
|
|
28
|
+
- **High Quality**: Utilizes the Qwen3-TTS model for state-of-the-art speech synthesis.
|
|
29
|
+
- **Voice Cloning**: Clone voices from reference audio files.
|
|
30
|
+
- **Efficient**: Optimized for reusing voice prompts to minimize computation for repeated requests.
|
|
31
|
+
- **Standardized API**: Simple REST API (`/synthesize`) for easy integration.
|
|
32
|
+
- **CLI Tools**: Includes `chichi-speech-server` and `chichi-speech-client` for immediate use.
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Prerequisites: `git`, `uv`, `python >= 3.10`.
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
export CHICHI_SPEECH_HOME="$HOME/chichi-speech/"
|
|
40
|
+
export CHICHI_SPEECH_ENV="$HOME/chichi-speech/.venv"
|
|
41
|
+
git clone https://github.com/yourusername/chichi-speech.git $CHICHI_SPEECH_HOME
|
|
42
|
+
cd $CHICHI_SPEECH_HOME
|
|
43
|
+
|
|
44
|
+
uv venv $CHICHI_SPEECH_ENV --python 3.10
|
|
45
|
+
source $CHICHI_SPEECH_ENV/bin/activate
|
|
46
|
+
|
|
47
|
+
uv pip install -e .
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
### 1. Start the Service
|
|
53
|
+
|
|
54
|
+
The service runs on port **9090** by default.
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Start the server (runs in foreground, use & for background or a separate terminal)
|
|
58
|
+
source $CHICHI_SPEECH_ENV/bin/activate
|
|
59
|
+
chichi-speech-server
|
|
60
|
+
# OR specify the port explicitly
|
|
61
|
+
chichi-speech-server --port 9090 --host 0.0.0.0
|
|
62
|
+
# OR specify your reference audio and text for voice cloning (Recommended)
|
|
63
|
+
chichi-speech-server --ref-audio /path/to/my/voice.wav --ref-text "caption of the reference audio"
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 2. Verify Service is Running
|
|
67
|
+
Confirm the server is up by fetching the auto-generated API docs page:
|
|
68
|
+
```bash
|
|
69
|
+
curl http://localhost:9090/docs
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### 3. Generate Speech
|
|
73
|
+
|
|
74
|
+
Use cURL:
|
|
75
|
+
```bash
|
|
76
|
+
curl -X POST "http://localhost:9090/synthesize" \
|
|
77
|
+
-H "Content-Type: application/json" \
|
|
78
|
+
-d '{
|
|
79
|
+
"text": "Nice to meet you",
|
|
80
|
+
"language": "English"
|
|
81
|
+
}' \
|
|
82
|
+
--create-dirs --output output/nice_to_meet.wav
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Functionality
|
|
86
|
+
- **Endpoint**: `POST /synthesize`
|
|
87
|
+
- **Default Port**: 9090
|
|
88
|
+
- **Voice Cloning**: Uses a pre-computed voice prompt from reference files to ensure the cloned voice is consistent and generation is fast.
|
|
89
|
+
|
|
90
|
+
## Development
|
|
91
|
+
|
|
92
|
+
Install dev dependencies:
|
|
93
|
+
```bash
|
|
94
|
+
uv pip install -e ".[dev]"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Run tests:
|
|
98
|
+
```bash
|
|
99
|
+
pytest
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
|
|
104
|
+
MIT
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
chichi_speech/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
chichi_speech/client.py,sha256=GYIxqKy1ONsOxNMx83gMOT1cM6K0rdjWSR_lnPiLk8U,1381
|
|
3
|
+
chichi_speech/server.py,sha256=JnC7A7casb3q_HRot9sSjgA_41E-H7lnJHr_tHOtSMg,4204
|
|
4
|
+
chichi_speech-0.1.0.dist-info/METADATA,sha256=tL48c1POhy-6QQXlmEMbTx4Z3k1GHix-3ivbEZN3-ps,2844
|
|
5
|
+
chichi_speech-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
6
|
+
chichi_speech-0.1.0.dist-info/entry_points.txt,sha256=tRZ0iWPkkhzyEEQUSnGhyjbFJwxX49JkyqfU5xPR9yM,116
|
|
7
|
+
chichi_speech-0.1.0.dist-info/RECORD,,
|