smartpi 0.1.40__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smartpi/__init__.py +1 -1
- smartpi/ai_asr.py +1036 -0
- smartpi/ai_llm.py +934 -0
- smartpi/ai_tts.py +938 -0
- smartpi/ai_vad.py +199 -0
- smartpi/base_driver.py +265 -11
- smartpi/local_model.py +432 -0
- smartpi/mcp_client.py +100 -0
- smartpi/mcp_fastmcp.py +322 -0
- smartpi/mcp_intent_recognizer.py +408 -0
- smartpi/models/__init__.py +0 -0
- smartpi/models/snakers4_silero-vad/__init__.py +0 -0
- smartpi/models/snakers4_silero-vad/hubconf.py +56 -0
- smartpi/models/snakers4_silero-vad/src/silero_vad/data/silero_vad.jit +0 -0
- smartpi/models/snakers4_silero-vad/src/silero_vad/data/silero_vad.onnx +0 -0
- smartpi/models/snakers4_silero-vad/src/silero_vad/data/silero_vad_16k_op15.onnx +0 -0
- smartpi/models/snakers4_silero-vad/src/silero_vad/data/silero_vad_half.onnx +0 -0
- smartpi/tencentcloud-speech-sdk-python/__init__.py +1 -0
- smartpi/tencentcloud-speech-sdk-python/asr/__init__.py +0 -0
- smartpi/tencentcloud-speech-sdk-python/asr/flash_recognizer.py +178 -0
- smartpi/tencentcloud-speech-sdk-python/asr/speech_recognizer.py +311 -0
- smartpi/tencentcloud-speech-sdk-python/common/__init__.py +1 -0
- smartpi/tencentcloud-speech-sdk-python/common/credential.py +6 -0
- smartpi/tencentcloud-speech-sdk-python/common/log.py +16 -0
- smartpi/tencentcloud-speech-sdk-python/common/utils.py +7 -0
- smartpi/tencentcloud-speech-sdk-python/examples/tts/tts_text.txt +60 -0
- smartpi/tencentcloud-speech-sdk-python/soe/__init__.py +0 -0
- smartpi/tencentcloud-speech-sdk-python/soe/speaking_assessment.py +276 -0
- smartpi/tencentcloud-speech-sdk-python/tts/__init__.py +0 -0
- smartpi/tencentcloud-speech-sdk-python/tts/flowing_speech_synthesizer.py +294 -0
- smartpi/tencentcloud-speech-sdk-python/tts/speech_synthesizer.py +144 -0
- smartpi/tencentcloud-speech-sdk-python/tts/speech_synthesizer_ws.py +234 -0
- smartpi/tencentcloud-speech-sdk-python/vc/__init__.py +0 -0
- smartpi/tencentcloud-speech-sdk-python/vc/speech_convertor_ws.py +237 -0
- {smartpi-0.1.40.dist-info → smartpi-0.1.41.dist-info}/METADATA +1 -1
- smartpi-0.1.41.dist-info/RECORD +76 -0
- smartpi-0.1.40.dist-info/RECORD +0 -44
- {smartpi-0.1.40.dist-info → smartpi-0.1.41.dist-info}/WHEEL +0 -0
- {smartpi-0.1.40.dist-info → smartpi-0.1.41.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import sys
|
|
3
|
+
import hmac
|
|
4
|
+
import hashlib
|
|
5
|
+
import base64
|
|
6
|
+
import time
|
|
7
|
+
import json
|
|
8
|
+
import threading
|
|
9
|
+
import websocket
|
|
10
|
+
import uuid
|
|
11
|
+
import urllib
|
|
12
|
+
from common.log import logger
|
|
13
|
+
from common.utils import is_python3
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Scheme, host and path that are concatenated into the WebSocket request URL.
_PROTOCOL, _HOST, _PATH = "wss://", "tts.cloud.tencent.com", "/stream_wsv2"
# Value sent as the ``Action`` request parameter.
_ACTION = "TextToStreamAudioWSv2"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class FlowingSpeechSynthesisListener(object):
    """Default callbacks for synthesis events; every callback only logs.

    Subclass and override the methods whose events you want to handle.
    """

    def on_synthesis_start(self, session_id):
        """Log the id of the session that has just been started."""
        logger.info("on_synthesis_start: session_id={}".format(session_id))

    def on_synthesis_end(self):
        """Log that the synthesis has finished."""
        logger.info("on_synthesis_end: -")

    def on_audio_result(self, audio_bytes):
        """Log the size of a received audio frame."""
        logger.info("on_audio_result: recv audio bytes, len={}".format(len(audio_bytes)))

    def on_text_result(self, response):
        """Log the subtitles carried by a text response.

        Args:
            response: Decoded response message; must contain ``session_id``,
                ``request_id``, ``message_id`` and ``result``.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        session_id = response["session_id"]
        request_id = response["request_id"]
        message_id = response["message_id"]
        result = response['result']
        # ``subtitles`` may be missing, empty or null; all three are logged
        # as an empty list instead of failing on ``len(None)``.
        subtitles = result.get("subtitles") or []
        logger.info("on_text_result: session_id={} request_id={} message_id={}\nsubtitles={}".format(
            session_id, request_id, message_id, subtitles))

    def on_synthesis_fail(self, response):
        """Log the ``code`` and ``message`` of a failure response."""
        logger.error("on_synthesis_fail: code={} msg={}".format(
            response['code'], response['message']
        ))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Connection states stored in the synthesizer's ``status`` attribute.
NOTOPEN, STARTED, OPENED, FINAL, ERROR, CLOSED = range(6)

# ``action`` values of the messages sent over the WebSocket.
(FlowingSpeechSynthesizer_ACTION_SYNTHESIS,
 FlowingSpeechSynthesizer_ACTION_COMPLETE,
 FlowingSpeechSynthesizer_ACTION_RESET) = (
    "ACTION_SYNTHESIS",
    "ACTION_COMPLETE",
    "ACTION_RESET",
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class FlowingSpeechSynthesizer:
    """WebSocket client for the streaming text-to-speech endpoint.

    ``start()`` opens the connection on a daemon thread, ``process()`` sends
    text, ``complete()`` / ``reset()`` send the corresponding control actions
    and ``wait()`` blocks until the connection thread has finished.  All
    results are delivered to ``listener`` from the WebSocket thread.
    """

    def __init__(self, appid, credential, listener):
        """Store credentials and listener and set the default options.

        Args:
            appid: Application id; converted with ``int()`` when the request
                parameters are built.
            credential: Object exposing ``secret_id`` and ``secret_key``.
            listener: Object providing the ``on_synthesis_start``,
                ``on_synthesis_end``, ``on_audio_result``, ``on_text_result``
                and ``on_synthesis_fail`` callbacks.
        """
        self.appid = appid
        self.credential = credential
        self.status = NOTOPEN
        self.ws = None
        self.wst = None
        self.listener = listener

        # Set to True once the server's READY frame has been received.
        self.ready = False

        self.voice_type = 0
        self.codec = "pcm"
        self.sample_rate = 16000
        self.volume = 10
        self.speed = 0
        self.session_id = ""
        self.enable_subtitle = 0
        self.emotion_category = ""
        self.emotion_intensity = 100

    def set_voice_type(self, voice_type):
        """Set the value sent as ``VoiceType``."""
        self.voice_type = voice_type

    def set_emotion_category(self, emotion_category):
        """Set the value sent as ``EmotionCategory`` (omitted while empty)."""
        self.emotion_category = emotion_category

    def set_emotion_intensity(self, emotion_intensity):
        """Set the value sent as ``EmotionIntensity``."""
        self.emotion_intensity = emotion_intensity

    def set_codec(self, codec):
        """Set the value sent as ``Codec``."""
        self.codec = codec

    def set_sample_rate(self, sample_rate):
        """Set the value sent as ``SampleRate``."""
        self.sample_rate = sample_rate

    def set_speed(self, speed):
        """Set the value sent as ``Speed``."""
        self.speed = speed

    def set_volume(self, volume):
        """Set the value sent as ``Volume``."""
        self.volume = volume

    def set_enable_subtitle(self, enable_subtitle):
        """Set the value sent as ``EnableSubtitle``."""
        self.enable_subtitle = enable_subtitle

    def __gen_signature(self, params):
        """Return the base64 encoded HMAC-SHA1 signature of *params*.

        The signed string is ``"GET" + host + path + "?"`` followed by the
        ``key=value`` pairs sorted by key and joined with ``&``.
        """
        pairs = [key + "=" + str(params[key]) for key in sorted(params.keys())]
        sign_str = "GET" + _HOST + _PATH + "?" + "&".join(pairs)
        # The signing string contains the SecretId, so it is deliberately
        # not printed or logged.
        if is_python3():
            secret_key = self.credential.secret_key.encode('utf-8')
            sign_str = sign_str.encode('utf-8')
        else:
            secret_key = self.credential.secret_key
        digest = hmac.new(secret_key, sign_str, hashlib.sha1).digest()
        return base64.b64encode(digest).decode('utf-8')

    def __gen_params(self, session_id):
        """Remember *session_id* and build the request parameter dict."""
        self.session_id = session_id

        params = dict()
        params['Action'] = _ACTION
        params['AppId'] = int(self.appid)
        params['SecretId'] = self.credential.secret_id
        params['ModelType'] = 1
        params['VoiceType'] = self.voice_type
        params['Codec'] = self.codec
        params['SampleRate'] = self.sample_rate
        params['Speed'] = self.speed
        params['Volume'] = self.volume
        params['SessionId'] = self.session_id
        params['EnableSubtitle'] = self.enable_subtitle
        # Emotion parameters are only sent when a category was configured.
        if self.emotion_category != "":
            params['EmotionCategory'] = self.emotion_category
            params['EmotionIntensity'] = self.emotion_intensity

        timestamp = int(time.time())
        params['Timestamp'] = timestamp
        # The request expires 24 hours after it was created.
        params['Expired'] = timestamp + 24 * 60 * 60
        return params

    def __create_query_string(self, param):
        """Return the request URL with *param* appended as a sorted query.

        Values are inserted as ``str(value)`` without percent-encoding.
        """
        pairs = [
            str(key) + "=" + str(value)
            for key, value in sorted(param.items(), key=lambda item: item[0])
        ]
        return _PROTOCOL + _HOST + _PATH + "?" + "&".join(pairs)

    def __new_ws_request_message(self, action, data):
        """Build the message dict for *action* carrying *data*."""
        return {
            "session_id": self.session_id,
            "message_id": str(uuid.uuid1()),

            "action": action,
            "data": data,
        }

    def __do_send(self, action, text):
        """Serialize a request message to JSON and send it as a text frame."""
        message = self.__new_ws_request_message(action, text)
        data = json.dumps(message)
        opcode = websocket.ABNF.OPCODE_TEXT
        logger.info("ws send opcode={} data={}".format(opcode, data))
        self.ws.send(data, opcode)

    def process(self, text, action=FlowingSpeechSynthesizer_ACTION_SYNTHESIS):
        """Send *text* to the server with the given *action*."""
        logger.info("process: action={} data={}".format(action, text))
        self.__do_send(action, text)

    def complete(self, action = FlowingSpeechSynthesizer_ACTION_COMPLETE):
        """Send the COMPLETE action with an empty payload."""
        logger.info("complete: action={}".format(action))
        self.__do_send(action, "")

    def reset(self, action = FlowingSpeechSynthesizer_ACTION_RESET):
        """Send the RESET action with an empty payload."""
        logger.info("reset: action={}".format(action))
        self.__do_send(action, "")

    def wait_ready(self, timeout_ms):
        """Poll until the READY frame has arrived.

        Args:
            timeout_ms: Maximum time to wait, in milliseconds.

        Returns:
            True once ``self.ready`` is set; False when the timeout elapses
            or the connection has already failed or been closed.
        """
        timeout_start = int(time.time() * 1000)
        while True:
            if self.ready:
                return True
            # A failed or closed connection can never become ready, so there
            # is no point in waiting for the full timeout.
            if self.status in (ERROR, CLOSED):
                return False
            if int(time.time() * 1000) - timeout_start > timeout_ms:
                break
            time.sleep(0.01)
        return False

    def start(self):
        """Open the WebSocket connection on a background daemon thread.

        Builds and signs the request URL, creates the ``WebSocketApp``,
        starts its ``run_forever`` loop in a thread and finally calls
        ``listener.on_synthesis_start`` with the new session id.
        """
        logger.info("synthesizer start: begin")

        # A READY flag left over from an earlier session must not make
        # wait_ready() succeed before the new connection is ready.
        self.ready = False

        def _close_conn(reason):
            ta = time.time()
            self.ws.close()
            tb = time.time()
            logger.info("client has closed connection ({}), cost {} ms".format(reason, int((tb-ta)*1000)))

        def _on_data(ws, data, opcode, flag):
            logger.debug("data={} opcode={} flag={}".format(data, opcode, flag))
            if opcode == websocket.ABNF.OPCODE_BINARY:
                # Binary frames carry audio bytes.
                self.listener.on_audio_result(data)
            elif opcode == websocket.ABNF.OPCODE_TEXT:
                resp = json.loads(data)  # WSResponseMessage
                if resp['code'] != 0:
                    logger.error("server synthesis fail request_id={} code={} msg={}".format(
                        resp['request_id'], resp['code'], resp['message']
                    ))
                    self.listener.on_synthesis_fail(resp)
                    return
                if resp.get("final") == 1:
                    logger.info("recv FINAL frame")
                    self.status = FINAL
                    _close_conn("after recv final")
                    self.listener.on_synthesis_end()
                    return
                if resp.get("ready") == 1:
                    logger.info("recv READY frame")
                    self.ready = True
                    return
                if resp.get("reset") == 1:
                    logger.info("recv RESET frame")
                    return
                if resp.get("heartbeat") == 1:
                    logger.info("recv HEARTBEAT frame")
                    return
                if "result" in resp:
                    if resp["result"].get("subtitles") is not None:
                        self.listener.on_text_result(resp)
                    return
            else:
                logger.error("invalid on_data code, opcode={}".format(opcode))

        def _on_error(ws, error):
            # Errors reported after a regular shutdown are ignored.
            if self.status == FINAL or self.status == CLOSED:
                return
            self.status = ERROR
            logger.error("error={}, session_id={}".format(error, self.session_id))
            _close_conn("after recv error")

        def _on_close(ws, close_status_code, close_msg):
            logger.info("conn closed, close_status_code={} close_msg={}".format(close_status_code, close_msg))
            self.status = CLOSED

        def _on_open(ws):
            logger.info("conn opened")
            self.status = OPENED

        session_id = str(uuid.uuid1())
        params = self.__gen_params(session_id)
        signature = self.__gen_signature(params)
        requrl = self.__create_query_string(params)

        # Import ``quote`` explicitly: a bare ``import urllib`` does not
        # guarantee that the ``urllib.parse`` submodule has been loaded.
        if is_python3():
            from urllib.parse import quote
        else:
            from urllib import quote
        # The signed URL carries credentials, so it is not printed or logged.
        requrl += "&Signature=%s" % quote(signature)

        self.ws = websocket.WebSocketApp(requrl, None,
                                         on_error=_on_error, on_close=_on_close,
                                         on_data=_on_data)
        self.ws.on_open = _on_open

        # Set the state before the thread starts so that it cannot overwrite
        # a state written by one of the callbacks.
        self.status = STARTED
        self.wst = threading.Thread(target=self.ws.run_forever)
        self.wst.daemon = True
        self.wst.start()
        self.listener.on_synthesis_start(session_id)

        logger.info("synthesizer start: end")

    def wait(self):
        """Block until the WebSocket thread has terminated."""
        logger.info("synthesizer wait: begin")
        if self.ws:
            if self.wst and self.wst.is_alive():
                self.wst.join()
        logger.info("synthesizer wait: end")
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import sys
|
|
3
|
+
import hmac
|
|
4
|
+
import hashlib
|
|
5
|
+
import base64
|
|
6
|
+
import time
|
|
7
|
+
import json
|
|
8
|
+
import uuid
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def is_python3():
    """Return True when running on Python 3 or newer.

    Uses the numeric ``sys.version_info`` rather than comparing the
    ``sys.version`` string, which would misorder a two-digit major version
    (``"10.0" > "3"`` is False).
    """
    return sys.version_info[0] >= 3
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Scheme, host and path that are concatenated into the HTTP request URL.
_PROTOCOL, _HOST, _PATH = "https://", "tts.cloud.tencent.com", "/stream"
# Value sent as the ``Action`` request parameter.
_ACTION = "TextToStreamAudio"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SpeechSynthesisListener:
    """No-op callback interface; override the methods you need.

    Every callback receives a ``response`` dict:

    * all callbacks: ``session_id`` - id of this request
    * ``on_message`` / ``on_complete``: ``data`` - audio data
    * ``on_fail``: ``Code`` - error code, ``Message`` - error message
    """

    def on_message(self, response):
        """Handle a partial result; does nothing by default."""

    def on_complete(self, response):
        """Handle the final result; does nothing by default."""

    def on_fail(self, response):
        """Handle an error response; does nothing by default."""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class SpeechSynthesizer:
    """HTTP client for the streaming text-to-speech endpoint.

    ``synthesis()`` posts one request and reports the streamed result to
    ``listener`` through ``on_message``, ``on_complete`` and ``on_fail``.
    """

    def __init__(self, appid, credential, voice_type, listener):
        """Store credentials, voice and listener and set default options.

        Args:
            appid: Application id; converted with ``int()`` when the request
                parameters are built.
            credential: Object exposing ``secret_id`` and ``secret_key``.
            voice_type: Value sent as ``VoiceType``.
            listener: Object providing ``on_message``, ``on_complete`` and
                ``on_fail``.
        """
        self.appid = appid
        self.credential = credential
        self.voice_type = voice_type
        self.codec = "pcm"
        self.sample_rate = 16000
        self.volume = 0
        self.speed = 0
        self.listener = listener

    def set_voice_type(self, voice_type):
        """Set the value sent as ``VoiceType``."""
        self.voice_type = voice_type

    def set_codec(self, codec):
        """Set the value sent as ``Codec``."""
        self.codec = codec

    def set_sample_rate(self, sample_rate):
        """Set the value sent as ``SampleRate``."""
        self.sample_rate = sample_rate

    def set_speed(self, speed):
        """Set the value sent as ``Speed``."""
        self.speed = speed

    def set_volume(self, volume):
        """Set the value sent as ``Volume``."""
        self.volume = volume

    def synthesis(self, text):
        """Synthesize *text* and stream the result to the listener.

        The first response chunk is inspected: if it is a JSON error body the
        listener's ``on_fail`` is called and the method returns.  Otherwise
        every chunk is appended to the data received so far, ``on_message`` is
        called with the accumulated data after each chunk, and ``on_complete``
        is called once the stream ends.
        """
        session_id = str(uuid.uuid1())
        params = self.__gen_params(session_id, text)
        signature = self.__gen_signature(params)
        headers = {
            "Content-Type": "application/json",
            "Authorization": str(signature)
        }
        url = _PROTOCOL + _HOST + _PATH
        r = requests.post(url, headers=headers,
                          data=json.dumps(params), stream=True)
        data = None
        response = dict()
        response["session_id"] = session_id
        try:
            for chunk in r.iter_content(None):
                if data is None:
                    # Only parsing is guarded; an exception raised by the
                    # listener must propagate instead of being mistaken for
                    # "the chunk is audio".
                    error = self.__parse_error(chunk)
                    if error is not None:
                        response["Code"], response["Message"] = error
                        self.listener.on_fail(response)
                        return
                    data = chunk
                else:
                    data = data + chunk
                response["data"] = data
                self.listener.on_message(response)
        finally:
            # Release the streamed connection on every exit path.
            r.close()
        response["data"] = data
        self.listener.on_complete(response)

    @staticmethod
    def __parse_error(chunk):
        """Return ``(code, message)`` if *chunk* is a JSON error body, else None."""
        try:
            error = json.loads(chunk)["Response"]["Error"]
            return error["Code"], error["Message"]
        except (ValueError, KeyError, TypeError):
            # Not JSON, or JSON without the error structure: treat as audio.
            return None

    def __gen_signature(self, params):
        """Return the signature of *params*.

        The signed string is ``"POST" + host + path + "?"`` followed by the
        ``key=value`` pairs sorted by key and joined with ``&``.
        """
        pairs = [key + "=" + str(params[key]) for key in sorted(params.keys())]
        sign_str = "POST" + _HOST + _PATH + "?" + "&".join(pairs)
        return self.__sign(sign_str, self.credential.secret_key)

    def __sign(self, signstr, secret_key):
        """Return the base64 encoded HMAC-SHA1 of *signstr* keyed by *secret_key*."""
        digest = hmac.new(secret_key.encode('utf-8'),
                          signstr.encode('utf-8'), hashlib.sha1).digest()
        return base64.b64encode(digest).decode('utf-8')

    def __gen_params(self, session_id, text):
        """Build the request parameter dict for *text*."""
        params = dict()
        params['Action'] = _ACTION
        params['AppId'] = int(self.appid)
        params['SecretId'] = self.credential.secret_id
        params['ModelType'] = 1
        params['VoiceType'] = self.voice_type
        params['Codec'] = self.codec
        params['SampleRate'] = self.sample_rate
        params['Speed'] = self.speed
        params['Volume'] = self.volume
        params['SessionId'] = session_id
        params['Text'] = text

        timestamp = int(time.time())
        params['Timestamp'] = timestamp
        # The request expires 24 hours after it was created.
        params['Expired'] = timestamp + 24 * 60 * 60
        return params
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import sys
|
|
3
|
+
import hmac
|
|
4
|
+
import hashlib
|
|
5
|
+
import base64
|
|
6
|
+
import time
|
|
7
|
+
import json
|
|
8
|
+
import threading
|
|
9
|
+
from websocket import ABNF, WebSocketApp
|
|
10
|
+
import uuid
|
|
11
|
+
import urllib
|
|
12
|
+
from common.log import logger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Scheme, host and path that are concatenated into the WebSocket request URL.
_PROTOCOL, _HOST, _PATH = "wss://", "tts.cloud.tencent.com", "/stream_ws"
# Value sent as the ``Action`` request parameter.
_ACTION = "TextToStreamAudioWS"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SpeechSynthesisListener(object):
    """Default callbacks for synthesis events; every callback only logs.

    Subclass and override the methods whose events you want to handle.
    """

    def on_synthesis_start(self, session_id):
        """Log the id of the session that has just been started."""
        logger.info("on_synthesis_start: session_id={}".format(session_id))

    def on_synthesis_end(self):
        """Log that the synthesis has finished."""
        logger.info("on_synthesis_end: -")

    def on_audio_result(self, audio_bytes):
        """Log the size of a received audio frame."""
        logger.info("on_audio_result: recv audio bytes, len={}".format(len(audio_bytes)))

    def on_text_result(self, response):
        """Log the subtitles carried by a text response.

        Args:
            response: Decoded response message; must contain ``session_id``,
                ``request_id``, ``message_id`` and ``result``.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        session_id = response["session_id"]
        request_id = response["request_id"]
        message_id = response["message_id"]
        result = response['result']
        # ``subtitles`` may be missing, empty or null; all three are logged
        # as an empty list instead of failing on ``len(None)``.
        subtitles = result.get("subtitles") or []
        logger.info("on_text_result: session_id={} request_id={} message_id={}\nsubtitles={}".format(
            session_id, request_id, message_id, subtitles))

    def on_synthesis_fail(self, response):
        """Log the ``code`` and ``message`` of a failure response."""
        logger.error("on_synthesis_fail: code={} msg={}".format(
            response['code'], response['message']
        ))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Connection states stored in the synthesizer's ``status`` attribute.
NOTOPEN, STARTED, OPENED, FINAL, ERROR, CLOSED = range(6)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class SpeechSynthesizer:
    """WebSocket client that synthesizes one text per connection.

    The text and all options are sent as query parameters of the connection
    URL.  ``start()`` opens the connection on a daemon thread and ``wait()``
    blocks until that thread has finished.  Results are delivered to
    ``listener`` from the WebSocket thread.
    """

    def __init__(self, appid, credential, listener):
        """Store credentials and listener and set the default options.

        Args:
            appid: Application id; converted with ``int()`` when the request
                parameters are built.
            credential: Object exposing ``secret_id`` and ``secret_key``.
            listener: Object providing the ``on_synthesis_start``,
                ``on_synthesis_end``, ``on_audio_result``, ``on_text_result``
                and ``on_synthesis_fail`` callbacks.
        """
        self.appid = appid
        self.credential = credential
        self.status = NOTOPEN
        self.ws = None
        self.wst = None
        self.listener = listener

        self.text = "欢迎使用腾讯云实时语音合成"
        self.voice_type = 0
        self.codec = "pcm"
        self.sample_rate = 16000
        self.volume = 0
        self.speed = 0
        self.session_id = ""
        self.enable_subtitle = True
        self.fast_voice_type = ""

    def set_voice_type(self, voice_type):
        """Set the value sent as ``VoiceType``."""
        self.voice_type = voice_type

    def set_codec(self, codec):
        """Set the value sent as ``Codec``."""
        self.codec = codec

    def set_sample_rate(self, sample_rate):
        """Set the value sent as ``SampleRate``."""
        self.sample_rate = sample_rate

    def set_speed(self, speed):
        """Set the value sent as ``Speed``."""
        self.speed = speed

    def set_volume(self, volume):
        """Set the value sent as ``Volume``."""
        self.volume = volume

    def set_text(self, text):
        """Set the text sent as ``Text``."""
        self.text = text

    def set_enable_subtitle(self, enable_subtitle):
        """Set the value sent as ``EnableSubtitle``."""
        self.enable_subtitle = enable_subtitle

    def set_fast_voice_type(self, fast_voice_type):
        """Set the value sent as ``FastVoiceType`` (omitted while empty)."""
        self.fast_voice_type = fast_voice_type

    def __gen_signature(self, params):
        """Return the base64 encoded HMAC-SHA1 signature of *params*.

        The signed string is ``"GET" + host + path + "?"`` followed by the
        ``key=value`` pairs sorted by key and joined with ``&``.
        """
        pairs = [key + "=" + str(params[key]) for key in sorted(params.keys())]
        sign_str = "GET" + _HOST + _PATH + "?" + "&".join(pairs)
        digest = hmac.new(self.credential.secret_key.encode('utf-8'),
                          sign_str.encode('utf-8'), hashlib.sha1).digest()
        return base64.b64encode(digest).decode('utf-8')

    def __gen_params(self, session_id):
        """Remember *session_id* and build the request parameter dict."""
        self.session_id = session_id

        params = dict()
        params['Action'] = _ACTION
        params['AppId'] = int(self.appid)
        params['SecretId'] = self.credential.secret_id
        params['ModelType'] = 1
        params['VoiceType'] = self.voice_type
        params['Codec'] = self.codec
        params['SampleRate'] = self.sample_rate
        params['Speed'] = self.speed
        params['Volume'] = self.volume
        params['SessionId'] = self.session_id
        params['Text'] = self.text
        params['EnableSubtitle'] = self.enable_subtitle
        # Only sent when configured; a None value is treated like "".
        if self.fast_voice_type:
            params['FastVoiceType'] = self.fast_voice_type

        timestamp = int(time.time())
        params['Timestamp'] = timestamp
        # The request expires 24 hours after it was created.
        params['Expired'] = timestamp + 24 * 60 * 60
        return params

    def __create_query_string(self, param):
        """Return the request URL with *param* appended as a sorted query.

        Only ``Text`` is percent-encoded; every other value is inserted as
        ``str(value)``.  *param* itself is left unmodified.
        """
        # Import ``quote`` explicitly: a bare ``import urllib`` does not
        # guarantee that the ``urllib.parse`` submodule has been loaded.
        from urllib.parse import quote

        pairs = []
        for key, value in sorted(param.items(), key=lambda item: item[0]):
            if key == 'Text':
                value = quote(value)
            pairs.append(str(key) + "=" + str(value))
        return _PROTOCOL + _HOST + _PATH + "?" + "&".join(pairs)

    def start(self):
        """Open the WebSocket connection on a background daemon thread.

        Builds and signs the request URL, creates the ``WebSocketApp``,
        starts its ``run_forever`` loop in a thread and finally calls
        ``listener.on_synthesis_start`` with the new session id.
        """
        from urllib.parse import quote

        logger.info("synthesizer start: begin")

        def _close_conn(reason):
            ta = time.time()
            self.ws.close()
            tb = time.time()
            logger.info("client has closed connection ({}), cost {} ms".format(reason, int((tb-ta)*1000)))

        def _on_data(ws, data, opcode, flag):
            if opcode == ABNF.OPCODE_BINARY:
                # Binary frames carry audio bytes.
                self.listener.on_audio_result(data)
            elif opcode == ABNF.OPCODE_TEXT:
                resp = json.loads(data)  # WSResponseMessage
                if resp['code'] != 0:
                    logger.error("server synthesis fail request_id={} code={} msg={}".format(
                        resp['request_id'], resp['code'], resp['message']
                    ))
                    self.listener.on_synthesis_fail(resp)
                    return
                if resp.get("final") == 1:
                    logger.info("recv FINAL frame")
                    self.status = FINAL
                    _close_conn("after recv final")
                    self.listener.on_synthesis_end()
                    return
                if "result" in resp:
                    if resp["result"].get("subtitles") is not None:
                        self.listener.on_text_result(resp)
                    return
            else:
                logger.error("invalid on_data code, opcode={}".format(opcode))

        def _on_error(ws, error):
            # Errors reported after a regular shutdown are ignored.
            if self.status == FINAL or self.status == CLOSED:
                return
            self.status = ERROR
            logger.error("error={}, session_id={}".format(error, self.session_id))
            _close_conn("after recv error")

        def _on_close(ws, close_status_code, close_msg):
            logger.info("conn closed, close_status_code={} close_msg={}".format(close_status_code, close_msg))
            self.status = CLOSED

        def _on_open(ws):
            logger.info("conn opened")
            self.status = OPENED

        session_id = str(uuid.uuid1())
        params = self.__gen_params(session_id)
        signature = self.__gen_signature(params)
        requrl = self.__create_query_string(params)

        requrl += "&Signature=%s" % quote(signature)

        self.ws = WebSocketApp(requrl, None,
                               on_error=_on_error, on_close=_on_close,
                               on_data=_on_data)
        self.ws.on_open = _on_open

        # Set the state before the thread starts; setting it afterwards could
        # overwrite a state already written by one of the callbacks.
        self.status = STARTED
        self.wst = threading.Thread(target=self.ws.run_forever)
        self.wst.daemon = True
        self.wst.start()
        self.listener.on_synthesis_start(session_id)

        logger.info("synthesizer start: end")

    def wait(self):
        """Block until the WebSocket thread has terminated."""
        logger.info("synthesizer wait: begin")
        if self.ws:
            if self.wst and self.wst.is_alive():
                self.wst.join()
        logger.info("synthesizer wait: end")
|
|
File without changes
|