dashscope 1.8.0__py3-none-any.whl → 1.25.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. dashscope/__init__.py +61 -14
  2. dashscope/aigc/__init__.py +10 -3
  3. dashscope/aigc/chat_completion.py +282 -0
  4. dashscope/aigc/code_generation.py +145 -0
  5. dashscope/aigc/conversation.py +71 -12
  6. dashscope/aigc/generation.py +288 -16
  7. dashscope/aigc/image_synthesis.py +473 -31
  8. dashscope/aigc/multimodal_conversation.py +299 -14
  9. dashscope/aigc/video_synthesis.py +610 -0
  10. dashscope/api_entities/aiohttp_request.py +8 -5
  11. dashscope/api_entities/api_request_data.py +4 -2
  12. dashscope/api_entities/api_request_factory.py +68 -20
  13. dashscope/api_entities/base_request.py +20 -3
  14. dashscope/api_entities/chat_completion_types.py +344 -0
  15. dashscope/api_entities/dashscope_response.py +243 -15
  16. dashscope/api_entities/encryption.py +179 -0
  17. dashscope/api_entities/http_request.py +216 -62
  18. dashscope/api_entities/websocket_request.py +43 -34
  19. dashscope/app/__init__.py +5 -0
  20. dashscope/app/application.py +203 -0
  21. dashscope/app/application_response.py +246 -0
  22. dashscope/assistants/__init__.py +16 -0
  23. dashscope/assistants/assistant_types.py +175 -0
  24. dashscope/assistants/assistants.py +311 -0
  25. dashscope/assistants/files.py +197 -0
  26. dashscope/audio/__init__.py +4 -2
  27. dashscope/audio/asr/__init__.py +17 -1
  28. dashscope/audio/asr/asr_phrase_manager.py +203 -0
  29. dashscope/audio/asr/recognition.py +167 -27
  30. dashscope/audio/asr/transcription.py +107 -14
  31. dashscope/audio/asr/translation_recognizer.py +1006 -0
  32. dashscope/audio/asr/vocabulary.py +177 -0
  33. dashscope/audio/qwen_asr/__init__.py +7 -0
  34. dashscope/audio/qwen_asr/qwen_transcription.py +189 -0
  35. dashscope/audio/qwen_omni/__init__.py +11 -0
  36. dashscope/audio/qwen_omni/omni_realtime.py +524 -0
  37. dashscope/audio/qwen_tts/__init__.py +5 -0
  38. dashscope/audio/qwen_tts/speech_synthesizer.py +77 -0
  39. dashscope/audio/qwen_tts_realtime/__init__.py +10 -0
  40. dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py +355 -0
  41. dashscope/audio/tts/__init__.py +2 -0
  42. dashscope/audio/tts/speech_synthesizer.py +5 -0
  43. dashscope/audio/tts_v2/__init__.py +12 -0
  44. dashscope/audio/tts_v2/enrollment.py +179 -0
  45. dashscope/audio/tts_v2/speech_synthesizer.py +886 -0
  46. dashscope/cli.py +157 -37
  47. dashscope/client/base_api.py +652 -87
  48. dashscope/common/api_key.py +2 -0
  49. dashscope/common/base_type.py +135 -0
  50. dashscope/common/constants.py +13 -16
  51. dashscope/common/env.py +2 -0
  52. dashscope/common/error.py +58 -22
  53. dashscope/common/logging.py +2 -0
  54. dashscope/common/message_manager.py +2 -0
  55. dashscope/common/utils.py +276 -46
  56. dashscope/customize/__init__.py +0 -0
  57. dashscope/customize/customize_types.py +192 -0
  58. dashscope/customize/deployments.py +146 -0
  59. dashscope/customize/finetunes.py +234 -0
  60. dashscope/embeddings/__init__.py +5 -1
  61. dashscope/embeddings/batch_text_embedding.py +208 -0
  62. dashscope/embeddings/batch_text_embedding_response.py +65 -0
  63. dashscope/embeddings/multimodal_embedding.py +118 -10
  64. dashscope/embeddings/text_embedding.py +13 -1
  65. dashscope/{file.py → files.py} +19 -4
  66. dashscope/io/input_output.py +2 -0
  67. dashscope/model.py +11 -2
  68. dashscope/models.py +43 -0
  69. dashscope/multimodal/__init__.py +20 -0
  70. dashscope/multimodal/dialog_state.py +56 -0
  71. dashscope/multimodal/multimodal_constants.py +28 -0
  72. dashscope/multimodal/multimodal_dialog.py +648 -0
  73. dashscope/multimodal/multimodal_request_params.py +313 -0
  74. dashscope/multimodal/tingwu/__init__.py +10 -0
  75. dashscope/multimodal/tingwu/tingwu.py +80 -0
  76. dashscope/multimodal/tingwu/tingwu_realtime.py +579 -0
  77. dashscope/nlp/__init__.py +0 -0
  78. dashscope/nlp/understanding.py +64 -0
  79. dashscope/protocol/websocket.py +3 -0
  80. dashscope/rerank/__init__.py +0 -0
  81. dashscope/rerank/text_rerank.py +69 -0
  82. dashscope/resources/qwen.tiktoken +151643 -0
  83. dashscope/threads/__init__.py +26 -0
  84. dashscope/threads/messages/__init__.py +0 -0
  85. dashscope/threads/messages/files.py +113 -0
  86. dashscope/threads/messages/messages.py +220 -0
  87. dashscope/threads/runs/__init__.py +0 -0
  88. dashscope/threads/runs/runs.py +501 -0
  89. dashscope/threads/runs/steps.py +112 -0
  90. dashscope/threads/thread_types.py +665 -0
  91. dashscope/threads/threads.py +212 -0
  92. dashscope/tokenizers/__init__.py +7 -0
  93. dashscope/tokenizers/qwen_tokenizer.py +111 -0
  94. dashscope/tokenizers/tokenization.py +125 -0
  95. dashscope/tokenizers/tokenizer.py +45 -0
  96. dashscope/tokenizers/tokenizer_base.py +32 -0
  97. dashscope/utils/__init__.py +0 -0
  98. dashscope/utils/message_utils.py +838 -0
  99. dashscope/utils/oss_utils.py +243 -0
  100. dashscope/utils/param_utils.py +29 -0
  101. dashscope/version.py +3 -1
  102. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/METADATA +53 -50
  103. dashscope-1.25.6.dist-info/RECORD +112 -0
  104. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/WHEEL +1 -1
  105. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/entry_points.txt +0 -1
  106. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info/licenses}/LICENSE +2 -4
  107. dashscope/deployment.py +0 -129
  108. dashscope/finetune.py +0 -149
  109. dashscope-1.8.0.dist-info/RECORD +0 -49
  110. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,886 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import json
4
+ import platform
5
+ import random
6
+ import threading
7
+ import time
8
+ import uuid
9
+ from enum import Enum, unique
10
+
11
+ import websocket
12
+
13
+ import dashscope
14
+ from dashscope.common.error import InputRequired, InvalidTask, ModelRequired
15
+ from dashscope.common.logging import logger
16
+ from dashscope.protocol.websocket import (ACTION_KEY, EVENT_KEY, HEADER,
17
+ TASK_ID, ActionType, EventType,
18
+ WebsocketStreamingMode)
19
+
20
+
21
class ResultCallback:
    """
    Callback interface for receiving speech synthesis results.

    Subclass it and override the hooks you need; every hook defined here
    is a no-op that returns None.
    """
    def on_open(self) -> None:
        """Hook for the moment the synthesis task has been started."""
        return None

    def on_complete(self) -> None:
        """Hook for the moment the synthesis task has finished."""
        return None

    def on_error(self, message) -> None:
        """Hook for a failed task; `message` carries the failure details."""
        return None

    def on_close(self) -> None:
        """Hook for the end of the task, after completion or failure."""
        return None

    def on_event(self, message: str) -> None:
        """Hook for a text (JSON) event message received during synthesis."""
        return None

    def on_data(self, data: bytes) -> None:
        """Hook for a chunk of binary audio data."""
        return None
43
+
44
+
45
@unique
class AudioFormat(Enum):
    """
    Supported output audio formats.

    Each member value is a 4-tuple `(format, sample_rate, channels, bit_rate)`
    which is unpacked into the attributes of the same names.
    """
    DEFAULT = ('Default', 0, '0', 0)

    # WAV
    WAV_8000HZ_MONO_16BIT = ('wav', 8000, 'mono', 0)
    WAV_16000HZ_MONO_16BIT = ('wav', 16000, 'mono', 16)
    WAV_22050HZ_MONO_16BIT = ('wav', 22050, 'mono', 16)
    WAV_24000HZ_MONO_16BIT = ('wav', 24000, 'mono', 16)
    WAV_44100HZ_MONO_16BIT = ('wav', 44100, 'mono', 16)
    WAV_48000HZ_MONO_16BIT = ('wav', 48000, 'mono', 16)

    # MP3
    MP3_8000HZ_MONO_128KBPS = ('mp3', 8000, 'mono', 128)
    MP3_16000HZ_MONO_128KBPS = ('mp3', 16000, 'mono', 128)
    MP3_22050HZ_MONO_256KBPS = ('mp3', 22050, 'mono', 256)
    MP3_24000HZ_MONO_256KBPS = ('mp3', 24000, 'mono', 256)
    MP3_44100HZ_MONO_256KBPS = ('mp3', 44100, 'mono', 256)
    MP3_48000HZ_MONO_256KBPS = ('mp3', 48000, 'mono', 256)

    # PCM
    PCM_8000HZ_MONO_16BIT = ('pcm', 8000, 'mono', 16)
    PCM_16000HZ_MONO_16BIT = ('pcm', 16000, 'mono', 16)
    PCM_22050HZ_MONO_16BIT = ('pcm', 22050, 'mono', 16)
    PCM_24000HZ_MONO_16BIT = ('pcm', 24000, 'mono', 16)
    PCM_44100HZ_MONO_16BIT = ('pcm', 44100, 'mono', 16)
    PCM_48000HZ_MONO_16BIT = ('pcm', 48000, 'mono', 16)

    # Opus
    OGG_OPUS_8KHZ_MONO_32KBPS = ('opus', 8000, 'mono', 32)
    OGG_OPUS_8KHZ_MONO_16KBPS = ('opus', 8000, 'mono', 16)
    OGG_OPUS_16KHZ_MONO_16KBPS = ('opus', 16000, 'mono', 16)
    OGG_OPUS_16KHZ_MONO_32KBPS = ('opus', 16000, 'mono', 32)
    OGG_OPUS_16KHZ_MONO_64KBPS = ('opus', 16000, 'mono', 64)
    OGG_OPUS_24KHZ_MONO_16KBPS = ('opus', 24000, 'mono', 16)
    OGG_OPUS_24KHZ_MONO_32KBPS = ('opus', 24000, 'mono', 32)
    OGG_OPUS_24KHZ_MONO_64KBPS = ('opus', 24000, 'mono', 64)
    OGG_OPUS_48KHZ_MONO_16KBPS = ('opus', 48000, 'mono', 16)
    OGG_OPUS_48KHZ_MONO_32KBPS = ('opus', 48000, 'mono', 32)
    OGG_OPUS_48KHZ_MONO_64KBPS = ('opus', 48000, 'mono', 64)

    def __init__(self, format, sample_rate, channels, bit_rate):
        # Enum passes the member's value tuple here, one element per argument.
        (self.format, self.sample_rate, self.channels,
         self.bit_rate) = (format, sample_rate, channels, bit_rate)

    def __str__(self):
        """Return a human-readable one-line description of the format."""
        codec = self.format.upper()
        return (f'{codec} with {self.sample_rate}Hz sample rate, '
                f'{self.channels} channel, {self.bit_rate}')
88
+
89
+
90
class Request:
    """
    Holds the settings of one speech synthesis task and builds the WebSocket
    handshake headers and the JSON command strings (start / continue /
    finish) for it. A random task id is generated at construction time.
    """
    def __init__(
        self,
        apikey,
        model,
        voice,
        format='wav',
        sample_rate=16000,
        bit_rate=64000,
        volume=50,
        speech_rate=1.0,
        pitch_rate=1.0,
        seed=0,
        synthesis_type=0,
        instruction=None,
        language_hints: list = None,
    ):
        # Task identity and credentials.
        self.task_id = self.genUid()
        self.apikey = apikey
        # Model selection.
        self.model = model
        self.voice = voice
        # Audio encoding.
        self.format = format
        self.sample_rate = sample_rate
        self.bit_rate = bit_rate
        # Synthesis tuning.
        self.volume = volume
        self.speech_rate = speech_rate
        self.pitch_rate = pitch_rate
        self.seed = seed
        self.synthesis_type = synthesis_type
        self.instruction = instruction
        self.language_hints = language_hints

    def genUid(self):
        """Return a random UUID4 as a 32-character hex string."""
        return uuid.uuid4().hex

    def getWebsocketHeaders(self, headers, workspace):
        """
        Build the handshake headers, store them on `self.headers` and
        return them.

        The base headers (user agent and bearer authorization) are merged
        with the optional user `headers` (user entries win on conflict);
        a truthy `workspace` is added as 'X-DashScope-WorkSpace' last.
        """
        ua_fields = (
            '1.18.0',  # hard-coded dashscope version reported to the server
            platform.python_version(),
            platform.platform(),
            platform.processor(),
        )
        merged = {
            'user-agent':
            'dashscope/%s; python/%s; platform/%s; processor/%s' % ua_fields,
            'Authorization': 'bearer ' + self.apikey,
        }
        if headers:
            merged.update(headers)
        if workspace:
            merged['X-DashScope-WorkSpace'] = workspace
        self.headers = merged
        return merged

    def _build_header(self, action):
        """Return the command header shared by all three task commands."""
        return {
            ACTION_KEY: action,
            TASK_ID: self.task_id,
            'streaming': WebsocketStreamingMode.DUPLEX,
        }

    def getStartRequest(self, additional_params=None):
        """
        Return the JSON string of the start command.

        Parameters are layered in this order, later ones overriding earlier
        ones: the fixed synthesis parameters, 'bit_rate' (opus format only),
        `additional_params`, then 'instruction' and 'language_hints' when
        they are not None.
        """
        parameters = {
            'voice': self.voice,
            'volume': self.volume,
            'text_type': 'PlainText',
            'sample_rate': self.sample_rate,
            'rate': self.speech_rate,
            'format': self.format,
            'pitch': self.pitch_rate,
            'seed': self.seed,
            'type': self.synthesis_type
        }
        if self.format == 'opus':
            parameters['bit_rate'] = self.bit_rate
        if additional_params:
            parameters.update(additional_params)
        if self.instruction is not None:
            parameters['instruction'] = self.instruction
        if self.language_hints is not None:
            parameters['language_hints'] = self.language_hints

        payload = {
            'model': self.model,
            'task_group': 'audio',
            'task': 'tts',
            'function': 'SpeechSynthesizer',
            'input': {},
            'parameters': parameters,
        }
        return json.dumps({
            HEADER: self._build_header(ActionType.START),
            'payload': payload,
        })

    def getContinueRequest(self, text):
        """Return the JSON string of the continue command carrying `text`."""
        payload = {
            'model': self.model,
            'task_group': 'audio',
            'task': 'tts',
            'function': 'SpeechSynthesizer',
            'input': {
                'text': text
            },
        }
        return json.dumps({
            HEADER: self._build_header(ActionType.CONTINUE),
            'payload': payload,
        })

    def getFinishRequest(self):
        """Return the JSON string of the finish command (empty input)."""
        return json.dumps({
            HEADER: self._build_header(ActionType.FINISHED),
            'payload': {
                'input': {},
            },
        })
214
+
215
+
216
class SpeechSynthesizer:
    """
    CosyVoice speech synthesis client working over a WebSocket connection.

    Text is sent either in one shot (`call`) or incrementally
    (`streaming_call` ... `streaming_complete`). Audio is delivered to the
    `ResultCallback` passed at construction time, or, when no callback was
    given, accumulated and returned by `call`.
    """

    # Seconds to wait for the server to acknowledge the start command.
    _START_TIMEOUT_SECONDS = 10

    def __init__(
        self,
        model,
        voice,
        format: AudioFormat = AudioFormat.DEFAULT,
        volume=50,
        speech_rate=1.0,
        pitch_rate=1.0,
        seed=0,
        synthesis_type=0,
        instruction=None,
        language_hints: list = None,
        headers=None,
        callback: ResultCallback = None,
        workspace=None,
        url=None,
        additional_params=None,
    ):
        """
        CosyVoice Speech Synthesis SDK
        Parameters:
        -----------
        model: str
            Model name.
        voice: str
            Voice name.
        format: AudioFormat
            Synthesis audio format.
        volume: int
            The volume of the synthesized audio, with a range from 0 to 100. Default is 50.
        speech_rate: float
            The speech rate of the synthesized audio, with a range from 0.5 to 2. Default is 1.0.
        pitch_rate: float
            The pitch of the synthesized audio, with a range from 0.5 to 2. Default is 1.0.
        seed: int
            The seed of the synthesizer, with a range from 0 to 65535. Default is 0.
        synthesis_type: int
            The type of the synthesizer, Default is 0.
        instruction: str
            The instruction of the synthesizer, max length is 128.
        language_hints: list
            The language hints of the synthesizer. supported language: zh, en.
        headers: Dict
            User-defined headers.
        callback: ResultCallback
            Callback to receive real-time synthesis results.
        workspace: str
            Dashscope workspace ID.
        url: str
            Dashscope WebSocket URL.
        additional_params: Dict
            Additional parameters for the Dashscope API.

        Raises:
        -------
        ModelRequired if model is None; InputRequired if format is None or
        no api key is configured in `dashscope.api_key`.
        """
        self.ws = None
        self.start_event = threading.Event()
        self.complete_event = threading.Event()
        self._stopped = threading.Event()
        self._audio_data: bytes = None
        self._is_started = False
        self._cancel = False
        self._cancel_lock = threading.Lock()
        # Switched to False by __update_params when no callback is supplied.
        self.async_call = True
        self._is_first = True
        # Timestamps are in milliseconds; -1 means "not recorded yet".
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._recv_audio_length = 0
        self.last_response = None
        self._close_ws_after_use = True
        self.__update_params(model, voice, format, volume, speech_rate,
                             pitch_rate, seed, synthesis_type, instruction,
                             language_hints, headers, callback, workspace,
                             url, additional_params)

    def __send_str(self, data: str):
        """Send one text frame over the WebSocket (logged at debug level)."""
        logger.debug('>>>send {}'.format(data))
        self.ws.send(data)

    def __connect(self, timeout_seconds=5) -> None:
        """
        Establish a connection to the Bailian WebSocket server,
        which can be used to pre-establish the connection and reduce interaction latency.
        If this function is not used to create the connection,
        it will be established when you first send text via call or streaming_call.
        Parameters:
        -----------
        timeout_seconds: int
            Throws TimeoutError exception if the connection is not established
            within this many seconds.
        """
        self.ws = websocket.WebSocketApp(
            self.url,
            header=self.request.getWebsocketHeaders(headers=self.headers,
                                                    workspace=self.workspace),
            on_message=self.on_message,
            on_error=self.on_error,
            on_close=self.on_close,
        )
        # The socket is serviced by a daemon thread so it never blocks exit.
        self.thread = threading.Thread(target=self.ws.run_forever)
        self.thread.daemon = True
        self.thread.start()
        # Poll until the socket reports connected or the timeout elapses.
        start_time = time.time()
        while (not self.__is_connected()
               and (time.time() - start_time) < timeout_seconds):
            time.sleep(0.1)  # short sleep to avoid busy polling
        if not self.__is_connected():
            raise TimeoutError(
                f'websocket connection could not be established within {timeout_seconds}s. '
                'Please check your network connection, firewall settings, or server status.'
            )

    def __is_connected(self) -> bool:
        """
        Returns True if the connection is established and still exists;
        otherwise, returns False.
        """
        if not self.ws:
            return False
        return bool(self.ws.sock and self.ws.sock.connected)

    def __reset(self):
        """Restore the per-task state so the object can run a new task.

        The WebSocket itself (`self.ws`) is left untouched.
        """
        self.start_event.clear()
        self.complete_event.clear()
        self._stopped.clear()
        self._audio_data: bytes = None
        self._is_started = False
        self._cancel = False
        self.async_call = True
        self._is_first = True
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._recv_audio_length = 0
        self.last_response = None

    def __update_params(
        self,
        model,
        voice,
        format: AudioFormat = AudioFormat.DEFAULT,
        volume=50,
        speech_rate=1.0,
        pitch_rate=1.0,
        seed=0,
        synthesis_type=0,
        instruction=None,
        language_hints: list = None,
        headers=None,
        callback: ResultCallback = None,
        workspace=None,
        url=None,
        additional_params=None,
        close_ws_after_use=True,
    ):
        """
        Validate and store the synthesis settings and build a fresh `Request`
        (and therefore a fresh task id) for the next task.

        Raises ModelRequired / InputRequired when model, format or the
        api key is missing.
        """
        if model is None:
            raise ModelRequired('Model is required!')
        if format is None:
            raise InputRequired('format is required!')
        if url is None:
            url = dashscope.base_websocket_api_url
        self.url = url
        self.apikey = dashscope.api_key
        if self.apikey is None:
            raise InputRequired('apikey is required!')
        self.headers = headers
        self.workspace = workspace
        self.additional_params = additional_params
        self.model = model
        self.voice = voice
        # Locally resolved format / sample rate, used for __str__ and for the
        # audio-length estimate in on_message. AudioFormat.DEFAULT carries the
        # name 'Default', so the comparison has to ignore case.
        self.aformat = format.format
        if self.aformat.upper() == 'DEFAULT':
            self.aformat = 'mp3'
        self.sample_rate = format.sample_rate
        if self.sample_rate == 0:
            self.sample_rate = 22050

        self.callback = callback
        if not self.callback:
            self.async_call = False
        # The request keeps the raw enum values, exactly as before.
        self.request = Request(
            apikey=self.apikey,
            model=model,
            voice=voice,
            format=format.format,
            sample_rate=format.sample_rate,
            bit_rate=format.bit_rate,
            volume=volume,
            speech_rate=speech_rate,
            pitch_rate=pitch_rate,
            seed=seed,
            synthesis_type=synthesis_type,
            instruction=instruction,
            language_hints=language_hints,
        )
        self.last_request_id = self.request.task_id
        self._close_ws_after_use = close_ws_after_use

    def __str__(self):
        return '[SpeechSynthesizer {} desc] model:{}, voice:{}, format:{}, sample_rate:{}, connected:{}'.format(
            self.__hash__(), self.model, self.voice, self.aformat,
            self.sample_rate, self.__is_connected())

    def __start_stream(self):
        """
        Start a synthesis task: connect if no socket exists yet, send the
        start command and wait for the server to acknowledge it.

        Raises InputRequired without a callback, InvalidTask if a task is
        already running and TimeoutError if the start is not acknowledged
        in time.
        """
        self._start_stream_timestamp = time.time() * 1000
        self._first_package_timestamp = -1
        self._recv_audio_length = 0
        if self.callback is None:
            raise InputRequired('callback is required!')
        # reset inner params
        self._stopped.clear()
        self._stream_data = ['']
        self._worker = None
        self._audio_data: bytes = None

        if self._is_started:
            raise InvalidTask('task has already started.')
        # Establish the WebSocket connection when none was pre-created.
        if self.ws is None:
            self.__connect(5)
        # Send the start (run-task) command.
        request = self.request.getStartRequest(self.additional_params)
        self.__send_str(request)
        if not self.start_event.wait(self._START_TIMEOUT_SECONDS):
            raise TimeoutError(
                'start speech synthesizer failed within {}s.'.format(
                    self._START_TIMEOUT_SECONDS))
        self._is_started = True
        if self.callback:
            self.callback.on_open()

    def __submit_text(self, text):
        """Send one piece of text to the running task.

        Raises InvalidTask if the task has not been started or has stopped.
        """
        if not self._is_started:
            raise InvalidTask('speech synthesizer has not been started.')

        if self._stopped.is_set():
            raise InvalidTask('speech synthesizer task has stopped.')
        request = self.request.getContinueRequest(text)
        self.__send_str(request)

    def streaming_call(self, text: str):
        """
        Streaming input mode: You can call the streaming_call function multiple times to send text.
        A session will be created on the first call.
        The session ends after calling streaming_complete.
        Parameters:
        -----------
        text: str
            utf-8 encoded text
        """
        if self._is_first:
            self._is_first = False
            self.__start_stream()
        self.__submit_text(text)
        return None

    def streaming_complete(self, complete_timeout_millis=600000):
        """
        Synchronously stop the streaming input speech synthesis task.
        Wait for all remaining synthesized audio before returning

        Parameters:
        -----------
        complete_timeout_millis: int
            Throws TimeoutError exception if it times out. If the timeout is not None
            and greater than zero, it will wait for the corresponding number of
            milliseconds; otherwise, it will wait indefinitely.
        """
        if not self._is_started:
            raise InvalidTask('speech synthesizer has not been started.')
        if self._stopped.is_set():
            raise InvalidTask('speech synthesizer task has stopped.')
        request = self.request.getFinishRequest()
        self.__send_str(request)
        self.__waiting_for_complete(complete_timeout_millis)

    def __waiting_for_complete(self, timeout):
        """
        Block until the task completes, then close the socket (when
        configured to) and mark the task as stopped.

        `timeout` is in milliseconds; None or a non-positive value waits
        indefinitely. Raises TimeoutError when the wait times out.
        """
        if timeout is not None and timeout > 0:
            if not self.complete_event.wait(timeout=timeout / 1000):
                raise TimeoutError(
                    f'speech synthesizer wait for complete timeout {timeout}ms'
                )
        else:
            self.complete_event.wait()
        if self._close_ws_after_use:
            self.close()
        self._stopped.set()
        self._is_started = False

    def async_streaming_complete(self, complete_timeout_millis=600000):
        """
        Asynchronously stop the streaming input speech synthesis task, returns immediately.
        You need to listen and handle the STREAM_INPUT_TTS_EVENT_SYNTHESIS_COMPLETE event in the on_event callback.
        Do not destroy the object and callback before this event.

        Parameters:
        -----------
        complete_timeout_millis: int
            Timeout of the background wait. If the timeout is not None
            and greater than zero, it will wait for the corresponding number of
            milliseconds; otherwise, it will wait indefinitely. A timeout is
            raised as TimeoutError inside the background thread, not in the
            caller.
        """

        if not self._is_started:
            raise InvalidTask('speech synthesizer has not been started.')
        if self._stopped.is_set():
            raise InvalidTask('speech synthesizer task has stopped.')
        request = self.request.getFinishRequest()
        self.__send_str(request)
        thread = threading.Thread(target=self.__waiting_for_complete,
                                  args=(complete_timeout_millis, ))
        thread.start()

    def streaming_cancel(self):
        """
        Immediately terminate the streaming input speech synthesis task
        and discard any remaining audio that is not yet delivered.
        """

        if not self._is_started:
            raise InvalidTask('speech synthesizer has not been started.')
        if self._stopped.is_set():
            return
        request = self.request.getFinishRequest()
        self.__send_str(request)
        self.ws.close()
        # Wake up anything blocked on start or completion.
        self.start_event.set()
        self.complete_event.set()

    def on_message(self, ws, message):
        """
        WebSocket message handler.

        Text frames are parsed as JSON task events (started / finished /
        failed / generated); binary frames are treated as audio data.
        """
        if isinstance(message, str):
            logger.debug('<<<recv {}'.format(message))
            try:
                json_data = json.loads(message)
            except json.JSONDecodeError:
                logger.error('Failed to parse message as JSON.')
                raise Exception('Failed to parse message as JSON.')
            self.last_response = json_data
            event = json_data['header'][EVENT_KEY]
            if EventType.STARTED == event:
                self.start_event.set()
            elif EventType.FINISHED == event:
                self.complete_event.set()
                if self.callback:
                    self.callback.on_complete()
                    self.callback.on_close()
            elif EventType.FAILED == event:
                # Release both waiters so no caller stays blocked.
                self.start_event.set()
                self.complete_event.set()
                if self.async_call:
                    self.callback.on_error(message)
                    self.callback.on_close()
                else:
                    logger.error(f'TaskFailed: {message}')
                    raise Exception(f'TaskFailed: {message}')
            elif EventType.GENERATED == event:
                if self.callback:
                    self.callback.on_event(message)
        elif isinstance(message, (bytes, bytearray)):
            logger.debug('<<<recv binary {}'.format(len(message)))
            if self._recv_audio_length == 0:
                self._first_package_timestamp = time.time() * 1000
                logger.debug('first package delay {}'.format(
                    self._first_package_timestamp -
                    self._start_stream_timestamp))
            # Rough duration estimate in ms at 2 bytes per sample; used for
            # debug statistics only.
            # NOTE(review): presumably only accurate for 16-bit mono PCM.
            self._recv_audio_length += len(message) / (2 * self.sample_rate /
                                                       1000)
            # An empty first frame leaves the length at 0; skip the ratio
            # instead of dividing by zero.
            if self._recv_audio_length > 0:
                current = time.time() * 1000
                current_rtf = (current - self._start_stream_timestamp
                               ) / self._recv_audio_length
                logger.debug('total audio {} ms, current_rtf: {}'.format(
                    self._recv_audio_length, current_rtf))
            # Audio is only accumulated for the blocking (non-async) mode.
            if not self.async_call:
                if self._audio_data is None:
                    self._audio_data = bytes(message)
                else:
                    self._audio_data = self._audio_data + bytes(message)
            if self.callback:
                self.callback.on_data(message)

    def call(self, text: str, timeout_millis=None):
        """
        Speech synthesis.
        If callback is set, the audio will be returned in real-time through the on_data interface.
        Otherwise, this function blocks until all audio is received and then returns the complete audio data.

        Parameters:
        -----------
        text: str
            utf-8 encoded text
        timeout_millis:
            Integer or None
            If the timeout is set to a value greater than zero and not None,
            it will wait for the corresponding number of milliseconds;
            otherwise, it will wait indefinitely.
        return: bytes
            If a callback is not set during initialization, the complete audio is returned
            as the function's return value. Otherwise, the return value is None.
        """
        # Non-streaming synthesis is emulated with the streaming protocol.
        # Build a new dict so the caller's additional_params is not mutated.
        self.additional_params = {
            **(self.additional_params or {}), 'enable_ssml': True
        }
        if not self.callback:
            # __start_stream requires a callback; use a no-op one. async_call
            # stays False, so the audio is still accumulated and returned.
            self.callback = ResultCallback()
        self.__start_stream()
        self.__submit_text(text)
        if self.async_call:
            self.async_streaming_complete(timeout_millis)
            return None
        else:
            self.streaming_complete(timeout_millis)
            return self._audio_data

    def on_close(self, ws, close_status_code, close_msg):
        """WebSocket close handler; intentionally a no-op."""
        pass

    def on_error(self, ws, error):
        """WebSocket error handler: log the error and raise it as Exception."""
        logger.error(f'websocket closed due to {error}')
        raise Exception(f'websocket closed due to {error}')

    def close(self):
        """Close the WebSocket connection; a no-op if none was ever created."""
        if self.ws is not None:
            self.ws.close()

    def get_last_request_id(self):
        """Return the task id of the most recently prepared request."""
        return self.last_request_id

    def get_first_package_delay(self):
        """First Package Delay is the time between start sending text and receive first audio package
        """
        return self._first_package_timestamp - self._start_stream_timestamp

    def get_response(self):
        """Return the last JSON message received from the server (or None)."""
        return self.last_response
674
+
675
+
676
class SpeechSynthesizerObjectPool:
    """
    Process-wide singleton pool of `SpeechSynthesizer` objects whose
    WebSocket connections are established ahead of time.

    A background thread periodically re-connects pooled objects whose
    connection is missing, dropped, or older than the reconnect interval.
    """
    _instance_lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        # Check, lock, check again: only the first caller creates the instance.
        if not hasattr(SpeechSynthesizerObjectPool, '_instance'):
            with SpeechSynthesizerObjectPool._instance_lock:
                if not hasattr(SpeechSynthesizerObjectPool, '_instance'):
                    SpeechSynthesizerObjectPool._instance = object.__new__(cls)
        return SpeechSynthesizerObjectPool._instance

    class PoolObject:
        """A pooled synthesizer plus the time its connection was made.

        `connect_time` is -1 while the synthesizer is not connected.
        """
        def __init__(self, synthesizer):
            self.synthesizer: SpeechSynthesizer = synthesizer
            self.connect_time = -1

        def __str__(self):
            return f'synthesizer: {self.synthesizer}, connect_time: {self.connect_time}'

    def __init__(self,
                 max_size: int = 20,
                 url=None,
                 headers=None,
                 workspace=None):
        """
        Speech synthesis object pool that follows the singleton pattern,
        establishes WebSocket connections in advance to avoid connection overhead.
        The pool pre-creates max_size connected synthesizer objects. Objects
        taken with borrow_synthesizer should be handed back with
        return_synthesizer; the pool only renews the objects it currently
        holds.

        Because the pool is a singleton, constructing it again while it is
        running returns the existing pool unchanged; it is only rebuilt
        after shutdown() has been called.

        Parameters:
        -----------
        max_size: int
            Size of the object pool, with a value range of 1 to 100.
        url: str
            WebSocket URL used for every pooled connection.
        headers: Dict
            User-defined headers used for every pooled connection.
        workspace: str
            Workspace ID used for every pooled connection.
        """
        if max_size <= 0:
            raise ValueError('max_size must be greater than 0')
        if max_size > 100:
            raise ValueError('max_size must not be greater than 100')
        # __new__ always hands out the same instance, so __init__ runs on
        # every construction. Rebuilding a live pool would leak its
        # connections and start a second worker thread.
        if getattr(self, '_initialized', False) and not self._stop:
            logger.warning(
                '[SpeechSynthesizerObjectPool] pool is already running, '
                'ignoring re-initialization')
            return
        self._initialized = False
        self.DEFAULT_MODEL = 'cosyvoice-v1'
        self.DEFAULT_VOICE = 'longxiaochun'
        self.DEFAULT_RECONNECT_INTERVAL = 30
        self.DEFAULT_URL = url
        self.DEFAUTL_HEADERS = headers
        self.DEFAULT_WORKSPACE = workspace
        self._pool = []
        # Parallel to _pool: False while an object is being re-connected,
        # which keeps it from being borrowed.
        self._avaliable = []
        self._pool_size = max_size
        for _ in range(self._pool_size):
            tmpPoolObject = self.PoolObject(self.__get_default_synthesizer())
            tmpPoolObject.synthesizer._SpeechSynthesizer__connect()
            tmpPoolObject.connect_time = time.time()
            self._pool.append(tmpPoolObject)
            self._avaliable.append(True)
        self._borrowed_object_num = 0
        self._remain_object_num = max_size
        self._lock = threading.Lock()
        self._stop = False
        self._stop_lock = threading.Lock()
        self._working_thread = threading.Thread(target=self.__auto_reconnect,
                                                args=())
        self._working_thread.start()
        self._initialized = True

    def __get_default_synthesizer(self) -> SpeechSynthesizer:
        """Create an unconnected synthesizer with the pool's default settings."""
        return SpeechSynthesizer(model=self.DEFAULT_MODEL,
                                 voice=self.DEFAULT_VOICE,
                                 url=self.DEFAULT_URL,
                                 headers=self.DEFAUTL_HEADERS,
                                 workspace=self.DEFAULT_WORKSPACE)

    def __get_reconnect_interval(self):
        """Return the reconnect interval in seconds with +/-5s random jitter."""
        return self.DEFAULT_RECONNECT_INTERVAL + random.random() * 10 - 5

    @staticmethod
    def __close_quietly(synthesizer):
        """Best-effort close of a synthesizer's connection; never raises."""
        try:
            synthesizer.close()
        except Exception as e:
            logger.debug(
                '[SpeechSynthesizerObjectPool] ignore error while closing synthesizer: {}'
                .format(e))

    def __connect_pool_object(self, poolObject) -> bool:
        """
        Connect the synthesizer of `poolObject` and stamp its connect time.

        On failure the error is logged and connect_time is set to -1 so the
        next scan retries; the worker thread must survive a failed connect.
        """
        try:
            poolObject.synthesizer._SpeechSynthesizer__connect()
        except Exception as e:
            logger.error(
                '[SpeechSynthesizerObjectPool] connect synthesizer failed: {}'
                .format(e))
            self.__close_quietly(poolObject.synthesizer)
            poolObject.connect_time = -1
            return False
        poolObject.connect_time = time.time()
        return True

    def __auto_reconnect(self):
        """
        Worker loop, run once per second until shutdown: connect pooled
        objects that were never connected and replace those whose
        connection dropped or exceeded the reconnect interval.
        """
        logger.debug(
            'speech synthesizer object pool auto reconnect thread start')
        while True:
            objects_need_to_connect = []
            objects_need_to_renew = []
            with self._lock:
                if self._stop:
                    return
                logger.debug('scanning queue borr: {}/{} remain: {}/{}'.format(
                    self._borrowed_object_num, self._pool_size,
                    self._remain_object_num, self._pool_size))

                current_time = time.time()
                for idx, poolObject in enumerate(self._pool):
                    if poolObject.connect_time == -1:
                        objects_need_to_connect.append(poolObject)
                        self._avaliable[idx] = False
                    elif (not poolObject.synthesizer.
                          _SpeechSynthesizer__is_connected()) or (
                              current_time - poolObject.connect_time >
                              self.__get_reconnect_interval()):
                        # Connection dropped, or older than the interval.
                        objects_need_to_renew.append(poolObject)
                        self._avaliable[idx] = False
            # Connecting is slow, so it happens outside the lock; the objects
            # were flagged unavailable above and cannot be borrowed meanwhile.
            for poolObject in objects_need_to_connect:
                logger.info(
                    '[SpeechSynthesizerObjectPool] pre-connect new synthesizer'
                )
                self.__connect_pool_object(poolObject)
            for poolObject in objects_need_to_renew:
                logger.info(
                    '[SpeechSynthesizerObjectPool] renew synthesizer after {} s'
                    .format(current_time - poolObject.connect_time))
                # Close the old connection so it is not left open.
                self.__close_quietly(poolObject.synthesizer)
                poolObject.synthesizer = self.__get_default_synthesizer()
                self.__connect_pool_object(poolObject)
            with self._lock:
                # borrow_synthesizer still checks the connection itself, so
                # objects that failed to connect are not handed out.
                self._avaliable[:] = [True] * len(self._avaliable)
            time.sleep(1)

    def shutdown(self):
        """
        This is a ThreadSafe Method.
        destroy the object pool: stop the worker thread and close the
        connections of the objects still held by the pool.
        """
        logger.debug('[SpeechSynthesizerObjectPool] start shutdown')
        with self._lock:
            self._stop = True
            pool_objects = self._pool
            self._pool = []
            self._avaliable = []
        # Wait for the worker first so it cannot re-connect what we close.
        self._working_thread.join()
        for poolObject in pool_objects:
            self.__close_quietly(poolObject.synthesizer)
        logger.debug('[SpeechSynthesizerObjectPool] shutdown complete')

    def borrow_synthesizer(
        self,
        model,
        voice,
        format: AudioFormat = AudioFormat.DEFAULT,
        volume=50,
        speech_rate=1.0,
        pitch_rate=1.0,
        seed=0,
        synthesis_type=0,
        instruction=None,
        language_hints: list = None,
        headers=None,
        callback: ResultCallback = None,
        workspace=None,
        url=None,
        additional_params=None,
    ):
        """
        This is a ThreadSafe Method.
        get a synthesizer object from the pool.
        objects taken from the pool need to be returned with
        return_synthesizer.
        If there is no connected synthesizer object in the pool,
        a new, not yet connected synthesizer object will be created and
        returned.

        The synthesis parameters are applied to the returned object. The
        headers, workspace and url arguments are accepted but not used:
        the values given to the pool constructor apply instead.
        """
        logger.debug('[SpeechSynthesizerObjectPool] get synthesizer')
        synthesizer: SpeechSynthesizer = None
        with self._lock:
            # Take the first available pooled object that is still connected.
            for idx, poolObject in enumerate(self._pool):
                if (self._avaliable[idx] and poolObject.synthesizer.
                        _SpeechSynthesizer__is_connected()):
                    synthesizer = poolObject.synthesizer
                    self._borrowed_object_num += 1
                    self._remain_object_num -= 1
                    self._pool.pop(idx)
                    self._avaliable.pop(idx)
                    break

        # Nothing usable in the pool: fall back to a new unconnected object.
        if synthesizer is None:
            synthesizer = self.__get_default_synthesizer()
            logger.warning(
                '[SpeechSynthesizerObjectPool] object pool is exausted, create new synthesizer'
            )
        synthesizer._SpeechSynthesizer__reset()
        # The trailing False is close_ws_after_use: keep the connection open
        # after a task so the object can be reused.
        synthesizer._SpeechSynthesizer__update_params(
            model, voice, format, volume, speech_rate, pitch_rate, seed,
            synthesis_type, instruction, language_hints, self.DEFAUTL_HEADERS,
            callback, self.DEFAULT_WORKSPACE, self.DEFAULT_URL,
            additional_params, False)
        return synthesizer

    def return_synthesizer(self, synthesizer) -> bool:
        """
        This is a ThreadSafe Method.
        return a synthesizer object back to the pool.

        Returns True when the object was put back, False when it is not a
        SpeechSynthesizer or when no object is currently borrowed.
        """
        if not isinstance(synthesizer, SpeechSynthesizer):
            logger.error(
                '[SpeechSynthesizerObjectPool] return_synthesizer: synthesizer is not a SpeechSynthesizer object'
            )
            return False
        with self._lock:
            if self._borrowed_object_num <= 0:
                logger.debug(
                    '[SpeechSynthesizerObjectPool] pool is full, drop returned object'
                )
                return False
            poolObject = self.PoolObject(synthesizer)
            poolObject.connect_time = time.time()
            self._pool.append(poolObject)
            self._avaliable.append(True)
            self._borrowed_object_num -= 1
            self._remain_object_num += 1
        logger.debug(
            '[SpeechSynthesizerObjectPool] return synthesizer back to pool'
        )
        return True