dashscope 1.8.0__py3-none-any.whl → 1.25.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dashscope/__init__.py +61 -14
- dashscope/aigc/__init__.py +10 -3
- dashscope/aigc/chat_completion.py +282 -0
- dashscope/aigc/code_generation.py +145 -0
- dashscope/aigc/conversation.py +71 -12
- dashscope/aigc/generation.py +288 -16
- dashscope/aigc/image_synthesis.py +473 -31
- dashscope/aigc/multimodal_conversation.py +299 -14
- dashscope/aigc/video_synthesis.py +610 -0
- dashscope/api_entities/aiohttp_request.py +8 -5
- dashscope/api_entities/api_request_data.py +4 -2
- dashscope/api_entities/api_request_factory.py +68 -20
- dashscope/api_entities/base_request.py +20 -3
- dashscope/api_entities/chat_completion_types.py +344 -0
- dashscope/api_entities/dashscope_response.py +243 -15
- dashscope/api_entities/encryption.py +179 -0
- dashscope/api_entities/http_request.py +216 -62
- dashscope/api_entities/websocket_request.py +43 -34
- dashscope/app/__init__.py +5 -0
- dashscope/app/application.py +203 -0
- dashscope/app/application_response.py +246 -0
- dashscope/assistants/__init__.py +16 -0
- dashscope/assistants/assistant_types.py +175 -0
- dashscope/assistants/assistants.py +311 -0
- dashscope/assistants/files.py +197 -0
- dashscope/audio/__init__.py +4 -2
- dashscope/audio/asr/__init__.py +17 -1
- dashscope/audio/asr/asr_phrase_manager.py +203 -0
- dashscope/audio/asr/recognition.py +167 -27
- dashscope/audio/asr/transcription.py +107 -14
- dashscope/audio/asr/translation_recognizer.py +1006 -0
- dashscope/audio/asr/vocabulary.py +177 -0
- dashscope/audio/qwen_asr/__init__.py +7 -0
- dashscope/audio/qwen_asr/qwen_transcription.py +189 -0
- dashscope/audio/qwen_omni/__init__.py +11 -0
- dashscope/audio/qwen_omni/omni_realtime.py +524 -0
- dashscope/audio/qwen_tts/__init__.py +5 -0
- dashscope/audio/qwen_tts/speech_synthesizer.py +77 -0
- dashscope/audio/qwen_tts_realtime/__init__.py +10 -0
- dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py +355 -0
- dashscope/audio/tts/__init__.py +2 -0
- dashscope/audio/tts/speech_synthesizer.py +5 -0
- dashscope/audio/tts_v2/__init__.py +12 -0
- dashscope/audio/tts_v2/enrollment.py +179 -0
- dashscope/audio/tts_v2/speech_synthesizer.py +886 -0
- dashscope/cli.py +157 -37
- dashscope/client/base_api.py +652 -87
- dashscope/common/api_key.py +2 -0
- dashscope/common/base_type.py +135 -0
- dashscope/common/constants.py +13 -16
- dashscope/common/env.py +2 -0
- dashscope/common/error.py +58 -22
- dashscope/common/logging.py +2 -0
- dashscope/common/message_manager.py +2 -0
- dashscope/common/utils.py +276 -46
- dashscope/customize/__init__.py +0 -0
- dashscope/customize/customize_types.py +192 -0
- dashscope/customize/deployments.py +146 -0
- dashscope/customize/finetunes.py +234 -0
- dashscope/embeddings/__init__.py +5 -1
- dashscope/embeddings/batch_text_embedding.py +208 -0
- dashscope/embeddings/batch_text_embedding_response.py +65 -0
- dashscope/embeddings/multimodal_embedding.py +118 -10
- dashscope/embeddings/text_embedding.py +13 -1
- dashscope/{file.py → files.py} +19 -4
- dashscope/io/input_output.py +2 -0
- dashscope/model.py +11 -2
- dashscope/models.py +43 -0
- dashscope/multimodal/__init__.py +20 -0
- dashscope/multimodal/dialog_state.py +56 -0
- dashscope/multimodal/multimodal_constants.py +28 -0
- dashscope/multimodal/multimodal_dialog.py +648 -0
- dashscope/multimodal/multimodal_request_params.py +313 -0
- dashscope/multimodal/tingwu/__init__.py +10 -0
- dashscope/multimodal/tingwu/tingwu.py +80 -0
- dashscope/multimodal/tingwu/tingwu_realtime.py +579 -0
- dashscope/nlp/__init__.py +0 -0
- dashscope/nlp/understanding.py +64 -0
- dashscope/protocol/websocket.py +3 -0
- dashscope/rerank/__init__.py +0 -0
- dashscope/rerank/text_rerank.py +69 -0
- dashscope/resources/qwen.tiktoken +151643 -0
- dashscope/threads/__init__.py +26 -0
- dashscope/threads/messages/__init__.py +0 -0
- dashscope/threads/messages/files.py +113 -0
- dashscope/threads/messages/messages.py +220 -0
- dashscope/threads/runs/__init__.py +0 -0
- dashscope/threads/runs/runs.py +501 -0
- dashscope/threads/runs/steps.py +112 -0
- dashscope/threads/thread_types.py +665 -0
- dashscope/threads/threads.py +212 -0
- dashscope/tokenizers/__init__.py +7 -0
- dashscope/tokenizers/qwen_tokenizer.py +111 -0
- dashscope/tokenizers/tokenization.py +125 -0
- dashscope/tokenizers/tokenizer.py +45 -0
- dashscope/tokenizers/tokenizer_base.py +32 -0
- dashscope/utils/__init__.py +0 -0
- dashscope/utils/message_utils.py +838 -0
- dashscope/utils/oss_utils.py +243 -0
- dashscope/utils/param_utils.py +29 -0
- dashscope/version.py +3 -1
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/METADATA +53 -50
- dashscope-1.25.6.dist-info/RECORD +112 -0
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/WHEEL +1 -1
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/entry_points.txt +0 -1
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info/licenses}/LICENSE +2 -4
- dashscope/deployment.py +0 -129
- dashscope/finetune.py +0 -149
- dashscope-1.8.0.dist-info/RECORD +0 -49
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1006 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
import uuid
|
|
8
|
+
from http import HTTPStatus
|
|
9
|
+
from queue import Queue
|
|
10
|
+
from threading import Timer
|
|
11
|
+
from typing import Any, Dict, List
|
|
12
|
+
|
|
13
|
+
from dashscope.client.base_api import BaseApi
|
|
14
|
+
from dashscope.common.constants import ApiProtocol
|
|
15
|
+
from dashscope.common.error import (InputDataRequired, InputRequired,
|
|
16
|
+
InvalidParameter, InvalidTask,
|
|
17
|
+
ModelRequired)
|
|
18
|
+
from dashscope.common.logging import logger
|
|
19
|
+
from dashscope.common.utils import _get_task_group_and_task
|
|
20
|
+
from dashscope.protocol.websocket import WebsocketStreamingMode
|
|
21
|
+
|
|
22
|
+
# Keys under which a websocket response's output payload carries the
# translation list and the transcription result, respectively.
DASHSCOPE_TRANSLATION_KEY = 'translations'
DASHSCOPE_TRANSCRIPTION_KEY = 'transcription'
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ThreadSafeBool:
    """A boolean flag whose reads and writes are serialized by a lock,
    so it can be shared safely between threads.
    """
    def __init__(self, initial_value=False):
        self._lock = threading.Lock()
        self._value = initial_value

    def set(self, value):
        """Atomically overwrite the flag with *value*."""
        with self._lock:
            self._value = value

    def get(self):
        """Atomically read and return the current flag value."""
        with self._lock:
            return self._value
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class WordObj():
    """A single recognized word with its timing information."""
    def __init__(self) -> None:
        # Word text plus [begin_time, end_time] timestamps.
        self.text: str = None
        self.begin_time: int = None
        self.end_time: int = None
        # Whether the word is final and will no longer change.
        self.fixed: bool = False
        # Original JSON payload, kept for __str__/__repr__.
        self._raw_data = None

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a Word object from a JSON dictionary.
        """
        word = WordObj()
        word._raw_data = json_data
        word.text = json_data['text']
        word.begin_time = json_data['begin_time']
        word.end_time = json_data['end_time']
        word.fixed = json_data['fixed']
        return word

    def __str__(self) -> str:
        return 'Word: ' + json.dumps(self._raw_data, ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class SentenceBaseObj():
    """A sentence-level result: text, timing and the word list."""
    def __init__(self) -> None:
        self.sentence_id: int = -1
        self.text: str = None
        self.begin_time: int = None
        self.end_time: int = None
        self.words: List[WordObj] = []
        # Original JSON payload, kept for __str__/__repr__.
        self._raw_data = None

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a SentenceBase object from a JSON dictionary.
        """
        sentence = SentenceBaseObj()
        sentence.sentence_id = json_data['sentence_id']
        sentence.text = json_data['text']
        sentence.begin_time = json_data['begin_time']
        end_time = json_data.get('end_time')
        # While the sentence is still open the server reports no
        # end_time; fall back to the stream's current time instead.
        sentence.end_time = end_time if end_time is not None \
            else json_data['current_time']
        sentence.words = [WordObj.from_json(w) for w in json_data['words']]
        sentence._raw_data = json_data
        return sentence

    def __str__(self) -> str:
        return json.dumps(self._raw_data, ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class TranscriptionResult(SentenceBaseObj):
    """A streaming transcription sentence.

    Extends SentenceBaseObj with the uncommitted 'stash' fragment,
    the sentence-end flag and VAD (voice activity detection) fields.
    """
    def __init__(self) -> None:
        # Fix: initialize the inherited sentence fields (sentence_id,
        # text, begin_time, end_time, words); the original skipped
        # super().__init__(), leaving them unset on instances that are
        # constructed directly rather than via from_json().
        super().__init__()
        # Partial sentence text the server has not committed yet.
        self.stash: SentenceBaseObj = None
        self.is_sentence_end = False
        # vad related
        self.vad_pre_end: bool = False
        self.pre_end_failed: bool = False
        self.pre_end_timemillis: int = -1
        self.pre_end_start_time: int = -1
        self.pre_end_end_time: int = -1
        self._raw_data = None

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a TranscriptionResult object from a JSON dictionary.

        Required keys: sentence_id, text, begin_time, words, and either
        end_time or current_time. Optional keys: sentence_end, stash,
        vad_pre_end, pre_end_failed, pre_end_start_time, pre_end_end_time.
        """
        transcription = TranscriptionResult()
        transcription.sentence_id = json_data['sentence_id']
        transcription.text = json_data['text']
        transcription.begin_time = json_data['begin_time']
        if json_data.get('end_time') is not None:
            transcription.end_time = json_data['end_time']
        else:
            # Open sentence: use the stream's current time as a stand-in.
            transcription.end_time = json_data['current_time']
        transcription.words = [
            WordObj.from_json(word) for word in json_data['words']
        ]
        # May be None when the server omits 'sentence_end'.
        transcription.is_sentence_end = json_data.get('sentence_end')
        if 'stash' in json_data:
            transcription.stash = SentenceBaseObj.from_json(json_data['stash'])
        if 'vad_pre_end' in json_data:
            transcription.vad_pre_end = json_data['vad_pre_end']
        if 'pre_end_failed' in json_data:
            transcription.pre_end_failed = json_data['pre_end_failed']
        if 'pre_end_start_time' in json_data:
            transcription.pre_end_start_time = json_data['pre_end_start_time']
        if 'pre_end_end_time' in json_data:
            transcription.pre_end_end_time = json_data['pre_end_end_time']
        # Fix: the original assigned _raw_data twice; keep one assignment.
        transcription._raw_data = json_data
        return transcription

    def __str__(self) -> str:
        return 'Transcriptions: ' + json.dumps(self._raw_data,
                                               ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class Translation(SentenceBaseObj):
    """A translated sentence in one target language.

    Extends SentenceBaseObj with the target language code, the
    uncommitted 'stash' fragment, the sentence-end flag and VAD fields.
    """
    def __init__(self) -> None:
        # Fix: initialize the inherited sentence fields; the original
        # skipped super().__init__(), leaving them unset on instances
        # constructed directly rather than via from_json().
        super().__init__()
        # Target language code (the 'lang' field of the payload).
        self.language: str = None
        # Partial sentence text the server has not committed yet.
        self.stash: SentenceBaseObj = None
        self.is_sentence_end = False
        # vad related
        self.vad_pre_end: bool = False
        self.pre_end_failed: bool = False
        self.pre_end_timemillis: int = -1
        self.pre_end_start_time: int = -1
        self.pre_end_end_time: int = -1
        self._raw_data = None

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a Translation object from a JSON dictionary.

        Required keys: sentence_id, text, begin_time, words, lang, and
        either end_time or current_time. Optional keys: sentence_end,
        stash, vad_pre_end, pre_end_failed, pre_end_start_time,
        pre_end_end_time.
        """
        translation = Translation()
        translation.sentence_id = json_data['sentence_id']
        translation.text = json_data['text']
        translation.begin_time = json_data['begin_time']
        if json_data.get('end_time') is not None:
            translation.end_time = json_data['end_time']
        else:
            # Open sentence: use the stream's current time as a stand-in.
            translation.end_time = json_data['current_time']
        translation.words = [
            WordObj.from_json(word) for word in json_data['words']
        ]
        translation.language = json_data['lang']
        # May be None when the server omits 'sentence_end'.
        translation.is_sentence_end = json_data.get('sentence_end')
        if 'stash' in json_data:
            translation.stash = SentenceBaseObj.from_json(json_data['stash'])
        if 'vad_pre_end' in json_data:
            translation.vad_pre_end = json_data['vad_pre_end']
        if 'pre_end_failed' in json_data:
            translation.pre_end_failed = json_data['pre_end_failed']
        if 'pre_end_start_time' in json_data:
            translation.pre_end_start_time = json_data['pre_end_start_time']
        if 'pre_end_end_time' in json_data:
            translation.pre_end_end_time = json_data['pre_end_end_time']
        # Fix: the original assigned _raw_data twice; keep one assignment.
        translation._raw_data = json_data
        return translation

    def __str__(self) -> str:
        return 'Translation: ' + json.dumps(self._raw_data, ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class TranslationResult():
    """All translations of one response event, keyed by target language."""
    def __init__(self) -> None:
        # Fix: the original annotation was Dict[str:Translation] (a slice,
        # not a valid generic); the correct form is Dict[str, Translation].
        self.translations: 'Dict[str, Translation]' = {}
        # True when any contained translation ends its sentence.
        self.is_sentence_end = False
        self._raw_data = None

    def get_translation(self, language) -> 'Translation':
        """Return the Translation for *language*, or None if absent."""
        if self.translations is None:
            return None
        return self.translations.get(language)

    def get_language_list(self, ) -> List[str]:
        """Return the list of target language codes present."""
        if self.translations is None:
            return None
        return list(self.translations.keys())

    @staticmethod
    def from_json(json_data: List):
        """Create a TranslationResult object from a JSON list of
        per-language translation dictionaries.

        Raises:
            InvalidParameter: If an element is not a dictionary.
        """
        result = TranslationResult()
        result._raw_data = json_data
        for translation_json in json_data:
            if not isinstance(translation_json, dict):
                raise InvalidParameter(
                    f'Invalid translation json data: {translation_json}')
            else:
                translation = Translation.from_json(translation_json)
                result.translations[translation.language] = translation
                if translation.is_sentence_end:
                    result.is_sentence_end = True
        return result

    def __str__(self) -> str:
        return 'TranslationList: ' + json.dumps(self._raw_data,
                                                ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
class TranslationRecognizerResultPack():
    """Aggregated outcome of a one-shot call(): parallel lists of
    transcription results, translation results and usage records,
    plus the request id and any error encountered.
    """
    def __init__(self) -> None:
        self.request_id: str = None
        self.error_message = None
        # One entry per completed sentence; lists stay index-aligned.
        self.transcription_result_list: 'List[TranscriptionResult]' = []
        self.translation_result_list: 'List[TranslationResult]' = []
        self.usage_list: List = []
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
class TranslationRecognizerCallback():
    """An interface that defines callback methods for getting translation recognizer results. # noqa E501
    Derive from this class and implement its function to provide your own data.
    """
    def on_open(self) -> None:
        """Called once the session is established."""
        pass

    def on_complete(self) -> None:
        """Called when the server signals that all results were sent."""
        pass

    def on_error(self, message) -> None:
        """Called with the failing response when an error occurs."""
        pass

    def on_close(self) -> None:
        """Called when the session is torn down."""
        pass

    def on_event(self, request_id, transcription_result: 'TranscriptionResult',
                 translation_result: 'TranslationResult', usage) -> None:
        """Called for each result event; either result may be None."""
        pass
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
class TranslationRecognizerRealtime(BaseApi):
|
|
275
|
+
"""TranslationRecognizerRealtime interface.
|
|
276
|
+
|
|
277
|
+
Args:
|
|
278
|
+
model (str): The requested model_id.
|
|
279
|
+
callback (TranslationRecognizerRealtime): A callback that returns
|
|
280
|
+
TranslationRecognizerRealtime results.
|
|
281
|
+
format (str): The input audio format.
|
|
282
|
+
sample_rate (int): The input audio sample rate.
|
|
283
|
+
workspace (str): The dashscope workspace id.
|
|
284
|
+
|
|
285
|
+
**kwargs:
|
|
286
|
+
phrase_id (list, `optional`): The ID of phrase.
|
|
287
|
+
disfluency_removal_enabled(bool, `optional`): Filter mood words,
|
|
288
|
+
turned off by default.
|
|
289
|
+
diarization_enabled (bool, `optional`): Speech auto diarization,
|
|
290
|
+
turned off by default.
|
|
291
|
+
speaker_count (int, `optional`): The number of speakers.
|
|
292
|
+
timestamp_alignment_enabled (bool, `optional`): Timestamp-alignment
|
|
293
|
+
calibration, turned off by default.
|
|
294
|
+
special_word_filter(str, `optional`): Sensitive word filter.
|
|
295
|
+
audio_event_detection_enabled(bool, `optional`):
|
|
296
|
+
Audio event detection, turned off by default.
|
|
297
|
+
|
|
298
|
+
Raises:
|
|
299
|
+
InputRequired: Input is required.
|
|
300
|
+
"""
|
|
301
|
+
|
|
302
|
+
SILENCE_TIMEOUT_S = 23
|
|
303
|
+
|
|
304
|
+
def __init__(self,
|
|
305
|
+
model: str,
|
|
306
|
+
callback: TranslationRecognizerCallback,
|
|
307
|
+
format: str,
|
|
308
|
+
sample_rate: int,
|
|
309
|
+
transcription_enabled: bool = True,
|
|
310
|
+
source_language: str = None,
|
|
311
|
+
translation_enabled: bool = False,
|
|
312
|
+
workspace: str = None,
|
|
313
|
+
**kwargs):
|
|
314
|
+
if model is None:
|
|
315
|
+
raise ModelRequired('Model is required!')
|
|
316
|
+
if format is None:
|
|
317
|
+
raise InputRequired('format is required!')
|
|
318
|
+
if sample_rate is None:
|
|
319
|
+
raise InputRequired('sample_rate is required!')
|
|
320
|
+
|
|
321
|
+
self.model = model
|
|
322
|
+
self.format = format
|
|
323
|
+
self.sample_rate = sample_rate
|
|
324
|
+
self.source_language = source_language
|
|
325
|
+
self.transcription_enabled = transcription_enabled
|
|
326
|
+
self.translation_enabled = translation_enabled
|
|
327
|
+
# continuous recognition with start() or once recognition with call()
|
|
328
|
+
self._recognition_once = False
|
|
329
|
+
self._callback = callback
|
|
330
|
+
self._running = False
|
|
331
|
+
self._stream_data = Queue()
|
|
332
|
+
self._worker = None
|
|
333
|
+
self._silence_timer = None
|
|
334
|
+
self._kwargs = kwargs
|
|
335
|
+
self._workspace = workspace
|
|
336
|
+
self._start_stream_timestamp = -1
|
|
337
|
+
self._first_package_timestamp = -1
|
|
338
|
+
self._stop_stream_timestamp = -1
|
|
339
|
+
self._on_complete_timestamp = -1
|
|
340
|
+
self.request_id_confirmed = False
|
|
341
|
+
self.last_request_id = uuid.uuid4().hex
|
|
342
|
+
|
|
343
|
+
def __del__(self):
|
|
344
|
+
if self._running:
|
|
345
|
+
self._running = False
|
|
346
|
+
self._stream_data = Queue()
|
|
347
|
+
if self._worker is not None and self._worker.is_alive():
|
|
348
|
+
self._worker.join()
|
|
349
|
+
if self._silence_timer is not None and self._silence_timer.is_alive( # noqa E501
|
|
350
|
+
):
|
|
351
|
+
self._silence_timer.cancel()
|
|
352
|
+
self._silence_timer = None
|
|
353
|
+
if self._callback:
|
|
354
|
+
self._callback.on_close()
|
|
355
|
+
|
|
356
|
+
def __receive_worker(self):
|
|
357
|
+
"""Asynchronously, initiate a real-time transltion recognizer request and
|
|
358
|
+
obtain the result for parsing.
|
|
359
|
+
"""
|
|
360
|
+
responses = self.__launch_request()
|
|
361
|
+
for part in responses:
|
|
362
|
+
if part.status_code == HTTPStatus.OK:
|
|
363
|
+
logger.debug('Received response request_id: {} {}'.format(
|
|
364
|
+
part.request_id, part.output))
|
|
365
|
+
if len(part.output) == 0:
|
|
366
|
+
self._on_complete_timestamp = time.time() * 1000
|
|
367
|
+
logger.debug('last package delay {}'.format(
|
|
368
|
+
self.get_last_package_delay()))
|
|
369
|
+
self._callback.on_complete()
|
|
370
|
+
else:
|
|
371
|
+
usage = None
|
|
372
|
+
transcription = None
|
|
373
|
+
translations = None
|
|
374
|
+
if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
|
|
375
|
+
transcription = TranscriptionResult.from_json(
|
|
376
|
+
part.output[DASHSCOPE_TRANSCRIPTION_KEY])
|
|
377
|
+
if DASHSCOPE_TRANSLATION_KEY in part.output:
|
|
378
|
+
translations = TranslationResult.from_json(
|
|
379
|
+
part.output[DASHSCOPE_TRANSLATION_KEY])
|
|
380
|
+
if transcription is not None or translations is not None:
|
|
381
|
+
if (self._first_package_timestamp < 0):
|
|
382
|
+
self._first_package_timestamp = time.time() * 1000
|
|
383
|
+
logger.debug('first package delay {}'.format(
|
|
384
|
+
self.get_first_package_delay()))
|
|
385
|
+
|
|
386
|
+
if part.usage is not None:
|
|
387
|
+
usage = part.usage
|
|
388
|
+
if self.request_id_confirmed is False and part.request_id is not None:
|
|
389
|
+
self.last_request_id = part.request_id
|
|
390
|
+
self.request_id_confirmed = True
|
|
391
|
+
self._callback.on_event(part.request_id, transcription,
|
|
392
|
+
translations, usage)
|
|
393
|
+
else:
|
|
394
|
+
self._running = False
|
|
395
|
+
self._stream_data = Queue()
|
|
396
|
+
self._callback.on_error(part)
|
|
397
|
+
self._callback.on_close()
|
|
398
|
+
break
|
|
399
|
+
|
|
400
|
+
def __launch_request(self):
|
|
401
|
+
"""Initiate real-time translation recognizer requests.
|
|
402
|
+
"""
|
|
403
|
+
|
|
404
|
+
self._tidy_kwargs()
|
|
405
|
+
task_name, _ = _get_task_group_and_task(__name__)
|
|
406
|
+
responses = super().call(
|
|
407
|
+
model=self.model,
|
|
408
|
+
task_group='audio',
|
|
409
|
+
task=task_name,
|
|
410
|
+
function='recognition',
|
|
411
|
+
input=self._input_stream_cycle(),
|
|
412
|
+
api_protocol=ApiProtocol.WEBSOCKET,
|
|
413
|
+
ws_stream_mode=WebsocketStreamingMode.DUPLEX,
|
|
414
|
+
is_binary_input=True,
|
|
415
|
+
sample_rate=self.sample_rate,
|
|
416
|
+
format=self.format,
|
|
417
|
+
stream=True,
|
|
418
|
+
source_language=self.source_language,
|
|
419
|
+
transcription_enabled=self.transcription_enabled,
|
|
420
|
+
translation_enabled=self.translation_enabled,
|
|
421
|
+
workspace=self._workspace,
|
|
422
|
+
pre_task_id=self.last_request_id,
|
|
423
|
+
**self._kwargs)
|
|
424
|
+
return responses
|
|
425
|
+
|
|
426
|
+
def start(self, **kwargs):
|
|
427
|
+
"""Real-time translation recognizer in asynchronous mode.
|
|
428
|
+
Please call 'stop()' after you have completed translation & recognition.
|
|
429
|
+
|
|
430
|
+
Args:
|
|
431
|
+
phrase_id (str, `optional`): The ID of phrase.
|
|
432
|
+
|
|
433
|
+
**kwargs:
|
|
434
|
+
disfluency_removal_enabled(bool, `optional`):
|
|
435
|
+
Filter mood words, turned off by default.
|
|
436
|
+
diarization_enabled (bool, `optional`):
|
|
437
|
+
Speech auto diarization, turned off by default.
|
|
438
|
+
speaker_count (int, `optional`): The number of speakers.
|
|
439
|
+
timestamp_alignment_enabled (bool, `optional`):
|
|
440
|
+
Timestamp-alignment calibration, turned off by default.
|
|
441
|
+
special_word_filter(str, `optional`): Sensitive word filter.
|
|
442
|
+
audio_event_detection_enabled(bool, `optional`):
|
|
443
|
+
Audio event detection, turned off by default.
|
|
444
|
+
|
|
445
|
+
Raises:
|
|
446
|
+
InvalidParameter: This interface cannot be called again
|
|
447
|
+
if it has already been started.
|
|
448
|
+
InvalidTask: Task create failed.
|
|
449
|
+
"""
|
|
450
|
+
assert self._callback is not None, 'Please set the callback to get the translation & recognition result.' # noqa E501
|
|
451
|
+
|
|
452
|
+
if self._running:
|
|
453
|
+
raise InvalidParameter(
|
|
454
|
+
'TranslationRecognizerRealtime has started.')
|
|
455
|
+
|
|
456
|
+
self._start_stream_timestamp = -1
|
|
457
|
+
self._first_package_timestamp = -1
|
|
458
|
+
self._stop_stream_timestamp = -1
|
|
459
|
+
self._on_complete_timestamp = -1
|
|
460
|
+
self._kwargs.update(**kwargs)
|
|
461
|
+
self._recognition_once = False
|
|
462
|
+
self._worker = threading.Thread(target=self.__receive_worker)
|
|
463
|
+
self._worker.start()
|
|
464
|
+
if self._worker.is_alive():
|
|
465
|
+
self._running = True
|
|
466
|
+
self._callback.on_open()
|
|
467
|
+
|
|
468
|
+
# If audio data is not received for 23 seconds, the timeout exits
|
|
469
|
+
self._silence_timer = Timer(
|
|
470
|
+
TranslationRecognizerRealtime.SILENCE_TIMEOUT_S,
|
|
471
|
+
self._silence_stop_timer)
|
|
472
|
+
self._silence_timer.start()
|
|
473
|
+
else:
|
|
474
|
+
self._running = False
|
|
475
|
+
raise InvalidTask('Invalid task, task create failed.')
|
|
476
|
+
|
|
477
|
+
def call(self,
|
|
478
|
+
file: str,
|
|
479
|
+
phrase_id: str = None,
|
|
480
|
+
**kwargs) -> TranslationRecognizerResultPack:
|
|
481
|
+
"""TranslationRecognizerRealtime in synchronous mode.
|
|
482
|
+
|
|
483
|
+
Args:
|
|
484
|
+
file (str): The path to the local audio file.
|
|
485
|
+
phrase_id (str, `optional`): The ID of phrase.
|
|
486
|
+
|
|
487
|
+
**kwargs:
|
|
488
|
+
disfluency_removal_enabled(bool, `optional`):
|
|
489
|
+
Filter mood words, turned off by default.
|
|
490
|
+
diarization_enabled (bool, `optional`):
|
|
491
|
+
Speech auto diarization, turned off by default.
|
|
492
|
+
speaker_count (int, `optional`): The number of speakers.
|
|
493
|
+
timestamp_alignment_enabled (bool, `optional`):
|
|
494
|
+
Timestamp-alignment calibration, turned off by default.
|
|
495
|
+
special_word_filter(str, `optional`): Sensitive word filter.
|
|
496
|
+
audio_event_detection_enabled(bool, `optional`):
|
|
497
|
+
Audio event detection, turned off by default.
|
|
498
|
+
|
|
499
|
+
Raises:
|
|
500
|
+
InvalidParameter: This interface cannot be called again
|
|
501
|
+
if it has already been started.
|
|
502
|
+
InputDataRequired: The supplied file was empty.
|
|
503
|
+
|
|
504
|
+
Returns:
|
|
505
|
+
TranslationRecognizerResultPack: The result of speech translation & recognition.
|
|
506
|
+
"""
|
|
507
|
+
self._start_stream_timestamp = time.time() * 1000
|
|
508
|
+
if self._running:
|
|
509
|
+
raise InvalidParameter(
|
|
510
|
+
'TranslationRecognizerRealtime has been called.')
|
|
511
|
+
|
|
512
|
+
if os.path.exists(file):
|
|
513
|
+
if os.path.isdir(file):
|
|
514
|
+
raise IsADirectoryError('Is a directory: ' + file)
|
|
515
|
+
else:
|
|
516
|
+
raise FileNotFoundError('No such file or directory: ' + file)
|
|
517
|
+
|
|
518
|
+
self._recognition_once = True
|
|
519
|
+
self._stream_data = Queue()
|
|
520
|
+
self._phrase = phrase_id
|
|
521
|
+
self._kwargs.update(**kwargs)
|
|
522
|
+
results = TranslationRecognizerResultPack()
|
|
523
|
+
error_message = None
|
|
524
|
+
|
|
525
|
+
try:
|
|
526
|
+
audio_data: bytes = None
|
|
527
|
+
f = open(file, 'rb')
|
|
528
|
+
if os.path.getsize(file):
|
|
529
|
+
while True:
|
|
530
|
+
audio_data = f.read(12800)
|
|
531
|
+
if not audio_data:
|
|
532
|
+
break
|
|
533
|
+
else:
|
|
534
|
+
self._stream_data.put(audio_data)
|
|
535
|
+
else:
|
|
536
|
+
raise InputDataRequired(
|
|
537
|
+
'The supplied file was empty (zero bytes long)')
|
|
538
|
+
f.close()
|
|
539
|
+
self._stop_stream_timestamp = time.time() * 1000
|
|
540
|
+
except Exception as e:
|
|
541
|
+
logger.error(e)
|
|
542
|
+
raise e
|
|
543
|
+
|
|
544
|
+
if not self._stream_data.empty():
|
|
545
|
+
self._running = True
|
|
546
|
+
responses = self.__launch_request()
|
|
547
|
+
for part in responses:
|
|
548
|
+
if part.status_code == HTTPStatus.OK:
|
|
549
|
+
logger.debug('received data: {}'.format(part.output))
|
|
550
|
+
# debug log cal fpd
|
|
551
|
+
transcription = None
|
|
552
|
+
translation = None
|
|
553
|
+
usage = None
|
|
554
|
+
if ('translation' in part.output) or ('transcription'
|
|
555
|
+
in part.output):
|
|
556
|
+
if (self._first_package_timestamp < 0):
|
|
557
|
+
self._first_package_timestamp = time.time() * 1000
|
|
558
|
+
logger.debug('first package delay {}'.format(
|
|
559
|
+
self._first_package_timestamp -
|
|
560
|
+
self._start_stream_timestamp))
|
|
561
|
+
if part.usage is not None:
|
|
562
|
+
usage = part.usage
|
|
563
|
+
|
|
564
|
+
if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
|
|
565
|
+
transcription = TranscriptionResult.from_json(
|
|
566
|
+
part.output[DASHSCOPE_TRANSCRIPTION_KEY])
|
|
567
|
+
|
|
568
|
+
if DASHSCOPE_TRANSLATION_KEY in part.output:
|
|
569
|
+
translation = TranslationResult.from_json(
|
|
570
|
+
part.output[DASHSCOPE_TRANSLATION_KEY])
|
|
571
|
+
|
|
572
|
+
if (transcription is not None
|
|
573
|
+
and transcription.is_sentence_end) or (
|
|
574
|
+
translation is not None
|
|
575
|
+
and translation.is_sentence_end):
|
|
576
|
+
results.request_id = part.request_id
|
|
577
|
+
results.transcription_result_list.append(transcription)
|
|
578
|
+
results.translation_result_list.append(translation)
|
|
579
|
+
results.usage_list.append(usage)
|
|
580
|
+
else:
|
|
581
|
+
error_message = part
|
|
582
|
+
logger.error(error_message)
|
|
583
|
+
break
|
|
584
|
+
|
|
585
|
+
self._on_complete_timestamp = time.time() * 1000
|
|
586
|
+
logger.debug('last package delay {}'.format(
|
|
587
|
+
self.get_last_package_delay()))
|
|
588
|
+
|
|
589
|
+
self._stream_data = Queue()
|
|
590
|
+
self._recognition_once = False
|
|
591
|
+
self._running = False
|
|
592
|
+
results.error_message = error_message
|
|
593
|
+
return results
|
|
594
|
+
|
|
595
|
+
def stop(self):
|
|
596
|
+
"""End asynchronous TranslationRecognizerRealtime.
|
|
597
|
+
|
|
598
|
+
Raises:
|
|
599
|
+
InvalidParameter: Cannot stop an uninitiated TranslationRecognizerRealtime.
|
|
600
|
+
"""
|
|
601
|
+
if self._running is False:
|
|
602
|
+
raise InvalidParameter(
|
|
603
|
+
'TranslationRecognizerRealtime has stopped.')
|
|
604
|
+
|
|
605
|
+
self._stop_stream_timestamp = time.time() * 1000
|
|
606
|
+
|
|
607
|
+
self._running = False
|
|
608
|
+
if self._worker is not None and self._worker.is_alive():
|
|
609
|
+
self._worker.join()
|
|
610
|
+
self._stream_data = Queue()
|
|
611
|
+
if self._silence_timer is not None and self._silence_timer.is_alive():
|
|
612
|
+
self._silence_timer.cancel()
|
|
613
|
+
self._silence_timer = None
|
|
614
|
+
if self._callback:
|
|
615
|
+
self._callback.on_close()
|
|
616
|
+
|
|
617
|
+
def send_audio_frame(self, buffer: bytes):
|
|
618
|
+
"""Push audio to TranslationRecognizerRealtime.
|
|
619
|
+
|
|
620
|
+
Raises:
|
|
621
|
+
InvalidParameter: Cannot send data to an uninitiated TranslationRecognizerRealtime.
|
|
622
|
+
"""
|
|
623
|
+
if self._running is False:
|
|
624
|
+
raise InvalidParameter(
|
|
625
|
+
'TranslationRecognizerRealtime has stopped.')
|
|
626
|
+
|
|
627
|
+
if (self._start_stream_timestamp < 0):
|
|
628
|
+
self._start_stream_timestamp = time.time() * 1000
|
|
629
|
+
logger.debug('send_audio_frame: {}'.format(len(buffer)))
|
|
630
|
+
self._stream_data.put(buffer)
|
|
631
|
+
|
|
632
|
+
def _tidy_kwargs(self):
|
|
633
|
+
for k in self._kwargs.copy():
|
|
634
|
+
if self._kwargs[k] is None:
|
|
635
|
+
self._kwargs.pop(k, None)
|
|
636
|
+
|
|
637
|
+
def _input_stream_cycle(self):
|
|
638
|
+
while self._running:
|
|
639
|
+
while self._stream_data.empty():
|
|
640
|
+
if self._running:
|
|
641
|
+
time.sleep(0.01)
|
|
642
|
+
continue
|
|
643
|
+
else:
|
|
644
|
+
break
|
|
645
|
+
|
|
646
|
+
# Reset silence_timer when getting stream.
|
|
647
|
+
if self._silence_timer is not None and self._silence_timer.is_alive( # noqa E501
|
|
648
|
+
):
|
|
649
|
+
self._silence_timer.cancel()
|
|
650
|
+
self._silence_timer = Timer(
|
|
651
|
+
TranslationRecognizerRealtime.SILENCE_TIMEOUT_S,
|
|
652
|
+
self._silence_stop_timer)
|
|
653
|
+
self._silence_timer.start()
|
|
654
|
+
|
|
655
|
+
while not self._stream_data.empty():
|
|
656
|
+
frame = self._stream_data.get()
|
|
657
|
+
yield bytes(frame)
|
|
658
|
+
|
|
659
|
+
if self._recognition_once:
|
|
660
|
+
self._running = False
|
|
661
|
+
|
|
662
|
+
# drain all audio data when invoking stop().
|
|
663
|
+
if self._recognition_once is False:
|
|
664
|
+
while not self._stream_data.empty():
|
|
665
|
+
frame = self._stream_data.get()
|
|
666
|
+
yield bytes(frame)
|
|
667
|
+
|
|
668
|
+
def _silence_stop_timer(self):
|
|
669
|
+
"""If audio data is not received for a long time, exit worker.
|
|
670
|
+
"""
|
|
671
|
+
self._running = False
|
|
672
|
+
if self._silence_timer is not None and self._silence_timer.is_alive():
|
|
673
|
+
self._silence_timer.cancel()
|
|
674
|
+
self._silence_timer = None
|
|
675
|
+
if self._worker is not None and self._worker.is_alive():
|
|
676
|
+
self._worker.join()
|
|
677
|
+
self._stream_data = Queue()
|
|
678
|
+
|
|
679
|
+
def get_first_package_delay(self):
    """Return the first-package delay in milliseconds.

    Measured from the moment audio sending started until the first
    words package was received.
    """
    delay = self._first_package_timestamp - self._start_stream_timestamp
    return delay
|
|
683
|
+
|
|
684
|
+
def get_last_package_delay(self):
    """Return the last-package delay in milliseconds.

    Measured from the moment audio sending stopped until the last
    words package was received.
    """
    delay = self._on_complete_timestamp - self._stop_stream_timestamp
    return delay
|
|
688
|
+
|
|
689
|
+
# Get the task id (request id) of the previous task.
def get_last_request_id(self):
    """Return the request id of the most recent task."""
    return self.last_request_id
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
class TranslationRecognizerChat(BaseApi):
    """TranslationRecognizerChat interface.

    A one-sentence ("chat-turn") variant of the real-time translation
    recognizer: once a sentence-end event is received, further audio
    frames are rejected (``send_audio_frame`` returns False).

    Args:
        model (str): The requested model_id.
        callback (TranslationRecognizerCallback): A callback that returns
            TranslationRecognizerChat results.
        format (str): The input audio format.
        sample_rate (int): The input audio sample rate.
        workspace (str): The dashscope workspace id.

        **kwargs:
            phrase_id (list, `optional`): The ID of phrase.
            disfluency_removal_enabled(bool, `optional`): Filter mood words,
                turned off by default.
            diarization_enabled (bool, `optional`): Speech auto diarization,
                turned off by default.
            speaker_count (int, `optional`): The number of speakers.
            timestamp_alignment_enabled (bool, `optional`): Timestamp-alignment
                calibration, turned off by default.
            special_word_filter(str, `optional`): Sensitive word filter.
            audio_event_detection_enabled(bool, `optional`):
                Audio event detection, turned off by default.

    Raises:
        InputRequired: Input is required.
    """

    # Worker exits if no audio is received for this many seconds.
    SILENCE_TIMEOUT_S = 23

    def __init__(self,
                 model: str,
                 callback: TranslationRecognizerCallback,
                 format: str,
                 sample_rate: int,
                 transcription_enabled: bool = True,
                 source_language: str = None,
                 translation_enabled: bool = False,
                 workspace: str = None,
                 **kwargs):
        if model is None:
            raise ModelRequired('Model is required!')
        if format is None:
            raise InputRequired('format is required!')
        if sample_rate is None:
            raise InputRequired('sample_rate is required!')

        self.model = model
        self.format = format
        self.sample_rate = sample_rate
        self.source_language = source_language
        self.transcription_enabled = transcription_enabled
        self.translation_enabled = translation_enabled
        # continuous recognition with start() or once recognition with call()
        self._recognition_once = False
        self._callback = callback
        self._running = False
        self._stream_data = Queue()
        self._worker = None
        self._silence_timer = None
        self._kwargs = kwargs
        self._workspace = workspace
        # Timestamps (ms) used to compute first/last package delays;
        # -1 means "not recorded yet".
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        # A local request id is generated up front and sent as pre_task_id;
        # it is replaced by the server-confirmed id on the first response.
        self.request_id_confirmed = False
        self.last_request_id = uuid.uuid4().hex
        # Thread-safe flag set once a sentence-end event arrives; after
        # that, send_audio_frame() rejects further audio.
        self._is_sentence_end = ThreadSafeBool(False)

    def __del__(self):
        # Best-effort cleanup if the recognizer is garbage-collected
        # while still running.
        if self._running:
            self._running = False
            self._stream_data = Queue()
            if self._worker is not None and self._worker.is_alive():
                self._worker.join()
            if self._silence_timer is not None and self._silence_timer.is_alive(  # noqa E501
            ):
                self._silence_timer.cancel()
                self._silence_timer = None
            if self._callback:
                self._callback.on_close()

    def __receive_worker(self):
        """Asynchronously, initiate a real-time translation recognizer request
        and obtain the result for parsing.
        """
        responses = self.__launch_request()
        for part in responses:
            if part.status_code == HTTPStatus.OK:
                logger.debug('Received response request_id: {} {}'.format(
                    part.request_id, part.output))
                # An empty output payload marks completion of the task.
                if len(part.output) == 0:
                    self._on_complete_timestamp = time.time() * 1000
                    logger.debug('last package delay {}'.format(
                        self.get_last_package_delay()))
                    self._callback.on_complete()
                else:
                    usage = None
                    transcription = None
                    translations = None
                    if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
                        transcription = TranscriptionResult.from_json(
                            part.output[DASHSCOPE_TRANSCRIPTION_KEY])
                    if DASHSCOPE_TRANSLATION_KEY in part.output:
                        translations = TranslationResult.from_json(
                            part.output[DASHSCOPE_TRANSLATION_KEY])
                    # Record the arrival time of the first useful package.
                    if transcription is not None or translations is not None:
                        if (self._first_package_timestamp < 0):
                            self._first_package_timestamp = time.time() * 1000
                            logger.debug('first package delay {}'.format(
                                self.get_first_package_delay()))

                    if part.usage is not None:
                        usage = part.usage
                    # Adopt the server-side request id on first response.
                    if self.request_id_confirmed is False and part.request_id is not None:
                        self.last_request_id = part.request_id
                        self.request_id_confirmed = True
                    # A sentence end (from either channel) means this chat
                    # turn is over: stop accepting further audio.
                    if transcription is not None and transcription.is_sentence_end:
                        logger.debug(
                            '[Chat] recv sentence end in transcription, stop asr'
                        )
                        self._is_sentence_end.set(True)
                    if translations is not None and translations.is_sentence_end:
                        logger.debug(
                            '[Chat] recv sentence end in translation, stop asr'
                        )
                        self._is_sentence_end.set(True)
                    self._callback.on_event(part.request_id, transcription,
                                            translations, usage)
            else:
                # Non-OK status: tear down and report the error.
                self._running = False
                self._stream_data = Queue()
                self._callback.on_error(part)
                self._callback.on_close()
                break

    def __launch_request(self):
        """Initiate real-time translation recognizer requests.
        """

        self._tidy_kwargs()
        task_name, _ = _get_task_group_and_task(__name__)
        # Duplex websocket: audio frames stream up while results stream down.
        responses = super().call(
            model=self.model,
            task_group='audio',
            task=task_name,
            function='recognition',
            input=self._input_stream_cycle(),
            api_protocol=ApiProtocol.WEBSOCKET,
            ws_stream_mode=WebsocketStreamingMode.DUPLEX,
            is_binary_input=True,
            sample_rate=self.sample_rate,
            format=self.format,
            stream=True,
            source_language=self.source_language,
            transcription_enabled=self.transcription_enabled,
            translation_enabled=self.translation_enabled,
            workspace=self._workspace,
            pre_task_id=self.last_request_id,
            **self._kwargs)
        return responses

    def start(self, **kwargs):
        """Real-time translation recognizer in asynchronous mode.
        Please call 'stop()' after you have completed translation & recognition.

        Args:
            phrase_id (str, `optional`): The ID of phrase.

            **kwargs:
                disfluency_removal_enabled(bool, `optional`):
                    Filter mood words, turned off by default.
                diarization_enabled (bool, `optional`):
                    Speech auto diarization, turned off by default.
                speaker_count (int, `optional`): The number of speakers.
                timestamp_alignment_enabled (bool, `optional`):
                    Timestamp-alignment calibration, turned off by default.
                special_word_filter(str, `optional`): Sensitive word filter.
                audio_event_detection_enabled(bool, `optional`):
                    Audio event detection, turned off by default.

        Raises:
            InvalidParameter: This interface cannot be called again
                if it has already been started.
            InvalidTask: Task create failed.
        """
        assert self._callback is not None, 'Please set the callback to get the translation & recognition result.'  # noqa E501

        if self._running:
            raise InvalidParameter('TranslationRecognizerChat has started.')

        # Reset delay-measurement timestamps for the new session.
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        self._kwargs.update(**kwargs)
        self._recognition_once = False
        self._worker = threading.Thread(target=self.__receive_worker)
        self._worker.start()
        if self._worker.is_alive():
            self._running = True
            self._callback.on_open()

            # If audio data is not received for 23 seconds, the timeout exits
            self._silence_timer = Timer(
                TranslationRecognizerChat.SILENCE_TIMEOUT_S,
                self._silence_stop_timer)
            self._silence_timer.start()
        else:
            self._running = False
            raise InvalidTask('Invalid task, task create failed.')

    def stop(self):
        """End asynchronous TranslationRecognizerChat.

        Raises:
            InvalidParameter: Cannot stop an uninitiated TranslationRecognizerChat.
        """
        if self._running is False:
            raise InvalidParameter('TranslationRecognizerChat has stopped.')

        self._stop_stream_timestamp = time.time() * 1000
        logger.debug('stop TranslationRecognizerChat')
        # Clearing the flag lets the input generator drain and finish,
        # after which the worker thread can be joined.
        self._running = False
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
            self._silence_timer = None
        if self._callback:
            self._callback.on_close()

    def send_audio_frame(self, buffer: bytes) -> bool:
        """Push audio to TranslationRecognizerChat.

        Returns:
            bool: False if the frame was skipped because a sentence end
            was already received; True if the frame was queued.

        Raises:
            InvalidParameter: Cannot send data to an uninitiated TranslationRecognizerChat.
        """
        # Once a sentence end has been seen, this chat turn is done.
        if self._is_sentence_end.get():
            logger.debug('skip audio due to has sentence end.')
            return False

        if self._running is False:
            raise InvalidParameter('TranslationRecognizerChat has stopped.')

        # Remember (ms) when the very first frame was pushed.
        if (self._start_stream_timestamp < 0):
            self._start_stream_timestamp = time.time() * 1000
        logger.debug('send_audio_frame: {}'.format(len(buffer)))
        self._stream_data.put(buffer)
        return True

    def _tidy_kwargs(self):
        # Drop None-valued kwargs so they are not serialized into the request.
        for k in self._kwargs.copy():
            if self._kwargs[k] is None:
                self._kwargs.pop(k, None)

    def _input_stream_cycle(self):
        """Generator that feeds queued audio frames to the websocket request.

        Yields:
            bytes: One queued audio frame at a time.
        """
        while self._running:
            # Poll every 10 ms until data arrives or the recognizer stops.
            while self._stream_data.empty():
                if self._running:
                    time.sleep(0.01)
                    continue
                else:
                    break

            # Reset silence_timer when getting stream.
            if self._silence_timer is not None and self._silence_timer.is_alive(  # noqa E501
            ):
                self._silence_timer.cancel()
            self._silence_timer = Timer(
                TranslationRecognizerChat.SILENCE_TIMEOUT_S,
                self._silence_stop_timer)
            self._silence_timer.start()

            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

            # One-shot mode: a single pass through the queue is enough.
            if self._recognition_once:
                self._running = False

        # drain all audio data when invoking stop().
        if self._recognition_once is False:
            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

    def _silence_stop_timer(self):
        """If audio data is not received for a long time, exit worker.
        """
        self._running = False
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
        self._silence_timer = None
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()

    def get_first_package_delay(self):
        """First Package Delay is the time (ms) between starting to send audio
        and receiving the first words package.
        """
        return self._first_package_timestamp - self._start_stream_timestamp

    def get_last_package_delay(self):
        """Last Package Delay is the time (ms) between stopping sending audio
        and receiving the last words package.
        """
        return self._on_complete_timestamp - self._stop_stream_timestamp

    # Get the task id (request id) of the previous task.
    def get_last_request_id(self):
        return self.last_request_id
|