dashscope 1.8.0__py3-none-any.whl → 1.25.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. dashscope/__init__.py +61 -14
  2. dashscope/aigc/__init__.py +10 -3
  3. dashscope/aigc/chat_completion.py +282 -0
  4. dashscope/aigc/code_generation.py +145 -0
  5. dashscope/aigc/conversation.py +71 -12
  6. dashscope/aigc/generation.py +288 -16
  7. dashscope/aigc/image_synthesis.py +473 -31
  8. dashscope/aigc/multimodal_conversation.py +299 -14
  9. dashscope/aigc/video_synthesis.py +610 -0
  10. dashscope/api_entities/aiohttp_request.py +8 -5
  11. dashscope/api_entities/api_request_data.py +4 -2
  12. dashscope/api_entities/api_request_factory.py +68 -20
  13. dashscope/api_entities/base_request.py +20 -3
  14. dashscope/api_entities/chat_completion_types.py +344 -0
  15. dashscope/api_entities/dashscope_response.py +243 -15
  16. dashscope/api_entities/encryption.py +179 -0
  17. dashscope/api_entities/http_request.py +216 -62
  18. dashscope/api_entities/websocket_request.py +43 -34
  19. dashscope/app/__init__.py +5 -0
  20. dashscope/app/application.py +203 -0
  21. dashscope/app/application_response.py +246 -0
  22. dashscope/assistants/__init__.py +16 -0
  23. dashscope/assistants/assistant_types.py +175 -0
  24. dashscope/assistants/assistants.py +311 -0
  25. dashscope/assistants/files.py +197 -0
  26. dashscope/audio/__init__.py +4 -2
  27. dashscope/audio/asr/__init__.py +17 -1
  28. dashscope/audio/asr/asr_phrase_manager.py +203 -0
  29. dashscope/audio/asr/recognition.py +167 -27
  30. dashscope/audio/asr/transcription.py +107 -14
  31. dashscope/audio/asr/translation_recognizer.py +1006 -0
  32. dashscope/audio/asr/vocabulary.py +177 -0
  33. dashscope/audio/qwen_asr/__init__.py +7 -0
  34. dashscope/audio/qwen_asr/qwen_transcription.py +189 -0
  35. dashscope/audio/qwen_omni/__init__.py +11 -0
  36. dashscope/audio/qwen_omni/omni_realtime.py +524 -0
  37. dashscope/audio/qwen_tts/__init__.py +5 -0
  38. dashscope/audio/qwen_tts/speech_synthesizer.py +77 -0
  39. dashscope/audio/qwen_tts_realtime/__init__.py +10 -0
  40. dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py +355 -0
  41. dashscope/audio/tts/__init__.py +2 -0
  42. dashscope/audio/tts/speech_synthesizer.py +5 -0
  43. dashscope/audio/tts_v2/__init__.py +12 -0
  44. dashscope/audio/tts_v2/enrollment.py +179 -0
  45. dashscope/audio/tts_v2/speech_synthesizer.py +886 -0
  46. dashscope/cli.py +157 -37
  47. dashscope/client/base_api.py +652 -87
  48. dashscope/common/api_key.py +2 -0
  49. dashscope/common/base_type.py +135 -0
  50. dashscope/common/constants.py +13 -16
  51. dashscope/common/env.py +2 -0
  52. dashscope/common/error.py +58 -22
  53. dashscope/common/logging.py +2 -0
  54. dashscope/common/message_manager.py +2 -0
  55. dashscope/common/utils.py +276 -46
  56. dashscope/customize/__init__.py +0 -0
  57. dashscope/customize/customize_types.py +192 -0
  58. dashscope/customize/deployments.py +146 -0
  59. dashscope/customize/finetunes.py +234 -0
  60. dashscope/embeddings/__init__.py +5 -1
  61. dashscope/embeddings/batch_text_embedding.py +208 -0
  62. dashscope/embeddings/batch_text_embedding_response.py +65 -0
  63. dashscope/embeddings/multimodal_embedding.py +118 -10
  64. dashscope/embeddings/text_embedding.py +13 -1
  65. dashscope/{file.py → files.py} +19 -4
  66. dashscope/io/input_output.py +2 -0
  67. dashscope/model.py +11 -2
  68. dashscope/models.py +43 -0
  69. dashscope/multimodal/__init__.py +20 -0
  70. dashscope/multimodal/dialog_state.py +56 -0
  71. dashscope/multimodal/multimodal_constants.py +28 -0
  72. dashscope/multimodal/multimodal_dialog.py +648 -0
  73. dashscope/multimodal/multimodal_request_params.py +313 -0
  74. dashscope/multimodal/tingwu/__init__.py +10 -0
  75. dashscope/multimodal/tingwu/tingwu.py +80 -0
  76. dashscope/multimodal/tingwu/tingwu_realtime.py +579 -0
  77. dashscope/nlp/__init__.py +0 -0
  78. dashscope/nlp/understanding.py +64 -0
  79. dashscope/protocol/websocket.py +3 -0
  80. dashscope/rerank/__init__.py +0 -0
  81. dashscope/rerank/text_rerank.py +69 -0
  82. dashscope/resources/qwen.tiktoken +151643 -0
  83. dashscope/threads/__init__.py +26 -0
  84. dashscope/threads/messages/__init__.py +0 -0
  85. dashscope/threads/messages/files.py +113 -0
  86. dashscope/threads/messages/messages.py +220 -0
  87. dashscope/threads/runs/__init__.py +0 -0
  88. dashscope/threads/runs/runs.py +501 -0
  89. dashscope/threads/runs/steps.py +112 -0
  90. dashscope/threads/thread_types.py +665 -0
  91. dashscope/threads/threads.py +212 -0
  92. dashscope/tokenizers/__init__.py +7 -0
  93. dashscope/tokenizers/qwen_tokenizer.py +111 -0
  94. dashscope/tokenizers/tokenization.py +125 -0
  95. dashscope/tokenizers/tokenizer.py +45 -0
  96. dashscope/tokenizers/tokenizer_base.py +32 -0
  97. dashscope/utils/__init__.py +0 -0
  98. dashscope/utils/message_utils.py +838 -0
  99. dashscope/utils/oss_utils.py +243 -0
  100. dashscope/utils/param_utils.py +29 -0
  101. dashscope/version.py +3 -1
  102. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/METADATA +53 -50
  103. dashscope-1.25.6.dist-info/RECORD +112 -0
  104. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/WHEEL +1 -1
  105. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/entry_points.txt +0 -1
  106. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info/licenses}/LICENSE +2 -4
  107. dashscope/deployment.py +0 -129
  108. dashscope/finetune.py +0 -149
  109. dashscope-1.8.0.dist-info/RECORD +0 -49
  110. {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1006 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import json
4
+ import os
5
+ import threading
6
+ import time
7
+ import uuid
8
+ from http import HTTPStatus
9
+ from queue import Queue
10
+ from threading import Timer
11
+ from typing import Any, Dict, List
12
+
13
+ from dashscope.client.base_api import BaseApi
14
+ from dashscope.common.constants import ApiProtocol
15
+ from dashscope.common.error import (InputDataRequired, InputRequired,
16
+ InvalidParameter, InvalidTask,
17
+ ModelRequired)
18
+ from dashscope.common.logging import logger
19
+ from dashscope.common.utils import _get_task_group_and_task
20
+ from dashscope.protocol.websocket import WebsocketStreamingMode
21
+
22
+ DASHSCOPE_TRANSLATION_KEY = 'translations'
23
+ DASHSCOPE_TRANSCRIPTION_KEY = 'transcription'
24
+
25
+
26
class ThreadSafeBool:
    """A boolean flag that can be read and written from multiple threads."""

    def __init__(self, initial_value=False):
        self._guard = threading.Lock()
        self._flag = initial_value

    def set(self, value):
        """Atomically overwrite the flag with *value*."""
        with self._guard:
            self._flag = value

    def get(self):
        """Atomically return the current flag value."""
        with self._guard:
            return self._flag
38
+
39
+
40
class WordObj():
    """A single recognized word with its timing information."""

    def __init__(self) -> None:
        # Word text and its time span in milliseconds.
        self.text: str = None
        self.begin_time: int = None
        self.end_time: int = None
        # Whether the word is final and will not change in later packages.
        self.fixed: bool = False
        self._raw_data = None

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Build a WordObj from its JSON (dict) representation."""
        obj = WordObj()
        obj.text = json_data['text']
        obj.begin_time = json_data['begin_time']
        obj.end_time = json_data['end_time']
        obj.fixed = json_data['fixed']
        obj._raw_data = json_data
        return obj

    def __repr__(self):
        return self.__str__()

    def __str__(self) -> str:
        return 'Word: ' + json.dumps(self._raw_data, ensure_ascii=False)
65
+
66
+
67
class SentenceBaseObj():
    """Common sentence fields shared by transcription and translation results."""

    def __init__(self) -> None:
        self.sentence_id: int = -1
        self.text: str = None
        self.begin_time: int = None
        self.end_time: int = None
        self.words: List[WordObj] = []
        self._raw_data = None

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Build a SentenceBaseObj from its JSON (dict) representation."""
        obj = SentenceBaseObj()
        obj.sentence_id = json_data['sentence_id']
        obj.text = json_data['text']
        obj.begin_time = json_data['begin_time']
        # An unfinished sentence carries no end_time yet; fall back to the
        # current recognition time instead.
        end_time = json_data.get('end_time')
        if end_time is None:
            obj.end_time = json_data['current_time']
        else:
            obj.end_time = end_time
        obj.words = [WordObj.from_json(item) for item in json_data['words']]
        obj._raw_data = json_data
        return obj

    def __str__(self) -> str:
        return json.dumps(self._raw_data, ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
99
+
100
+
101
class TranscriptionResult(SentenceBaseObj):
    """Transcription of one sentence, extended with stash and VAD details.

    Adds to SentenceBaseObj the partial "stash" sentence, the
    sentence-end marker, and voice-activity-detection (pre-end) fields.
    """

    def __init__(self) -> None:
        # Fix: initialize the inherited sentence fields (sentence_id, text,
        # begin_time, end_time, words) — the original skipped super().__init__(),
        # leaving a bare instance without them.
        super().__init__()
        # Unfinished text that may still change in later packages.
        self.stash: SentenceBaseObj = None
        self.is_sentence_end = False
        # vad related
        self.vad_pre_end: bool = False
        self.pre_end_failed: bool = False
        self.pre_end_timemillis: int = -1
        self.pre_end_start_time: int = -1
        self.pre_end_end_time: int = -1
        self._raw_data = None

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a TranscriptionResult object from a JSON dictionary.

        Args:
            json_data: One 'transcription' payload from a response package.
        """
        transcription = TranscriptionResult()
        transcription.sentence_id = json_data['sentence_id']
        transcription.text = json_data['text']
        transcription.begin_time = json_data['begin_time']
        # An unfinished sentence has no end_time; use current_time instead.
        if json_data.get('end_time') is not None:
            transcription.end_time = json_data['end_time']
        else:
            transcription.end_time = json_data['current_time']
        transcription.words = [
            WordObj.from_json(word) for word in json_data['words']
        ]
        transcription.is_sentence_end = json_data.get('sentence_end')
        if 'stash' in json_data:
            transcription.stash = SentenceBaseObj.from_json(json_data['stash'])
        if 'vad_pre_end' in json_data:
            transcription.vad_pre_end = json_data['vad_pre_end']
        if 'pre_end_failed' in json_data:
            transcription.pre_end_failed = json_data['pre_end_failed']
        if 'pre_end_start_time' in json_data:
            transcription.pre_end_start_time = json_data['pre_end_start_time']
        if 'pre_end_end_time' in json_data:
            transcription.pre_end_end_time = json_data['pre_end_end_time']
        # Keep the raw payload once (the original assigned it twice).
        transcription._raw_data = json_data
        return transcription

    def __str__(self) -> str:
        return 'Transcriptions: ' + json.dumps(self._raw_data,
                                               ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
149
+
150
+
151
class Translation(SentenceBaseObj):
    """Translation of one sentence into a single target language.

    Adds to SentenceBaseObj the target-language code, the partial "stash"
    sentence, the sentence-end marker, and VAD (pre-end) fields.
    """

    def __init__(self) -> None:
        # Fix: initialize the inherited sentence fields (sentence_id, text,
        # begin_time, end_time, words) — the original skipped super().__init__(),
        # leaving a bare instance without them.
        super().__init__()
        # Target-language code of this translation (the 'lang' JSON field).
        self.language: str = None
        # Unfinished text that may still change in later packages.
        self.stash: SentenceBaseObj = None
        self.is_sentence_end = False
        # vad related
        self.vad_pre_end: bool = False
        self.pre_end_failed: bool = False
        self.pre_end_timemillis: int = -1
        self.pre_end_start_time: int = -1
        self.pre_end_end_time: int = -1
        self._raw_data = None

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a Translation object from a JSON dictionary.

        Args:
            json_data: One per-language entry of a 'translations' payload.
        """
        translation = Translation()
        translation.sentence_id = json_data['sentence_id']
        translation.text = json_data['text']
        translation.begin_time = json_data['begin_time']
        # An unfinished sentence has no end_time; use current_time instead.
        if json_data.get('end_time') is not None:
            translation.end_time = json_data['end_time']
        else:
            translation.end_time = json_data['current_time']
        translation.words = [
            WordObj.from_json(word) for word in json_data['words']
        ]

        translation.language = json_data['lang']
        translation.is_sentence_end = json_data.get('sentence_end')
        if 'stash' in json_data:
            translation.stash = SentenceBaseObj.from_json(json_data['stash'])
        if 'vad_pre_end' in json_data:
            translation.vad_pre_end = json_data['vad_pre_end']
        if 'pre_end_failed' in json_data:
            translation.pre_end_failed = json_data['pre_end_failed']
        if 'pre_end_start_time' in json_data:
            translation.pre_end_start_time = json_data['pre_end_start_time']
        if 'pre_end_end_time' in json_data:
            translation.pre_end_end_time = json_data['pre_end_end_time']
        # Keep the raw payload once (the original assigned it twice).
        translation._raw_data = json_data
        return translation

    def __str__(self) -> str:
        return 'Translation: ' + json.dumps(self._raw_data, ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
201
+
202
+
203
class TranslationResult():
    """Translation results for one audio segment, keyed by target language."""

    def __init__(self) -> None:
        # Fix: the original annotation Dict[str:Translation] is a slice, not
        # a valid parameterization; annotations on attribute targets are
        # evaluated at runtime (PEP 526), so it could raise on construction.
        # Use the correct form with a string forward reference.
        self.translations: Dict[str, 'Translation'] = {}
        # True once any contained translation marks the sentence end.
        self.is_sentence_end = False
        self._raw_data = None

    def get_translation(self, language) -> 'Translation':
        """Return the Translation for *language*, or None if absent."""
        if self.translations is None:
            return None
        return self.translations.get(language)

    def get_language_list(self) -> List[str]:
        """Return the target-language codes present in this result."""
        if self.translations is None:
            return None
        return list(self.translations.keys())

    @staticmethod
    def from_json(json_data: List):
        """Create a TranslationResult object from a JSON list.

        Args:
            json_data: A list of per-language translation dictionaries.

        Raises:
            InvalidParameter: If an entry of *json_data* is not a dict.
        """
        result = TranslationResult()
        result._raw_data = json_data
        for translation_json in json_data:
            if not isinstance(translation_json, dict):
                raise InvalidParameter(
                    f'Invalid translation json data: {translation_json}')
            translation = Translation.from_json(translation_json)
            result.translations[translation.language] = translation
            if translation.is_sentence_end:
                result.is_sentence_end = True
        return result

    def __str__(self) -> str:
        return 'TranslationList: ' + json.dumps(self._raw_data,
                                                ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
242
+
243
+
244
class TranslationRecognizerResultPack():
    """Aggregated sentence-level results produced by a synchronous call().

    The three lists are parallel: entry *i* of each holds the
    transcription, translation and usage of the i-th finished sentence.
    """

    def __init__(self) -> None:
        # Request id confirmed by the server, and the error response (if any).
        self.request_id: str = None
        self.error_message = None
        # Per-sentence results, appended in arrival order.
        self.transcription_result_list: List[TranscriptionResult] = []
        self.translation_result_list: List[TranslationResult] = []
        self.usage_list: List = []
251
+
252
+
253
class TranslationRecognizerCallback():
    """An interface that defines callback methods for getting translation recognizer results. # noqa E501
    Derive from this class and implement its function to provide your own data.
    """
    def on_open(self) -> None:
        # Invoked once the recognizer session has been opened successfully.
        pass

    def on_complete(self) -> None:
        # Invoked when the server signals that all results have been delivered.
        pass

    def on_error(self, message) -> None:
        # Invoked with the failed response object when the request errors out.
        pass

    def on_close(self) -> None:
        # Invoked after the session is closed and resources are released.
        pass

    def on_event(self, request_id, transcription_result: TranscriptionResult,
                 translation_result: TranslationResult, usage) -> None:
        # Invoked for every result package; either result may be None when
        # the corresponding feature is disabled or absent from the package.
        pass
272
+
273
+
274
class TranslationRecognizerRealtime(BaseApi):
    """TranslationRecognizerRealtime interface.

    Streams audio to the service over a duplex websocket and reports
    transcription and/or translation results through the supplied callback.
    Use start()/send_audio_frame()/stop() for asynchronous streaming, or
    call() for one-shot recognition of a local audio file.

    Args:
        model (str): The requested model_id.
        callback (TranslationRecognizerCallback): A callback that returns
            TranslationRecognizerRealtime results.
        format (str): The input audio format.
        sample_rate (int): The input audio sample rate.
        workspace (str): The dashscope workspace id.

        **kwargs:
            phrase_id (list, `optional`): The ID of phrase.
            disfluency_removal_enabled(bool, `optional`): Filter mood words,
                turned off by default.
            diarization_enabled (bool, `optional`): Speech auto diarization,
                turned off by default.
            speaker_count (int, `optional`): The number of speakers.
            timestamp_alignment_enabled (bool, `optional`): Timestamp-alignment
                calibration, turned off by default.
            special_word_filter(str, `optional`): Sensitive word filter.
            audio_event_detection_enabled(bool, `optional`):
                Audio event detection, turned off by default.

    Raises:
        InputRequired: Input is required.
    """

    # The session is aborted if no audio arrives for this many seconds.
    SILENCE_TIMEOUT_S = 23

    def __init__(self,
                 model: str,
                 callback: TranslationRecognizerCallback,
                 format: str,
                 sample_rate: int,
                 transcription_enabled: bool = True,
                 source_language: str = None,
                 translation_enabled: bool = False,
                 workspace: str = None,
                 **kwargs):
        if model is None:
            raise ModelRequired('Model is required!')
        if format is None:
            raise InputRequired('format is required!')
        if sample_rate is None:
            raise InputRequired('sample_rate is required!')

        self.model = model
        self.format = format
        self.sample_rate = sample_rate
        self.source_language = source_language
        self.transcription_enabled = transcription_enabled
        self.translation_enabled = translation_enabled
        # continuous recognition with start() or once recognition with call()
        self._recognition_once = False
        self._callback = callback
        self._running = False
        # Audio frames queued for upload; drained by _input_stream_cycle().
        self._stream_data = Queue()
        self._worker = None
        self._silence_timer = None
        self._kwargs = kwargs
        self._workspace = workspace
        # Millisecond timestamps used to compute first/last package delays.
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        # A locally generated request id is used until the server confirms one.
        self.request_id_confirmed = False
        self.last_request_id = uuid.uuid4().hex

    def __del__(self):
        # Best-effort cleanup if the recognizer is garbage-collected while
        # still running: stop the worker, cancel the timer, notify on_close.
        if self._running:
            self._running = False
            self._stream_data = Queue()
            if self._worker is not None and self._worker.is_alive():
                self._worker.join()
            if self._silence_timer is not None and self._silence_timer.is_alive(  # noqa E501
            ):
                self._silence_timer.cancel()
                self._silence_timer = None
            if self._callback:
                self._callback.on_close()

    def __receive_worker(self):
        """Asynchronously, initiate a real-time translation recognizer request and
        obtain the result for parsing.
        """
        responses = self.__launch_request()
        for part in responses:
            if part.status_code == HTTPStatus.OK:
                logger.debug('Received response request_id: {} {}'.format(
                    part.request_id, part.output))
                # An empty output package marks the end of the stream.
                if len(part.output) == 0:
                    self._on_complete_timestamp = time.time() * 1000
                    logger.debug('last package delay {}'.format(
                        self.get_last_package_delay()))
                    self._callback.on_complete()
                else:
                    usage = None
                    transcription = None
                    translations = None
                    if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
                        transcription = TranscriptionResult.from_json(
                            part.output[DASHSCOPE_TRANSCRIPTION_KEY])
                    if DASHSCOPE_TRANSLATION_KEY in part.output:
                        translations = TranslationResult.from_json(
                            part.output[DASHSCOPE_TRANSLATION_KEY])
                    # Record the first-result arrival time exactly once.
                    if transcription is not None or translations is not None:
                        if (self._first_package_timestamp < 0):
                            self._first_package_timestamp = time.time() * 1000
                            logger.debug('first package delay {}'.format(
                                self.get_first_package_delay()))

                    if part.usage is not None:
                        usage = part.usage
                    # Adopt the server-side request id the first time we see it.
                    if self.request_id_confirmed is False and part.request_id is not None:
                        self.last_request_id = part.request_id
                        self.request_id_confirmed = True
                    self._callback.on_event(part.request_id, transcription,
                                            translations, usage)
            else:
                # Error response: stop the session and notify the callback.
                self._running = False
                self._stream_data = Queue()
                self._callback.on_error(part)
                self._callback.on_close()
                break

    def __launch_request(self):
        """Initiate real-time translation recognizer requests.

        Opens a duplex websocket request whose input is the generator of
        queued audio frames; returns the streaming response iterator.
        """

        self._tidy_kwargs()
        task_name, _ = _get_task_group_and_task(__name__)
        responses = super().call(
            model=self.model,
            task_group='audio',
            task=task_name,
            function='recognition',
            input=self._input_stream_cycle(),
            api_protocol=ApiProtocol.WEBSOCKET,
            ws_stream_mode=WebsocketStreamingMode.DUPLEX,
            is_binary_input=True,
            sample_rate=self.sample_rate,
            format=self.format,
            stream=True,
            source_language=self.source_language,
            transcription_enabled=self.transcription_enabled,
            translation_enabled=self.translation_enabled,
            workspace=self._workspace,
            pre_task_id=self.last_request_id,
            **self._kwargs)
        return responses

    def start(self, **kwargs):
        """Real-time translation recognizer in asynchronous mode.
        Please call 'stop()' after you have completed translation & recognition.

        Args:
            phrase_id (str, `optional`): The ID of phrase.

            **kwargs:
                disfluency_removal_enabled(bool, `optional`):
                    Filter mood words, turned off by default.
                diarization_enabled (bool, `optional`):
                    Speech auto diarization, turned off by default.
                speaker_count (int, `optional`): The number of speakers.
                timestamp_alignment_enabled (bool, `optional`):
                    Timestamp-alignment calibration, turned off by default.
                special_word_filter(str, `optional`): Sensitive word filter.
                audio_event_detection_enabled(bool, `optional`):
                    Audio event detection, turned off by default.

        Raises:
            InvalidParameter: This interface cannot be called again
                if it has already been started.
            InvalidTask: Task create failed.
        """
        assert self._callback is not None, 'Please set the callback to get the translation & recognition result.'  # noqa E501

        if self._running:
            raise InvalidParameter(
                'TranslationRecognizerRealtime has started.')

        # Reset delay-measurement timestamps for the new session.
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        self._kwargs.update(**kwargs)
        self._recognition_once = False
        self._worker = threading.Thread(target=self.__receive_worker)
        self._worker.start()
        if self._worker.is_alive():
            self._running = True
            self._callback.on_open()

            # If audio data is not received for 23 seconds, the timeout exits
            self._silence_timer = Timer(
                TranslationRecognizerRealtime.SILENCE_TIMEOUT_S,
                self._silence_stop_timer)
            self._silence_timer.start()
        else:
            self._running = False
            raise InvalidTask('Invalid task, task create failed.')

    def call(self,
             file: str,
             phrase_id: str = None,
             **kwargs) -> TranslationRecognizerResultPack:
        """TranslationRecognizerRealtime in synchronous mode.

        Args:
            file (str): The path to the local audio file.
            phrase_id (str, `optional`): The ID of phrase.

            **kwargs:
                disfluency_removal_enabled(bool, `optional`):
                    Filter mood words, turned off by default.
                diarization_enabled (bool, `optional`):
                    Speech auto diarization, turned off by default.
                speaker_count (int, `optional`): The number of speakers.
                timestamp_alignment_enabled (bool, `optional`):
                    Timestamp-alignment calibration, turned off by default.
                special_word_filter(str, `optional`): Sensitive word filter.
                audio_event_detection_enabled(bool, `optional`):
                    Audio event detection, turned off by default.

        Raises:
            InvalidParameter: This interface cannot be called again
                if it has already been started.
            InputDataRequired: The supplied file was empty.

        Returns:
            TranslationRecognizerResultPack: The result of speech translation & recognition.
        """
        self._start_stream_timestamp = time.time() * 1000
        if self._running:
            raise InvalidParameter(
                'TranslationRecognizerRealtime has been called.')

        if os.path.exists(file):
            if os.path.isdir(file):
                raise IsADirectoryError('Is a directory: ' + file)
        else:
            raise FileNotFoundError('No such file or directory: ' + file)

        self._recognition_once = True
        self._stream_data = Queue()
        self._phrase = phrase_id
        self._kwargs.update(**kwargs)
        results = TranslationRecognizerResultPack()
        error_message = None

        # Queue the whole file as 12800-byte frames before launching the request.
        try:
            audio_data: bytes = None
            f = open(file, 'rb')
            if os.path.getsize(file):
                while True:
                    audio_data = f.read(12800)
                    if not audio_data:
                        break
                    else:
                        self._stream_data.put(audio_data)
            else:
                raise InputDataRequired(
                    'The supplied file was empty (zero bytes long)')
            f.close()
            self._stop_stream_timestamp = time.time() * 1000
        except Exception as e:
            logger.error(e)
            raise e

        if not self._stream_data.empty():
            self._running = True
            responses = self.__launch_request()
            for part in responses:
                if part.status_code == HTTPStatus.OK:
                    logger.debug('received data: {}'.format(part.output))
                    # debug log cal fpd
                    transcription = None
                    translation = None
                    usage = None
                    # NOTE(review): this checks the literal keys 'translation' /
                    # 'transcription', while result parsing below uses
                    # 'translations' / 'transcription' — the first-package
                    # timestamp may never be set for translation-only output;
                    # verify against the service payload.
                    if ('translation' in part.output) or ('transcription'
                                                          in part.output):
                        if (self._first_package_timestamp < 0):
                            self._first_package_timestamp = time.time() * 1000
                            logger.debug('first package delay {}'.format(
                                self._first_package_timestamp -
                                self._start_stream_timestamp))
                    if part.usage is not None:
                        usage = part.usage

                    if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
                        transcription = TranscriptionResult.from_json(
                            part.output[DASHSCOPE_TRANSCRIPTION_KEY])

                    if DASHSCOPE_TRANSLATION_KEY in part.output:
                        translation = TranslationResult.from_json(
                            part.output[DASHSCOPE_TRANSLATION_KEY])

                    # Only completed sentences are collected into the pack.
                    if (transcription is not None
                            and transcription.is_sentence_end) or (
                                translation is not None
                                and translation.is_sentence_end):
                        results.request_id = part.request_id
                        results.transcription_result_list.append(transcription)
                        results.translation_result_list.append(translation)
                        results.usage_list.append(usage)
                else:
                    error_message = part
                    logger.error(error_message)
                    break

            self._on_complete_timestamp = time.time() * 1000
            logger.debug('last package delay {}'.format(
                self.get_last_package_delay()))

        self._stream_data = Queue()
        self._recognition_once = False
        self._running = False
        results.error_message = error_message
        return results

    def stop(self):
        """End asynchronous TranslationRecognizerRealtime.

        Raises:
            InvalidParameter: Cannot stop an uninitiated TranslationRecognizerRealtime.
        """
        if self._running is False:
            raise InvalidParameter(
                'TranslationRecognizerRealtime has stopped.')

        self._stop_stream_timestamp = time.time() * 1000

        # Clearing the flag lets the input generator drain and the worker exit.
        self._running = False
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
            self._silence_timer = None
        if self._callback:
            self._callback.on_close()

    def send_audio_frame(self, buffer: bytes):
        """Push audio to TranslationRecognizerRealtime.

        Raises:
            InvalidParameter: Cannot send data to an uninitiated TranslationRecognizerRealtime.
        """
        if self._running is False:
            raise InvalidParameter(
                'TranslationRecognizerRealtime has stopped.')

        # The first pushed frame marks the start of the audio stream.
        if (self._start_stream_timestamp < 0):
            self._start_stream_timestamp = time.time() * 1000
        logger.debug('send_audio_frame: {}'.format(len(buffer)))
        self._stream_data.put(buffer)

    def _tidy_kwargs(self):
        # Drop None-valued options so they are not sent to the service.
        for k in self._kwargs.copy():
            if self._kwargs[k] is None:
                self._kwargs.pop(k, None)

    def _input_stream_cycle(self):
        """Generator that yields queued audio frames to the websocket layer.

        Polls while the queue is empty and the session is running, and
        resets the silence timer whenever new audio arrives.
        """
        while self._running:
            while self._stream_data.empty():
                if self._running:
                    time.sleep(0.01)
                    continue
                else:
                    break

            # Reset silence_timer when getting stream.
            if self._silence_timer is not None and self._silence_timer.is_alive(  # noqa E501
            ):
                self._silence_timer.cancel()
                self._silence_timer = Timer(
                    TranslationRecognizerRealtime.SILENCE_TIMEOUT_S,
                    self._silence_stop_timer)
                self._silence_timer.start()

            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

            # call() mode: the file is fully queued, so one pass is enough.
            if self._recognition_once:
                self._running = False

        # drain all audio data when invoking stop().
        if self._recognition_once is False:
            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

    def _silence_stop_timer(self):
        """If audio data is not received for a long time, exit worker.
        """
        self._running = False
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
            self._silence_timer = None
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()

    def get_first_package_delay(self):
        """First Package Delay is the time between start sending audio and receive first words package
        """
        return self._first_package_timestamp - self._start_stream_timestamp

    def get_last_package_delay(self):
        """Last Package Delay is the time between stop sending audio and receive last words package
        """
        return self._on_complete_timestamp - self._stop_stream_timestamp

    # Get the request id (task id) of the previous task.
    def get_last_request_id(self):
        return self.last_request_id
692
+
693
+
694
+ class TranslationRecognizerChat(BaseApi):
695
+ """TranslationRecognizerChat interface.
696
+
697
+ Args:
698
+ model (str): The requested model_id.
699
+ callback (TranslationRecognizerChat): A callback that returns
700
+ TranslationRecognizerChat results.
701
+ format (str): The input audio format.
702
+ sample_rate (int): The input audio sample rate.
703
+ workspace (str): The dashscope workspace id.
704
+
705
+ **kwargs:
706
+ phrase_id (list, `optional`): The ID of phrase.
707
+ disfluency_removal_enabled(bool, `optional`): Filter mood words,
708
+ turned off by default.
709
+ diarization_enabled (bool, `optional`): Speech auto diarization,
710
+ turned off by default.
711
+ speaker_count (int, `optional`): The number of speakers.
712
+ timestamp_alignment_enabled (bool, `optional`): Timestamp-alignment
713
+ calibration, turned off by default.
714
+ special_word_filter(str, `optional`): Sensitive word filter.
715
+ audio_event_detection_enabled(bool, `optional`):
716
+ Audio event detection, turned off by default.
717
+
718
+ Raises:
719
+ InputRequired: Input is required.
720
+ """
721
+
722
+ SILENCE_TIMEOUT_S = 23
723
+
724
+ def __init__(self,
725
+ model: str,
726
+ callback: TranslationRecognizerCallback,
727
+ format: str,
728
+ sample_rate: int,
729
+ transcription_enabled: bool = True,
730
+ source_language: str = None,
731
+ translation_enabled: bool = False,
732
+ workspace: str = None,
733
+ **kwargs):
734
+ if model is None:
735
+ raise ModelRequired('Model is required!')
736
+ if format is None:
737
+ raise InputRequired('format is required!')
738
+ if sample_rate is None:
739
+ raise InputRequired('sample_rate is required!')
740
+
741
+ self.model = model
742
+ self.format = format
743
+ self.sample_rate = sample_rate
744
+ self.source_language = source_language
745
+ self.transcription_enabled = transcription_enabled
746
+ self.translation_enabled = translation_enabled
747
+ # continuous recognition with start() or once recognition with call()
748
+ self._recognition_once = False
749
+ self._callback = callback
750
+ self._running = False
751
+ self._stream_data = Queue()
752
+ self._worker = None
753
+ self._silence_timer = None
754
+ self._kwargs = kwargs
755
+ self._workspace = workspace
756
+ self._start_stream_timestamp = -1
757
+ self._first_package_timestamp = -1
758
+ self._stop_stream_timestamp = -1
759
+ self._on_complete_timestamp = -1
760
+ self.request_id_confirmed = False
761
+ self.last_request_id = uuid.uuid4().hex
762
+ self._is_sentence_end = ThreadSafeBool(False)
763
+
764
+ def __del__(self):
765
+ if self._running:
766
+ self._running = False
767
+ self._stream_data = Queue()
768
+ if self._worker is not None and self._worker.is_alive():
769
+ self._worker.join()
770
+ if self._silence_timer is not None and self._silence_timer.is_alive( # noqa E501
771
+ ):
772
+ self._silence_timer.cancel()
773
+ self._silence_timer = None
774
+ if self._callback:
775
+ self._callback.on_close()
776
+
777
    def __receive_worker(self):
        """Asynchronously, initiate a real-time translation recognizer request
        and obtain the result for parsing.

        Runs on the worker thread started by start(); consumes the streaming
        responses from __launch_request() and dispatches callback events.
        """
        responses = self.__launch_request()
        for part in responses:
            if part.status_code == HTTPStatus.OK:
                logger.debug('Received response request_id: {} {}'.format(
                    part.request_id, part.output))
                # An empty output payload signals the end of the task.
                if len(part.output) == 0:
                    self._on_complete_timestamp = time.time() * 1000
                    logger.debug('last package delay {}'.format(
                        self.get_last_package_delay()))
                    self._callback.on_complete()
                else:
                    usage = None
                    transcription = None
                    translations = None
                    # Parse whichever result sections are present.
                    if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
                        transcription = TranscriptionResult.from_json(
                            part.output[DASHSCOPE_TRANSCRIPTION_KEY])
                    if DASHSCOPE_TRANSLATION_KEY in part.output:
                        translations = TranslationResult.from_json(
                            part.output[DASHSCOPE_TRANSLATION_KEY])
                    # Record first-package arrival once, for latency metrics.
                    if transcription is not None or translations is not None:
                        if (self._first_package_timestamp < 0):
                            self._first_package_timestamp = time.time() * 1000
                            logger.debug('first package delay {}'.format(
                                self.get_first_package_delay()))

                    if part.usage is not None:
                        usage = part.usage
                    # Adopt the server-assigned request id the first time it
                    # appears; used as pre_task_id for the next session.
                    if self.request_id_confirmed is False and part.request_id is not None:
                        self.last_request_id = part.request_id
                        self.request_id_confirmed = True
                    # Sentence end (from either stream) makes send_audio_frame
                    # start rejecting further audio.
                    if transcription is not None and transcription.is_sentence_end:
                        logger.debug(
                            '[Chat] recv sentence end in transcription, stop asr'
                        )
                        self._is_sentence_end.set(True)
                    if translations is not None and translations.is_sentence_end:
                        logger.debug(
                            '[Chat] recv sentence end in translation, stop asr'
                        )
                        self._is_sentence_end.set(True)
                    self._callback.on_event(part.request_id, transcription,
                                            translations, usage)
            else:
                # Any non-OK status aborts the session and notifies the caller.
                self._running = False
                self._stream_data = Queue()
                self._callback.on_error(part)
                self._callback.on_close()
                break
+
831
    def __launch_request(self):
        """Initiate real-time translation recognizer requests.

        Opens a duplex websocket stream: audio frames are pulled from
        _input_stream_cycle() while recognition results stream back.

        Returns:
            An iterable of streaming responses consumed by __receive_worker.
        """
        # Drop None-valued extras so they are not serialized into the request.
        self._tidy_kwargs()
        task_name, _ = _get_task_group_and_task(__name__)
        responses = super().call(
            model=self.model,
            task_group='audio',
            task=task_name,
            function='recognition',
            input=self._input_stream_cycle(),
            api_protocol=ApiProtocol.WEBSOCKET,
            ws_stream_mode=WebsocketStreamingMode.DUPLEX,
            is_binary_input=True,
            sample_rate=self.sample_rate,
            format=self.format,
            stream=True,
            source_language=self.source_language,
            transcription_enabled=self.transcription_enabled,
            translation_enabled=self.translation_enabled,
            workspace=self._workspace,
            pre_task_id=self.last_request_id,
            **self._kwargs)
        return responses
+
857
    def start(self, **kwargs):
        """Real-time translation recognizer in asynchronous mode.
        Please call 'stop()' after you have completed translation & recognition.

        Args:
            phrase_id (str, `optional`): The ID of phrase.

            **kwargs:
                disfluency_removal_enabled(bool, `optional`):
                    Filter mood words, turned off by default.
                diarization_enabled (bool, `optional`):
                    Speech auto diarization, turned off by default.
                speaker_count (int, `optional`): The number of speakers.
                timestamp_alignment_enabled (bool, `optional`):
                    Timestamp-alignment calibration, turned off by default.
                special_word_filter(str, `optional`): Sensitive word filter.
                audio_event_detection_enabled(bool, `optional`):
                    Audio event detection, turned off by default.

        Raises:
            InvalidParameter: This interface cannot be called again
                if it has already been started.
            InvalidTask: Task create failed.
        """
        assert self._callback is not None, 'Please set the callback to get the translation & recognition result.' # noqa E501

        if self._running:
            raise InvalidParameter('TranslationRecognizerChat has started.')

        # Reset latency bookkeeping for this session (-1 = not yet recorded).
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        self._kwargs.update(**kwargs)
        self._recognition_once = False
        # The worker thread owns the websocket request/response cycle.
        self._worker = threading.Thread(target=self.__receive_worker)
        self._worker.start()
        if self._worker.is_alive():
            self._running = True
            self._callback.on_open()

            # If audio data is not received for 23 seconds, the timeout exits
            self._silence_timer = Timer(
                TranslationRecognizerChat.SILENCE_TIMEOUT_S,
                self._silence_stop_timer)
            self._silence_timer.start()
        else:
            self._running = False
            raise InvalidTask('Invalid task, task create failed.')
+
907
    def stop(self):
        """End asynchronous TranslationRecognizerChat.

        Raises:
            InvalidParameter: Cannot stop an uninitiated TranslationRecognizerChat.
        """
        if self._running is False:
            raise InvalidParameter('TranslationRecognizerChat has stopped.')

        self._stop_stream_timestamp = time.time() * 1000
        logger.debug('stop TranslationRecognizerChat')
        # Clearing the running flag lets _input_stream_cycle drain remaining
        # audio and terminate, which in turn lets the worker thread finish.
        self._running = False
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()
        # Cancel the silence watchdog if it is still pending.
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
            self._silence_timer = None
        if self._callback:
            self._callback.on_close()
+
928
+ def send_audio_frame(self, buffer: bytes) -> bool:
929
+ """Push audio to TranslationRecognizerChat.
930
+
931
+ Raises:
932
+ InvalidParameter: Cannot send data to an uninitiated TranslationRecognizerChat.
933
+ """
934
+ if self._is_sentence_end.get():
935
+ logger.debug('skip audio due to has sentence end.')
936
+ return False
937
+
938
+ if self._running is False:
939
+ raise InvalidParameter('TranslationRecognizerChat has stopped.')
940
+
941
+ if (self._start_stream_timestamp < 0):
942
+ self._start_stream_timestamp = time.time() * 1000
943
+ logger.debug('send_audio_frame: {}'.format(len(buffer)))
944
+ self._stream_data.put(buffer)
945
+ return True
946
+
947
+ def _tidy_kwargs(self):
948
+ for k in self._kwargs.copy():
949
+ if self._kwargs[k] is None:
950
+ self._kwargs.pop(k, None)
951
+
952
    def _input_stream_cycle(self):
        """Generator feeding queued audio frames to the websocket request.

        Yields frames from the internal queue while the session is running,
        re-arming the silence watchdog on each batch; after the running flag
        drops it drains whatever audio remains (unless in single-shot mode).
        """
        while self._running:
            # Busy-wait (10 ms ticks) until audio arrives or the session ends.
            while self._stream_data.empty():
                if self._running:
                    time.sleep(0.01)
                    continue
                else:
                    break

            # Reset silence_timer when getting stream.
            if self._silence_timer is not None and self._silence_timer.is_alive( # noqa E501
            ):
                self._silence_timer.cancel()
                self._silence_timer = Timer(
                    TranslationRecognizerChat.SILENCE_TIMEOUT_S,
                    self._silence_stop_timer)
                self._silence_timer.start()

            # Flush everything currently queued.
            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

            # Single-shot mode stops after one batch of audio.
            if self._recognition_once:
                self._running = False

        # drain all audio data when invoking stop().
        if self._recognition_once is False:
            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)
+
983
    def _silence_stop_timer(self):
        """If audio data is not received for a long time, exit worker.

        Fired by the Timer armed in start()/_input_stream_cycle after
        SILENCE_TIMEOUT_S seconds without audio; shuts the session down.
        """
        self._running = False
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
        # Unlike stop(), the timer reference is cleared unconditionally here.
        self._silence_timer = None
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()
+
994
+ def get_first_package_delay(self):
995
+ """First Package Delay is the time between start sending audio and receive first words package
996
+ """
997
+ return self._first_package_timestamp - self._start_stream_timestamp
998
+
999
+ def get_last_package_delay(self):
1000
+ """Last Package Delay is the time between stop sending audio and receive last words package
1001
+ """
1002
+ return self._on_complete_timestamp - self._stop_stream_timestamp
1003
+
1004
+ # 获取上一个任务的taskId
1005
+ def get_last_request_id(self):
1006
+ return self.last_request_id