dashscope 1.21.0__py3-none-any.whl → 1.22.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dashscope might be problematic. Click here for more details.

@@ -0,0 +1,1004 @@
1
+ import json
2
+ import os
3
+ import threading
4
+ import time
5
+ import uuid
6
+ from http import HTTPStatus
7
+ from queue import Queue
8
+ from threading import Timer
9
+ from typing import Any, Dict, List
10
+
11
+ from dashscope.client.base_api import BaseApi
12
+ from dashscope.common.constants import ApiProtocol
13
+ from dashscope.common.error import (InputDataRequired, InputRequired,
14
+ InvalidParameter, InvalidTask,
15
+ ModelRequired)
16
+ from dashscope.common.logging import logger
17
+ from dashscope.common.utils import _get_task_group_and_task
18
+ from dashscope.protocol.websocket import WebsocketStreamingMode
19
+
20
+ DASHSCOPE_TRANSLATION_KEY = 'translations'
21
+ DASHSCOPE_TRANSCRIPTION_KEY = 'transcription'
22
+
23
+
24
class ThreadSafeBool:
    """A boolean flag whose reads and writes are serialized by a lock."""

    def __init__(self, initial_value=False):
        # The lock guards _value so concurrent set()/get() calls never race.
        self._lock = threading.Lock()
        self._value = initial_value

    def set(self, value):
        """Atomically replace the stored boolean value."""
        with self._lock:
            self._value = value

    def get(self):
        """Atomically read the stored boolean value."""
        with self._lock:
            return self._value
36
+
37
+
38
class WordObj():
    """A single recognized word together with its timing information."""
    def __init__(self, ) -> None:
        self.text: str = None        # the word text
        self.begin_time: int = None  # word start offset, in milliseconds
        self.end_time: int = None    # word end offset, in milliseconds
        self.fixed: bool = False     # whether the word is final (will not change)
        self._raw_data = None        # original JSON payload, kept for __str__

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a Word object from a JSON dictionary.
        """
        word = WordObj()
        word._raw_data = json_data
        word.text = json_data['text']
        word.begin_time = json_data['begin_time']
        word.end_time = json_data['end_time']
        word.fixed = json_data['fixed']
        return word

    def __str__(self) -> str:
        # Render the raw payload rather than the parsed fields so nothing
        # the server sent is lost in debug output.
        return 'Word: ' + json.dumps(self._raw_data, ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
63
+
64
+
65
class SentenceBaseObj():
    """Common sentence fields shared by transcription and translation results."""
    def __init__(self, ) -> None:
        self.sentence_id: int = -1      # index of this sentence in the stream
        self.text: str = None           # full sentence text
        self.begin_time: int = None     # sentence start, in milliseconds
        self.end_time: int = None       # sentence end (or running time), in ms
        self.words: List[WordObj] = []  # per-word timing details
        self._raw_data = None           # original JSON payload, kept for __str__

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a SentenceBase object from a JSON dictionary.
        """
        sentence = SentenceBaseObj()
        sentence._raw_data = json_data
        sentence.sentence_id = json_data['sentence_id']
        sentence.text = json_data['text']
        sentence.begin_time = json_data['begin_time']
        # An in-progress sentence carries no final end_time yet, so fall
        # back to the running current_time supplied by the server.
        end_time = json_data.get('end_time')
        if end_time is None:
            end_time = json_data['current_time']
        sentence.end_time = end_time
        sentence.words = [
            WordObj.from_json(item) for item in json_data['words']
        ]
        return sentence

    def __str__(self) -> str:
        return json.dumps(self._raw_data, ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
97
+
98
+
99
class TranscriptionResult(SentenceBaseObj):
    """A streaming speech-transcription update for one sentence.

    Extends SentenceBaseObj with a sentence-end flag, VAD (voice activity
    detection) related fields, and an optional ``stash`` carrying the
    not-yet-finalized tail of the next sentence.
    """
    def __init__(self, ) -> None:
        # Fix: initialize the inherited sentence fields (sentence_id, text,
        # begin_time, end_time, words, _raw_data) too; previously a bare
        # TranscriptionResult() lacked them until from_json() filled them in.
        super().__init__()
        self.stash: SentenceBaseObj = None
        self.is_sentence_end = False
        # vad related
        self.vad_pre_end: bool = False
        self.pre_end_failed: bool = False
        self.pre_end_timemillis: int = -1
        self.pre_end_start_time: int = -1
        self.pre_end_end_time: int = -1

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a TranscriptionResult object from a JSON dictionary.

        ``end_time`` falls back to ``current_time`` while the sentence is
        still in progress and has no final end timestamp yet.
        """
        transcription = TranscriptionResult()
        transcription.sentence_id = json_data['sentence_id']
        transcription.text = json_data['text']
        transcription.begin_time = json_data['begin_time']
        if json_data.get('end_time') is not None:
            transcription.end_time = json_data['end_time']
        else:
            transcription.end_time = json_data['current_time']
        transcription.words = [
            WordObj.from_json(word) for word in json_data['words']
        ]
        transcription.is_sentence_end = json_data.get('sentence_end')
        if 'stash' in json_data:
            transcription.stash = SentenceBaseObj.from_json(json_data['stash'])
        if 'vad_pre_end' in json_data:
            transcription.vad_pre_end = json_data['vad_pre_end']
        if 'pre_end_failed' in json_data:
            transcription.pre_end_failed = json_data['pre_end_failed']
        if 'pre_end_start_time' in json_data:
            transcription.pre_end_start_time = json_data['pre_end_start_time']
        if 'pre_end_end_time' in json_data:
            transcription.pre_end_end_time = json_data['pre_end_end_time']
        # Keep the raw payload for __str__/debugging (fix: the original
        # assigned _raw_data twice).
        transcription._raw_data = json_data
        return transcription

    def __str__(self) -> str:
        return 'Transcriptions: ' + json.dumps(self._raw_data,
                                               ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
147
+
148
+
149
class Translation(SentenceBaseObj):
    """A streaming translation update for one sentence in one target language.

    Extends SentenceBaseObj with the target language code, a sentence-end
    flag, VAD (voice activity detection) related fields, and an optional
    ``stash`` carrying the not-yet-finalized tail of the next sentence.
    """
    def __init__(self, ) -> None:
        # Fix: initialize the inherited sentence fields (sentence_id, text,
        # begin_time, end_time, words, _raw_data) too; previously a bare
        # Translation() lacked them until from_json() filled them in.
        super().__init__()
        self.language: str = None
        self.stash: SentenceBaseObj = None
        self.is_sentence_end = False
        # vad related
        self.vad_pre_end: bool = False
        self.pre_end_failed: bool = False
        self.pre_end_timemillis: int = -1
        self.pre_end_start_time: int = -1
        self.pre_end_end_time: int = -1

    @staticmethod
    def from_json(json_data: Dict[str, Any]):
        """Create a Translation object from a JSON dictionary.

        ``end_time`` falls back to ``current_time`` while the sentence is
        still in progress and has no final end timestamp yet.
        """
        translation = Translation()
        translation.sentence_id = json_data['sentence_id']
        translation.text = json_data['text']
        translation.begin_time = json_data['begin_time']
        if json_data.get('end_time') is not None:
            translation.end_time = json_data['end_time']
        else:
            translation.end_time = json_data['current_time']
        translation.words = [
            WordObj.from_json(word) for word in json_data['words']
        ]

        translation.language = json_data['lang']
        translation.is_sentence_end = json_data.get('sentence_end')
        if 'stash' in json_data:
            translation.stash = SentenceBaseObj.from_json(json_data['stash'])
        if 'vad_pre_end' in json_data:
            translation.vad_pre_end = json_data['vad_pre_end']
        if 'pre_end_failed' in json_data:
            translation.pre_end_failed = json_data['pre_end_failed']
        if 'pre_end_start_time' in json_data:
            translation.pre_end_start_time = json_data['pre_end_start_time']
        if 'pre_end_end_time' in json_data:
            translation.pre_end_end_time = json_data['pre_end_end_time']
        # Keep the raw payload for __str__/debugging (fix: the original
        # assigned _raw_data twice).
        translation._raw_data = json_data
        return translation

    def __str__(self) -> str:
        return 'Translation: ' + json.dumps(self._raw_data, ensure_ascii=False)

    def __repr__(self):
        return self.__str__()
199
+
200
+
201
+ class TranslationResult():
202
+ def __init__(self, ) -> None:
203
+ self.translations: Dict[str:Translation] = {}
204
+ self.is_sentence_end = False
205
+ self._raw_data = None
206
+
207
+ def get_translation(self, language) -> Translation:
208
+ if self.translations is None:
209
+ return None
210
+ return self.translations.get(language)
211
+
212
+ def get_language_list(self, ) -> List[str]:
213
+ if self.translations is None:
214
+ return None
215
+ return list(self.translations.keys())
216
+
217
+ @staticmethod
218
+ def from_json(json_data: List):
219
+ """Create a TranslationResult object from a JSON dictionary.
220
+ """
221
+ result = TranslationResult()
222
+ result._raw_data = json_data
223
+ for translation_json in json_data:
224
+ if not isinstance(translation_json, dict):
225
+ raise InvalidParameter(
226
+ f'Invalid translation json data: {translation_json}')
227
+ else:
228
+ translation = Translation.from_json(translation_json)
229
+ result.translations[translation.language] = translation
230
+ if translation.is_sentence_end:
231
+ result.is_sentence_end = True
232
+ return result
233
+
234
+ def __str__(self) -> str:
235
+ return 'TranslationList: ' + json.dumps(self._raw_data,
236
+ ensure_ascii=False)
237
+
238
+ def __repr__(self):
239
+ return self.__str__()
240
+
241
+
242
class TranslationRecognizerResultPack():
    """Aggregated output of a synchronous call(): one entry per sentence."""
    def __init__(self) -> None:
        self.request_id: str = None   # server-assigned request id
        self.error_message = None     # failing response, if the call errored
        # Parallel lists, appended together at each sentence end.
        self.transcription_result_list: List[TranscriptionResult] = []
        self.translation_result_list: List[TranslationResult] = []
        self.usage_list: List = []
249
+
250
+
251
class TranslationRecognizerCallback():
    """An interface that defines callback methods for getting translation recognizer results. # noqa E501
    Derive from this class and implement its function to provide your own data.
    """
    def on_open(self) -> None:
        # Invoked once the recognizer task has started successfully.
        pass

    def on_complete(self) -> None:
        # Invoked when the server signals that all results have been delivered.
        pass

    def on_error(self, message) -> None:
        # Invoked with the failing response object when the request errors out.
        pass

    def on_close(self) -> None:
        # Invoked after the task stops, fails, or is garbage collected.
        pass

    def on_event(self, request_id, transcription_result: TranscriptionResult,
                 translation_result: TranslationResult, usage) -> None:
        # Invoked for each result packet; either result argument may be None
        # when the packet carries only the other kind of output.
        pass
270
+
271
+
272
class TranslationRecognizerRealtime(BaseApi):
    """TranslationRecognizerRealtime interface.

    Streams audio to the dashscope websocket service and delivers
    transcription/translation results through a callback (asynchronous
    start()/stop() mode) or as a result pack (synchronous call() mode).

    Args:
        model (str): The requested model_id.
        callback (TranslationRecognizerCallback): A callback that returns
            TranslationRecognizerRealtime results.
        format (str): The input audio format.
        sample_rate (int): The input audio sample rate.
        workspace (str): The dashscope workspace id.

    **kwargs:
        phrase_id (list, `optional`): The ID of phrase.
        disfluency_removal_enabled(bool, `optional`): Filter mood words,
            turned off by default.
        diarization_enabled (bool, `optional`): Speech auto diarization,
            turned off by default.
        speaker_count (int, `optional`): The number of speakers.
        timestamp_alignment_enabled (bool, `optional`): Timestamp-alignment
            calibration, turned off by default.
        special_word_filter(str, `optional`): Sensitive word filter.
        audio_event_detection_enabled(bool, `optional`):
            Audio event detection, turned off by default.

    Raises:
        InputRequired: Input is required.
    """

    # Abort the task when no audio frame arrives for this many seconds.
    SILENCE_TIMEOUT_S = 23

    def __init__(self,
                 model: str,
                 callback: TranslationRecognizerCallback,
                 format: str,
                 sample_rate: int,
                 transcription_enabled: bool = True,
                 source_language: str = None,
                 translation_enabled: bool = False,
                 workspace: str = None,
                 **kwargs):
        if model is None:
            raise ModelRequired('Model is required!')
        if format is None:
            raise InputRequired('format is required!')
        if sample_rate is None:
            raise InputRequired('sample_rate is required!')

        self.model = model
        self.format = format
        self.sample_rate = sample_rate
        self.source_language = source_language
        self.transcription_enabled = transcription_enabled
        self.translation_enabled = translation_enabled
        # continuous recognition with start() or once recognition with call()
        self._recognition_once = False
        self._callback = callback
        self._running = False
        # Audio frames are queued here and drained by _input_stream_cycle().
        self._stream_data = Queue()
        self._worker = None
        self._silence_timer = None
        self._kwargs = kwargs
        self._workspace = workspace
        # Millisecond timestamps used to compute first/last package delays.
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        # A locally generated request id is used until the server confirms one.
        self.request_id_confirmed = False
        self.last_request_id = uuid.uuid4().hex

    def __del__(self):
        # Best-effort cleanup if the user never called stop().
        if self._running:
            self._running = False
            self._stream_data = Queue()
            if self._worker is not None and self._worker.is_alive():
                self._worker.join()
            if self._silence_timer is not None and self._silence_timer.is_alive(  # noqa E501
            ):
                self._silence_timer.cancel()
                self._silence_timer = None
            if self._callback:
                self._callback.on_close()

    def __receive_worker(self):
        """Asynchronously, initiate a real-time translation recognizer request
        and obtain the result for parsing.
        """
        responses = self.__launch_request()
        for part in responses:
            if part.status_code == HTTPStatus.OK:
                logger.debug('Received response request_id: {} {}'.format(
                    part.request_id, part.output))
                # An empty output marks the final (task-finished) packet.
                if len(part.output) == 0:
                    self._on_complete_timestamp = time.time() * 1000
                    logger.debug('last package delay {}'.format(
                        self.get_last_package_delay()))
                    self._callback.on_complete()
                else:
                    usage = None
                    transcription = None
                    translations = None
                    if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
                        transcription = TranscriptionResult.from_json(
                            part.output[DASHSCOPE_TRANSCRIPTION_KEY])
                    if DASHSCOPE_TRANSLATION_KEY in part.output:
                        translations = TranslationResult.from_json(
                            part.output[DASHSCOPE_TRANSLATION_KEY])
                    if transcription is not None or translations is not None:
                        # Record the arrival of the first useful packet.
                        if (self._first_package_timestamp < 0):
                            self._first_package_timestamp = time.time() * 1000
                            logger.debug('first package delay {}'.format(
                                self.get_first_package_delay()))

                    if part.usage is not None:
                        usage = part.usage
                    # Adopt the server-side request id the first time we see one.
                    if self.request_id_confirmed is False and part.request_id is not None:
                        self.last_request_id = part.request_id
                        self.request_id_confirmed = True
                    self._callback.on_event(part.request_id, transcription,
                                            translations, usage)
            else:
                # Any non-OK packet aborts the stream and surfaces the error.
                self._running = False
                self._stream_data = Queue()
                self._callback.on_error(part)
                self._callback.on_close()
                break

    def __launch_request(self):
        """Initiate real-time translation recognizer requests.

        Returns the streaming response iterator from the websocket duplex
        call; audio is supplied lazily by _input_stream_cycle().
        """

        self._tidy_kwargs()
        task_name, _ = _get_task_group_and_task(__name__)
        responses = super().call(
            model=self.model,
            task_group='audio',
            task=task_name,
            function='recognition',
            input=self._input_stream_cycle(),
            api_protocol=ApiProtocol.WEBSOCKET,
            ws_stream_mode=WebsocketStreamingMode.DUPLEX,
            is_binary_input=True,
            sample_rate=self.sample_rate,
            format=self.format,
            stream=True,
            source_language=self.source_language,
            transcription_enabled=self.transcription_enabled,
            translation_enabled=self.translation_enabled,
            workspace=self._workspace,
            pre_task_id=self.last_request_id,
            **self._kwargs)
        return responses

    def start(self, **kwargs):
        """Real-time translation recognizer in asynchronous mode.
        Please call 'stop()' after you have completed translation & recognition.

        Args:
            phrase_id (str, `optional`): The ID of phrase.

            **kwargs:
                disfluency_removal_enabled(bool, `optional`):
                    Filter mood words, turned off by default.
                diarization_enabled (bool, `optional`):
                    Speech auto diarization, turned off by default.
                speaker_count (int, `optional`): The number of speakers.
                timestamp_alignment_enabled (bool, `optional`):
                    Timestamp-alignment calibration, turned off by default.
                special_word_filter(str, `optional`): Sensitive word filter.
                audio_event_detection_enabled(bool, `optional`):
                    Audio event detection, turned off by default.

        Raises:
            InvalidParameter: This interface cannot be called again
                if it has already been started.
            InvalidTask: Task create failed.
        """
        assert self._callback is not None, 'Please set the callback to get the translation & recognition result.'  # noqa E501

        if self._running:
            raise InvalidParameter(
                'TranslationRecognizerRealtime has started.')

        # Reset per-session timing metrics.
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        self._kwargs.update(**kwargs)
        self._recognition_once = False
        self._worker = threading.Thread(target=self.__receive_worker)
        self._worker.start()
        if self._worker.is_alive():
            self._running = True
            self._callback.on_open()

            # If audio data is not received for 23 seconds, the timeout exits
            self._silence_timer = Timer(
                TranslationRecognizerRealtime.SILENCE_TIMEOUT_S,
                self._silence_stop_timer)
            self._silence_timer.start()
        else:
            self._running = False
            raise InvalidTask('Invalid task, task create failed.')

    def call(self,
             file: str,
             phrase_id: str = None,
             **kwargs) -> TranslationRecognizerResultPack:
        """TranslationRecognizerRealtime in synchronous mode.

        Args:
            file (str): The path to the local audio file.
            phrase_id (str, `optional`): The ID of phrase.

            **kwargs:
                disfluency_removal_enabled(bool, `optional`):
                    Filter mood words, turned off by default.
                diarization_enabled (bool, `optional`):
                    Speech auto diarization, turned off by default.
                speaker_count (int, `optional`): The number of speakers.
                timestamp_alignment_enabled (bool, `optional`):
                    Timestamp-alignment calibration, turned off by default.
                special_word_filter(str, `optional`): Sensitive word filter.
                audio_event_detection_enabled(bool, `optional`):
                    Audio event detection, turned off by default.

        Raises:
            InvalidParameter: This interface cannot be called again
                if it has already been started.
            InputDataRequired: The supplied file was empty.

        Returns:
            TranslationRecognizerResultPack: The result of speech translation & recognition.
        """
        self._start_stream_timestamp = time.time() * 1000
        if self._running:
            raise InvalidParameter(
                'TranslationRecognizerRealtime has been called.')

        if os.path.exists(file):
            if os.path.isdir(file):
                raise IsADirectoryError('Is a directory: ' + file)
        else:
            raise FileNotFoundError('No such file or directory: ' + file)

        self._recognition_once = True
        self._stream_data = Queue()
        self._phrase = phrase_id
        self._kwargs.update(**kwargs)
        results = TranslationRecognizerResultPack()
        error_message = None

        try:
            audio_data: bytes = None
            f = open(file, 'rb')
            if os.path.getsize(file):
                # Queue the whole file in 12800-byte chunks before launching
                # the request; _input_stream_cycle() drains the queue.
                while True:
                    audio_data = f.read(12800)
                    if not audio_data:
                        break
                    else:
                        self._stream_data.put(audio_data)
            else:
                raise InputDataRequired(
                    'The supplied file was empty (zero bytes long)')
            f.close()
            self._stop_stream_timestamp = time.time() * 1000
        except Exception as e:
            logger.error(e)
            raise e

        if not self._stream_data.empty():
            self._running = True
            responses = self.__launch_request()
            for part in responses:
                if part.status_code == HTTPStatus.OK:
                    logger.debug('received data: {}'.format(part.output))
                    # Compute first-package delay on the first useful packet.
                    transcription = None
                    translation = None
                    usage = None
                    # NOTE(review): this checks the key 'translation', but the
                    # output key elsewhere is 'translations'
                    # (DASHSCOPE_TRANSLATION_KEY) — translation-only packets
                    # may not trigger the first-package timestamp; confirm.
                    if ('translation' in part.output) or ('transcription'
                                                          in part.output):
                        if (self._first_package_timestamp < 0):
                            self._first_package_timestamp = time.time() * 1000
                            logger.debug('first package delay {}'.format(
                                self._first_package_timestamp -
                                self._start_stream_timestamp))
                    if part.usage is not None:
                        usage = part.usage

                    if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
                        transcription = TranscriptionResult.from_json(
                            part.output[DASHSCOPE_TRANSCRIPTION_KEY])

                    if DASHSCOPE_TRANSLATION_KEY in part.output:
                        translation = TranslationResult.from_json(
                            part.output[DASHSCOPE_TRANSLATION_KEY])

                    # Only completed sentences are appended to the result pack.
                    if (transcription is not None
                            and transcription.is_sentence_end) or (
                                translation is not None
                                and translation.is_sentence_end):
                        results.request_id = part.request_id
                        results.transcription_result_list.append(transcription)
                        results.translation_result_list.append(translation)
                        results.usage_list.append(usage)
                else:
                    error_message = part
                    logger.error(error_message)
                    break

        self._on_complete_timestamp = time.time() * 1000
        logger.debug('last package delay {}'.format(
            self.get_last_package_delay()))

        # Reset state so the instance can be reused.
        self._stream_data = Queue()
        self._recognition_once = False
        self._running = False
        results.error_message = error_message
        return results

    def stop(self):
        """End asynchronous TranslationRecognizerRealtime.

        Raises:
            InvalidParameter: Cannot stop an uninitiated TranslationRecognizerRealtime.
        """
        if self._running is False:
            raise InvalidParameter(
                'TranslationRecognizerRealtime has stopped.')

        self._stop_stream_timestamp = time.time() * 1000

        # Clearing _running lets _input_stream_cycle() drain the queue and
        # finish, which ends the websocket request the worker is consuming.
        self._running = False
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
            self._silence_timer = None
        if self._callback:
            self._callback.on_close()

    def send_audio_frame(self, buffer: bytes):
        """Push audio to TranslationRecognizerRealtime.

        Raises:
            InvalidParameter: Cannot send data to an uninitiated TranslationRecognizerRealtime.
        """
        if self._running is False:
            raise InvalidParameter(
                'TranslationRecognizerRealtime has stopped.')

        # The first frame marks the start of streaming for delay metrics.
        if (self._start_stream_timestamp < 0):
            self._start_stream_timestamp = time.time() * 1000
        logger.debug('send_audio_frame: {}'.format(len(buffer)))
        self._stream_data.put(buffer)

    def _tidy_kwargs(self):
        # Drop None-valued kwargs so they are not serialized into the request.
        for k in self._kwargs.copy():
            if self._kwargs[k] is None:
                self._kwargs.pop(k, None)

    def _input_stream_cycle(self):
        # Generator consumed by the websocket layer: yields queued audio
        # frames while running, polling at 10 ms when the queue is empty.
        while self._running:
            while self._stream_data.empty():
                if self._running:
                    time.sleep(0.01)
                    continue
                else:
                    break

            # Reset silence_timer when getting stream.
            if self._silence_timer is not None and self._silence_timer.is_alive(  # noqa E501
            ):
                self._silence_timer.cancel()
                self._silence_timer = Timer(
                    TranslationRecognizerRealtime.SILENCE_TIMEOUT_S,
                    self._silence_stop_timer)
                self._silence_timer.start()

            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

            # In call() mode the whole file was queued up front, so one
            # drain pass is enough.
            if self._recognition_once:
                self._running = False

        # drain all audio data when invoking stop().
        if self._recognition_once is False:
            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

    def _silence_stop_timer(self):
        """If audio data is not received for a long time, exit worker.
        """
        self._running = False
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
        self._silence_timer = None
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()

    def get_first_package_delay(self):
        """First Package Delay is the time between start sending audio and receive first words package
        """
        return self._first_package_timestamp - self._start_stream_timestamp

    def get_last_package_delay(self):
        """Last Package Delay is the time between stop sending audio and receive last words package
        """
        return self._on_complete_timestamp - self._stop_stream_timestamp

    # Get the request id (task id) of the last task.
    def get_last_request_id(self):
        return self.last_request_id
690
+
691
+
692
class TranslationRecognizerChat(BaseApi):
    """TranslationRecognizerChat interface.

    Like TranslationRecognizerRealtime, but tailored for chat-style turns:
    once a sentence end is received, further audio frames are rejected
    (send_audio_frame returns False) so the caller can stop the turn.

    Args:
        model (str): The requested model_id.
        callback (TranslationRecognizerCallback): A callback that returns
            TranslationRecognizerChat results.
        format (str): The input audio format.
        sample_rate (int): The input audio sample rate.
        workspace (str): The dashscope workspace id.

    **kwargs:
        phrase_id (list, `optional`): The ID of phrase.
        disfluency_removal_enabled(bool, `optional`): Filter mood words,
            turned off by default.
        diarization_enabled (bool, `optional`): Speech auto diarization,
            turned off by default.
        speaker_count (int, `optional`): The number of speakers.
        timestamp_alignment_enabled (bool, `optional`): Timestamp-alignment
            calibration, turned off by default.
        special_word_filter(str, `optional`): Sensitive word filter.
        audio_event_detection_enabled(bool, `optional`):
            Audio event detection, turned off by default.

    Raises:
        InputRequired: Input is required.
    """

    # Abort the task when no audio frame arrives for this many seconds.
    SILENCE_TIMEOUT_S = 23

    def __init__(self,
                 model: str,
                 callback: TranslationRecognizerCallback,
                 format: str,
                 sample_rate: int,
                 transcription_enabled: bool = True,
                 source_language: str = None,
                 translation_enabled: bool = False,
                 workspace: str = None,
                 **kwargs):
        if model is None:
            raise ModelRequired('Model is required!')
        if format is None:
            raise InputRequired('format is required!')
        if sample_rate is None:
            raise InputRequired('sample_rate is required!')

        self.model = model
        self.format = format
        self.sample_rate = sample_rate
        self.source_language = source_language
        self.transcription_enabled = transcription_enabled
        self.translation_enabled = translation_enabled
        # continuous recognition with start() or once recognition with call()
        self._recognition_once = False
        self._callback = callback
        self._running = False
        # Audio frames are queued here and drained by _input_stream_cycle().
        self._stream_data = Queue()
        self._worker = None
        self._silence_timer = None
        self._kwargs = kwargs
        self._workspace = workspace
        # Millisecond timestamps used to compute first/last package delays.
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        # A locally generated request id is used until the server confirms one.
        self.request_id_confirmed = False
        self.last_request_id = uuid.uuid4().hex
        # Set (thread-safely) by the receive worker when a sentence ends;
        # read by send_audio_frame to reject further audio for this turn.
        self._is_sentence_end = ThreadSafeBool(False)

    def __del__(self):
        # Best-effort cleanup if the user never called stop().
        if self._running:
            self._running = False
            self._stream_data = Queue()
            if self._worker is not None and self._worker.is_alive():
                self._worker.join()
            if self._silence_timer is not None and self._silence_timer.is_alive(  # noqa E501
            ):
                self._silence_timer.cancel()
                self._silence_timer = None
            if self._callback:
                self._callback.on_close()

    def __receive_worker(self):
        """Asynchronously, initiate a real-time translation recognizer request
        and obtain the result for parsing.
        """
        responses = self.__launch_request()
        for part in responses:
            if part.status_code == HTTPStatus.OK:
                logger.debug('Received response request_id: {} {}'.format(
                    part.request_id, part.output))
                # An empty output marks the final (task-finished) packet.
                if len(part.output) == 0:
                    self._on_complete_timestamp = time.time() * 1000
                    logger.debug('last package delay {}'.format(
                        self.get_last_package_delay()))
                    self._callback.on_complete()
                else:
                    usage = None
                    transcription = None
                    translations = None
                    if DASHSCOPE_TRANSCRIPTION_KEY in part.output:
                        transcription = TranscriptionResult.from_json(
                            part.output[DASHSCOPE_TRANSCRIPTION_KEY])
                    if DASHSCOPE_TRANSLATION_KEY in part.output:
                        translations = TranslationResult.from_json(
                            part.output[DASHSCOPE_TRANSLATION_KEY])
                    if transcription is not None or translations is not None:
                        # Record the arrival of the first useful packet.
                        if (self._first_package_timestamp < 0):
                            self._first_package_timestamp = time.time() * 1000
                            logger.debug('first package delay {}'.format(
                                self.get_first_package_delay()))

                    if part.usage is not None:
                        usage = part.usage
                    # Adopt the server-side request id the first time we see one.
                    if self.request_id_confirmed is False and part.request_id is not None:
                        self.last_request_id = part.request_id
                        self.request_id_confirmed = True
                    # A sentence end in either stream ends this chat turn:
                    # send_audio_frame will reject further audio.
                    if transcription is not None and transcription.is_sentence_end:
                        logger.debug(
                            '[Chat] recv sentence end in transcription, stop asr'
                        )
                        self._is_sentence_end.set(True)
                    if translations is not None and translations.is_sentence_end:
                        logger.debug(
                            '[Chat] recv sentence end in translation, stop asr'
                        )
                        self._is_sentence_end.set(True)
                    self._callback.on_event(part.request_id, transcription,
                                            translations, usage)
            else:
                # Any non-OK packet aborts the stream and surfaces the error.
                self._running = False
                self._stream_data = Queue()
                self._callback.on_error(part)
                self._callback.on_close()
                break

    def __launch_request(self):
        """Initiate real-time translation recognizer requests.

        Returns the streaming response iterator from the websocket duplex
        call; audio is supplied lazily by _input_stream_cycle().
        """

        self._tidy_kwargs()
        task_name, _ = _get_task_group_and_task(__name__)
        responses = super().call(
            model=self.model,
            task_group='audio',
            task=task_name,
            function='recognition',
            input=self._input_stream_cycle(),
            api_protocol=ApiProtocol.WEBSOCKET,
            ws_stream_mode=WebsocketStreamingMode.DUPLEX,
            is_binary_input=True,
            sample_rate=self.sample_rate,
            format=self.format,
            stream=True,
            source_language=self.source_language,
            transcription_enabled=self.transcription_enabled,
            translation_enabled=self.translation_enabled,
            workspace=self._workspace,
            pre_task_id=self.last_request_id,
            **self._kwargs)
        return responses

    def start(self, **kwargs):
        """Real-time translation recognizer in asynchronous mode.
        Please call 'stop()' after you have completed translation & recognition.

        Args:
            phrase_id (str, `optional`): The ID of phrase.

            **kwargs:
                disfluency_removal_enabled(bool, `optional`):
                    Filter mood words, turned off by default.
                diarization_enabled (bool, `optional`):
                    Speech auto diarization, turned off by default.
                speaker_count (int, `optional`): The number of speakers.
                timestamp_alignment_enabled (bool, `optional`):
                    Timestamp-alignment calibration, turned off by default.
                special_word_filter(str, `optional`): Sensitive word filter.
                audio_event_detection_enabled(bool, `optional`):
                    Audio event detection, turned off by default.

        Raises:
            InvalidParameter: This interface cannot be called again
                if it has already been started.
            InvalidTask: Task create failed.
        """
        assert self._callback is not None, 'Please set the callback to get the translation & recognition result.'  # noqa E501

        if self._running:
            raise InvalidParameter('TranslationRecognizerChat has started.')

        # Reset per-session timing metrics.
        self._start_stream_timestamp = -1
        self._first_package_timestamp = -1
        self._stop_stream_timestamp = -1
        self._on_complete_timestamp = -1
        self._kwargs.update(**kwargs)
        self._recognition_once = False
        self._worker = threading.Thread(target=self.__receive_worker)
        self._worker.start()
        if self._worker.is_alive():
            self._running = True
            self._callback.on_open()

            # If audio data is not received for 23 seconds, the timeout exits
            self._silence_timer = Timer(
                TranslationRecognizerChat.SILENCE_TIMEOUT_S,
                self._silence_stop_timer)
            self._silence_timer.start()
        else:
            self._running = False
            raise InvalidTask('Invalid task, task create failed.')

    def stop(self):
        """End asynchronous TranslationRecognizerChat.

        Raises:
            InvalidParameter: Cannot stop an uninitiated TranslationRecognizerChat.
        """
        if self._running is False:
            raise InvalidParameter('TranslationRecognizerChat has stopped.')

        self._stop_stream_timestamp = time.time() * 1000
        logger.debug('stop TranslationRecognizerChat')
        # Clearing _running lets _input_stream_cycle() drain the queue and
        # finish, which ends the websocket request the worker is consuming.
        self._running = False
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
            self._silence_timer = None
        if self._callback:
            self._callback.on_close()

    def send_audio_frame(self, buffer: bytes) -> bool:
        """Push audio to TranslationRecognizerChat.

        Returns:
            bool: False when the current turn already hit a sentence end and
            the frame was discarded; True when the frame was queued.

        Raises:
            InvalidParameter: Cannot send data to an uninitiated TranslationRecognizerChat.
        """
        if self._is_sentence_end.get():
            logger.debug('skip audio due to has sentence end.')
            return False

        if self._running is False:
            raise InvalidParameter('TranslationRecognizerChat has stopped.')

        # The first frame marks the start of streaming for delay metrics.
        if (self._start_stream_timestamp < 0):
            self._start_stream_timestamp = time.time() * 1000
        logger.debug('send_audio_frame: {}'.format(len(buffer)))
        self._stream_data.put(buffer)
        return True

    def _tidy_kwargs(self):
        # Drop None-valued kwargs so they are not serialized into the request.
        for k in self._kwargs.copy():
            if self._kwargs[k] is None:
                self._kwargs.pop(k, None)

    def _input_stream_cycle(self):
        # Generator consumed by the websocket layer: yields queued audio
        # frames while running, polling at 10 ms when the queue is empty.
        while self._running:
            while self._stream_data.empty():
                if self._running:
                    time.sleep(0.01)
                    continue
                else:
                    break

            # Reset silence_timer when getting stream.
            if self._silence_timer is not None and self._silence_timer.is_alive(  # noqa E501
            ):
                self._silence_timer.cancel()
                self._silence_timer = Timer(
                    TranslationRecognizerChat.SILENCE_TIMEOUT_S,
                    self._silence_stop_timer)
                self._silence_timer.start()

            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

            if self._recognition_once:
                self._running = False

        # drain all audio data when invoking stop().
        if self._recognition_once is False:
            while not self._stream_data.empty():
                frame = self._stream_data.get()
                yield bytes(frame)

    def _silence_stop_timer(self):
        """If audio data is not received for a long time, exit worker.
        """
        self._running = False
        if self._silence_timer is not None and self._silence_timer.is_alive():
            self._silence_timer.cancel()
        self._silence_timer = None
        if self._worker is not None and self._worker.is_alive():
            self._worker.join()
        self._stream_data = Queue()

    def get_first_package_delay(self):
        """First Package Delay is the time between start sending audio and receive first words package
        """
        return self._first_package_timestamp - self._start_stream_timestamp

    def get_last_package_delay(self):
        """Last Package Delay is the time between stop sending audio and receive last words package
        """
        return self._on_complete_timestamp - self._stop_stream_timestamp

    # Get the request id (task id) of the last task.
    def get_last_request_id(self):
        return self.last_request_id