dashscope 1.8.0__py3-none-any.whl → 1.25.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dashscope/__init__.py +61 -14
- dashscope/aigc/__init__.py +10 -3
- dashscope/aigc/chat_completion.py +282 -0
- dashscope/aigc/code_generation.py +145 -0
- dashscope/aigc/conversation.py +71 -12
- dashscope/aigc/generation.py +288 -16
- dashscope/aigc/image_synthesis.py +473 -31
- dashscope/aigc/multimodal_conversation.py +299 -14
- dashscope/aigc/video_synthesis.py +610 -0
- dashscope/api_entities/aiohttp_request.py +8 -5
- dashscope/api_entities/api_request_data.py +4 -2
- dashscope/api_entities/api_request_factory.py +68 -20
- dashscope/api_entities/base_request.py +20 -3
- dashscope/api_entities/chat_completion_types.py +344 -0
- dashscope/api_entities/dashscope_response.py +243 -15
- dashscope/api_entities/encryption.py +179 -0
- dashscope/api_entities/http_request.py +216 -62
- dashscope/api_entities/websocket_request.py +43 -34
- dashscope/app/__init__.py +5 -0
- dashscope/app/application.py +203 -0
- dashscope/app/application_response.py +246 -0
- dashscope/assistants/__init__.py +16 -0
- dashscope/assistants/assistant_types.py +175 -0
- dashscope/assistants/assistants.py +311 -0
- dashscope/assistants/files.py +197 -0
- dashscope/audio/__init__.py +4 -2
- dashscope/audio/asr/__init__.py +17 -1
- dashscope/audio/asr/asr_phrase_manager.py +203 -0
- dashscope/audio/asr/recognition.py +167 -27
- dashscope/audio/asr/transcription.py +107 -14
- dashscope/audio/asr/translation_recognizer.py +1006 -0
- dashscope/audio/asr/vocabulary.py +177 -0
- dashscope/audio/qwen_asr/__init__.py +7 -0
- dashscope/audio/qwen_asr/qwen_transcription.py +189 -0
- dashscope/audio/qwen_omni/__init__.py +11 -0
- dashscope/audio/qwen_omni/omni_realtime.py +524 -0
- dashscope/audio/qwen_tts/__init__.py +5 -0
- dashscope/audio/qwen_tts/speech_synthesizer.py +77 -0
- dashscope/audio/qwen_tts_realtime/__init__.py +10 -0
- dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py +355 -0
- dashscope/audio/tts/__init__.py +2 -0
- dashscope/audio/tts/speech_synthesizer.py +5 -0
- dashscope/audio/tts_v2/__init__.py +12 -0
- dashscope/audio/tts_v2/enrollment.py +179 -0
- dashscope/audio/tts_v2/speech_synthesizer.py +886 -0
- dashscope/cli.py +157 -37
- dashscope/client/base_api.py +652 -87
- dashscope/common/api_key.py +2 -0
- dashscope/common/base_type.py +135 -0
- dashscope/common/constants.py +13 -16
- dashscope/common/env.py +2 -0
- dashscope/common/error.py +58 -22
- dashscope/common/logging.py +2 -0
- dashscope/common/message_manager.py +2 -0
- dashscope/common/utils.py +276 -46
- dashscope/customize/__init__.py +0 -0
- dashscope/customize/customize_types.py +192 -0
- dashscope/customize/deployments.py +146 -0
- dashscope/customize/finetunes.py +234 -0
- dashscope/embeddings/__init__.py +5 -1
- dashscope/embeddings/batch_text_embedding.py +208 -0
- dashscope/embeddings/batch_text_embedding_response.py +65 -0
- dashscope/embeddings/multimodal_embedding.py +118 -10
- dashscope/embeddings/text_embedding.py +13 -1
- dashscope/{file.py → files.py} +19 -4
- dashscope/io/input_output.py +2 -0
- dashscope/model.py +11 -2
- dashscope/models.py +43 -0
- dashscope/multimodal/__init__.py +20 -0
- dashscope/multimodal/dialog_state.py +56 -0
- dashscope/multimodal/multimodal_constants.py +28 -0
- dashscope/multimodal/multimodal_dialog.py +648 -0
- dashscope/multimodal/multimodal_request_params.py +313 -0
- dashscope/multimodal/tingwu/__init__.py +10 -0
- dashscope/multimodal/tingwu/tingwu.py +80 -0
- dashscope/multimodal/tingwu/tingwu_realtime.py +579 -0
- dashscope/nlp/__init__.py +0 -0
- dashscope/nlp/understanding.py +64 -0
- dashscope/protocol/websocket.py +3 -0
- dashscope/rerank/__init__.py +0 -0
- dashscope/rerank/text_rerank.py +69 -0
- dashscope/resources/qwen.tiktoken +151643 -0
- dashscope/threads/__init__.py +26 -0
- dashscope/threads/messages/__init__.py +0 -0
- dashscope/threads/messages/files.py +113 -0
- dashscope/threads/messages/messages.py +220 -0
- dashscope/threads/runs/__init__.py +0 -0
- dashscope/threads/runs/runs.py +501 -0
- dashscope/threads/runs/steps.py +112 -0
- dashscope/threads/thread_types.py +665 -0
- dashscope/threads/threads.py +212 -0
- dashscope/tokenizers/__init__.py +7 -0
- dashscope/tokenizers/qwen_tokenizer.py +111 -0
- dashscope/tokenizers/tokenization.py +125 -0
- dashscope/tokenizers/tokenizer.py +45 -0
- dashscope/tokenizers/tokenizer_base.py +32 -0
- dashscope/utils/__init__.py +0 -0
- dashscope/utils/message_utils.py +838 -0
- dashscope/utils/oss_utils.py +243 -0
- dashscope/utils/param_utils.py +29 -0
- dashscope/version.py +3 -1
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/METADATA +53 -50
- dashscope-1.25.6.dist-info/RECORD +112 -0
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/WHEEL +1 -1
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/entry_points.txt +0 -1
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info/licenses}/LICENSE +2 -4
- dashscope/deployment.py +0 -129
- dashscope/finetune.py +0 -149
- dashscope-1.8.0.dist-info/RECORD +0 -49
- {dashscope-1.8.0.dist-info → dashscope-1.25.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
|
|
3
|
+
from http import HTTPStatus
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
from dashscope.api_entities.dashscope_response import DashScopeAPIResponse
|
|
7
|
+
from dashscope.client.base_api import BaseAsyncApi
|
|
8
|
+
from dashscope.common.error import InvalidParameter
|
|
9
|
+
from dashscope.common.logging import logger
|
|
10
|
+
from dashscope.customize.finetunes import FineTunes
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AsrPhraseManager(BaseAsyncApi):
    """Hot word management for speech recognition.

    Every operation is delegated to ``FineTunes`` while
    ``FineTunes.SUB_PATH`` is temporarily overridden: 'fine-tunes' for
    create/update and 'fine-tunes/outputs' for query/list/delete.

    NOTE(review): ``FineTunes.SUB_PATH`` is a class attribute, so the
    temporary override is visible to any other code using ``FineTunes``
    while a request is in flight -- confirm callers do not rely on
    concurrent use.
    """
    @staticmethod
    def _run_with_sub_path(sub_path: str, api_method,
                           params: Dict[str, Any],
                           failure_message: str) -> DashScopeAPIResponse:
        """Invoke a ``FineTunes`` API with ``SUB_PATH`` temporarily replaced.

        Args:
            sub_path (str): Value assigned to ``FineTunes.SUB_PATH`` for
                the duration of the call.
            api_method: The ``FineTunes`` callable to invoke.
            params (Dict[str, Any]): Keyword arguments for ``api_method``.
            failure_message (str): Prefix of the error logged when the
                response status is not ``HTTPStatus.OK``.

        Returns:
            DashScopeAPIResponse: Whatever ``api_method`` returned.
        """
        original_ft_sub_path = FineTunes.SUB_PATH
        FineTunes.SUB_PATH = sub_path
        try:
            response = api_method(**params)
        finally:
            # Restore even when the request raises; otherwise every later
            # FineTunes call would silently keep using the phrase sub path.
            FineTunes.SUB_PATH = original_ft_sub_path

        if response.status_code != HTTPStatus.OK:
            logger.error(failure_message + str(response))

        return response

    @classmethod
    def create_phrases(cls,
                       model: str,
                       phrases: Dict[str, Any],
                       training_type: str = 'compile_asr_phrase',
                       workspace: str = None,
                       **kwargs) -> DashScopeAPIResponse:
        """Create hot words.

        Args:
            model (str): The requested model.
            phrases (Dict[str, Any]): A dictionary that contains phrases,
                such as {'下一首':90,'上一首':90}.
            training_type (str, `optional`): The training type,
                'compile_asr_phrase' is default.
            workspace (str): The dashscope workspace id.

        Raises:
            InvalidParameter: ``phrases`` or ``training_type`` is None
                or empty.

        Returns:
            DashScopeAPIResponse: The results of creating hot words.
        """
        if not phrases:
            raise InvalidParameter('phrases is empty!')
        if not training_type:
            raise InvalidParameter('training_type is empty!')

        return cls._run_with_sub_path(
            'fine-tunes', FineTunes.call,
            dict(model=model,
                 training_file_ids=[],
                 validation_file_ids=[],
                 mode=training_type,
                 hyper_parameters={'phrase_list': phrases},
                 workspace=workspace,
                 **kwargs), 'Create phrase failed, ')

    @classmethod
    def update_phrases(cls,
                       model: str,
                       phrase_id: str,
                       phrases: Dict[str, Any],
                       training_type: str = 'compile_asr_phrase',
                       workspace: str = None,
                       **kwargs) -> DashScopeAPIResponse:
        """Update the hot words identified by phrase_id.

        Args:
            model (str): The requested model.
            phrase_id (str): The ID of the phrases,
                as created by create_phrases().
            phrases (Dict[str, Any]): A dictionary that contains phrases,
                such as {'暂停':90}.
            training_type (str, `optional`):
                The training type, 'compile_asr_phrase' is default.
            workspace (str): The dashscope workspace id.

        Raises:
            InvalidParameter: ``phrase_id``, ``phrases`` or
                ``training_type`` is None or empty.

        Returns:
            DashScopeAPIResponse: The results of updating hot words.
        """
        if not phrase_id:
            raise InvalidParameter('phrase_id is empty!')
        if not phrases:
            raise InvalidParameter('phrases is empty!')
        if not training_type:
            raise InvalidParameter('training_type is empty!')

        return cls._run_with_sub_path(
            'fine-tunes', FineTunes.call,
            dict(model=model,
                 training_file_ids=[],
                 validation_file_ids=[],
                 mode=training_type,
                 hyper_parameters={'phrase_list': phrases},
                 # The existing phrase ID is sent as the fine-tuned output
                 # so the request targets that phrase list.
                 finetuned_output=phrase_id,
                 workspace=workspace,
                 **kwargs), 'Update phrase failed, ')

    @classmethod
    def query_phrases(cls,
                      phrase_id: str,
                      workspace: str = None,
                      **kwargs) -> DashScopeAPIResponse:
        """Query the hot words by phrase_id.

        Args:
            phrase_id (str): The ID of the phrases,
                as created by create_phrases().
            workspace (str): The dashscope workspace id.

        Raises:
            InvalidParameter: phrase_id input is None or empty!

        Returns:
            DashScopeAPIResponse: The results of querying hot words.
        """
        if not phrase_id:
            raise InvalidParameter('phrase_id is empty!')

        return cls._run_with_sub_path(
            'fine-tunes/outputs', FineTunes.get,
            dict(job_id=phrase_id, workspace=workspace, **kwargs),
            'Query phrase failed, ')

    @classmethod
    def list_phrases(cls,
                     page: int = 1,
                     page_size: int = 10,
                     workspace: str = None,
                     **kwargs) -> DashScopeAPIResponse:
        """List all information of phrases.

        Args:
            page (int): Page number, greater than 0, default value 1.
            page_size (int): The paging size, greater than 0
                and less than or equal to 100, default value 10.
            workspace (str): The dashscope workspace id.

        Returns:
            DashScopeAPIResponse: The results of listing hot words.
        """
        return cls._run_with_sub_path(
            'fine-tunes/outputs', FineTunes.list,
            dict(page=page,
                 page_size=page_size,
                 workspace=workspace,
                 **kwargs), 'List phrase failed, ')

    @classmethod
    def delete_phrases(cls,
                       phrase_id: str,
                       workspace: str = None,
                       **kwargs) -> DashScopeAPIResponse:
        """Delete the hot words by phrase_id.

        Args:
            phrase_id (str): The ID of the phrases,
                as created by create_phrases().
            workspace (str): The dashscope workspace id.

        Raises:
            InvalidParameter: phrase_id input is None or empty!

        Returns:
            DashScopeAPIResponse: The results of deleting hot words.
        """
        if not phrase_id:
            raise InvalidParameter('phrase_id is empty!')

        return cls._run_with_sub_path(
            'fine-tunes/outputs', FineTunes.delete,
            dict(job_id=phrase_id, workspace=workspace, **kwargs),
            'Delete phrase failed, ')
|
|
@@ -1,7 +1,12 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import os
|
|
3
5
|
import threading
|
|
6
|
+
import time
|
|
7
|
+
import uuid
|
|
4
8
|
from http import HTTPStatus
|
|
9
|
+
from queue import Queue
|
|
5
10
|
from threading import Timer
|
|
6
11
|
from typing import Any, Dict, List, Union
|
|
7
12
|
|
|
@@ -110,10 +115,20 @@ class Recognition(BaseApi):
|
|
|
110
115
|
speech recognition results.
|
|
111
116
|
format (str): The input audio format for speech recognition.
|
|
112
117
|
sample_rate (int): The input audio sample rate for speech recognition.
|
|
118
|
+
workspace (str): The dashscope workspace id.
|
|
113
119
|
|
|
114
120
|
**kwargs:
|
|
121
|
+
phrase_id (list, `optional`): The ID of phrase.
|
|
115
122
|
disfluency_removal_enabled(bool, `optional`): Filter mood words,
|
|
116
123
|
turned off by default.
|
|
124
|
+
diarization_enabled (bool, `optional`): Speech auto diarization,
|
|
125
|
+
turned off by default.
|
|
126
|
+
speaker_count (int, `optional`): The number of speakers.
|
|
127
|
+
timestamp_alignment_enabled (bool, `optional`): Timestamp-alignment
|
|
128
|
+
calibration, turned off by default.
|
|
129
|
+
special_word_filter(str, `optional`): Sensitive word filter.
|
|
130
|
+
audio_event_detection_enabled(bool, `optional`):
|
|
131
|
+
Audio event detection, turned off by default.
|
|
117
132
|
|
|
118
133
|
Raises:
|
|
119
134
|
InputRequired: Input is required.
|
|
@@ -121,8 +136,13 @@ class Recognition(BaseApi):
|
|
|
121
136
|
|
|
122
137
|
SILENCE_TIMEOUT_S = 23
|
|
123
138
|
|
|
124
|
-
def __init__(self,
|
|
125
|
-
|
|
139
|
+
def __init__(self,
|
|
140
|
+
model: str,
|
|
141
|
+
callback: RecognitionCallback,
|
|
142
|
+
format: str,
|
|
143
|
+
sample_rate: int,
|
|
144
|
+
workspace: str = None,
|
|
145
|
+
**kwargs):
|
|
126
146
|
if model is None:
|
|
127
147
|
raise ModelRequired('Model is required!')
|
|
128
148
|
if format is None:
|
|
@@ -137,15 +157,22 @@ class Recognition(BaseApi):
|
|
|
137
157
|
self._recognition_once = False
|
|
138
158
|
self._callback = callback
|
|
139
159
|
self._running = False
|
|
140
|
-
self._stream_data =
|
|
160
|
+
self._stream_data = Queue()
|
|
141
161
|
self._worker = None
|
|
142
162
|
self._silence_timer = None
|
|
143
163
|
self._kwargs = kwargs
|
|
164
|
+
self._workspace = workspace
|
|
165
|
+
self._start_stream_timestamp = -1
|
|
166
|
+
self._first_package_timestamp = -1
|
|
167
|
+
self._stop_stream_timestamp = -1
|
|
168
|
+
self._on_complete_timestamp = -1
|
|
169
|
+
self.request_id_confirmed = False
|
|
170
|
+
self.last_request_id = uuid.uuid4().hex
|
|
144
171
|
|
|
145
172
|
def __del__(self):
|
|
146
173
|
if self._running:
|
|
147
174
|
self._running = False
|
|
148
|
-
self._stream_data
|
|
175
|
+
self._stream_data = Queue()
|
|
149
176
|
if self._worker is not None and self._worker.is_alive():
|
|
150
177
|
self._worker.join()
|
|
151
178
|
if self._silence_timer is not None and self._silence_timer.is_alive( # noqa E501
|
|
@@ -162,25 +189,46 @@ class Recognition(BaseApi):
|
|
|
162
189
|
responses = self.__launch_request()
|
|
163
190
|
for part in responses:
|
|
164
191
|
if part.status_code == HTTPStatus.OK:
|
|
165
|
-
if len(part.output) == 0:
|
|
192
|
+
if len(part.output) == 0 or ('finished' in part.output and part.output['finished'] == True):
|
|
193
|
+
self._on_complete_timestamp = time.time() * 1000
|
|
194
|
+
logger.debug('last package delay {}'.format(
|
|
195
|
+
self.get_last_package_delay()))
|
|
166
196
|
self._callback.on_complete()
|
|
167
197
|
else:
|
|
168
198
|
usage: Dict[str, Any] = None
|
|
169
|
-
|
|
170
|
-
if 'sentence' in part.output
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
199
|
+
usages: List[Any] = None
|
|
200
|
+
if 'sentence' in part.output:
|
|
201
|
+
if 'text' in part.output['sentence'] and part.output['sentence']['text'] != '':
|
|
202
|
+
if (self._first_package_timestamp < 0):
|
|
203
|
+
self._first_package_timestamp = time.time() * 1000
|
|
204
|
+
logger.debug('first package delay {}'.format(
|
|
205
|
+
self.get_first_package_delay()))
|
|
206
|
+
sentence = part.output['sentence']
|
|
207
|
+
if 'heartbeat' in sentence and sentence['heartbeat'] == True:
|
|
208
|
+
logger.debug('recv heartbeat')
|
|
209
|
+
continue
|
|
210
|
+
logger.debug(
|
|
211
|
+
'Recv Result [rid:{}]:{}, isEnd: {}'.format(
|
|
212
|
+
part.request_id, sentence,
|
|
213
|
+
RecognitionResult.is_sentence_end(sentence)))
|
|
214
|
+
if part.usage is not None:
|
|
215
|
+
usage = {
|
|
216
|
+
'end_time':
|
|
217
|
+
part.output['sentence']['end_time'],
|
|
218
|
+
'usage': part.usage
|
|
219
|
+
}
|
|
220
|
+
usages = [usage]
|
|
221
|
+
if self.request_id_confirmed is False and part.request_id is not None:
|
|
222
|
+
self.last_request_id = part.request_id
|
|
223
|
+
self.request_id_confirmed = True
|
|
176
224
|
|
|
177
225
|
self._callback.on_event(
|
|
178
226
|
RecognitionResult(
|
|
179
227
|
RecognitionResponse.from_api_response(part),
|
|
180
|
-
usages=
|
|
228
|
+
usages=usages))
|
|
181
229
|
else:
|
|
182
230
|
self._running = False
|
|
183
|
-
self._stream_data
|
|
231
|
+
self._stream_data = Queue()
|
|
184
232
|
self._callback.on_error(
|
|
185
233
|
RecognitionResult(
|
|
186
234
|
RecognitionResponse.from_api_response(part)))
|
|
@@ -190,6 +238,15 @@ class Recognition(BaseApi):
|
|
|
190
238
|
def __launch_request(self):
|
|
191
239
|
"""Initiate real-time speech recognition requests.
|
|
192
240
|
"""
|
|
241
|
+
resources_list: list = []
|
|
242
|
+
if self._phrase is not None and len(self._phrase) > 0:
|
|
243
|
+
item = {'resource_id': self._phrase, 'resource_type': 'asr_phrase'}
|
|
244
|
+
resources_list.append(item)
|
|
245
|
+
|
|
246
|
+
if len(resources_list) > 0:
|
|
247
|
+
self._kwargs['resources'] = resources_list
|
|
248
|
+
|
|
249
|
+
self._tidy_kwargs()
|
|
193
250
|
task_name, _ = _get_task_group_and_task(__name__)
|
|
194
251
|
responses = super().call(model=self.model,
|
|
195
252
|
task_group='audio',
|
|
@@ -202,13 +259,30 @@ class Recognition(BaseApi):
|
|
|
202
259
|
sample_rate=self.sample_rate,
|
|
203
260
|
format=self.format,
|
|
204
261
|
stream=True,
|
|
262
|
+
workspace=self._workspace,
|
|
263
|
+
pre_task_id=self.last_request_id,
|
|
205
264
|
**self._kwargs)
|
|
206
265
|
return responses
|
|
207
266
|
|
|
208
|
-
def start(self):
|
|
267
|
+
def start(self, phrase_id: str = None, **kwargs):
|
|
209
268
|
"""Real-time speech recognition in asynchronous mode.
|
|
210
269
|
Please call 'stop()' after you have completed recognition.
|
|
211
270
|
|
|
271
|
+
Args:
|
|
272
|
+
phrase_id (str, `optional`): The ID of phrase.
|
|
273
|
+
|
|
274
|
+
**kwargs:
|
|
275
|
+
disfluency_removal_enabled(bool, `optional`):
|
|
276
|
+
Filter mood words, turned off by default.
|
|
277
|
+
diarization_enabled (bool, `optional`):
|
|
278
|
+
Speech auto diarization, turned off by default.
|
|
279
|
+
speaker_count (int, `optional`): The number of speakers.
|
|
280
|
+
timestamp_alignment_enabled (bool, `optional`):
|
|
281
|
+
Timestamp-alignment calibration, turned off by default.
|
|
282
|
+
special_word_filter(str, `optional`): Sensitive word filter.
|
|
283
|
+
audio_event_detection_enabled(bool, `optional`):
|
|
284
|
+
Audio event detection, turned off by default.
|
|
285
|
+
|
|
212
286
|
Raises:
|
|
213
287
|
InvalidParameter: This interface cannot be called again
|
|
214
288
|
if it has already been started.
|
|
@@ -219,6 +293,12 @@ class Recognition(BaseApi):
|
|
|
219
293
|
if self._running:
|
|
220
294
|
raise InvalidParameter('Speech recognition has started.')
|
|
221
295
|
|
|
296
|
+
self._start_stream_timestamp = -1
|
|
297
|
+
self._first_package_timestamp = -1
|
|
298
|
+
self._stop_stream_timestamp = -1
|
|
299
|
+
self._on_complete_timestamp = -1
|
|
300
|
+
self._phrase = phrase_id
|
|
301
|
+
self._kwargs.update(**kwargs)
|
|
222
302
|
self._recognition_once = False
|
|
223
303
|
self._worker = threading.Thread(target=self.__receive_worker)
|
|
224
304
|
self._worker.start()
|
|
@@ -234,11 +314,27 @@ class Recognition(BaseApi):
|
|
|
234
314
|
self._running = False
|
|
235
315
|
raise InvalidTask('Invalid task, task create failed.')
|
|
236
316
|
|
|
237
|
-
def call(self,
|
|
317
|
+
def call(self,
|
|
318
|
+
file: str,
|
|
319
|
+
phrase_id: str = None,
|
|
320
|
+
**kwargs) -> RecognitionResult:
|
|
238
321
|
"""Real-time speech recognition in synchronous mode.
|
|
239
322
|
|
|
240
323
|
Args:
|
|
241
324
|
file (str): The path to the local audio file.
|
|
325
|
+
phrase_id (str, `optional`): The ID of phrase.
|
|
326
|
+
|
|
327
|
+
**kwargs:
|
|
328
|
+
disfluency_removal_enabled(bool, `optional`):
|
|
329
|
+
Filter mood words, turned off by default.
|
|
330
|
+
diarization_enabled (bool, `optional`):
|
|
331
|
+
Speech auto diarization, turned off by default.
|
|
332
|
+
speaker_count (int, `optional`): The number of speakers.
|
|
333
|
+
timestamp_alignment_enabled (bool, `optional`):
|
|
334
|
+
Timestamp-alignment calibration, turned off by default.
|
|
335
|
+
special_word_filter(str, `optional`): Sensitive word filter.
|
|
336
|
+
audio_event_detection_enabled(bool, `optional`):
|
|
337
|
+
Audio event detection, turned off by default.
|
|
242
338
|
|
|
243
339
|
Raises:
|
|
244
340
|
InvalidParameter: This interface cannot be called again
|
|
@@ -248,6 +344,7 @@ class Recognition(BaseApi):
|
|
|
248
344
|
Returns:
|
|
249
345
|
RecognitionResult: The result of speech recognition.
|
|
250
346
|
"""
|
|
347
|
+
self._start_stream_timestamp = time.time() * 1000
|
|
251
348
|
if self._running:
|
|
252
349
|
raise InvalidParameter('Speech recognition has been called.')
|
|
253
350
|
|
|
@@ -258,12 +355,14 @@ class Recognition(BaseApi):
|
|
|
258
355
|
raise FileNotFoundError('No such file or directory: ' + file)
|
|
259
356
|
|
|
260
357
|
self._recognition_once = True
|
|
358
|
+
self._stream_data = Queue()
|
|
359
|
+
self._phrase = phrase_id
|
|
360
|
+
self._kwargs.update(**kwargs)
|
|
261
361
|
error_flag: bool = False
|
|
262
362
|
sentences: List[Any] = []
|
|
263
363
|
usages: List[Any] = []
|
|
264
364
|
response: RecognitionResponse = None
|
|
265
365
|
result: RecognitionResult = None
|
|
266
|
-
self._stream_data.clear()
|
|
267
366
|
|
|
268
367
|
try:
|
|
269
368
|
audio_data: bytes = None
|
|
@@ -274,22 +373,33 @@ class Recognition(BaseApi):
|
|
|
274
373
|
if not audio_data:
|
|
275
374
|
break
|
|
276
375
|
else:
|
|
277
|
-
self._stream_data
|
|
376
|
+
self._stream_data.put(audio_data)
|
|
278
377
|
else:
|
|
279
378
|
raise InputDataRequired(
|
|
280
379
|
'The supplied file was empty (zero bytes long)')
|
|
281
380
|
f.close()
|
|
381
|
+
self._stop_stream_timestamp = time.time() * 1000
|
|
282
382
|
except Exception as e:
|
|
283
383
|
logger.error(e)
|
|
284
384
|
raise e
|
|
285
385
|
|
|
286
|
-
if
|
|
386
|
+
if not self._stream_data.empty():
|
|
287
387
|
self._running = True
|
|
288
388
|
responses = self.__launch_request()
|
|
289
389
|
for part in responses:
|
|
290
390
|
if part.status_code == HTTPStatus.OK:
|
|
291
391
|
if 'sentence' in part.output:
|
|
392
|
+
if 'text' in part.output['sentence'] and part.output['sentence']['text'] != '':
|
|
393
|
+
if (self._first_package_timestamp < 0):
|
|
394
|
+
self._first_package_timestamp = time.time() * 1000
|
|
395
|
+
logger.debug('first package delay {}'.format(
|
|
396
|
+
self._first_package_timestamp -
|
|
397
|
+
self._start_stream_timestamp))
|
|
292
398
|
sentence = part.output['sentence']
|
|
399
|
+
logger.debug(
|
|
400
|
+
'Recv Result [rid:{}]:{}, isEnd: {}'.format(
|
|
401
|
+
part.request_id, sentence,
|
|
402
|
+
RecognitionResult.is_sentence_end(sentence)))
|
|
293
403
|
if RecognitionResult.is_sentence_end(sentence):
|
|
294
404
|
sentences.append(sentence)
|
|
295
405
|
|
|
@@ -308,12 +418,16 @@ class Recognition(BaseApi):
|
|
|
308
418
|
error_flag = True
|
|
309
419
|
break
|
|
310
420
|
|
|
421
|
+
self._on_complete_timestamp = time.time() * 1000
|
|
422
|
+
logger.debug('last package delay {}'.format(
|
|
423
|
+
self.get_last_package_delay()))
|
|
424
|
+
|
|
311
425
|
if error_flag:
|
|
312
426
|
result = RecognitionResult(response)
|
|
313
427
|
else:
|
|
314
428
|
result = RecognitionResult(response, sentences, usages)
|
|
315
429
|
|
|
316
|
-
self._stream_data
|
|
430
|
+
self._stream_data = Queue()
|
|
317
431
|
self._recognition_once = False
|
|
318
432
|
self._running = False
|
|
319
433
|
|
|
@@ -328,10 +442,12 @@ class Recognition(BaseApi):
|
|
|
328
442
|
if self._running is False:
|
|
329
443
|
raise InvalidParameter('Speech recognition has stopped.')
|
|
330
444
|
|
|
445
|
+
self._stop_stream_timestamp = time.time() * 1000
|
|
446
|
+
|
|
331
447
|
self._running = False
|
|
332
448
|
if self._worker is not None and self._worker.is_alive():
|
|
333
449
|
self._worker.join()
|
|
334
|
-
self._stream_data
|
|
450
|
+
self._stream_data = Queue()
|
|
335
451
|
if self._silence_timer is not None and self._silence_timer.is_alive():
|
|
336
452
|
self._silence_timer.cancel()
|
|
337
453
|
self._silence_timer = None
|
|
@@ -347,12 +463,21 @@ class Recognition(BaseApi):
|
|
|
347
463
|
if self._running is False:
|
|
348
464
|
raise InvalidParameter('Speech recognition has stopped.')
|
|
349
465
|
|
|
350
|
-
|
|
466
|
+
if (self._start_stream_timestamp < 0):
|
|
467
|
+
self._start_stream_timestamp = time.time() * 1000
|
|
468
|
+
logger.debug('send_audio_frame: {}'.format(len(buffer)))
|
|
469
|
+
self._stream_data.put(buffer)
|
|
470
|
+
|
|
471
|
+
def _tidy_kwargs(self):
|
|
472
|
+
for k in self._kwargs.copy():
|
|
473
|
+
if self._kwargs[k] is None:
|
|
474
|
+
self._kwargs.pop(k, None)
|
|
351
475
|
|
|
352
476
|
def _input_stream_cycle(self):
|
|
353
477
|
while self._running:
|
|
354
|
-
while
|
|
478
|
+
while self._stream_data.empty():
|
|
355
479
|
if self._running:
|
|
480
|
+
time.sleep(0.01)
|
|
356
481
|
continue
|
|
357
482
|
else:
|
|
358
483
|
break
|
|
@@ -365,16 +490,17 @@ class Recognition(BaseApi):
|
|
|
365
490
|
self._silence_stop_timer)
|
|
366
491
|
self._silence_timer.start()
|
|
367
492
|
|
|
368
|
-
|
|
493
|
+
while not self._stream_data.empty():
|
|
494
|
+
frame = self._stream_data.get()
|
|
369
495
|
yield bytes(frame)
|
|
370
|
-
self._stream_data.clear()
|
|
371
496
|
|
|
372
497
|
if self._recognition_once:
|
|
373
498
|
self._running = False
|
|
374
499
|
|
|
375
500
|
# drain all audio data when invoking stop().
|
|
376
501
|
if self._recognition_once is False:
|
|
377
|
-
|
|
502
|
+
while not self._stream_data.empty():
|
|
503
|
+
frame = self._stream_data.get()
|
|
378
504
|
yield bytes(frame)
|
|
379
505
|
|
|
380
506
|
def _silence_stop_timer(self):
|
|
@@ -386,4 +512,18 @@ class Recognition(BaseApi):
|
|
|
386
512
|
self._silence_timer = None
|
|
387
513
|
if self._worker is not None and self._worker.is_alive():
|
|
388
514
|
self._worker.join()
|
|
389
|
-
self._stream_data
|
|
515
|
+
self._stream_data = Queue()
|
|
516
|
+
|
|
517
|
+
def get_first_package_delay(self):
    """Return the delay before the first recognized words arrived.

    This is ``_first_package_timestamp`` minus ``_start_stream_timestamp``,
    i.e. the time between starting to send audio and receiving the first
    package that contains words.
    """
    stream_started_at = self._start_stream_timestamp
    first_package_at = self._first_package_timestamp
    return first_package_at - stream_started_at
|
|
521
|
+
|
|
522
|
+
def get_last_package_delay(self):
    """Return the delay before the final result arrived.

    This is ``_on_complete_timestamp`` minus ``_stop_stream_timestamp``,
    i.e. the time between stopping the audio stream and receiving the
    last words package.
    """
    stream_stopped_at = self._stop_stream_timestamp
    completed_at = self._on_complete_timestamp
    return completed_at - stream_stopped_at
|
|
526
|
+
|
|
527
|
+
# Get the task ID of the previous task.
def get_last_request_id(self):
    """Return the value currently stored in ``self.last_request_id``."""
    request_id = self.last_request_id
    return request_id
|