dashscope 1.23.8__py3-none-any.whl → 1.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dashscope/app/application_response.py +48 -1
- dashscope/assistants/assistant_types.py +9 -0
- dashscope/assistants/assistants.py +31 -2
- dashscope/assistants/files.py +7 -1
- dashscope/audio/__init__.py +2 -2
- dashscope/audio/qwen_omni/__init__.py +11 -0
- dashscope/audio/qwen_omni/omni_realtime.py +415 -0
- dashscope/audio/qwen_tts_realtime/__init__.py +10 -0
- dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py +314 -0
- dashscope/audio/tts_v2/speech_synthesizer.py +38 -22
- dashscope/cli.py +54 -0
- dashscope/embeddings/text_embedding.py +1 -0
- dashscope/multimodal/multimodal_request_params.py +10 -1
- dashscope/threads/runs/runs.py +21 -0
- dashscope/threads/thread_types.py +12 -0
- dashscope/utils/oss_utils.py +3 -0
- dashscope/version.py +1 -1
- {dashscope-1.23.8.dist-info → dashscope-1.24.0.dist-info}/METADATA +1 -1
- {dashscope-1.23.8.dist-info → dashscope-1.24.0.dist-info}/RECORD +23 -19
- {dashscope-1.23.8.dist-info → dashscope-1.24.0.dist-info}/WHEEL +0 -0
- {dashscope-1.23.8.dist-info → dashscope-1.24.0.dist-info}/entry_points.txt +0 -0
- {dashscope-1.23.8.dist-info → dashscope-1.24.0.dist-info}/licenses/LICENSE +0 -0
- {dashscope-1.23.8.dist-info → dashscope-1.24.0.dist-info}/top_level.txt +0 -0

dashscope/app/application_response.py
CHANGED

@@ -6,7 +6,7 @@
 """
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from dashscope.api_entities.dashscope_response import (DashScopeAPIResponse,
                                                        DictMixin)
@@ -105,6 +105,50 @@ class ApplicationDocReference(DictMixin):
                          page_number=page_number,
                          **kwargs)
 
+@dataclass(init=False)
+class WorkflowMessage(DictMixin):
+    node_id: str
+    node_name: str
+    node_type: str
+    node_status: str
+    node_is_completed: str
+    node_msg_seq_id: int
+    message: str
+
+    class Message(DictMixin):
+        role: str
+        content: str
+
+    def __init__(self,
+                 node_id: str = None,
+                 node_name: str = None,
+                 node_type: str = None,
+                 node_status: str = None,
+                 node_is_completed: str = None,
+                 node_msg_seq_id: int = None,
+                 message: Message = None,
+                 **kwargs):
+        """ Workflow message.
+
+        Args:
+            node_id (str, optional): .
+            node_name (str, optional): .
+            node_type (str, optional): .
+            node_status (str, optional): .
+            node_is_completed (str, optional): .
+            node_msg_seq_id (int, optional): .
+            message (Message, optional): .
+        """
+
+        super().__init__(node_id=node_id,
+                         node_name=node_name,
+                         node_type=node_type,
+                         node_status=node_status,
+                         node_is_completed=node_is_completed,
+                         node_msg_seq_id=node_msg_seq_id,
+                         message=message,
+                         **kwargs)
+
 
 @dataclass(init=False)
 class ApplicationOutput(DictMixin):

@@ -113,6 +157,7 @@ class ApplicationOutput(DictMixin):
     session_id: str
     thoughts: List[ApplicationThought]
     doc_references: List[ApplicationDocReference]
+    workflow_message: WorkflowMessage
 
     def __init__(self,
                  text: str = None,

@@ -120,6 +165,7 @@ class ApplicationOutput(DictMixin):
                  session_id: str = None,
                  thoughts: List[ApplicationThought] = None,
                  doc_references: List[ApplicationDocReference] = None,
+                 workflow_message: WorkflowMessage = None,
                  **kwargs):
 
         ths = None

@@ -139,6 +185,7 @@ class ApplicationOutput(DictMixin):
                          session_id=session_id,
                          thoughts=ths,
                          doc_references=refs,
+                         workflow_message=workflow_message,
                          **kwargs)
 
 
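The new workflow_message field surfaces per-node progress for workflow-type applications. A minimal consumption sketch, assuming Application is exported at the package top level as in the official samples; the app_id, prompt, and streaming flag are placeholders, and only the output.workflow_message shape comes from the diff above:

from http import HTTPStatus

from dashscope import Application

responses = Application.call(app_id='your-app-id',  # placeholder app id
                             prompt='run my workflow',
                             stream=True)
for response in responses:
    if response.status_code != HTTPStatus.OK:
        print('request failed: {}'.format(response.message))
        continue
    wf = response.output.workflow_message  # new in 1.24.0
    if wf is not None:
        print(wf.node_name, wf.node_status, wf.message)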

dashscope/assistants/assistant_types.py
CHANGED

@@ -118,6 +118,15 @@ class Assistant(BaseObjectMixin):
     metadata: Optional[object] = None
     tools: List[Tool]
 
+    object: Optional[str] = None
+
+    top_p: Optional[float] = None
+    top_k: Optional[int] = None
+    temperature: Optional[float] = None
+    max_tokens: Optional[int] = None
+
+    request_id: Optional[str] = None
+
     def __init__(self, **kwargs):
         self.tools = convert_tools_dict_to_objects(kwargs.pop('tools', []))
         super().__init__(**kwargs)

dashscope/assistants/assistants.py
CHANGED

@@ -26,6 +26,10 @@ class Assistants(CreateMixin, CancelMixin, DeleteMixin, ListObjectMixin,
                                  tools: Optional[List[Dict]] = None,
                                  file_ids: Optional[List[str]] = [],
                                  metadata: Dict = {},
+                                 top_p: Optional[float] = None,
+                                 top_k: Optional[int] = None,
+                                 temperature: Optional[float] = None,
+                                 max_tokens: Optional[int] = None,
                                  ):
         obj = {}
         if model:

@@ -41,6 +45,15 @@ class Assistants(CreateMixin, CancelMixin, DeleteMixin, ListObjectMixin,
             obj['file_ids'] = file_ids
         obj['metadata'] = metadata
 
+        if top_p is not None:
+            obj['top_p'] = top_p
+        if top_k is not None:
+            obj['top_k'] = top_k
+        if temperature is not None:
+            obj['temperature'] = temperature
+        if max_tokens is not None:
+            obj['max_tokens'] = max_tokens
+
         return obj
 
     @classmethod

@@ -98,6 +111,10 @@ class Assistants(CreateMixin, CancelMixin, DeleteMixin, ListObjectMixin,
                metadata: Dict = None,
                workspace: str = None,
                api_key: str = None,
+               top_p: Optional[float] = None,
+               top_k: Optional[int] = None,
+               temperature: Optional[float] = None,
+               max_tokens: Optional[int] = None,
                **kwargs) -> Assistant:
         """Create Assistant.
 

@@ -111,6 +128,10 @@ class Assistants(CreateMixin, CancelMixin, DeleteMixin, ListObjectMixin,
             metadata (Dict, optional): Custom key-value pairs associated with the assistant. Defaults to None.
             workspace (str, optional): The DashScope workspace id. Defaults to None.
             api_key (str, optional): The DashScope api key. Defaults to None.
+            top_p (float, optional): top_p parameter for the model. Defaults to None.
+            top_k (int, optional): top_k parameter for the model. Defaults to None.
+            temperature (float, optional): temperature parameter for the model. Defaults to None.
+            max_tokens (int, optional): max_tokens parameter for the model. Defaults to None.
 
         Raises:
             ModelRequired: The model is required.

@@ -122,7 +143,7 @@ class Assistants(CreateMixin, CancelMixin, DeleteMixin, ListObjectMixin,
             raise ModelRequired('Model is required!')
         data = cls._create_assistant_object(model, name, description,
                                             instructions, tools, file_ids,
-                                            metadata)
+                                            metadata, top_p, top_k, temperature, max_tokens)
         response = super().call(data=data,
                                 api_key=api_key,
                                 flattened_output=True,

@@ -224,6 +245,10 @@ class Assistants(CreateMixin, CancelMixin, DeleteMixin, ListObjectMixin,
                metadata: Dict = None,
                workspace: str = None,
                api_key: str = None,
+               top_p: Optional[float] = None,
+               top_k: Optional[int] = None,
+               temperature: Optional[float] = None,
+               max_tokens: Optional[int] = None,
                **kwargs) -> Assistant:
         """Update an existing assistant.
 

@@ -238,6 +263,10 @@ class Assistants(CreateMixin, CancelMixin, DeleteMixin, ListObjectMixin,
             metadata (Dict, optional): Custom key-value pairs associated with the assistant. Defaults to None.
             workspace (str): The DashScope workspace id.
             api_key (str, optional): The DashScope api key. Defaults to None.
+            top_p (float, optional): top_p parameter for the model. Defaults to None.
+            top_k (int, optional): top_k parameter for the model. Defaults to None.
+            temperature (float, optional): temperature parameter for the model. Defaults to None.
+            max_tokens (int, optional): max_tokens parameter for the model. Defaults to None.
 
         Returns:
             Assistant: The updated assistant.

@@ -247,7 +276,7 @@ class Assistants(CreateMixin, CancelMixin, DeleteMixin, ListObjectMixin,
         response = super().update(assistant_id,
                                   cls._create_assistant_object(
                                       model, name, description, instructions,
-                                      tools, file_ids, metadata),
+                                      tools, file_ids, metadata, top_p, top_k, temperature, max_tokens),
                                   api_key=api_key,
                                   workspace=workspace,
                                   flattened_output=True,
dashscope/assistants/files.py
CHANGED

@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Optional
 
 from dashscope.assistants.assistant_types import (AssistantFile,
                                                   AssistantFileList,

@@ -148,7 +149,7 @@ class Files(CreateMixin, DeleteMixin, ListObjectMixin, GetStatusMixin):
                  assistant_id: str,
                  workspace: str = None,
                  api_key: str = None,
-                 **kwargs) -> AssistantFile:
+                 **kwargs) -> Optional[AssistantFile]:
         """Retrieve file information.
 
         Args:

@@ -160,6 +161,11 @@ class Files(CreateMixin, DeleteMixin, ListObjectMixin, GetStatusMixin):
         Returns:
             Optional[AssistantFile]: The `AssistantFile` object, or None if the request fails.
         """
+        response = super().get(target=assistant_id + '/files/' + file_id, api_key=api_key, workspace=workspace, **kwargs)
+        if response.status_code == 200 and response.output:
+            return AssistantFile(**response.output)
+        else:
+            return None
 
     @classmethod
     def delete(cls,
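The diff above gives retrieve an actual body (previously the method ended at its docstring and implicitly returned None), and it now returns None on any failed lookup, so callers should check. A usage sketch with placeholder ids:

from dashscope.assistants.files import Files

file = Files.retrieve('file-xxx', assistant_id='asst-xxx')  # placeholder ids
if file is None:
    print('file not found or request failed')
else:
    print(file.id)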
dashscope/audio/__init__.py
CHANGED

@@ -1,5 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from . import asr, tts, tts_v2, qwen_tts
+from . import asr, tts, tts_v2, qwen_tts, qwen_tts_realtime, qwen_omni
 
-__all__ = [asr, tts, tts_v2, qwen_tts]
+__all__ = [asr, tts, tts_v2, qwen_tts, qwen_tts_realtime, qwen_omni]
dashscope/audio/qwen_omni/__init__.py
ADDED

@@ -0,0 +1,11 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .omni_realtime import (AudioFormat, MultiModality, OmniRealtimeCallback,
+                            OmniRealtimeConversation)
+
+__all__ = [
+    'OmniRealtimeCallback',
+    'AudioFormat',
+    'MultiModality',
+    'OmniRealtimeConversation',
+]
dashscope/audio/qwen_omni/omni_realtime.py
ADDED

@@ -0,0 +1,415 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import json
+import platform
+import threading
+import time
+from typing import List
+import uuid
+from enum import Enum, unique
+
+import dashscope
+import websocket
+from dashscope.common.error import InputRequired, ModelRequired
+from dashscope.common.logging import logger
+
+
+class OmniRealtimeCallback:
+    """
+    An interface that defines callback methods for getting omni-realtime results. # noqa E501
+    Derive from this class and implement its function to provide your own data.
+    """
+    def on_open(self) -> None:
+        pass
+
+    def on_close(self, close_status_code, close_msg) -> None:
+        pass
+
+    def on_event(self, message: str) -> None:
+        pass
+
+
+@unique
+class AudioFormat(Enum):
+    # format, sample_rate, channels, bit_rate, name
+    PCM_16000HZ_MONO_16BIT = ('pcm', 16000, 'mono', '16bit', 'pcm16')
+    PCM_24000HZ_MONO_16BIT = ('pcm', 24000, 'mono', '16bit', 'pcm16')
+
+    def __init__(self, format, sample_rate, channels, bit_rate, format_str):
+        self.format = format
+        self.sample_rate = sample_rate
+        self.channels = channels
+        self.bit_rate = bit_rate
+        self.format_str = format_str
+
+    def __repr__(self):
+        return self.format_str
+
+    def __str__(self):
+        return f'{self.format.upper()} with {self.sample_rate}Hz sample rate, {self.channels} channel, {self.bit_rate} bit rate: {self.format_str}'
+
+
+class MultiModality(Enum):
+    """
+    MultiModality
+    """
+    TEXT = 'text'
+    AUDIO = 'audio'
+
+    def __str__(self):
+        return self.name
+
+
+class OmniRealtimeConversation:
+    def __init__(
+        self,
+        model,
+        callback: OmniRealtimeCallback,
+        headers=None,
+        workspace=None,
+        url=None,
+        additional_params=None,
+    ):
+        """
+        Qwen Omni Realtime SDK
+        Parameters:
+        -----------
+        model: str
+            Model name.
+        headers: Dict
+            User-defined headers.
+        callback: OmniRealtimeCallback
+            Callback to receive real-time omni results.
+        workspace: str
+            Dashscope workspace ID.
+        url: str
+            Dashscope WebSocket URL.
+        additional_params: Dict
+            Additional parameters for the Dashscope API.
+        """
+
+        if model is None:
+            raise ModelRequired('Model is required!')
+        if callback is None:
+            raise ModelRequired('Callback is required!')
+        if url is None:
+            url = f'wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model={model}'
+        else:
+            url = f'{url}?model={model}'
+        self.url = url
+        self.apikey = dashscope.api_key
+        self.user_headers = headers
+        self.user_workspace = workspace
+        self.model = model
+        self.config = {}
+        self.callback = callback
+        self.ws = None
+        self.session_id = None
+        self.last_message = None
+        self.last_response_id = None
+        self.last_response_create_time = None
+        self.last_first_text_delay = None
+        self.last_first_audio_delay = None
+        self.metrics = []
+
+    def _generate_event_id(self):
+        '''
+        generate random event id: event_xxxx
+        '''
+        return 'event_' + uuid.uuid4().hex
+
+    def _get_websocket_header(self, ):
+        ua = 'dashscope/%s; python/%s; platform/%s; processor/%s' % (
+            '1.18.0',  # dashscope version
+            platform.python_version(),
+            platform.platform(),
+            platform.processor(),
+        )
+        headers = {
+            'user-agent': ua,
+            'Authorization': 'bearer ' + self.apikey,
+        }
+        if self.user_headers:
+            headers = {**self.user_headers, **headers}
+        if self.user_workspace:
+            headers = {
+                **headers,
+                'X-DashScope-WorkSpace': self.user_workspace,
+            }
+        return headers
+
+    def connect(self) -> None:
+        '''
+        connect to server, create session and return default session configuration
+        '''
+        self.ws = websocket.WebSocketApp(
+            self.url,
+            header=self._get_websocket_header(),
+            on_message=self.on_message,
+            on_error=self.on_error,
+            on_close=self.on_close,
+        )
+        self.thread = threading.Thread(target=self.ws.run_forever)
+        self.thread.daemon = True
+        self.thread.start()
+        timeout = 5  # maximum wait time in seconds
+        start_time = time.time()
+        while (not (self.ws.sock and self.ws.sock.connected)
+               and (time.time() - start_time) < timeout):
+            time.sleep(0.1)  # sleep briefly to avoid busy polling
+        if not (self.ws.sock and self.ws.sock.connected):
+            raise TimeoutError(
+                'websocket connection could not be established within 5s. '
+                'Please check your network connection, firewall settings, or server status.'
+            )
+        self.callback.on_open()
+
+    def __send_str(self, data: str, enable_log: bool = True):
+        if enable_log:
+            logger.debug('[omni realtime] send string: {}'.format(data))
+        self.ws.send(data)
+
+    def update_session(self,
+                       output_modalities: List[MultiModality],
+                       voice: str,
+                       input_audio_format: AudioFormat = AudioFormat.
+                       PCM_16000HZ_MONO_16BIT,
+                       output_audio_format: AudioFormat = AudioFormat.
+                       PCM_24000HZ_MONO_16BIT,
+                       enable_input_audio_transcription: bool = True,
+                       input_audio_transcription_model: str = None,
+                       enable_turn_detection: bool = True,
+                       turn_detection_type: str = 'server_vad',
+                       prefix_padding_ms: int = 300,
+                       turn_detection_threshold: float = 0.2,
+                       turn_detection_silence_duration_ms: int = 800,
+                       turn_detection_param: dict = None,
+                       **kwargs) -> None:
+        '''
+        update session configuration, should be used before create response
+
+        Parameters
+        ----------
+        output_modalities: list[MultiModality]
+            omni output modalities to be used in session
+        voice: str
+            voice to be used in session
+        input_audio_format: AudioFormat
+            input audio format
+        output_audio_format: AudioFormat
+            output audio format
+        enable_turn_detection: bool
+            enable turn detection
+        turn_detection_threshold: float
+            turn detection threshold, range [-1, 1].
+            In a noisy environment, it may be necessary to increase the threshold to reduce false detections.
+            In a quiet environment, it may be necessary to decrease the threshold to improve sensitivity.
+        turn_detection_silence_duration_ms: int
+            duration of silence in milliseconds to detect turn, range [200, 6000]
+        '''
+        self.config = {
+            'modalities': [m.value for m in output_modalities],
+            'voice': voice,
+            'input_audio_format': input_audio_format.format_str,
+            'output_audio_format': output_audio_format.format_str,
+        }
+        if enable_input_audio_transcription:
+            self.config['input_audio_transcription'] = {
+                'model': input_audio_transcription_model,
+            }
+        else:
+            self.config['input_audio_transcription'] = None
+        if enable_turn_detection:
+            self.config['turn_detection'] = {
+                'type': turn_detection_type,
+                'threshold': turn_detection_threshold,
+                'prefix_padding_ms': prefix_padding_ms,
+                'silence_duration_ms': turn_detection_silence_duration_ms,
+            }
+            if turn_detection_param is not None:
+                self.config['turn_detection'].update(turn_detection_param)
+        else:
+            self.config['turn_detection'] = None
+        self.config.update(kwargs)
+        self.__send_str(
+            json.dumps({
+                'event_id': self._generate_event_id(),
+                'type': 'session.update',
+                'session': self.config
+            }))
+
+    def append_audio(self, audio_b64: str) -> None:
+        '''
+        send audio in base64 format
+
+        Parameters
+        ----------
+        audio_b64: str
+            base64 audio string
+        '''
+        logger.debug('[omni realtime] append audio: {}'.format(len(audio_b64)))
+        self.__send_str(
+            json.dumps({
+                'event_id': self._generate_event_id(),
+                'type': 'input_audio_buffer.append',
+                'audio': audio_b64
+            }), False)
+
+    def append_video(self, video_b64: str) -> None:
+        '''
+        send one image frame in video in base64 format
+
+        Parameters
+        ----------
+        video_b64: str
+            base64 image string
+        '''
+        logger.debug('[omni realtime] append video: {}'.format(len(video_b64)))
+        self.__send_str(
+            json.dumps({
+                'event_id': self._generate_event_id(),
+                'type': 'input_image_buffer.append',
+                'image': video_b64
+            }), False)
+
+    def commit(self, ) -> None:
+        '''
+        Commit the audio and video sent before.
+        When in Server VAD mode, the client does not need to use this method,
+        the server will commit the audio automatically after detecting vad end.
+        '''
+        self.__send_str(
+            json.dumps({
+                'event_id': self._generate_event_id(),
+                'type': 'input_audio_buffer.commit'
+            }))
+
+    def clear_appended_audio(self, ) -> None:
+        '''
+        clear the audio sent to server before.
+        '''
+        self.__send_str(
+            json.dumps({
+                'event_id': self._generate_event_id(),
+                'type': 'input_audio_buffer.clear'
+            }))
+
+    def create_response(self,
+                        instructions: str = None,
+                        output_modalities: List[MultiModality] = None) -> None:
+        '''
+        create response, use audio and video committed before to request llm.
+        When in Server VAD mode, the client does not need to use this method,
+        the server will create response automatically after detecting vad
+        and sending commit.
+
+        Parameters
+        ----------
+        instructions: str
+            instructions to llm
+        output_modalities: list[MultiModality]
+            omni output modalities to be used in session
+        '''
+        request = {
+            'event_id': self._generate_event_id(),
+            'type': 'response.create',
+            'response': {}
+        }
+        request['response']['instructions'] = instructions
+        if output_modalities:
+            request['response']['modalities'] = [
+                m.value for m in output_modalities
+            ]
+        self.__send_str(json.dumps(request))
+
+    def cancel_response(self, ) -> None:
+        '''
+        cancel the current response
+        '''
+        self.__send_str(
+            json.dumps({
+                'event_id': self._generate_event_id(),
+                'type': 'response.cancel'
+            }))
+
+    def send_raw(self, raw_data: str) -> None:
+        '''
+        send raw data to server
+        '''
+        self.__send_str(raw_data)
+
+    def close(self, ) -> None:
+        '''
+        close the connection to server
+        '''
+        self.ws.close()
+
+    # callback invoked for each incoming message
+    def on_message(self, ws, message):
+        if isinstance(message, str):
+            logger.debug('[omni realtime] receive string {}'.format(
+                message[:1024]))
+            try:
+                # try to parse the message as JSON
+                json_data = json.loads(message)
+                self.last_message = json_data
+                self.callback.on_event(json_data)
+                if 'type' in message:
+                    if 'session.created' == json_data['type']:
+                        self.session_id = json_data['session']['id']
+                    if 'response.created' == json_data['type']:
+                        self.last_response_id = json_data['response']['id']
+                        self.last_response_create_time = time.time() * 1000
+                        self.last_first_audio_delay = None
+                        self.last_first_text_delay = None
+                    elif 'response.audio_transcript.delta' == json_data[
+                            'type']:
+                        if self.last_response_create_time and self.last_first_text_delay is None:
+                            self.last_first_text_delay = time.time(
+                            ) * 1000 - self.last_response_create_time
+                    elif 'response.audio.delta' == json_data['type']:
+                        if self.last_response_create_time and self.last_first_audio_delay is None:
+                            self.last_first_audio_delay = time.time(
+                            ) * 1000 - self.last_response_create_time
+                    elif 'response.done' == json_data['type']:
+                        logger.info(
+                            '[Metric] response: {}, first text delay: {}, first audio delay: {}'
+                            .format(self.last_response_id,
+                                    self.last_first_text_delay,
+                                    self.last_first_audio_delay))
+            except json.JSONDecodeError:
+                logger.error('Failed to parse message as JSON.')
+                raise Exception('Failed to parse message as JSON.')
+        elif isinstance(message, (bytes, bytearray)):
+            # if not a string, treat it as a binary message
+            logger.error(
+                'should not receive binary message in omni realtime api')
+            logger.debug('[omni realtime] receive binary {} bytes'.format(
+                len(message)))
+
+    def on_close(self, ws, close_status_code, close_msg):
+        self.callback.on_close(close_status_code, close_msg)
+
+    # callback invoked on WebSocket errors
+    def on_error(self, ws, error):
+        print(f'websocket closed due to {error}')
+        raise Exception(f'websocket closed due to {error}')
+
+    # get the session id of the last task
+    def get_session_id(self) -> str:
+        return self.session_id
+
+    def get_last_message(self) -> str:
+        return self.last_message
+
+    def get_last_message(self) -> str:
+        return self.last_message
+
+    def get_last_response_id(self) -> str:
+        return self.last_response_id
+
+    def get_last_first_text_delay(self):
+        return self.last_first_text_delay
+
+    def get_last_first_audio_delay(self):
+        return self.last_first_audio_delay
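
To ground the new realtime client, a minimal end-to-end sketch built only from the class above; the api key, model name, voice, and the 16 kHz mono 16-bit PCM input file are placeholders/assumptions, and server VAD is left enabled so commits and responses happen automatically:

import base64
import time

import dashscope
from dashscope.audio.qwen_omni import (AudioFormat, MultiModality,
                                       OmniRealtimeCallback,
                                       OmniRealtimeConversation)


class MyCallback(OmniRealtimeCallback):
    def on_open(self) -> None:
        print('connection opened')

    def on_close(self, close_status_code, close_msg) -> None:
        print('connection closed')

    def on_event(self, message) -> None:
        # message is the parsed server event (a dict)
        print('event:', message.get('type'))


dashscope.api_key = 'your-api-key'  # placeholder
conversation = OmniRealtimeConversation(
    model='qwen-omni-turbo-realtime',  # placeholder model name
    callback=MyCallback())
conversation.connect()
conversation.update_session(
    output_modalities=[MultiModality.AUDIO, MultiModality.TEXT],
    voice='Chelsie',  # placeholder voice
    input_audio_format=AudioFormat.PCM_16000HZ_MONO_16BIT,
    output_audio_format=AudioFormat.PCM_24000HZ_MONO_16BIT)

# stream 16 kHz mono 16-bit PCM in ~100 ms chunks; with server VAD the
# server commits the buffer and creates responses on its own
with open('input.pcm', 'rb') as f:
    while chunk := f.read(3200):
        conversation.append_audio(base64.b64encode(chunk).decode('ascii'))
        time.sleep(0.1)

time.sleep(5)  # wait briefly for the final response events
conversation.close()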