agora-python-server-sdk 2.1.5__tar.gz → 2.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of agora-python-server-sdk might be problematic; see the registry's advisory page for more details.
- {agora_python_server_sdk-2.1.5/agora_python_server_sdk.egg-info → agora_python_server_sdk-2.1.6}/PKG-INFO +143 -2
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/README.md +142 -1
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_audio_frame_observer.py +20 -2
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_frame_observer.py +1 -1
- agora_python_server_sdk-2.1.6/agora/rtc/audio_vad_manager.py +59 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/local_user.py +4 -2
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/utils/audio_consumer.py +4 -3
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/voice_detection.py +3 -3
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6/agora_python_server_sdk.egg-info}/PKG-INFO +143 -2
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora_python_server_sdk.egg-info/SOURCES.txt +1 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/setup.py +1 -1
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/MANIFEST.in +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/__init__.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_ctypes_data.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_local_user_observer.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_rtc_connection_observer.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_video_encoded_frame_observer.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_video_frame_observer.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/_utils/globals.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/agora_base.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/agora_parameter.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/agora_service.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_encoded_frame_sender.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_pcm_data_sender.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_sessionctrl.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/local_audio_track.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/local_user_observer.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/local_video_track.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/media_node_factory.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/remote_audio_track.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/remote_video_track.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/rtc_connection.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/rtc_connection_observer.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/utils/vad_dump.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/video_encoded_frame_observer.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/video_encoded_image_sender.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/video_frame_observer.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/video_frame_sender.py +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora_python_server_sdk.egg-info/dependency_links.txt +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora_python_server_sdk.egg-info/top_level.txt +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/pyproject.toml +0 -0
- {agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/setup.cfg +0 -0
{agora_python_server_sdk-2.1.5/agora_python_server_sdk.egg-info → agora_python_server_sdk-2.1.6}/PKG-INFO RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: agora_python_server_sdk
-Version: 2.1.5
+Version: 2.1.6
 Summary: A Python SDK for Agora Server
 Home-page: https://github.com/AgoraIO-Extensions/Agora-Python-Server-SDK
 Classifier: Intended Audience :: Developers
@@ -29,7 +29,7 @@ Description-Content-Type: text/markdown
 - CentOS 7.0 and above
 
 - Supported Mac versions:
-- MacOS 13 and above
+- MacOS 13 and above (only for coding and testing)
 
 - Python version:
 - Python 3.10 and above
@@ -51,6 +51,28 @@ python agora_rtc/examples/example_audio_pcm_send.py --appId=xxx --channelId=xxx
 
 # Change log
 
+## 2024.12.09 Release 2.1.6
+- New Features:
+-- Added AudioVadManager to manage VAD (Voice Activity Detection) instances.
+-- Integrated VAD functionality into the SDK. Developers no longer need to worry about how to drive the VAD; they only need to set appropriate parameters. Reference: sample_audio_vad.py
+- Changes:
+-- In register_audio_frame_observer, two new parameters have been added to set the VAD parameters. Reference: sample_audio_vad.py
+-- In on_playback_audio_frame_before_mixing, two new return values have been added: vad_result_state and vad_result_bytearray.
+   state:
+   < 0: No internal automatic VAD applied
+   0: Not speaking
+   1: Started speaking
+   2: Speaking
+   3: Stopped speaking
+   vad_result_bytearray: the result processed by the VAD, returned when the VAD is active.
+   If automatic VAD is enabled:
+   developers should use vad_result_bytearray for subsequent business processing (e.g., sending to ASR/STT) rather than the raw frame data.
+   Reference: sample_audio_vad.py
+- Optimizations:
+-- Replaced the use of pacer with AudioConsumer for pushing PCM audio.
+- Updates:
+-- Updated the samples related to Pacer and VAD.
+
 ## 2024.12.03 release Version 2.1.5
 - Modifications:
 - LocalUser/audioTrack:
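As an illustration of the new callback shape described above (and shown in the agora/rtc/audio_frame_observer.py diff further down), here is a minimal observer sketch. It only routes vad_result_bytearray by vad_result_state; the send_to_asr helper is a hypothetical placeholder for an application's own ASR/STT pipeline, and sample_audio_vad.py remains the authoritative reference.

```python
# Minimal sketch of an observer that consumes the new VAD outputs.
from agora.rtc.audio_frame_observer import IAudioFrameObserver


def send_to_asr(pcm: bytearray) -> None:
    pass  # hypothetical: forward speech segments to your ASR/STT service


class MyAudioObserver(IAudioFrameObserver):
    def on_playback_audio_frame_before_mixing(self, agora_local_user, channelId, uid,
                                              frame, vad_result_state: int,
                                              vad_result_bytearray: bytearray):
        if vad_result_state < 0:
            # Internal VAD not applied: fall back to the raw frame data if needed.
            return 1
        if vad_result_state in (1, 2, 3) and vad_result_bytearray:
            # started speaking / speaking / stopped speaking: use the VAD output,
            # not the raw frame, for downstream processing.
            send_to_asr(vad_result_bytearray)
        return 1
```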
@@ -138,3 +160,122 @@ Judge the value of state according to the returned state, and do corresponding p
 # Source code: audio_consumer.py
 # Sample code: example_audio_consumer.py
 ### How to release resources?
+## How to release resources?
+localuser.unpublish_audio(audio_track)
+localuser.unpublish_video(video_track)
+audio_track.set_enabled(0)
+video_track.set_enabled(0)
+
+localuser.unregister_audio_frame_observer()
+localuser.unregister_video_frame_observer()
+localuser.unregister_local_user_observer()
+
+connection.disconnect()
+connection.unregister_observer()
+
+localuser.release()
+connection.release()
+
+audio_track.release()
+video_track.release()
+pcm_data_sender.release()
+video_data_sender.release()
+audio_consumer.release()
+
+media_node_factory.release()
+agora_service.release()
+
+# set to None
+audio_track = None
+video_track = None
+audio_observer = None
+video_observer = None
+local_observer = None
+localuser = None
+connection = None
+agora_service = None
+
+## Interrupt Handling in AI Scenarios
+# Definition of Interrupt
+In human-machine dialogue, an interrupt is the situation where the user suddenly cuts in while the robot is answering, asking it to stop its current response immediately and answer the user's new question instead.
+
+# Trigger Conditions for Interrupts
+Interrupts can be defined in different ways depending on the product. There are generally two modes:
+
+- Mode 1: Voice Activation Mode
+When the system detects that the user is speaking, the interrupt strategy is triggered. For example, as soon as speech is detected, the robot's response is stopped.
+
+- Mode 2: ASR Activation Mode
+When the system detects that the user is speaking and receives a result from ASR (Automatic Speech Recognition) or STT (Speech-to-Text), the interrupt strategy is triggered.
+
+# Advantages of Different Interrupt Strategies
+Voice Activation Interrupt
+
+Advantages:
+Reduces the user's waiting time and the interrupt latency, because the robot stops its response as soon as the user starts speaking; the user does not have to wait for the robot to finish.
+Disadvantages:
+Since this is voice-activated, it may be triggered by meaningless audio signals, depending on the accuracy of the VAD (Voice Activity Detection). For example, if someone is typing on the keyboard while the AI is speaking, it might trigger the interrupt incorrectly.
+ASR Activation Interrupt
+
+Advantages:
+Reduces the probability of unnecessary interrupts, because the interrupt strategy is triggered only after ASR or STT has recognized the user's speech.
+Disadvantages:
+Since this is ASR/STT-triggered, the audio signal must first be converted into text, which introduces a delay before the interrupt can be processed.
+- Recommended Mode
+If the VAD can filter out non-speech signals and triggers only when human speech is detected, the Voice Activation Mode is recommended; it is also the right choice when low interrupt latency matters.
+
+If you are not sensitive to interrupt latency, the ASR Activation Mode is recommended. It filters out non-speech signals more effectively and reduces the probability of unintended interrupts.
+
+How to Implement Interrupts? What Actions Are Required?
+In a human-machine dialogue system, conversations are typically structured in "rounds": a question from the user followed by a response from the robot, and so on. Each round can be assigned a roundId, incremented with every new round. A round consists of the following stages:
+
+VAD (Voice Activity Detection):
+This marks the start of the dialogue; the system detects the beginning and end of the user's speech and passes the speech to the ASR for further processing.
+
+ASR (Automatic Speech Recognition):
+This phase recognizes the user's speech and converts it into text, which is then passed to the LLM (Large Language Model).
+
+LLM (Large Language Model):
+This is the generation phase, where the LLM processes the recognized user input and generates a response.
+
+TTS (Text-to-Speech):
+In this phase, the LLM's response is converted into audio.
+
+RTC Streaming:
+The generated audio is streamed via RTC (Real-Time Communication) and played back to the user.
+
+Therefore, when the next round (roundId+1) is triggered, either by Voice Activation (in the VAD phase) or by ASR Activation (when ASR recognizes the user's speech), the following actions must be performed for the current round (roundId):
+
+Stop the LLM generation of the current round (roundId).
+Stop the TTS synthesis of the current round (roundId).
+Stop the RTC streaming of the current round (roundId).
+API Call References:
+Call: AudioConsumer.clear()
+Call: LocalAudioTrack.clear_sender_buffer()
+Business Layer: Clear any remaining TTS-related data (if applicable)
+
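To make the three stop actions concrete, here is a hedged sketch of how an application might wire them together. AudioConsumer.clear() and LocalAudioTrack.clear_sender_buffer() are the SDK calls named above; stop_llm_generation and stop_tts_synthesis are hypothetical hooks into an application's own LLM and TTS pipelines, and the roundId bookkeeping is only one possible scheme.

```python
# Illustrative interrupt handler; not part of the SDK.
current_round_id = 0


def stop_llm_generation(round_id: int) -> None:
    pass  # hypothetical: cancel the streaming LLM request of this round


def stop_tts_synthesis(round_id: int) -> None:
    pass  # hypothetical: cancel the TTS job of this round


def on_interrupt(new_round_id: int, audio_consumer, audio_track) -> None:
    """Called when the VAD (voice activation) or ASR (ASR activation) starts round new_round_id."""
    global current_round_id
    if new_round_id <= current_round_id:
        return                                  # stale trigger, ignore
    stop_llm_generation(current_round_id)       # stop the current round's LLM output
    stop_tts_synthesis(current_round_id)        # stop the current round's TTS output
    audio_consumer.clear()                      # drop PCM not yet handed to the sender
    audio_track.clear_sender_buffer()           # drop PCM already queued for RTC streaming
    current_round_id = new_round_id
```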
+
+## When to Pass LLM Results to TTS for Synthesis?
+LLM (Large Language Model) results are returned asynchronously and in a streaming manner. When should the results from the LLM be passed to TTS (Text-to-Speech) for synthesis?
+
+Two main factors need to be considered:
+
+Ensure that the TTS-synthesized speech is unambiguous:
+The speech synthesized by TTS must be clear, complete, and continuous. For example, if the LLM returns the text "中国的首都是北京吗?" ("Is Beijing the capital of China?") and we pass it to TTS as:
+
+"中",
+"国首",
+"是北",
+"京吗?",
+the synthesis becomes ambiguous, because words are split across chunk boundaries (e.g., "中" is separated from "国", and "京" from "吗"). Proper segmentation must be ensured to avoid such ambiguities.
+Minimize overall processing delay:
+If the LLM results are passed to TTS only after the entire response has been generated, the synthesized speech will be unambiguous and continuous, but this introduces significant delay and hurts the user experience.
+
+Recommended Approach
+To balance clarity and delay, follow these steps:
+
+Store the LLM results in a cache as they are received.
+Scan the cached data backwards to find the most recent punctuation mark.
+Cut the data from the start of the cache up to that punctuation mark and pass it to TTS for synthesis.
+Remove the cut data from the cache, move the remaining data to the beginning of the cache, and continue waiting for additional data from the LLM.
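A minimal sketch of that punctuation-based chunking follows. It is illustrative only: the punctuation set and class name are assumptions, and the actual TTS call is omitted.

```python
# Punctuation-based chunking of streamed LLM text for TTS.
PUNCTUATION = "。!?!?,,;;:"   # assumed set; adjust for your language and TTS engine


class TtsChunker:
    def __init__(self) -> None:
        self._cache = ""

    def consume_llm_delta(self, delta: str) -> list[str]:
        """Feed one streamed LLM fragment; return chunks ready for TTS (possibly empty)."""
        self._cache += delta
        # reverse scan for the most recent punctuation mark
        cut = -1
        for i in range(len(self._cache) - 1, -1, -1):
            if self._cache[i] in PUNCTUATION:
                cut = i
                break
        if cut < 0:
            return []
        chunk = self._cache[:cut + 1]     # everything up to and including the punctuation
        self._cache = self._cache[cut + 1:]  # keep the remainder for the next delta
        return [chunk]

    def flush(self) -> str:
        """Return whatever is left once the LLM stream has finished."""
        rest, self._cache = self._cache, ""
        return rest
```

Each returned chunk ends at a punctuation mark, so the TTS input stays coherent while the delay is bounded by the LLM's streaming rate rather than by the full response length.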
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/README.md RENAMED
(The README.md changes are identical to the README portion of the PKG-INFO diff above: the MacOS note, the 2024.12.09 Release 2.1.6 changelog entry, and the new "How to release resources?", "Interrupt Handling in AI Scenarios", and "When to Pass LLM Results to TTS for Synthesis?" sections, at hunks @@ -14,7 +14,7 @@, @@ -36,6 +36,28 @@, and @@ -123,3 +145,122 @@.)
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_audio_frame_observer.py RENAMED
@@ -5,6 +5,7 @@ import ctypes
 from ..audio_frame_observer import *
 import logging
 logger = logging.getLogger(__name__)
+from ..audio_vad_manager import AudioVadManager
 #from ..audio_sessionctrl import *
 
 ON_RECORD_AUDIO_FRAME_CALLBACK = ctypes.CFUNCTYPE(ctypes.c_int, AGORA_HANDLE, ctypes.c_char_p, ctypes.POINTER(AudioFrameInner))
@@ -35,7 +36,7 @@ class AudioFrameObserverInner(ctypes.Structure):
         ("on_get_ear_monitoring_audio_frame_param", ON_GET_EAR_MONITORING_AUDIO_FRAME_PARAM_CALLBACK)
     ]
 
-    def __init__(self, observer: IAudioFrameObserver, local_user: 'LocalUser'):
+    def __init__(self, observer: IAudioFrameObserver, local_user: 'LocalUser', enable_vad: int, vad_configure):
         self.observer = observer
         self.local_user = local_user
         self.on_record_audio_frame = ON_RECORD_AUDIO_FRAME_CALLBACK(self._on_record_audio_frame)
@@ -45,6 +46,8 @@ class AudioFrameObserverInner(ctypes.Structure):
         self.on_playback_audio_frame_before_mixing = ON_PLAYBACK_AUDIO_FRAME_BEFORE_MIXING_CALLBACK(self._on_playback_audio_frame_before_mixing)
         self.on_get_audio_frame_position = ON_GET_AUDIO_FRAME_POSITION_CALLBACK(self._on_get_audio_frame_position)
         self._session_ctrl_manager = None  # SessionCtrlManager()
+        self._vad_instance_manager = AudioVadManager(vad_configure) if enable_vad else None
+        self._enable_vad = True if enable_vad > 0 else False
 
         # self.on_get_playback_audio_frame_param = ON_GET_PLAYBACK_AUDIO_FRAME_PARAM_CALLBACK(self._on_get_playback_audio_frame_param)
         # self.on_get_record_audio_frame_param = ON_GET_RECORD_AUDIO_FRAME_PARAM_CALLBACK(self._on_get_record_audio_frame_param)
@@ -91,7 +94,15 @@ class AudioFrameObserverInner(ctypes.Structure):
 
         user_id_str = user_id.decode('utf-8')
         frame = audio_frame_inner.contents.get()
-
+        # Keep a map: key {channel_id, user_id}, value: a VadV2 instance,
+        # and run the VAD inside this callback.
+        # When should keys be created and removed from the map?
+        # The clear()/__del__ path releases the VadV2 instances.
+        if self._enable_vad:
+            vad_result_state, vad_result_bytes = self._vad_instance_manager.process(channel_id_str, user_id_str, frame)
+            ret = self.observer.on_playback_audio_frame_before_mixing(self.local_user, channel_id_str, user_id_str, frame, vad_result_state, vad_result_bytes)
+        else:
+            ret = self.observer.on_playback_audio_frame_before_mixing(self.local_user, channel_id_str, user_id_str, frame, -1, None)
         return ret
 
     def _on_get_audio_frame_position(self, local_user_handle):
@@ -113,3 +124,10 @@ class AudioFrameObserverInner(ctypes.Structure):
     def _on_get_ear_monitoring_audio_frame_param(self, local_user_handle) -> AudioParams:
         logger.debug(f"AudioFrameObserverInner _on_get_ear_monitoring_audio_frame_param: {local_user_handle}")
         return self.observer.on_get_ear_monitoring_audio_frame_param(self.local_user)
+    def clear(self):
+        # disable vad
+        self._enable_vad = False
+        if self._vad_instance_manager:
+            self._vad_instance_manager.release()
+            self._vad_instance_manager = None
+        pass
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_frame_observer.py RENAMED
@@ -14,7 +14,7 @@ class IAudioFrameObserver:
     def on_ear_monitoring_audio_frame(self, agora_local_user, frame):
         return 1
 
-    def on_playback_audio_frame_before_mixing(self, agora_local_user, channelId, uid, frame: AudioFrame):
+    def on_playback_audio_frame_before_mixing(self, agora_local_user, channelId, uid, frame: AudioFrame, vad_result_state: int, vad_result_bytearray: bytearray):
         return 1
 
     def on_get_audio_frame_position(self, agora_local_user):
agora_python_server_sdk-2.1.6/agora/rtc/audio_vad_manager.py (new file)
@@ -0,0 +1,59 @@
+import ctypes
+from .agora_base import *
+import ctypes
+from .audio_frame_observer import *
+from .voice_detection import AudioVadV2, AudioVadConfigV2
+import logging
+from threading import Lock
+logger = logging.getLogger(__name__)
+
+# Different VAD configurations need to be considered: when should they be set?
+# An external interface could be exposed: update(channel_id, user_id, vad_config),
+# plus an overall configure interface: update_all(channel_id, vad_config).
+# The default configuration comes from the service configure.
+class AudioVadManager():
+    def __init__(self, configure: AudioVadConfigV2) -> None:
+        self._instance_map = {}  # dict keyed by channel_id + user_id
+        self._vad_config = configure
+        self._lock = Lock()
+        self._is_init = True
+        pass
+    def _make_key(self, channel_id: str, user_id: str) -> str:
+        return channel_id + user_id
+    def get_vad_instance(self, channel_id: str, user_id: str) -> AudioVadV2:
+        key = self._make_key(channel_id, user_id)
+        with self._lock:
+            return self._instance_map.get(key, None)
+    # note: inner function, not thread safe, but should be OK since it is called from thread-safe callers
+    def _add_vad_instance(self, channel_id: str, user_id: str) -> int:
+        key = self._make_key(channel_id, user_id)
+        self._instance_map[key] = AudioVadV2(self._vad_config)
+        return 0
+        pass
+    def del_vad_instance(self, channel_id: str, user_id: str) -> None:
+        key = self._make_key(channel_id, user_id)
+        with self._lock:
+            self._instance_map.pop(key, None)
+    def get_vad_instance(self, channel_id: str, user_id: str) -> AudioVadV2:
+        key = self._make_key(channel_id, user_id)
+        with self._lock:
+            return self._instance_map.get(key, None)
+    def process(self, channel_id: str, user_id: str, frame: AudioFrame) -> tuple[int, bytearray]:
+        if self._is_init is False:
+            return -2, None
+        vad_instance = self.get_vad_instance(channel_id, user_id)
+        if vad_instance is not None:
+            return vad_instance.process(frame)
+        else:
+            # add a new one
+            self._add_vad_instance(channel_id, user_id)
+            return -1, None
+        pass
+    def release(self) -> None:
+        print("____release vad manager: ", len(self._instance_map))
+        if self._is_init is False:
+            return
+        self._is_init = False
+        with self._lock:
+            self._instance_map.clear()
+        pass
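For orientation, a hedged sketch of driving this manager directly follows; applications normally do not need to, since AudioFrameObserverInner calls it internally. Constructing AudioVadConfigV2 with defaults and the downstream handler are assumptions, not SDK guarantees.

```python
# Sketch only: exercising AudioVadManager outside the SDK callback path.
from agora.rtc.audio_vad_manager import AudioVadManager
from agora.rtc.voice_detection import AudioVadConfigV2

manager = AudioVadManager(AudioVadConfigV2())   # assumption: default-constructed config


def handle_speech_segment(state: int, speech: bytearray) -> None:
    pass  # hypothetical: forward the VAD output to ASR/STT


def feed(channel_id: str, user_id: str, frame) -> None:
    state, speech = manager.process(channel_id, user_id, frame)
    # -2: manager already released; -1: first frame for this (channel, user), instance just created
    if state >= 0 and speech:
        handle_speech_segment(state, speech)
```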
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/local_user.py RENAMED
@@ -413,10 +413,10 @@ class LocalUser:
         ret = agora_local_user_set_playback_audio_frame_before_mixing_parameters(self.user_handle, channels, sample_rate_hz)
         return ret
 
-    def register_audio_frame_observer(self, observer: IAudioFrameObserver):
+    def register_audio_frame_observer(self, observer: IAudioFrameObserver, enable_vad: int, vad_configure):
         if self.audio_frame_observer:
             self.unregister_audio_frame_observer()
-        self.audio_frame_observer = AudioFrameObserverInner(observer, self)
+        self.audio_frame_observer = AudioFrameObserverInner(observer, self, enable_vad, vad_configure)
         ret = agora_local_user_register_audio_frame_observer(self.user_handle, self.audio_frame_observer)
         return ret
 
@@ -424,6 +424,8 @@ class LocalUser:
         ret = 0
         if self.audio_frame_observer:
             ret = agora_local_user_unregister_audio_frame_observer(self.user_handle)
+            # clear observerInner-related resources
+            self.audio_frame_observer.clear()
         self.audio_frame_observer = None
         return ret
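Putting the registration change together, a hedged usage sketch follows. It assumes localuser is an already-created LocalUser and that vad_configure is the AudioVadConfigV2 consumed by AudioVadManager, default-constructed here; sample_audio_vad.py is the authoritative reference.

```python
# Sketch: registering an audio frame observer with the internal VAD enabled.
from agora.rtc.audio_frame_observer import IAudioFrameObserver
from agora.rtc.voice_detection import AudioVadConfigV2


class SilentObserver(IAudioFrameObserver):
    def on_playback_audio_frame_before_mixing(self, agora_local_user, channelId, uid,
                                              frame, vad_result_state, vad_result_bytearray):
        return 1  # handle vad_result_state / vad_result_bytearray as in the earlier sketch


def attach_vad_observer(localuser):
    vad_configure = AudioVadConfigV2()   # assumption: default config; tune per your scenario
    observer = SilentObserver()
    localuser.register_audio_frame_observer(observer, 1, vad_configure)  # enable_vad=1
    return observer
```

During teardown, localuser.unregister_audio_frame_observer() also releases the internal AudioVadManager via the new clear() call shown in the diff above.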
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/utils/audio_consumer.py RENAMED
@@ -69,7 +69,7 @@ class AudioConsumer:
     def consume(self):
         print("consume begin")
         if self._init == False:
-            return
+            return -1
         now = time.time()*1000
         elapsed_time = int(now - self._start_time)
         expected_total_packages = int(elapsed_time//10)
@@ -78,7 +78,7 @@ class AudioConsumer:
 
         if besent_packages > 18 and data_len//self._bytes_per_frame < 18:  # for the first time, if data_len is not enough, just return and wait for the next round
             #print("-----underflow data")
-            return
+            return -2
         if besent_packages > 18:  # reset to start state, push 18 packs in Start_STATE
             self._reset()
             besent_packages = min(18, data_len//self._bytes_per_frame)
@@ -89,7 +89,7 @@ class AudioConsumer:
         act_besent_packages = (int)(min(besent_packages, data_len//self._bytes_per_frame))
         #print("consume 1:", act_besent_packages, data_len)
         if act_besent_packages < 1:
-            return
+            return 0
 
         #construct an audio frame to push
         #frame = PcmAudioFrame()
@@ -104,6 +104,7 @@ class AudioConsumer:
         self._consumed_packages += act_besent_packages
 
         self._pcm_sender.send_audio_pcm_data(self._frame)
+        return self._consumed_packages
         #print(f"act_besent_packages: {now},{now - self._start_time}, {besent_packages}, {act_besent_packages},{self._consumed_packages},{data_len}")
         pass
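With consume() now returning a status instead of None, a pacing loop can react to it. The sketch below is illustrative only; the meaning of the codes (-1 not initialized, -2 startup underflow, 0 nothing due this tick, positive = cumulative packages consumed) is read off the diff above rather than official documentation, and audio_consumer is assumed to be an AudioConsumer fed with PCM elsewhere.

```python
# Sketch: a 10 ms pacing loop around AudioConsumer.consume(), using the new return codes.
import time


def pump(audio_consumer, stop_event) -> None:
    while not stop_event.is_set():
        ret = audio_consumer.consume()
        if ret == -1:
            break                 # consumer not initialized / already released
        # -2: not enough buffered PCM yet; 0: nothing sent this tick; >0: cumulative packages sent
        time.sleep(0.01)          # the consumer paces PCM in 10 ms packages
```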
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/voice_detection.py RENAMED
@@ -115,7 +115,7 @@ class AudioVadV2():
 
         ratios = self._calculate_sliding_window_ratio(queue, self._trend_window)
         # calculate the trend
-        print(ratios)
+        #print(ratios)
         return 1 if ratios[1] > ratios[0] else 0
 
     # get silence count from deque: total_count, silence_count
@@ -156,7 +156,7 @@ class AudioVadV2():
 
         # and clear pre & start
         self._clear_queue(self._stop_queue)
-        print("start speaking:", len(self._stop_queue))
+        #print("start speaking:", len(self._stop_queue))
 
         return state, bytes
 
@@ -165,7 +165,7 @@ class AudioVadV2():
         # if the data queue is full, decide whether to trigger stop
         state = self._cur_state
         size, full = self._push_to_stop(data)
-        print(f"stop: {size}, {full}")
+        #print(f"stop: {size}, {full}")
 
 
         if full == True:
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6/agora_python_server_sdk.egg-info}/PKG-INFO RENAMED
(Same changes as the top-level PKG-INFO diff above: the version bump to 2.1.6, the MacOS note, the 2024.12.09 Release 2.1.6 changelog entry, and the new README sections.)
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora_python_server_sdk.egg-info/SOURCES.txt RENAMED
@@ -10,6 +10,7 @@ agora/rtc/audio_encoded_frame_sender.py
 agora/rtc/audio_frame_observer.py
 agora/rtc/audio_pcm_data_sender.py
 agora/rtc/audio_sessionctrl.py
+agora/rtc/audio_vad_manager.py
 agora/rtc/local_audio_track.py
 agora/rtc/local_user.py
 agora/rtc/local_user_observer.py
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/setup.py RENAMED
@@ -45,7 +45,7 @@ class CustomInstallCommand(install):
 
 setup(
     name='agora_python_server_sdk',
-    version='2.1.5',
+    version='2.1.6',
     description='A Python SDK for Agora Server',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/agora_parameter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_pcm_data_sender.py
RENAMED
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_sessionctrl.py
RENAMED
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/local_audio_track.py
RENAMED
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/local_user_observer.py
RENAMED
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/local_video_track.py
RENAMED
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/media_node_factory.py
RENAMED
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/remote_audio_track.py
RENAMED
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/remote_video_track.py
RENAMED
|
File without changes
|
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/rtc_connection_observer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/video_frame_observer.py
RENAMED
|
File without changes
|
{agora_python_server_sdk-2.1.5 → agora_python_server_sdk-2.1.6}/agora/rtc/video_frame_sender.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|