agora-python-server-sdk 2.1.4__tar.gz → 2.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (46)
  1. agora_python_server_sdk-2.1.6/PKG-INFO +281 -0
  2. agora_python_server_sdk-2.1.6/README.md +266 -0
  3. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_audio_frame_observer.py +20 -2
  4. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/agora_service.py +11 -2
  5. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_frame_observer.py +1 -1
  6. agora_python_server_sdk-2.1.6/agora/rtc/audio_vad_manager.py +59 -0
  7. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/local_user.py +5 -3
  8. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/rtc_connection.py +7 -4
  9. agora_python_server_sdk-2.1.6/agora/rtc/utils/audio_consumer.py +134 -0
  10. agora_python_server_sdk-2.1.6/agora/rtc/utils/vad_dump.py +104 -0
  11. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/voice_detection.py +3 -3
  12. agora_python_server_sdk-2.1.6/agora_python_server_sdk.egg-info/PKG-INFO +281 -0
  13. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora_python_server_sdk.egg-info/SOURCES.txt +3 -1
  14. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/setup.py +2 -2
  15. agora_python_server_sdk-2.1.4/PKG-INFO +0 -51
  16. agora_python_server_sdk-2.1.4/README.md +0 -36
  17. agora_python_server_sdk-2.1.4/agora/rtc/audio_vad.py +0 -164
  18. agora_python_server_sdk-2.1.4/agora_python_server_sdk.egg-info/PKG-INFO +0 -51
  19. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/MANIFEST.in +0 -0
  20. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/__init__.py +0 -0
  21. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_ctypes_data.py +0 -0
  22. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_local_user_observer.py +0 -0
  23. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_rtc_connection_observer.py +0 -0
  24. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_video_encoded_frame_observer.py +0 -0
  25. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/_ctypes_handle/_video_frame_observer.py +0 -0
  26. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/_utils/globals.py +0 -0
  27. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/agora_base.py +0 -0
  28. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/agora_parameter.py +0 -0
  29. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_encoded_frame_sender.py +0 -0
  30. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_pcm_data_sender.py +0 -0
  31. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/audio_sessionctrl.py +0 -0
  32. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/local_audio_track.py +0 -0
  33. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/local_user_observer.py +0 -0
  34. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/local_video_track.py +0 -0
  35. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/media_node_factory.py +0 -0
  36. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/remote_audio_track.py +0 -0
  37. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/remote_video_track.py +0 -0
  38. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/rtc_connection_observer.py +0 -0
  39. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/video_encoded_frame_observer.py +0 -0
  40. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/video_encoded_image_sender.py +0 -0
  41. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/video_frame_observer.py +0 -0
  42. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora/rtc/video_frame_sender.py +0 -0
  43. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora_python_server_sdk.egg-info/dependency_links.txt +0 -0
  44. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/agora_python_server_sdk.egg-info/top_level.txt +0 -0
  45. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/pyproject.toml +0 -0
  46. {agora_python_server_sdk-2.1.4 → agora_python_server_sdk-2.1.6}/setup.cfg +0 -0
@@ -0,0 +1,281 @@
1
+ Metadata-Version: 2.1
2
+ Name: agora_python_server_sdk
3
+ Version: 2.1.6
4
+ Summary: A Python SDK for Agora Server
5
+ Home-page: https://github.com/AgoraIO-Extensions/Agora-Python-Server-SDK
6
+ Classifier: Intended Audience :: Developers
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Topic :: Multimedia :: Sound/Audio
9
+ Classifier: Topic :: Multimedia :: Video
10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+
16
+ # Note
17
+ - This is a Python SDK wrapper for the Agora RTC SDK.
18
+ - It supports Linux and Mac platforms.
19
+ - The examples are provided as very simple demonstrations and are not recommended for use in production environments.
20
+
21
+ # Very Important Notice !!!
22
+ - A process can only have one instance.
23
+ - An instance can have multiple connections.
24
+ - In all observers or callbacks, you must not call the SDK's own APIs, nor perform CPU-intensive tasks in the callbacks; data copying is allowed.
25
+
26
+ # Required Operating Systems and Python Versions
27
+ - Supported Linux versions:
28
+ - Ubuntu 18.04 LTS and above
29
+ - CentOS 7.0 and above
30
+
31
+ - Supported Mac versions:
32
+ - macOS 13 and above (only for coding and testing)
33
+
34
+ - Python version:
35
+ - Python 3.10 and above
36
+
37
+ # Using Agora-Python-Server-SDK
38
+ ```
39
+ pip install agora_python_server_sdk
40
+ ```
41
+
42
+ # Running Examples
43
+
44
+ ## Preparing Test Data
45
+ - Download and unzip [test_data.zip](https://download.agora.io/demo/test/test_data_202408221437.zip) to the Agora-Python-Server-SDK directory.
46
+
47
+ ## Executing Test Script
48
+ ```
49
+ python agora_rtc/examples/example_audio_pcm_send.py --appId=xxx --channelId=xxx --userId=xxx --audioFile=./test_data/demo.pcm --sampleRate=16000 --numOfChannels=1
50
+ ```
51
+
52
+ # Change log
53
+
54
+ ## 2024.12.09 Release 2.1.6
55
+ - New Features:
56
+ -- Added AudioVadManager to manage VAD (Voice Activity Detection) instances.
57
+ -- Integrated VAD functionality into the SDK. Developers no longer need to worry about how to use VAD; they only need to focus on setting appropriate parameters. Reference: sample_audio_vad.py
58
+ - Changes:
59
+ -- In register_audio_frame_observer, two new parameters have been added to set the VAD parameters. Reference: sample_audio_vad.py
60
+ -- In on_playback_audio_frame_before_mixing, two new return values have been added: vad_result_state and vad_result_bytearray.
61
+ state:
62
+ < 0: No internal automatic VAD applied
63
+ 0: No speaking
64
+ 1: Started speaking
65
+ 2: Speaking
66
+ 3: Stopped speaking
67
+ vad_result_bytearray: The result processed by VAD, returned when VAD is active.
68
+ If automatic VAD is enabled:
69
+ Developers should use vad_result_bytearray for subsequent business processing (e.g., sending to ASR/STT), rather than using the raw frame data.
70
+ Reference: sample_audio_vad.py (a minimal callback sketch follows this entry)
71
+ - Optimizations:
72
+ -- Replaced the use of pacer with AudioConsumer for pushing PCM audio.
73
+ - Updates:
74
+ -- Updated the samples related to Pacer and VAD.
75
+
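A minimal sketch of consuming the new callback values, assuming a subclass of IAudioFrameObserver from this SDK; the start_asr_session/send_to_asr/finish_asr_session helpers are hypothetical placeholders for your own pipeline.

```
from agora.rtc.audio_frame_observer import IAudioFrameObserver

class MyAudioObserver(IAudioFrameObserver):
    # Sketch only: consumes the vad_result_state / vad_result_bytearray added in 2.1.6.
    def on_playback_audio_frame_before_mixing(self, agora_local_user, channelId, uid,
                                              frame, vad_result_state: int,
                                              vad_result_bytearray: bytearray):
        if vad_result_state < 0:
            # Internal automatic VAD is not enabled; fall back to the raw frame.
            return 1
        if vad_result_state == 1:                 # started speaking
            self.start_asr_session(uid)           # hypothetical helper
        if vad_result_bytearray:
            # Feed the VAD-processed bytes, not the raw frame, to ASR/STT.
            self.send_to_asr(uid, vad_result_bytearray)   # hypothetical helper
        if vad_result_state == 3:                 # stopped speaking
            self.finish_asr_session(uid)          # hypothetical helper
        return 1
```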
76
+ ## 2024.12.03 release Version 2.1.5
77
+ - Modifications:
78
+ - LocalUser/audioTrack:
79
+ -- When the scenario is chorus, developers don't need to call setSendDelayInMs.
80
+ -- When the scenario is chorus, developers don't need to set the audio scenario of the track to chorus.
81
+ -- NOTE: This can reduce the difficulty for developers. In AI scenarios, developers only need to set the service to chorus.
82
+ - Additions:
83
+ -- Added the VadDump class, which can assist in troubleshooting vad issues in the testing environment. However, it should not be enabled in the online environment.
84
+ -- Added the on_volume_indication callback.
85
+ -- Added the on_remote_video_track_state_changed callback.
86
+ - Removals:
87
+ -- Removed Vad V1 version, only retaining the V2 version. Refer to voice_detection.py and sample_audio_vad.py.
88
+ - Updates:
89
+ -- Updated relevant samples: audioconsume, vad sample.
90
+
91
+ ## 2024.11.12 release 2.1.4
92
+ - Modify the type of metadata in videoFrame from str to bytes type to be consistent with C++; thus, it can support byte streams.
93
+ - The internal encapsulation of ExternalVideoFrame has been modified to support byte streams. Regarding the support for alpha encoding, a logical judgment has been made: if fill_alpha_buffer is 0, it will not be processed.
94
+ ## 2024.11.11 release 2.1.3
95
+ - Added a new sample: example_jpeg_send.py which can push JPEG files or JPEG streams to a channel.
96
+
97
+ - Performance overhead, as noted in the example comments, can be summarized as follows:
98
+ - For a 1920x1080 JPEG file, the process from reading the file to converting it to an RGBA bytearray takes approximately 11 milliseconds.
99
+
100
+
101
+ ## 2024.11.07 release 2.1.2
102
+ - Updates `user_id` in the `AudioVolumeInfoInner` and `AudioVolumeInfo` structures to `str` type.
103
+ - Fixes the bug in the `_on_audio_volume_indication` callback where only one speaker was handled instead of all `speaker_number` entries.
104
+ - Corrects the parameter type in `IRTCLocalUserObserver::on_audio_volume_indication` callback to `list` type.
105
+
106
+ ## 2024.10.29 release 2.1.1
107
+
108
+ Added the version 2 audio VAD interface and a corresponding example.
109
+
110
+ ## 2024.10.24 release 2.1.0
111
+
112
+ Fixed some bugs.
113
+
114
+
115
+ ### Common Usage Q&A
116
+ ## What is the relationship between a service and a process?
117
+ - A process can only have one service, and the service can only be initialized once.
118
+ - A service can only have one media_node_factory.
119
+ - A service can have multiple connections.
120
+ - Call media_node_factory.release() and service.release() when the process exits (see the lifecycle sketch below).
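A minimal per-process lifecycle sketch of the rules above. Class and method names such as AgoraServiceConfig, initialize() and create_media_node_factory() are assumptions based on this SDK's agora_service.py and its examples; verify them against the samples.

```
from agora.rtc.agora_service import AgoraService, AgoraServiceConfig  # assumed names

def main():
    config = AgoraServiceConfig()        # fill in app_id, audio_scenario, log_path, ...
    service = AgoraService()
    service.initialize(config)           # one service per process, initialized once

    media_node_factory = service.create_media_node_factory()  # exactly one factory

    # ... create connections, tracks and observers here ...

    # Release in reverse order when the process exits.
    media_node_factory.release()
    service.release()

if __name__ == "__main__":
    main()
```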
121
+ ## If using Docker with one user per container, how should resources be released when the user logs out and the container exits?
122
+ - In this case, create service/media_node_factory and connection when the process starts.
123
+ - Release service/media_node_factory and connection when the process exits, ensuring that...
124
+ ## What should be done if one Docker container serves multiple users and runs for a long time?
125
+ - In this case, we recommend using the concept of a connection pool.
126
+ - Create service/media_node_factory and a connection pool (only new connections, without initialization) when the process starts.
127
+ - When a user logs in, get a connection from the connection pool, initialize it, execute con.connect() and set up callbacks, and then join the channel.
128
+ - Handle business operations.
129
+ - When a user logs out, execute con.disconnect() and release the audio/video tracks and observers associated with the connection, but do not call con.release(); then put the connection back into the connection pool.
130
+ - When the process exits, release each connection in the pool with con.release(), then release service/media_node_factory, to ensure resources are freed and performance stays optimal; a minimal pool sketch follows.
131
+
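A rough sketch of the connection-pool pattern described above. The creation call (create_rtc_connection) and the connect() argument order are assumptions; only connect()/disconnect()/release() are named in this Q&A.

```
import queue

class ConnectionPool:
    def __init__(self, agora_service, size: int):
        self._pool = queue.Queue()
        for _ in range(size):
            # Only construct connections up front; do not initialize or connect yet.
            self._pool.put(agora_service.create_rtc_connection(None))  # assumed API

    def acquire(self, token, channel_id, user_id):
        con = self._pool.get()
        # Register connection observers here, then join the channel.
        con.connect(token, channel_id, user_id)
        return con

    def give_back(self, con):
        # Leave the channel and release per-user tracks/observers,
        # but keep the connection alive for reuse: do NOT call con.release() here.
        con.disconnect()
        self._pool.put(con)

    def shutdown(self):
        # On process exit, fully release every pooled connection.
        while not self._pool.empty():
            self._pool.get().release()
```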
132
+ ## Use of VAD
133
+ # Source code: voice_detection.py
134
+ # Sample code: example_audio_vad.py
135
+ # It is recommended to use VAD V2 version, and the class is: AudioVadV2; Reference: voice_detection.py.
136
+ # Use of VAD:
137
+ 1. Call _vad_instance.init(AudioVadConfigV2) to initialize the vad instance. Reference: voice_detection.py. Assume the instance is: _vad_instance
138
+ 2. In audio_frame_observer::on_playback_audio_frame_before_mixing(audio_frame):
139
+
140
+ 3. Call the vad module's process method: state, bytes = _vad_instance.process(audio_frame)
141
+ Inspect the returned state and handle it accordingly.
142
+
143
+ A. If state is _vad_instance._vad_state_startspeaking, it indicates that the user is "starting to speak", and speech recognition (STT/ASR) operations can be started. Remember: be sure to pass the returned bytes to the recognition module instead of the original audio_frame, otherwise the recognition result will be incorrect.
144
+ B. If state is _vad_instance._vad_state_stopspeaking, it indicates that the user has stopped speaking, and speech recognition (STT/ASR) operations can be stopped. Remember: be sure to pass the returned bytes to the recognition module instead of the original audio_frame, otherwise the recognition result will be incorrect.
145
+ C. If state is _vad_instance._vad_state_speaking, it indicates that the user is "speaking", and speech recognition (STT/ASR) operations can be continued. Remember: be sure to pass the returned bytes to the recognition module instead of the original audio_frame, otherwise the recognition result will be incorrect.
146
+ # Note:
147
+ If the vad module is used and it is expected to use the vad module for speech recognition (STT/ASR) and other operations, then be sure to pass the returned bytes to the recognition module instead of the original audio_frame, otherwise the recognition result will be incorrect.
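A sketch of steps 1-3 above, assuming the AudioVadV2/AudioVadConfigV2 classes from voice_detection.py; the constructor/config arguments and the exact state attribute names should be checked against that file, and the ASR helpers are placeholders.

```
from agora.rtc.voice_detection import AudioVadV2, AudioVadConfigV2  # names from voice_detection.py

_vad_instance = AudioVadV2()
_vad_instance.init(AudioVadConfigV2())   # step 1: tune the config for your scenario

def on_playback_audio_frame_before_mixing(agora_local_user, channel_id, uid, audio_frame):
    # steps 2-3: run VAD on every playback frame before mixing
    state, vad_bytes = _vad_instance.process(audio_frame)
    if state == _vad_instance._vad_state_startspeaking:
        start_asr(uid)                   # placeholder: open an ASR/STT session
        send_to_asr(uid, vad_bytes)      # always feed the VAD output, never audio_frame
    elif state == _vad_instance._vad_state_speaking:
        send_to_asr(uid, vad_bytes)
    elif state == _vad_instance._vad_state_stopspeaking:
        send_to_asr(uid, vad_bytes)
        finish_asr(uid)                  # placeholder: flush/close the session
    return 1
```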
148
+ # How to troubleshoot VAD issues more effectively: this involves two aspects, configuration and debugging.
149
+ 1. Ensure that the initialization parameters of the vad module are correct. Reference: voice_detection.py.
150
+ 2. Where you obtain state, bytes inside on_playback_audio_frame_before_mixing(audio_frame):
151
+
152
+ - A. Save the audio_frame data to a local file to record the original audio (reference: example_audio_pcm_send.py); for example, name it source_{time.time()*1000}.pcm.
153
+ - B. Save the result of each vad processing step:
154
+
155
+ - a When state == start_speaking: create a new binary file, for example, named: vad_{time.time()*1000}.pcm, and write bytes to the file.
156
+ - b When state == speaking: write bytes to the file.
157
+ - c When state == stop_speaking: write bytes to the file and close the file.
158
+ Note: In this way, problems can be diagnosed by comparing the original audio file with the vad-processed audio file (a minimal dump helper is sketched below). This function should be disabled in the production environment.
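A minimal dump helper for steps A/B above (the SDK's VadDump class provides similar functionality); the numeric states follow the start/speaking/stop convention described in this README.

```
import time

class VadFileDumper:
    def __init__(self):
        # A. always record the original audio in one file
        self._source = open(f"source_{int(time.time() * 1000)}.pcm", "wb")
        self._segment = None

    def dump(self, raw_pcm: bytes, state: int, vad_bytes: bytes):
        self._source.write(raw_pcm)
        if state == 1:                                   # start_speaking: new segment file
            self._segment = open(f"vad_{int(time.time() * 1000)}.pcm", "wb")
            self._segment.write(vad_bytes)
        elif state == 2 and self._segment:               # speaking
            self._segment.write(vad_bytes)
        elif state == 3 and self._segment:               # stop_speaking: close the segment
            self._segment.write(vad_bytes)
            self._segment.close()
            self._segment = None
```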
159
+ ### How to push the audio generated by TTS into the channel?
160
+ # Source code: audio_consumer.py
161
+ # Sample code: example_audio_consumer.py
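A sketch of the AudioConsumer flow from example_audio_consumer.py; the push_pcm_data()/consume() method names and the 50 ms tick are assumptions taken from that sample's pattern, not a guaranteed API.

```
import threading
import time

def start_tts_push(audio_consumer, tts_pcm_chunks):
    # Feed TTS PCM into the consumer as it arrives.
    def feed():
        for chunk in tts_pcm_chunks:                  # streaming 16-bit PCM from your TTS
            audio_consumer.push_pcm_data(chunk)       # assumed method name

    # Drain the consumer on a steady tick; it paces the PCM into the
    # AudioPcmDataSender, replacing the old pacer-based approach.
    def drain():
        while True:
            audio_consumer.consume()                  # assumed method name
            time.sleep(0.05)

    threading.Thread(target=feed, daemon=True).start()
    threading.Thread(target=drain, daemon=True).start()
```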
162
+ ### How to release resources?
164
+ localuser.unpublish_audio(audio_track)
165
+ localuser.unpublish_video(video_track)
166
+ audio_track.set_enabled(0)
167
+ video_track.set_enabled(0)
168
+
169
+ localuser.unregister_audio_frame_observer()
170
+ localuser.unregister_video_frame_observer()
171
+ localuser.unregister_local_user_observer()
172
+
173
+ connection.disconnect()
174
+ connection.unregister_observer()
175
+
176
+ localuser.release()
177
+ connection.release()
178
+
179
+
180
+ audio_track.release()
181
+ video_track.release()
182
+ pcm_data_sender.release()
183
+ video_data_sender.release()
184
+ audio_consumer.release()
185
+
186
+ media_node_factory.release()
187
+ agora_service.release()
188
+
189
+ #set to None
190
+ audio_track = None
191
+ video_track = None
192
+ audio_observer = None
193
+ video_observer = None
194
+ local_observer = None
195
+ localuser = None
196
+ connection = None
197
+ agora_service = None
198
+
199
+ ## Interrupt Handling in AI Scenarios
200
+ # Definition of Interrupt
201
+ In human-machine dialogue, an interrupt is when the user cuts in while the robot is responding, requiring the robot to stop its current response immediately and answer the user's new question instead.
202
+
203
+ # Trigger Conditions for Interrupts
204
+ Interrupts can be defined in different ways depending on the product. There are generally two modes:
205
+
206
+ - Mode 1: Voice Activation Mode
207
+ When the system detects that the user is speaking (i.e., voice activity is recognized), the interrupt strategy is triggered and the robot's response is stopped.
208
+
209
+ - Mode 2: ASR Activation Mode
210
+ When the system detects that the user is speaking and receives a result from ASR (Automatic Speech Recognition) or STT (Speech-to-Text), the interrupt strategy is triggered.
211
+
212
+ # Advantages of Different Interrupt Strategies
213
+ Voice Activation Interrupt
214
+
215
+ Advantages:
216
+ Reduces the user's wait time, as the robot stops its response as soon as the user starts speaking; the user does not have to wait for the robot to finish.
217
+ Disadvantages:
218
+ Since this is voice-activated, it may be triggered by meaningless audio signals, depending on the accuracy of the VAD (Voice Activity Detection). For example, if someone is typing on the keyboard while the AI is speaking, it might trigger the interrupt incorrectly.
219
+ ASR Activation Interrupt
220
+
221
+ Advantages:
222
+ Reduces the probability of unnecessary interrupts because the interrupt strategy is triggered only after ASR or STT has recognized the user’s speech.
223
+ Disadvantages:
224
+ Since this is ASR/STT-triggered, it requires converting the audio signal into text, which introduces a delay before the interrupt can be processed.
225
+ - Recommended Mode
226
+ If the VAD can filter out non-speech signals and trigger only when human speech is detected, or if low interrupt latency matters, the Voice Activation Mode is recommended.
227
+
228
+ If interrupt latency is not critical, the ASR Activation Mode is recommended; it filters out non-speech signals more effectively and reduces the probability of an unintended interrupt.
229
+
230
+ # How to Implement Interrupts? What Actions Are Required?
231
+ In a human-machine dialogue system, conversations are typically structured in "rounds," where each round consists of a question from the user, followed by a response from the robot, and so on. For each round, we can assign a roundId, incrementing it with each new round. A round consists of the following stages:
232
+
233
+ VAD (Voice Activity Detection):
234
+ This marks the start of the dialogue, where the system detects the beginning and end of the user's speech. It then passes this information to the ASR for further processing.
235
+
236
+ ASR (Automatic Speech Recognition):
237
+ This phase involves recognizing the user's speech and converting it into text, which is then passed to the LLM (Large Language Model).
238
+
239
+ LLM (Large Language Model):
240
+ This is the generation phase, where the LLM processes the recognized user input and generates a response.
241
+
242
+ TTS (Text-to-Speech):
243
+ In this phase, the LLM’s response is converted into an audio format.
244
+
245
+ RTC Streaming:
246
+ The generated audio is streamed via RTC (Real-Time Communication) to be played back to the user.
247
+
248
+ Therefore, when an interrupt is triggered in the next round (roundId+1), either by Voice Activation (the VAD phase) or by ASR Activation (ASR recognizing the user's speech), the following actions must be performed for the current round (roundId):
249
+
250
+ Stop the LLM Generation in the current round (roundId).
251
+ Stop the TTS Synthesis in the current round (roundId).
252
+ Stop the RTC Streaming in the current round (roundId).
253
+ API Call References:
254
+ Call: AudioConsumer.clear()
255
+ Call: LocalAudioTrack.clear_sender_buffer()
256
+ Business Layer: Clear any remaining TTS-related data (if applicable)
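A combined sketch of the three stop actions plus the API calls listed above; llm_task/tts_task stand in for whatever handles your LLM and TTS clients, while AudioConsumer.clear() and LocalAudioTrack.clear_sender_buffer() are the calls named in this README.

```
def handle_interrupt(round_id: int, llm_task, tts_task, audio_consumer, audio_track):
    # Stop LLM generation for the current round.
    llm_task.cancel()                        # placeholder for your LLM client
    # Stop TTS synthesis for the current round and drop buffered text.
    tts_task.cancel()                        # placeholder for your TTS client
    # Stop RTC streaming: drop PCM queued locally and in the sender.
    audio_consumer.clear()
    audio_track.clear_sender_buffer()
    return round_id + 1                      # the next round begins after the interrupt
```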
257
+
258
+
259
+ ## When to Pass LLM Results to TTS for Synthesis?
260
+ LLM (Large Language Model) results are returned asynchronously and in a streaming manner. When should the results from the LLM be passed to TTS (Text-to-Speech) for synthesis?
261
+
262
+ Two main factors need to be considered:
263
+
264
+ Ensure that the TTS synthesized speech is unambiguous:
265
+ The speech synthesized by TTS must be clear, complete, and continuous. For example, if the LLM returns the text "中国的首都是北京吗?" ("Is the capital of China Beijing?") and we pass it to TTS in chunks such as:
266
+
267
+ "中",
268
+ "国的首",
269
+ "都是北",
270
+ "京吗?",
271
+ the synthesis becomes ambiguous, because words are split across chunk boundaries ("中国" is split between the first and second chunks, "首都" between the second and third, and "北京" between the third and fourth). Proper segmentation must be ensured to avoid such ambiguities.
272
+ Minimize overall processing delay:
273
+ If the LLM results are passed to TTS only after the entire response is generated, the speech synthesis will be unambiguous and continuous. However, this approach introduces significant delay, which negatively affects the user experience.
274
+
275
+ Recommended Approach
276
+ To achieve a balance between clarity and minimal delay, the following steps should be followed (a minimal sketch follows the list):
277
+
278
+ Store the LLM results in a cache as they are received.
279
+ Perform a reverse scan of the cached data to find the most recent punctuation mark.
280
+ Truncate the data from the start to the most recent punctuation mark and pass it to TTS for synthesis.
281
+ Remove the truncated data from the cache. The remaining data should be moved to the beginning of the cache and continue waiting for additional data from the LLM.
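A minimal sketch of the cache-and-cut strategy above; the punctuation set is an assumption and should be extended for your language mix.

```
SENTENCE_BREAKS = "。！？；!?;,，."

class LlmToTtsBuffer:
    def __init__(self, tts_speak):
        self._cache = ""
        self._tts_speak = tts_speak          # callable that sends a text chunk to TTS

    def feed(self, llm_delta: str):
        # 1. Append the newly streamed LLM text to the cache.
        self._cache += llm_delta
        # 2. Reverse-scan for the most recent punctuation mark.
        cut = max(self._cache.rfind(ch) for ch in SENTENCE_BREAKS)
        if cut >= 0:
            # 3. Pass everything up to and including that mark to TTS.
            self._tts_speak(self._cache[:cut + 1])
            # 4. Keep the remainder and wait for more LLM output.
            self._cache = self._cache[cut + 1:]

    def flush(self):
        # Call at the end of the LLM stream to synthesize any trailing text.
        if self._cache:
            self._tts_speak(self._cache)
            self._cache = ""
```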
@@ -0,0 +1,266 @@
(The 266 added lines of README.md are identical to the long-description section of the PKG-INFO diff above.)
@@ -5,6 +5,7 @@ import ctypes
5
5
  from ..audio_frame_observer import *
6
6
  import logging
7
7
  logger = logging.getLogger(__name__)
8
+ from ..audio_vad_manager import AudioVadManager
8
9
  #from ..audio_sessionctrl import *
9
10
 
10
11
  ON_RECORD_AUDIO_FRAME_CALLBACK = ctypes.CFUNCTYPE(ctypes.c_int, AGORA_HANDLE, ctypes.c_char_p, ctypes.POINTER(AudioFrameInner))
@@ -35,7 +36,7 @@ class AudioFrameObserverInner(ctypes.Structure):
35
36
  ("on_get_ear_monitoring_audio_frame_param", ON_GET_EAR_MONITORING_AUDIO_FRAME_PARAM_CALLBACK)
36
37
  ]
37
38
 
38
- def __init__(self, observer: IAudioFrameObserver, local_user: 'LocalUser'):
39
+ def __init__(self, observer: IAudioFrameObserver, local_user: 'LocalUser', enable_vad: int, vad_configure):
39
40
  self.observer = observer
40
41
  self.local_user = local_user
41
42
  self.on_record_audio_frame = ON_RECORD_AUDIO_FRAME_CALLBACK(self._on_record_audio_frame)
@@ -45,6 +46,8 @@ class AudioFrameObserverInner(ctypes.Structure):
45
46
  self.on_playback_audio_frame_before_mixing = ON_PLAYBACK_AUDIO_FRAME_BEFORE_MIXING_CALLBACK(self._on_playback_audio_frame_before_mixing)
46
47
  self.on_get_audio_frame_position = ON_GET_AUDIO_FRAME_POSITION_CALLBACK(self._on_get_audio_frame_position)
47
48
  self._session_ctrl_manager = None #SessionCtrlManager()
49
+ self._vad_instance_manager = AudioVadManager(vad_configure) if enable_vad else None
50
+ self._enable_vad = True if enable_vad > 0 else False
48
51
 
49
52
  # self.on_get_playback_audio_frame_param = ON_GET_PLAYBACK_AUDIO_FRAME_PARAM_CALLBACK(self._on_get_playback_audio_frame_param)
50
53
  # self.on_get_record_audio_frame_param = ON_GET_RECORD_AUDIO_FRAME_PARAM_CALLBACK(self._on_get_record_audio_frame_param)
@@ -91,7 +94,15 @@ class AudioFrameObserverInner(ctypes.Structure):
91
94
 
92
95
  user_id_str = user_id.decode('utf-8')
93
96
  frame = audio_frame_inner.contents.get()
94
- ret = self.observer.on_playback_audio_frame_before_mixing(self.local_user, channel_id_str, user_id_str, frame)
97
+ # keep a map: key {channel_id, user_id}, value: VAD V2 instance,
98
+ # and return the VAD result through this callback.
99
+ # Map entries are managed by AudioVadManager;
100
+ # its release path frees the VAD V2 instances.
101
+ if self._enable_vad:
102
+ vad_result_state, vad_result_bytes = self._vad_instance_manager.process(channel_id_str, user_id_str, frame)
103
+ ret = self.observer.on_playback_audio_frame_before_mixing(self.local_user, channel_id_str, user_id_str, frame, vad_result_state, vad_result_bytes)
104
+ else:
105
+ ret = self.observer.on_playback_audio_frame_before_mixing(self.local_user, channel_id_str, user_id_str, frame, -1, None)
95
106
  return ret
96
107
 
97
108
  def _on_get_audio_frame_position(self, local_user_handle):
@@ -113,3 +124,10 @@ class AudioFrameObserverInner(ctypes.Structure):
113
124
  def _on_get_ear_monitoring_audio_frame_param(self, local_user_handle) -> AudioParams:
114
125
  logger.debug(f"AudioFrameObserverInner _on_get_ear_monitoring_audio_frame_param: {local_user_handle}")
115
126
  return self.observer.on_get_ear_monitoring_audio_frame_param(self.local_user)
127
+ def clear(self):
128
+ # disable vad
129
+ self._enable_vad = False
130
+ if self._vad_instance_manager:
131
+ self._vad_instance_manager.release()
132
+ self._vad_instance_manager = None
133
+ pass
@@ -75,6 +75,8 @@ class AgoraService:
75
75
  if result == 0:
76
76
  self.inited = True
77
77
  logger.debug(f'Initialization result: {result}')
78
+ self._is_low_delay = True if config.audio_scenario == AudioScenarioType.AUDIO_SCENARIO_CHORUS else False
79
+
78
80
 
79
81
  # to enable plugin
80
82
  provider = "agora.builtin"
@@ -87,6 +89,9 @@ class AgoraService:
87
89
  agora_parameter = self.get_agora_parameter()
88
90
  agora_parameter.set_int("rtc.set_app_type", 18)
89
91
 
92
+ # force audio vad v2 to be enabled
93
+ agora_parameter.set_parameters("{\"che.audio.label.enable\": true}")
94
+
90
95
  if config.log_path:
91
96
  log_size = 512 * 1024
92
97
  if config.log_size > 0:
@@ -128,7 +133,7 @@ class AgoraService:
128
133
  rtc_conn_handle = agora_rtc_conn_create(self.service_handle, ctypes.byref(RTCConnConfigInner.create(con_config)))
129
134
  if rtc_conn_handle is None:
130
135
  return None
131
- return RTCConnection(rtc_conn_handle)
136
+ return RTCConnection(rtc_conn_handle, self._is_low_delay)
132
137
 
133
138
  # createCustomAudioTrackPcm: create a custom audio track from a pcm data sender
134
139
  def create_custom_audio_track_pcm(self, audio_pcm_data_sender: AudioPcmDataSender) -> LocalAudioTrack:
@@ -138,7 +143,11 @@ class AgoraService:
138
143
  custom_audio_track = agora_service_create_custom_audio_track_pcm(self.service_handle, audio_pcm_data_sender.sender_handle)
139
144
  if custom_audio_track is None:
140
145
  return None
141
- return LocalAudioTrack(custom_audio_track)
146
+ local_track = LocalAudioTrack(custom_audio_track)
147
+ # default for the AI scenario: set the minimum send delay to 10ms
148
+ if local_track is not None:
149
+ local_track.set_send_delay_ms(10)
150
+ return local_track
142
151
  # mix_mode: MIX_ENABLED = 0, MIX_DISABLED = 1
143
152
 
144
153
  def create_custom_audio_track_encoded(self, audio_encoded_frame_sender: AudioEncodedFrameSender, mix_mode: int):
@@ -14,7 +14,7 @@ class IAudioFrameObserver:
14
14
  def on_ear_monitoring_audio_frame(self, agora_local_user, frame):
15
15
  return 1
16
16
 
17
- def on_playback_audio_frame_before_mixing(self, agora_local_user, channelId, uid, frame: AudioFrame):
17
+ def on_playback_audio_frame_before_mixing(self, agora_local_user, channelId, uid, frame: AudioFrame, vad_result_state:int, vad_result_bytearray:bytearray):
18
18
  return 1
19
19
 
20
20
  def on_get_audio_frame_position(self, agora_local_user):