volcengine-audio 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- volcengine_audio-0.1.0/.github/workflows/tests.yml +33 -0
- volcengine_audio-0.1.0/PKG-INFO +541 -0
- volcengine_audio-0.1.0/README.md +511 -0
- volcengine_audio-0.1.0/pyproject.toml +129 -0
- volcengine_audio-0.1.0/src/volcengine_audio/__init__.py +170 -0
- volcengine_audio-0.1.0/src/volcengine_audio/protocol.py +194 -0
- volcengine_audio-0.1.0/src/volcengine_audio/realtime.py +654 -0
- volcengine_audio-0.1.0/src/volcengine_audio/stt.py +651 -0
- volcengine_audio-0.1.0/src/volcengine_audio/tts.py +603 -0
- volcengine_audio-0.1.0/tests/test_realtime.py +184 -0
- volcengine_audio-0.1.0/tests/test_stt.py +283 -0
- volcengine_audio-0.1.0/tests/test_tts.py +56 -0
- volcengine_audio-0.1.0/uv.lock +338 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
strategy:
|
|
11
|
+
fail-fast: false
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ['3.11', '3.13']
|
|
14
|
+
steps:
|
|
15
|
+
- name: Checkout
|
|
16
|
+
uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Setup uv
|
|
19
|
+
uses: astral-sh/setup-uv@v5
|
|
20
|
+
|
|
21
|
+
- name: Setup Python
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: ${{ matrix.python-version }}
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: uv sync --frozen --extra dev
|
|
28
|
+
|
|
29
|
+
- name: Lint
|
|
30
|
+
run: uv run ruff check src/ tests/
|
|
31
|
+
|
|
32
|
+
- name: Run tests
|
|
33
|
+
run: uv run pytest tests/ -q
|
|
@@ -0,0 +1,541 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: volcengine-audio
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for Volcengine Audio Services (TTS, STT, and Realtime Dialogue)
|
|
5
|
+
Project-URL: Homepage, https://github.com/aiyou178/volcengine-audio
|
|
6
|
+
Project-URL: Issues, https://github.com/aiyou178/volcengine-audio/issues
|
|
7
|
+
Project-URL: Repository, https://github.com/aiyou178/volcengine-audio
|
|
8
|
+
Author-email: Zhanzhao Liang <liangzhanzhao1985@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: audio,bytedance,speech,stt,tts,volcengine
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: orjson
|
|
23
|
+
Requires-Dist: pydantic<3.0,>=2.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest-asyncio>=1.0.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest-randomly>=3.16.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: ruff>=0.11.0; extra == 'dev'
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# Volcengine Audio SDK
|
|
32
|
+
|
|
33
|
+
Python SDK for Volcengine (ByteDance) Audio Services, providing comprehensive support for Text-to-Speech (TTS), Speech-to-Text (STT), and Realtime Dialogue capabilities.
|
|
34
|
+
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- **Speech-to-Text (STT)**: Convert audio to text using Volcengine's ASR services (V2 and V3 APIs)
|
|
38
|
+
- **Text-to-Speech (TTS)**: Synthesize natural-sounding speech from text with various voice types
|
|
39
|
+
- **Realtime Dialogue**: Bidirectional streaming for interactive voice conversations
|
|
40
|
+
- **Protocol Support**: Low-level protocol utilities for custom implementations
|
|
41
|
+
- **Type Safety**: Full Pydantic model validation for all requests and responses
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
### Last document sync
|
|
47
|
+
|
|
48
|
+
* 2026-03-04
|
|
49
|
+
* If you notice that the upstream Volcengine documentation has changed since this date, please open an issue or submit a PR
|
|
50
|
+
* realtime API source: https://www.volcengine.com/docs/6561/1594356?lang=zh
|
|
51
|
+
|
|
52
|
+
### Install from PyPI
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Install the released package from PyPI
|
|
56
|
+
pip install volcengine-audio
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Install from source
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/aiyou178/volcengine-audio.git
|
|
63
|
+
cd volcengine-audio
|
|
64
|
+
pip install -e .
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Quick Start
|
|
68
|
+
|
|
69
|
+
### Speech-to-Text (STT)
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from volcengine_audio import (
|
|
73
|
+
VolcengineAsrRequestV3,
|
|
74
|
+
VolcengineAsrFunctionsV3,
|
|
75
|
+
STTAudioFormatV3,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Create ASR request
|
|
79
|
+
asr_request = VolcengineAsrRequestV3(
|
|
80
|
+
audio=VolcengineAsrRequestV3.Audio(
|
|
81
|
+
format=STTAudioFormatV3.wav,
|
|
82
|
+
rate=16000,
|
|
83
|
+
),
|
|
84
|
+
request=VolcengineAsrRequestV3.Request(
|
|
85
|
+
model_name="bigmodel",
|
|
86
|
+
enable_itn=True,
|
|
87
|
+
enable_punc=True,
|
|
88
|
+
),
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
# Generate request payload
|
|
92
|
+
request_params = asr_request.model_dump(exclude_none=True)
|
|
93
|
+
full_request = VolcengineAsrFunctionsV3.generate_asr_full_client_request(
|
|
94
|
+
sequence=1,
|
|
95
|
+
request_params=request_params,
|
|
96
|
+
compression=True,
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Send audio chunks
|
|
100
|
+
audio_request = VolcengineAsrFunctionsV3.generate_asr_audio_only_request(
|
|
101
|
+
sequence=2,
|
|
102
|
+
audio=audio_chunk,
|
|
103
|
+
compress=True,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
# Parse response
|
|
107
|
+
response_data = VolcengineAsrFunctionsV3.parse_response(server_response)
|
|
108
|
+
print(response_data['message'])
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Text-to-Speech (TTS)
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from volcengine_audio import (
|
|
115
|
+
VolcengineTTSBidirectionRequest,
|
|
116
|
+
VolcengineTTSFunctions,
|
|
117
|
+
TTSBigmodelResourceType,
|
|
118
|
+
TTSAudioFormat,
|
|
119
|
+
EventSend,
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
# Create TTS request
|
|
123
|
+
tts_request = VolcengineTTSBidirectionRequest(
|
|
124
|
+
event=EventSend.StartSession,
|
|
125
|
+
req_params=VolcengineTTSBidirectionRequest.ReqParams(
|
|
126
|
+
text="Hello, this is a test.",
|
|
127
|
+
speaker="zh_female_vv_jupiter_bigtts",
|
|
128
|
+
model=TTSBigmodelResourceType.seed_tts_2_0,
|
|
129
|
+
audio_params=VolcengineTTSBidirectionRequest.ReqParams.AudioParams(
|
|
130
|
+
format=TTSAudioFormat.mp3,
|
|
131
|
+
sample_rate=24000,
|
|
132
|
+
),
|
|
133
|
+
),
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
# Create connection
|
|
137
|
+
connection_payload = VolcengineTTSFunctions.start_connection_payload()
|
|
138
|
+
|
|
139
|
+
# Start session
|
|
140
|
+
session_payload = VolcengineTTSFunctions.start_session_payload(
|
|
141
|
+
session_id="unique-session-id",
|
|
142
|
+
req_params=tts_request.req_params.model_dump(exclude_none=True),
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
# Parse response
|
|
146
|
+
event, session_id, payload = VolcengineTTSFunctions.extract_response_payload(server_response)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Realtime Dialogue
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
from volcengine_audio import (
|
|
153
|
+
RealtimeDialogueConfig,
|
|
154
|
+
RealtimeDialogueFunctions,
|
|
155
|
+
ChatTTSTextRequest,
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# Configure dialogue session
|
|
159
|
+
config = RealtimeDialogueConfig(
|
|
160
|
+
dialog=RealtimeDialogueConfig.DialogConfig(
|
|
161
|
+
bot_name="AI Assistant",
|
|
162
|
+
system_role="You are a helpful assistant.",
|
|
163
|
+
speaking_style="Professional and friendly.",
|
|
164
|
+
),
|
|
165
|
+
tts=RealtimeDialogueConfig.TTSConfig(
|
|
166
|
+
speaker=RealtimeDialogueConfig.TTSConfig.Speaker.zh_female_vv_jupiter_bigtts,
|
|
167
|
+
),
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# Start connection
|
|
171
|
+
connection = RealtimeDialogueFunctions.start_connection_payload()
|
|
172
|
+
|
|
173
|
+
# Start session
|
|
174
|
+
session = RealtimeDialogueFunctions.start_session_payload(
|
|
175
|
+
session_id="session-123",
|
|
176
|
+
config=config,
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
# Send audio for recognition
|
|
180
|
+
audio_payload = RealtimeDialogueFunctions.task_request_payload(
|
|
181
|
+
session_id="session-123",
|
|
182
|
+
audio_data=audio_bytes,
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
# Request TTS for text
|
|
186
|
+
tts_payload = RealtimeDialogueFunctions.chat_tts_text_payload(
|
|
187
|
+
session_id="session-123",
|
|
188
|
+
tts_request=ChatTTSTextRequest(
|
|
189
|
+
start=True,
|
|
190
|
+
content="Hello!",
|
|
191
|
+
end=True,
|
|
192
|
+
),
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# Finish session
|
|
196
|
+
finish = RealtimeDialogueFunctions.finish_session_payload("session-123")
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## API Reference
|
|
200
|
+
|
|
201
|
+
### Modules
|
|
202
|
+
|
|
203
|
+
#### `volcengine_audio.protocol`
|
|
204
|
+
|
|
205
|
+
Core protocol definitions and utilities.
|
|
206
|
+
|
|
207
|
+
**Classes:**
|
|
208
|
+
- `ProtocolVersion`: Protocol version enumeration (V1)
|
|
209
|
+
- `MessageType`: Message types for bidirectional communication
|
|
210
|
+
- `EventSend`: Events sent from client to server
|
|
211
|
+
- `EventReceive`: Events received from server
|
|
212
|
+
- `SerializationMethod`: Payload serialization methods (JSON, RAW, PROTOBUF)
|
|
213
|
+
- `CompressionMethod`: Payload compression methods (NONE, GZIP)
|
|
214
|
+
|
|
215
|
+
**Constants:**
|
|
216
|
+
- `HOST`: `'openspeech.bytedance.com'` - Volcengine audio service host
|
|
217
|
+
|
|
218
|
+
**Functions:**
|
|
219
|
+
- `generate_header()`: Generate protocol header for requests
|
|
220
|
+
- `generate_before_payload()`: Generate sequence number before payload
|
|
221
|
+
|
|
222
|
+
#### `volcengine_audio.stt`
|
|
223
|
+
|
|
224
|
+
Speech-to-Text (ASR) models and utilities.
|
|
225
|
+
|
|
226
|
+
**Request Models:**
|
|
227
|
+
- `VolcengineAsrRequestV3`: ASR V3 API request
|
|
228
|
+
- `VolcengineAsrRequestV2`: ASR V2 API request
|
|
229
|
+
|
|
230
|
+
**Response Models:**
|
|
231
|
+
- `AsrFullServerResponseV2`: Full server response for V2
|
|
232
|
+
- `ListenBidirectionPackage`: Bidirectional listening package
|
|
233
|
+
|
|
234
|
+
**Enums:**
|
|
235
|
+
- `STTResource`: STT resource types for billing
|
|
236
|
+
- `STTAudioFormatV3`: Audio formats (pcm, wav, mp3, ogg)
|
|
237
|
+
- `STTResultType`: Result types (full, single)
|
|
238
|
+
- `STTBigmodelNoStreamLanguage`: Supported languages for bigmodel
|
|
239
|
+
|
|
240
|
+
**Helper Classes:**
|
|
241
|
+
- `VolcengineAsrFunctionsV3`: V3 API helper functions
|
|
242
|
+
- `generate_asr_full_client_request()`: Generate full client request
|
|
243
|
+
- `generate_asr_audio_only_request()`: Generate audio-only request
|
|
244
|
+
- `parse_response()`: Parse server response
|
|
245
|
+
- `VolcengineAsrFunctionsV2`: V2 API helper functions
|
|
246
|
+
- `full_client_request()`: Generate full client request
|
|
247
|
+
- `audio_only_request()`: Generate audio-only request
|
|
248
|
+
|
|
249
|
+
#### `volcengine_audio.tts`
|
|
250
|
+
|
|
251
|
+
Text-to-Speech models and utilities.
|
|
252
|
+
|
|
253
|
+
**Request Models:**
|
|
254
|
+
- `VolcengineTTSRequest`: Standard TTS request
|
|
255
|
+
- `VolcengineTTSBidirectionRequest`: Bidirectional TTS request
|
|
256
|
+
- `TTSReqParams`: TTS request parameters with audio settings
|
|
257
|
+
|
|
258
|
+
**Response Models:**
|
|
259
|
+
- `TTSSentenceStartResponse`: Sentence start notification
|
|
260
|
+
- `TTSSentenceEndResponse`: Sentence end notification
|
|
261
|
+
- `TTSEndResponse`: TTS ended notification
|
|
262
|
+
|
|
263
|
+
**Enums:**
|
|
264
|
+
- `TTSBigmodelResourceType`: TTS model types (seed-tts-1.0, seed-tts-2.0, etc.)
|
|
265
|
+
- `TTSAudioFormat`: Audio formats (wav, pcm, mp3, ogg_opus)
|
|
266
|
+
|
|
267
|
+
**Helper Classes:**
|
|
268
|
+
- `VolcengineTTSFunctions`: TTS API helper functions
|
|
269
|
+
- `start_connection_payload()`: Start connection
|
|
270
|
+
- `start_session_payload()`: Start TTS session
|
|
271
|
+
- `finish_session_payload()`: Finish TTS session
|
|
272
|
+
- `extract_response_payload()`: Extract and parse response
|
|
273
|
+
- `calculate_payload()`: Calculate request payload
|
|
274
|
+
|
|
275
|
+
#### `volcengine_audio.realtime`
|
|
276
|
+
|
|
277
|
+
Realtime dialogue (combined TTS+STT) models and utilities.
|
|
278
|
+
|
|
279
|
+
**Configuration:**
|
|
280
|
+
- `RealtimeDialogueConfig`: Complete dialogue session configuration
|
|
281
|
+
- `DialogConfig`: Bot persona, speaking style, location
|
|
282
|
+
- `TTSConfig`: Voice type and audio settings
|
|
283
|
+
- `Asr`: ASR-specific settings
|
|
284
|
+
|
|
285
|
+
**Request Models:**
|
|
286
|
+
- `SayHelloRequest`: Greeting message
|
|
287
|
+
- `ChatTTSTextRequest`: Text to synthesize with TTS
|
|
288
|
+
- `ChatTextQueryRequest`: Text query for dialogue
|
|
289
|
+
|
|
290
|
+
**Response Models:**
|
|
291
|
+
- `ASRInfoResponse`: ASR task info (first word detection)
|
|
292
|
+
- `ASRResponseModel`: ASR recognition result
|
|
293
|
+
- `ASREndedResponse`: ASR ended notification
|
|
294
|
+
- `ChatResponseModel`: Chat response
|
|
295
|
+
- `SessionStartedResponse`: Session started
|
|
296
|
+
- `SessionFailedResponse`: Session failed
|
|
297
|
+
|
|
298
|
+
**Helper Classes:**
|
|
299
|
+
- `RealtimeDialogueFunctions`: Realtime dialogue API helpers
|
|
300
|
+
- `start_connection_payload()`: Start connection
|
|
301
|
+
- `start_session_payload()`: Start dialogue session
|
|
302
|
+
- `task_request_payload()`: Send audio for recognition
|
|
303
|
+
- `say_hello_payload()`: Send greeting
|
|
304
|
+
- `chat_tts_text_payload()`: Request TTS for text
|
|
305
|
+
- `chat_text_query_payload()`: Send text query
|
|
306
|
+
- `finish_session_payload()`: Finish session
|
|
307
|
+
|
|
308
|
+
## Protocol Details
|
|
309
|
+
|
|
310
|
+
### Message Structure
|
|
311
|
+
|
|
312
|
+
All messages follow a standard protocol structure:
|
|
313
|
+
|
|
314
|
+
```
|
|
315
|
+
[Header 4 bytes][Optional Fields][Payload Size 4 bytes][Payload]
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
#### Header Format
|
|
319
|
+
|
|
320
|
+
```
|
|
321
|
+
Byte 0: [protocol_version:4 bits][header_size:4 bits]
|
|
322
|
+
Byte 1: [message_type:4 bits][message_type_specific_flags:4 bits]
|
|
323
|
+
Byte 2: [serialization_method:4 bits][compression:4 bits]
|
|
324
|
+
Byte 3: [reserved:8 bits]
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
#### Protocol Versions
|
|
328
|
+
|
|
329
|
+
- **V1 (0b0001)**: Current protocol version
|
|
330
|
+
|
|
331
|
+
#### Message Types
|
|
332
|
+
|
|
333
|
+
**Client → Server:**
|
|
334
|
+
- `FULL_CLIENT_REQUEST (0b0001)`: Full request with metadata
|
|
335
|
+
- `AUDIO_ONLY_REQUEST (0b0010)`: Audio-only request
|
|
336
|
+
|
|
337
|
+
**Server → Client:**
|
|
338
|
+
- `FULL_SERVER_RESPONSE (0b1001)`: Full response with metadata
|
|
339
|
+
- `AUDIO_ONLY_RESPONSE (0b1011)`: Audio-only response
|
|
340
|
+
- `ERROR_INFORMATION (0b1111)`: Error information
|
|
341
|
+
|
|
342
|
+
#### Serialization Methods
|
|
343
|
+
|
|
344
|
+
- `RAW (0b0000)`: Raw binary data
|
|
345
|
+
- `JSON (0b0001)`: JSON-encoded payload
|
|
346
|
+
- `PROTOBUF (0b0010)`: Protocol Buffers
|
|
347
|
+
- `THRIFT (0b0011)`: Apache Thrift
|
|
348
|
+
|
|
349
|
+
#### Compression Methods
|
|
350
|
+
|
|
351
|
+
- `NONE (0b0000)`: No compression
|
|
352
|
+
- `GZIP (0b0001)`: GZIP compression
|
|
353
|
+
|
|
354
|
+
### Event Flow
|
|
355
|
+
|
|
356
|
+
#### TTS Bidirectional Flow
|
|
357
|
+
|
|
358
|
+
```
|
|
359
|
+
Client Server
|
|
360
|
+
| |
|
|
361
|
+
|-- StartConnection ----------->|
|
|
362
|
+
|<---------- ConnectionStarted--|
|
|
363
|
+
| |
|
|
364
|
+
|-- StartSession -------------->|
|
|
365
|
+
|<------------ SessionStarted---|
|
|
366
|
+
| |
|
|
367
|
+
|-- TaskRequest (text) -------->|
|
|
368
|
+
|<--------- TTSSentenceStart----|
|
|
369
|
+
|<--------- TTSResponse (audio)-|
|
|
370
|
+
|<----------- TTSSentenceEnd----|
|
|
371
|
+
| |
|
|
372
|
+
|-- FinishSession ------------->|
|
|
373
|
+
|<---------- SessionFinished----|
|
|
374
|
+
| |
|
|
375
|
+
|-- FinishConnection ---------->|
|
|
376
|
+
|<-------- ConnectionFinished---|
|
|
377
|
+
```
|
|
378
|
+
|
|
379
|
+
#### STT Streaming Flow
|
|
380
|
+
|
|
381
|
+
```
|
|
382
|
+
Client Server
|
|
383
|
+
| |
|
|
384
|
+
|-- FullClientRequest --------->|
|
|
385
|
+
| |
|
|
386
|
+
|-- AudioOnlyRequest (chunk1)-->|
|
|
387
|
+
|<------------- FullResponse----|
|
|
388
|
+
| |
|
|
389
|
+
|-- AudioOnlyRequest (chunk2)-->|
|
|
390
|
+
|<------------- FullResponse----|
|
|
391
|
+
| |
|
|
392
|
+
|-- AudioOnlyRequest (last) --->|
|
|
393
|
+
|<------------- FullResponse----|
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
#### Realtime Dialogue Flow
|
|
397
|
+
|
|
398
|
+
```
|
|
399
|
+
Client Server
|
|
400
|
+
| |
|
|
401
|
+
|-- StartConnection ----------->|
|
|
402
|
+
|<---------- ConnectionStarted--|
|
|
403
|
+
| |
|
|
404
|
+
|-- StartSession (config) ----->|
|
|
405
|
+
|<------------ SessionStarted---|
|
|
406
|
+
| |
|
|
407
|
+
|-- TaskRequest (audio) ------->|
|
|
408
|
+
|<-------------- ASRInfo--------|
|
|
409
|
+
|<------------ ASRResponse------|
|
|
410
|
+
|<-------------- ASREnded-------|
|
|
411
|
+
| |
|
|
412
|
+
|<----------- ChatResponse------|
|
|
413
|
+
|<------- TTSSentenceStart------|
|
|
414
|
+
|<--------- TTSResponse (audio)-|
|
|
415
|
+
|<--------- TTSSentenceEnd------|
|
|
416
|
+
|<------------- ChatEnded-------|
|
|
417
|
+
| |
|
|
418
|
+
|-- FinishSession ------------->|
|
|
419
|
+
|<---------- SessionFinished----|
|
|
420
|
+
```
|
|
421
|
+
|
|
422
|
+
## Advanced Usage
|
|
423
|
+
|
|
424
|
+
### Custom Context and Hot Words (STT)
|
|
425
|
+
|
|
426
|
+
```python
|
|
427
|
+
from volcengine_audio import VolcengineAsrRequestV3
|
|
428
|
+
|
|
429
|
+
request = VolcengineAsrRequestV3(
|
|
430
|
+
request=VolcengineAsrRequestV3.Request(
|
|
431
|
+
corpus=VolcengineAsrRequestV3.Request.Corpus(
|
|
432
|
+
context=VolcengineAsrRequestV3.Request.Corpus.Context(
|
|
433
|
+
hotwords=[
|
|
434
|
+
{"word": "Volcengine"},
|
|
435
|
+
{"word": "ByteDance"},
|
|
436
|
+
],
|
|
437
|
+
context_type="dialog_ctx",
|
|
438
|
+
),
|
|
439
|
+
),
|
|
440
|
+
sensitive_words_filter=VolcengineAsrRequestV3.Request.SensitiveWordsFilter(
|
|
441
|
+
system_reserved_filter=True,
|
|
442
|
+
filter_with_signed=["badword1", "badword2"],
|
|
443
|
+
),
|
|
444
|
+
),
|
|
445
|
+
)
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
### Mixed Voice (TTS)
|
|
449
|
+
|
|
450
|
+
```python
|
|
451
|
+
from volcengine_audio import VolcengineTTSBidirectionRequest
|
|
452
|
+
|
|
453
|
+
request = VolcengineTTSBidirectionRequest.ReqParams(
|
|
454
|
+
text="Hello",
|
|
455
|
+
speaker="custom_mix",
|
|
456
|
+
mix_speaker=VolcengineTTSBidirectionRequest.ReqParams.MixSpeaker(
|
|
457
|
+
speakers=[
|
|
458
|
+
{
|
|
459
|
+
"source_speaker": "zh_female_vv_jupiter_bigtts",
|
|
460
|
+
"mix_factor": 0.6,
|
|
461
|
+
},
|
|
462
|
+
{
|
|
463
|
+
"source_speaker": "zh_male_yunzhou_jupiter_bigtts",
|
|
464
|
+
"mix_factor": 0.4,
|
|
465
|
+
},
|
|
466
|
+
],
|
|
467
|
+
),
|
|
468
|
+
)
|
|
469
|
+
```
|
|
470
|
+
|
|
471
|
+
### Emotion Control (TTS)
|
|
472
|
+
|
|
473
|
+
```python
|
|
474
|
+
from volcengine_audio import TTSReqParams
|
|
475
|
+
|
|
476
|
+
audio_params = TTSReqParams.AudioParams(
|
|
477
|
+
emotion="happy",
|
|
478
|
+
emotion_scale=5, # Max intensity
|
|
479
|
+
speech_rate=50, # 1.5x speed
|
|
480
|
+
loudness_rate=20, # 1.2x volume
|
|
481
|
+
pitch=2, # Slightly higher pitch
|
|
482
|
+
)
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
### Web Search Integration (Realtime Dialogue)
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
from volcengine_audio import RealtimeDialogueConfig
|
|
489
|
+
|
|
490
|
+
config = RealtimeDialogueConfig(
|
|
491
|
+
dialog=RealtimeDialogueConfig.DialogConfig(
|
|
492
|
+
extra=RealtimeDialogueConfig.DialogConfig.Extra(
|
|
493
|
+
enable_volc_websearch=True,
|
|
494
|
+
volc_websearch_type="web_summary",
|
|
495
|
+
volc_websearch_api_key="your-api-key",
|
|
496
|
+
volc_websearch_result_count=5,
|
|
497
|
+
),
|
|
498
|
+
),
|
|
499
|
+
)
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
## Error Handling
|
|
503
|
+
|
|
504
|
+
```python
|
|
505
|
+
from volcengine_audio import EventReceive, VolcengineTTSFunctions
|
|
506
|
+
|
|
507
|
+
event, session_id, payload = VolcengineTTSFunctions.extract_response_payload(response)
|
|
508
|
+
|
|
509
|
+
if event == EventReceive.SessionFailed:
|
|
510
|
+
print(f"Session failed: {payload.get('error')}")
|
|
511
|
+
elif event == EventReceive.ConnectionFailed:
|
|
512
|
+
print(f"Connection failed: {payload.get('error')}")
|
|
513
|
+
elif event == EventReceive.SERVER_PROCESSING_ERROR:
|
|
514
|
+
print("Server processing error")
|
|
515
|
+
```
|
|
516
|
+
|
|
517
|
+
## Development
|
|
518
|
+
|
|
519
|
+
### Running Tests
|
|
520
|
+
|
|
521
|
+
```bash
|
|
522
|
+
pytest tests/
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
### Code Style
|
|
526
|
+
|
|
527
|
+
This package uses Ruff for linting and formatting:
|
|
528
|
+
|
|
529
|
+
```bash
|
|
530
|
+
ruff check src/ tests/
|
|
531
|
+
ruff format src/ tests/
|
|
532
|
+
```
|
|
533
|
+
|
|
534
|
+
## License
|
|
535
|
+
|
|
536
|
+
MIT
|
|
537
|
+
|
|
538
|
+
## References
|
|
539
|
+
|
|
540
|
+
- [Volcengine Speech Services Documentation](https://www.volcengine.com/docs/6561/1324606)
|
|
541
|
+
- [Volcengine Realtime Dialogue](https://www.volcengine.com/docs/6561/1594356?lang=zh)
|