volcengine-audio 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- volcengine_audio/__init__.py +170 -0
- volcengine_audio/protocol.py +194 -0
- volcengine_audio/realtime.py +654 -0
- volcengine_audio/stt.py +651 -0
- volcengine_audio/tts.py +603 -0
- volcengine_audio-0.1.0.dist-info/METADATA +541 -0
- volcengine_audio-0.1.0.dist-info/RECORD +8 -0
- volcengine_audio-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Volcengine Audio SDK for Speech-to-Text and Text-to-Speech services.
|
|
2
|
+
|
|
3
|
+
This package provides Python models and utilities for interacting with Volcengine's
|
|
4
|
+
audio services including STT (Speech-to-Text), TTS (Text-to-Speech), and realtime
|
|
5
|
+
dialogue capabilities.
|
|
6
|
+
|
|
7
|
+
Modules:
|
|
8
|
+
- protocol: Shared protocol definitions and event types
|
|
9
|
+
- stt: Speech-to-Text (ASR) models and helpers
|
|
10
|
+
- tts: Text-to-Speech models and helpers
|
|
11
|
+
- realtime: Realtime dialogue (combined STT+TTS) models and helpers
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
>>> from volcengine_audio import VolcengineAsrRequestV3, VolcengineTTSRequest
|
|
15
|
+
>>> from volcengine_audio import RealtimeDialogueConfig, HOST
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
__version__ = '0.1.0'
|
|
19
|
+
|
|
20
|
+
# Protocol exports
|
|
21
|
+
from .protocol import (
|
|
22
|
+
HOST,
|
|
23
|
+
AsrMessageType,
|
|
24
|
+
AsrMessageTypeSpecificFlag,
|
|
25
|
+
AudioCodec,
|
|
26
|
+
CompressionMethod,
|
|
27
|
+
EventReceive,
|
|
28
|
+
EventSend,
|
|
29
|
+
HeaderSize,
|
|
30
|
+
MessageType,
|
|
31
|
+
MessageTypeSpecificFlag,
|
|
32
|
+
ProtocolVersion,
|
|
33
|
+
SerializationMethod,
|
|
34
|
+
generate_before_payload,
|
|
35
|
+
generate_header,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Realtime dialogue exports
|
|
39
|
+
from .realtime import (
|
|
40
|
+
ASREndedResponse,
|
|
41
|
+
ASRInfoResponse,
|
|
42
|
+
ASRResponseModel,
|
|
43
|
+
ChatRAGTextRequest,
|
|
44
|
+
ChatResponseModel,
|
|
45
|
+
ChatTextQueryConfirmedResponse,
|
|
46
|
+
ChatTextQueryRequest,
|
|
47
|
+
ChatTTSTextRequest,
|
|
48
|
+
ConnectionFailedResponse,
|
|
49
|
+
ConversationCreateRequest,
|
|
50
|
+
ConversationCreatedResponse,
|
|
51
|
+
ConversationDeleteRequest,
|
|
52
|
+
ConversationDeletedResponse,
|
|
53
|
+
ConversationRetrieveRequest,
|
|
54
|
+
ConversationRetrievedResponse,
|
|
55
|
+
ConversationUpdateRequest,
|
|
56
|
+
ConversationUpdatedResponse,
|
|
57
|
+
RealtimeDialogueConfig,
|
|
58
|
+
RealtimeDialogueErrorResponse,
|
|
59
|
+
RealtimeDialogueFunctions,
|
|
60
|
+
RealtimeDialogueUsage,
|
|
61
|
+
SayHelloRequest,
|
|
62
|
+
SessionFailedResponse,
|
|
63
|
+
SessionStartedResponse,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
# STT exports
|
|
67
|
+
from .stt import (
|
|
68
|
+
AsrFullServerResponseV2,
|
|
69
|
+
AudioFormatV2,
|
|
70
|
+
ListenBidirectionPackage,
|
|
71
|
+
STTAudioFormatV3,
|
|
72
|
+
STTBigmodelNoStreamLanguage,
|
|
73
|
+
STTResource,
|
|
74
|
+
STTResultType,
|
|
75
|
+
VolcengineAsrFunctionsV2,
|
|
76
|
+
VolcengineAsrFunctionsV3,
|
|
77
|
+
VolcengineAsrRequestV2,
|
|
78
|
+
VolcengineAsrRequestV3,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
# TTS exports
|
|
82
|
+
from .tts import (
|
|
83
|
+
AppConfig,
|
|
84
|
+
AudioConfig,
|
|
85
|
+
OperationEnum,
|
|
86
|
+
RequestConfig,
|
|
87
|
+
TTSAudioFormat,
|
|
88
|
+
TTSBigmodelResourceType,
|
|
89
|
+
TTSEndResponse,
|
|
90
|
+
TTSReqParams,
|
|
91
|
+
TTSSentenceEndResponse,
|
|
92
|
+
TTSSentenceStartResponse,
|
|
93
|
+
UserConfig,
|
|
94
|
+
VolcengineTTSBidirectionRequest,
|
|
95
|
+
VolcengineTTSFunctions,
|
|
96
|
+
VolcengineTTSRequest,
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
__all__ = [
|
|
100
|
+
# Version
|
|
101
|
+
'__version__',
|
|
102
|
+
# Constants
|
|
103
|
+
'HOST',
|
|
104
|
+
# Protocol
|
|
105
|
+
'AsrMessageType',
|
|
106
|
+
'AsrMessageTypeSpecificFlag',
|
|
107
|
+
'AudioCodec',
|
|
108
|
+
'CompressionMethod',
|
|
109
|
+
'EventReceive',
|
|
110
|
+
'EventSend',
|
|
111
|
+
'HeaderSize',
|
|
112
|
+
'MessageType',
|
|
113
|
+
'MessageTypeSpecificFlag',
|
|
114
|
+
'ProtocolVersion',
|
|
115
|
+
'SerializationMethod',
|
|
116
|
+
'generate_before_payload',
|
|
117
|
+
'generate_header',
|
|
118
|
+
# STT
|
|
119
|
+
'AsrFullServerResponseV2',
|
|
120
|
+
'AudioFormatV2',
|
|
121
|
+
'ListenBidirectionPackage',
|
|
122
|
+
'STTAudioFormatV3',
|
|
123
|
+
'STTBigmodelNoStreamLanguage',
|
|
124
|
+
'STTResource',
|
|
125
|
+
'STTResultType',
|
|
126
|
+
'VolcengineAsrFunctionsV2',
|
|
127
|
+
'VolcengineAsrFunctionsV3',
|
|
128
|
+
'VolcengineAsrRequestV2',
|
|
129
|
+
'VolcengineAsrRequestV3',
|
|
130
|
+
# TTS
|
|
131
|
+
'AppConfig',
|
|
132
|
+
'AudioConfig',
|
|
133
|
+
'OperationEnum',
|
|
134
|
+
'RequestConfig',
|
|
135
|
+
'TTSAudioFormat',
|
|
136
|
+
'TTSBigmodelResourceType',
|
|
137
|
+
'TTSEndResponse',
|
|
138
|
+
'TTSReqParams',
|
|
139
|
+
'TTSSentenceEndResponse',
|
|
140
|
+
'TTSSentenceStartResponse',
|
|
141
|
+
'UserConfig',
|
|
142
|
+
'VolcengineTTSBidirectionRequest',
|
|
143
|
+
'VolcengineTTSFunctions',
|
|
144
|
+
'VolcengineTTSRequest',
|
|
145
|
+
# Realtime dialogue
|
|
146
|
+
'ASREndedResponse',
|
|
147
|
+
'ASRInfoResponse',
|
|
148
|
+
'ASRResponseModel',
|
|
149
|
+
'ChatRAGTextRequest',
|
|
150
|
+
'ChatResponseModel',
|
|
151
|
+
'ChatTextQueryConfirmedResponse',
|
|
152
|
+
'ChatTTSTextRequest',
|
|
153
|
+
'ChatTextQueryRequest',
|
|
154
|
+
'ConnectionFailedResponse',
|
|
155
|
+
'ConversationCreateRequest',
|
|
156
|
+
'ConversationCreatedResponse',
|
|
157
|
+
'ConversationDeleteRequest',
|
|
158
|
+
'ConversationDeletedResponse',
|
|
159
|
+
'ConversationRetrieveRequest',
|
|
160
|
+
'ConversationRetrievedResponse',
|
|
161
|
+
'ConversationUpdateRequest',
|
|
162
|
+
'ConversationUpdatedResponse',
|
|
163
|
+
'RealtimeDialogueConfig',
|
|
164
|
+
'RealtimeDialogueErrorResponse',
|
|
165
|
+
'RealtimeDialogueFunctions',
|
|
166
|
+
'RealtimeDialogueUsage',
|
|
167
|
+
'SayHelloRequest',
|
|
168
|
+
'SessionFailedResponse',
|
|
169
|
+
'SessionStartedResponse',
|
|
170
|
+
]
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Volcengine audio protocol definitions.
|
|
2
|
+
|
|
3
|
+
This module contains shared protocol definitions used by both TTS and STT services,
|
|
4
|
+
including message types, serialization methods, and communication protocols.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import struct
|
|
8
|
+
from enum import Enum, IntEnum
|
|
9
|
+
|
|
10
|
+
HOST = 'openspeech.bytedance.com'
"""Hostname of the Volcengine audio service (bare host: no scheme, port, or path)."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ProtocolVersion(Enum):
    """Wire-protocol version identifiers for Volcengine audio services.

    ``generate_header`` places the member value in the high nibble of
    header byte 0.
    """

    V1 = 0x1  # bit pattern 0b0001
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class HeaderSize(Enum):
    """Header length codes, counted in 32-bit words (one word is 4 bytes)."""

    SIZE_4 = 0x1  # one 32-bit word, i.e. a 4-byte header
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class MessageType(Enum):
    """4-bit message type codes for bidirectional communication.

    ``generate_header`` places the member value in the high nibble of
    header byte 1. Each hex digit below is one nibble; the trailing comment
    shows the same value as a bit pattern.
    """

    # Requests
    FULL_CLIENT_REQUEST = 0x1  # 0b0001
    AUDIO_ONLY_REQUEST = 0x2  # 0b0010

    # Responses
    FULL_SERVER_RESPONSE = 0x9  # 0b1001
    AUDIO_ONLY_RESPONSE = 0xB  # 0b1011

    # Errors
    ERROR_INFORMATION = 0xF  # 0b1111
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class MessageTypeSpecificFlag(Enum):
    """4-bit flags carrying message-type-specific information.

    ``generate_header`` places the member value in the low nibble of
    header byte 1, next to the message type.
    """

    NO_SEQUENCE = 0x0  # 0b0000
    POS_SEQUENCE = 0x1  # 0b0001
    NEG_SEQUENCE = 0x2  # 0b0010
    NEG_WITH_SEQUENCE = 0x3  # 0b0011
    CARRY_EVENT_ID = 0x4  # 0b0100
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SerializationMethod(Enum):
    """Serialization methods for message payloads.

    The member value is the 4-bit code that ``generate_header`` shifts into
    the high nibble of header byte 2.
    """

    RAW = 0b0000
    JSON = 0b0001
    PROTOBUF = 0b0010
    # ``THRFT`` is the historical (misspelled) name. It stays first so it
    # remains the canonical member: existing callers, ``.name`` output and
    # value lookups are unchanged.
    THRFT = 0b0011
    # Correctly spelled alias. It shares the value of ``THRFT``, so Enum
    # resolves ``SerializationMethod.THRIFT`` to that same member.
    THRIFT = 0b0011
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class CompressionMethod(Enum):
    """4-bit codes naming how a message payload is compressed.

    ``generate_header`` places the member value in the low nibble of
    header byte 2.
    """

    NONE = 0x0  # 0b0000
    GZIP = 0x1  # 0b0001
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class AudioCodec(Enum):
    """Audio codec identifiers; each member's value is its lowercase codec name."""

    raw = 'raw'
    # Original note says "for ogg" - presumably the codec to pick for ogg
    # audio; TODO confirm against the service documentation.
    opus = 'opus'
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class AsrMessageType(Enum):
    """4-bit message type codes specific to ASR (speech recognition).

    The numeric codes match those of ``MessageType``; the codes 0b1011 and
    0b1111 carry ASR-specific names here.
    """

    # Requests
    FULL_CLIENT_REQUEST = 0x1  # 0b0001
    AUDIO_ONLY_REQUEST = 0x2  # 0b0010

    # Responses
    FULL_SERVER_RESPONSE = 0x9  # 0b1001
    SERVER_ACK = 0xB  # 0b1011
    SERVER_ERROR_RESPONSE = 0xF  # 0b1111
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class AsrMessageTypeSpecificFlag(Enum):
    """4-bit flags specific to ASR message types.

    ``NEG_SEQUENCE_1`` has the same value as ``NEG_WITH_SEQUENCE``, so Enum
    treats it as an alias: both names resolve to one member, and iterating
    the class yields four members, not five.
    """

    # --- Requests ---
    # Full client request, or an audio-only request that is not the last one.
    NO_SEQUENCE = 0x0  # 0b0000
    POS_SEQUENCE = 0x1  # 0b0001
    # Last audio-only request, sent without a sequence number.
    NEG_SEQUENCE = 0x2  # 0b0010
    # Last audio-only request, sent with a sequence number.
    NEG_WITH_SEQUENCE = 0x3  # 0b0011

    # --- Responses ---
    # NOTE(review): the original comment describes this flag as
    # "full client response or non-last audio only response"; confirm the
    # intended meaning against the ASR protocol specification.
    NEG_SEQUENCE_1 = 0x3  # 0b0011, alias of NEG_WITH_SEQUENCE
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class EventSend(IntEnum):
    """Event codes the client sends to the server.

    Being an ``IntEnum``, every member compares equal to its integer code.
    Members are grouped below by code range.
    """

    # Connection lifecycle (1-2)
    StartConnection = 1
    FinishConnection = 2

    # Session lifecycle (100-102)
    StartSession = 100
    CancelSession = 101
    FinishSession = 102

    # Task (200)
    TaskRequest = 200

    # Greeting (300)
    SayHello = 300

    # Chat input (500-502)
    ChatTTSText = 500
    ChatTextQuery = 501
    ChatRAGText = 502

    # Conversation management (510-514; 513 is not defined here)
    ConversationCreate = 510
    ConversationUpdate = 511
    ConversationRetrieve = 512
    ConversationDelete = 514
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class EventReceive(IntEnum):
    """Event codes the client receives from the server.

    Being an ``IntEnum``, every member compares equal to its integer code.
    Members are grouped below by code range.
    """

    # Connection lifecycle (50-52)
    ConnectionStarted = 50
    ConnectionFailed = 51
    ConnectionFinished = 52

    # Session lifecycle and usage (150-154)
    SessionStarted = 150
    SessionCanceled = 151
    SessionFinished = 152
    SessionFailed = 153
    USAGE = 154

    # TTS output (350-359)
    TTSSentenceStart = 350
    TTSSentenceEnd = 351
    TTSResponse = 352
    TTSEnded = 359

    # ASR output (450-459)
    ASRInfo = 450
    ASRResponse = 451
    ASREnded = 459

    # Chat output (550-559)
    ChatResponse = 550
    ChatTextQueryConfirmed = 553
    ChatEnded = 559

    # Conversation management (567-571; 570 is not defined here)
    ConversationCreated = 567
    ConversationUpdated = 568
    ConversationRetrieved = 569
    ConversationDeleted = 571

    # Dialogue error (599)
    DialogCommonError = 599

    # Eight-digit codes
    # TODO(Deo): need to check what this code is, in tts
    UNKNOWN = 50000000
    # TODO(Deo): need to check what this code is, in dialogue
    UNKNOWN1 = 55000000
    SERVER_PROCESSING_ERROR = 55000001
    SERVICE_UNAVAILABLE = 55000030
    AUDIO_FLOW_ERROR = 55002070
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def generate_header(
    message_type: MessageType = MessageType.FULL_CLIENT_REQUEST,
    message_type_specific_flags: MessageTypeSpecificFlag = MessageTypeSpecificFlag.NO_SEQUENCE,
    serial_method: SerializationMethod = SerializationMethod.JSON,
    compression_type: CompressionMethod = CompressionMethod.NONE,
    reserved_data: int = 0x00,
) -> bytearray:
    """Generate the 4-byte protocol header for Volcengine audio services.

    Only the ``.value`` attribute of each enum argument is read, and each
    value is expected to fit in 4 bits so that two of them pack into one byte.

    Args:
        message_type: Type of message being sent
        message_type_specific_flags: Specific flags for the message type
        serial_method: Serialization method for payload
        compression_type: Compression method for payload
        reserved_data: Reserved byte (default 0x00); must be in 0..255

    Returns:
        4-byte header as a new bytearray

    Raises:
        ValueError: If any packed byte (including ``reserved_data``) falls
            outside 0..255.
        TypeError: If ``reserved_data`` is not an integer.

    Header structure:
        - Byte 0: protocol_version(4 bits), header_size(4 bits)
        - Byte 1: message_type(4 bits), message_type_specific_flags(4 bits)
        - Byte 2: serialization_method(4 bits), message_compression(4 bits)
        - Byte 3: reserved(8 bits)
    """
    # Take the header size from the HeaderSize enum rather than a bare literal
    # so the declared constant and the emitted byte cannot drift apart.
    return bytearray(
        (
            (ProtocolVersion.V1.value << 4) | HeaderSize.SIZE_4.value,
            (message_type.value << 4) | message_type_specific_flags.value,
            (serial_method.value << 4) | compression_type.value,
            reserved_data,
        )
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def generate_before_payload(sequence: int) -> bytearray:
    """Encode a sequence number as the 4-byte field that precedes the payload.

    Args:
        sequence: Sequence number; must fit in a signed 32-bit integer.

    Returns:
        A new 4-byte bytearray holding ``sequence`` as a big-endian signed
        32-bit integer.

    Raises:
        struct.error: If ``sequence`` does not fit in a signed 32-bit integer.
    """
    # '>i' is big-endian, signed, 4 bytes.
    packed_sequence = struct.pack('>i', sequence)
    return bytearray(packed_sequence)
|