volcengine-audio 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,170 @@
1
+ """Volcengine Audio SDK for Speech-to-Text and Text-to-Speech services.
2
+
3
+ This package provides Python models and utilities for interacting with Volcengine's
4
+ audio services including STT (Speech-to-Text), TTS (Text-to-Speech), and realtime
5
+ dialogue capabilities.
6
+
7
+ Modules:
8
+ - protocol: Shared protocol definitions and event types
9
+ - stt: Speech-to-Text (ASR) models and helpers
10
+ - tts: Text-to-Speech models and helpers
11
+ - realtime: Realtime dialogue (combined STT+TTS) models and helpers
12
+
13
+ Example:
14
+ >>> from volcengine_audio import VolcengineAsrRequestV3, VolcengineTTSRequest
15
+ >>> from volcengine_audio import RealtimeDialogueConfig, HOST
16
+ """
17
+
18
+ __version__ = '0.1.0'
19
+
20
+ # Protocol exports
21
+ from .protocol import (
22
+ HOST,
23
+ AsrMessageType,
24
+ AsrMessageTypeSpecificFlag,
25
+ AudioCodec,
26
+ CompressionMethod,
27
+ EventReceive,
28
+ EventSend,
29
+ HeaderSize,
30
+ MessageType,
31
+ MessageTypeSpecificFlag,
32
+ ProtocolVersion,
33
+ SerializationMethod,
34
+ generate_before_payload,
35
+ generate_header,
36
+ )
37
+
38
+ # Realtime dialogue exports
39
+ from .realtime import (
40
+ ASREndedResponse,
41
+ ASRInfoResponse,
42
+ ASRResponseModel,
43
+ ChatRAGTextRequest,
44
+ ChatResponseModel,
45
+ ChatTextQueryConfirmedResponse,
46
+ ChatTextQueryRequest,
47
+ ChatTTSTextRequest,
48
+ ConnectionFailedResponse,
49
+ ConversationCreateRequest,
50
+ ConversationCreatedResponse,
51
+ ConversationDeleteRequest,
52
+ ConversationDeletedResponse,
53
+ ConversationRetrieveRequest,
54
+ ConversationRetrievedResponse,
55
+ ConversationUpdateRequest,
56
+ ConversationUpdatedResponse,
57
+ RealtimeDialogueConfig,
58
+ RealtimeDialogueErrorResponse,
59
+ RealtimeDialogueFunctions,
60
+ RealtimeDialogueUsage,
61
+ SayHelloRequest,
62
+ SessionFailedResponse,
63
+ SessionStartedResponse,
64
+ )
65
+
66
+ # STT exports
67
+ from .stt import (
68
+ AsrFullServerResponseV2,
69
+ AudioFormatV2,
70
+ ListenBidirectionPackage,
71
+ STTAudioFormatV3,
72
+ STTBigmodelNoStreamLanguage,
73
+ STTResource,
74
+ STTResultType,
75
+ VolcengineAsrFunctionsV2,
76
+ VolcengineAsrFunctionsV3,
77
+ VolcengineAsrRequestV2,
78
+ VolcengineAsrRequestV3,
79
+ )
80
+
81
+ # TTS exports
82
+ from .tts import (
83
+ AppConfig,
84
+ AudioConfig,
85
+ OperationEnum,
86
+ RequestConfig,
87
+ TTSAudioFormat,
88
+ TTSBigmodelResourceType,
89
+ TTSEndResponse,
90
+ TTSReqParams,
91
+ TTSSentenceEndResponse,
92
+ TTSSentenceStartResponse,
93
+ UserConfig,
94
+ VolcengineTTSBidirectionRequest,
95
+ VolcengineTTSFunctions,
96
+ VolcengineTTSRequest,
97
+ )
98
+
99
# Public API of the package: every name re-exported by the imports above.
__all__ = [
    # Package metadata
    '__version__',
    # Service host constant
    'HOST',
    # Names re-exported from .protocol
    'AsrMessageType',
    'AsrMessageTypeSpecificFlag',
    'AudioCodec',
    'CompressionMethod',
    'EventReceive',
    'EventSend',
    'HeaderSize',
    'MessageType',
    'MessageTypeSpecificFlag',
    'ProtocolVersion',
    'SerializationMethod',
    'generate_before_payload',
    'generate_header',
    # Names re-exported from .stt
    'AsrFullServerResponseV2',
    'AudioFormatV2',
    'ListenBidirectionPackage',
    'STTAudioFormatV3',
    'STTBigmodelNoStreamLanguage',
    'STTResource',
    'STTResultType',
    'VolcengineAsrFunctionsV2',
    'VolcengineAsrFunctionsV3',
    'VolcengineAsrRequestV2',
    'VolcengineAsrRequestV3',
    # Names re-exported from .tts
    'AppConfig',
    'AudioConfig',
    'OperationEnum',
    'RequestConfig',
    'TTSAudioFormat',
    'TTSBigmodelResourceType',
    'TTSEndResponse',
    'TTSReqParams',
    'TTSSentenceEndResponse',
    'TTSSentenceStartResponse',
    'UserConfig',
    'VolcengineTTSBidirectionRequest',
    'VolcengineTTSFunctions',
    'VolcengineTTSRequest',
    # Names re-exported from .realtime
    'ASREndedResponse',
    'ASRInfoResponse',
    'ASRResponseModel',
    'ChatRAGTextRequest',
    'ChatResponseModel',
    'ChatTextQueryConfirmedResponse',
    'ChatTTSTextRequest',
    'ChatTextQueryRequest',
    'ConnectionFailedResponse',
    'ConversationCreateRequest',
    'ConversationCreatedResponse',
    'ConversationDeleteRequest',
    'ConversationDeletedResponse',
    'ConversationRetrieveRequest',
    'ConversationRetrievedResponse',
    'ConversationUpdateRequest',
    'ConversationUpdatedResponse',
    'RealtimeDialogueConfig',
    'RealtimeDialogueErrorResponse',
    'RealtimeDialogueFunctions',
    'RealtimeDialogueUsage',
    'SayHelloRequest',
    'SessionFailedResponse',
    'SessionStartedResponse',
]
@@ -0,0 +1,194 @@
1
+ """Volcengine audio protocol definitions.
2
+
3
+ This module contains shared protocol definitions used by both TTS and STT services,
4
+ including message types, serialization methods, and communication protocols.
5
+ """
6
+
7
+ import struct
8
+ from enum import Enum, IntEnum
9
+
10
# Shared by every service module in the package.
HOST = 'openspeech.bytedance.com'
"""Host name of the Volcengine audio service."""
12
+
13
+
14
class ProtocolVersion(Enum):
    """Version code carried in the binary message header.

    ``generate_header`` shifts this value into the upper four bits of
    header byte 0.
    """

    # The only version defined by this module.
    V1 = 0b0001
18
+
19
+
20
class HeaderSize(Enum):
    """Header length code, counted in 32-bit (4-byte) words."""

    # One 32-bit word, i.e. a 4-byte header.
    SIZE_4 = 0b0001
24
+
25
+
26
class MessageType(Enum):
    """Four-bit message type codes used for bidirectional communication.

    ``generate_header`` places the value in the upper four bits of
    header byte 1.
    """

    # Request codes
    FULL_CLIENT_REQUEST = 0b0001
    AUDIO_ONLY_REQUEST = 0b0010

    # Response codes
    FULL_SERVER_RESPONSE = 0b1001
    AUDIO_ONLY_RESPONSE = 0b1011

    # Error report
    ERROR_INFORMATION = 0b1111
34
+
35
+
36
class MessageTypeSpecificFlag(Enum):
    """Four-bit flags that qualify a message type.

    ``generate_header`` ORs the value into the lower four bits of
    header byte 1, next to the message type.
    """

    NO_SEQUENCE = 0b0000
    POS_SEQUENCE = 0b0001
    NEG_SEQUENCE = 0b0010
    # Numerically POS_SEQUENCE | NEG_SEQUENCE.
    NEG_WITH_SEQUENCE = 0b0011
    CARRY_EVENT_ID = 0b0100
44
+
45
+
46
class SerializationMethod(Enum):
    """Four-bit codes naming how a message payload is serialized.

    ``generate_header`` places the value in the upper four bits of
    header byte 2.
    """

    RAW = 0b0000
    JSON = 0b0001
    PROTOBUF = 0b0010
    # Historical (misspelled) member name. It is declared first so it stays
    # the canonical member: ``SerializationMethod(3).name`` and iteration
    # over the enum are unchanged for existing callers.
    THRFT = 0b0011
    # Correctly spelled alias of THRFT (same value, same member object).
    THRIFT = 0b0011
53
+
54
+
55
class CompressionMethod(Enum):
    """Four-bit codes naming how a message payload is compressed.

    ``generate_header`` ORs the value into the lower four bits of
    header byte 2, next to the serialization method.
    """

    # Payload is sent as-is.
    NONE = 0b0000
    # Payload is gzip-compressed.
    GZIP = 0b0001
60
+
61
+
62
class AudioCodec(Enum):
    """Audio codec identifiers; each member's value is its lowercase name."""

    raw = 'raw'
    # Used for ogg audio.
    opus = 'opus'
67
+
68
+
69
class AsrMessageType(Enum):
    """Four-bit message type codes used by ASR (speech recognition).

    The numeric codes match those of ``MessageType``; only the names of
    0b1011 and 0b1111 differ.
    """

    # Request codes
    FULL_CLIENT_REQUEST = 0b0001
    AUDIO_ONLY_REQUEST = 0b0010

    # Response codes
    FULL_SERVER_RESPONSE = 0b1001
    SERVER_ACK = 0b1011
    SERVER_ERROR_RESPONSE = 0b1111
77
+
78
+
79
class AsrMessageTypeSpecificFlag(Enum):
    """Four-bit flags that qualify an ASR message type.

    ``NEG_SEQUENCE_1`` has the same value as ``NEG_WITH_SEQUENCE``, so it is
    an Enum alias: both names resolve to the same member, and lookup by
    value or iteration only ever yields ``NEG_WITH_SEQUENCE``.
    """

    # --- Request flags ---
    # Full client request, or an audio-only request that is not the last one.
    NO_SEQUENCE = 0b0000
    POS_SEQUENCE = 0b0001
    # Last audio-only request, sent without a sequence number.
    NEG_SEQUENCE = 0b0010
    # Last audio-only request, sent with a sequence number.
    NEG_WITH_SEQUENCE = 0b0011

    # --- Response flags ---
    # Described upstream as "full client response or non-last audio only
    # response". NOTE(review): confirm this wording against the protocol docs.
    NEG_SEQUENCE_1 = 0b0011
93
+
94
+
95
class EventSend(IntEnum):
    """Integer event codes the client sends to the server."""

    # Connection events
    StartConnection = 1
    FinishConnection = 2

    # Session events
    StartSession = 100
    CancelSession = 101
    FinishSession = 102

    # Task and greeting events
    TaskRequest = 200
    SayHello = 300

    # Chat events
    ChatTTSText = 500
    ChatTextQuery = 501
    ChatRAGText = 502

    # Conversation events (note: 513 is not defined here)
    ConversationCreate = 510
    ConversationUpdate = 511
    ConversationRetrieve = 512
    ConversationDelete = 514
112
+
113
+
114
class EventReceive(IntEnum):
    """Integer event codes the client receives from the server."""

    # Connection events
    ConnectionStarted = 50
    ConnectionFailed = 51
    ConnectionFinished = 52

    # Session events
    SessionStarted = 150
    SessionCanceled = 151
    SessionFinished = 152
    SessionFailed = 153
    USAGE = 154

    # TTS events
    TTSSentenceStart = 350
    TTSSentenceEnd = 351
    TTSResponse = 352
    TTSEnded = 359

    # ASR events
    ASRInfo = 450
    ASRResponse = 451
    ASREnded = 459

    # Chat events
    ChatResponse = 550
    ChatTextQueryConfirmed = 553
    ChatEnded = 559

    # Conversation events
    ConversationCreated = 567
    ConversationUpdated = 568
    ConversationRetrieved = 569
    ConversationDeleted = 571

    DialogCommonError = 599

    # Large codes; the meaning of the two UNKNOWN entries is unverified.
    # TODO(Deo): need to check what this code is, in tts
    UNKNOWN = 50000000
    # TODO(Deo): need to check what this code is, in dialogue
    UNKNOWN1 = 55000000
    SERVER_PROCESSING_ERROR = 55000001
    SERVICE_UNAVAILABLE = 55000030
    AUDIO_FLOW_ERROR = 55002070
147
+
148
+
149
def generate_header(
    message_type: MessageType = MessageType.FULL_CLIENT_REQUEST,
    message_type_specific_flags: MessageTypeSpecificFlag = MessageTypeSpecificFlag.NO_SEQUENCE,
    serial_method: SerializationMethod = SerializationMethod.JSON,
    compression_type: CompressionMethod = CompressionMethod.NONE,
    reserved_data: int = 0x00,
) -> bytearray:
    """Build the 4-byte protocol header for Volcengine audio services.

    Each of the first three bytes packs two 4-bit fields: the first field is
    shifted into the high nibble and the second is ORed into the low nibble.

    Header layout:
        - Byte 0: protocol_version (4 bits), header_size (4 bits)
        - Byte 1: message_type (4 bits), message_type_specific_flags (4 bits)
        - Byte 2: serialization_method (4 bits), message_compression (4 bits)
        - Byte 3: reserved (8 bits)

    Args:
        message_type: Type of message being sent.
        message_type_specific_flags: Flags qualifying the message type.
        serial_method: Serialization method of the payload.
        compression_type: Compression method of the payload.
        reserved_data: Value of the reserved byte (default 0x00).

    Returns:
        A new 4-byte ``bytearray`` holding the header.

    Raises:
        ValueError: If any resulting byte (e.g. ``reserved_data``) falls
            outside ``range(256)``.
    """
    # Only the ``.value`` attribute of each enum argument is read.
    return bytearray(
        (
            # Header size comes from the HeaderSize enum rather than a literal.
            (ProtocolVersion.V1.value << 4) | HeaderSize.SIZE_4.value,
            (message_type.value << 4) | message_type_specific_flags.value,
            (serial_method.value << 4) | compression_type.value,
            reserved_data,
        )
    )
181
+
182
+
183
def generate_before_payload(sequence: int) -> bytearray:
    """Encode the sequence number that precedes a message payload.

    Args:
        sequence: Sequence number; must fit in a signed 32-bit integer.

    Returns:
        A new 4-byte ``bytearray`` with ``sequence`` in big-endian order.

    Raises:
        struct.error: If ``sequence`` does not fit in a signed 32-bit integer.
    """
    # '>i' = big-endian, signed, 4 bytes.
    return bytearray(struct.pack('>i', sequence))