vocence-plugins 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vocence_plugins-0.1.0/.gitignore +17 -0
- vocence_plugins-0.1.0/LICENSE +200 -0
- vocence_plugins-0.1.0/PKG-INFO +106 -0
- vocence_plugins-0.1.0/README.md +79 -0
- vocence_plugins-0.1.0/pyproject.toml +43 -0
- vocence_plugins-0.1.0/src/vocence_plugins/__init__.py +37 -0
- vocence_plugins-0.1.0/src/vocence_plugins/_lang.py +44 -0
- vocence_plugins-0.1.0/src/vocence_plugins/stt.py +313 -0
- vocence_plugins-0.1.0/src/vocence_plugins/tts.py +294 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity. For the purposes of this definition,
|
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
19
|
+
direction or management of such entity, whether by contract or
|
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
22
|
+
|
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
24
|
+
exercising permissions granted by this License.
|
|
25
|
+
|
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
27
|
+
including but not limited to software source code, documentation
|
|
28
|
+
source, and configuration files.
|
|
29
|
+
|
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
|
31
|
+
transformation or translation of a Source form, including but
|
|
32
|
+
not limited to compiled object code, generated documentation,
|
|
33
|
+
and conversions to other media types.
|
|
34
|
+
|
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
36
|
+
Object form, made available under the License, as indicated by a
|
|
37
|
+
copyright notice that is included in or attached to the work
|
|
38
|
+
(an example is provided in the Appendix below).
|
|
39
|
+
|
|
40
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
|
41
|
+
form, that is based on (or derived from) the Work and for which the
|
|
42
|
+
editorial revisions, annotations, elaborations, or other modifications
|
|
43
|
+
represent, as a whole, an original work of authorship. For the purposes
|
|
44
|
+
of this License, Derivative Works shall not include works that remain
|
|
45
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
|
46
|
+
the Work and Derivative Works thereof.
|
|
47
|
+
|
|
48
|
+
"Contribution" shall mean any work of authorship, including
|
|
49
|
+
the original version of the Work and any modifications or additions
|
|
50
|
+
to that Work or Derivative Works thereof, that is intentionally
|
|
51
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
52
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
|
53
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
|
54
|
+
means any form of electronic, verbal, or written communication sent
|
|
55
|
+
to the Licensor or its representatives, including but not limited to
|
|
56
|
+
communication on electronic mailing lists, source code control systems,
|
|
57
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
|
58
|
+
Licensor for the purpose of discussing and improving the Work, but
|
|
59
|
+
excluding communication that is conspicuously marked or otherwise
|
|
60
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
|
61
|
+
|
|
62
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
63
|
+
on behalf of whom a Contribution has been received by Licensor and
|
|
64
|
+
subsequently incorporated within the Work.
|
|
65
|
+
|
|
66
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
67
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
68
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
69
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
70
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
71
|
+
Work and such Derivative Works in Source or Object form.
|
|
72
|
+
|
|
73
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
74
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
75
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
76
|
+
(except as stated in this section) patent license to make, have made,
|
|
77
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
78
|
+
where such license applies only to those patent claims licensable
|
|
79
|
+
by such Contributor that are necessarily infringed by their
|
|
80
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
|
81
|
+
with the Work to which such Contribution(s) was submitted. If You
|
|
82
|
+
institute patent litigation against any entity (including a
|
|
83
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
84
|
+
or a Contribution incorporated within the Work constitutes direct
|
|
85
|
+
or contributory patent infringement, then any patent licenses
|
|
86
|
+
granted to You under this License for that Work shall terminate
|
|
87
|
+
as of the date such litigation is filed.
|
|
88
|
+
|
|
89
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
90
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
91
|
+
modifications, and in Source or Object form, provided that You
|
|
92
|
+
meet the following conditions:
|
|
93
|
+
|
|
94
|
+
(a) You must give any other recipients of the Work or
|
|
95
|
+
Derivative Works a copy of this License; and
|
|
96
|
+
|
|
97
|
+
(b) You must cause any modified files to carry prominent notices
|
|
98
|
+
stating that You changed the files; and
|
|
99
|
+
|
|
100
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
101
|
+
that You distribute, all copyright, patent, trademark, and
|
|
102
|
+
attribution notices from the Source form of the Work,
|
|
103
|
+
excluding those notices that do not pertain to any part of
|
|
104
|
+
the Derivative Works; and
|
|
105
|
+
|
|
106
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
|
107
|
+
distribution, then any Derivative Works that You distribute must
|
|
108
|
+
include a readable copy of the attribution notices contained
|
|
109
|
+
within such NOTICE file, excluding those notices that do not
|
|
110
|
+
pertain to any part of the Derivative Works, in at least one
|
|
111
|
+
of the following places: within a NOTICE text file distributed
|
|
112
|
+
as part of the Derivative Works; within the Source form or
|
|
113
|
+
documentation, if provided along with the Derivative Works; or,
|
|
114
|
+
within a display generated by the Derivative Works, if and
|
|
115
|
+
wherever such third-party notices normally appear. The contents
|
|
116
|
+
of the NOTICE file are for informational purposes only and
|
|
117
|
+
do not modify the License. You may add Your own attribution
|
|
118
|
+
notices within Derivative Works that You distribute, alongside
|
|
119
|
+
or as an addendum to the NOTICE text from the Work, provided
|
|
120
|
+
that such additional attribution notices cannot be construed
|
|
121
|
+
as modifying the License.
|
|
122
|
+
|
|
123
|
+
You may add Your own copyright statement to Your modifications and
|
|
124
|
+
may provide additional or different license terms and conditions
|
|
125
|
+
for use, reproduction, or distribution of Your modifications, or
|
|
126
|
+
for any such Derivative Works as a whole, provided Your use,
|
|
127
|
+
reproduction, and distribution of the Work otherwise complies with
|
|
128
|
+
the conditions stated in this License.
|
|
129
|
+
|
|
130
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
131
|
+
any Contribution intentionally submitted for inclusion in the Work
|
|
132
|
+
by You to the Licensor shall be under the terms and conditions of
|
|
133
|
+
this License, without any additional terms or conditions.
|
|
134
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
|
135
|
+
the terms of any separate license agreement you may have executed
|
|
136
|
+
with Licensor regarding such Contributions.
|
|
137
|
+
|
|
138
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
139
|
+
names, trademarks, service marks, or product names of the Licensor,
|
|
140
|
+
except as required for describing the origin of the Work and
|
|
141
|
+
reproducing the content of the NOTICE file.
|
|
142
|
+
|
|
143
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
144
|
+
agreed to in writing, Licensor provides the Work (and each
|
|
145
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
146
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
147
|
+
implied, including, without limitation, any warranties or conditions
|
|
148
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
149
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
150
|
+
appropriateness of using or redistributing the Work and assume any
|
|
151
|
+
risks associated with Your exercise of permissions under this License.
|
|
152
|
+
|
|
153
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
|
154
|
+
whether in tort (including negligence), contract, or otherwise,
|
|
155
|
+
unless required by applicable law (such as deliberate and grossly
|
|
156
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
|
157
|
+
liable to You for damages, including any direct, indirect, special,
|
|
158
|
+
incidental, or consequential damages of any character arising as a
|
|
159
|
+
result of this License or out of the use or inability to use the
|
|
160
|
+
Work (including but not limited to damages for loss of goodwill,
|
|
161
|
+
work stoppage, computer failure or malfunction, or any and all
|
|
162
|
+
other commercial damages or losses), even if such Contributor
|
|
163
|
+
has been advised of the possibility of such damages.
|
|
164
|
+
|
|
165
|
+
9. Accepting Warranty or Support. While redistributing the Work or
|
|
166
|
+
Derivative Works thereof, You may accept support, warranty,
|
|
167
|
+
indemnity, or other liability obligations and/or rights consistent
|
|
168
|
+
with this License. However, in accepting such obligations, You may
|
|
169
|
+
act only on Your own behalf and on Your sole responsibility, not on
|
|
170
|
+
behalf of any other Contributor, and only if You agree to indemnify,
|
|
171
|
+
defend, and hold each Contributor harmless for any liability
|
|
172
|
+
incurred by, or claims asserted against, such Contributor by reason
|
|
173
|
+
of your accepting any such warranty or support.
|
|
174
|
+
|
|
175
|
+
END OF TERMS AND CONDITIONS
|
|
176
|
+
|
|
177
|
+
APPENDIX: How to apply the Apache License to your work.
|
|
178
|
+
|
|
179
|
+
To apply the Apache License to your work, attach the following
|
|
180
|
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
|
181
|
+
replaced with your own identifying information. (Don't include
|
|
182
|
+
the brackets!) The text should be enclosed in the appropriate
|
|
183
|
+
comment syntax for the file format. We also recommend that a
|
|
184
|
+
file or class name and description of purpose be included on the
|
|
185
|
+
same "printed page" as the copyright notice for easier
|
|
186
|
+
identification within third-party archives.
|
|
187
|
+
|
|
188
|
+
Copyright 2026 Vocence
|
|
189
|
+
|
|
190
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
191
|
+
you may not use this file except in compliance with the License.
|
|
192
|
+
You may obtain a copy of the License at
|
|
193
|
+
|
|
194
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
195
|
+
|
|
196
|
+
Unless required by applicable law or agreed to in writing, software
|
|
197
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
198
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
199
|
+
implied. See the License for the specific language governing permissions
|
|
200
|
+
and limitations under the License.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vocence-plugins
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Vocence voice plug-ins — drop custom-cloned voices and streaming speech recognition into your real-time voice agents.
|
|
5
|
+
Project-URL: Homepage, https://www.vocence.ai
|
|
6
|
+
Project-URL: Documentation, https://www.vocence.ai/docs/sdk-agents
|
|
7
|
+
Project-URL: Repository, https://github.com/concil859856/vocence-plugins
|
|
8
|
+
Project-URL: Issues, https://github.com/concil859856/vocence-plugins/issues
|
|
9
|
+
Author-email: Vocence <space@vocence.ai>
|
|
10
|
+
License-Expression: Apache-2.0
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: realtime,stt,tts,vocence,voice-agents,voice-cloning
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Requires-Python: >=3.11
|
|
24
|
+
Requires-Dist: aiohttp>=3.9
|
|
25
|
+
Requires-Dist: videosdk-agents>=0.1.0
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# vocence-plugins
|
|
29
|
+
|
|
30
|
+
Vocence voice plug-ins for real-time agent pipelines — drop in **Vocence custom voices** for TTS and **Vocence streaming speech recognition** for STT.
|
|
31
|
+
|
|
32
|
+
- **`VocenceTTS`** — streaming text-to-speech with the Vocence voice library (cloned, designed, and built-in speakers). One persistent connection per session, sub-second TTFB on warm connections, PCM16LE @ 24 kHz output.
|
|
33
|
+
- **`VocenceSTT`** — streaming speech-to-text with interim + final transcripts, optional speech / silence events for VAD integration, and language auto-detect.
|
|
34
|
+
|
|
35
|
+
> Status: 0.1.0 — public alpha.
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install vocence-plugins
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
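The plug-ins implement the TTS / STT abstract interfaces of [`videosdk-agents`](https://pypi.org/project/videosdk-agents/), which is installed automatically as a dependency, so they slot into any agent pipeline built on that framework.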
|
|
44
|
+
|
|
45
|
+
## API key
|
|
46
|
+
|
|
47
|
+
Get one at https://www.vocence.ai/account/developer. Requires the Premium plan.
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
export VOCENCE_API_KEY=voc_live_...
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Or pass it directly: `VocenceTTS(api_key="voc_live_...", voice="...")`.
|
|
54
|
+
|
|
55
|
+
## Quickstart
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from vocence_plugins import VocenceTTS, VocenceSTT
|
|
59
|
+
|
|
60
|
+
tts = VocenceTTS(voice="design-aria", language="English")
|
|
61
|
+
stt = VocenceSTT(language="English")
|
|
62
|
+
|
|
63
|
+
# Wire into your agent pipeline as the TTS / STT components.
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
The plug-ins handle the WebSocket lifecycle, reconnection, and audio framing — your code just sees text in and audio out (TTS), or audio in and transcript events out (STT).
|
|
67
|
+
|
|
68
|
+
## Plugin reference
|
|
69
|
+
|
|
70
|
+
### `VocenceTTS(*, api_key=None, voice, language=None, base_url=...)`
|
|
71
|
+
|
|
72
|
+
Streaming TTS over the Vocence voice service. One WebSocket reused across many `synthesize()` calls in the same session, closed on `aclose()`.
|
|
73
|
+
|
|
74
|
+
| Arg | Default | Notes |
|
|
75
|
+
|---|---|---|
|
|
76
|
+
| `api_key` | `VOCENCE_API_KEY` env | Required (`voc_live_…`). |
|
|
77
|
+
| `voice` | — | Required. Either a built-in slug (`"design-aria"`, `"design-jasper"`, …) or the numeric id of a saved designed / cloned voice. |
|
|
78
|
+
| `language` | `None` | Optional hint sent on every speak. |
|
|
79
|
+
| `base_url` | `https://api.vocence.ai` | Override for staging / self-hosted. |
|
|
80
|
+
|
|
81
|
+
Audio output: PCM16LE @ 24 kHz, mono.
|
|
82
|
+
|
|
83
|
+
### `VocenceSTT(*, api_key=None, language="auto", sample_rate=16000, enable_partials=True, vad_events=True, base_url=...)`
|
|
84
|
+
|
|
85
|
+
Streaming STT. Lazy-opens a WebSocket on the first audio frame, runs a background reader that translates events into the framework's standard transcript event shape (interim, final, speech-start, speech-end).
|
|
86
|
+
|
|
87
|
+
| Arg | Default | Notes |
|
|
88
|
+
|---|---|---|
|
|
89
|
+
| `api_key` | `VOCENCE_API_KEY` env | Required. |
|
|
90
|
+
| `language` | `"auto"` | ISO-639-1 (`"en"`), full name (`"English"`), or `"auto"`. Normalized to ISO before send. |
|
|
91
|
+
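| `sample_rate` | `16000` | Sample rate (Hz) of the PCM16LE mono input. Only 16 kHz is accepted today; the argument is kept for forward-compatibility. |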
|
|
92
|
+
| `enable_partials` | `True` | Stream interim hypotheses as the model refines. |
|
|
93
|
+
| `vad_events` | `True` | Emit speech-start / silence events from the internal VAD. |
|
|
94
|
+
|
|
95
|
+
## Compared to the Vocence Python SDK
|
|
96
|
+
|
|
97
|
+
| Use case | Use |
|
|
98
|
+
|---|---|
|
|
99
|
+
| Talk to a Vocence-hosted voice agent (REST + WebSocket to our service) | [`vocence`](https://pypi.org/project/vocence/) |
|
|
100
|
+
| Build your own agent pipeline with Vocence voices + recognition | This package |
|
|
101
|
+
|
|
102
|
+
The two don't overlap — different products for different use cases. Both authenticate with the same `voc_live_…` key.
|
|
103
|
+
|
|
104
|
+
## License
|
|
105
|
+
|
|
106
|
+
Apache-2.0.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# vocence-plugins
|
|
2
|
+
|
|
3
|
+
Vocence voice plug-ins for real-time agent pipelines — drop in **Vocence custom voices** for TTS and **Vocence streaming speech recognition** for STT.
|
|
4
|
+
|
|
5
|
+
- **`VocenceTTS`** — streaming text-to-speech with the Vocence voice library (cloned, designed, and built-in speakers). One persistent connection per session, sub-second TTFB on warm connections, PCM16LE @ 24 kHz output.
|
|
6
|
+
- **`VocenceSTT`** — streaming speech-to-text with interim + final transcripts, optional speech / silence events for VAD integration, and language auto-detect.
|
|
7
|
+
|
|
8
|
+
> Status: 0.1.0 — public alpha.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install vocence-plugins
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
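The plug-ins implement the TTS / STT abstract interfaces of [`videosdk-agents`](https://pypi.org/project/videosdk-agents/), which is installed automatically as a dependency, so they slot into any agent pipeline built on that framework.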
|
|
17
|
+
|
|
18
|
+
## API key
|
|
19
|
+
|
|
20
|
+
Get one at https://www.vocence.ai/account/developer. Requires the Premium plan.
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
export VOCENCE_API_KEY=voc_live_...
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Or pass it directly: `VocenceTTS(api_key="voc_live_...", voice="...")`.
|
|
27
|
+
|
|
28
|
+
## Quickstart
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from vocence_plugins import VocenceTTS, VocenceSTT
|
|
32
|
+
|
|
33
|
+
tts = VocenceTTS(voice="design-aria", language="English")
|
|
34
|
+
stt = VocenceSTT(language="English")
|
|
35
|
+
|
|
36
|
+
# Wire into your agent pipeline as the TTS / STT components.
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
The plug-ins handle the WebSocket lifecycle, reconnection, and audio framing — your code just sees text in and audio out (TTS), or audio in and transcript events out (STT).
|
|
40
|
+
|
|
41
|
+
## Plugin reference
|
|
42
|
+
|
|
43
|
+
### `VocenceTTS(*, api_key=None, voice, language=None, base_url=...)`
|
|
44
|
+
|
|
45
|
+
Streaming TTS over the Vocence voice service. One WebSocket reused across many `synthesize()` calls in the same session, closed on `aclose()`.
|
|
46
|
+
|
|
47
|
+
| Arg | Default | Notes |
|
|
48
|
+
|---|---|---|
|
|
49
|
+
| `api_key` | `VOCENCE_API_KEY` env | Required (`voc_live_…`). |
|
|
50
|
+
| `voice` | — | Required. Either a built-in slug (`"design-aria"`, `"design-jasper"`, …) or the numeric id of a saved designed / cloned voice. |
|
|
51
|
+
| `language` | `None` | Optional hint sent on every speak. |
|
|
52
|
+
| `base_url` | `https://api.vocence.ai` | Override for staging / self-hosted. |
|
|
53
|
+
|
|
54
|
+
Audio output: PCM16LE @ 24 kHz, mono.
|
|
55
|
+
|
|
56
|
+
### `VocenceSTT(*, api_key=None, language="auto", sample_rate=16000, enable_partials=True, vad_events=True, base_url=...)`
|
|
57
|
+
|
|
58
|
+
Streaming STT. Lazy-opens a WebSocket on the first audio frame, runs a background reader that translates events into the framework's standard transcript event shape (interim, final, speech-start, speech-end).
|
|
59
|
+
|
|
60
|
+
| Arg | Default | Notes |
|
|
61
|
+
|---|---|---|
|
|
62
|
+
| `api_key` | `VOCENCE_API_KEY` env | Required. |
|
|
63
|
+
| `language` | `"auto"` | ISO-639-1 (`"en"`), full name (`"English"`), or `"auto"`. Normalized to ISO before send. |
|
|
64
|
+
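| `sample_rate` | `16000` | Sample rate (Hz) of the PCM16LE mono input. Only 16 kHz is accepted today; the argument is kept for forward-compatibility. |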
|
|
65
|
+
| `enable_partials` | `True` | Stream interim hypotheses as the model refines. |
|
|
66
|
+
| `vad_events` | `True` | Emit speech-start / silence events from the internal VAD. |
|
|
67
|
+
|
|
68
|
+
## Compared to the Vocence Python SDK
|
|
69
|
+
|
|
70
|
+
| Use case | Use |
|
|
71
|
+
|---|---|
|
|
72
|
+
| Talk to a Vocence-hosted voice agent (REST + WebSocket to our service) | [`vocence`](https://pypi.org/project/vocence/) |
|
|
73
|
+
| Build your own agent pipeline with Vocence voices + recognition | This package |
|
|
74
|
+
|
|
75
|
+
The two don't overlap — different products for different use cases. Both authenticate with the same `voc_live_…` key.
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
Apache-2.0.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.21"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vocence-plugins"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Vocence voice plug-ins — drop custom-cloned voices and streaming speech recognition into your real-time voice agents."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [{ name = "Vocence", email = "space@vocence.ai" }]
|
|
13
|
+
keywords = ["vocence", "voice-agents", "tts", "stt", "voice-cloning", "realtime"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: Apache Software License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"aiohttp>=3.9",
|
|
28
|
+
# The plug-ins implement the abstract interfaces of a compatible
|
|
29
|
+
# real-time agent framework. Required at runtime.
|
|
30
|
+
"videosdk-agents>=0.1.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://www.vocence.ai"
|
|
35
|
+
Documentation = "https://www.vocence.ai/docs/sdk-agents"
|
|
36
|
+
Repository = "https://github.com/concil859856/vocence-plugins"
|
|
37
|
+
Issues = "https://github.com/concil859856/vocence-plugins/issues"
|
|
38
|
+
|
|
39
|
+
[tool.hatch.version]
|
|
40
|
+
path = "src/vocence_plugins/__init__.py"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["src/vocence_plugins"]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Vocence voice plug-ins.
|
|
2
|
+
|
|
3
|
+
Drop-in components that bring **Vocence custom voices** and
|
|
4
|
+
**Vocence streaming speech recognition** into your real-time voice
|
|
5
|
+
agent pipeline. Authenticated with the standard Vocence
|
|
6
|
+
``voc_live_…`` developer key — the same one you use across the rest
|
|
7
|
+
of the Vocence platform.
|
|
8
|
+
|
|
9
|
+
The headline component is :class:`VocenceTTS` — streaming
|
|
10
|
+
text-to-speech with the Vocence voice library: cloned voices,
|
|
11
|
+
designed voices, and the built-in speaker catalog. :class:`VocenceSTT`
|
|
12
|
+
streams audio in and transcripts out.
|
|
13
|
+
|
|
14
|
+
Example
|
|
15
|
+
-------
|
|
16
|
+
|
|
17
|
+
>>> from vocence_plugins import VocenceTTS, VocenceSTT
|
|
18
|
+
>>>
|
|
19
|
+
>>> tts = VocenceTTS(api_key="voc_live_...", voice="design-aria")
|
|
20
|
+
>>> stt = VocenceSTT(api_key="voc_live_...", language="English")
|
|
21
|
+
|
|
22
|
+
Plug them into the agent framework of your choice — both classes
|
|
23
|
+
conform to the standard TTS / STT abstract interfaces.
|
|
24
|
+
|
|
25
|
+
See https://www.vocence.ai/docs/sdk-agents for the full guide.
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
__version__ = "0.1.0"
|
|
30
|
+
|
|
31
|
+
from .tts import VocenceTTS
|
|
32
|
+
from .stt import VocenceSTT
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"VocenceTTS",
|
|
36
|
+
"VocenceSTT",
|
|
37
|
+
]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Language-name → ISO-639-1 mapping for the Vocence STT pod.
|
|
2
|
+
|
|
3
|
+
The pod's wire protocol expects an ISO code (or ``"auto"``); full
|
|
4
|
+
names like ``"English"`` silently degrade to auto-detect, which then
|
|
5
|
+
mis-classifies short utterances. Mirrors the helper used inside the
|
|
6
|
+
Vocence backend so plugin users get the same behavior the hosted
|
|
7
|
+
service does.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
_NAME_TO_ISO = {
|
|
14
|
+
"English": "en",
|
|
15
|
+
"Chinese": "zh",
|
|
16
|
+
"Japanese": "ja",
|
|
17
|
+
"Korean": "ko",
|
|
18
|
+
"German": "de",
|
|
19
|
+
"French": "fr",
|
|
20
|
+
"Russian": "ru",
|
|
21
|
+
"Portuguese": "pt",
|
|
22
|
+
"Spanish": "es",
|
|
23
|
+
"Italian": "it",
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def to_iso_639_1(language: Optional[str]) -> str:
|
|
28
|
+
"""Resolve any input form to the ISO-639-1 code the STT pod wants.
|
|
29
|
+
|
|
30
|
+
Accepts agent-config full names (``"English"``), already-ISO codes
|
|
31
|
+
(``"en"``), and the sentinel ``"auto"`` / ``None``. Falls back to
|
|
32
|
+
``"auto"`` for unknown input so the pod is never sent a string it
|
|
33
|
+
doesn't understand.
|
|
34
|
+
"""
|
|
35
|
+
if not language:
|
|
36
|
+
return "auto"
|
|
37
|
+
s = language.strip()
|
|
38
|
+
if s.lower() == "auto":
|
|
39
|
+
return "auto"
|
|
40
|
+
if s in _NAME_TO_ISO:
|
|
41
|
+
return _NAME_TO_ISO[s]
|
|
42
|
+
if len(s) == 2 and s.isalpha():
|
|
43
|
+
return s.lower()
|
|
44
|
+
return "auto"
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"""VocenceSTT — streaming speech-to-text with Vocence recognition.
|
|
2
|
+
|
|
3
|
+
Conforms to the standard STT abstract interface used by real-time
|
|
4
|
+
agent pipelines, so it slots in alongside any compatible
|
|
5
|
+
``Pipeline(stt=...)``. The plug-in handles connection lifecycle,
|
|
6
|
+
audio framing, and event translation — callers just see audio in
|
|
7
|
+
and standard transcript events out.
|
|
8
|
+
|
|
9
|
+
Audio input: PCM16LE @ 16 kHz, mono. One persistent connection is
|
|
10
|
+
lazily opened on the first audio frame and torn down on
|
|
11
|
+
``aclose()``. A background reader task translates incoming events
|
|
12
|
+
into the framework's standard transcript event shape.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
from typing import Any, Optional
|
|
21
|
+
from urllib.parse import urlparse
|
|
22
|
+
|
|
23
|
+
import aiohttp
|
|
24
|
+
|
|
25
|
+
from videosdk.agents import ( # type: ignore[import-not-found]
|
|
26
|
+
STT,
|
|
27
|
+
STTResponse,
|
|
28
|
+
SpeechData,
|
|
29
|
+
SpeechEventType,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
from ._lang import to_iso_639_1
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
_DEFAULT_BASE_URL = os.environ.get("VOCENCE_BASE_URL", "https://api.vocence.ai")
|
|
38
|
+
_DEFAULT_SAMPLE_RATE = 16_000
|
|
39
|
+
_DEFAULT_TIMEOUT_SEC = 30.0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _ws_url_from_base(base_url: str) -> str:
|
|
43
|
+
"""Translate ``https://api.vocence.ai`` → ``wss://api.vocence.ai/v1/stt/stream``."""
|
|
44
|
+
parsed = urlparse(base_url.rstrip("/"))
|
|
45
|
+
scheme = "wss" if parsed.scheme == "https" else "ws"
|
|
46
|
+
return f"{scheme}://{parsed.netloc}{parsed.path}/v1/stt/stream"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class VocenceSTT(STT):
    """Streaming STT plugin backed by the Vocence recognition service.

    Parameters
    ----------
    api_key:
        Vocence developer key (``voc_live_…``). Falls back to the
        ``VOCENCE_API_KEY`` env var. Required.
    language:
        Spoken language. Accepts ISO-639-1 codes (``"en"``), full
        names (``"English"``), or ``"auto"`` for auto-detect.
        Default ``"auto"``. Normalized to ISO upfront to avoid
        silent degradation to auto-detect for unrecognized forms.
    sample_rate:
        Audio sample rate. Default 16 kHz (mono PCM16LE). Only
        16 kHz is accepted today; argument is kept for
        forward-compatibility.
    enable_partials:
        Stream interim hypotheses as recognition refines. Default
        ``True`` — needed for any responsive UI; disable only for
        batch / archival use cases that just want finals.
    vad_events:
        Emit speech-start / silence events from the internal VAD so
        the orchestrator can drive its interrupt / speech_started
        hooks off these. Default ``True``. Independent of any
        external VAD plugin you wire alongside.
    base_url:
        Override the default ``https://api.vocence.ai``.
    forward_interim_transcripts:
        Whether to surface interim text to the user UI (passed
        through to the standard STT base initializer).
    """

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        language: str = "auto",
        sample_rate: int = _DEFAULT_SAMPLE_RATE,
        enable_partials: bool = True,
        vad_events: bool = True,
        base_url: str = _DEFAULT_BASE_URL,
        forward_interim_transcripts: bool = False,
        **kwargs: Any,
    ) -> None:
        """Store the configuration; no network I/O happens here.

        Raises
        ------
        ValueError
            If no API key is passed and ``VOCENCE_API_KEY`` is unset/empty.
        """
        super().__init__(forward_interim_transcripts=forward_interim_transcripts)
        self.api_key = api_key or os.environ.get("VOCENCE_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Vocence API key required — pass api_key= or set VOCENCE_API_KEY"
            )
        self.language = language
        self.sample_rate = sample_rate
        self.enable_partials = enable_partials
        self.vad_events = vad_events
        self.base_url = base_url
        self._ws_url = _ws_url_from_base(base_url)

        # Connection state: created lazily by _ensure_connection() and
        # released by _teardown_ws() / aclose().
        self._session: aiohttp.ClientSession | None = None
        self._ws: aiohttp.ClientWebSocketResponse | None = None
        self._reader_task: asyncio.Task | None = None
        self._connect_lock = asyncio.Lock()
        self._closed = False

    # ----- abstract overrides ---------------------------------------------

    async def process_audio(
        self,
        audio_frames: bytes,
        language: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Send one PCM16LE frame to the pod. Lazy-opens the WS on
        first call. The framework calls this once per audio frame
        (typically 20-40 ms), so the hot path is just a bytes send.

        ``language`` is accepted for API symmetry but ignored once
        the connection is established — the pod's ``start`` frame
        binds the language for the session. Construct a new
        ``VocenceSTT`` for a different language.

        Frames are dropped silently after ``aclose()``. A failed send is
        logged and tears the connection down so the next frame reconnects.
        """
        if self._closed:
            return
        ws = self._ws
        if ws is None or ws.closed:
            # Reconnect both on first use and after the peer closed the
            # socket; a closed-but-not-None socket would otherwise make
            # every later frame a silent no-op.
            await self._ensure_connection()
            ws = self._ws
        if ws is None or ws.closed:
            return
        try:
            await ws.send_bytes(audio_frames)
        except Exception as exc:  # noqa: BLE001
            logger.warning("VocenceSTT send_bytes failed: %s", exc)
            await self._teardown_ws()

    async def flush(self) -> None:
        """Ask the pod to finalize its current partial as soon as
        possible. Useful at end-of-utterance when the orchestrator
        knows the turn is over but the pod hasn't auto-emitted a
        final yet.

        No-op when there is no open connection; send failures are only
        logged at debug level.
        """
        ws = self._ws
        if ws is None or ws.closed:
            return
        try:
            await ws.send_str(json.dumps({"type": "commit"}))
        except Exception as exc:  # noqa: BLE001
            logger.debug("VocenceSTT commit failed: %s", exc)

    async def aclose(self) -> None:
        """Close the WebSocket + HTTP session. Idempotent."""
        self._closed = True
        if self._reader_task is not None and not self._reader_task.done():
            self._reader_task.cancel()
            try:
                await self._reader_task
            except (asyncio.CancelledError, Exception):
                pass
        self._reader_task = None
        await self._teardown_ws()

    # ----- internals ------------------------------------------------------

    async def _ensure_connection(self) -> None:
        """Open the WS, send ``start``, wait for ``ready``, kick the reader.

        Returns immediately when an open socket already exists. If the
        connect or the handshake fails, the socket and session are torn
        down before the error propagates, so a rejected connection is
        never left installed for later frames to be sent into.
        """
        async with self._connect_lock:
            if self._ws is not None and not self._ws.closed:
                return
            if self._session is None or self._session.closed:
                self._session = aiohttp.ClientSession(
                    timeout=aiohttp.ClientTimeout(total=_DEFAULT_TIMEOUT_SEC),
                )
            headers = {"Authorization": f"Bearer {self.api_key}"}
            try:
                self._ws = await self._session.ws_connect(
                    self._ws_url, headers=headers
                )
                await self._handshake(self._ws)
            except BaseException:
                # Includes cancellation: never leave a half-open socket.
                await self._teardown_ws()
                raise
            self._reader_task = asyncio.create_task(
                self._read_loop(), name="vocence_stt_reader"
            )

    async def _handshake(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        """Send the ``start`` frame on *ws* and require a ``ready`` reply.

        Raises
        ------
        RuntimeError
            If the first frame is not a text frame, is not a JSON object,
            is an ``error`` event, or is anything other than ``ready``.
        """
        start_payload = {
            "type": "start",
            "language": to_iso_639_1(self.language),
            "sample_rate": self.sample_rate,
            "encoding": "pcm_s16le",
            "enable_partials": self.enable_partials,
            "vad_events": self.vad_events,
        }
        await ws.send_str(json.dumps(start_payload))
        ready = await ws.receive(timeout=_DEFAULT_TIMEOUT_SEC)
        if ready.type != aiohttp.WSMsgType.TEXT:
            raise RuntimeError(
                f"VocenceSTT: expected ready frame, got {ready.type}"
            )
        data = json.loads(ready.data)
        if not isinstance(data, dict):
            raise RuntimeError(
                f"VocenceSTT: unexpected first frame {data!r}"
            )
        mtype = str(data.get("type") or "").lower()
        if mtype == "error":
            raise RuntimeError(
                f"VocenceSTT connect rejected: "
                f"{data.get('code')}: {data.get('message')}"
            )
        if mtype != "ready":
            raise RuntimeError(
                f"VocenceSTT: unexpected first frame {mtype!r}"
            )

    async def _read_loop(self) -> None:
        """Background task: translate pod events → STTResponse callbacks."""
        ws = self._ws
        if ws is None:
            return
        try:
            async for msg in ws:
                if msg.type != aiohttp.WSMsgType.TEXT:
                    if msg.type in (
                        aiohttp.WSMsgType.CLOSED,
                        aiohttp.WSMsgType.CLOSE,
                        aiohttp.WSMsgType.ERROR,
                    ):
                        return
                    continue
                try:
                    data = json.loads(msg.data)
                except json.JSONDecodeError:
                    # Malformed frame: skip it, keep the stream alive.
                    continue
                response = self._translate_event(data)
                if response is None:
                    continue
                cb = self._transcript_callback
                if cb is None:
                    continue
                try:
                    await cb(response)
                except Exception as exc:  # noqa: BLE001
                    # A failing consumer must not kill the reader.
                    logger.warning("VocenceSTT callback raised: %s", exc)
        except asyncio.CancelledError:
            return
        except Exception as exc:  # noqa: BLE001
            logger.warning("VocenceSTT reader loop crashed: %s", exc)
            self.emit("error", str(exc))

    def _translate_event(self, data: dict) -> STTResponse | None:
        """Map one pod event to the framework's STTResponse shape.

        Returns ``None`` for events we drop (errors are logged via
        EventEmitter instead of being passed to the orchestrator).
        Payloads that are not JSON objects are dropped as well.
        """
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, number, ...) carries
            # no event; dropping it keeps the reader loop running.
            return None
        mtype = str(data.get("type") or "").lower()
        if mtype == "partial":
            text = (data.get("text") or "").strip()
            if not text:
                return None
            return STTResponse(
                event_type=SpeechEventType.INTERIM,
                data=SpeechData(text=text, language=self.language),
            )
        if mtype == "final":
            text = (data.get("text") or "").strip()
            if not text:
                return None
            return STTResponse(
                event_type=SpeechEventType.FINAL,
                data=SpeechData(
                    text=text,
                    language=data.get("language_detected") or self.language,
                ),
            )
        if mtype == "vad_speech":
            return STTResponse(
                event_type=SpeechEventType.START,
                data=SpeechData(text=""),
            )
        if mtype == "vad_silence":
            return STTResponse(
                event_type=SpeechEventType.END,
                data=SpeechData(text=""),
            )
        if mtype == "error":
            logger.warning(
                "VocenceSTT pod error: %s: %s",
                data.get("code"), data.get("message"),
            )
            self.emit("error", str(data.get("message") or data.get("code")))
            return None
        # ready / pong / unknown — drop silently
        return None

    async def _teardown_ws(self) -> None:
        """Best-effort close of the WebSocket and the HTTP session.

        Sends a ``close`` frame and closes with code 1000 when the socket
        is still open; failures along the way are suppressed. Both
        ``self._ws`` and ``self._session`` end up ``None``.
        """
        ws = self._ws
        if ws is not None and not ws.closed:
            with _suppress():
                await ws.send_str(json.dumps({"type": "close"}))
            with _suppress():
                await ws.close(code=1000)
        self._ws = None
        if self._session is not None and not self._session.closed:
            with _suppress():
                await self._session.close()
        self._session = None
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class _suppress:
|
|
306
|
+
"""Inline contextlib.suppress(Exception) — keeps the module's
|
|
307
|
+
explicit import surface small."""
|
|
308
|
+
|
|
309
|
+
def __enter__(self) -> None:
|
|
310
|
+
return None
|
|
311
|
+
|
|
312
|
+
def __exit__(self, exc_type, exc, tb) -> bool:
|
|
313
|
+
return exc_type is not None and issubclass(exc_type, Exception)
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""VocenceTTS — streaming text-to-speech with the Vocence voice library.
|
|
2
|
+
|
|
3
|
+
Conforms to the standard TTS abstract interface used by real-time
|
|
4
|
+
agent pipelines, so it slots in alongside any compatible
|
|
5
|
+
``Pipeline(tts=...)``. The plug-in handles all of the network
|
|
6
|
+
plumbing — connection lifecycle, audio framing, and reconnection —
|
|
7
|
+
so callers just see text in and audio out.
|
|
8
|
+
|
|
9
|
+
Audio output: PCM16LE @ 24 kHz, mono. One persistent connection is
|
|
10
|
+
reused across many ``synthesize()`` calls in the same session;
|
|
11
|
+
lazily opened on the first call and torn down on ``aclose()``.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
from typing import Any, AsyncIterator, Optional, Union
|
|
20
|
+
from urllib.parse import urlparse
|
|
21
|
+
|
|
22
|
+
import aiohttp
|
|
23
|
+
|
|
24
|
+
from videosdk.agents import TTS, FlushMarker # type: ignore[import-not-found]
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_DEFAULT_BASE_URL = os.environ.get("VOCENCE_BASE_URL", "https://api.vocence.ai")
|
|
30
|
+
_DEFAULT_SAMPLE_RATE = 24_000
|
|
31
|
+
_DEFAULT_CHANNELS = 1
|
|
32
|
+
_DEFAULT_TIMEOUT_SEC = 30.0
|
|
33
|
+
# Per-speak text cap. Mirrors the dev-API limit so the server doesn't
|
|
34
|
+
# reject mid-stream. Plugin users sending longer text should split
|
|
35
|
+
# their input — the framework's sentence chunker already handles this when
|
|
36
|
+
# wired in front of the plugin.
|
|
37
|
+
_MAX_TEXT_CHARS_PER_SPEAK = 4000
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ws_url_from_base(base_url: str, voice_id: str) -> str:
|
|
41
|
+
"""Translate ``https://api.vocence.ai`` → ``wss://api.vocence.ai/...``."""
|
|
42
|
+
parsed = urlparse(base_url.rstrip("/"))
|
|
43
|
+
scheme = "wss" if parsed.scheme == "https" else "ws"
|
|
44
|
+
return f"{scheme}://{parsed.netloc}{parsed.path}/v1/voices/{voice_id}/stream"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class VocenceTTS(TTS):
    """Streaming TTS plugin backed by the Vocence voice service.

    Parameters
    ----------
    api_key:
        Vocence developer key (``voc_live_…``). Falls back to the
        ``VOCENCE_API_KEY`` env var. Required.
    voice:
        Voice slug (built-in like ``"design-aria"``) or the numeric id
        of a saved designed / cloned voice. Required at construction
        because the WS endpoint is voice-scoped.
    language:
        Optional language hint passed on every ``speak`` frame.
    base_url:
        Override the default ``https://api.vocence.ai`` (set for
        staging / self-hosted deployments).
    """

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        voice: Union[str, int],
        language: Optional[str] = None,
        base_url: str = _DEFAULT_BASE_URL,
        sample_rate: int = _DEFAULT_SAMPLE_RATE,
        **kwargs: Any,
    ) -> None:
        """Store the configuration; no network I/O happens here.

        Raises
        ------
        ValueError
            If no API key is passed and ``VOCENCE_API_KEY`` is unset/empty.
        """
        super().__init__(sample_rate=sample_rate, num_channels=_DEFAULT_CHANNELS)
        self.api_key = api_key or os.environ.get("VOCENCE_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Vocence API key required — pass api_key= or set VOCENCE_API_KEY"
            )
        self.voice = str(voice)
        self.language = language
        self.base_url = base_url
        self._ws_url = _ws_url_from_base(base_url, self.voice)

        self._session: aiohttp.ClientSession | None = None
        self._ws: aiohttp.ClientWebSocketResponse | None = None
        self._connect_lock = asyncio.Lock()
        # Per-speak state: cleared at the start of each synthesize() call.
        # The receiver loop runs INLINE with the speak request so we can
        # cancel cleanly on interrupt without a separate task.
        self._interrupted = False
        self._first_chunk_sent = False

    # ----- abstract overrides ---------------------------------------------

    async def synthesize(
        self,
        text: AsyncIterator[Union[str, FlushMarker]] | str,
        voice_id: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Send text → push PCM frames to ``self.audio_track``.

        ``text`` may be a plain string OR an async iterator of strings
        (with optional ``FlushMarker`` segment boundaries). The async
        iterator path lets the pipeline pump LLM tokens directly here
        instead of waiting for the full reply. We don't yet support
        the WebSocket re-flush semantics that some upstream sentence
        chunkers rely on, so we concatenate iterator chunks up to a
        flush / end-of-stream marker and send them as one ``speak``
        frame each. This produces good audio with one network
        round-trip per sentence.

        ``voice_id`` is accepted for API symmetry but ignored — the
        endpoint is voice-scoped at the WS URL, so changing voice
        mid-session would require a fresh connection. Construct a new
        ``VocenceTTS`` for a different voice.
        """
        self._interrupted = False
        self._first_chunk_sent = False
        await self._ensure_connection()
        if isinstance(text, str):
            await self._speak_once(text)
            return
        # Async iterator path — collect into segments separated by FlushMarker
        # (or end-of-iterator), send each segment as one speak.
        buf: list[str] = []
        async for chunk in text:
            if self._interrupted:
                return
            if isinstance(chunk, FlushMarker):
                segment = "".join(buf).strip()
                buf = []
                if segment:
                    await self._speak_once(segment)
                    if self._interrupted:
                        return
                continue
            if chunk:
                buf.append(chunk)
        tail = "".join(buf).strip()
        if tail and not self._interrupted:
            await self._speak_once(tail)

    async def interrupt(self) -> None:
        """Stop the in-flight ``synthesize()`` ASAP. Doesn't close the
        WebSocket — the connection stays warm for the next call."""
        # The receiver loop checks self._interrupted between chunks and
        # bails out. No control frame is sent here.
        # NOTE(review): audio already queued for the interrupted speak is
        # not drained — confirm the pod discards it on the next ``speak``.
        self._interrupted = True

    async def aclose(self) -> None:
        """Tear down the WebSocket + HTTP session. Idempotent."""
        # Make any in-flight synthesize() bail out rather than reopen the
        # connection that is being closed here.
        self._interrupted = True
        ws = self._ws
        if ws is not None and not ws.closed:
            with _suppress():
                await ws.send_str(json.dumps({"type": "stop"}))
            with _suppress():
                await ws.close(code=1000)
        self._ws = None
        if self._session is not None and not self._session.closed:
            with _suppress():
                await self._session.close()
        self._session = None

    def reset_first_audio_tracking(self) -> None:
        """Re-arm the first-audio callback for the next audio chunk."""
        self._first_chunk_sent = False

    # ----- internals ------------------------------------------------------

    async def _ensure_connection(self) -> None:
        """Open the WS if not already open; reopen if it died.

        If the connect or the ``ready`` handshake fails, the socket and
        session are closed before the error propagates, so a rejected
        connection is never left installed for later ``speak`` frames.

        Raises
        ------
        RuntimeError
            If the first frame is not a text frame, is not a JSON object,
            is an ``error`` event, or is anything other than ``ready``.
        """
        async with self._connect_lock:
            if self._ws is not None and not self._ws.closed:
                return
            if self._session is None or self._session.closed:
                self._session = aiohttp.ClientSession(
                    timeout=aiohttp.ClientTimeout(total=_DEFAULT_TIMEOUT_SEC),
                )
            headers = {"Authorization": f"Bearer {self.api_key}"}
            ws: aiohttp.ClientWebSocketResponse | None = None
            try:
                ws = await self._session.ws_connect(self._ws_url, headers=headers)
                # First message from the server is ``ready``. Wait for it so
                # subsequent ``speak`` frames aren't sent into a half-open
                # connection. Surface any auth / not-found error here.
                msg = await ws.receive(timeout=_DEFAULT_TIMEOUT_SEC)
                if msg.type != aiohttp.WSMsgType.TEXT:
                    raise RuntimeError(
                        f"VocenceTTS: expected text ready frame, got {msg.type}"
                    )
                data = json.loads(msg.data)
                if not isinstance(data, dict):
                    raise RuntimeError(
                        f"VocenceTTS: unexpected first frame {data!r}"
                    )
                mtype = str(data.get("type") or "").lower()
                if mtype == "error":
                    raise RuntimeError(
                        f"VocenceTTS connect rejected: "
                        f"{data.get('code')}: {data.get('message')}"
                    )
                if mtype != "ready":
                    raise RuntimeError(
                        f"VocenceTTS: unexpected first frame {mtype!r}"
                    )
            except BaseException:
                # Includes cancellation: release the socket and session
                # instead of leaving an unacknowledged connection behind.
                if ws is not None and not ws.closed:
                    with _suppress():
                        await ws.close()
                if self._session is not None and not self._session.closed:
                    with _suppress():
                        await self._session.close()
                self._session = None
                self._ws = None
                raise
            # Only install the socket once the server has acknowledged it.
            self._ws = ws

    async def _speak_once(self, text: str) -> None:
        """Send one ``speak`` and drain audio frames until ``end``.

        Empty text is a no-op. Text longer than
        ``_MAX_TEXT_CHARS_PER_SPEAK`` is truncated with a warning. If the
        connection was lost since the previous segment it is reopened
        first, unless the call has been interrupted.
        """
        if not text:
            return
        if len(text) > _MAX_TEXT_CHARS_PER_SPEAK:
            # Truncate rather than fail: a sentence chunker in front
            # of us should keep segments well under the cap.
            logger.warning(
                "VocenceTTS: truncating %d-char segment to %d (cap)",
                len(text), _MAX_TEXT_CHARS_PER_SPEAK,
            )
            text = text[:_MAX_TEXT_CHARS_PER_SPEAK]
        ws = self._ws
        if ws is None or ws.closed:
            if self._interrupted:
                return
            # The drain loop clears self._ws when the peer closes; reopen
            # so later segments of the same synthesize() are still spoken.
            await self._ensure_connection()
            ws = self._ws
            if ws is None:
                return
        payload: dict[str, Any] = {"type": "speak", "text": text}
        if self.language:
            payload["language"] = self.language
        await ws.send_str(json.dumps(payload))
        await self._drain_until_end(ws)

    async def _drain_until_end(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        """Read frames until ``{"type":"end"}`` or interruption.

        Binary frames are forwarded to ``self.audio_track``; the first
        non-empty one also fires the first-audio callback. The loop also
        stops on a receive timeout, a pod ``error`` event, or when the
        socket closes (which clears ``self._ws``).
        """
        while True:
            if self._interrupted:
                return
            try:
                msg = await ws.receive(timeout=_DEFAULT_TIMEOUT_SEC)
            except asyncio.TimeoutError:
                logger.warning("VocenceTTS: receive timed out")
                return
            if msg.type == aiohttp.WSMsgType.BINARY:
                if not msg.data:
                    continue
                # First-byte callback for TTFB metrics — the pipeline
                # uses this to fire its ``first_audio_byte`` event.
                if not self._first_chunk_sent:
                    self._first_chunk_sent = True
                    if self._first_audio_callback is not None:
                        with _suppress():
                            await self._first_audio_callback()
                if self.audio_track is not None:
                    with _suppress():
                        await self.audio_track.add_new_bytes(msg.data)
                continue
            if msg.type == aiohttp.WSMsgType.TEXT:
                try:
                    data = json.loads(msg.data)
                except json.JSONDecodeError:
                    continue
                if not isinstance(data, dict):
                    # Valid JSON that is not an object carries no event.
                    continue
                mtype = str(data.get("type") or "").lower()
                if mtype == "end":
                    return
                if mtype == "meta":
                    # Sample rate / encoding info — we know our pod
                    # output is PCM16LE @ 24 kHz mono so don't need to
                    # do anything with this on the plugin side. The
                    # audio_track already expects this format
                    # because we set ``sample_rate=24000`` on the base
                    # class at __init__.
                    continue
                if mtype == "error":
                    logger.warning(
                        "VocenceTTS pod error: %s: %s",
                        data.get("code"), data.get("message"),
                    )
                    return
                continue
            if msg.type in (
                aiohttp.WSMsgType.CLOSED,
                aiohttp.WSMsgType.CLOSE,
                aiohttp.WSMsgType.ERROR,
            ):
                # Forget the dead socket so the next speak reconnects.
                self._ws = None
                return
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
class _suppress:
|
|
287
|
+
"""Tiny ``contextlib.suppress(Exception)`` clone — kept inline to
|
|
288
|
+
keep the module's import surface small."""
|
|
289
|
+
|
|
290
|
+
def __enter__(self) -> None:
|
|
291
|
+
return None
|
|
292
|
+
|
|
293
|
+
def __exit__(self, exc_type, exc, tb) -> bool:
|
|
294
|
+
return exc_type is not None and issubclass(exc_type, Exception)
|