livekit-plugins-speechmatics 0.0.2 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit_plugins_speechmatics-0.0.2/PKG-INFO +66 -0
- livekit_plugins_speechmatics-0.0.2/README.md +33 -0
- livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/__init__.py +32 -0
- livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/log.py +3 -0
- livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/py.typed +0 -0
- livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/stt.py +328 -0
- livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/types.py +141 -0
- livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/utils.py +57 -0
- livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/version.py +15 -0
- livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/PKG-INFO +66 -0
- livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/SOURCES.txt +14 -0
- livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/dependency_links.txt +1 -0
- livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/requires.txt +1 -0
- livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/top_level.txt +1 -0
- livekit_plugins_speechmatics-0.0.2/setup.cfg +4 -0
- livekit_plugins_speechmatics-0.0.2/setup.py +58 -0
livekit_plugins_speechmatics-0.0.2/PKG-INFO
@@ -0,0 +1,66 @@
+Metadata-Version: 2.2
+Name: livekit-plugins-speechmatics
+Version: 0.0.2
+Summary: Agent Framework plugin for Speechmatics
+Home-page: https://github.com/livekit/agents
+License: Apache-2.0
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Keywords: webrtc,realtime,audio,video,livekit
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.9.0
+Description-Content-Type: text/markdown
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# LiveKit Plugins Speechmatics
+
+Agent Framework plugin for Speechmatics.
+
+## Installation
+
+```bash
+pip install livekit-plugins-speechmatics
+```
+
+Usage:
+
+```python
+agent = VoicePipelineAgent(
+    stt=speechmatics.STT(),
+    turn_detector=turn_detector.EOUModel(),
+    min_endpointing_delay=0.5,
+    max_endpointing_delay=5.0,
+    ...
+)
+```
+
+Note: The plugin was built with
+LiveKit's [end-of-turn detection feature](https://github.com/livekit/agents#in-house-phrase-endpointing-model) in mind,
+and it doesn't implement phrase endpointing. `AddTranscript` and `AddPartialTranscript` events are emitted as soon
+as they're received from the Speechmatics STT engine. For the best user experience,
+we recommend running the agent with end-of-turn detection enabled
+(see [example](https://github.com/livekit-examples/voice-pipeline-agent-python/blob/main/agent.py)).
+
+## Pre-requisites
+
+You'll need a Speechmatics API key. It can be set as the `SPEECHMATICS_API_KEY` environment variable or
+in a `.env.local` file.
livekit_plugins_speechmatics-0.0.2/README.md
@@ -0,0 +1,33 @@
+# LiveKit Plugins Speechmatics
+
+Agent Framework plugin for Speechmatics.
+
+## Installation
+
+```bash
+pip install livekit-plugins-speechmatics
+```
+
+Usage:
+
+```python
+agent = VoicePipelineAgent(
+    stt=speechmatics.STT(),
+    turn_detector=turn_detector.EOUModel(),
+    min_endpointing_delay=0.5,
+    max_endpointing_delay=5.0,
+    ...
+)
+```
+
+Note: The plugin was built with
+LiveKit's [end-of-turn detection feature](https://github.com/livekit/agents#in-house-phrase-endpointing-model) in mind,
+and it doesn't implement phrase endpointing. `AddTranscript` and `AddPartialTranscript` events are emitted as soon
+as they're received from the Speechmatics STT engine. For the best user experience,
+we recommend running the agent with end-of-turn detection enabled
+(see [example](https://github.com/livekit-examples/voice-pipeline-agent-python/blob/main/agent.py)).
+
+## Pre-requisites
+
+You'll need a Speechmatics API key. It can be set as the `SPEECHMATICS_API_KEY` environment variable or
+in a `.env.local` file.
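The `STT()` call in the usage snippet above relies on the defaults defined in `stt.py` (shown later in this diff). Here is a minimal sketch of overriding them; the parameter and field names come straight from the `STT.__init__` signature and `types.py`, while the values are illustrative:

```python
from livekit.plugins import speechmatics
from livekit.plugins.speechmatics.types import (
    AudioSettings,
    ConnectionSettings,
    TranscriptionConfig,
)

stt = speechmatics.STT(
    transcription_config=TranscriptionConfig(
        language="en",
        operating_point="enhanced",
        enable_partials=True,  # emit AddPartialTranscript (interim) events
        max_delay=0.7,         # maximum acceptable transcript delay
    ),
    connection_settings=ConnectionSettings(
        url="wss://eu2.rt.speechmatics.com/v2",
    ),
    audio_settings=AudioSettings(),  # raw pcm_s16le at 16 kHz by default
)
```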
livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/__init__.py
@@ -0,0 +1,32 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .log import logger
+from .stt import STT, SpeechStream
+from .version import __version__
+
+__all__ = [
+    "STT",
+    "SpeechStream",
+    "logger",
+    "__version__",
+]
+
+from livekit.agents import Plugin
+
+
+class SpeechmaticsPlugin(Plugin):
+    def __init__(self):
+        super().__init__(__name__, __version__, __package__)
+
+
+Plugin.register_plugin(SpeechmaticsPlugin())
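Since `Plugin.register_plugin` runs at module import time, registering the plugin with the Agent Framework only requires importing the package; a minimal sketch:

```python
# Importing the package executes __init__.py, which instantiates
# SpeechmaticsPlugin and calls Plugin.register_plugin() as a side effect.
from livekit.plugins import speechmatics  # noqa: F401
```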
livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/py.typed
File without changes
livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/stt.py
@@ -0,0 +1,328 @@
+# Copyright 2025 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from __future__ import annotations
+
+import asyncio
+import dataclasses
+import json
+import os
+import weakref
+from typing import Dict, List, Optional
+
+import aiohttp
+from livekit.agents import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    APIConnectOptions,
+    APIStatusError,
+    stt,
+    utils,
+)
+from livekit.agents.utils import AudioBuffer
+
+from .log import logger
+from .types import (
+    AudioSettings,
+    ClientMessageType,
+    ConnectionSettings,
+    ServerMessageType,
+    TranscriptionConfig,
+)
+from .utils import get_access_token, sanitize_url
+
+
+class STT(stt.STT):
+    def __init__(
+        self,
+        *,
+        transcription_config: TranscriptionConfig = TranscriptionConfig(
+            language="en",
+            operating_point="enhanced",
+            enable_partials=True,
+            max_delay=0.7,
+        ),
+        connection_settings: ConnectionSettings = ConnectionSettings(
+            url="wss://eu2.rt.speechmatics.com/v2",
+        ),
+        audio_settings: AudioSettings = AudioSettings(),
+        http_session: Optional[aiohttp.ClientSession] = None,
+        extra_headers: Optional[Dict] = None,
+    ):
+        super().__init__(
+            capabilities=stt.STTCapabilities(
+                streaming=True,
+                interim_results=True,
+            ),
+        )
+        self._transcription_config = transcription_config
+        self._audio_settings = audio_settings
+        self._connection_settings = connection_settings
+        self._extra_headers = extra_headers or {}
+        self._session = http_session
+        self._streams = weakref.WeakSet[SpeechStream]()
+
+    @property
+    def session(self) -> aiohttp.ClientSession:
+        if not self._session:
+            self._session = utils.http_context.http_session()
+        return self._session
+
+    async def _recognize_impl(
+        self,
+        buffer: AudioBuffer,
+        *,
+        language: str | None,
+        conn_options: APIConnectOptions,
+    ) -> stt.SpeechEvent:
+        raise NotImplementedError("Not implemented")
+
+    def stream(
+        self,
+        *,
+        language: Optional[str] = None,
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+    ) -> "SpeechStream":
+        config = dataclasses.replace(self._audio_settings)
+        stream = SpeechStream(
+            stt=self,
+            transcription_config=self._transcription_config,
+            audio_settings=config,
+            connection_settings=self._connection_settings,
+            conn_options=conn_options,
+            http_session=self.session,
+            extra_headers=self._extra_headers,
+        )
+        self._streams.add(stream)
+        return stream
+
+
+class SpeechStream(stt.SpeechStream):
+    def __init__(
+        self,
+        *,
+        stt: STT,
+        transcription_config: TranscriptionConfig,
+        audio_settings: AudioSettings,
+        connection_settings: ConnectionSettings,
+        conn_options: APIConnectOptions,
+        http_session: aiohttp.ClientSession,
+        extra_headers: Optional[Dict] = None,
+    ) -> None:
+        super().__init__(
+            stt=stt, conn_options=conn_options, sample_rate=audio_settings.sample_rate
+        )
+        self._transcription_config = transcription_config
+        self._audio_settings = audio_settings
+        self._connection_settings = connection_settings
+        self._session = http_session
+        self._extra_headers = extra_headers or {}
+        self._speech_duration: float = 0
+
+        self._reconnect_event = asyncio.Event()
+        self._recognition_started = asyncio.Event()
+        self._seq_no = 0
+
+    async def _run(self):
+        closing_ws = False
+
+        async def send_task(ws: aiohttp.ClientWebSocketResponse):
+            nonlocal closing_ws
+
+            start_recognition_msg = {
+                "message": ClientMessageType.StartRecognition,
+                "audio_format": self._audio_settings.asdict(),
+                "transcription_config": self._transcription_config.asdict(),
+            }
+            await ws.send_str(json.dumps(start_recognition_msg))
+
+            await self._recognition_started.wait()
+
+            audio_bstream = utils.audio.AudioByteStream(
+                sample_rate=self._audio_settings.sample_rate,
+                num_channels=1,
+            )
+
+            async for data in self._input_ch:
+                if isinstance(data, self._FlushSentinel):
+                    frames = audio_bstream.flush()
+                else:
+                    frames = audio_bstream.write(data.data.tobytes())
+
+                for frame in frames:
+                    self._seq_no += 1
+                    self._speech_duration += frame.duration
+                    await ws.send_bytes(frame.data.tobytes())
+
+            closing_ws = True
+            await ws.send_str(
+                json.dumps(
+                    {
+                        "message": ClientMessageType.EndOfStream,
+                        "last_seq_no": self._seq_no,
+                    }
+                )
+            )
+
+        async def recv_task(ws: aiohttp.ClientWebSocketResponse):
+            nonlocal closing_ws
+            while True:
+                msg = await ws.receive()
+                if msg.type in (
+                    aiohttp.WSMsgType.CLOSED,
+                    aiohttp.WSMsgType.CLOSE,
+                    aiohttp.WSMsgType.CLOSING,
+                ):
+                    if closing_ws:  # close is expected, see SpeechStream.aclose
+                        return
+
+                    # this will trigger a reconnection, see the _run loop
+                    raise APIStatusError(
+                        message="Speechmatics connection closed unexpectedly"
+                    )
+
+                try:
+                    data = json.loads(msg.data)
+                    self._process_stream_event(data, closing_ws)
+                except Exception:
+                    logger.exception("failed to process Speechmatics message")
+
+        ws: aiohttp.ClientWebSocketResponse | None = None
+
+        while True:
+            try:
+                ws = await self._connect_ws()
+                tasks = [
+                    asyncio.create_task(send_task(ws)),
+                    asyncio.create_task(recv_task(ws)),
+                ]
+                wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
+
+                try:
+                    done, _ = await asyncio.wait(
+                        [asyncio.gather(*tasks), wait_reconnect_task],
+                        return_when=asyncio.FIRST_COMPLETED,
+                    )  # type: ignore
+                    for task in done:
+                        if task != wait_reconnect_task:
+                            task.result()
+
+                    if wait_reconnect_task not in done:
+                        break
+
+                    self._reconnect_event.clear()
+                finally:
+                    await utils.aio.gracefully_cancel(*tasks, wait_reconnect_task)
+            finally:
+                if ws is not None:
+                    await ws.close()
+
+    async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
+        api_key = self._connection_settings.api_key or os.environ.get(
+            "SPEECHMATICS_API_KEY"
+        )
+        if api_key is None:
+            raise ValueError(
+                "Speechmatics API key is required. "
+                "Pass one in via ConnectionSettings.api_key parameter, "
+                "or set `SPEECHMATICS_API_KEY` environment variable"
+            )
+        if self._connection_settings.get_access_token:
+            api_key = await get_access_token(api_key)
+        headers = {
+            "Authorization": f"Bearer {api_key}",
+            **self._extra_headers,
+        }
+        url = sanitize_url(
+            self._connection_settings.url, self._transcription_config.language
+        )
+        return await self._session.ws_connect(
+            url,
+            ssl=self._connection_settings.ssl_context,
+            headers=headers,
+        )
+
+    def _process_stream_event(self, data: dict, closing_ws: bool) -> None:
+        message_type = data["message"]
+
+        if message_type == ServerMessageType.RecognitionStarted:
+            self._recognition_started.set()
+
+        elif message_type == ServerMessageType.AddPartialTranscript:
+            alts = live_transcription_to_speech_data(data)
+            if len(alts) > 0 and alts[0].text:
+                interim_event = stt.SpeechEvent(
+                    type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                    alternatives=alts,
+                )
+                self._event_ch.send_nowait(interim_event)
+
+        elif message_type == ServerMessageType.AddTranscript:
+            alts = live_transcription_to_speech_data(data)
+            if len(alts) > 0 and alts[0].text:
+                final_event = stt.SpeechEvent(
+                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+                    alternatives=alts,
+                )
+                self._event_ch.send_nowait(final_event)
+
+            if self._speech_duration > 0:
+                usage_event = stt.SpeechEvent(
+                    type=stt.SpeechEventType.RECOGNITION_USAGE,
+                    alternatives=[],
+                    recognition_usage=stt.RecognitionUsage(
+                        audio_duration=self._speech_duration
+                    ),
+                )
+                self._event_ch.send_nowait(usage_event)
+                self._speech_duration = 0
+
+        elif message_type == ServerMessageType.EndOfTranscript:
+            if closing_ws:
+                pass
+            else:
+                raise Exception("Speechmatics connection closed unexpectedly")
+
+
+def live_transcription_to_speech_data(data: dict) -> List[stt.SpeechData]:
+    speech_data: List[stt.SpeechData] = []
+
+    for result in data.get("results", []):
+        start_time, end_time, is_eos = (
+            result.get("start_time", 0),
+            result.get("end_time", 0),
+            result.get("is_eos", False),
+        )
+
+        for alt in result.get("alternatives", []):
+            content, confidence, language = (
+                alt.get("content", "").strip(),
+                alt.get("confidence", 1.0),
+                alt.get("language", "en"),
+            )
+
+            if not content:
+                continue
+
+            # append punctuation to the previous result
+            if is_eos and speech_data:
+                speech_data[-1].text += content
+            elif speech_data and start_time == speech_data[-1].end_time:
+                speech_data[-1].text += " " + content
+            else:
+                speech_data.append(
+                    stt.SpeechData(language, content, start_time, end_time, confidence)
+                )
+
+    return speech_data
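To illustrate how `live_transcription_to_speech_data` merges word and punctuation results, here is a hypothetical `AddTranscript` payload; the field values are invented, but the shape mirrors the `results`/`alternatives` keys the function reads:

```python
from livekit.plugins.speechmatics.stt import live_transcription_to_speech_data

# Hypothetical payload: two words followed by an end-of-sentence
# punctuation mark (is_eos=True).
payload = {
    "message": "AddTranscript",
    "results": [
        {"start_time": 0.0, "end_time": 0.4,
         "alternatives": [{"content": "hello", "confidence": 0.98}]},
        {"start_time": 0.4, "end_time": 0.8,
         "alternatives": [{"content": "world", "confidence": 0.95}]},
        {"start_time": 0.8, "end_time": 0.8, "is_eos": True,
         "alternatives": [{"content": ".", "confidence": 1.0}]},
    ],
}

alts = live_transcription_to_speech_data(payload)
# "world" starts exactly where "hello" ends, so it is merged with a space;
# the "." has is_eos=True, so it is appended with no space:
assert alts[0].text == "hello world."
```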
livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/types.py
@@ -0,0 +1,141 @@
+import ssl
+from dataclasses import asdict, dataclass, field
+from enum import Enum
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class TranscriptionConfig:
+    """Real-time: Defines transcription parameters."""
+
+    language: str = "en"
+    """ISO 639-1 language code. eg. `en`"""
+
+    operating_point: Optional[str] = None
+    """Specifies which acoustic model to use."""
+
+    output_locale: Optional[str] = None
+    """RFC-5646 language code for transcript output. eg. `en-AU`"""
+
+    diarization: Optional[str] = None
+    """Indicates type of diarization to use, if any."""
+
+    additional_vocab: Optional[Dict] = None
+    """Additional vocabulary that is not part of the standard language."""
+
+    punctuation_overrides: Optional[Dict] = None
+    """Permitted punctuation marks for advanced punctuation."""
+
+    enable_entities: Optional[bool] = None
+    """Indicates if inverse text normalization entity output is enabled."""
+
+    max_delay: Optional[float] = None
+    """Maximum acceptable delay."""
+
+    max_delay_mode: Optional[str] = None
+    """Determines whether the threshold specified in max_delay can be exceeded
+    if a potential entity is detected. Flexible means if a potential entity
+    is detected, then the max_delay can be overridden until the end of that
+    entity. Fixed means that max_delay specified ignores any potential
+    entity that would not be completed within that threshold."""
+
+    streaming_mode: Optional[bool] = None
+    """Indicates if we run the engine in streaming mode, or regular RT mode."""
+
+    enable_partials: Optional[bool] = None
+    """Indicates if partials for transcription, where words are produced
+    immediately, are enabled."""
+
+    def asdict(self) -> Dict[Any, Any]:
+        """Returns model as a dict while excluding None values recursively."""
+        return asdict(
+            self, dict_factory=lambda x: {k: v for (k, v) in x if v is not None}
+        )
+
+
+@dataclass
+class AudioSettings:
+    """Real-time: Defines audio parameters."""
+
+    encoding: str = "pcm_s16le"
+    """Encoding format when raw audio is used. Allowed values are
+    `pcm_f32le`, `pcm_s16le` and `mulaw`."""
+
+    sample_rate: int = 16000
+    """Sampling rate in hertz."""
+
+    def asdict(self):
+        return {
+            "type": "raw",
+            "encoding": self.encoding,
+            "sample_rate": self.sample_rate,
+        }
+
+
+@dataclass
+class ConnectionSettings:
+    """Defines connection parameters."""
+
+    url: str
+    """Websocket server endpoint."""
+
+    ssl_context: ssl.SSLContext = field(default_factory=ssl.create_default_context)
+    """SSL context."""
+
+    api_key: Optional[str] = None
+    """API key to authenticate a customer."""
+
+    get_access_token: Optional[bool] = True
+    """Automatically generate a temporary token for authentication."""
+
+
+class ClientMessageType(str, Enum):
+    # pylint: disable=invalid-name
+    """Real-time: Defines various messages sent from client to server."""
+
+    StartRecognition = "StartRecognition"
+    """Initiates a recognition job based on configuration set previously."""
+
+    AddAudio = "AddAudio"
+    """Adds more audio data to the recognition job. The server confirms
+    receipt by sending an :py:attr:`ServerMessageType.AudioAdded` message."""
+
+    EndOfStream = "EndOfStream"
+    """Indicates that the client has no more audio to send."""
+
+    SetRecognitionConfig = "SetRecognitionConfig"
+    """Allows the client to re-configure the recognition session."""
+
+
+class ServerMessageType(str, Enum):
+    """Real-time: Defines various message types sent from server to client."""
+
+    RecognitionStarted = "RecognitionStarted"
+    """Server response to :py:attr:`ClientMessageType.StartRecognition`,
+    acknowledging that a recognition session has started."""
+
+    AudioAdded = "AudioAdded"
+    """Server response to :py:attr:`ClientMessageType.AddAudio`, indicating
+    that audio has been added successfully."""
+
+    AddPartialTranscript = "AddPartialTranscript"
+    """Indicates a partial transcript, which is an incomplete transcript that
+    is immediately produced and may change as more context becomes available.
+    """
+
+    AddTranscript = "AddTranscript"
+    """Indicates the final transcript of a part of the audio."""
+
+    EndOfTranscript = "EndOfTranscript"
+    """Server response to :py:attr:`ClientMessageType.EndOfStream`,
+    after the server has finished sending all :py:attr:`AddTranscript`
+    messages."""
+
+    Info = "Info"
+    """Indicates a generic info message."""
+
+    Warning = "Warning"
+    """Indicates a generic warning message."""
+
+    Error = "Error"
+    """Indicates a generic error message."""
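A quick sketch of `TranscriptionConfig.asdict()` in action: its custom `dict_factory` drops every field still set to `None`, so only explicitly configured options end up in the `StartRecognition` message:

```python
from livekit.plugins.speechmatics.types import TranscriptionConfig

config = TranscriptionConfig(
    language="en",
    operating_point="enhanced",
    enable_partials=True,
    max_delay=0.7,
)

# Fields left at None (output_locale, diarization, ...) are omitted.
assert config.asdict() == {
    "language": "en",
    "operating_point": "enhanced",
    "max_delay": 0.7,
    "enable_partials": True,
}
```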
livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/utils.py
@@ -0,0 +1,57 @@
+import importlib.metadata
+import os
+
+import aiohttp
+
+
+async def get_access_token(api_key: str) -> str:
+    mp_api_url = os.getenv(
+        "SPEECHMATICS_MANAGEMENT_PLATFORM_URL", "https://mp.speechmatics.com"
+    )
+    endpoint = f"{mp_api_url}/v1/api_keys"
+    params = {"type": "rt", "sm-sdk": get_sdk_version()}
+    json_body = {"ttl": 60}
+    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(
+            endpoint, params=params, json=json_body, headers=headers
+        ) as resp:
+            if resp.status == 201:
+                try:
+                    data = await resp.json()
+                    return data["key_value"]
+                except (ValueError, KeyError) as e:
+                    raise Exception(
+                        f"Failed to parse Speechmatics access token response: {e}"
+                    )
+            else:
+                error_message = await resp.text()
+                raise Exception(
+                    f"Failed to get Speechmatics access token. "
+                    f"Status: {resp.status}, Error: {error_message}"
+                )
+
+
+def get_sdk_version():
+    version = importlib.metadata.version("livekit-plugins-speechmatics")
+    return f"livekit-plugins-{version}"
+
+
+def sanitize_url(url, language):
+    from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
+
+    parsed_url = urlparse(url)
+
+    query_params = dict(parse_qsl(parsed_url.query))
+    query_params["sm-sdk"] = get_sdk_version()
+    updated_query = urlencode(query_params)
+
+    url_path = parsed_url.path
+    if not url_path.endswith(language):
+        if url_path.endswith("/"):
+            url_path += language
+        else:
+            url_path += f"/{language}"
+
+    return urlunparse(parsed_url._replace(path=url_path, query=updated_query))
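Tracing `sanitize_url` with the plugin's default endpoint shows its two transformations: the language code is appended to the URL path, and an `sm-sdk` query parameter identifying the SDK is added (the exact version string depends on the installed package):

```python
from livekit.plugins.speechmatics.utils import sanitize_url

url = sanitize_url("wss://eu2.rt.speechmatics.com/v2", "en")
# With version 0.0.2 installed, this yields:
# wss://eu2.rt.speechmatics.com/v2/en?sm-sdk=livekit-plugins-0.0.2
print(url)
```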
livekit_plugins_speechmatics-0.0.2/livekit/plugins/speechmatics/version.py
@@ -0,0 +1,15 @@
+# Copyright 2025 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "0.0.2"
livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/PKG-INFO
@@ -0,0 +1,66 @@
+Metadata-Version: 2.2
+Name: livekit-plugins-speechmatics
+Version: 0.0.2
+Summary: Agent Framework plugin for Speechmatics
+Home-page: https://github.com/livekit/agents
+License: Apache-2.0
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Keywords: webrtc,realtime,audio,video,livekit
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.9.0
+Description-Content-Type: text/markdown
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# LiveKit Plugins Speechmatics
+
+Agent Framework plugin for Speechmatics.
+
+## Installation
+
+```bash
+pip install livekit-plugins-speechmatics
+```
+
+Usage:
+
+```python
+agent = VoicePipelineAgent(
+    stt=speechmatics.STT(),
+    turn_detector=turn_detector.EOUModel(),
+    min_endpointing_delay=0.5,
+    max_endpointing_delay=5.0,
+    ...
+)
+```
+
+Note: The plugin was built with
+LiveKit's [end-of-turn detection feature](https://github.com/livekit/agents#in-house-phrase-endpointing-model) in mind,
+and it doesn't implement phrase endpointing. `AddTranscript` and `AddPartialTranscript` events are emitted as soon
+as they're received from the Speechmatics STT engine. For the best user experience,
+we recommend running the agent with end-of-turn detection enabled
+(see [example](https://github.com/livekit-examples/voice-pipeline-agent-python/blob/main/agent.py)).
+
+## Pre-requisites
+
+You'll need a Speechmatics API key. It can be set as the `SPEECHMATICS_API_KEY` environment variable or
+in a `.env.local` file.
livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/SOURCES.txt
@@ -0,0 +1,14 @@
+README.md
+setup.py
+livekit/plugins/speechmatics/__init__.py
+livekit/plugins/speechmatics/log.py
+livekit/plugins/speechmatics/py.typed
+livekit/plugins/speechmatics/stt.py
+livekit/plugins/speechmatics/types.py
+livekit/plugins/speechmatics/utils.py
+livekit/plugins/speechmatics/version.py
+livekit_plugins_speechmatics.egg-info/PKG-INFO
+livekit_plugins_speechmatics.egg-info/SOURCES.txt
+livekit_plugins_speechmatics.egg-info/dependency_links.txt
+livekit_plugins_speechmatics.egg-info/requires.txt
+livekit_plugins_speechmatics.egg-info/top_level.txt
livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/requires.txt
@@ -0,0 +1 @@
+livekit-agents<1.0.0,>=0.12.16
livekit_plugins_speechmatics-0.0.2/livekit_plugins_speechmatics.egg-info/top_level.txt
@@ -0,0 +1 @@
+livekit
livekit_plugins_speechmatics-0.0.2/setup.py
@@ -0,0 +1,58 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pathlib
+
+import setuptools.command.build_py
+
+here = pathlib.Path(__file__).parent.resolve()
+about = {}
+with open(
+    os.path.join(here, "livekit", "plugins", "speechmatics", "version.py"), "r"
+) as f:
+    exec(f.read(), about)
+
+
+setuptools.setup(
+    name="livekit-plugins-speechmatics",
+    version=about["__version__"],
+    description="Agent Framework plugin for Speechmatics",
+    long_description=(here / "README.md").read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    url="https://github.com/livekit/agents",
+    cmdclass={},
+    classifiers=[
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: Apache Software License",
+        "Topic :: Multimedia :: Sound/Audio",
+        "Topic :: Multimedia :: Video",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3 :: Only",
+    ],
+    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
+    license="Apache-2.0",
+    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
+    python_requires=">=3.9.0",
+    install_requires=[
+        "livekit-agents>=0.12.16,<1.0.0",
+    ],
+    package_data={},
+    project_urls={
+        "Documentation": "https://docs.livekit.io",
+        "Website": "https://livekit.io/",
+        "Source": "https://github.com/livekit/agents",
+    },
+)