meshagent-livekit 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of meshagent-livekit might be problematic. Click here for more details.

@@ -1,377 +1,187 @@
1
1
  import logging
2
2
  import asyncio
3
+ from asyncio import CancelledError
3
4
 
5
+ from meshagent.api import RoomMessage, Requirement, Participant, RemoteParticipant
6
+ from meshagent.api.room_server_client import RoomClient
4
7
 
5
- import os
8
+ from livekit.agents import Agent, AgentSession
6
9
 
7
- from livekit import api
10
+ from openai import AsyncOpenAI
11
+
12
+ from livekit.agents.stt import STT
13
+ from livekit.agents import RoomOutputOptions, StopResponse
14
+ from livekit.agents import llm
8
15
 
9
- from livekit.agents import stt, transcription, utils
10
16
  from livekit.plugins import openai, silero
11
- from livekit import rtc
12
- from livekit.rtc import TranscriptionSegment
13
- from livekit.agents import stt as speech_to_text
14
17
 
15
- from meshagent.api.runtime import RuntimeDocument
18
+ from .voice import VoiceConnection
19
+ from livekit import rtc
16
20
 
17
21
  from typing import Optional
18
22
 
19
- from meshagent.api.schema import MeshSchema
20
-
21
- from meshagent.api.schema import ElementType, ChildProperty, ValueProperty
22
-
23
- from meshagent.agents.agent import AgentCallContext
24
- from meshagent.agents import TaskRunner
25
-
26
- logger = logging.getLogger("transcriber")
27
-
28
-
29
- transcription_schema = MeshSchema(
30
- root_tag_name="transcript",
31
- elements=[
32
- ElementType(
33
- tag_name="transcript",
34
- description="a transcript",
35
- properties=[
36
- ChildProperty(
37
- name="transcriptions",
38
- description="the transcript entries",
39
- child_tag_names=["speech"],
40
- )
41
- ],
42
- ),
43
- ElementType(
44
- tag_name="speech",
45
- description="transcribed speech",
46
- properties=[
47
- ValueProperty(
48
- name="text", description="the transcribed text", type="string"
49
- ),
50
- ValueProperty(
51
- name="startTime",
52
- description="the time of the start of this speech",
53
- type="number",
54
- ),
55
- ValueProperty(
56
- name="endTime",
57
- description="the time of th end of this speech",
58
- type="number",
59
- ),
60
- ValueProperty(
61
- name="participantId",
62
- description="the identity of the participant",
63
- type="string",
64
- ),
65
- ValueProperty(
66
- name="participantName",
67
- description="the name of the participant",
68
- type="string",
69
- ),
70
- ],
71
- ),
72
- ],
73
- )
74
-
75
-
76
- class Transcriber(TaskRunner):
77
- def __init__(
78
- self,
79
- *,
80
- livekit_url: Optional[str] = None,
81
- livekit_api_key: Optional[str] = None,
82
- livekit_api_secret: Optional[str] = None,
83
- livekit_identity: Optional[str] = None,
84
- ):
85
- super().__init__(
86
- name="livekit.transcriber",
87
- title="transcriber",
88
- description="connects to a livekit room and transcribes the conversation",
89
- input_schema={
90
- "type": "object",
91
- "additionalProperties": False,
92
- "required": ["room_name", "path"],
93
- "properties": {
94
- "room_name": {"type": "string"},
95
- "path": {"type": "string"},
96
- },
97
- },
98
- output_schema={
99
- "type": "object",
100
- "additionalProperties": False,
101
- "required": [],
102
- "properties": {},
103
- },
104
- )
105
- self._livekit_url = livekit_url
106
- self._livekit_api_key = livekit_api_key
107
- self._livekit_api_secret = livekit_api_secret
108
- self._livekit_identity = livekit_identity
109
23
 
110
- async def _transcribe_participant(
111
- self,
112
- doc: RuntimeDocument,
113
- room: rtc.Room,
114
- participant: rtc.RemoteParticipant,
115
- stt_stream: stt.SpeechStream,
116
- stt_forwarder: transcription.STTSegmentsForwarder,
117
- ):
118
- logger.info("transcribing participant %s", participant.sid)
119
- """Forward the transcription to the client and log the transcript in the console"""
120
- async for ev in stt_stream:
121
- logger.info("event from participant %s %s", participant.sid, ev)
122
-
123
- if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
124
- logger.info("transcript: %s", ev.alternatives[0].text)
125
- if len(ev.alternatives) > 0:
126
- alt = ev.alternatives[0]
127
- doc.root.append_child(
128
- tag_name="speech",
129
- attributes={
130
- "text": alt.text,
131
- "startTime": alt.start_time,
132
- "endTime": alt.end_time,
133
- "participantId": participant.identity,
134
- "participantName": participant.name,
135
- },
136
- )
24
+ from meshagent.agents import SingleRoomAgent
137
25
 
138
- logger.info("done forwarding %s", participant.sid)
139
26
 
140
- def should_transcribe(self, p: rtc.Participant) -> bool:
141
- # don't transcribe other agents
142
- # todo: maybe have a better way to detect
143
- return ".agent" not in p.identity
27
+ import re
144
28
 
145
- async def _wait_for_disconnect(self, room: rtc.Room):
146
- disconnected = asyncio.Future()
29
+ logger = logging.getLogger("voice")
147
30
 
148
- def on_disconnected(_):
149
- disconnected.set_result(True)
150
31
 
151
- room.on("disconnected", on_disconnected)
32
+ def _replace_non_matching(text: str, allowed_chars: str, replacement: str) -> str:
33
+ """
34
+ Replaces every character in `text` that does not match the given
35
+ `allowed_chars` regex set with `replacement`.
152
36
 
153
- logger.info("waiting for disconnection")
154
- await disconnected
37
+ Parameters:
38
+ -----------
39
+ text : str
40
+ The input string on which the replacement is to be done.
41
+ allowed_chars : str
42
+ A string defining the set of allowed characters (part of a character set).
43
+ For example, "a-zA-Z0-9" will keep only letters and digits.
44
+ replacement : str
45
+ The string to replace non-matching characters with.
155
46
 
156
- async def ask(self, *, context: AgentCallContext, arguments: dict):
157
- logger.info("Transcriber connecting to %s", arguments)
158
- output_path = arguments["path"]
159
- room_name = arguments["room_name"]
47
+ Returns:
48
+ --------
49
+ str
50
+ A new string where all characters not in `allowed_chars` are replaced.
51
+ """
52
+ # Build a regex that matches any character NOT in allowed_chars
53
+ pattern = rf"[^{allowed_chars}]"
54
+ return re.sub(pattern, replacement, text)
160
55
 
161
- client = context.room
162
- doc = await client.sync.open(path=output_path)
163
- try:
164
- vad = silero.VAD.load()
165
- utils.http_context._new_session_ctx()
166
56
 
167
- pending_tasks = list()
168
- participantNames = dict[str, str]()
57
+ def safe_tool_name(name: str):
58
+ return _replace_non_matching(name, "a-zA-Z0-9_-", "_")
169
59
 
170
- sst_provider = openai.STT()
171
- # sst_provider = fal.WizperSTT()
172
60
 
173
- room_options = rtc.RoomOptions(auto_subscribe=False)
174
-
175
- room = rtc.Room()
176
-
177
- url = (
178
- self._livekit_url
179
- if self._livekit_url is not None
180
- else os.getenv("LIVEKIT_URL")
181
- )
182
- api_key = (
183
- self._livekit_api_key
184
- if self._livekit_api_key is not None
185
- else os.getenv("LIVEKIT_API_KEY")
186
- )
187
- api_secret = (
188
- self._livekit_api_secret
189
- if self._livekit_api_secret is not None
190
- else os.getenv("LIVEKIT_API_SECRET")
191
- )
192
- identity = (
193
- self._livekit_identity
194
- if self._livekit_identity is not None
195
- else os.getenv("AGENT_IDENTITY")
196
- )
197
-
198
- token = (
199
- api.AccessToken(api_key=api_key, api_secret=api_secret)
200
- .with_identity(identity)
201
- .with_name("Agent")
202
- .with_kind("agent")
203
- .with_grants(
204
- api.VideoGrants(
205
- can_update_own_metadata=True,
206
- room_join=True,
207
- room=room_name,
208
- agent=True,
209
- )
210
- )
211
- )
61
+ class _Transcriber(Agent):
62
+ def __init__(self, *, stt: STT, room: RoomClient, participant: RemoteParticipant):
63
+ super().__init__(instructions="not-needed", stt=stt)
64
+ self.room = room
65
+ self.participant = participant
212
66
 
213
- jwt = token.to_jwt()
214
-
215
- await room.connect(url=url, token=jwt, options=room_options)
216
-
217
- logger.info("connected to room: %s", room_name)
218
-
219
- audio_streams = list[rtc.AudioStream]()
220
-
221
- async def transcribe_track(
222
- participant: rtc.RemoteParticipant, track: rtc.Track
223
- ):
224
- audio_stream = rtc.AudioStream(track)
225
- stt_forwarder = transcription.STTSegmentsForwarder(
226
- room=room, participant=participant, track=track
227
- )
67
+ async def on_user_turn_completed(
68
+ self, chat_ctx: llm.ChatContext, new_message: llm.ChatMessage
69
+ ):
70
+ logger.info(f"transcription: {new_message.text_content}")
71
+ self.room.messaging.send_message_nowait(
72
+ to=self.participant,
73
+ type="transcript",
74
+ message={"text": new_message.text_content},
75
+ )
228
76
 
229
- audio_streams.append(audio_stream)
77
+ raise StopResponse()
230
78
 
231
- stt = sst_provider
232
- if not sst_provider.capabilities.streaming:
233
- stt = speech_to_text.StreamAdapter(
234
- stt=stt,
235
- vad=vad,
236
- )
237
79
 
238
- stt_stream = stt.stream()
80
+ class Transcriber(SingleRoomAgent):
81
+ def __init__(
82
+ self,
83
+ name: str,
84
+ title: Optional[str] = None,
85
+ description: Optional[str] = None,
86
+ labels: Optional[list[str]] = None,
87
+ requires: list[Requirement] = None,
88
+ ):
89
+ super().__init__(
90
+ name=name,
91
+ description=description,
92
+ title=title,
93
+ labels=labels,
94
+ requires=requires,
95
+ )
239
96
 
240
- pending_tasks.append(
241
- asyncio.create_task(
242
- self._transcribe_participant(
243
- doc, room, participant, stt_stream, stt_forwarder
97
+ async def start(self, *, room):
98
+ await super().start(room=room)
99
+ await room.local_participant.set_attribute("supports_voice", True)
100
+ await room.messaging.enable()
101
+ room.messaging.on("message", self.on_message)
102
+
103
+ def on_message(self, message: RoomMessage):
104
+ if message.type == "voice_call":
105
+ breakout_room = message.message["breakout_room"]
106
+
107
+ logger.info(f"joining breakout room {breakout_room}")
108
+
109
+ def on_done(task: asyncio.Task):
110
+ try:
111
+ task.result()
112
+ except CancelledError:
113
+ pass
114
+ except Exception as e:
115
+ logger.error(f"{e}", exc_info=e)
116
+
117
+ for participant in self.room.messaging.remote_participants:
118
+ if participant.id == message.from_participant_id:
119
+ task = asyncio.create_task(
120
+ self.run_voice_agent(
121
+ participant=participant, breakout_room=breakout_room
244
122
  )
245
123
  )
246
- )
247
-
248
- async for ev in audio_stream:
249
- stt_stream.push_frame(ev.frame)
250
-
251
- def subscribe_if_needed(pub: rtc.RemoteTrackPublication):
252
- if pub.kind == rtc.TrackKind.KIND_AUDIO:
253
- pub.set_subscribed(True)
254
-
255
- for p in room.remote_participants.values():
256
- participantNames[p.identity] = p.name
257
- if self.should_transcribe(p):
258
- for pub in p.track_publications.values():
259
- subscribe_if_needed(pub)
260
-
261
- first_parts = dict[str, rtc.Participant]()
262
-
263
- def on_transcript_event(
264
- segments: list[TranscriptionSegment],
265
- part: rtc.Participant | None,
266
- pub: rtc.TrackPublication | None = None,
267
- ) -> None:
268
- nonlocal room
269
- logger.info("Got transcription segment %s %s %s", segments, part, pub)
270
- for segment in segments:
271
- if segment.id not in first_parts and part is not None:
272
- first_parts[segment.id] = part
273
-
274
- if segment.final:
275
- if part is None and segment.id in first_parts:
276
- part = first_parts[segment.id]
277
- first_parts.pop(segment.id)
278
-
279
- if part is not None:
280
- doc.root.append_child(
281
- tag_name="speech",
282
- attributes={
283
- "text": segment.text,
284
- "startTime": segment.start_time,
285
- "endTime": segment.end_time,
286
- "participantId": part.identity,
287
- "participantName": part.name,
288
- },
289
- )
290
- else:
291
- logger.warning(
292
- "transcription was missing participant information"
293
- )
294
-
295
- def on_participant_connected(p: rtc.RemoteParticipant):
296
- participantNames[p.identity] = p.name
297
-
298
- def on_track_published(
299
- pub: rtc.RemoteTrackPublication, p: rtc.RemoteParticipant
300
- ):
301
- if self.should_transcribe(p):
302
- subscribe_if_needed(pub)
303
-
304
- subscriptions = dict()
305
-
306
- def on_track_unpublished(
307
- pub: rtc.RemoteTrackPublication, p: rtc.RemoteParticipant
308
- ):
309
- if pub in subscriptions:
310
- logger.info("track unpublished, stopping transcription")
311
- # todo: maybe could be more graceful
312
- subscriptions[pub].cancel()
313
- subscriptions.pop(pub)
314
-
315
- def on_track_subscribed(
316
- track: rtc.Track,
317
- publication: rtc.TrackPublication,
318
- participant: rtc.RemoteParticipant,
319
- ):
320
- if track.kind == rtc.TrackKind.KIND_AUDIO:
321
- logger.info("transcribing track %s", track.sid)
322
- track_task = asyncio.create_task(
323
- transcribe_track(participant, track)
324
- )
124
+ task.add_done_callback(on_done)
125
+ return
325
126
 
326
- def on_transcription_done(t):
327
- try:
328
- t.result()
329
- except Exception as e:
330
- logger.error("Transcription failed", exc_info=e)
127
+ logger.error(f"unable to find participant {message.from_participant_id}")
331
128
 
332
- track_task.add_done_callback(on_transcription_done)
333
- pending_tasks.append(track_task)
334
- subscriptions[publication] = track_task
129
+ async def _wait_for_disconnect(self, room: rtc.Room):
130
+ disconnected = asyncio.Future()
335
131
 
336
- for p in room.remote_participants.values():
337
- on_participant_connected(p)
132
+ def on_disconnected(_):
133
+ disconnected.set_result(True)
338
134
 
339
- room.on("participant_connected", on_participant_connected)
135
+ room.on("disconnected", on_disconnected)
340
136
 
341
- room.on("track_published", on_track_published)
342
- room.on("track_unpublished", on_track_unpublished)
343
- room.on("track_subscribed", on_track_subscribed)
344
- room.on("transcription_received", on_transcript_event)
137
+ logger.info("waiting for disconnection")
138
+ await disconnected
345
139
 
346
- await self._wait_for_disconnect(room)
140
+ async def create_agent(
141
+ self, *, session: AgentSession, participant: RemoteParticipant
142
+ ):
143
+ return _Transcriber(
144
+ stt=openai.STT(),
145
+ room=self.room,
146
+ participant=participant,
147
+ )
148
+
149
+ def create_session(self) -> AgentSession:
150
+ token: str = self.room.protocol.token
151
+ url: str = self.room.room_url
347
152
 
348
- logger.info("waited for termination")
349
- await room.disconnect()
153
+ room_proxy_url = f"{url}/v1"
350
154
 
351
- logger.info("closing audio streams")
155
+ oaiclient = AsyncOpenAI(
156
+ api_key=token,
157
+ base_url=room_proxy_url,
158
+ default_headers={"Meshagent-Session": self.room.session_id},
159
+ )
352
160
 
353
- for stream in audio_streams:
354
- await stream.aclose()
161
+ session = AgentSession(
162
+ max_tool_steps=50,
163
+ allow_interruptions=False,
164
+ vad=silero.VAD.load(),
165
+ stt=openai.STT(client=oaiclient),
166
+ # turn_detection=MultilingualModel(),
167
+ )
168
+ return session
355
169
 
356
- logger.info("waiting for pending tasks")
357
- gather_future = asyncio.gather(*pending_tasks)
170
+ async def run_voice_agent(self, *, participant: Participant, breakout_room: str):
171
+ async with VoiceConnection(
172
+ room=self.room, breakout_room=breakout_room
173
+ ) as connection:
174
+ logger.info("starting transcription agent")
358
175
 
359
- gather_future.cancel()
360
- try:
361
- await gather_future
362
- except Exception as e:
363
- if not isinstance(e, asyncio.CancelledError):
364
- logger.warning("Did not shut down cleanly", exc_info=e)
365
- pass
176
+ session = self.create_session()
366
177
 
367
- print("done")
368
- except Exception as e:
369
- logger.info("Transcription failed", exc_info=e)
370
- finally:
371
- await utils.http_context._close_http_ctx()
372
- logger.info("Transcription done")
178
+ agent = await self.create_agent(session=session, participant=participant)
373
179
 
374
- await asyncio.sleep(5)
375
- await client.sync.close(path=output_path)
180
+ await session.start(
181
+ agent=agent,
182
+ room=connection.livekit_room,
183
+ room_output_options=RoomOutputOptions(transcription_enabled=True),
184
+ )
376
185
 
377
- return {}
186
+ logger.info("started transcription agent")
187
+ await self._wait_for_disconnect(room=connection.livekit_room)
@@ -1 +1 @@
1
- __version__ = "0.5.1"
1
+ __version__ = "0.5.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: meshagent-livekit
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Livekit support for Meshagent
5
5
  License-Expression: Apache-2.0
6
6
  Project-URL: Documentation, https://docs.meshagent.com
@@ -17,8 +17,8 @@ Requires-Dist: livekit-agents~=1.1
17
17
  Requires-Dist: livekit-plugins-openai~=1.1
18
18
  Requires-Dist: livekit-plugins-silero~=1.1
19
19
  Requires-Dist: livekit-plugins-turn-detector~=1.1
20
- Requires-Dist: meshagent-api~=0.5.1
21
- Requires-Dist: meshagent-tools~=0.5.1
20
+ Requires-Dist: meshagent-api~=0.5.3
21
+ Requires-Dist: meshagent-tools~=0.5.3
22
22
  Dynamic: license-file
23
23
 
24
24
  # [Meshagent](https://www.meshagent.com)
@@ -0,0 +1,11 @@
1
+ meshagent/livekit/__init__.py,sha256=X78Z4yEg5XfkNKH0HiIdG4k1q5ktB-ampTuXHLNFrAw,58
2
+ meshagent/livekit/livekit_protocol.py,sha256=5Zu4ymLWEGt5SGXLNu94gOeyjnjhaV6uTS2FhSdODqs,1470
3
+ meshagent/livekit/livekit_protocol_test.py,sha256=o7yYxXad4tMazcxFkq44yW-A9tJ0Lk6WdZpG5ifxcU4,2980
4
+ meshagent/livekit/version.py,sha256=tgzuqHKcEdKBaP57F5oXxq4XlW2n9J4Fj8ZGu7nGOZg,22
5
+ meshagent/livekit/agents/transcriber.py,sha256=S992oVVBt3ShWDQQWprLjyl6Yh0hyNRd8d3qCmg_toU,5795
6
+ meshagent/livekit/agents/voice.py,sha256=STgjMSqzUgV9UAmleOy1vkgRXP93MDSYgiOO6Lo0peU,11964
7
+ meshagent_livekit-0.5.3.dist-info/licenses/LICENSE,sha256=eTt0SPW-sVNdkZe9PS_S8WfCIyLjRXRl7sUBWdlteFg,10254
8
+ meshagent_livekit-0.5.3.dist-info/METADATA,sha256=BBePj7Umfvg1htZtnNE3DZ2pubrlpsqQ9CqcZ_RNOzY,1760
9
+ meshagent_livekit-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
10
+ meshagent_livekit-0.5.3.dist-info/top_level.txt,sha256=GlcXnHtRP6m7zlG3Df04M35OsHtNXy_DY09oFwWrH74,10
11
+ meshagent_livekit-0.5.3.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- meshagent/livekit/__init__.py,sha256=X78Z4yEg5XfkNKH0HiIdG4k1q5ktB-ampTuXHLNFrAw,58
2
- meshagent/livekit/livekit_protocol.py,sha256=5Zu4ymLWEGt5SGXLNu94gOeyjnjhaV6uTS2FhSdODqs,1470
3
- meshagent/livekit/livekit_protocol_test.py,sha256=o7yYxXad4tMazcxFkq44yW-A9tJ0Lk6WdZpG5ifxcU4,2980
4
- meshagent/livekit/version.py,sha256=eZ1bOun1DDVV0YLOBW4wj2FP1ajReLjbIrGmzN7ASBw,22
5
- meshagent/livekit/agents/transcriber.py,sha256=oqfHBhBSwU62LbsO8WFiJg3Xoi4vkWlTFzgTxBP0erg,13297
6
- meshagent/livekit/agents/voice.py,sha256=STgjMSqzUgV9UAmleOy1vkgRXP93MDSYgiOO6Lo0peU,11964
7
- meshagent_livekit-0.5.1.dist-info/licenses/LICENSE,sha256=eTt0SPW-sVNdkZe9PS_S8WfCIyLjRXRl7sUBWdlteFg,10254
8
- meshagent_livekit-0.5.1.dist-info/METADATA,sha256=IrvPuPE1C6GDh9sOJJS8g5pxDUpaT3usgtn01JbyJ-k,1760
9
- meshagent_livekit-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
10
- meshagent_livekit-0.5.1.dist-info/top_level.txt,sha256=GlcXnHtRP6m7zlG3Df04M35OsHtNXy_DY09oFwWrH74,10
11
- meshagent_livekit-0.5.1.dist-info/RECORD,,