PyPI - meshagent-livekit - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl - Mend

meshagent-livekit 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of meshagent-livekit might be problematic. Click here for more details.

Files changed (8) hide show

meshagent/livekit/agents/transcriber.py CHANGED Viewed

@@ -1,377 +1,187 @@
 import logging
 import asyncio
+from asyncio import CancelledError
+from meshagent.api import RoomMessage, Requirement, Participant, RemoteParticipant
+from meshagent.api.room_server_client import RoomClient
-import os
+from livekit.agents import Agent, AgentSession
-from livekit import api
+from openai import AsyncOpenAI
+from livekit.agents.stt import STT
+from livekit.agents import RoomOutputOptions, StopResponse
+from livekit.agents import llm
-from livekit.agents import stt, transcription, utils
 from livekit.plugins import openai, silero
-from livekit import rtc
-from livekit.rtc import TranscriptionSegment
-from livekit.agents import stt as speech_to_text
-from meshagent.api.runtime import RuntimeDocument
+from .voice import VoiceConnection
+from livekit import rtc
 from typing import Optional
-from meshagent.api.schema import MeshSchema
-from meshagent.api.schema import ElementType, ChildProperty, ValueProperty
-from meshagent.agents.agent import AgentCallContext
-from meshagent.agents import TaskRunner
-logger = logging.getLogger("transcriber")
-transcription_schema = MeshSchema(
-    root_tag_name="transcript",
-    elements=[
-        ElementType(
-            tag_name="transcript",
-            description="a transcript",
-            properties=[
-                ChildProperty(
-                    name="transcriptions",
-                    description="the transcript entries",
-                    child_tag_names=["speech"],
-                )
-            ],
-        ),
-        ElementType(
-            tag_name="speech",
-            description="transcribed speech",
-            properties=[
-                ValueProperty(
-                    name="text", description="the transcribed text", type="string"
-                ),
-                ValueProperty(
-                    name="startTime",
-                    description="the time of the start of this speech",
-                    type="number",
-                ),
-                ValueProperty(
-                    name="endTime",
-                    description="the time of th end of this speech",
-                    type="number",
-                ),
-                ValueProperty(
-                    name="participantId",
-                    description="the identity of the participant",
-                    type="string",
-                ),
-                ValueProperty(
-                    name="participantName",
-                    description="the name of the participant",
-                    type="string",
-                ),
-            ],
-        ),
-    ],
-)
-class Transcriber(TaskRunner):
-    def __init__(
-        self,
-        *,
-        livekit_url: Optional[str] = None,
-        livekit_api_key: Optional[str] = None,
-        livekit_api_secret: Optional[str] = None,
-        livekit_identity: Optional[str] = None,
-    ):
-        super().__init__(
-            name="livekit.transcriber",
-            title="transcriber",
-            description="connects to a livekit room and transcribes the conversation",
-            input_schema={
-                "type": "object",
-                "additionalProperties": False,
-                "required": ["room_name", "path"],
-                "properties": {
-                    "room_name": {"type": "string"},
-                    "path": {"type": "string"},
-                },
-            },
-            output_schema={
-                "type": "object",
-                "additionalProperties": False,
-                "required": [],
-                "properties": {},
-            },
-        )
-        self._livekit_url = livekit_url
-        self._livekit_api_key = livekit_api_key
-        self._livekit_api_secret = livekit_api_secret
-        self._livekit_identity = livekit_identity
-    async def _transcribe_participant(
-        self,
-        doc: RuntimeDocument,
-        room: rtc.Room,
-        participant: rtc.RemoteParticipant,
-        stt_stream: stt.SpeechStream,
-        stt_forwarder: transcription.STTSegmentsForwarder,
-    ):
-        logger.info("transcribing participant %s", participant.sid)
-        """Forward the transcription to the client and log the transcript in the console"""
-        async for ev in stt_stream:
-            logger.info("event from participant %s %s", participant.sid, ev)
-            if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
-                logger.info("transcript: %s", ev.alternatives[0].text)
-                if len(ev.alternatives) > 0:
-                    alt = ev.alternatives[0]
-                    doc.root.append_child(
-                        tag_name="speech",
-                        attributes={
-                            "text": alt.text,
-                            "startTime": alt.start_time,
-                            "endTime": alt.end_time,
-                            "participantId": participant.identity,
-                            "participantName": participant.name,
-                        },
-                    )
+from meshagent.agents import SingleRoomAgent
-        logger.info("done forwarding %s", participant.sid)
-    def should_transcribe(self, p: rtc.Participant) -> bool:
-        # don't transcribe other agents
-        # todo: maybe have a better way to detect
-        return ".agent" not in p.identity
+import re
-    async def _wait_for_disconnect(self, room: rtc.Room):
-        disconnected = asyncio.Future()
+logger = logging.getLogger("voice")
-        def on_disconnected(_):
-            disconnected.set_result(True)
-        room.on("disconnected", on_disconnected)
+def _replace_non_matching(text: str, allowed_chars: str, replacement: str) -> str:
+    """
+    Replaces every character in `text` that does not match the given
+    `allowed_chars` regex set with `replacement`.
-        logger.info("waiting for disconnection")
-        await disconnected
+    Parameters:
+    -----------
+    text : str
+        The input string on which the replacement is to be done.
+    allowed_chars : str
+        A string defining the set of allowed characters (part of a character set).
+        For example, "a-zA-Z0-9" will keep only letters and digits.
+    replacement : str
+        The string to replace non-matching characters with.
-    async def ask(self, *, context: AgentCallContext, arguments: dict):
-        logger.info("Transcriber connecting to %s", arguments)
-        output_path = arguments["path"]
-        room_name = arguments["room_name"]
+    Returns:
+    --------
+    str
+        A new string where all characters not in `allowed_chars` are replaced.
+    """
+    # Build a regex that matches any character NOT in allowed_chars
+    pattern = rf"[^{allowed_chars}]"
+    return re.sub(pattern, replacement, text)
-        client = context.room
-        doc = await client.sync.open(path=output_path)
-        try:
-            vad = silero.VAD.load()
-            utils.http_context._new_session_ctx()
-            pending_tasks = list()
-            participantNames = dict[str, str]()
+def safe_tool_name(name: str):
+    return _replace_non_matching(name, "a-zA-Z0-9_-", "_")
-            sst_provider = openai.STT()
-            # sst_provider = fal.WizperSTT()
-            room_options = rtc.RoomOptions(auto_subscribe=False)
-            room = rtc.Room()
-            url = (
-                self._livekit_url
-                if self._livekit_url is not None
-                else os.getenv("LIVEKIT_URL")
-            )
-            api_key = (
-                self._livekit_api_key
-                if self._livekit_api_key is not None
-                else os.getenv("LIVEKIT_API_KEY")
-            )
-            api_secret = (
-                self._livekit_api_secret
-                if self._livekit_api_secret is not None
-                else os.getenv("LIVEKIT_API_SECRET")
-            )
-            identity = (
-                self._livekit_identity
-                if self._livekit_identity is not None
-                else os.getenv("AGENT_IDENTITY")
-            )
-            token = (
-                api.AccessToken(api_key=api_key, api_secret=api_secret)
-                .with_identity(identity)
-                .with_name("Agent")
-                .with_kind("agent")
-                .with_grants(
-                    api.VideoGrants(
-                        can_update_own_metadata=True,
-                        room_join=True,
-                        room=room_name,
-                        agent=True,
-                    )
-                )
-            )
+class _Transcriber(Agent):
+    def __init__(self, *, stt: STT, room: RoomClient, participant: RemoteParticipant):
+        super().__init__(instructions="not-needed", stt=stt)
+        self.room = room
+        self.participant = participant
-            jwt = token.to_jwt()
-            await room.connect(url=url, token=jwt, options=room_options)
-            logger.info("connected to room: %s", room_name)
-            audio_streams = list[rtc.AudioStream]()
-            async def transcribe_track(
-                participant: rtc.RemoteParticipant, track: rtc.Track
-            ):
-                audio_stream = rtc.AudioStream(track)
-                stt_forwarder = transcription.STTSegmentsForwarder(
-                    room=room, participant=participant, track=track
-                )
+    async def on_user_turn_completed(
+        self, chat_ctx: llm.ChatContext, new_message: llm.ChatMessage
+    ):
+        logger.info(f"transcription: {new_message.text_content}")
+        self.room.messaging.send_message_nowait(
+            to=self.participant,
+            type="transcript",
+            message={"text": new_message.text_content},
+        )
-                audio_streams.append(audio_stream)
+        raise StopResponse()
-                stt = sst_provider
-                if not sst_provider.capabilities.streaming:
-                    stt = speech_to_text.StreamAdapter(
-                        stt=stt,
-                        vad=vad,
-                    )
-                stt_stream = stt.stream()
+class Transcriber(SingleRoomAgent):
+    def __init__(
+        self,
+        name: str,
+        title: Optional[str] = None,
+        description: Optional[str] = None,
+        labels: Optional[list[str]] = None,
+        requires: list[Requirement] = None,
+    ):
+        super().__init__(
+            name=name,
+            description=description,
+            title=title,
+            labels=labels,
+            requires=requires,
+        )
-                pending_tasks.append(
-                    asyncio.create_task(
-                        self._transcribe_participant(
-                            doc, room, participant, stt_stream, stt_forwarder
+    async def start(self, *, room):
+        await super().start(room=room)
+        await room.local_participant.set_attribute("supports_voice", True)
+        await room.messaging.enable()
+        room.messaging.on("message", self.on_message)
+    def on_message(self, message: RoomMessage):
+        if message.type == "voice_call":
+            breakout_room = message.message["breakout_room"]
+            logger.info(f"joining breakout room {breakout_room}")
+            def on_done(task: asyncio.Task):
+                try:
+                    task.result()
+                except CancelledError:
+                    pass
+                except Exception as e:
+                    logger.error(f"{e}", exc_info=e)
+            for participant in self.room.messaging.remote_participants:
+                if participant.id == message.from_participant_id:
+                    task = asyncio.create_task(
+                        self.run_voice_agent(
+                            participant=participant, breakout_room=breakout_room
                         )
                     )
-                )
-                async for ev in audio_stream:
-                    stt_stream.push_frame(ev.frame)
-            def subscribe_if_needed(pub: rtc.RemoteTrackPublication):
-                if pub.kind == rtc.TrackKind.KIND_AUDIO:
-                    pub.set_subscribed(True)
-            for p in room.remote_participants.values():
-                participantNames[p.identity] = p.name
-                if self.should_transcribe(p):
-                    for pub in p.track_publications.values():
-                        subscribe_if_needed(pub)
-            first_parts = dict[str, rtc.Participant]()
-            def on_transcript_event(
-                segments: list[TranscriptionSegment],
-                part: rtc.Participant | None,
-                pub: rtc.TrackPublication | None = None,
-            ) -> None:
-                nonlocal room
-                logger.info("Got transcription segment %s %s %s", segments, part, pub)
-                for segment in segments:
-                    if segment.id not in first_parts and part is not None:
-                        first_parts[segment.id] = part
-                    if segment.final:
-                        if part is None and segment.id in first_parts:
-                            part = first_parts[segment.id]
-                            first_parts.pop(segment.id)
-                        if part is not None:
-                            doc.root.append_child(
-                                tag_name="speech",
-                                attributes={
-                                    "text": segment.text,
-                                    "startTime": segment.start_time,
-                                    "endTime": segment.end_time,
-                                    "participantId": part.identity,
-                                    "participantName": part.name,
-                                },
-                            )
-                        else:
-                            logger.warning(
-                                "transcription was missing participant information"
-                            )
-            def on_participant_connected(p: rtc.RemoteParticipant):
-                participantNames[p.identity] = p.name
-            def on_track_published(
-                pub: rtc.RemoteTrackPublication, p: rtc.RemoteParticipant
-            ):
-                if self.should_transcribe(p):
-                    subscribe_if_needed(pub)
-            subscriptions = dict()
-            def on_track_unpublished(
-                pub: rtc.RemoteTrackPublication, p: rtc.RemoteParticipant
-            ):
-                if pub in subscriptions:
-                    logger.info("track unpublished, stopping transcription")
-                    # todo: maybe could be more graceful
-                    subscriptions[pub].cancel()
-                    subscriptions.pop(pub)
-            def on_track_subscribed(
-                track: rtc.Track,
-                publication: rtc.TrackPublication,
-                participant: rtc.RemoteParticipant,
-            ):
-                if track.kind == rtc.TrackKind.KIND_AUDIO:
-                    logger.info("transcribing track %s", track.sid)
-                    track_task = asyncio.create_task(
-                        transcribe_track(participant, track)
-                    )
+                    task.add_done_callback(on_done)
+                    return
-                    def on_transcription_done(t):
-                        try:
-                            t.result()
-                        except Exception as e:
-                            logger.error("Transcription failed", exc_info=e)
+            logger.error(f"unable to find participant {message.from_participant_id}")
-                    track_task.add_done_callback(on_transcription_done)
-                    pending_tasks.append(track_task)
-                    subscriptions[publication] = track_task
+    async def _wait_for_disconnect(self, room: rtc.Room):
+        disconnected = asyncio.Future()
-            for p in room.remote_participants.values():
-                on_participant_connected(p)
+        def on_disconnected(_):
+            disconnected.set_result(True)
-            room.on("participant_connected", on_participant_connected)
+        room.on("disconnected", on_disconnected)
-            room.on("track_published", on_track_published)
-            room.on("track_unpublished", on_track_unpublished)
-            room.on("track_subscribed", on_track_subscribed)
-            room.on("transcription_received", on_transcript_event)
+        logger.info("waiting for disconnection")
+        await disconnected
-            await self._wait_for_disconnect(room)
+    async def create_agent(
+        self, *, session: AgentSession, participant: RemoteParticipant
+    ):
+        return _Transcriber(
+            stt=openai.STT(),
+            room=self.room,
+            participant=participant,
+        )
+    def create_session(self) -> AgentSession:
+        token: str = self.room.protocol.token
+        url: str = self.room.room_url
-            logger.info("waited for termination")
-            await room.disconnect()
+        room_proxy_url = f"{url}/v1"
-            logger.info("closing audio streams")
+        oaiclient = AsyncOpenAI(
+            api_key=token,
+            base_url=room_proxy_url,
+            default_headers={"Meshagent-Session": self.room.session_id},
+        )
-            for stream in audio_streams:
-                await stream.aclose()
+        session = AgentSession(
+            max_tool_steps=50,
+            allow_interruptions=False,
+            vad=silero.VAD.load(),
+            stt=openai.STT(client=oaiclient),
+            # turn_detection=MultilingualModel(),
+        )
+        return session
-            logger.info("waiting for pending tasks")
-            gather_future = asyncio.gather(*pending_tasks)
+    async def run_voice_agent(self, *, participant: Participant, breakout_room: str):
+        async with VoiceConnection(
+            room=self.room, breakout_room=breakout_room
+        ) as connection:
+            logger.info("starting transcription agent")
-            gather_future.cancel()
-            try:
-                await gather_future
-            except Exception as e:
-                if not isinstance(e, asyncio.CancelledError):
-                    logger.warning("Did not shut down cleanly", exc_info=e)
-                pass
+            session = self.create_session()
-            print("done")
-        except Exception as e:
-            logger.info("Transcription failed", exc_info=e)
-        finally:
-            await utils.http_context._close_http_ctx()
-            logger.info("Transcription done")
+            agent = await self.create_agent(session=session, participant=participant)
-            await asyncio.sleep(5)
-            await client.sync.close(path=output_path)
+            await session.start(
+                agent=agent,
+                room=connection.livekit_room,
+                room_output_options=RoomOutputOptions(transcription_enabled=True),
+            )
-            return {}
+            logger.info("started transcription agent")
+            await self._wait_for_disconnect(room=connection.livekit_room)

meshagent/livekit/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.5.1"
1	+ __version__ = "0.5.3"

{meshagent_livekit-0.5.1.dist-info → meshagent_livekit-0.5.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: meshagent-livekit
-Version: 0.5.1
+Version: 0.5.3
 Summary: Livekit support for Meshagent
 License-Expression: Apache-2.0
 Project-URL: Documentation, https://docs.meshagent.com
@@ -17,8 +17,8 @@ Requires-Dist: livekit-agents~=1.1
 Requires-Dist: livekit-plugins-openai~=1.1
 Requires-Dist: livekit-plugins-silero~=1.1
 Requires-Dist: livekit-plugins-turn-detector~=1.1
-Requires-Dist: meshagent-api~=0.5.1
-Requires-Dist: meshagent-tools~=0.5.1
+Requires-Dist: meshagent-api~=0.5.3
+Requires-Dist: meshagent-tools~=0.5.3
 Dynamic: license-file
 # [Meshagent](https://www.meshagent.com)

meshagent_livekit-0.5.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+meshagent/livekit/__init__.py,sha256=X78Z4yEg5XfkNKH0HiIdG4k1q5ktB-ampTuXHLNFrAw,58
+meshagent/livekit/livekit_protocol.py,sha256=5Zu4ymLWEGt5SGXLNu94gOeyjnjhaV6uTS2FhSdODqs,1470
+meshagent/livekit/livekit_protocol_test.py,sha256=o7yYxXad4tMazcxFkq44yW-A9tJ0Lk6WdZpG5ifxcU4,2980
+meshagent/livekit/version.py,sha256=tgzuqHKcEdKBaP57F5oXxq4XlW2n9J4Fj8ZGu7nGOZg,22
+meshagent/livekit/agents/transcriber.py,sha256=S992oVVBt3ShWDQQWprLjyl6Yh0hyNRd8d3qCmg_toU,5795
+meshagent/livekit/agents/voice.py,sha256=STgjMSqzUgV9UAmleOy1vkgRXP93MDSYgiOO6Lo0peU,11964
+meshagent_livekit-0.5.3.dist-info/licenses/LICENSE,sha256=eTt0SPW-sVNdkZe9PS_S8WfCIyLjRXRl7sUBWdlteFg,10254
+meshagent_livekit-0.5.3.dist-info/METADATA,sha256=BBePj7Umfvg1htZtnNE3DZ2pubrlpsqQ9CqcZ_RNOzY,1760
+meshagent_livekit-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+meshagent_livekit-0.5.3.dist-info/top_level.txt,sha256=GlcXnHtRP6m7zlG3Df04M35OsHtNXy_DY09oFwWrH74,10
+meshagent_livekit-0.5.3.dist-info/RECORD,,

meshagent_livekit-0.5.1.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-meshagent/livekit/__init__.py,sha256=X78Z4yEg5XfkNKH0HiIdG4k1q5ktB-ampTuXHLNFrAw,58
-meshagent/livekit/livekit_protocol.py,sha256=5Zu4ymLWEGt5SGXLNu94gOeyjnjhaV6uTS2FhSdODqs,1470
-meshagent/livekit/livekit_protocol_test.py,sha256=o7yYxXad4tMazcxFkq44yW-A9tJ0Lk6WdZpG5ifxcU4,2980
-meshagent/livekit/version.py,sha256=eZ1bOun1DDVV0YLOBW4wj2FP1ajReLjbIrGmzN7ASBw,22
-meshagent/livekit/agents/transcriber.py,sha256=oqfHBhBSwU62LbsO8WFiJg3Xoi4vkWlTFzgTxBP0erg,13297
-meshagent/livekit/agents/voice.py,sha256=STgjMSqzUgV9UAmleOy1vkgRXP93MDSYgiOO6Lo0peU,11964
-meshagent_livekit-0.5.1.dist-info/licenses/LICENSE,sha256=eTt0SPW-sVNdkZe9PS_S8WfCIyLjRXRl7sUBWdlteFg,10254
-meshagent_livekit-0.5.1.dist-info/METADATA,sha256=IrvPuPE1C6GDh9sOJJS8g5pxDUpaT3usgtn01JbyJ-k,1760
-meshagent_livekit-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-meshagent_livekit-0.5.1.dist-info/top_level.txt,sha256=GlcXnHtRP6m7zlG3Df04M35OsHtNXy_DY09oFwWrH74,10
-meshagent_livekit-0.5.1.dist-info/RECORD,,

{meshagent_livekit-0.5.1.dist-info → meshagent_livekit-0.5.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{meshagent_livekit-0.5.1.dist-info → meshagent_livekit-0.5.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{meshagent_livekit-0.5.1.dist-info → meshagent_livekit-0.5.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

meshagent-livekit 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

Potentially problematic release.

meshagent-livekit 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl