vision-agents-plugins-qwen 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .cursor/*
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ downloads/
12
+ develop-eggs/
13
+ eggs/
14
+ .eggs/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ share/python-wheels/
21
+ pip-wheel-metadata/
22
+ MANIFEST
23
+ *.egg-info/
24
+ *.egg
25
+
26
+ # Installer logs
27
+ pip-log.txt
28
+ pip-delete-this-directory.txt
29
+
30
+ # Unit test / coverage reports
31
+ htmlcov/
32
+ .tox/
33
+ .nox/
34
+ .coverage
35
+ .coverage.*
36
+ .cache
37
+ coverage.xml
38
+ nosetests.xml
39
+ *.cover
40
+ *.py,cover
41
+ .hypothesis/
42
+ .pytest_cache/
43
+
44
+ # Type checker / lint caches
45
+ .mypy_cache/
46
+ .dmypy.json
47
+ dmypy.json
48
+ .pytype/
49
+ .pyre/
50
+ .ruff_cache/
51
+
52
+ # Environments
53
+ .venv
54
+ env/
55
+ venv/
56
+ ENV/
57
+ env.bak/
58
+ venv.bak/
59
+ .env
60
+ .env.local
61
+ .env.*.local
62
+ .env.bak
63
+ pyvenv.cfg
64
+ .python-version
65
+
66
+ # Editors / IDEs
67
+ .vscode/
68
+ .idea/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints/
72
+
73
+ # OS / Misc
74
+ .DS_Store
75
+ *.log
76
+
77
+ # Tooling & repo-specific
78
+ pyrightconfig.json
79
+ shell.nix
80
+ bin/*
81
+ lib/*
82
+ stream-py/
83
+
84
+ # Artifacts / assets
85
+ *.pt
86
+ *.kef
87
+ *.onnx
88
+ profile.html
89
+
90
+ /opencode.json
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-qwen
3
+ Version: 0.2.4
4
+ Summary: Qwen Omni plugin for vision agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Requires-Python: >=3.10
10
+ Requires-Dist: numpy
11
+ Requires-Dist: vision-agents
12
+ Requires-Dist: websockets>=15.0.1
13
+ Description-Content-Type: text/markdown
14
+
15
+ # Qwen Realtime Plugin for Vision Agents
16
+
17
+ Qwen3 Realtime LLM integration for the Vision Agents framework, with native audio output and built-in speech recognition over a WebSocket-based realtime connection.
18
+
19
+ ## Features
20
+
21
+ - **Native audio output**: No TTS service needed - audio comes directly from the model
22
+ - **Built-in STT**: Integrated speech-to-text using `gummy-realtime-v1` - no external STT service required
23
+ - **Server-side VAD**: Automatic turn detection with configurable silence thresholds
24
+ - **Video understanding**: Optional video frame support for multimodal interactions
25
+ - **Real-time streaming**: WebSocket-based bidirectional communication for low-latency responses
26
+ - **Interruption handling**: Automatic cancellation when user starts speaking
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ uv add vision-agents[qwen]
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ ```python
37
+ from vision_agents.core import User, Agent
38
+ from vision_agents.plugins import getstream, qwen
39
+
40
+ agent = Agent(
41
+ edge=getstream.Edge(),
42
+ agent_user=User(name="Qwen Assistant"),
43
+ instructions="Be helpful and friendly",
44
+ llm=qwen.Realtime(
45
+ model="qwen3-omni-flash-realtime",
46
+ voice="Cherry",
47
+ fps=1,
48
+ ),
49
+ # No STT or TTS needed - Qwen Realtime provides both
50
+ )
51
+ ```
52
+
53
+ ## Configuration
54
+
55
+ | Parameter | Description | Default | Accepted Values |
56
+ |-----------|-------------|---------|----------------|
57
+ | `model` | Qwen Realtime model identifier | `"qwen3-omni-flash-realtime"` | Model name string |
58
+ | `api_key` | DashScope API key | `None` (from env) | String or `None` |
59
+ | `base_url` | WebSocket API base URL | `"wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"` | URL string |
60
+ | `voice` | Voice for audio output | `"Cherry"` | Voice name string |
61
+ | `fps` | Video frames per second | `1` | Integer |
62
+ | `include_video` | Include video frames in requests | `False` | Boolean |
63
+ | `video_width` | Video frame width | `1280` | Integer |
64
+ | `video_height` | Video frame height | `720` | Integer |
65
+
66
+ ## Environment Variables
67
+
68
+ Set `DASHSCOPE_API_KEY` in your environment or `.env` file:
69
+
70
+ ```bash
71
+ DASHSCOPE_API_KEY=your_dashscope_api_key_here
72
+ ```
73
+
74
+ ## Example
75
+
76
+ See `plugins/qwen/example/qwen_realtime_example.py` for a complete working example.
77
+
78
+ ## Dependencies
79
+
80
+ - vision-agents
81
+ - websockets
82
+ - aiortc
83
+ - av
@@ -0,0 +1,69 @@
1
+ # Qwen Realtime Plugin for Vision Agents
2
+
3
+ Qwen3 Realtime LLM integration for the Vision Agents framework, with native audio output and built-in speech recognition over a WebSocket-based realtime connection.
4
+
5
+ ## Features
6
+
7
+ - **Native audio output**: No TTS service needed - audio comes directly from the model
8
+ - **Built-in STT**: Integrated speech-to-text using `gummy-realtime-v1` - no external STT service required
9
+ - **Server-side VAD**: Automatic turn detection with configurable silence thresholds
10
+ - **Video understanding**: Optional video frame support for multimodal interactions
11
+ - **Real-time streaming**: WebSocket-based bidirectional communication for low-latency responses
12
+ - **Interruption handling**: Automatic cancellation when user starts speaking
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ uv add vision-agents[qwen]
18
+ ```
19
+
20
+ ## Usage
21
+
22
+ ```python
23
+ from vision_agents.core import User, Agent
24
+ from vision_agents.plugins import getstream, qwen
25
+
26
+ agent = Agent(
27
+ edge=getstream.Edge(),
28
+ agent_user=User(name="Qwen Assistant"),
29
+ instructions="Be helpful and friendly",
30
+ llm=qwen.Realtime(
31
+ model="qwen3-omni-flash-realtime",
32
+ voice="Cherry",
33
+ fps=1,
34
+ ),
35
+ # No STT or TTS needed - Qwen Realtime provides both
36
+ )
37
+ ```
38
+
39
+ ## Configuration
40
+
41
+ | Parameter | Description | Default | Accepted Values |
42
+ |-----------|-------------|---------|----------------|
43
+ | `model` | Qwen Realtime model identifier | `"qwen3-omni-flash-realtime"` | Model name string |
44
+ | `api_key` | DashScope API key | `None` (from env) | String or `None` |
45
+ | `base_url` | WebSocket API base URL | `"wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"` | URL string |
46
+ | `voice` | Voice for audio output | `"Cherry"` | Voice name string |
47
+ | `fps` | Video frames per second | `1` | Integer |
48
+ | `include_video` | Include video frames in requests | `False` | Boolean |
49
+ | `video_width` | Video frame width | `1280` | Integer |
50
+ | `video_height` | Video frame height | `720` | Integer |
51
+
52
+ ## Environment Variables
53
+
54
+ Set `DASHSCOPE_API_KEY` in your environment or `.env` file:
55
+
56
+ ```bash
57
+ DASHSCOPE_API_KEY=your_dashscope_api_key_here
58
+ ```
59
+
60
+ ## Example
61
+
62
+ See `plugins/qwen/example/qwen_realtime_example.py` for a complete working example.
63
+
64
+ ## Dependencies
65
+
66
+ - vision-agents
67
+ - websockets
68
+ - aiortc
69
+ - av
@@ -0,0 +1,94 @@
1
+ # Qwen Realtime Example
2
+
3
+ This example demonstrates how to use Qwen Realtime with Vision Agents for real-time conversations.
4
+
5
+ ## Features
6
+
7
+ - **Real-time streaming**: Direct audio streaming from Qwen Realtime API
8
+ - **No text input**: The model does not support text input, so start speaking once you join the call
9
+ - **Video support**: Configure frames per second for video processing
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ uv add vision-agents[qwen]
15
+ ```
16
+
17
+ ## Quick Start
18
+
19
+ 1. Set your API key in your environment:
20
+
21
+ ```bash
22
+ export DASHSCOPE_API_KEY=your_dashscope_api_key_here
23
+ ```
24
+
25
+ Or create a `.env` file:
26
+
27
+ ```
28
+ DASHSCOPE_API_KEY=your_dashscope_api_key_here
29
+ ```
30
+
31
+ 2. Run the example:
32
+
33
+ ```bash
34
+ uv run python qwen_realtime_example.py
35
+ ```
36
+
37
+ ## Code Example
38
+
39
+ ```python
40
+ from dotenv import load_dotenv
41
+ from vision_agents.core import Agent, User, cli
42
+ from vision_agents.core.agents import AgentLauncher
43
+ from vision_agents.plugins import getstream, qwen
44
+
45
+ load_dotenv()
46
+
47
+ async def create_agent(**kwargs) -> Agent:
48
+ llm = qwen.Realtime(fps=1)
49
+
50
+ agent = Agent(
51
+ edge=getstream.Edge(),
52
+ agent_user=User(name="Qwen Assistant", id="agent"),
53
+ instructions="You are a helpful AI assistant. Be friendly and conversational.",
54
+ llm=llm,
55
+ )
56
+ return agent
57
+
58
+ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
59
+ await agent.create_user()
60
+ call = await agent.create_call(call_type, call_id)
61
+
62
+ with await agent.join(call):
63
+ await agent.edge.open_demo(call)
64
+ await agent.finish()
65
+
66
+ if __name__ == "__main__":
67
+ cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
68
+ ```
69
+
70
+ ## Configuration
71
+
72
+ ### Environment Variables
73
+
74
+ - **`DASHSCOPE_API_KEY`**: Your DashScope/Alibaba API key (required)
75
+
76
+ ### Realtime Parameters
77
+
78
+ | Parameter | Description | Default |
79
+ |-----------|-------------|---------|
80
+ | `fps` | Video frames per second | `1` |
81
+ | `api_key` | DashScope API key | `None` (from env) |
82
+
83
+ ## Requirements
84
+
85
+ - Python 3.10+
86
+ - DashScope API key
87
+ - Stream API credentials (configured via `getstream.Edge()`)
88
+ - `vision-agents` framework
89
+
90
+ ## Notes
91
+
92
+ - The model is hosted in Singapore, so latency may vary depending on your location
93
+ - The model does not support text input - once you join the call, simply start speaking to the agent
94
+ - This example uses the CLI interface for easy interaction
File without changes
@@ -0,0 +1,6 @@
1
+ # DashScope API key for Qwen Omni
2
+ DASHSCOPE_API_KEY=your_dashscope_api_key_here
3
+
4
+ # Stream API credentials
5
+ STREAM_API_KEY=your_stream_api_key
6
+ STREAM_API_SECRET=your_stream_api_secret
@@ -0,0 +1,16 @@
1
+ [project]
2
+ name = "qwen-omni-example"
3
+ version = "0.1.0"
4
+ description = "Example using Qwen Omni with Vision Agents"
5
+ requires-python = ">=3.10"
6
+ dependencies = [
7
+ "vision-agents",
8
+ "vision-agents-plugins-qwen",
9
+ "vision-agents-plugins-getstream",
10
+ "python-dotenv",
11
+ ]
12
+
13
+ [tool.uv.sources]
14
+ vision-agents = { workspace = true }
15
+ vision-agents-plugins-qwen = { workspace = true }
16
+ vision-agents-plugins-getstream = { workspace = true }
@@ -0,0 +1,36 @@
1
+ # This is a basic example using Qwen Realtime with Vision Agents
2
+ # To run this example, you must have DASHSCOPE_API_KEY set in your env.
3
+ # Do note that the model is hosted in Singapore so depending on your location, the latency may vary.
4
+ # This model also does not support text input so once you join the call, simply start speaking to the agent.
5
+
6
+ from dotenv import load_dotenv
7
+ from vision_agents.core import Agent, User, cli
8
+ from vision_agents.core.agents import AgentLauncher
9
+ from vision_agents.plugins import getstream, qwen
10
+
11
+ load_dotenv()
12
+
13
+
14
async def create_agent(**kwargs) -> Agent:
    """
    Build the demo agent backed by the Qwen realtime model.

    Any keyword arguments are accepted and ignored.
    """
    # Only an LLM is passed to the Agent: no separate STT or TTS plugin is
    # configured in this example.
    realtime_llm = qwen.Realtime(fps=1)
    return Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Qwen Assistant", id="agent"),
        instructions="You are a helpful AI assistant. Be friendly and conversational.",
        llm=realtime_llm,
    )
24
+
25
+
26
async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    """
    Create the agent user and the call, join it, and wait for the agent to finish.

    Any extra keyword arguments are accepted and ignored.
    """
    await agent.create_user()
    active_call = await agent.create_call(call_type, call_id)

    # `agent.join()` is awaited first; its result is used as a context manager
    # for the duration of the call.
    session = await agent.join(active_call)
    with session:
        await agent.edge.open_demo(active_call)
        await agent.finish()
33
+
34
+
35
if __name__ == "__main__":
    # Hand both coroutines to the launcher and let the CLI drive it.
    launcher = AgentLauncher(create_agent=create_agent, join_call=join_call)
    cli(launcher)
File without changes
@@ -0,0 +1,37 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agents-plugins-qwen"
7
+ dynamic = ["version"]
8
+ description = "Qwen Omni plugin for vision agents"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ dependencies = [
13
+ "vision-agents",
14
+ "numpy",
15
+ "websockets>=15.0.1",
16
+ ]
17
+
18
+ [project.urls]
19
+ Documentation = "https://visionagents.ai/"
20
+ Website = "https://visionagents.ai/"
21
+ Source = "https://github.com/GetStream/Vision-Agents"
22
+
23
+ [tool.hatch.version]
24
+ source = "vcs"
25
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["."]
29
+
30
+ [tool.uv.sources]
31
+ vision-agents = { workspace = true }
32
+
33
+ [dependency-groups]
34
+ dev = [
35
+ "pytest>=8.4.1",
36
+ "pytest-asyncio>=1.0.0",
37
+ ]
@@ -0,0 +1,86 @@
1
+ import asyncio
2
+
3
+ import dotenv
4
+ import pytest
5
+ from vision_agents.core.llm.events import (
6
+ RealtimeAudioOutputEvent,
7
+ )
8
+ from vision_agents.plugins.qwen import Realtime
9
+
10
+ dotenv.load_dotenv()
11
+
12
+
13
@pytest.fixture()
async def llm():
    """Provide a Realtime instance to the test and close it afterwards."""
    session = Realtime(
        fps=1,
        vad_silence_duration_ms=0,
        vad_prefix_padding_ms=0,
        vad_threshold=0.1,
    )
    yield session
    # Teardown: release the connection and background resources.
    await session.close()
21
+
22
+
23
class TestQwen3Realtime:
    """Integration tests for the Qwen3Realtime connect, audio and video flows."""

    @pytest.mark.integration
    async def test_audio_sending_flow(self, llm, mia_audio_16khz, silence_1s_16khz):
        """Send recorded speech framed by silence and expect audio output events."""
        audio_events = []

        @llm.events.subscribe
        async def on_audio(event: RealtimeAudioOutputEvent):
            audio_events.append(event)

        # Open the session and give it time to process the connection events.
        await llm.connect()
        await asyncio.sleep(5.0)

        # Silence -> speech -> silence.
        await llm.simple_audio_response(silence_1s_16khz)
        await llm.simple_audio_response(mia_audio_16khz)
        await llm.simple_audio_response(silence_1s_16khz)

        # Leave the model a few seconds to answer.
        await asyncio.sleep(10.0)

        # The model must have replied with at least one audio event.
        assert len(audio_events) > 0

    @pytest.mark.integration
    async def test_video_sending_flow(
        self,
        llm,
        bunny_video_track,
        describe_what_you_see_audio_16khz,
        silence_1s_16khz,
    ):
        """Send video frames plus a spoken prompt and expect audio output events."""
        audio_events = []

        @llm.events.subscribe
        async def on_audio(event: RealtimeAudioOutputEvent):
            audio_events.append(event)

        # Open the session and let the model handle all connection events.
        await llm.connect()
        await asyncio.sleep(5.0)

        # Audio goes first: one second of silence.
        await llm.simple_audio_response(silence_1s_16khz)
        # Start forwarding video frames (the fixture configures a low FPS).
        await llm.watch_video_track(bunny_video_track)
        # The prompt is spoken because the model does not accept text input.
        await llm.simple_audio_response(describe_what_you_see_audio_16khz)
        # Trailing silence.
        await llm.simple_audio_response(silence_1s_16khz)
        # Leave the model a few seconds to answer.
        await asyncio.sleep(10.0)

        # Stop forwarding video frames.
        await llm._stop_watching_video_track()
        # The model must have replied with at least one audio event.
        assert len(audio_events) > 0
@@ -0,0 +1,3 @@
1
+ from .qwen_realtime import Qwen3Realtime as Realtime
2
+
3
+ __all__ = ["Realtime"]
@@ -0,0 +1,151 @@
1
+ import asyncio
2
+ import base64
3
+ import contextlib
4
+ import json
5
+ import logging
6
+ import time
7
+ from typing import Any, AsyncIterator, Optional
8
+
9
+ import websockets
10
+ from getstream.video.rtc import PcmData
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class Qwen3RealtimeClient:
    """
    A wrapper around a WebSocket connection to the Qwen3Realtime API.

    The client re-establishes the connection when the server closes it with
    one of the close codes accepted by `_should_reconnect`. Every (re)connect
    re-sends the session configuration given at construction time.
    """

    def __init__(
        self,
        model: str,
        api_key: str,
        base_url: str,
        config: dict[str, Any],
        reconnect_backoff: float = 1.0,
    ) -> None:
        """
        Args:
            model: Model identifier, appended to the URL as the `model` query parameter.
            api_key: API key sent as a Bearer token in the `Authorization` header.
            base_url: WebSocket endpoint URL (without the query string).
            config: Session configuration sent in a `session.update` event
                after each successful connect.
            reconnect_backoff: Seconds to sleep before a reconnect attempt.
        """
        self._base_url = f"{base_url}?model={model}"
        self._api_key = api_key
        self._real_ws: Optional[websockets.ClientConnection] = None
        # Owns every websocket opened by connect(); closed as a whole in close().
        self._exit_stack = contextlib.AsyncExitStack()
        self._config = config
        self._conn_lock = asyncio.Lock()
        self._closed = False
        self._reconnect_backoff = reconnect_backoff

    async def connect(self) -> None:
        """
        Open the websocket and initialize the session with the stored config.

        If another connect is already in progress this call returns
        immediately without waiting for it. This also keeps the
        `connect -> update_session -> send_event -> connect` path from
        deadlocking on the lock when the initial `session.update` fails.
        """
        if self._conn_lock.locked():
            return None

        async with self._conn_lock:
            logger.debug(f"Connecting to Qwen3Realtime at {self._base_url}")
            self._real_ws = await self._exit_stack.enter_async_context(
                websockets.connect(
                    uri=self._base_url,
                    additional_headers={"Authorization": f"Bearer {self._api_key}"},
                )
            )
            # Initialize session with config params
            await self.update_session(self._config)
        return None

    async def close(self) -> None:
        """Mark the client as closed and close every websocket it opened."""
        self._closed = True
        try:
            await self._exit_stack.aclose()
        except Exception as e:
            # Best effort: a failure while closing must not break shutdown.
            logger.warning(f"Error closing session: {e}")

    async def read(self) -> AsyncIterator[dict[str, Any]]:
        """
        Yield JSON-decoded events received from the API.

        The iteration ends when the client is closed or when the connection
        is closed gracefully. On an abnormal closure the connection is
        re-established if `_should_reconnect` allows it; otherwise the
        `ConnectionClosedError` is re-raised.

        Raises:
            ValueError: If the websocket connection is not established yet.
            websockets.ConnectionClosedError: On a non-recoverable closure.
        """
        while not self._closed:
            try:
                async for raw_message in self._ws:
                    yield json.loads(raw_message)
            except websockets.ConnectionClosedError as e:
                if not _should_reconnect(e):
                    raise
                logger.warning(
                    f"Re-establishing Qwen3Realtime connection due to error: {e}"
                )
                await asyncio.sleep(self._reconnect_backoff)
                await self.connect()
            else:
                # The connection was closed gracefully, so the iterator ended
                # without raising. Iterating the same closed socket again
                # would end immediately every time and turn this loop into a
                # busy spin that never yields to the event loop.
                return

    async def send_event(self, event: dict[str, Any]) -> None:
        """
        Send a single event to the API, stamping it with an `event_id`.

        The passed dict is not modified; a stamped copy is serialized.

        If the connection turns out to be closed with a recoverable close
        code, the client reconnects.
        NOTE(review): the event that hit the closed connection is dropped,
        not resent after the reconnect - confirm this is intended.

        Raises:
            ValueError: If the websocket connection is not established yet.
            websockets.ConnectionClosedError: On a non-recoverable closure.
        """
        # Millisecond timestamp as the event id; copy so that the caller's
        # dict is left untouched.
        payload = {**event, "event_id": f"event_{int(time.time() * 1000)}"}

        try:
            await self._ws.send(json.dumps(payload))
        except websockets.ConnectionClosedError as e:
            if not _should_reconnect(e):
                raise
            logger.warning(
                f"Re-establishing Qwen3Realtime connection due to error: {e}"
            )
            await asyncio.sleep(self._reconnect_backoff)
            await self.connect()

    async def update_session(self, config: dict[str, Any]) -> None:
        """Update the session configuration."""
        await self.send_event(event={"type": "session.update", "session": config})

    async def send_audio(self, pcm: PcmData) -> None:
        """Stream raw audio data to the API."""
        # Only 16-bit, 16 kHz, mono PCM is supported.
        audio_bytes = pcm.resample(
            target_sample_rate=16000, target_channels=1
        ).samples.tobytes()
        audio_b64 = base64.b64encode(audio_bytes).decode()
        append_event = {"type": "input_audio_buffer.append", "audio": audio_b64}
        await self.send_event(append_event)

    async def commit_audio(self) -> None:
        """Commit the audio buffer to trigger processing."""
        event = {"type": "input_audio_buffer.commit"}
        await self.send_event(event)

    async def send_frame(self, frame_bytes: bytes) -> None:
        """
        Append image data to the image buffer.

        Note:
            - The image format must be JPG or JPEG. A resolution of 480p or 720p is recommended.
              The maximum supported resolution is 1080p.
            - A single image should not exceed 500 KB in size.
            - The image data is Base64-encoded by this method before sending.
            - We recommend sending images to the server at a rate of no more than 2 frames per second.
            - You must send audio data at least once before sending image data.
        """
        image_b64 = base64.b64encode(frame_bytes).decode()
        event = {"type": "input_image_buffer.append", "image": image_b64}
        await self.send_event(event)

    async def cancel_response(self) -> None:
        """Cancel the current response."""
        event = {"type": "response.cancel"}
        await self.send_event(event)

    @property
    def _ws(self) -> websockets.ClientConnection:
        """The active websocket; raises ValueError before the first connect."""
        if self._real_ws is None:
            raise ValueError("The websocket connection is not established yet")
        return self._real_ws
132
+
133
+
134
def _should_reconnect(exc: Exception) -> bool:
    """
    Decide whether a failure is temporary and worth a reconnect.

    Returns True only for a `websockets.ConnectionClosedError` that carries a
    received close frame whose code is one of the retryable codes below.
    """
    if not isinstance(exc, websockets.ConnectionClosedError):
        return False

    close_frame = exc.rcvd
    if not close_frame:
        # No close frame was received, so there is no close code to inspect.
        return False

    retryable_codes = (
        1011,  # Server-side exception or session timeout
        1012,  # Service restart
        1013,  # Try again later
        1014,  # Bad gateway
    )
    return close_frame.code in retryable_codes
@@ -0,0 +1,13 @@
1
+ from dataclasses import dataclass, field
2
+ from vision_agents.core.events import PluginBaseEvent
3
+ from typing import Optional, Any
4
+
5
+
6
@dataclass
class LLMErrorEvent(PluginBaseEvent):
    """Event emitted when an LLM encounters an error."""

    # Fixed event type identifier; excluded from __init__ (init=False).
    type: str = field(default="plugin.llm.error", init=False)
    # Name of the plugin that emitted the event.
    plugin_name: str = ""
    # Description of the error, if any.
    error_message: Optional[str] = None
    # Optional additional payload attached to the event.
    event_data: Optional[Any] = None
@@ -0,0 +1,283 @@
1
+ import asyncio
2
+ import base64
3
+ import logging
4
+ import os
5
+ import uuid
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from typing import Any, Optional, cast
8
+
9
+ import aiortc
10
+ import av
11
+ from aiortc import VideoStreamTrack
12
+ from getstream.video.rtc import PcmData
13
+ from vision_agents.core.edge.types import Participant
14
+ from vision_agents.core.llm import Realtime
15
+ from vision_agents.core.llm.events import LLMResponseChunkEvent
16
+ from vision_agents.core.llm.llm import LLMResponseEvent
17
+ from vision_agents.core.processors import Processor
18
+ from vision_agents.core.utils.video_forwarder import VideoForwarder
19
+ from vision_agents.core.utils.video_utils import frame_to_jpeg_bytes
20
+
21
+ from . import events
22
+ from .client import Qwen3RealtimeClient
23
+
24
+ DEFAULT_BASE_URL = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"
25
+ PLUGIN_NAME = "Qwen3Realtime"
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class Qwen3Realtime(Realtime):
    """
    Realtime LLM backed by the Qwen3 Realtime WebSocket API.

    Audio is streamed to the API through `simple_audio_response`, video
    frames through `watch_video_track`. A background task reads the API
    events and turns them into framework events (audio output, text chunks,
    transcriptions, errors). Text input is not supported.
    """

    def __init__(
        self,
        model: str = "qwen3-omni-flash-realtime",
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        voice: str = "Cherry",
        fps: int = 1,
        include_video: bool = False,
        video_width: int = 1280,
        video_height: int = 720,
        audio_transcription_model: str = "gummy-realtime-v1",
        vad_threshold: float = 0.1,
        vad_prefix_padding_ms: int = 500,
        vad_silence_duration_ms: int = 900,
    ):
        """
        Args:
            model: Model identifier passed to the API.
            api_key: API key; falls back to the `DASHSCOPE_API_KEY`
                environment variable when not given.
            base_url: WebSocket endpoint; defaults to `DEFAULT_BASE_URL`.
            voice: Voice name sent in the session configuration.
            fps: Frame rate used when forwarding video frames.
            include_video: Stored on the instance.
                NOTE(review): this flag is not read anywhere in this class,
                so frames are forwarded regardless of it - confirm intent.
            video_width: Width passed to the JPEG conversion of each frame.
            video_height: Height passed to the JPEG conversion of each frame.
            audio_transcription_model: Model name sent as
                `input_audio_transcription.model`.
            vad_threshold: Sent as `turn_detection.threshold`.
            vad_prefix_padding_ms: Sent as `turn_detection.prefix_padding_ms`.
            vad_silence_duration_ms: Sent as `turn_detection.silence_duration_ms`.

        Raises:
            ValueError: If no API key is given and none is found in the environment.
        """
        super().__init__(fps=fps)
        self.model = model
        self.voice = voice
        self.session_id = str(uuid.uuid4())
        self.events.register_events_from_module(events)

        self._base_url = base_url or DEFAULT_BASE_URL

        api_key = api_key or os.getenv("DASHSCOPE_API_KEY")
        if not api_key:
            raise ValueError("api_key is required")
        self._api_key = cast(str, api_key)

        self._video_forwarder: Optional[VideoForwarder] = None
        self._include_video = include_video
        self._real_client: Optional[Qwen3RealtimeClient] = None
        self._processing_task: Optional[asyncio.Task] = None
        self._video_width = video_width
        self._video_height = video_height
        # Single worker used for the frame -> JPEG conversion.
        self._executor = ThreadPoolExecutor(max_workers=1)

        self._is_responding = False
        self._current_response_id: Optional[str] = None
        self._current_item_id: Optional[str] = None
        self._current_participant: Optional[Participant] = None
        # The model requires us not to send any video frames until the audio is sent
        self._audio_emitted_once = False
        self._audio_transcription_model = audio_transcription_model
        self._vad_threshold = vad_threshold
        self._vad_prefix_padding_ms = vad_prefix_padding_ms
        self._vad_silence_duration_ms = vad_silence_duration_ms

    async def connect(self):
        """
        Open a new session with the API and start processing its events.

        Safe to call again to reconnect: the previous processing task is
        stopped and the previous client is closed before the new one is created.
        """
        # Stop the processing task first in case we're reconnecting
        await self._stop_processing_task()

        # Close the previous client (if any) so its websocket is not leaked
        # when the reference is replaced below.
        if self._real_client is not None:
            await self._real_client.close()
            self._real_client = None

        # A new session starts without any audio, so frames must wait again.
        self._audio_emitted_once = False

        # Session configuration
        session_config = {
            "modalities": ["text", "audio"],
            "voice": self.voice,
            "instructions": self._instructions,
            "input_audio_format": "pcm16",
            "output_audio_format": "pcm24",
            "input_audio_transcription": {"model": self._audio_transcription_model},
            "turn_detection": {
                "type": "server_vad",
                "threshold": self._vad_threshold,
                "prefix_padding_ms": self._vad_prefix_padding_ms,
                "silence_duration_ms": self._vad_silence_duration_ms,
            },
        }
        self._real_client = Qwen3RealtimeClient(
            api_key=self._api_key,
            base_url=self._base_url,
            model=self.model,
            config=session_config,
        )
        await self._real_client.connect()
        self.connected = True
        logger.debug(f"Started Qwen3Realtime session at {self._base_url}")

        # Start the loop task
        self._start_processing_task()

    async def simple_audio_response(
        self, pcm: PcmData, participant: Optional[Participant] = None
    ):
        """
        Stream a chunk of audio to the model.

        Does nothing while the session is not connected.

        Args:
            pcm: Audio data to send.
            participant: Participant the audio belongs to, if known.
        """
        if not self.connected:
            return
        self._current_participant = participant
        await self._client.send_audio(pcm=pcm)
        # From now on video frames may be forwarded as well.
        self._audio_emitted_once = True

    async def simple_response(
        self,
        text: str,
        processors: Optional[list[Processor]] = None,
        participant: Optional[Participant] = None,
    ) -> LLMResponseEvent[Any]:
        """
        Text input is not supported: log a warning and return an empty response.
        """
        logger.warning(
            f'Cannot reply to "{text}"; reason - Qwen3Realtime does not support text inputs'
        )
        return LLMResponseEvent(text="", original=None)

    async def close(self):
        """
        Stop video forwarding and event processing and close the connection.

        Cleanup continues even if the background processing task had failed.
        """
        self.connected = False
        await self._stop_watching_video_track()
        await self._stop_processing_task()

        self._executor.shutdown(wait=False)

        if self._real_client is not None:
            await self._real_client.close()
            self._real_client = None

    async def watch_video_track(
        self,
        track: aiortc.mediastreams.MediaStreamTrack,
        shared_forwarder: Optional[VideoForwarder] = None,
    ) -> None:
        """
        Start sending video frames using VideoForwarder.

        Args:
            track: Video track to watch
            shared_forwarder: Optional shared VideoForwarder to use instead of creating a new one
        """

        # This method can be called multiple times with different forwarders
        # Remove handler from old forwarder if it exists
        await self._stop_watching_video_track()

        self._video_forwarder = shared_forwarder or VideoForwarder(
            input_track=cast(VideoStreamTrack, track),
            max_buffer=5,
            fps=float(self.fps),
            name="qwen3realtime_forwarder",
        )

        # Add frame handler (starts automatically)
        self._video_forwarder.add_frame_handler(self._send_video_frame, fps=self.fps)
        logger.info(f"Started video forwarding with {self.fps} FPS")

    async def _send_video_frame(self, frame: av.VideoFrame) -> None:
        """
        Convert a video frame to JPEG and send it to the Qwen3 Realtime API.

        Frames are skipped until audio has been sent at least once.
        Failures while sending are logged and not propagated.

        Parameters:
            frame: Video frame to send.
        """
        if not self._audio_emitted_once:
            # Wait until the audio is sent at least once before forwarding frames
            # per the model spec.
            return

        loop = asyncio.get_running_loop()

        # Run frame conversion in a separate thread to avoid blocking the loop.
        jpg_bytes = await loop.run_in_executor(
            self._executor,
            frame_to_jpeg_bytes,
            frame,
            self._video_width,
            self._video_height,
        )

        try:
            await self._client.send_frame(jpg_bytes)
        except Exception:
            logger.exception("Failed to send a video frame to Qwen3 Realtime API")

    async def _stop_watching_video_track(self) -> None:
        """Detach the frame handler from the current forwarder, if there is one."""
        if self._video_forwarder is not None:
            await self._video_forwarder.remove_frame_handler(self._send_video_frame)

    @property
    def _client(self) -> Qwen3RealtimeClient:
        """The active API client; raises ValueError before `connect()`."""
        if self._real_client is None:
            raise ValueError("The Qwen3Realtime session is not established yet")
        return self._real_client

    async def _processing_loop(self) -> None:
        """Body of the background task: process API events until cancelled."""
        logger.debug("Start processing events by Qwen3Realtime")
        try:
            await self._process_events()
        except asyncio.CancelledError:
            logger.debug("Stop processing events by Qwen3Realtime")
        except Exception:
            # Nobody awaits this task while it runs, so log the failure here;
            # otherwise it would go unnoticed until the task is awaited.
            logger.exception("Qwen3Realtime event processing failed")
            raise

    def _start_processing_task(self) -> None:
        """Start the background task that reads and dispatches API events."""
        self._processing_task = asyncio.create_task(self._processing_loop())

    async def _stop_processing_task(self) -> None:
        """
        Cancel the background processing task and wait for it to finish.

        Never raises because of the task's own outcome, so callers can
        continue with their cleanup or reconnect.
        """
        task = self._processing_task
        if task is None:
            return
        self._processing_task = None

        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            # A task cancelled before its first step never reaches the
            # handler inside _processing_loop and ends up cancelled.
            # If the task itself is not cancelled, the cancellation targets
            # the caller and must propagate.
            if not task.cancelled():
                raise
        except Exception:
            # The task had already failed; _processing_loop logged the error.
            logger.debug(
                "Qwen3Realtime processing task had failed before it was stopped",
                exc_info=True,
            )

    async def _process_events(self):
        """Read events from the API and translate them into framework events."""
        async for event in self._client.read():
            event_type = event.get("type")
            if event_type == "error":
                error = event.get("error")
                logger.error(
                    f"Error received from Qwen3Realtime API: {error}",
                )
                self.events.send(
                    events.LLMErrorEvent(
                        plugin_name=PLUGIN_NAME,
                        # The field is declared as Optional[str]; the raw
                        # event travels in event_data.
                        error_message=None if error is None else str(error),
                        event_data=event,
                    )
                )
                continue

            elif event_type == "session.created":
                logger.debug("Qwen3Realtime session initialized successfully")

            elif event_type == "response.created":
                self._current_response_id = event.get("response", {}).get("id")
                self._is_responding = True
            elif event_type == "response.output_item.added":
                self._current_item_id = event.get("item", {}).get("id")
            elif event_type == "response.done":
                self._is_responding = False
                self._current_response_id = None
                self._current_item_id = None
            elif event_type == "input_audio_buffer.speech_started":
                # The user started speaking while the model is answering.
                if self._is_responding:
                    await self._on_interruption()
            elif event_type == "response.text.delta":
                self.events.send(
                    LLMResponseChunkEvent(
                        plugin_name=PLUGIN_NAME, delta=str(event["delta"])
                    )
                )
            elif event_type == "response.audio.delta":
                audio_bytes = base64.b64decode(event["delta"])
                # Output audio is wrapped as PCM with a 24000 Hz sample rate.
                pcm = PcmData.from_bytes(audio_bytes, 24000)
                self._emit_audio_output_event(audio_data=pcm)
            elif event_type == "conversation.item.input_audio_transcription.completed":
                transcript = event.get("transcript", "")
                if transcript:
                    self._emit_user_speech_transcription(text=transcript)
            elif event_type == "response.audio_transcript.delta":
                delta = event.get("delta", "")
                if delta:
                    self._emit_agent_speech_transcription(text=delta)

    async def _on_interruption(self):
        """Handle user interruption of the current response."""
        if not self._is_responding:
            return

        if self._current_response_id:
            await self._client.cancel_response()

        self._is_responding = False
        self._current_response_id = None
        self._current_item_id = None