vision-agents-plugins-qwen 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agents_plugins_qwen-0.2.4/.gitignore +90 -0
- vision_agents_plugins_qwen-0.2.4/PKG-INFO +83 -0
- vision_agents_plugins_qwen-0.2.4/README.md +69 -0
- vision_agents_plugins_qwen-0.2.4/example/README.md +94 -0
- vision_agents_plugins_qwen-0.2.4/example/__init__.py +0 -0
- vision_agents_plugins_qwen-0.2.4/example/env.example +6 -0
- vision_agents_plugins_qwen-0.2.4/example/pyproject.toml +16 -0
- vision_agents_plugins_qwen-0.2.4/example/qwen_realtime_example.py +36 -0
- vision_agents_plugins_qwen-0.2.4/py.typed +0 -0
- vision_agents_plugins_qwen-0.2.4/pyproject.toml +37 -0
- vision_agents_plugins_qwen-0.2.4/tests/test_qwen_realtime.py +86 -0
- vision_agents_plugins_qwen-0.2.4/vision_agents/plugins/qwen/__init__.py +3 -0
- vision_agents_plugins_qwen-0.2.4/vision_agents/plugins/qwen/client.py +151 -0
- vision_agents_plugins_qwen-0.2.4/vision_agents/plugins/qwen/events.py +13 -0
- vision_agents_plugins_qwen-0.2.4/vision_agents/plugins/qwen/qwen_realtime.py +283 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.cursor/*
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
downloads/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib64/
|
|
16
|
+
parts/
|
|
17
|
+
sdist/
|
|
18
|
+
var/
|
|
19
|
+
wheels/
|
|
20
|
+
share/python-wheels/
|
|
21
|
+
pip-wheel-metadata/
|
|
22
|
+
MANIFEST
|
|
23
|
+
*.egg-info/
|
|
24
|
+
*.egg
|
|
25
|
+
|
|
26
|
+
# Installer logs
|
|
27
|
+
pip-log.txt
|
|
28
|
+
pip-delete-this-directory.txt
|
|
29
|
+
|
|
30
|
+
# Unit test / coverage reports
|
|
31
|
+
htmlcov/
|
|
32
|
+
.tox/
|
|
33
|
+
.nox/
|
|
34
|
+
.coverage
|
|
35
|
+
.coverage.*
|
|
36
|
+
.cache
|
|
37
|
+
coverage.xml
|
|
38
|
+
nosetests.xml
|
|
39
|
+
*.cover
|
|
40
|
+
*.py,cover
|
|
41
|
+
.hypothesis/
|
|
42
|
+
.pytest_cache/
|
|
43
|
+
|
|
44
|
+
# Type checker / lint caches
|
|
45
|
+
.mypy_cache/
|
|
46
|
+
.dmypy.json
|
|
47
|
+
dmypy.json
|
|
48
|
+
.pytype/
|
|
49
|
+
.pyre/
|
|
50
|
+
.ruff_cache/
|
|
51
|
+
|
|
52
|
+
# Environments
|
|
53
|
+
.venv
|
|
54
|
+
env/
|
|
55
|
+
venv/
|
|
56
|
+
ENV/
|
|
57
|
+
env.bak/
|
|
58
|
+
venv.bak/
|
|
59
|
+
.env
|
|
60
|
+
.env.local
|
|
61
|
+
.env.*.local
|
|
62
|
+
.env.bak
|
|
63
|
+
pyvenv.cfg
|
|
64
|
+
.python-version
|
|
65
|
+
|
|
66
|
+
# Editors / IDEs
|
|
67
|
+
.vscode/
|
|
68
|
+
.idea/
|
|
69
|
+
|
|
70
|
+
# Jupyter Notebook
|
|
71
|
+
.ipynb_checkpoints/
|
|
72
|
+
|
|
73
|
+
# OS / Misc
|
|
74
|
+
.DS_Store
|
|
75
|
+
*.log
|
|
76
|
+
|
|
77
|
+
# Tooling & repo-specific
|
|
78
|
+
pyrightconfig.json
|
|
79
|
+
shell.nix
|
|
80
|
+
bin/*
|
|
81
|
+
lib/*
|
|
82
|
+
stream-py/
|
|
83
|
+
|
|
84
|
+
# Artifacts / assets
|
|
85
|
+
*.pt
|
|
86
|
+
*.kef
|
|
87
|
+
*.onnx
|
|
88
|
+
profile.html
|
|
89
|
+
|
|
90
|
+
/opencode.json
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vision-agents-plugins-qwen
|
|
3
|
+
Version: 0.2.4
|
|
4
|
+
Summary: Qwen Omni plugin for vision agents
|
|
5
|
+
Project-URL: Documentation, https://visionagents.ai/
|
|
6
|
+
Project-URL: Website, https://visionagents.ai/
|
|
7
|
+
Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Dist: vision-agents
|
|
12
|
+
Requires-Dist: websockets>=15.0.1
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# Qwen Realtime Plugin for Vision Agents
|
|
16
|
+
|
|
17
|
+
Qwen3 Realtime LLM integration for the Vision Agents framework, with native audio output and built-in speech recognition over a WebSocket-based realtime connection.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- **Native audio output**: No TTS service needed - audio comes directly from the model
|
|
22
|
+
- **Built-in STT**: Integrated speech-to-text using `gummy-realtime-v1` - no external STT service required
|
|
23
|
+
- **Server-side VAD**: Automatic turn detection with configurable silence thresholds
|
|
24
|
+
- **Video understanding**: Optional video frame support for multimodal interactions
|
|
25
|
+
- **Real-time streaming**: WebSocket-based bidirectional communication for low-latency responses
|
|
26
|
+
- **Interruption handling**: Automatic cancellation when user starts speaking
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv add "vision-agents[qwen]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from vision_agents.core import User, Agent
|
|
38
|
+
from vision_agents.plugins import getstream, qwen
|
|
39
|
+
|
|
40
|
+
agent = Agent(
|
|
41
|
+
edge=getstream.Edge(),
|
|
42
|
+
agent_user=User(name="Qwen Assistant"),
|
|
43
|
+
instructions="Be helpful and friendly",
|
|
44
|
+
llm=qwen.Realtime(
|
|
45
|
+
model="qwen3-omni-flash-realtime",
|
|
46
|
+
voice="Cherry",
|
|
47
|
+
fps=1,
|
|
48
|
+
),
|
|
49
|
+
# No STT or TTS needed - Qwen Realtime provides both
|
|
50
|
+
)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Configuration
|
|
54
|
+
|
|
55
|
+
| Parameter | Description | Default | Accepted Values |
|
|
56
|
+
|-----------|-------------|---------|----------------|
|
|
57
|
+
| `model` | Qwen Realtime model identifier | `"qwen3-omni-flash-realtime"` | Model name string |
|
|
58
|
+
| `api_key` | DashScope API key | `None` (from env) | String or `None` |
|
|
59
|
+
| `base_url` | WebSocket API base URL | `"wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"` | URL string |
|
|
60
|
+
| `voice` | Voice for audio output | `"Cherry"` | Voice name string |
|
|
61
|
+
| `fps` | Video frames per second | `1` | Integer |
|
|
62
|
+
| `include_video` | Include video frames in requests | `False` | Boolean |
|
|
63
|
+
| `video_width` | Video frame width | `1280` | Integer |
|
|
64
|
+
| `video_height` | Video frame height | `720` | Integer |
|
|
65
|
+
|
|
66
|
+
## Environment Variables
|
|
67
|
+
|
|
68
|
+
Set `DASHSCOPE_API_KEY` in your environment or `.env` file:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
DASHSCOPE_API_KEY=your_dashscope_api_key_here
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Example
|
|
75
|
+
|
|
76
|
+
See `plugins/qwen/example/qwen_realtime_example.py` for a complete working example.
|
|
77
|
+
|
|
78
|
+
## Dependencies
|
|
79
|
+
|
|
80
|
+
- vision-agents
|
|
81
|
+
- websockets
|
|
82
|
+
- aiortc
|
|
83
|
+
- av
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Qwen Realtime Plugin for Vision Agents
|
|
2
|
+
|
|
3
|
+
Qwen3 Realtime LLM integration for the Vision Agents framework, with native audio output and built-in speech recognition over a WebSocket-based realtime connection.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Native audio output**: No TTS service needed - audio comes directly from the model
|
|
8
|
+
- **Built-in STT**: Integrated speech-to-text using `gummy-realtime-v1` - no external STT service required
|
|
9
|
+
- **Server-side VAD**: Automatic turn detection with configurable silence thresholds
|
|
10
|
+
- **Video understanding**: Optional video frame support for multimodal interactions
|
|
11
|
+
- **Real-time streaming**: WebSocket-based bidirectional communication for low-latency responses
|
|
12
|
+
- **Interruption handling**: Automatic cancellation when user starts speaking
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
uv add "vision-agents[qwen]"
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Usage
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from vision_agents.core import User, Agent
|
|
24
|
+
from vision_agents.plugins import getstream, qwen
|
|
25
|
+
|
|
26
|
+
agent = Agent(
|
|
27
|
+
edge=getstream.Edge(),
|
|
28
|
+
agent_user=User(name="Qwen Assistant"),
|
|
29
|
+
instructions="Be helpful and friendly",
|
|
30
|
+
llm=qwen.Realtime(
|
|
31
|
+
model="qwen3-omni-flash-realtime",
|
|
32
|
+
voice="Cherry",
|
|
33
|
+
fps=1,
|
|
34
|
+
),
|
|
35
|
+
# No STT or TTS needed - Qwen Realtime provides both
|
|
36
|
+
)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Configuration
|
|
40
|
+
|
|
41
|
+
| Parameter | Description | Default | Accepted Values |
|
|
42
|
+
|-----------|-------------|---------|----------------|
|
|
43
|
+
| `model` | Qwen Realtime model identifier | `"qwen3-omni-flash-realtime"` | Model name string |
|
|
44
|
+
| `api_key` | DashScope API key | `None` (from env) | String or `None` |
|
|
45
|
+
| `base_url` | WebSocket API base URL | `"wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"` | URL string |
|
|
46
|
+
| `voice` | Voice for audio output | `"Cherry"` | Voice name string |
|
|
47
|
+
| `fps` | Video frames per second | `1` | Integer |
|
|
48
|
+
| `include_video` | Include video frames in requests | `False` | Boolean |
|
|
49
|
+
| `video_width` | Video frame width | `1280` | Integer |
|
|
50
|
+
| `video_height` | Video frame height | `720` | Integer |
|
|
51
|
+
|
|
52
|
+
## Environment Variables
|
|
53
|
+
|
|
54
|
+
Set `DASHSCOPE_API_KEY` in your environment or `.env` file:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
DASHSCOPE_API_KEY=your_dashscope_api_key_here
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Example
|
|
61
|
+
|
|
62
|
+
See `plugins/qwen/example/qwen_realtime_example.py` for a complete working example.
|
|
63
|
+
|
|
64
|
+
## Dependencies
|
|
65
|
+
|
|
66
|
+
- vision-agents
|
|
67
|
+
- websockets
|
|
68
|
+
- aiortc
|
|
69
|
+
- av
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Qwen Realtime Example
|
|
2
|
+
|
|
3
|
+
This example demonstrates how to use Qwen Realtime with Vision Agents for real-time conversations.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Real-time streaming**: Direct audio streaming from Qwen Realtime API
|
|
8
|
+
- **No text input**: The model does not support text input, so start speaking once you join the call
|
|
9
|
+
- **Video support**: Configure frames per second for video processing
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
uv add "vision-agents[qwen]"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quick Start
|
|
18
|
+
|
|
19
|
+
1. Set your API key in your environment:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
export DASHSCOPE_API_KEY=your_dashscope_api_key_here
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or create a `.env` file:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
DASHSCOPE_API_KEY=your_dashscope_api_key_here
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
2. Run the example:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
uv run python qwen_realtime_example.py
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Code Example
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from dotenv import load_dotenv
|
|
41
|
+
from vision_agents.core import Agent, User, cli
|
|
42
|
+
from vision_agents.core.agents import AgentLauncher
|
|
43
|
+
from vision_agents.plugins import getstream, qwen
|
|
44
|
+
|
|
45
|
+
load_dotenv()
|
|
46
|
+
|
|
47
|
+
async def create_agent(**kwargs) -> Agent:
|
|
48
|
+
llm = qwen.Realtime(fps=1)
|
|
49
|
+
|
|
50
|
+
agent = Agent(
|
|
51
|
+
edge=getstream.Edge(),
|
|
52
|
+
agent_user=User(name="Qwen Assistant", id="agent"),
|
|
53
|
+
instructions="You are a helpful AI assistant. Be friendly and conversational.",
|
|
54
|
+
llm=llm,
|
|
55
|
+
)
|
|
56
|
+
return agent
|
|
57
|
+
|
|
58
|
+
async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
|
|
59
|
+
await agent.create_user()
|
|
60
|
+
call = await agent.create_call(call_type, call_id)
|
|
61
|
+
|
|
62
|
+
with await agent.join(call):
|
|
63
|
+
await agent.edge.open_demo(call)
|
|
64
|
+
await agent.finish()
|
|
65
|
+
|
|
66
|
+
if __name__ == "__main__":
|
|
67
|
+
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Configuration
|
|
71
|
+
|
|
72
|
+
### Environment Variables
|
|
73
|
+
|
|
74
|
+
- **`DASHSCOPE_API_KEY`**: Your DashScope/Alibaba API key (required)
|
|
75
|
+
|
|
76
|
+
### Realtime Parameters
|
|
77
|
+
|
|
78
|
+
| Parameter | Description | Default |
|
|
79
|
+
|-----------|-------------|---------|
|
|
80
|
+
| `fps` | Video frames per second | `1` |
|
|
81
|
+
| `api_key` | DashScope API key | `None` (from env) |
|
|
82
|
+
|
|
83
|
+
## Requirements
|
|
84
|
+
|
|
85
|
+
- Python 3.10+
|
|
86
|
+
- DashScope API key
|
|
87
|
+
- Stream API credentials (configured via `getstream.Edge()`)
|
|
88
|
+
- `vision-agents` framework
|
|
89
|
+
|
|
90
|
+
## Notes
|
|
91
|
+
|
|
92
|
+
- The model is hosted in Singapore, so latency may vary depending on your location
|
|
93
|
+
- The model does not support text input - once you join the call, simply start speaking to the agent
|
|
94
|
+
- This example uses the CLI interface for easy interaction
|
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "qwen-omni-example"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Example using Qwen Omni with Vision Agents"
|
|
5
|
+
requires-python = ">=3.10"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"vision-agents",
|
|
8
|
+
"vision-agents-plugins-qwen",
|
|
9
|
+
"vision-agents-plugins-getstream",
|
|
10
|
+
"python-dotenv",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[tool.uv.sources]
|
|
14
|
+
vision-agents = { workspace = true }
|
|
15
|
+
vision-agents-plugins-qwen = { workspace = true }
|
|
16
|
+
vision-agents-plugins-getstream = { workspace = true }
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# This is a basic example using Qwen Realtime with Vision Agents
|
|
2
|
+
# To run this example, you must have DASHSCOPE_API_KEY set in your env.
|
|
3
|
+
# Do note that the model is hosted in Singapore so depending on your location, the latency may vary.
|
|
4
|
+
# This model also does not support text input so once you join the call, simply start speaking to the agent.
|
|
5
|
+
|
|
6
|
+
from dotenv import load_dotenv
|
|
7
|
+
from vision_agents.core import Agent, User, cli
|
|
8
|
+
from vision_agents.core.agents import AgentLauncher
|
|
9
|
+
from vision_agents.plugins import getstream, qwen
|
|
10
|
+
|
|
11
|
+
load_dotenv()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
async def create_agent(**kwargs) -> Agent:
    """Build the demo agent that talks through the Qwen Realtime model.

    Any keyword arguments are accepted and ignored.
    """
    # The realtime model is created first, exactly one video frame per second.
    realtime_llm = qwen.Realtime(fps=1)

    return Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Qwen Assistant", id="agent"),
        instructions="You are a helpful AI assistant. Be friendly and conversational.",
        llm=realtime_llm,
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    """Create the call, join it with the agent and wait for the agent to finish.

    Any extra keyword arguments are accepted and ignored.
    """
    await agent.create_user()
    active_call = await agent.create_call(call_type, call_id)

    # ``agent.join`` is awaited first; its result is the context manager
    # that stays open for the duration of the call.
    session = await agent.join(active_call)
    with session:
        await agent.edge.open_demo(active_call)
        await agent.finish()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if __name__ == "__main__":
|
|
36
|
+
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
|
|
File without changes
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vision-agents-plugins-qwen"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Qwen Omni plugin for vision agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"vision-agents",
|
|
14
|
+
"numpy",
|
|
15
|
+
"websockets>=15.0.1",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Documentation = "https://visionagents.ai/"
|
|
20
|
+
Website = "https://visionagents.ai/"
|
|
21
|
+
Source = "https://github.com/GetStream/Vision-Agents"
|
|
22
|
+
|
|
23
|
+
[tool.hatch.version]
|
|
24
|
+
source = "vcs"
|
|
25
|
+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = ["."]
|
|
29
|
+
|
|
30
|
+
[tool.uv.sources]
|
|
31
|
+
vision-agents = { workspace = true }
|
|
32
|
+
|
|
33
|
+
[dependency-groups]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=8.4.1",
|
|
36
|
+
"pytest-asyncio>=1.0.0",
|
|
37
|
+
]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
import dotenv
|
|
4
|
+
import pytest
|
|
5
|
+
from vision_agents.core.llm.events import (
|
|
6
|
+
RealtimeAudioOutputEvent,
|
|
7
|
+
)
|
|
8
|
+
from vision_agents.plugins.qwen import Realtime
|
|
9
|
+
|
|
10
|
+
dotenv.load_dotenv()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pytest.fixture()
async def llm():
    """Provide a Realtime instance to the test and close it afterwards."""
    vad_settings = {
        "vad_silence_duration_ms": 0,
        "vad_prefix_padding_ms": 0,
        "vad_threshold": 0.1,
    }
    realtime = Realtime(fps=1, **vad_settings)
    yield realtime
    # Teardown: runs once the test that requested the fixture is done
    await realtime.close()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestQwen3Realtime:
    """Integration tests for Qwen3Realtime connect flow"""

    @pytest.mark.integration
    async def test_audio_sending_flow(self, llm, mia_audio_16khz, silence_1s_16khz):
        """Send a speech clip framed by silence and expect audio output events."""
        audio_events = []

        @llm.events.subscribe
        async def on_audio(event: RealtimeAudioOutputEvent):
            audio_events.append(event)

        await llm.connect()
        # Give the session time to handle the connection events
        await asyncio.sleep(5.0)

        # One second of silence, the speech clip, then silence again
        for clip in (silence_1s_16khz, mia_audio_16khz, silence_1s_16khz):
            await llm.simple_audio_response(clip)

        # Let it run for a few seconds
        await asyncio.sleep(10.0)

        # The model must have replied with audio
        assert audio_events

    @pytest.mark.integration
    async def test_video_sending_flow(
        self,
        llm,
        bunny_video_track,
        describe_what_you_see_audio_16khz,
        silence_1s_16khz,
    ):
        """Send video frames plus a spoken prompt and expect audio output events."""
        audio_events = []

        @llm.events.subscribe
        async def on_audio(event: RealtimeAudioOutputEvent):
            audio_events.append(event)

        await llm.connect()
        # Let the model handle all connection events
        await asyncio.sleep(5.0)

        # Audio goes first: one second of silence
        await llm.simple_audio_response(silence_1s_16khz)
        # Start the video sender (the fixture uses a low FPS to avoid
        # overwhelming the connection)
        await llm.watch_video_track(bunny_video_track)
        # The prompt is spoken because the model does not support text inputs,
        # and it is followed by silence again
        for clip in (describe_what_you_see_audio_16khz, silence_1s_16khz):
            await llm.simple_audio_response(clip)

        # Let it run for a few seconds
        await asyncio.sleep(10.0)

        # Stop the video sender before checking the outcome
        await llm._stop_watching_video_track()
        # The model must have replied
        assert audio_events
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import base64
|
|
3
|
+
import contextlib
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
from typing import Any, AsyncIterator, Optional
|
|
8
|
+
|
|
9
|
+
import websockets
|
|
10
|
+
from getstream.video.rtc import PcmData
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Qwen3RealtimeClient:
    """
    A wrapper around WebSocket connection for Qwen3Realtime API.
    It automatically reconnects in case of connection failures.

    Only close codes accepted by ``_should_reconnect`` trigger a reconnect;
    any other abnormal closure is re-raised to the caller.
    """

    def __init__(
        self,
        model: str,
        api_key: str,
        base_url: str,
        config: dict[str, Any],
        reconnect_backoff: float = 1.0,
    ) -> None:
        """
        Args:
            model: Model name, appended to the URL as the ``model`` query parameter.
            api_key: Key sent as a ``Bearer`` token in the ``Authorization`` header.
            base_url: WebSocket endpoint without the query string.
            config: Session configuration sent with ``session.update`` after
                every successful (re)connect.
            reconnect_backoff: Seconds to wait before a reconnect attempt.
        """
        self._base_url = f"{base_url}?model={model}"
        self._api_key = api_key
        self._real_ws: Optional[websockets.ClientConnection] = None
        self._exit_stack = contextlib.AsyncExitStack()
        self._config = config
        self._conn_lock = asyncio.Lock()
        self._closed = False
        self._reconnect_backoff = reconnect_backoff

    async def connect(self) -> None:
        """
        Open the websocket and initialize the session with the stored config.

        If another connect is already in progress the call returns immediately
        without waiting for it.
        """
        if self._conn_lock.locked():
            return None

        async with self._conn_lock:
            # Release the previous connection (if any) so that reconnects do
            # not pile up websockets on the exit stack.
            await self._release_connection()
            logger.debug(f"Connecting to Qwen3Realtime at {self._base_url}")
            self._real_ws = await self._exit_stack.enter_async_context(
                websockets.connect(
                    uri=self._base_url,
                    additional_headers={"Authorization": f"Bearer {self._api_key}"},
                )
            )
            # Initialize session with config params
            await self.update_session(self._config)
        return None

    async def close(self) -> None:
        """Mark the client as closed and release the websocket; errors are only logged."""
        self._closed = True
        await self._release_connection()

    async def read(self) -> AsyncIterator[dict[str, Any]]:
        """
        Yield server events decoded from JSON until the client or the connection is closed.

        Raises:
            websockets.ConnectionClosedError: if the connection broke with a
                close code that is not eligible for a reconnect.
            ValueError: if the connection has not been established yet.
        """
        while not self._closed:
            ws = self._ws
            try:
                async for msg in ws:
                    yield json.loads(msg)
            except websockets.ConnectionClosedError as e:
                if not _should_reconnect(e):
                    raise
                logger.warning(
                    f"Re-establishing Qwen3Realtime connection due to error: {e}"
                )
                await self._reconnect(failed_ws=ws)
            else:
                # Iteration ends without an error when the connection was
                # closed normally. Iterating the same closed connection again
                # would spin forever without yielding to the event loop, so
                # stop unless a newer connection has replaced this one.
                if self._real_ws is ws:
                    return

    async def send_event(self, event: dict[str, Any]) -> None:
        """
        Stamp ``event`` with an ``event_id`` and send it as JSON.

        On a reconnectable connection failure the connection is re-established,
        but the event itself is not re-sent.

        Raises:
            websockets.ConnectionClosedError: if the connection broke with a
                close code that is not eligible for a reconnect.
            ValueError: if the connection has not been established yet.
        """
        event["event_id"] = f"event_{int(time.time() * 1000)}"

        ws = self._ws
        try:
            await ws.send(json.dumps(event))
        except websockets.ConnectionClosedError as e:
            if not _should_reconnect(e):
                raise
            logger.warning(
                f"Re-establishing Qwen3Realtime connection due to error: {e}"
            )
            await self._reconnect(failed_ws=ws)

    async def update_session(self, config: dict[str, Any]) -> None:
        """Update the session configuration."""
        await self.send_event(event={"type": "session.update", "session": config})

    async def send_audio(self, pcm: PcmData) -> None:
        """Stream raw audio data to the API."""
        # Only 16-bit, 16 kHz, mono PCM is supported.
        audio_bytes = pcm.resample(
            target_sample_rate=16000, target_channels=1
        ).samples.tobytes()
        audio_b64 = base64.b64encode(audio_bytes).decode()
        append_event = {"type": "input_audio_buffer.append", "audio": audio_b64}
        await self.send_event(append_event)

    async def commit_audio(self) -> None:
        """Commit the audio buffer to trigger processing."""
        event = {"type": "input_audio_buffer.commit"}
        await self.send_event(event)

    async def send_frame(self, frame_bytes: bytes) -> None:
        """
        Append image data to the image buffer.

        Note:
            - The image format must be JPG or JPEG. A resolution of 480p or 720p is recommended.
              The maximum supported resolution is 1080p.
            - A single image should not exceed 500 KB in size.
            - Encode the image data to Base64 before sending.
            - We recommend sending images to the server at a rate of no more than 2 frames per second.
            - You must send audio data at least once before sending image data.
        """
        image_b64 = base64.b64encode(frame_bytes).decode()
        event = {"type": "input_image_buffer.append", "image": image_b64}
        await self.send_event(event)

    async def cancel_response(self) -> None:
        """Cancel the current response."""
        event = {"type": "response.cancel"}
        await self.send_event(event)

    async def _reconnect(self, failed_ws: Any) -> None:
        """Wait for the backoff period, then reconnect if it is still necessary."""
        await asyncio.sleep(self._reconnect_backoff)
        if self._closed:
            # close() was called while we were waiting; a new connection
            # opened now would never be closed.
            return
        if self._real_ws is not failed_ws:
            # Another caller has already replaced the broken connection.
            return
        await self.connect()

    async def _release_connection(self) -> None:
        """Close everything held by the exit stack; failures are logged, not raised."""
        try:
            await self._exit_stack.aclose()
        except Exception as e:
            logger.warning(f"Error closing session: {e}")

    @property
    def _ws(self) -> websockets.ClientConnection:
        """The current websocket; raises ValueError before the first connect."""
        if self._real_ws is None:
            raise ValueError("The websocket connection is not established yet")
        return self._real_ws
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _should_reconnect(exc: Exception) -> bool:
    """
    Tell whether a connection failure is temporary and worth a reconnect.

    Returns True only for a ``websockets.ConnectionClosedError`` that carries
    a received close frame with one of the codes listed below.
    """
    if not isinstance(exc, websockets.ConnectionClosedError):
        return False

    close_frame = exc.rcvd
    if not close_frame:
        # No close frame was received, so there is no code to inspect
        return False

    return close_frame.code in (
        1011,  # Server-side exception or session timeout
        1012,  # Service restart
        1013,  # Try again later
        1014,  # Bad gateway
    )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from vision_agents.core.events import PluginBaseEvent
|
|
3
|
+
from typing import Optional, Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class LLMErrorEvent(PluginBaseEvent):
    """Plugin event that reports an error encountered by an LLM."""

    # Fixed event type; not accepted by the generated __init__.
    type: str = field(init=False, default="plugin.llm.error")
    # Plugin name; empty unless the emitter sets it.
    plugin_name: str = ""
    # Optional error description.
    error_message: Optional[str] = None
    # Optional payload attached by the emitter.
    event_data: Optional[Any] = None
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import base64
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import uuid
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from typing import Any, Optional, cast
|
|
8
|
+
|
|
9
|
+
import aiortc
|
|
10
|
+
import av
|
|
11
|
+
from aiortc import VideoStreamTrack
|
|
12
|
+
from getstream.video.rtc import PcmData
|
|
13
|
+
from vision_agents.core.edge.types import Participant
|
|
14
|
+
from vision_agents.core.llm import Realtime
|
|
15
|
+
from vision_agents.core.llm.events import LLMResponseChunkEvent
|
|
16
|
+
from vision_agents.core.llm.llm import LLMResponseEvent
|
|
17
|
+
from vision_agents.core.processors import Processor
|
|
18
|
+
from vision_agents.core.utils.video_forwarder import VideoForwarder
|
|
19
|
+
from vision_agents.core.utils.video_utils import frame_to_jpeg_bytes
|
|
20
|
+
|
|
21
|
+
from . import events
|
|
22
|
+
from .client import Qwen3RealtimeClient
|
|
23
|
+
|
|
24
|
+
DEFAULT_BASE_URL = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"
|
|
25
|
+
PLUGIN_NAME = "Qwen3Realtime"
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Qwen3Realtime(Realtime):
|
|
31
|
+
def __init__(
    self,
    model: str = "qwen3-omni-flash-realtime",
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    voice: str = "Cherry",
    fps: int = 1,
    include_video: bool = False,
    video_width: int = 1280,
    video_height: int = 720,
    audio_transcription_model: str = "gummy-realtime-v1",
    vad_threshold: float = 0.1,
    vad_prefix_padding_ms: int = 500,
    vad_silence_duration_ms: int = 900,
):
    """
    Configure the plugin; no network connection is opened here.

    Args:
        model: Qwen Realtime model identifier.
        api_key: DashScope API key; the ``DASHSCOPE_API_KEY`` environment
            variable is used when it is not given.
        base_url: WebSocket API base URL; ``DEFAULT_BASE_URL`` when not given.
        voice: Voice name for the audio output.
        fps: Video frames per second, passed to the base class.
        include_video: Stored on the instance (its use is outside this method).
        video_width: Stored on the instance (its use is outside this method).
        video_height: Stored on the instance (its use is outside this method).
        audio_transcription_model: Model name for input audio transcription.
        vad_threshold: Stored as the VAD threshold.
        vad_prefix_padding_ms: Stored as the VAD prefix padding, in milliseconds.
        vad_silence_duration_ms: Stored as the VAD silence duration, in milliseconds.

    Raises:
        ValueError: if no API key is passed and the environment has none.
    """
    super().__init__(fps=fps)

    # Public identity of the session
    self.model, self.voice = model, voice
    self.session_id = str(uuid.uuid4())
    self.events.register_events_from_module(events)

    # Endpoint and credentials
    self._base_url = base_url if base_url else DEFAULT_BASE_URL
    resolved_key = api_key if api_key else os.getenv("DASHSCOPE_API_KEY")
    if not resolved_key:
        raise ValueError("api_key is required")
    self._api_key = cast(str, resolved_key)

    # Video settings
    self._video_forwarder: Optional[VideoForwarder] = None
    self._include_video = include_video
    self._video_width = video_width
    self._video_height = video_height

    # Runtime machinery
    self._real_client: Optional[Qwen3RealtimeClient] = None
    self._processing_task: Optional[asyncio.Task] = None
    self._executor = ThreadPoolExecutor(max_workers=1)

    # Conversation state
    self._is_responding = False
    self._current_response_id = None
    self._current_item_id = None
    self._current_participant: Optional[Participant] = None
    # The model requires us not to send any video frames until the audio is sent
    self._audio_emitted_once = False

    # Transcription and voice activity detection settings
    self._audio_transcription_model = audio_transcription_model
    self._vad_threshold = vad_threshold
    self._vad_prefix_padding_ms = vad_prefix_padding_ms
    self._vad_silence_duration_ms = vad_silence_duration_ms
|
|
77
|
+
|
|
78
|
+
async def connect(self):
    """
    Open a new realtime session and start processing its events.

    The method may be called again to reconnect: the processing task is
    stopped and the previous client is closed before a new one is created.
    """
    # Stop the processing task first in case we're reconnecting
    await self._stop_processing_task()

    # Close the client left over from a previous connect; replacing it
    # without closing would leak its websocket.
    previous_client = self._real_client
    if previous_client is not None:
        self._real_client = None
        await previous_client.close()

    # Session configuration
    session_config = {
        "modalities": ["text", "audio"],
        "voice": self.voice,
        "instructions": self._instructions,
        "input_audio_format": "pcm16",
        "output_audio_format": "pcm24",
        "input_audio_transcription": {"model": self._audio_transcription_model},
        "turn_detection": {
            "type": "server_vad",
            "threshold": self._vad_threshold,
            "prefix_padding_ms": self._vad_prefix_padding_ms,
            "silence_duration_ms": self._vad_silence_duration_ms,
        },
    }
    self._real_client = Qwen3RealtimeClient(
        api_key=self._api_key,
        base_url=self._base_url,
        model=self.model,
        config=session_config,
    )
    await self._real_client.connect()
    self.connected = True
    logger.debug(f"Started Qwen3Realtime session at {self._base_url}")

    # Start the loop task
    self._start_processing_task()
|
|
109
|
+
|
|
110
|
+
async def simple_audio_response(
    self, pcm: PcmData, participant: Optional[Participant] = None
):
    """
    Forward an audio chunk to the model; does nothing while disconnected.

    Args:
        pcm: Audio data to send.
        participant: Remembered as the current participant.
    """
    if self.connected:
        self._current_participant = participant
        await self._client.send_audio(pcm=pcm)
        # From now on video frames are allowed to be forwarded
        self._audio_emitted_once = True
|
|
118
|
+
|
|
119
|
+
async def simple_response(
    self,
    text: str,
    processors: Optional[list[Processor]] = None,
    participant: Optional[Participant] = None,
) -> LLMResponseEvent[Any]:
    """
    Log a warning and return an empty response.

    Text inputs are not supported, so nothing is sent to the model and
    ``processors`` and ``participant`` are ignored.
    """
    logger.warning(
        f'Cannot reply to "{text}"; reason - Qwen3Realtime does not support text inputs'
    )
    empty_reply: LLMResponseEvent[Any] = LLMResponseEvent(text="", original=None)
    return empty_reply
|
|
129
|
+
|
|
130
|
+
async def close(self):
    """
    Shut the session down and release its resources.

    Stops video forwarding, cancels the processing task, shuts down the
    executor without waiting for pending work, and closes the client.
    """
    self.connected = False
    await self._stop_watching_video_track()

    try:
        task = self._processing_task
        if task is not None:
            self._processing_task = None
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                # Expected: awaiting a task we have just cancelled raises
                # CancelledError. Letting it escape would abort close()
                # before the executor and the client are released.
                pass
    finally:
        # Release the remaining resources even if the task ended with an error
        self._executor.shutdown(wait=False)

        client = self._real_client
        if client is not None:
            self._real_client = None
            await client.close()
|
|
142
|
+
|
|
143
|
+
async def watch_video_track(
    self,
    track: aiortc.mediastreams.MediaStreamTrack,
    shared_forwarder: Optional[VideoForwarder] = None,
) -> None:
    """
    Start sending video frames using VideoForwarder.

    Args:
        track: Video track to watch
        shared_forwarder: Optional shared VideoForwarder to use instead of creating a new one
    """

    # This method can be called multiple times with different forwarders,
    # so detach the frame handler from the previous forwarder (if any) first.
    # _stop_watching_video_track already performs the None check and the
    # removal, so it is not repeated here.
    await self._stop_watching_video_track()

    self._video_forwarder = shared_forwarder or VideoForwarder(
        input_track=cast(VideoStreamTrack, track),
        max_buffer=5,
        fps=float(self.fps),
        name="qwen3realtime_forwarder",
    )

    # Add frame handler (starts automatically)
    self._video_forwarder.add_frame_handler(self._send_video_frame, fps=self.fps)
    logger.info(f"Started video forwarding with {self.fps} FPS")
|
|
172
|
+
|
|
173
|
+
async def _send_video_frame(self, frame: av.VideoFrame) -> None:
    """
    Convert a video frame and send it to the Qwen3 Realtime API.

    Frames are dropped until audio has been sent at least once. Failures
    while sending are logged and not re-raised.

    Parameters:
        frame: Video frame to send.
    """
    if not self._audio_emitted_once:
        # Wait until the audio is sent at least once before forwarding frames
        # per the model spec.
        return

    # The conversion runs on the plugin's executor so it does not block
    # the event loop.
    running_loop = asyncio.get_running_loop()
    conversion = running_loop.run_in_executor(
        self._executor,
        frame_to_jpeg_bytes,
        frame,
        self._video_width,
        self._video_height,
    )
    encoded_frame = await conversion

    try:
        await self._client.send_frame(encoded_frame)
    except Exception:
        logger.exception("Failed to send a video frame to Qwen3 Realtime API")
|
|
200
|
+
|
|
201
|
+
async def _stop_watching_video_track(self) -> None:
|
|
202
|
+
if self._video_forwarder is not None:
|
|
203
|
+
await self._video_forwarder.remove_frame_handler(self._send_video_frame)
|
|
204
|
+
|
|
205
|
+
@property
def _client(self) -> Qwen3RealtimeClient:
    """
    Return the active realtime client.

    Raises:
        ValueError: If no client has been created yet (or it was cleared).
    """
    active_client = self._real_client
    if active_client is not None:
        return active_client
    raise ValueError("The Qwen3Realtime session is not established yet")
|
|
210
|
+
|
|
211
|
+
async def _processing_loop(self) -> None:
    """
    Run event processing until it finishes or the task is cancelled.

    Cancellation is logged and swallowed, so awaiting the cancelled task
    (as close and _stop_processing_task do) returns normally.
    """
    logger.debug("Start processing events by Qwen3Realtime")
    event_pump = self._process_events()
    try:
        await event_pump
    except asyncio.CancelledError:
        logger.debug("Stop processing events by Qwen3Realtime")
|
|
217
|
+
|
|
218
|
+
def _start_processing_task(self) -> None:
|
|
219
|
+
self._processing_task = asyncio.create_task(self._processing_loop())
|
|
220
|
+
|
|
221
|
+
async def _stop_processing_task(self) -> None:
|
|
222
|
+
if self._processing_task is not None:
|
|
223
|
+
self._processing_task.cancel()
|
|
224
|
+
await self._processing_task
|
|
225
|
+
|
|
226
|
+
async def _process_events(self):
|
|
227
|
+
async for event in self._client.read():
|
|
228
|
+
event_type = event.get("type")
|
|
229
|
+
if event_type == "error":
|
|
230
|
+
error = event["error"]
|
|
231
|
+
logger.error(
|
|
232
|
+
f"Error received from Qwen3Realtime API: {error}",
|
|
233
|
+
)
|
|
234
|
+
self.events.send(
|
|
235
|
+
events.LLMErrorEvent(plugin_name=PLUGIN_NAME, error_message=error)
|
|
236
|
+
)
|
|
237
|
+
continue
|
|
238
|
+
|
|
239
|
+
elif event_type == "session.created":
|
|
240
|
+
logger.debug("Qwen3Realtime session initialized successfully")
|
|
241
|
+
|
|
242
|
+
elif event_type == "response.created":
|
|
243
|
+
self._current_response_id = event.get("response", {}).get("id")
|
|
244
|
+
self._is_responding = True
|
|
245
|
+
elif event_type == "response.output_item.added":
|
|
246
|
+
self._current_item_id = event.get("item", {}).get("id")
|
|
247
|
+
elif event_type == "response.done":
|
|
248
|
+
self._is_responding = False
|
|
249
|
+
self._current_response_id = None
|
|
250
|
+
self._current_item_id = None
|
|
251
|
+
elif event_type == "input_audio_buffer.speech_started":
|
|
252
|
+
if self._is_responding:
|
|
253
|
+
await self._on_interruption()
|
|
254
|
+
elif event_type == "response.text.delta":
|
|
255
|
+
self.events.send(
|
|
256
|
+
LLMResponseChunkEvent(
|
|
257
|
+
plugin_name=PLUGIN_NAME, delta=str(event["delta"])
|
|
258
|
+
)
|
|
259
|
+
)
|
|
260
|
+
elif event_type == "response.audio.delta":
|
|
261
|
+
audio_bytes = base64.b64decode(event["delta"])
|
|
262
|
+
pcm = PcmData.from_bytes(audio_bytes, 24000)
|
|
263
|
+
self._emit_audio_output_event(audio_data=pcm)
|
|
264
|
+
elif event_type == "conversation.item.input_audio_transcription.completed":
|
|
265
|
+
transcript = event.get("transcript", "")
|
|
266
|
+
if transcript:
|
|
267
|
+
self._emit_user_speech_transcription(text=transcript)
|
|
268
|
+
elif event_type == "response.audio_transcript.delta":
|
|
269
|
+
delta = event.get("delta", "")
|
|
270
|
+
if delta:
|
|
271
|
+
self._emit_agent_speech_transcription(text=delta)
|
|
272
|
+
|
|
273
|
+
async def _on_interruption(self):
|
|
274
|
+
"""Handle user interruption of the current response."""
|
|
275
|
+
if not self._is_responding:
|
|
276
|
+
return
|
|
277
|
+
|
|
278
|
+
if self._current_response_id:
|
|
279
|
+
await self._client.cancel_response()
|
|
280
|
+
|
|
281
|
+
self._is_responding = False
|
|
282
|
+
self._current_response_id = None
|
|
283
|
+
self._current_item_id = None
|