vision-agents-plugins-local 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agents_plugins_local-0.4.4/.gitignore +98 -0
- vision_agents_plugins_local-0.4.4/PKG-INFO +14 -0
- vision_agents_plugins_local-0.4.4/README.md +0 -0
- vision_agents_plugins_local-0.4.4/pyproject.toml +42 -0
- vision_agents_plugins_local-0.4.4/vision_agents/plugins/local/__init__.py +10 -0
- vision_agents_plugins_local-0.4.4/vision_agents/plugins/local/devices.py +370 -0
- vision_agents_plugins_local-0.4.4/vision_agents/plugins/local/display.py +181 -0
- vision_agents_plugins_local-0.4.4/vision_agents/plugins/local/edge.py +306 -0
- vision_agents_plugins_local-0.4.4/vision_agents/plugins/local/tracks.py +263 -0
- vision_agents_plugins_local-0.4.4/vision_agents/plugins/local/utils.py +86 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.cursor/*
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
downloads/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib64/
|
|
16
|
+
parts/
|
|
17
|
+
sdist/
|
|
18
|
+
var/
|
|
19
|
+
wheels/
|
|
20
|
+
share/python-wheels/
|
|
21
|
+
pip-wheel-metadata/
|
|
22
|
+
MANIFEST
|
|
23
|
+
*.egg-info/
|
|
24
|
+
*.egg
|
|
25
|
+
|
|
26
|
+
# Installer logs
|
|
27
|
+
pip-log.txt
|
|
28
|
+
pip-delete-this-directory.txt
|
|
29
|
+
|
|
30
|
+
# Unit test / coverage reports
|
|
31
|
+
htmlcov/
|
|
32
|
+
.tox/
|
|
33
|
+
.nox/
|
|
34
|
+
.coverage
|
|
35
|
+
.coverage.*
|
|
36
|
+
.cache
|
|
37
|
+
coverage.xml
|
|
38
|
+
nosetests.xml
|
|
39
|
+
*.cover
|
|
40
|
+
*.py,cover
|
|
41
|
+
.hypothesis/
|
|
42
|
+
.pytest_cache/
|
|
43
|
+
|
|
44
|
+
# Type checker / lint caches
|
|
45
|
+
.mypy_cache/
|
|
46
|
+
.dmypy.json
|
|
47
|
+
dmypy.json
|
|
48
|
+
.pytype/
|
|
49
|
+
.pyre/
|
|
50
|
+
.ruff_cache/
|
|
51
|
+
|
|
52
|
+
# Environments
|
|
53
|
+
.venv
|
|
54
|
+
env/
|
|
55
|
+
venv/
|
|
56
|
+
ENV/
|
|
57
|
+
env.bak/
|
|
58
|
+
venv.bak/
|
|
59
|
+
.env
|
|
60
|
+
.env.local
|
|
61
|
+
.env.*.local
|
|
62
|
+
.env.bak
|
|
63
|
+
pyvenv.cfg
|
|
64
|
+
.python-version
|
|
65
|
+
|
|
66
|
+
# Editors / IDEs
|
|
67
|
+
.vscode/
|
|
68
|
+
.idea/
|
|
69
|
+
|
|
70
|
+
# Jupyter Notebook
|
|
71
|
+
.ipynb_checkpoints/
|
|
72
|
+
|
|
73
|
+
# OS / Misc
|
|
74
|
+
.DS_Store
|
|
75
|
+
*.log
|
|
76
|
+
|
|
77
|
+
# Tooling & repo-specific
|
|
78
|
+
pyrightconfig.json
|
|
79
|
+
shell.nix
|
|
80
|
+
bin/*
|
|
81
|
+
lib/*
|
|
82
|
+
stream-py/
|
|
83
|
+
|
|
84
|
+
# Example lock files (regenerated by uv sync)
|
|
85
|
+
examples/*/uv.lock
|
|
86
|
+
plugins/*/example/uv.lock
|
|
87
|
+
|
|
88
|
+
# Artifacts / assets
|
|
89
|
+
*.pt
|
|
90
|
+
*.kef
|
|
91
|
+
*.onnx
|
|
92
|
+
profile.html
|
|
93
|
+
|
|
94
|
+
/opencode.json
|
|
95
|
+
.ralph-tui/
|
|
96
|
+
.claude/
|
|
97
|
+
|
|
98
|
+
.uv-cache/
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vision-agents-plugins-local
|
|
3
|
+
Version: 0.4.4
|
|
4
|
+
Summary: Local audio & video integration for Vision Agents
|
|
5
|
+
Project-URL: Documentation, https://visionagents.ai/
|
|
6
|
+
Project-URL: Website, https://visionagents.ai/
|
|
7
|
+
Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: AI,agents,local,voice agents
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: aiortc<1.15.0,>=1.14.0
|
|
12
|
+
Requires-Dist: av<17,>=14.2.0
|
|
13
|
+
Requires-Dist: sounddevice>=0.5.0
|
|
14
|
+
Requires-Dist: vision-agents
|
|
File without changes
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vision-agents-plugins-local"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Local audio & video integration for Vision Agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
keywords = ["local", "AI", "voice agents", "agents"]
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"vision-agents",
|
|
15
|
+
"sounddevice>=0.5.0",
|
|
16
|
+
"aiortc>=1.14.0, <1.15.0",
|
|
17
|
+
"av>=14.2.0, <17",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.urls]
|
|
21
|
+
Documentation = "https://visionagents.ai/"
|
|
22
|
+
Website = "https://visionagents.ai/"
|
|
23
|
+
Source = "https://github.com/GetStream/Vision-Agents"
|
|
24
|
+
|
|
25
|
+
[tool.hatch.version]
|
|
26
|
+
source = "vcs"
|
|
27
|
+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
|
|
28
|
+
|
|
29
|
+
[tool.hatch.build.targets.wheel]
|
|
30
|
+
packages = ["."]
|
|
31
|
+
|
|
32
|
+
[tool.hatch.build.targets.sdist]
|
|
33
|
+
include = ["/vision_agents"]
|
|
34
|
+
|
|
35
|
+
[tool.uv.sources]
|
|
36
|
+
vision-agents = { workspace = true }
|
|
37
|
+
|
|
38
|
+
[dependency-groups]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=8.4.1",
|
|
41
|
+
"pytest-asyncio>=1.0.0",
|
|
42
|
+
]
|
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Device enumeration and selection utilities for LocalTransport.
|
|
3
|
+
|
|
4
|
+
Provides typed device representations and interactive prompts for
|
|
5
|
+
selecting audio and video devices when running agents locally.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import glob
|
|
9
|
+
import logging
|
|
10
|
+
import platform
|
|
11
|
+
import queue
|
|
12
|
+
import re
|
|
13
|
+
import subprocess
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import sounddevice as sd
|
|
18
|
+
|
|
19
|
+
from .utils import prompt_selection
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Matches a line carrying an "[AVFoundation ...]" tag followed by a bracketed
# number; group 1 is the digits inside the brackets, group 2 the rest of the line.
_AVFOUNDATION_RE = re.compile(r"\[AVFoundation.*?\]\s*\[(\d+)\]\s*(.+)")
|
|
25
|
+
# Captures the text of the first double-quoted run on a line (non-greedy).
_DSHOW_DEVICE_RE = re.compile(r'"(.+?)"')
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class AudioInputDevice:
|
|
29
|
+
"""Audio input device (microphone).
|
|
30
|
+
|
|
31
|
+
Combines device metadata with stream capture. Subclass to implement
|
|
32
|
+
custom audio backends (e.g. GStreamer).
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
def __init__(
|
|
36
|
+
self,
|
|
37
|
+
index: int,
|
|
38
|
+
name: str,
|
|
39
|
+
sample_rate: int = 48000,
|
|
40
|
+
channels: int = 1,
|
|
41
|
+
is_default: bool = False,
|
|
42
|
+
blocksize: int | None = None,
|
|
43
|
+
):
|
|
44
|
+
self.index = index
|
|
45
|
+
self.name = name
|
|
46
|
+
self._sample_rate = sample_rate
|
|
47
|
+
self._channels = channels
|
|
48
|
+
self.is_default = is_default
|
|
49
|
+
self._blocksize = (
|
|
50
|
+
blocksize if blocksize is not None else int(sample_rate * 0.02)
|
|
51
|
+
)
|
|
52
|
+
self._stream: sd.InputStream | None = None
|
|
53
|
+
self._buffer: queue.Queue[np.ndarray] = queue.Queue(maxsize=200)
|
|
54
|
+
|
|
55
|
+
@property
|
|
56
|
+
def sample_rate(self) -> int:
|
|
57
|
+
return self._sample_rate
|
|
58
|
+
|
|
59
|
+
@property
|
|
60
|
+
def channels(self) -> int:
|
|
61
|
+
return self._channels
|
|
62
|
+
|
|
63
|
+
def _callback(
|
|
64
|
+
self,
|
|
65
|
+
indata: np.ndarray,
|
|
66
|
+
frames: int,
|
|
67
|
+
time_info: object,
|
|
68
|
+
status: object,
|
|
69
|
+
) -> None:
|
|
70
|
+
if status:
|
|
71
|
+
logger.warning("Audio input status: %s", status)
|
|
72
|
+
try:
|
|
73
|
+
self._buffer.put_nowait(indata.copy())
|
|
74
|
+
except queue.Full:
|
|
75
|
+
pass
|
|
76
|
+
|
|
77
|
+
def start(self) -> None:
|
|
78
|
+
"""Open and start the audio input stream."""
|
|
79
|
+
self._stream = sd.InputStream(
|
|
80
|
+
samplerate=self._sample_rate,
|
|
81
|
+
channels=self._channels,
|
|
82
|
+
dtype="int16",
|
|
83
|
+
blocksize=self._blocksize,
|
|
84
|
+
device=self.index,
|
|
85
|
+
callback=self._callback,
|
|
86
|
+
)
|
|
87
|
+
self._stream.start()
|
|
88
|
+
logger.info(
|
|
89
|
+
"Started audio input: %dHz, %d channels",
|
|
90
|
+
self._sample_rate,
|
|
91
|
+
self._channels,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
def read(self) -> np.ndarray | None:
|
|
95
|
+
"""Block until audio data is available (up to 100ms timeout)."""
|
|
96
|
+
try:
|
|
97
|
+
return self._buffer.get(timeout=0.1)
|
|
98
|
+
except queue.Empty:
|
|
99
|
+
return None
|
|
100
|
+
|
|
101
|
+
def stop(self) -> None:
|
|
102
|
+
"""Stop and close the audio input stream."""
|
|
103
|
+
if self._stream is not None:
|
|
104
|
+
self._stream.stop()
|
|
105
|
+
self._stream.close()
|
|
106
|
+
self._stream = None
|
|
107
|
+
logger.info("Stopped audio input")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class AudioOutputDevice:
    """Audio output device (speaker/headphones).

    Combines device metadata (index, name, default flag) with stream
    playback. Subclass to implement custom audio backends (e.g. GStreamer).
    """

    def __init__(
        self,
        index: int,
        name: str,
        sample_rate: int = 48000,
        channels: int = 2,
        is_default: bool = False,
        blocksize: int = 2048,
    ):
        """Describe an output device; no stream is opened until :meth:`start`.

        Args:
            index: Device index passed to sounddevice as ``device``.
            name: Device name.
            sample_rate: Playback rate in Hz.
            channels: Number of output channels.
            is_default: Whether this is the system default output device.
            blocksize: Block size passed to the output stream.
        """
        self.index = index
        self.name = name
        self._sample_rate = sample_rate
        self._channels = channels
        self.is_default = is_default
        self._blocksize = blocksize
        self._stream: sd.OutputStream | None = None

    @property
    def sample_rate(self) -> int:
        """Playback sample rate in Hz."""
        return self._sample_rate

    @property
    def channels(self) -> int:
        """Number of output channels."""
        return self._channels

    def start(self) -> None:
        """Open and start the audio output stream.

        Calling this while a stream is already open is a no-op, so a repeated
        call cannot orphan (and leak) the stream opened earlier. If the stream
        fails to start it is closed before the error propagates.
        """
        if self._stream is not None:
            return
        stream = sd.OutputStream(
            samplerate=self._sample_rate,
            channels=self._channels,
            dtype="int16",
            blocksize=self._blocksize,
            device=self.index,
        )
        try:
            stream.start()
        except Exception:
            stream.close()
            raise
        self._stream = stream
        logger.info(
            "Started audio output: %dHz, %d channels",
            self._sample_rate,
            self._channels,
        )

    def write(self, samples: np.ndarray) -> None:
        """Write int16 samples to the device.

        Args:
            samples: Interleaved samples, either flat or already shaped
                ``(frames, channels)``. Ignored when the stream is not
                started or when the array is empty.

        Raises:
            ValueError: If the number of samples is not a multiple of the
                channel count (raised by the reshape).
        """
        if self._stream is None:
            return
        if samples.size == 0:
            # Nothing to play.
            return
        # ``-1`` lets numpy derive the frame count from the total size, which
        # gives the same result for flat input and also accepts 2-D input.
        audio = samples.reshape(-1, self._channels)
        self._stream.write(audio)

    def flush(self) -> None:
        """Abort current playback and restart the stream."""
        if self._stream is not None:
            self._stream.abort()
            self._stream.start()

    def stop(self) -> None:
        """Stop and close the audio output stream (no-op if not started)."""
        stream = self._stream
        if stream is None:
            return
        # Clear the reference first so the device can be started again even
        # if stopping raises; always close to release the underlying stream.
        self._stream = None
        try:
            stream.stop()
        finally:
            stream.close()
        logger.info("Stopped audio output")
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@dataclass(frozen=True)
class CameraDevice:
    """Immutable record describing one camera found on this machine."""

    # Position of the camera in the listing that produced it.
    index: int
    # Label shown to the user in the camera selection prompt.
    name: str
    # Backend-specific identifier string (e.g. a /dev/video* path on Linux).
    device: str
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def list_audio_input_devices() -> list[AudioInputDevice]:
    """Enumerate the devices that can capture audio.

    Returns:
        One AudioInputDevice for every sounddevice entry reporting at least
        one input channel, in enumeration order. ``index`` is the entry's
        position in the full sounddevice listing.
    """
    all_devices = sd.query_devices()
    default_index = sd.default.device[0]

    microphones: list[AudioInputDevice] = []
    for position, info in enumerate(all_devices):
        if info["max_input_channels"] > 0:
            microphones.append(
                AudioInputDevice(
                    index=position,
                    name=info["name"],
                    sample_rate=int(info["default_samplerate"]),
                    channels=info["max_input_channels"],
                    is_default=(position == default_index),
                )
            )
    return microphones
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def list_audio_output_devices() -> list[AudioOutputDevice]:
    """Enumerate the devices that can play audio.

    Returns:
        One AudioOutputDevice for every sounddevice entry reporting at least
        one output channel, in enumeration order. ``index`` is the entry's
        position in the full sounddevice listing.
    """
    all_devices = sd.query_devices()
    default_index = sd.default.device[1]

    speakers: list[AudioOutputDevice] = []
    for position, info in enumerate(all_devices):
        if info["max_output_channels"] > 0:
            speakers.append(
                AudioOutputDevice(
                    index=position,
                    name=info["name"],
                    sample_rate=int(info["default_samplerate"]),
                    channels=info["max_output_channels"],
                    is_default=(position == default_index),
                )
            )
    return speakers
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def select_audio_input_device() -> AudioInputDevice | None:
    """Ask the user to choose a microphone.

    The first device flagged as default (if any) is offered as the default
    choice. The result is whatever ``prompt_selection`` returns.
    """
    microphones = list_audio_input_devices()

    preselected = None
    for candidate in microphones:
        if candidate.is_default:
            preselected = candidate
            break

    return prompt_selection(
        items=microphones,
        formatter=_format_audio_device,
        header="INPUT DEVICES (Microphones)",
        default=preselected,
    )
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def select_audio_output_device() -> AudioOutputDevice | None:
    """Ask the user to choose a speaker.

    The first device flagged as default (if any) is offered as the default
    choice. The result is whatever ``prompt_selection`` returns.
    """
    speakers = list_audio_output_devices()

    preselected = None
    for candidate in speakers:
        if candidate.is_default:
            preselected = candidate
            break

    return prompt_selection(
        items=speakers,
        formatter=_format_audio_device,
        header="OUTPUT DEVICES (Speakers)",
        default=preselected,
    )
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def select_video_device() -> CameraDevice | None:
    """Ask the user to choose a camera, with the option to skip.

    Returns:
        The selected camera device, or None if skipped.
    """

    def _camera_label(camera: CameraDevice) -> str:
        # Cameras are listed by name only.
        return camera.name

    detected = list_cameras()
    return prompt_selection(
        items=detected,
        formatter=_camera_label,
        header="VIDEO DEVICES (Cameras)",
        allow_skip=True,
        empty_message="No cameras detected\n (Camera support requires ffmpeg to be installed)",
    )
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def list_cameras() -> list[CameraDevice]:
    """Return the cameras visible on the current operating system.

    Dispatches on ``platform.system()``; any platform other than macOS,
    Linux or Windows yields an empty list.
    """
    backends = {
        "Darwin": _list_cameras_darwin,
        "Linux": _list_cameras_linux,
        "Windows": _list_cameras_windows,
    }
    enumerate_cameras = backends.get(platform.system())
    if enumerate_cameras is None:
        return []
    return enumerate_cameras()
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _format_audio_device(dev: AudioInputDevice | AudioOutputDevice) -> str:
    """Build the one-line label for an audio device in a selection list.

    The label is the device name and sample rate, with a ``[DEFAULT]``
    marker appended for the default device.
    """
    label = f"{dev.name} ({dev.sample_rate}Hz)"
    if dev.is_default:
        label += " [DEFAULT]"
    return label
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _list_cameras_darwin() -> list[CameraDevice]:
    """Enumerate macOS cameras by parsing ffmpeg's AVFoundation device listing.

    Returns:
        The video devices reported between the "video devices" and
        "audio devices" headers of ffmpeg's stderr, or an empty list when
        ffmpeg is missing or does not answer within 5 seconds.
    """
    command = ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""]
    try:
        listing = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        logger.warning("Failed to list cameras (is ffmpeg installed?)")
        return []

    found: list[CameraDevice] = []
    seen_video_header = False

    # The listing is read from stderr.
    for row in listing.stderr.splitlines():
        if "AVFoundation video devices:" in row:
            seen_video_header = True
        elif "AVFoundation audio devices:" in row:
            # Everything after this header is audio; stop scanning.
            break
        elif seen_video_header:
            hit = _AVFOUNDATION_RE.search(row)
            if hit is not None:
                number = int(hit.group(1))
                found.append(
                    CameraDevice(index=number, name=hit.group(2), device=str(number))
                )

    return found
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _video_node_sort_key(dev_path: str) -> tuple[int, int, str]:
    """Sort key ordering ``/dev/videoN`` paths by their numeric suffix."""
    suffix = dev_path.rsplit("video", 1)[-1]
    if suffix.isascii() and suffix.isdigit():
        return (0, int(suffix), dev_path)
    # Paths without a plain numeric suffix go last, ordered by name.
    return (1, 0, dev_path)


def _list_cameras_linux() -> list[CameraDevice]:
    """List cameras on Linux via /dev/video* and sysfs.

    Device nodes are ordered numerically (``video2`` before ``video10``)
    rather than lexicographically. Each camera's name is read from
    ``/sys/class/video4linux/<node>/name``; when that file cannot be read
    the device path is used as the name.

    Returns:
        One CameraDevice per ``/dev/video*`` node, with ``index`` set to its
        position in the sorted listing and ``device`` set to the node path.
    """
    cameras: list[CameraDevice] = []

    nodes = sorted(glob.glob("/dev/video*"), key=_video_node_sort_key)
    for i, dev_path in enumerate(nodes):
        name_path = f"/sys/class/video4linux/{dev_path.split('/')[-1]}/name"
        try:
            # errors="replace" keeps an undecodable name from aborting the
            # whole enumeration with UnicodeDecodeError.
            with open(name_path, encoding="utf-8", errors="replace") as f:
                name = f.read().strip()
        except OSError:
            name = dev_path
        cameras.append(CameraDevice(index=i, name=name, device=dev_path))

    return cameras
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def _list_cameras_windows() -> list[CameraDevice]:
    """List cameras on Windows via ffmpeg/DirectShow.

    Two listing layouts are handled:

    * Sectioned: a "DirectShow video devices" header, the quoted device
      names, then a "DirectShow audio devices" header.
    * Tagged (assumed layout of newer ffmpeg builds -- verify against the
      ffmpeg version in use): no headers, each device row being
      ``"Name" (video)`` or ``"Name" (audio)``.

    ``Alternative name "..."`` rows are skipped so a camera is not listed a
    second time under its device path.

    Returns:
        The detected cameras, or an empty list when ffmpeg is missing or
        does not answer within 5 seconds.
    """
    try:
        result = subprocess.run(
            ["ffmpeg", "-f", "dshow", "-list_devices", "true", "-i", "dummy"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        logger.warning("Failed to list cameras (is ffmpeg installed?)")
        return []

    cameras: list[CameraDevice] = []
    in_video_section = False

    for line in result.stderr.splitlines():
        if "DirectShow video devices" in line:
            in_video_section = True
            continue
        if "DirectShow audio devices" in line:
            break
        if "Alternative name" in line:
            # Quoted device path of the preceding device, not a new device.
            continue

        match = _DSHOW_DEVICE_RE.search(line)
        if match is None:
            continue

        # Text after the closing quote; holds the "(video)" / "(audio)" tag
        # in the header-less layout and is empty in the sectioned one.
        tail = line[match.end():].strip()
        tagged_video = (
            tail.startswith("(") and tail.endswith(")") and "video" in tail
        )
        if not (in_video_section or tagged_video):
            continue

        name = match.group(1)
        cameras.append(
            CameraDevice(
                index=len(cameras),
                name=name,
                device=f'video="{name}"',
            )
        )

    return cameras
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""Tkinter-based video display for LocalEdge.
|
|
2
|
+
|
|
3
|
+
Shows the agent's outbound video track in a tkinter window.
|
|
4
|
+
Gracefully handles environments where tkinter is not available.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import contextlib
|
|
9
|
+
import logging
|
|
10
|
+
import signal
|
|
11
|
+
import threading
|
|
12
|
+
import warnings
|
|
13
|
+
from typing import cast
|
|
14
|
+
|
|
15
|
+
import av
|
|
16
|
+
from aiortc import MediaStreamError, MediaStreamTrack
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# tkinter is optional: record whether it could be imported so the display can
# be skipped with a warning instead of failing at import time.
try:
    import tkinter
except ImportError:
    _TKINTER_AVAILABLE = False
else:
    _TKINTER_AVAILABLE = True
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _fit_size(src_w: int, src_h: int, dst_w: int, dst_h: int) -> tuple[int, int]:
    """Scale the source size to fit inside the destination box.

    The aspect ratio is preserved; each resulting dimension is truncated,
    raised to at least 2, and rounded down to an even number.

    Returns:
        ``(width, height)`` of the fitted size.
    """
    scale = min(dst_w / src_w, dst_h / src_h)

    def _scaled_even(length: int) -> int:
        clamped = max(2, int(length * scale))
        # Drop the lowest bit so the value is even.
        return (clamped // 2) * 2

    return _scaled_even(src_w), _scaled_even(src_h)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _frame_to_ppm(frame: av.VideoFrame, width: int, height: int) -> bytes:
    """Render a video frame as binary PPM (P6) bytes.

    The frame is reformatted to rgb24 at the largest aspect-preserving size
    that fits within ``width`` x ``height``.

    Returns:
        The ``P6 <w> <h> 255 `` header followed by the raw pixel bytes.
    """
    out_w, out_h = _fit_size(frame.width, frame.height, width, height)
    scaled = frame.reformat(width=out_w, height=out_h, format="rgb24")
    payload = scaled.to_ndarray().tobytes()
    return b"".join((f"P6 {out_w} {out_h} 255 ".encode(), payload))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class VideoDisplay:
    """Displays video frames from a MediaStreamTrack in a tkinter window.

    Tkinter events are pumped from the asyncio event loop (main thread)
    to satisfy macOS Cocoa requirements. A lock-protected "latest frame"
    slot bridges the async frame receiver and the display update loop: the
    receiver overwrites the slot, the display loop takes and clears it, so
    only the newest frame is ever drawn.
    """

    def __init__(
        self,
        title: str = "Agent Video Output",
        width: int = 640,
        height: int = 480,
        fps: int = 30,
    ):
        """Configure the display; no window is created until :meth:`start`.

        Args:
            title: Window title.
            width: Window width in pixels.
            height: Window height in pixels.
            fps: Display refresh rate; the update loop sleeps ``1 / fps``
                seconds between refreshes.

        Raises:
            ValueError: If ``fps``, ``width`` or ``height`` is not positive.
        """
        if fps <= 0:
            raise ValueError("fps must be > 0")
        if width <= 0:
            raise ValueError("width must be > 0")
        if height <= 0:
            raise ValueError("height must be > 0")
        self._title = title
        self._width = width
        self._height = height
        self._frame_interval = 1.0 / fps
        self._running = False
        self._latest_frame: av.VideoFrame | None = None
        self._frame_lock = threading.Lock()
        self._recv_task: asyncio.Task[None] | None = None
        self._tk_task: asyncio.Task[None] | None = None
        self._root: tkinter.Tk | None = None

    async def start(self, video_track: MediaStreamTrack) -> None:
        """Start displaying frames from the given video track.

        If tkinter is not available, emits an ImportWarning and returns
        without starting. If a previous start is still active (either task
        has not finished) the call is a no-op, so duplicate receive/display
        loops are never spawned.
        """
        if not _TKINTER_AVAILABLE:
            warnings.warn(
                "tkinter is not available. Install python3-tk or equivalent "
                "for your platform to use the video display.",
                ImportWarning,
            )
            return

        for task in (self._recv_task, self._tk_task):
            if task is not None and not task.done():
                return

        self._running = True
        self._recv_task = asyncio.create_task(self._recv_loop(video_track))
        self._tk_task = asyncio.create_task(self._tk_loop())

    async def stop(self) -> None:
        """Stop the display and clean up tasks.

        Both tasks are cancelled before either is awaited. A task that had
        already failed (for example because the window could not be drawn)
        is logged instead of re-raised, so the caller's own cleanup is not
        interrupted by a stale display error.
        """
        self._running = False

        tasks = [
            task for task in (self._recv_task, self._tk_task) if task is not None
        ]
        self._recv_task = None
        self._tk_task = None

        for task in tasks:
            task.cancel()

        # return_exceptions=True collects each task's outcome (including its
        # cancellation) rather than raising the first one.
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)
        for outcome in outcomes:
            # CancelledError is a BaseException, so it is not reported here.
            if isinstance(outcome, Exception):
                logger.warning("Video display task failed: %r", outcome)

    async def _recv_loop(self, track: MediaStreamTrack) -> None:
        """Consume frames from the video track and store the latest."""
        try:
            while self._running:
                frame = cast(av.VideoFrame, await track.recv())
                with self._frame_lock:
                    self._latest_frame = frame
        except asyncio.CancelledError:
            raise
        except MediaStreamError:
            logger.debug("Video track ended")
        except RuntimeError:
            logger.debug("Video track stopped")

    async def _tk_loop(self) -> None:
        """Pump Tkinter events from the asyncio event loop (main thread)."""
        root: tkinter.Tk | None = None
        prev_sigint = signal.getsignal(signal.SIGINT)
        try:
            try:
                root = tkinter.Tk()
            except tkinter.TclError as exc:
                # Window creation failed; give up on the display and let the
                # receive loop wind down as well.
                logger.warning("Cannot open video display window: %s", exc)
                self._running = False
                return
            # Tk() overrides SIGINT — restore the previous handler (typically
            # asyncio's) so the first Ctrl+C gracefully cancels the main task
            # instead of raising KeyboardInterrupt inside this task.
            # getsignal() returns None for handlers not installed from
            # Python, and signal.signal() rejects None.
            if prev_sigint is not None:
                signal.signal(signal.SIGINT, prev_sigint)
            root.title(self._title)
            root.geometry(f"{self._width}x{self._height}")
            root.protocol("WM_DELETE_WINDOW", self._on_window_close)
            self._root = root

            # Solid gray placeholder shown until the first frame arrives.
            gray = bytes([128] * (self._width * self._height * 3))
            header = f"P6 {self._width} {self._height} 255 ".encode()
            # The image is kept on self so it stays referenced while shown.
            self._photo = tkinter.PhotoImage(data=header + gray)

            self._label = tkinter.Label(root, image=self._photo)
            self._label.pack(fill="both", expand=True)

            while self._running:
                # Take the newest frame and clear the slot so the same frame
                # is not converted twice.
                with self._frame_lock:
                    frame = self._latest_frame
                    self._latest_frame = None

                if frame is not None:
                    # Convert in a worker thread to keep the event loop free.
                    ppm = await asyncio.to_thread(
                        _frame_to_ppm,
                        frame,
                        self._width,
                        self._height,
                    )
                    self._photo = tkinter.PhotoImage(data=ppm)
                    self._label.configure(image=self._photo)

                try:
                    root.update()
                except (tkinter.TclError, KeyboardInterrupt):
                    break

                await asyncio.sleep(self._frame_interval)
        except asyncio.CancelledError:
            raise
        finally:
            if root is not None:
                with contextlib.suppress(tkinter.TclError, KeyboardInterrupt):
                    root.destroy()
            self._root = None

    def _on_window_close(self) -> None:
        """Handle the user closing the tkinter window."""
        self._running = False
        if self._root is not None:
            try:
                self._root.destroy()
            except tkinter.TclError:
                pass
            self._root = None
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import TYPE_CHECKING, Any, cast
|
|
5
|
+
|
|
6
|
+
import aiortc
|
|
7
|
+
import av
|
|
8
|
+
import numpy as np
|
|
9
|
+
from getstream.video.rtc.track_util import AudioFormat, PcmData
|
|
10
|
+
from vision_agents.core.agents.conversation import InMemoryConversation
|
|
11
|
+
from vision_agents.core.edge.edge_transport import EdgeTransport
|
|
12
|
+
from vision_agents.core.edge.events import AudioReceivedEvent, TrackAddedEvent
|
|
13
|
+
from vision_agents.core.edge.types import Connection, Participant, TrackType, User
|
|
14
|
+
from vision_agents.core.utils.utils import cancel_and_wait
|
|
15
|
+
from vision_agents.core.utils.video_track import QueuedVideoTrack
|
|
16
|
+
|
|
17
|
+
from .devices import AudioInputDevice, AudioOutputDevice, CameraDevice
|
|
18
|
+
from .display import VideoDisplay
|
|
19
|
+
from .tracks import LocalOutputAudioTrack, LocalVideoTrack
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from vision_agents.core.agents.agents import Agent
|
|
23
|
+
|
|
24
|
+
PLUGIN_NAME = "local"
|
|
25
|
+
LOCAL_VIDEO_TRACK_ID = "local-video-track"
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class LocalCall:
    """Bare-bones stand-in for a Call object when running on local transport."""

    # Identifier of the call.
    id: str
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class LocalEdge(EdgeTransport):
|
|
38
|
+
"""EdgeTransport implementation for local audio/video I/O.
|
|
39
|
+
|
|
40
|
+
Captures microphone audio via AudioInputDevice and plays agent audio
|
|
41
|
+
through AudioOutputDevice (both default to sounddevice). Optionally
|
|
42
|
+
captures camera video via CameraDevice and displays agent video output
|
|
43
|
+
in a tkinter window. Subclass the device classes to use alternative
|
|
44
|
+
backends (e.g. GStreamer).
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
def __init__(
|
|
48
|
+
self,
|
|
49
|
+
audio_input: AudioInputDevice,
|
|
50
|
+
audio_output: AudioOutputDevice,
|
|
51
|
+
video_input: CameraDevice | None = None,
|
|
52
|
+
video_width: int = 640,
|
|
53
|
+
video_height: int = 480,
|
|
54
|
+
video_fps: int = 30,
|
|
55
|
+
):
|
|
56
|
+
"""Create a local edge transport.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
audio_input: Microphone device for capturing user audio.
|
|
60
|
+
audio_output: Speaker device for playing agent audio.
|
|
61
|
+
video_input: Camera device for capturing user video. None disables video.
|
|
62
|
+
video_width: Width of the video frame in pixels.
|
|
63
|
+
video_height: Height of the video frame in pixels.
|
|
64
|
+
video_fps: Video frame rate.
|
|
65
|
+
"""
|
|
66
|
+
super().__init__()
|
|
67
|
+
|
|
68
|
+
self._audio_input = audio_input
|
|
69
|
+
self._audio_output = audio_output
|
|
70
|
+
|
|
71
|
+
self._video_input = video_input.device if video_input else None
|
|
72
|
+
self._video_width = video_width
|
|
73
|
+
self._video_height = video_height
|
|
74
|
+
self._video_fps = video_fps
|
|
75
|
+
|
|
76
|
+
self._participant = Participant(
|
|
77
|
+
original=None,
|
|
78
|
+
user_id="local",
|
|
79
|
+
id="local",
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
self._mic_task: asyncio.Task[None] | None = None
|
|
83
|
+
self._video_forward_task: asyncio.Task[None] | None = None
|
|
84
|
+
self._audio_track: LocalOutputAudioTrack | None = None
|
|
85
|
+
self._input_video_track: LocalVideoTrack | None = None
|
|
86
|
+
self._output_video_track = QueuedVideoTrack(
|
|
87
|
+
width=video_width,
|
|
88
|
+
height=video_height,
|
|
89
|
+
fps=video_fps,
|
|
90
|
+
)
|
|
91
|
+
self._video_display: VideoDisplay | None = None
|
|
92
|
+
self._connection: LocalConnection | None = None
|
|
93
|
+
|
|
94
|
+
async def publish_tracks(
|
|
95
|
+
self,
|
|
96
|
+
audio_track: aiortc.MediaStreamTrack | None,
|
|
97
|
+
video_track: aiortc.MediaStreamTrack | None,
|
|
98
|
+
) -> None:
|
|
99
|
+
"""Publish the agent's media tracks locally."""
|
|
100
|
+
if audio_track is not None and isinstance(audio_track, LocalOutputAudioTrack):
|
|
101
|
+
audio_track.start()
|
|
102
|
+
logger.info("Audio track published and started")
|
|
103
|
+
|
|
104
|
+
if video_track is not None:
|
|
105
|
+
self._video_forward_task = asyncio.create_task(
|
|
106
|
+
self._forward_video(video_track)
|
|
107
|
+
)
|
|
108
|
+
logger.info("Video output track published")
|
|
109
|
+
|
|
110
|
+
def create_audio_track(self) -> "LocalOutputAudioTrack":
|
|
111
|
+
"""Create an audio track that plays through the audio output backend."""
|
|
112
|
+
self._audio_track = LocalOutputAudioTrack(
|
|
113
|
+
audio_output=self._audio_output,
|
|
114
|
+
)
|
|
115
|
+
return self._audio_track
|
|
116
|
+
|
|
117
|
+
def create_video_track(self) -> LocalVideoTrack | None:
|
|
118
|
+
"""Create a video track for the agent's camera input."""
|
|
119
|
+
if self._video_input is None:
|
|
120
|
+
logger.debug("No video device configured, skipping video track creation")
|
|
121
|
+
return None
|
|
122
|
+
|
|
123
|
+
self._input_video_track = LocalVideoTrack(
|
|
124
|
+
device=self._video_input,
|
|
125
|
+
width=self._video_width,
|
|
126
|
+
height=self._video_height,
|
|
127
|
+
fps=self._video_fps,
|
|
128
|
+
)
|
|
129
|
+
return self._input_video_track
|
|
130
|
+
|
|
131
|
+
def add_track_subscriber(self, track_id: str) -> LocalVideoTrack | None:
|
|
132
|
+
"""Return the local camera video track if available."""
|
|
133
|
+
if track_id == LOCAL_VIDEO_TRACK_ID and self._input_video_track is not None:
|
|
134
|
+
return self._input_video_track
|
|
135
|
+
return None
|
|
136
|
+
|
|
137
|
+
async def join(
|
|
138
|
+
self, agent: "Agent", call: Any = None, **kwargs: Any
|
|
139
|
+
) -> "LocalConnection":
|
|
140
|
+
"""Start microphone capture and optionally camera."""
|
|
141
|
+
await self._start_audio()
|
|
142
|
+
|
|
143
|
+
if self._video_input is not None:
|
|
144
|
+
video_track = self.create_video_track()
|
|
145
|
+
if video_track is not None:
|
|
146
|
+
self.events.send(
|
|
147
|
+
TrackAddedEvent(
|
|
148
|
+
plugin_name=PLUGIN_NAME,
|
|
149
|
+
track_id=LOCAL_VIDEO_TRACK_ID,
|
|
150
|
+
track_type=TrackType.VIDEO,
|
|
151
|
+
participant=self._participant,
|
|
152
|
+
)
|
|
153
|
+
)
|
|
154
|
+
logger.info("Camera video track added")
|
|
155
|
+
|
|
156
|
+
self._connection = LocalConnection(self)
|
|
157
|
+
return self._connection
|
|
158
|
+
|
|
159
|
+
async def close(self) -> None:
|
|
160
|
+
"""Stop audio/video and release all resources."""
|
|
161
|
+
if self._video_forward_task is not None:
|
|
162
|
+
await cancel_and_wait(self._video_forward_task)
|
|
163
|
+
self._video_forward_task = None
|
|
164
|
+
|
|
165
|
+
self._output_video_track.stop()
|
|
166
|
+
|
|
167
|
+
if self._video_display is not None:
|
|
168
|
+
await self._video_display.stop()
|
|
169
|
+
self._video_display = None
|
|
170
|
+
|
|
171
|
+
await self._stop_audio()
|
|
172
|
+
self._connection = None
|
|
173
|
+
|
|
174
|
+
async def authenticate(self, user: User) -> None:
    """Do nothing: the local transport does not require any auth."""
    return None
|
|
177
|
+
|
|
178
|
+
def open_demo(self, *args: Any, **kwargs: Any) -> None:
    """Accept any arguments and do nothing."""
    return None
|
|
179
|
+
|
|
180
|
+
async def open_demo_for_agent(
    self, agent: "Agent", call_type: str, call_id: str
) -> None:
    """Show the agent's video output in a ``VideoDisplay`` window.

    Does nothing beyond logging when ``agent.publish_video`` is falsy.
    A ``RuntimeError`` raised while creating or starting the display is
    logged as a warning (reported as tkinter being unavailable) and not
    propagated. ``call_type`` and ``call_id`` are not used here.
    """
    if not agent.publish_video:
        logger.info("Agent has no video output, skipping video display")
        return

    try:
        display = VideoDisplay(
            title="Agent Video Output",
            width=self._video_width,
            height=self._video_height,
            fps=self._video_fps,
        )
        # Stored before start() so close() can stop it later.
        self._video_display = display
        await display.start(self._output_video_track)
        logger.info("Opened video display")
    except RuntimeError:
        logger.warning(
            "Cannot open video display: tkinter is not available. "
            "Install python3-tk or equivalent for your platform."
        )
|
|
202
|
+
|
|
203
|
+
async def create_call(self, call_id: str, **kwargs: Any) -> LocalCall:
    """Build a ``LocalCall`` carrying ``call_id``; ``kwargs`` are ignored."""
    local_call = LocalCall(id=call_id)
    return local_call
|
|
205
|
+
|
|
206
|
+
async def send_custom_event(self, data: dict[str, Any]) -> None:
    """Reject custom events, which this transport does not support.

    Raises:
        NotImplementedError: Always.
    """
    reason = "LocalEdge does not support send_custom_event"
    raise NotImplementedError(reason)
|
|
208
|
+
|
|
209
|
+
async def create_conversation(
    self, call: Any, user: User, instructions: str
) -> InMemoryConversation:
    """Create an empty in-memory conversation seeded with ``instructions``.

    ``call`` and ``user`` are accepted but not used by this method.
    """
    conversation = InMemoryConversation(instructions=instructions, messages=[])
    return conversation
|
|
213
|
+
|
|
214
|
+
def _emit_audio_event(self, data: np.ndarray) -> None:
    """Wrap captured audio in ``PcmData`` and send an ``AudioReceivedEvent``.

    Args:
        data: Captured samples; flattened and cast to int16 before being
            wrapped. Sample rate and channel count come from the audio
            input device.
    """
    mic = self._audio_input
    speaker = self._participant

    pcm = PcmData(
        samples=data.flatten().astype(np.int16),
        sample_rate=mic.sample_rate,
        format=AudioFormat.S16,
        channels=mic.channels,
    )
    pcm.participant = speaker

    event = AudioReceivedEvent(
        plugin_name=PLUGIN_NAME,
        pcm_data=pcm,
        participant=speaker,
    )
    self.events.send(event)
|
|
232
|
+
|
|
233
|
+
async def _forward_video(self, source: aiortc.MediaStreamTrack) -> None:
    """Pump frames from ``source`` into the output video track.

    Runs until ``source`` (or the output track) raises
    ``aiortc.MediaStreamError``, which is logged at debug level and ends
    the loop. Cancellation is not intercepted and propagates to the
    awaiting caller.
    """
    try:
        while True:
            incoming = await source.recv()
            await self._output_video_track.add_frame(cast(av.VideoFrame, incoming))
    except aiortc.MediaStreamError:
        logger.debug("Source video track ended")
|
|
243
|
+
|
|
244
|
+
async def _mic_loop(self) -> None:
    """Continuously read microphone chunks and emit them as audio events.

    Each blocking ``read`` call runs in a worker thread via
    ``asyncio.to_thread``. Reads that return ``None`` are skipped. The
    loop only ends through cancellation (logged, then re-raised) or an
    exception raised by the read/emit calls.
    """
    try:
        while True:
            chunk = await asyncio.to_thread(self._audio_input.read)
            if chunk is None:
                continue
            self._emit_audio_event(chunk)
    except asyncio.CancelledError:
        logger.debug("Mic loop cancelled")
        raise
|
|
254
|
+
|
|
255
|
+
async def _start_audio(self) -> None:
    """Start the audio input device and spawn the mic-reading task.

    A no-op when a mic task already exists.
    """
    if self._mic_task is not None:
        return

    mic = self._audio_input
    mic.start()
    logger.info(
        "Started microphone: %dHz, %d channels",
        mic.sample_rate,
        mic.channels,
    )
    self._mic_task = asyncio.create_task(self._mic_loop())
|
|
267
|
+
|
|
268
|
+
async def _stop_audio(self) -> None:
    """Stop the mic task, the audio devices/tracks and the camera track.

    Despite its name this also stops and clears the input video track.
    Cleanup always runs to completion: if the mic task had already
    failed with an error, that error is logged instead of propagating,
    so the audio input device and the tracks are still released.
    """
    mic_task = self._mic_task
    if mic_task is not None:
        self._mic_task = None
        mic_task.cancel()
        try:
            await mic_task
        except asyncio.CancelledError:
            pass
        except Exception:
            # The loop died on its own earlier (e.g. a device read
            # error); awaiting it re-raises that failure. Log it and
            # keep going so the device below is not left open.
            logger.exception("Mic loop terminated with an error")

    self._audio_input.stop()
    logger.info("Stopped microphone")

    if self._audio_track is not None:
        self._audio_track.stop()

    if self._input_video_track is not None:
        self._input_video_track.stop()
        self._input_video_track = None
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class LocalConnection(Connection):
    """Thin ``Connection`` implementation backed by a ``LocalEdge``."""

    def __init__(self, transport: "LocalEdge"):
        """Remember the transport that owns this connection."""
        super().__init__()
        self._transport = transport

    def idle_since(self) -> float:
        """Always report ``0.0``: the local transport is never idle."""
        return 0.0

    async def wait_for_participant(self, timeout: float | None = None) -> None:
        """Return at once; the local user is always present.

        ``timeout`` is accepted but not used.
        """
        return None

    async def close(self, timeout: float = 2.0) -> None:
        """Close the owning transport. ``timeout`` is not used."""
        transport = self._transport
        await transport.close()
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LocalTransport: audio/video track implementations.
|
|
3
|
+
|
|
4
|
+
Provides LocalOutputAudioTrack for speaker playback and LocalVideoTrack
|
|
5
|
+
for camera capture, enabling vision agents to run locally without cloud
|
|
6
|
+
edge infrastructure.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import logging
|
|
11
|
+
import platform
|
|
12
|
+
import threading
|
|
13
|
+
import time
|
|
14
|
+
from fractions import Fraction
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import av
|
|
18
|
+
import numpy as np
|
|
19
|
+
import sounddevice as sd
|
|
20
|
+
from aiortc import AudioStreamTrack, VideoStreamTrack
|
|
21
|
+
from getstream.video.rtc.track_util import PcmData
|
|
22
|
+
|
|
23
|
+
from .devices import AudioOutputDevice
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _get_camera_input_format() -> str:
    """Return the FFmpeg capture input format for the running OS.

    Raises:
        RuntimeError: If ``platform.system()`` is not Darwin, Linux or
            Windows.
    """
    system = platform.system()
    formats_by_system = {
        "Darwin": "avfoundation",
        "Linux": "v4l2",
        "Windows": "dshow",
    }
    input_format = formats_by_system.get(system)
    if input_format is None:
        raise RuntimeError(f"Unsupported platform for camera capture: {system}")
    return input_format
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class LocalOutputAudioTrack(AudioStreamTrack):
    """Audio track that plays PcmData through an AudioOutputDevice.

    Uses an asyncio.Queue for backpressure: when the queue is full,
    ``write`` awaits until the playback task drains an item. The playback
    task offloads blocking device writes via ``asyncio.to_thread``.

    Extends AudioStreamTrack so it satisfies the MediaStreamTrack interface
    required by EdgeTransport.publish_tracks. Since this is a write-only
    (playback) track, recv() is not supported.
    """

    def __init__(self, audio_output: AudioOutputDevice, buffer_limit: int = 20):
        """Create a stopped track.

        Args:
            audio_output: Device backend that receives the samples.
            buffer_limit: Maximum number of queued chunks before
                ``write`` starts to wait.
        """
        super().__init__()
        self._audio_output = audio_output
        self._queue: asyncio.Queue[np.ndarray] = asyncio.Queue(maxsize=buffer_limit)
        self._running = False
        self._playback_task: asyncio.Task[None] | None = None
        # Serialises writers with flush() so a flush is not interleaved
        # with a chunk being enqueued.
        self._write_lock = asyncio.Lock()

    async def recv(self) -> av.AudioFrame:
        """Not supported — this is a write-only playback track."""
        raise NotImplementedError(
            "LocalOutputAudioTrack is a playback-only track; recv() is not supported"
        )

    def start(self) -> None:
        """Start the output device and the playback task (idempotent).

        Must be called while an event loop is running, because it
        creates the playback task.
        """
        if self._running:
            return

        self._audio_output.start()
        self._running = True
        self._playback_task = asyncio.create_task(self._playback_loop())

    async def write(self, data: PcmData) -> None:
        """Queue PCM data for playback; silently dropped when not running.

        Waits while the queue is full.
        """
        if not self._running:
            return

        async with self._write_lock:
            samples = self._process_audio(data)
            await self._queue.put(samples)

    async def flush(self) -> None:
        """Clear any pending audio data and abort OS-level playback."""
        async with self._write_lock:
            self._drain_queue()
            self._audio_output.flush()

    def stop(self) -> None:
        """Stop the track, cancel playback, drop queued audio, stop the device."""
        super().stop()
        self._running = False

        if self._playback_task is not None:
            self._playback_task.cancel()
            self._playback_task = None

        self._drain_queue()
        self._audio_output.stop()

    def _drain_queue(self) -> None:
        """Discard every chunk currently waiting in the queue."""
        while True:
            try:
                self._queue.get_nowait()
            except asyncio.QueueEmpty:
                return

    async def _playback_loop(self) -> None:
        """Drain the queue into the audio output backend until cancelled.

        Write failures are handled per chunk. If a single bad chunk or
        device error ended the loop, the queue would never be drained
        again and ``write`` would wait forever once it filled up.
        """
        try:
            while True:
                data = await self._queue.get()
                try:
                    await asyncio.to_thread(self._audio_output.write, data)
                except sd.PortAudioError as err:
                    logger.debug("PortAudio playback error: %s", err)
                except ValueError:
                    logger.exception("Audio data processing error")
                except OSError:
                    logger.exception("Audio playback device error")
        except asyncio.CancelledError:
            logger.debug("Playback loop cancelled")
            raise

    def _process_audio(self, data: PcmData) -> np.ndarray:
        """Resample and convert PcmData to flat int16 numpy for the backend."""
        target_rate = self._audio_output.sample_rate
        target_channels = self._audio_output.channels

        if data.sample_rate != target_rate or data.channels != target_channels:
            data = data.resample(target_rate, target_channels)

        samples = data.to_int16().samples

        # 2-D input is transposed and flattened into one 1-D buffer.
        # NOTE(review): presumably (channels, samples) -> interleaved; confirm.
        if samples.ndim == 2:
            samples = samples.T.flatten()

        return samples
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class LocalVideoTrack(VideoStreamTrack):
    """Video track that captures from local camera using PyAV.

    The camera is opened lazily on the first ``recv`` call. When no frame
    can be read, ``recv`` returns a black placeholder frame instead.
    """

    kind = "video"

    def __init__(
        self,
        device: str,
        width: int = 640,
        height: int = 480,
        fps: int = 30,
    ):
        """Store capture settings; no device is opened yet.

        Args:
            device: Device identifier passed to ``av.open``.
            width: Requested frame width in pixels.
            height: Requested frame height in pixels.
            fps: Requested frame rate; also used as the frame time base.
        """
        super().__init__()

        self._device = device
        self._width = width
        self._height = height
        self._fps = fps
        self._container: Any = None
        self._stream: Any = None
        self._started = False
        self._stopped = False
        self._frame_count = 0
        self._start_time: float | None = None
        # Guards the teardown in stop().
        self._lock = threading.Lock()

    def _open_camera(self) -> None:
        """Open the camera device with PyAV (blocking).

        Raises:
            RuntimeError: On an unsupported platform (raised by
                ``_get_camera_input_format``).
        """
        input_format = _get_camera_input_format()

        options: dict[str, str] = {
            "framerate": str(self._fps),
            "video_size": f"{self._width}x{self._height}",
        }
        # Only the macOS backend is given an explicit pixel format.
        if platform.system() == "Darwin":
            options["pixel_format"] = "uyvy422"

        self._container = av.open(
            self._device,
            format=input_format,
            options=options,
        )
        self._stream = self._container.streams.video[0]
        logger.info(
            "Opened camera: %s (%dx%d @ %dfps)",
            self._device,
            self._width,
            self._height,
            self._fps,
        )

    def _read_frame(self, max_retries: int = 20, retry_timeout: float = 0.02) -> Any:
        """Read a single frame from the camera (blocking).

        Args:
            max_retries: Attempts made while the device reports
                ``BlockingIOError``.
            retry_timeout: Seconds slept between such attempts.

        Returns:
            The first decoded frame, or ``None`` when the camera is not
            open, not ready after all retries, or a read error occurred.
        """
        # Snapshot both references: stop() may reset the attributes to
        # None from another thread while this method is retrying.
        container = self._container
        stream = self._stream
        if container is None:
            return None

        for attempt in range(max_retries):
            try:
                for packet in container.demux(stream):
                    for frame in packet.decode():
                        return frame
            except BlockingIOError:
                if attempt < max_retries - 1:
                    time.sleep(retry_timeout)
                    continue
                logger.debug("Camera not ready after %d retries", max_retries)
                return None
            except OSError:
                logger.warning("Error reading camera frame", exc_info=True)
                return None
        return None

    async def recv(self) -> av.VideoFrame:
        """Receive the next video frame.

        Returns:
            A camera frame, or a black ``rgb24`` frame of the configured
            size when none could be read. ``pts`` is a running frame
            counter with a time base of ``1 / fps``.

        Raises:
            RuntimeError: If the track has been stopped.
        """
        if self._stopped:
            raise RuntimeError("Track has been stopped")

        if not self._started:
            self._started = True
            self._start_time = time.time()

            await asyncio.to_thread(self._open_camera)

        frame = await asyncio.to_thread(self._read_frame)

        if frame is None:
            # Build the placeholder from an array so PyAV copies it row
            # by row; writing width*height*3 raw bytes into the plane
            # fails when the plane's rows are padded.
            black = np.zeros((self._height, self._width, 3), dtype=np.uint8)
            frame = av.VideoFrame.from_ndarray(black, format="rgb24")

        self._frame_count += 1
        frame.pts = self._frame_count
        frame.time_base = Fraction(1, self._fps)
        return frame

    def stop(self) -> None:
        """Stop camera capture and release resources."""
        # Mark the underlying aiortc track as ended, as the sibling
        # audio track does in its stop().
        super().stop()

        with self._lock:
            self._stopped = True
            if self._container is not None:
                try:
                    self._container.close()
                except OSError:
                    logger.warning("Error closing camera")
                self._container = None
                self._stream = None
                logger.info("Stopped camera capture")
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Callable, TypeVar
|
|
3
|
+
|
|
4
|
+
if sys.platform != "win32":
    import termios

    def safe_input(prompt: str) -> str:
        """Read a line via ``input`` after making sure CR is mapped to NL.

        PortAudio (via sounddevice) can disable the ICRNL terminal flag,
        which causes Enter (CR) to show as ^M instead of submitting input.
        When stdin is a terminal and the flag is off, it is switched back
        on before prompting.
        """
        stdin = sys.stdin
        if not stdin.isatty():
            return input(prompt)

        fd = stdin.fileno()
        term_attrs = termios.tcgetattr(fd)
        input_flags = term_attrs[0]  # index 0 holds the input-mode flags
        if (input_flags & termios.ICRNL) == 0:
            term_attrs[0] = input_flags | termios.ICRNL
            termios.tcsetattr(fd, termios.TCSANOW, term_attrs)
        return input(prompt)

else:

    def safe_input(prompt: str) -> str:
        """Read a line via ``input``; no terminal fix-up on this platform."""
        return input(prompt)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
T = TypeVar("T")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def prompt_selection(
|
|
31
|
+
items: list[T],
|
|
32
|
+
formatter: Callable[[T], str],
|
|
33
|
+
header: str,
|
|
34
|
+
default: T | None = None,
|
|
35
|
+
allow_skip: bool = False,
|
|
36
|
+
empty_message: str | None = None,
|
|
37
|
+
) -> T | None:
|
|
38
|
+
"""Interactive terminal prompt to pick one item from a list."""
|
|
39
|
+
print("\n" + "=" * 50)
|
|
40
|
+
print(header)
|
|
41
|
+
print("=" * 50)
|
|
42
|
+
|
|
43
|
+
if not items:
|
|
44
|
+
if empty_message:
|
|
45
|
+
print(f" {empty_message}")
|
|
46
|
+
print("-" * 50 + "\n")
|
|
47
|
+
return None
|
|
48
|
+
|
|
49
|
+
for i, item in enumerate(items):
|
|
50
|
+
print(f" {i}: {formatter(item)}")
|
|
51
|
+
|
|
52
|
+
if allow_skip:
|
|
53
|
+
print(" n: Skip (none)")
|
|
54
|
+
|
|
55
|
+
print("-" * 50)
|
|
56
|
+
|
|
57
|
+
while True:
|
|
58
|
+
try:
|
|
59
|
+
if allow_skip:
|
|
60
|
+
text = f"Select [0-{len(items) - 1}] or 'n' to skip: "
|
|
61
|
+
elif default is not None:
|
|
62
|
+
text = f"Select [0-{len(items) - 1}] (Enter for default): "
|
|
63
|
+
else:
|
|
64
|
+
text = f"Select [0-{len(items) - 1}]: "
|
|
65
|
+
|
|
66
|
+
choice = safe_input(text).strip().lower()
|
|
67
|
+
|
|
68
|
+
if choice == "" and default is not None:
|
|
69
|
+
print(f" -> Using default: {formatter(default)}")
|
|
70
|
+
return default
|
|
71
|
+
|
|
72
|
+
if choice in ("n", "") and allow_skip:
|
|
73
|
+
print(" -> No selection")
|
|
74
|
+
print("-" * 50 + "\n")
|
|
75
|
+
return None
|
|
76
|
+
|
|
77
|
+
idx = int(choice)
|
|
78
|
+
if 0 <= idx < len(items):
|
|
79
|
+
selected = items[idx]
|
|
80
|
+
print(f" -> Selected: {formatter(selected)}")
|
|
81
|
+
print("-" * 50 + "\n")
|
|
82
|
+
return selected
|
|
83
|
+
|
|
84
|
+
print(f" Invalid choice, enter 0-{len(items) - 1}")
|
|
85
|
+
except ValueError:
|
|
86
|
+
print(" Please enter a number" + (" or 'n'" if allow_skip else ""))
|