vision-agents-plugins-local 0.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .cursor/*
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ downloads/
12
+ develop-eggs/
13
+ eggs/
14
+ .eggs/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ share/python-wheels/
21
+ pip-wheel-metadata/
22
+ MANIFEST
23
+ *.egg-info/
24
+ *.egg
25
+
26
+ # Installer logs
27
+ pip-log.txt
28
+ pip-delete-this-directory.txt
29
+
30
+ # Unit test / coverage reports
31
+ htmlcov/
32
+ .tox/
33
+ .nox/
34
+ .coverage
35
+ .coverage.*
36
+ .cache
37
+ coverage.xml
38
+ nosetests.xml
39
+ *.cover
40
+ *.py,cover
41
+ .hypothesis/
42
+ .pytest_cache/
43
+
44
+ # Type checker / lint caches
45
+ .mypy_cache/
46
+ .dmypy.json
47
+ dmypy.json
48
+ .pytype/
49
+ .pyre/
50
+ .ruff_cache/
51
+
52
+ # Environments
53
+ .venv
54
+ env/
55
+ venv/
56
+ ENV/
57
+ env.bak/
58
+ venv.bak/
59
+ .env
60
+ .env.local
61
+ .env.*.local
62
+ .env.bak
63
+ pyvenv.cfg
64
+ .python-version
65
+
66
+ # Editors / IDEs
67
+ .vscode/
68
+ .idea/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints/
72
+
73
+ # OS / Misc
74
+ .DS_Store
75
+ *.log
76
+
77
+ # Tooling & repo-specific
78
+ pyrightconfig.json
79
+ shell.nix
80
+ bin/*
81
+ lib/*
82
+ stream-py/
83
+
84
+ # Example lock files (regenerated by uv sync)
85
+ examples/*/uv.lock
86
+ plugins/*/example/uv.lock
87
+
88
+ # Artifacts / assets
89
+ *.pt
90
+ *.kef
91
+ *.onnx
92
+ profile.html
93
+
94
+ /opencode.json
95
+ .ralph-tui/
96
+ .claude/
97
+
98
+ .uv-cache/
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-local
3
+ Version: 0.4.4
4
+ Summary: Local audio & video integration for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,agents,local,voice agents
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: aiortc<1.15.0,>=1.14.0
12
+ Requires-Dist: av<17,>=14.2.0
13
+ Requires-Dist: sounddevice>=0.5.0
14
+ Requires-Dist: vision-agents
File without changes
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agents-plugins-local"
7
+ dynamic = ["version"]
8
+ description = "Local audio & video integration for Vision Agents"
9
+ readme = "README.md"
10
+ keywords = ["local", "AI", "voice agents", "agents"]
11
+ requires-python = ">=3.10"
12
+ license = "MIT"
13
+ dependencies = [
14
+ "vision-agents",
15
+ "sounddevice>=0.5.0",
16
+ "aiortc>=1.14.0, <1.15.0",
17
+ "av>=14.2.0, <17",
18
+ ]
19
+
20
+ [project.urls]
21
+ Documentation = "https://visionagents.ai/"
22
+ Website = "https://visionagents.ai/"
23
+ Source = "https://github.com/GetStream/Vision-Agents"
24
+
25
+ [tool.hatch.version]
26
+ source = "vcs"
27
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
28
+
29
+ [tool.hatch.build.targets.wheel]
30
+ packages = ["."]
31
+
32
+ [tool.hatch.build.targets.sdist]
33
+ include = ["/vision_agents"]
34
+
35
+ [tool.uv.sources]
36
+ vision-agents = { workspace = true }
37
+
38
+ [dependency-groups]
39
+ dev = [
40
+ "pytest>=8.4.1",
41
+ "pytest-asyncio>=1.0.0",
42
+ ]
@@ -0,0 +1,10 @@
1
+ from .devices import AudioInputDevice, AudioOutputDevice, CameraDevice
2
+ from .edge import LocalCall, LocalEdge
3
+
4
+ __all__ = [
5
+ "AudioInputDevice",
6
+ "AudioOutputDevice",
7
+ "CameraDevice",
8
+ "LocalCall",
9
+ "LocalEdge",
10
+ ]
@@ -0,0 +1,370 @@
1
+ """
2
+ Device enumeration and selection utilities for the local transport (LocalEdge).
3
+
4
+ Provides typed device representations and interactive prompts for
5
+ selecting audio and video devices when running agents locally.
6
+ """
7
+
8
+ import glob
9
+ import logging
10
+ import platform
11
+ import queue
12
+ import re
13
+ import subprocess
14
+ from dataclasses import dataclass
15
+
16
+ import numpy as np
17
+ import sounddevice as sd
18
+
19
+ from .utils import prompt_selection
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ _AVFOUNDATION_RE = re.compile(r"\[AVFoundation.*?\]\s*\[(\d+)\]\s*(.+)")
25
+ _DSHOW_DEVICE_RE = re.compile(r'"(.+?)"')
26
+
27
+
28
+ class AudioInputDevice:
29
+ """Audio input device (microphone).
30
+
31
+ Combines device metadata with stream capture. Subclass to implement
32
+ custom audio backends (e.g. GStreamer).
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ index: int,
38
+ name: str,
39
+ sample_rate: int = 48000,
40
+ channels: int = 1,
41
+ is_default: bool = False,
42
+ blocksize: int | None = None,
43
+ ):
44
+ self.index = index
45
+ self.name = name
46
+ self._sample_rate = sample_rate
47
+ self._channels = channels
48
+ self.is_default = is_default
49
+ self._blocksize = (
50
+ blocksize if blocksize is not None else int(sample_rate * 0.02)
51
+ )
52
+ self._stream: sd.InputStream | None = None
53
+ self._buffer: queue.Queue[np.ndarray] = queue.Queue(maxsize=200)
54
+
55
+ @property
56
+ def sample_rate(self) -> int:
57
+ return self._sample_rate
58
+
59
+ @property
60
+ def channels(self) -> int:
61
+ return self._channels
62
+
63
+ def _callback(
64
+ self,
65
+ indata: np.ndarray,
66
+ frames: int,
67
+ time_info: object,
68
+ status: object,
69
+ ) -> None:
70
+ if status:
71
+ logger.warning("Audio input status: %s", status)
72
+ try:
73
+ self._buffer.put_nowait(indata.copy())
74
+ except queue.Full:
75
+ pass
76
+
77
+ def start(self) -> None:
78
+ """Open and start the audio input stream."""
79
+ self._stream = sd.InputStream(
80
+ samplerate=self._sample_rate,
81
+ channels=self._channels,
82
+ dtype="int16",
83
+ blocksize=self._blocksize,
84
+ device=self.index,
85
+ callback=self._callback,
86
+ )
87
+ self._stream.start()
88
+ logger.info(
89
+ "Started audio input: %dHz, %d channels",
90
+ self._sample_rate,
91
+ self._channels,
92
+ )
93
+
94
+ def read(self) -> np.ndarray | None:
95
+ """Block until audio data is available (up to 100ms timeout)."""
96
+ try:
97
+ return self._buffer.get(timeout=0.1)
98
+ except queue.Empty:
99
+ return None
100
+
101
+ def stop(self) -> None:
102
+ """Stop and close the audio input stream."""
103
+ if self._stream is not None:
104
+ self._stream.stop()
105
+ self._stream.close()
106
+ self._stream = None
107
+ logger.info("Stopped audio input")
108
+
109
+
110
class AudioOutputDevice:
    """Audio output device (speaker/headphones).

    Combines device metadata with stream playback. Subclass to implement
    custom audio backends (e.g. GStreamer).
    """

    def __init__(
        self,
        index: int,
        name: str,
        sample_rate: int = 48000,
        channels: int = 2,
        is_default: bool = False,
        blocksize: int = 2048,
    ):
        """Describe an output device; no stream is opened until ``start``.

        Args:
            index: Device index passed to the stream as ``device``.
            name: Human-readable device name.
            sample_rate: Playback sample rate in Hz.
            channels: Number of playback channels.
            is_default: Whether this is the system default output device.
            blocksize: Frames per block requested from the stream.
        """
        self.index = index
        self.name = name
        self._sample_rate = sample_rate
        self._channels = channels
        self.is_default = is_default
        self._blocksize = blocksize
        self._stream: sd.OutputStream | None = None

    @property
    def sample_rate(self) -> int:
        """Playback sample rate in Hz."""
        return self._sample_rate

    @property
    def channels(self) -> int:
        """Number of playback channels."""
        return self._channels

    def start(self) -> None:
        """Open and start the audio output stream.

        Does nothing when a stream is already open, so repeated calls do
        not orphan the previous stream. If starting fails, the freshly
        opened stream is closed before the error propagates.
        """
        if self._stream is not None:
            return

        stream = sd.OutputStream(
            samplerate=self._sample_rate,
            channels=self._channels,
            dtype="int16",
            blocksize=self._blocksize,
            device=self.index,
        )
        try:
            stream.start()
        except Exception:
            stream.close()
            raise
        self._stream = stream
        logger.info(
            "Started audio output: %dHz, %d channels",
            self._sample_rate,
            self._channels,
        )

    def write(self, samples: np.ndarray) -> None:
        """Write interleaved int16 samples to the device.

        The input is flattened and regrouped into ``(frames, channels)``.
        A trailing partial frame (sample count not divisible by the channel
        count) is dropped instead of raising, and empty input is ignored.
        Does nothing when the stream has not been started.

        Args:
            samples: Interleaved samples; expected flat, but any shape is
                flattened in C order.
        """
        if self._stream is None:
            return
        flat = np.ascontiguousarray(samples).reshape(-1)
        frames = flat.size // self._channels
        if frames == 0:
            return
        audio = flat[: frames * self._channels].reshape(frames, self._channels)
        self._stream.write(audio)

    def flush(self) -> None:
        """Abort current playback and restart the stream."""
        if self._stream is not None:
            self._stream.abort()
            self._stream.start()

    def stop(self) -> None:
        """Stop and close the audio output stream (no-op when not started)."""
        stream, self._stream = self._stream, None
        if stream is None:
            return
        try:
            stream.stop()
        finally:
            # Always release the device, even when stop() fails.
            stream.close()
        logger.info("Stopped audio output")
179
+
180
+
181
@dataclass(frozen=True)
class CameraDevice:
    """Immutable description of one camera found during enumeration."""

    # Number assigned to the camera by the platform-specific lister.
    index: int
    # Human-readable label shown in the selection prompt.
    name: str
    # Platform-specific capture identifier: the AVFoundation index on macOS,
    # the /dev/video* path on Linux, or a dshow ``video="..."`` spec on Windows.
    device: str
188
+
189
+
190
def list_audio_input_devices() -> list[AudioInputDevice]:
    """Enumerate every device that reports at least one input channel.

    Each result uses the device's default sample rate and its maximum input
    channel count; the system default input is flagged with ``is_default``.
    """
    raw = sd.query_devices()
    default_in = sd.default.device[0]

    inputs: list[AudioInputDevice] = []
    for position, info in enumerate(raw):
        if not info["max_input_channels"] > 0:
            continue
        inputs.append(
            AudioInputDevice(
                index=position,
                name=info["name"],
                sample_rate=int(info["default_samplerate"]),
                channels=info["max_input_channels"],
                is_default=(position == default_in),
            )
        )
    return inputs
205
+
206
+
207
def list_audio_output_devices() -> list[AudioOutputDevice]:
    """Enumerate every device that reports at least one output channel.

    Each result uses the device's default sample rate and its maximum output
    channel count; the system default output is flagged with ``is_default``.
    """
    raw = sd.query_devices()
    default_out = sd.default.device[1]

    outputs: list[AudioOutputDevice] = []
    for position, info in enumerate(raw):
        if not info["max_output_channels"] > 0:
            continue
        outputs.append(
            AudioOutputDevice(
                index=position,
                name=info["name"],
                sample_rate=int(info["default_samplerate"]),
                channels=info["max_output_channels"],
                is_default=(position == default_out),
            )
        )
    return outputs
222
+
223
+
224
def select_audio_input_device() -> AudioInputDevice | None:
    """Prompt the user to pick a microphone.

    The first device flagged as the system default (if any) is offered as
    the prompt's default choice.
    """
    microphones = list_audio_input_devices()

    preselected = None
    for candidate in microphones:
        if candidate.is_default:
            preselected = candidate
            break

    return prompt_selection(
        items=microphones,
        formatter=_format_audio_device,
        header="INPUT DEVICES (Microphones)",
        default=preselected,
    )
234
+
235
+
236
def select_audio_output_device() -> AudioOutputDevice | None:
    """Prompt the user to pick a speaker.

    The first device flagged as the system default (if any) is offered as
    the prompt's default choice.
    """
    speakers = list_audio_output_devices()

    preselected = None
    for candidate in speakers:
        if candidate.is_default:
            preselected = candidate
            break

    return prompt_selection(
        items=speakers,
        formatter=_format_audio_device,
        header="OUTPUT DEVICES (Speakers)",
        default=preselected,
    )
246
+
247
+
248
def select_video_device() -> CameraDevice | None:
    """Prompt the user to pick a camera, with the option to skip.

    Returns:
        The chosen camera, or None when the user skips.
    """

    def _label(camera: CameraDevice) -> str:
        return camera.name

    detected = list_cameras()
    return prompt_selection(
        items=detected,
        formatter=_label,
        header="VIDEO DEVICES (Cameras)",
        allow_skip=True,
        empty_message="No cameras detected\n (Camera support requires ffmpeg to be installed)",
    )
263
+
264
+
265
def list_cameras() -> list[CameraDevice]:
    """Enumerate cameras with the lister matching the current OS.

    Returns an empty list on platforms other than macOS, Linux and Windows.
    """
    listers = {
        "Darwin": _list_cameras_darwin,
        "Linux": _list_cameras_linux,
        "Windows": _list_cameras_windows,
    }
    lister = listers.get(platform.system())
    if lister is None:
        return []
    return lister()
277
+
278
+
279
def _format_audio_device(dev: AudioInputDevice | AudioOutputDevice) -> str:
    """Render a device as ``name (rateHz)``, tagging the system default."""
    label = f"{dev.name} ({dev.sample_rate}Hz)"
    if dev.is_default:
        label += " [DEFAULT]"
    return label
283
+
284
+
285
def _list_cameras_darwin() -> list[CameraDevice]:
    """Enumerate macOS cameras by parsing ffmpeg's AVFoundation listing.

    ffmpeg prints the device list on stderr. Only entries between the
    "video devices" and "audio devices" headings are kept. Returns an empty
    list when ffmpeg is missing or does not answer within five seconds.
    """
    command = ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""]
    try:
        listing = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        logger.warning("Failed to list cameras (is ffmpeg installed?)")
        return []

    found: list[CameraDevice] = []
    seen_video_heading = False

    for entry in listing.stderr.splitlines():
        if "AVFoundation video devices:" in entry:
            seen_video_heading = True
            continue
        if "AVFoundation audio devices:" in entry:
            # Everything after this heading is audio hardware.
            break
        if not seen_video_heading:
            continue

        parsed = _AVFOUNDATION_RE.search(entry)
        if not parsed:
            continue

        number = int(parsed.group(1))
        found.append(
            CameraDevice(index=number, name=parsed.group(2), device=str(number))
        )

    return found
318
+
319
+
320
def _list_cameras_linux() -> list[CameraDevice]:
    """Enumerate Linux cameras from /dev/video* nodes.

    The display name is read from the matching sysfs ``name`` file; when
    that file cannot be read, the device path itself is used as the name.
    """
    found: list[CameraDevice] = []

    nodes = sorted(glob.glob("/dev/video*"))
    for position, node in enumerate(nodes):
        node_name = node.split("/")[-1]
        sysfs_name = f"/sys/class/video4linux/{node_name}/name"
        try:
            with open(sysfs_name) as handle:
                label = handle.read().strip()
        except OSError:
            label = node
        found.append(CameraDevice(index=position, name=label, device=node))

    return found
334
+
335
+
336
def _list_cameras_windows() -> list[CameraDevice]:
    """List cameras on Windows via ffmpeg/DirectShow.

    ffmpeg prints the device list on stderr. Two listing layouts are
    handled:

    * sectioned: a "DirectShow video devices" heading, the quoted device
      names, then a "DirectShow audio devices" heading;
    * tagged: no headings; each quoted device name is followed by its media
      types, e.g. ``"HD Webcam" (video)``.

    In both layouts each device is followed by an ``Alternative name "..."``
    line. Those lines are skipped so they are not reported as extra cameras.

    Returns:
        Detected cameras, or an empty list when ffmpeg is missing or does
        not answer within five seconds.
    """
    try:
        result = subprocess.run(
            ["ffmpeg", "-f", "dshow", "-list_devices", "true", "-i", "dummy"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        logger.warning("Failed to list cameras (is ffmpeg installed?)")
        return []

    cameras: list[CameraDevice] = []
    in_video_section = False

    for line in result.stderr.splitlines():
        if "DirectShow video devices" in line:
            in_video_section = True
            continue
        if "DirectShow audio devices" in line:
            # Sectioned layout: everything after this heading is audio.
            break

        match = _DSHOW_DEVICE_RE.search(line)
        if match is None:
            continue
        if "Alternative name" in line[: match.start()]:
            # The quoted text is a device path alias, not a device name.
            continue

        # Tagged layout: the media types follow the closing quote.
        media_types = line[match.end() :]
        if not in_video_section and "video" not in media_types:
            continue

        name = match.group(1)
        cameras.append(
            CameraDevice(
                index=len(cameras),
                name=name,
                device=f'video="{name}"',
            )
        )

    return cameras
@@ -0,0 +1,181 @@
1
+ """Tkinter-based video display for LocalEdge.
2
+
3
+ Shows the agent's outbound video track in a tkinter window.
4
+ Gracefully handles environments where tkinter is not available.
5
+ """
6
+
7
+ import asyncio
8
+ import contextlib
9
+ import logging
10
+ import signal
11
+ import threading
12
+ import warnings
13
+ from typing import cast
14
+
15
+ import av
16
+ from aiortc import MediaStreamError, MediaStreamTrack
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ try:
21
+ import tkinter
22
+
23
+ _TKINTER_AVAILABLE = True
24
+ except ImportError:
25
+ _TKINTER_AVAILABLE = False
26
+
27
+
28
def _fit_size(src_w: int, src_h: int, dst_w: int, dst_h: int) -> tuple[int, int]:
    """Scale the source size to fit inside dst, keeping the aspect ratio.

    Both returned dimensions are rounded down to an even number and are
    never smaller than 2.
    """
    ratio = min(dst_w / src_w, dst_h / src_h)

    def _even(length: float) -> int:
        # Clamp to at least 2, then clear the lowest bit to force evenness.
        return max(2, int(length)) & ~1

    return _even(src_w * ratio), _even(src_h * ratio)
32
+
33
+
34
def _frame_to_ppm(frame: av.VideoFrame, width: int, height: int) -> bytes:
    """Encode a frame as binary PPM (P6), scaled to fit width x height.

    The frame is converted to rgb24 at the aspect-preserving size computed
    by ``_fit_size``; the result is the PPM header followed by raw pixels.
    """
    out_w, out_h = _fit_size(frame.width, frame.height, width, height)
    scaled = frame.reformat(width=out_w, height=out_h, format="rgb24")
    payload = scaled.to_ndarray().tobytes()
    return f"P6 {out_w} {out_h} 255 ".encode() + payload
41
+
42
+
43
class VideoDisplay:
    """Displays video frames from a MediaStreamTrack in a tkinter window.

    Tkinter events are pumped from the asyncio event loop (main thread)
    to satisfy macOS Cocoa requirements. A receiver task keeps only the
    most recent frame in a lock-guarded slot; the display loop takes that
    frame once per tick, so frames arriving faster than ``fps`` are dropped.
    """

    def __init__(
        self,
        title: str = "Agent Video Output",
        width: int = 640,
        height: int = 480,
        fps: int = 30,
    ):
        """Configure the display; no window is created until ``start``.

        Args:
            title: Window title.
            width: Window width in pixels.
            height: Window height in pixels.
            fps: Display refresh rate.

        Raises:
            ValueError: If ``fps``, ``width`` or ``height`` is not positive.
        """
        if fps <= 0:
            raise ValueError("fps must be > 0")
        if width <= 0:
            raise ValueError("width must be > 0")
        if height <= 0:
            raise ValueError("height must be > 0")
        self._title = title
        self._width = width
        self._height = height
        self._frame_interval = 1.0 / fps
        self._running = False
        self._latest_frame: av.VideoFrame | None = None
        self._frame_lock = threading.Lock()
        self._recv_task: asyncio.Task[None] | None = None
        self._tk_task: asyncio.Task[None] | None = None
        self._root: tkinter.Tk | None = None

    async def start(self, video_track: MediaStreamTrack) -> None:
        """Start displaying frames from the given video track.

        If tkinter is not available, emits an ImportWarning, logs a
        warning, and returns without starting.
        """
        if not _TKINTER_AVAILABLE:
            message = (
                "tkinter is not available. Install python3-tk or equivalent "
                "for your platform to use the video display."
            )
            warnings.warn(message, ImportWarning, stacklevel=2)
            # ImportWarning is ignored by Python's default warning filters,
            # so also log to make the missing display visible.
            logger.warning(message)
            return

        self._running = True
        self._recv_task = asyncio.create_task(self._recv_loop(video_track))
        self._tk_task = asyncio.create_task(self._tk_loop())

    async def stop(self) -> None:
        """Stop the display and clean up tasks.

        Both tasks are cancelled before either is awaited. A task that
        already failed (for example because the window could not be
        created) is logged rather than re-raised, so the other task is
        still cleaned up.
        """
        self._running = False

        tasks = [t for t in (self._recv_task, self._tk_task) if t is not None]
        self._recv_task = None
        self._tk_task = None

        for task in tasks:
            task.cancel()
        for task in tasks:
            try:
                await task
            except asyncio.CancelledError:
                pass
            except Exception:
                logger.warning("Video display task failed", exc_info=True)

    async def _recv_loop(self, track: MediaStreamTrack) -> None:
        """Consume frames from the video track and store the latest."""
        try:
            while self._running:
                frame = cast(av.VideoFrame, await track.recv())
                with self._frame_lock:
                    self._latest_frame = frame
        except asyncio.CancelledError:
            raise
        except MediaStreamError:
            logger.debug("Video track ended")
        except RuntimeError:
            logger.debug("Video track stopped")

    async def _tk_loop(self) -> None:
        """Pump Tkinter events from the asyncio event loop (main thread)."""
        root: tkinter.Tk | None = None
        prev_sigint = signal.getsignal(signal.SIGINT)
        try:
            root = tkinter.Tk()
            # Tk() overrides SIGINT — restore the previous handler (typically
            # asyncio's) so the first Ctrl+C gracefully cancels the main task
            # instead of raising KeyboardInterrupt inside this task.
            signal.signal(signal.SIGINT, prev_sigint)
            root.title(self._title)
            root.geometry(f"{self._width}x{self._height}")
            root.protocol("WM_DELETE_WINDOW", self._on_window_close)
            self._root = root

            # Mid-gray placeholder shown until the first frame arrives.
            gray = b"\x80" * (self._width * self._height * 3)
            header = f"P6 {self._width} {self._height} 255 ".encode()
            # The image is stored on self so a reference outlives this call.
            self._photo = tkinter.PhotoImage(data=header + gray)

            self._label = tkinter.Label(root, image=self._photo)
            self._label.pack(fill="both", expand=True)

            while self._running:
                # Take the pending frame (if any) and clear the slot.
                with self._frame_lock:
                    frame = self._latest_frame
                    self._latest_frame = None

                if frame is not None:
                    # Pixel conversion runs in a worker thread so the event
                    # loop is not blocked while it scales the frame.
                    ppm = await asyncio.to_thread(
                        _frame_to_ppm,
                        frame,
                        self._width,
                        self._height,
                    )
                    self._photo = tkinter.PhotoImage(data=ppm)
                    self._label.configure(image=self._photo)

                try:
                    root.update()
                except (tkinter.TclError, KeyboardInterrupt):
                    break

                await asyncio.sleep(self._frame_interval)
        except asyncio.CancelledError:
            raise
        finally:
            if root is not None:
                # The window may already be gone (closed by the user).
                with contextlib.suppress(tkinter.TclError, KeyboardInterrupt):
                    root.destroy()
            self._root = None

    def _on_window_close(self) -> None:
        """Handle the user closing the tkinter window."""
        self._running = False
        if self._root is not None:
            try:
                self._root.destroy()
            except tkinter.TclError:
                pass
            self._root = None
@@ -0,0 +1,306 @@
1
+ import asyncio
2
+ import logging
3
+ from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING, Any, cast
5
+
6
+ import aiortc
7
+ import av
8
+ import numpy as np
9
+ from getstream.video.rtc.track_util import AudioFormat, PcmData
10
+ from vision_agents.core.agents.conversation import InMemoryConversation
11
+ from vision_agents.core.edge.edge_transport import EdgeTransport
12
+ from vision_agents.core.edge.events import AudioReceivedEvent, TrackAddedEvent
13
+ from vision_agents.core.edge.types import Connection, Participant, TrackType, User
14
+ from vision_agents.core.utils.utils import cancel_and_wait
15
+ from vision_agents.core.utils.video_track import QueuedVideoTrack
16
+
17
+ from .devices import AudioInputDevice, AudioOutputDevice, CameraDevice
18
+ from .display import VideoDisplay
19
+ from .tracks import LocalOutputAudioTrack, LocalVideoTrack
20
+
21
+ if TYPE_CHECKING:
22
+ from vision_agents.core.agents.agents import Agent
23
+
24
+ PLUGIN_NAME = "local"
25
+ LOCAL_VIDEO_TRACK_ID = "local-video-track"
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
@dataclass
class LocalCall:
    """Lightweight stand-in for a Call object when running locally."""

    # Identifier supplied by the caller of ``LocalEdge.create_call``.
    id: str
35
+
36
+
37
class LocalEdge(EdgeTransport):
    """EdgeTransport implementation for local audio/video I/O.

    Captures microphone audio via AudioInputDevice and plays agent audio
    through AudioOutputDevice (both default to sounddevice). Optionally
    captures camera video via CameraDevice and displays agent video output
    in a tkinter window. Subclass the device classes to use alternative
    backends (e.g. GStreamer).
    """

    def __init__(
        self,
        audio_input: AudioInputDevice,
        audio_output: AudioOutputDevice,
        video_input: CameraDevice | None = None,
        video_width: int = 640,
        video_height: int = 480,
        video_fps: int = 30,
    ):
        """Create a local edge transport.

        Args:
            audio_input: Microphone device for capturing user audio.
            audio_output: Speaker device for playing agent audio.
            video_input: Camera device for capturing user video. None disables video.
            video_width: Width of the video frame in pixels.
            video_height: Height of the video frame in pixels.
            video_fps: Video frame rate.
        """
        super().__init__()

        self._audio_input = audio_input
        self._audio_output = audio_output

        # Only the platform device string of the camera is kept.
        self._video_input = video_input.device if video_input else None
        self._video_width = video_width
        self._video_height = video_height
        self._video_fps = video_fps

        # Single fixed participant attached to every emitted event.
        self._participant = Participant(
            original=None,
            user_id="local",
            id="local",
        )

        self._mic_task: asyncio.Task[None] | None = None
        self._video_forward_task: asyncio.Task[None] | None = None
        self._audio_track: LocalOutputAudioTrack | None = None
        self._input_video_track: LocalVideoTrack | None = None
        self._output_video_track = QueuedVideoTrack(
            width=video_width,
            height=video_height,
            fps=video_fps,
        )
        self._video_display: VideoDisplay | None = None
        self._connection: LocalConnection | None = None

    async def publish_tracks(
        self,
        audio_track: aiortc.MediaStreamTrack | None,
        video_track: aiortc.MediaStreamTrack | None,
    ) -> None:
        """Publish the agent's media tracks locally.

        An audio track is started only when it is a LocalOutputAudioTrack.
        A video track is forwarded into the internal output track by a
        background task; a forwarder left over from an earlier call is
        cancelled first so two forwarders never feed the same output.
        """
        if audio_track is not None and isinstance(audio_track, LocalOutputAudioTrack):
            audio_track.start()
            logger.info("Audio track published and started")

        if video_track is not None:
            if self._video_forward_task is not None:
                await cancel_and_wait(self._video_forward_task)
            self._video_forward_task = asyncio.create_task(
                self._forward_video(video_track)
            )
            logger.info("Video output track published")

    def create_audio_track(self) -> "LocalOutputAudioTrack":
        """Create an audio track that plays through the audio output backend."""
        self._audio_track = LocalOutputAudioTrack(
            audio_output=self._audio_output,
        )
        return self._audio_track

    def create_video_track(self) -> LocalVideoTrack | None:
        """Create a video track for the agent's camera input.

        Returns:
            The new camera track, or None when no camera was configured.
        """
        if self._video_input is None:
            logger.debug("No video device configured, skipping video track creation")
            return None

        self._input_video_track = LocalVideoTrack(
            device=self._video_input,
            width=self._video_width,
            height=self._video_height,
            fps=self._video_fps,
        )
        return self._input_video_track

    def add_track_subscriber(self, track_id: str) -> LocalVideoTrack | None:
        """Return the local camera video track if available."""
        if track_id == LOCAL_VIDEO_TRACK_ID and self._input_video_track is not None:
            return self._input_video_track
        return None

    async def join(
        self, agent: "Agent", call: Any = None, **kwargs: Any
    ) -> "LocalConnection":
        """Start microphone capture and optionally camera.

        ``agent``, ``call`` and ``kwargs`` are accepted but not used here.
        When a camera is configured, a TrackAddedEvent is emitted for it.

        Returns:
            A LocalConnection bound to this transport.
        """
        await self._start_audio()

        if self._video_input is not None:
            video_track = self.create_video_track()
            if video_track is not None:
                self.events.send(
                    TrackAddedEvent(
                        plugin_name=PLUGIN_NAME,
                        track_id=LOCAL_VIDEO_TRACK_ID,
                        track_type=TrackType.VIDEO,
                        participant=self._participant,
                    )
                )
                logger.info("Camera video track added")

        self._connection = LocalConnection(self)
        return self._connection

    async def close(self) -> None:
        """Stop audio/video and release all resources.

        Audio and camera devices are released even when stopping the video
        forwarder or the display raises; that error still propagates.
        """
        try:
            if self._video_forward_task is not None:
                await cancel_and_wait(self._video_forward_task)
                self._video_forward_task = None

            self._output_video_track.stop()

            if self._video_display is not None:
                await self._video_display.stop()
                self._video_display = None
        finally:
            await self._stop_audio()
            self._connection = None

    async def authenticate(self, user: User) -> None:
        """Accept any user: local transport does not require any auth."""
        return

    def open_demo(self, *args: Any, **kwargs: Any) -> None:
        """Do nothing; the local transport has no demo to open here."""

    async def open_demo_for_agent(
        self, agent: "Agent", call_type: str, call_id: str
    ) -> None:
        """Open a tkinter window showing the agent's video output.

        Skipped when the agent does not publish video. ``call_type`` and
        ``call_id`` are accepted but not used.
        """
        if not agent.publish_video:
            logger.info("Agent has no video output, skipping video display")
            return

        try:
            self._video_display = VideoDisplay(
                title="Agent Video Output",
                width=self._video_width,
                height=self._video_height,
                fps=self._video_fps,
            )
            await self._video_display.start(self._output_video_track)
            logger.info("Opened video display")
        except RuntimeError:
            # NOTE(review): VideoDisplay.start() in this package warns and
            # returns when tkinter is missing instead of raising, so this
            # handler looks like a safety net only — confirm.
            logger.warning(
                "Cannot open video display: tkinter is not available. "
                "Install python3-tk or equivalent for your platform."
            )

    async def create_call(self, call_id: str, **kwargs: Any) -> LocalCall:
        """Return a LocalCall carrying ``call_id``; ``kwargs`` are ignored."""
        return LocalCall(id=call_id)

    async def send_custom_event(self, data: dict[str, Any]) -> None:
        """Not supported by the local transport.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("LocalEdge does not support send_custom_event")

    async def create_conversation(
        self, call: Any, user: User, instructions: str
    ) -> InMemoryConversation:
        """Create an empty in-memory conversation with the given instructions."""
        return InMemoryConversation(instructions=instructions, messages=[])

    def _emit_audio_event(self, data: np.ndarray) -> None:
        """Convert raw numpy audio to PcmData and emit AudioReceivedEvent."""
        samples = data.flatten().astype(np.int16)
        pcm = PcmData(
            samples=samples,
            sample_rate=self._audio_input.sample_rate,
            format=AudioFormat.S16,
            channels=self._audio_input.channels,
        )
        pcm.participant = self._participant

        self.events.send(
            AudioReceivedEvent(
                plugin_name=PLUGIN_NAME,
                pcm_data=pcm,
                participant=self._participant,
            )
        )

    async def _forward_video(self, source: aiortc.MediaStreamTrack) -> None:
        """Read frames from source track and push them to the output track."""
        try:
            while True:
                frame = cast(av.VideoFrame, await source.recv())
                await self._output_video_track.add_frame(frame)
        except asyncio.CancelledError:
            raise
        except aiortc.MediaStreamError:
            logger.debug("Source video track ended")

    async def _mic_loop(self) -> None:
        """Read mic data via asyncio.to_thread and emit audio events."""
        try:
            while True:
                # read() blocks for up to its own timeout, so it runs in a
                # worker thread; None means no block arrived in time.
                data = await asyncio.to_thread(self._audio_input.read)
                if data is not None:
                    self._emit_audio_event(data)
        except asyncio.CancelledError:
            logger.debug("Mic loop cancelled")
            raise

    async def _start_audio(self) -> None:
        """Start microphone capture via the audio input backend.

        Does nothing when the microphone task already exists.
        """
        if self._mic_task is not None:
            return

        self._audio_input.start()
        logger.info(
            "Started microphone: %dHz, %d channels",
            self._audio_input.sample_rate,
            self._audio_input.channels,
        )
        self._mic_task = asyncio.create_task(self._mic_loop())

    async def _stop_audio(self) -> None:
        """Stop all audio and video streams.

        A microphone task that already died with an error is logged rather
        than re-raised, so the devices below are always released.
        """
        mic_task, self._mic_task = self._mic_task, None
        if mic_task is not None:
            mic_task.cancel()
            try:
                await mic_task
            except asyncio.CancelledError:
                pass
            except Exception:
                logger.exception("Microphone loop terminated with an error")

        self._audio_input.stop()
        logger.info("Stopped microphone")

        if self._audio_track is not None:
            self._audio_track.stop()

        if self._input_video_track is not None:
            self._input_video_track.stop()
            self._input_video_track = None
287
+
288
+
289
class LocalConnection(Connection):
    """Connection handle returned by ``LocalEdge.join``."""

    def __init__(self, transport: "LocalEdge"):
        """Remember the transport that ``close`` will shut down."""
        super().__init__()
        self._transport = transport

    def idle_since(self) -> float:
        """Always report 0.0: the local transport is never idle."""
        return 0.0

    async def wait_for_participant(self, timeout: float | None = None) -> None:
        """Return at once; the local user is always present."""
        return None

    async def close(self, timeout: float = 2.0) -> None:
        """Close the owning transport; ``timeout`` is accepted but unused."""
        await self._transport.close()
@@ -0,0 +1,263 @@
1
+ """
2
+ LocalTransport: audio/video track implementations.
3
+
4
+ Provides LocalOutputAudioTrack for speaker playback and LocalVideoTrack
5
+ for camera capture, enabling vision agents to run locally without cloud
6
+ edge infrastructure.
7
+ """
8
+
9
+ import asyncio
10
+ import logging
11
+ import platform
12
+ import threading
13
+ import time
14
+ from fractions import Fraction
15
+ from typing import Any
16
+
17
+ import av
18
+ import numpy as np
19
+ import sounddevice as sd
20
+ from aiortc import AudioStreamTrack, VideoStreamTrack
21
+ from getstream.video.rtc.track_util import PcmData
22
+
23
+ from .devices import AudioOutputDevice
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ def _get_camera_input_format() -> str:
29
+ """Get the FFmpeg input format for the current platform."""
30
+ system = platform.system()
31
+ if system == "Darwin":
32
+ return "avfoundation"
33
+ elif system == "Linux":
34
+ return "v4l2"
35
+ elif system == "Windows":
36
+ return "dshow"
37
+ else:
38
+ raise RuntimeError(f"Unsupported platform for camera capture: {system}")
39
+
40
+
41
class LocalOutputAudioTrack(AudioStreamTrack):
    """Audio track that plays PcmData through an AudioOutputDevice.

    Uses an asyncio.Queue for backpressure: when the queue is full,
    ``write`` awaits until the playback task drains an item. The playback
    task offloads blocking device writes via ``asyncio.to_thread``.

    Extends AudioStreamTrack so it satisfies the MediaStreamTrack interface
    required by EdgeTransport.publish_tracks. Since this is a write-only
    (playback) track, recv() is not supported.
    """

    def __init__(self, audio_output: AudioOutputDevice, buffer_limit: int = 20):
        """Create a stopped track.

        Args:
            audio_output: Backend that receives the processed samples.
            buffer_limit: Maximum number of queued chunks before ``write``
                starts waiting.
        """
        super().__init__()
        self._audio_output = audio_output
        self._queue: asyncio.Queue[np.ndarray] = asyncio.Queue(maxsize=buffer_limit)
        self._running = False
        self._playback_task: asyncio.Task[None] | None = None
        # Serialises writers with flush() so a flush cannot interleave with
        # a chunk that is being processed and enqueued.
        self._write_lock = asyncio.Lock()

    async def recv(self) -> av.AudioFrame:
        """Not supported — this is a write-only playback track."""
        raise NotImplementedError(
            "LocalOutputAudioTrack is a playback-only track; recv() is not supported"
        )

    def start(self) -> None:
        """Start the audio output and the playback task (no-op if running)."""
        if self._running:
            return

        self._audio_output.start()
        self._running = True
        self._playback_task = asyncio.create_task(self._playback_loop())

    async def write(self, data: PcmData) -> None:
        """Queue PCM data for playback; dropped silently when not running."""
        if not self._running:
            return

        async with self._write_lock:
            samples = self._process_audio(data)
            await self._queue.put(samples)

    async def flush(self) -> None:
        """Clear any pending audio data and flush the output backend."""
        async with self._write_lock:
            self._drain_queue()
            self._audio_output.flush()

    def stop(self) -> None:
        """Stop the track: cancel playback, drop queued audio, stop the output."""
        super().stop()
        self._running = False

        if self._playback_task is not None:
            self._playback_task.cancel()
            self._playback_task = None

        self._drain_queue()
        self._audio_output.stop()

    def _drain_queue(self) -> None:
        """Discard every chunk currently waiting in the queue."""
        while True:
            try:
                self._queue.get_nowait()
            except asyncio.QueueEmpty:
                return

    async def _playback_loop(self) -> None:
        """Async task that drains the queue into the AudioOutput backend.

        Errors raised while writing one chunk are logged and that chunk is
        dropped, but the loop keeps running. If the loop exited on such an
        error while ``_running`` stayed True, nothing would consume the
        queue and ``write`` would block forever once it filled up.
        """
        try:
            while True:
                data = await self._queue.get()
                try:
                    await asyncio.to_thread(self._audio_output.write, data)
                except sd.PortAudioError as err:
                    logger.debug("PortAudio playback error: %s", err)
                except ValueError:
                    logger.exception("Audio data processing error")
                except OSError:
                    logger.exception("Audio playback device error")
        except asyncio.CancelledError:
            logger.debug("Playback loop cancelled")
            raise

    def _process_audio(self, data: PcmData) -> np.ndarray:
        """Resample and convert PcmData to flat int16 numpy for the backend."""
        target_rate = self._audio_output.sample_rate
        target_channels = self._audio_output.channels

        if data.sample_rate != target_rate or data.channels != target_channels:
            data = data.resample(target_rate, target_channels)

        samples = data.to_int16().samples

        if samples.ndim == 2:
            # Transpose before flattening so the 2-D buffer becomes one flat
            # array. NOTE(review): presumably (channels, samples) turning
            # into interleaved frames — confirm against PcmData.
            samples = samples.T.flatten()

        return samples
143
+
144
+
145
class LocalVideoTrack(VideoStreamTrack):
    """Video track that captures from local camera using PyAV.

    The camera is opened lazily on the first ``recv`` call. Blocking PyAV
    calls run in worker threads via ``asyncio.to_thread``; ``_lock`` guards
    the container so ``stop`` cannot close it while a read is in progress.
    """

    kind = "video"

    def __init__(
        self,
        device: str,
        width: int = 640,
        height: int = 480,
        fps: int = 30,
    ):
        """Store capture settings; the device is not opened here.

        Args:
            device: Device identifier passed to ``av.open``.
            width: Requested frame width in pixels.
            height: Requested frame height in pixels.
            fps: Requested frame rate; also used as the frame time base.
        """
        super().__init__()

        self._device = device
        self._width = width
        self._height = height
        self._fps = fps
        self._container: Any = None
        self._stream: Any = None
        self._started = False
        self._stopped = False
        self._frame_count = 0
        self._start_time: float | None = None
        self._lock = threading.Lock()

    def _open_camera(self) -> None:
        """Open the camera device with PyAV (blocking).

        Raises:
            RuntimeError: If the current platform has no capture input format.
        """
        # Raises for unsupported platforms, so no further platform check is
        # needed below.
        input_format = _get_camera_input_format()

        options: dict[str, str] = {
            "framerate": str(self._fps),
            "video_size": f"{self._width}x{self._height}",
        }
        if platform.system() == "Darwin":
            options["pixel_format"] = "uyvy422"

        container = av.open(
            self._device,
            format=input_format,
            options=options,
        )
        stream = container.streams.video[0]

        with self._lock:
            if self._stopped:
                # stop() ran while the device was being opened; nothing would
                # close this container later, so release it here.
                container.close()
                return
            self._container = container
            self._stream = stream

        logger.info(
            "Opened camera: %s (%dx%d @ %dfps)",
            self._device,
            self._width,
            self._height,
            self._fps,
        )

    def _read_frame(self, max_retries: int = 20, retry_timeout: float = 0.02) -> Any:
        """Read a single frame from the camera (blocking).

        Args:
            max_retries: Number of attempts before giving up.
            retry_timeout: Seconds to sleep after a ``BlockingIOError``.

        Returns:
            The first decoded frame, or ``None`` when the camera is not
            open, no frame became available, or reading failed.
        """
        for attempt in range(max_retries):
            # Hold the lock only for the PyAV calls, never across the sleep,
            # so stop() waits for at most one read attempt.
            with self._lock:
                if self._container is None:
                    return None
                try:
                    for packet in self._container.demux(self._stream):
                        for frame in packet.decode():
                            return frame
                    # The stream yielded no frame this time; try again.
                    continue
                except BlockingIOError:
                    out_of_retries = attempt >= max_retries - 1
                except OSError:
                    logger.warning("Error reading camera frame", exc_info=True)
                    return None

            if out_of_retries:
                logger.debug("Camera not ready after %d retries", max_retries)
                return None
            time.sleep(retry_timeout)
        return None

    async def recv(self) -> av.VideoFrame:
        """Receive the next video frame.

        Opens the camera on the first call. When no frame can be read, a
        black ``rgb24`` frame of the configured size is returned instead.

        Raises:
            RuntimeError: If the track has been stopped.
        """
        if self._stopped:
            raise RuntimeError("Track has been stopped")

        if not self._started:
            self._started = True
            self._start_time = time.time()

            await asyncio.to_thread(self._open_camera)

        frame = await asyncio.to_thread(self._read_frame)

        if frame is None:
            # Build the black frame from an array so PyAV copies it row by
            # row; writing width*height*3 raw bytes into the plane fails when
            # the plane's line size is padded.
            blank = np.zeros((self._height, self._width, 3), dtype=np.uint8)
            frame = av.VideoFrame.from_ndarray(blank, format="rgb24")

        self._frame_count += 1
        frame.pts = self._frame_count
        frame.time_base = Fraction(1, self._fps)
        return frame

    def stop(self) -> None:
        """Stop camera capture and release resources."""
        # Mark the track as ended in the base class as well, matching
        # LocalOutputAudioTrack.stop().
        super().stop()
        with self._lock:
            self._stopped = True
            if self._container is not None:
                try:
                    self._container.close()
                except OSError:
                    logger.warning("Error closing camera", exc_info=True)
                self._container = None
                self._stream = None
            logger.info("Stopped camera capture")
@@ -0,0 +1,86 @@
1
+ import sys
2
+ from typing import Callable, TypeVar
3
+
4
if sys.platform != "win32":
    import termios

    def safe_input(prompt: str) -> str:
        """Call input() after ensuring the terminal translates CR to NL.

        PortAudio (via sounddevice) can disable the ICRNL terminal flag,
        which causes Enter (CR) to show as ^M instead of submitting input.

        The terminal repair is best-effort: if the terminal attributes
        cannot be read or written, the prompt is still shown.
        """
        try:
            if sys.stdin.isatty():
                fd = sys.stdin.fileno()
                attrs = termios.tcgetattr(fd)
                # attrs[0] is the input-mode flag word, where ICRNL lives.
                if not (attrs[0] & termios.ICRNL):
                    attrs[0] |= termios.ICRNL
                    termios.tcsetattr(fd, termios.TCSANOW, attrs)
        except (termios.error, OSError):
            # Deliberately ignored: failing to fix the flag must not stop
            # the user from being prompted.
            pass
        return input(prompt)

else:

    def safe_input(prompt: str) -> str:
        """Call input(); no terminal fix-up is applied on Windows."""
        return input(prompt)
25
+
26
+
27
+ T = TypeVar("T")
28
+
29
+
30
def prompt_selection(
    items: list[T],
    formatter: Callable[[T], str],
    header: str,
    default: T | None = None,
    allow_skip: bool = False,
    empty_message: str | None = None,
) -> T | None:
    """Ask the user on the terminal to choose one entry from ``items``.

    Args:
        items: Candidates, listed with their zero-based index.
        formatter: Turns a candidate into the text shown to the user.
        header: Title printed between the two banner lines.
        default: Returned when the user submits an empty line (if not None).
        allow_skip: When true, ``n`` (or an empty line with no default)
            returns ``None``.
        empty_message: Printed when ``items`` is empty.

    Returns:
        The chosen item, ``default``, or ``None`` when skipped or when
        ``items`` is empty. Invalid input re-prompts.
    """
    banner = "=" * 50
    divider = "-" * 50

    print("\n" + banner)
    print(header)
    print(banner)

    if not items:
        if empty_message:
            print(f" {empty_message}")
        print(divider + "\n")
        return None

    for position, entry in enumerate(items):
        print(f" {position}: {formatter(entry)}")

    if allow_skip:
        print(" n: Skip (none)")

    print(divider)

    # The prompt wording depends only on the arguments, so build it once.
    last_index = len(items) - 1
    if allow_skip:
        request = f"Select [0-{last_index}] or 'n' to skip: "
    elif default is not None:
        request = f"Select [0-{last_index}] (Enter for default): "
    else:
        request = f"Select [0-{last_index}]: "

    while True:
        try:
            answer = safe_input(request).strip().lower()

            if default is not None and answer == "":
                print(f" -> Using default: {formatter(default)}")
                return default

            if allow_skip and answer in ("n", ""):
                print(" -> No selection")
                print(divider + "\n")
                return None

            picked_index = int(answer)
            if 0 <= picked_index < len(items):
                picked = items[picked_index]
                print(f" -> Selected: {formatter(picked)}")
                print(divider + "\n")
                return picked

            print(f" Invalid choice, enter 0-{last_index}")
        except ValueError:
            skip_hint = " or 'n'" if allow_skip else ""
            print(" Please enter a number" + skip_hint)