voxa-code 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
server/__init__.py ADDED
File without changes
server/apns.py ADDED
@@ -0,0 +1,89 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+
6
+ import httpx
7
+ import jwt
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def build_apns_jwt(key_pem: str, key_id: str, team_id: str, issued_at: int) -> str:
    """Sign an ES256 JWT for APNs.

    The claims carry ``team_id`` as ``iss`` and ``issued_at`` as ``iat``; the
    JOSE header names the signing key through ``kid``.
    """
    claims = {"iss": team_id, "iat": issued_at}
    jose_header = {"alg": "ES256", "kid": key_id}
    return jwt.encode(claims, key_pem, algorithm="ES256", headers=jose_header)
19
+
20
+
21
def build_voip_payload(call_id: str, summary: str) -> dict:
    """Build the push body for a VoIP ring: call id, summary and the ``aps`` dict."""
    payload = dict(call_id=call_id, summary=summary)
    payload["aps"] = {"content-available": 1}
    return payload
23
+
24
+
25
def build_cancel_payload(call_id: str) -> dict:
    """Build the push body that cancels the ring for ``call_id``."""
    payload = dict(call_id=call_id, type="cancel")
    payload["aps"] = {"content-available": 1}
    return payload
27
+
28
+
29
class ApnsClient:
    """Sends VoIP pushes via APNs HTTP/2. One per server process.

    ``config`` must expose ``apns_key_id``, ``apns_team_id``, ``apns_bundle_id``
    and either ``apns_key`` (PEM text) or ``apns_key_path`` (file holding it).
    A truthy ``apns_sandbox`` attribute selects the sandbox host. The signed JWT
    is cached and re-signed once it is older than ``TOKEN_TTL`` seconds.
    """

    PROD_HOST = "https://api.push.apple.com"
    SANDBOX_HOST = "https://api.sandbox.push.apple.com"
    # Seconds a cached JWT is reused before a fresh one is signed.
    TOKEN_TTL = 50 * 60

    def __init__(self, config, now_fn=None):
        """Store ``config`` and pick the host; ``now_fn`` overrides the clock
        (it must return the current epoch time in whole seconds)."""
        import time

        self._cfg = config
        # Xcode/dev-signed builds get sandbox push tokens, which only work
        # against the sandbox host; TestFlight/App Store builds use production.
        self._host = self.SANDBOX_HOST if getattr(config, "apns_sandbox", False) else self.PROD_HOST
        self._now = now_fn or (lambda: int(time.time()))
        self._jwt = ""
        self._jwt_at = 0

    def _load_key(self) -> str:
        """Return the signing key PEM text.

        Prefer the key contents (set as a secret on container hosts); fall
        back to a file path for local/dev use.
        """
        key_pem = getattr(self._cfg, "apns_key", "")
        if key_pem:
            return key_pem
        # Context manager so the file handle is closed deterministically.
        with open(self._cfg.apns_key_path, encoding="utf-8") as key_file:
            return key_file.read()

    def _token(self) -> str:
        """Return the cached JWT, signing a new one when missing or stale."""
        now = self._now()
        if not self._jwt or now - self._jwt_at > self.TOKEN_TTL:
            self._jwt = build_apns_jwt(
                self._load_key(), self._cfg.apns_key_id, self._cfg.apns_team_id, now
            )
            self._jwt_at = now
        return self._jwt

    def _headers(self) -> dict:
        """Request headers shared by every VoIP push (ring and cancel)."""
        return {
            "apns-topic": f"{self._cfg.apns_bundle_id}.voip",
            "apns-push-type": "voip",
            "apns-priority": "10",
            "authorization": f"bearer {self._token()}",
        }

    async def _post(self, device_token: str, payload: dict):
        """POST ``payload`` as JSON to the device endpoint and return the response."""
        url = f"{self._host}/3/device/{device_token}"
        headers = self._headers()
        async with httpx.AsyncClient(http2=True, timeout=10) as client:
            return await client.post(url, headers=headers, content=json.dumps(payload))

    async def send_voip(self, device_token: str, call_id: str, summary: str) -> bool | int:
        """Send a VoIP ring. Returns True on success, or the HTTP status code on
        failure (so the caller can prune a 410 Gone / dead token)."""
        resp = await self._post(device_token, build_voip_payload(call_id, summary))
        if resp.status_code != 200:
            # 410 = the token is dead (app deleted/reinstalled); other codes are
            # transient/config. Log the reason so silent no-rings are diagnosable.
            logger.warning("APNs voip push failed: status=%s body=%s token=%s",
                           resp.status_code, resp.text[:200], device_token[:8])
            return resp.status_code
        return True

    async def send_voip_cancel(self, device_token: str, call_id: str) -> bool:
        """Send the cancel push for ``call_id``. Returns True only on HTTP 200."""
        resp = await self._post(device_token, build_cancel_payload(call_id))
        if resp.status_code != 200:
            # Mirror send_voip: leave a trace when a cancel does not go through.
            logger.warning("APNs voip cancel failed: status=%s body=%s token=%s",
                           resp.status_code, resp.text[:200], device_token[:8])
            return False
        return True
server/app.py ADDED
@@ -0,0 +1,589 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import os
7
+ import tempfile
8
+ import time
9
+ from contextlib import asynccontextmanager
10
+ from pathlib import Path
11
+
12
+ from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
13
+ from fastapi.responses import HTMLResponse, JSONResponse
14
+
15
+ from dotenv import load_dotenv
16
+
17
+ from server.config import Config, load_config
18
+ from server.claude_controller import ClaudeController
19
+ from server.tmux_controller import TmuxController
20
+ from server.orchestrator import Orchestrator
21
+
22
+ STATIC = Path(__file__).resolve().parent.parent / "static"
23
+
24
+
25
def should_suppress_greeting(pending_updates: list) -> bool:
    """Report whether any update is queued for relay on answer.

    When one is, the generic opening is suppressed so the contextual update is
    the only thing spoken as the call is answered.
    """
    return True if pending_updates else False
29
+
30
+
31
+ def _strip_finished_prefix(summary: str) -> str:
32
+ """Turn a finish summary ('<project> finished: <result>') into just the result,
33
+ since the opening phrases the 'finished' part itself. '<project> finished' with no
34
+ result becomes ''. Other summaries (e.g. 'needs input: ...') pass through."""
35
+ s = (summary or "").strip()
36
+ low = s.lower()
37
+ i = low.find("finished:")
38
+ if i != -1:
39
+ return s[i + len("finished:"):].strip()
40
+ if low.endswith(" finished") or low == "finished":
41
+ return ""
42
+ return s
43
+
44
+
45
def compose_opening(project: str, updates: list) -> str:
    """Voxa's spoken opening when a call is answered: lead with the project and what
    its last task actually did, then ask what's next, instead of a bare greeting
    followed by the raw update. `project` is '' when we couldn't attach to a folder.

    `updates` is the list of queued summaries (may be empty or None); blank
    entries are ignored. Returns the full sentence to speak.
    """
    results = []
    for update in updates or []:
        if not (update and update.strip()):
            continue
        result = _strip_finished_prefix(update)
        if result:  # a bare '<project> finished' strips to '' and adds nothing to say
            results.append(result)
    detail = " ".join(results).strip()
    # A summary may already end with terminal punctuation; appending another
    # period would produce "done.." or "done!." in the spoken text.
    closer = "" if detail.endswith((".", "!", "?")) else "."
    if project and detail:
        body = f"Your last task in {project} just finished. Here's what it did: {detail}{closer}"
    elif project:
        body = f"You're back in {project} — your last task there just finished."
    elif detail:
        body = f"Your last task just finished. Here's what it did: {detail}{closer}"
    else:
        body = "You're back."
    return f"Hi. {body} What would you like to do next?"
59
+
60
+
61
def suppress_greeting_if_supported(operator) -> bool:
    """Call ``operator.suppress_greeting()`` when the operator offers it.

    Operators without a callable ``suppress_greeting`` (the metered
    RemoteOperator greets cloud-side) are left alone rather than raising,
    which would kill the answer flow. Returns True only when the call was made.
    """
    hook = getattr(operator, "suppress_greeting", None)
    if not callable(hook):
        return False
    hook()
    return True
70
+
71
+
72
def apply_greeting_suppression(operator, pending_updates: list) -> bool:
    """Suppress the operator's generic opening only when an update is queued.

    With nothing queued this returns False without touching the operator.
    Otherwise it defers to ``suppress_greeting_if_supported``, so an operator
    lacking ``suppress_greeting`` (the metered RemoteOperator) never raises.
    """
    if should_suppress_greeting(pending_updates):
        return suppress_greeting_if_supported(operator)
    return False
79
+
80
+
81
def _default_operator_factory(config, handle_tool_call, voice="", account=""):
    """Build the voice operator for one connection.

    Direct mode (no ``VOXA_LIVE_PROXY``): talk to Gemini locally with your own
    key. Metered mode: route V2V through the cloud /live proxy, where the key
    and minute metering live.
    """
    env = os.environ
    proxy_url = env.get("VOXA_LIVE_PROXY", "").strip()
    if not proxy_url:
        from server.gemini_operator import GeminiOperator
        return GeminiOperator(config, handle_tool_call, voice=voice)

    from server.remote_operator import RemoteOperator
    # Account precedence: the paired phone's id (per-connection) wins, so each
    # phone meters its own balance; fall back to env/auth_token for solo runs.
    billing_account = account or env.get("VOXA_ACCOUNT", "") or config.auth_token
    return RemoteOperator(
        config,
        handle_tool_call,
        proxy_url=proxy_url,
        account=billing_account,
        token=env.get("VOXA_PROXY_TOKEN", ""),
        voice=voice,
    )
95
+
96
+
97
+ def create_app(config: Config | None = None, operator_factory=None) -> FastAPI:
98
+ if config is None:
99
+ load_dotenv()
100
+ config = load_config()
101
+ operator_factory = operator_factory or _default_operator_factory
102
+ # "attach" = visible interactive claude in a tmux/Terminal you can also type in;
103
+ # "drive" = headless SDK session with a read-only watch log.
104
+ mode = os.environ.get("VOXA_MODE", "attach").strip().lower()
105
+ app = FastAPI()
106
+
107
+ from server.device_registry import DeviceRegistry
108
+ from server.call_manager import CallManager
109
+ registry = DeviceRegistry(os.environ.get("VOXA_DEVICES_FILE", "devices.json"))
110
+ if config.push_enabled:
111
+ from server.apns import ApnsClient
112
+ pusher = ApnsClient(config)
113
+ else:
114
+ class _NoPush:
115
+ async def send_voip(self, *a, **k):
116
+ logging.warning("push disabled; dropping call %r", a)
117
+ return False
118
+ pusher = _NoPush()
119
+ call_manager = CallManager(pusher, registry)
120
+ app.state.registry = registry
121
+ app.state.call_manager = call_manager
122
+
123
def _check(request: Request):
    """Return True when the request's ``token`` query parameter matches
    ``config.auth_token``."""
    import hmac

    supplied = request.query_params.get("token")
    expected = config.auth_token
    if isinstance(supplied, str) and isinstance(expected, str):
        # Constant-time comparison so response timing does not reveal how many
        # leading characters of the token were guessed correctly.
        return hmac.compare_digest(
            supplied.encode("utf-8", "surrogatepass"),
            expected.encode("utf-8", "surrogatepass"),
        )
    # Missing token / non-string config: same answer plain equality gave before.
    return supplied == expected
125
+
126
+ from server.push_routes import add_push_routes
127
+ add_push_routes(app, registry, call_manager, _check)
128
+
129
+ app.state.turn_start = {} # session_id -> turn start (Claude Code UserPromptSubmit)
130
+ app.state.hook_last = {} # session_id -> last announced time (debounce)
131
+ app.state.hooks_live = False # flips true once a real Claude Code hook arrives
132
+
133
async def _ring_via_cloud(summary):
    """Ask the relay at ``VOXA_RELAY_URL`` to ring the last-paired account.

    Does nothing when no relay is configured or no account has paired yet.
    Failures are logged, never raised.
    """
    # The laptop holds no APNs key (zero-config); ask the cloud to ring the
    # last-paired account's phone. The cloud has the key + device registry.
    relay_base = os.environ.get("VOXA_RELAY_URL", "").strip().rstrip("/")
    paired_account = getattr(app.state, "last_account", "")
    if not (relay_base and paired_account):
        return
    try:
        import httpx
        async with httpx.AsyncClient(timeout=10) as http:
            # Account-scoped: the unguessable account id is the authorization
            # (the cloud has no per-laptop token it could verify here).
            body = {"account": paired_account, "summary": summary}
            await http.post(f"{relay_base}/notify", json=body)
    except Exception:
        logging.exception("ring via cloud failed")
148
+
149
async def report(summary: str):
    """Surface a background/hook update, depending on where the phone is.

    - Line attached (a metered session is live): stay silent; that session
      narrates its own result on the line, so we don't talk over it.
    - App open but no line yet (connected for setup, not started): queue the
      update so it's spoken when the user taps Start, but don't ring.
    - App closed: queue + ring (CallKit / cloud)."""
    if call_manager.line_open:
        return
    app_is_open = getattr(app.state, "phone_clients", 0) > 0
    if app_is_open:
        # Spoken on begin/attach; no ring while app is open.
        call_manager.queue(summary)
        return
    await call_manager.on_update(summary)
    if config.push_enabled:
        # Avoid a double call: with a local APNs key, on_update already rang.
        return
    await _ring_via_cloud(summary)
166
+
167
def _stand_down_watcher():
    """Mark hooks as live, stop the background watcher task and turn off the
    hub's offline ring."""
    # The first real Claude Code hook event proves hooks are live: stop the screen
    # scraper, and make hooks the SOLE offline-ring source (so the driven session's
    # own monitor doesn't also ring) — the two would otherwise double-report.
    state = app.state
    state.hooks_live = True
    scraper = getattr(state, "bg_watcher", None)
    if not (scraper is None or scraper.done()):
        scraper.cancel()
        state.bg_watcher = None
    session_hub = getattr(state, "hub", None)
    if session_hub is None:
        return
    session_hub.set_offline_ring(False)
179
+
180
@app.post("/hook")
async def claude_hook(request: Request):
    """Receive a Claude Code hook event and report it when ``route_hook`` returns a message.

    Responds 401 on a bad token. Otherwise always answers ``{"ok": True}``,
    including for a body that is not valid JSON.
    """
    # Claude Code Stop / Notification / UserPromptSubmit hooks POST their stdin JSON
    # here (installed globally by the voxa launcher). This is the reliable,
    # terminal-agnostic signal that a session finished or needs input.
    import hmac

    supplied = request.query_params.get("token")
    expected = config.auth_token
    if isinstance(supplied, str) and isinstance(expected, str):
        # Constant-time comparison: do not leak token prefixes through timing.
        authorized = hmac.compare_digest(
            supplied.encode("utf-8", "surrogatepass"),
            expected.encode("utf-8", "surrogatepass"),
        )
    else:
        authorized = supplied == expected
    if not authorized:
        return JSONResponse({"ok": False}, status_code=401)
    try:
        body = await request.json()
    except Exception:
        return {"ok": True}
    # Valid JSON that is not an object (a list, a string, a number) has no
    # fields to read; treat it like an empty event instead of failing on .get().
    payload = body if isinstance(body, dict) else {}
    from server.hooks import route_hook
    _stand_down_watcher()
    # Default 0 = call on EVERY finish (matches "call me when Claude finishes"). Set
    # VOXA_HOOK_MIN_SECONDS to a positive value to suppress quick interactive turns
    # (only call for tasks that took at least that many seconds).
    raw_min_seconds = os.environ.get("VOXA_HOOK_MIN_SECONDS", "0")
    try:
        min_seconds = float(raw_min_seconds)
    except ValueError:
        # A typo in the environment must not turn every hook into a 500.
        logging.warning("ignoring invalid VOXA_HOOK_MIN_SECONDS=%r", raw_min_seconds)
        min_seconds = 0.0
    msg = route_hook(
        payload,
        turn_start=app.state.turn_start,
        hook_last=app.state.hook_last,
        now=time.monotonic(),
        min_seconds=min_seconds,
    )
    if msg:
        # Remember WHICH session triggered this call so answering attaches to it
        # and continues that work (instead of opening an empty default session).
        cwd = payload.get("cwd", "")
        if cwd:
            app.state.pending_source = {"cwd": cwd}
        await report(msg)
    return {"ok": True}
211
+
212
+ # Background watcher: ring the phone when ANY open Claude terminal finishes,
213
+ # not just the one Voxa is attached to. Routes through the hub so it speaks on
214
+ # the line when a phone is connected and rings (CallKit) when it isn't. Off by
215
+ # setting VOXA_WATCH_TERMINALS=0.
216
+ if os.environ.get("VOXA_WATCH_TERMINALS", "1").strip() not in ("0", "false", ""):
217
+ from server.terminal_watcher import TerminalWatcher
218
+
219
+ # Fallback screen-scraper for terminals without hooks. It stands down (see
220
+ # _stand_down_watcher) the moment a real Claude Code hook arrives, so the hook
221
+ # path and the scraper never double-report the same finish.
222
async def _on_bg_done(label, cwd, summary):
    """Report that a watched terminal finished, naming it by ``label`` and
    appending ``summary`` when there is one. ``cwd`` is not used here."""
    who = label or "a terminal"
    tail = f": {summary}" if summary else ""
    await report(f"{who} finished{tail}")
225
+
226
async def _on_bg_resumed(label, cwd):
    """Cancel an outstanding ring, locally when push is enabled, otherwise
    through the relay. Neither argument is used here."""
    # The user picked the task back up on the laptop before answering: cancel
    # the ring so the phone stops buzzing for something already handled.
    paired_account = getattr(app.state, "last_account", "")
    if config.push_enabled:
        await call_manager.cancel(paired_account or None)
        return
    relay_base = os.environ.get("VOXA_RELAY_URL", "").strip().rstrip("/")
    if not (relay_base and paired_account):
        return
    try:
        import httpx
        async with httpx.AsyncClient(timeout=10) as http:
            body = {"account": paired_account, "cancel": True}
            await http.post(f"{relay_base}/notify", json=body)
    except Exception:
        logging.exception("cancel via cloud failed")
242
+
243
def _skip(session):
    """Return True when ``session`` is the terminal currently being driven
    while a phone line is open."""
    # The terminal we're actively driving is reported by the main loop;
    # skip it here only while a phone line is open (to avoid double-report).
    driven = getattr(app.state, "controller", None)
    if not call_manager.line_open:
        return False
    if driven is None:
        return False
    return bool(getattr(driven, "working_dir", None) == session.get("cwd"))
249
+
250
+ watcher = TerminalWatcher(_on_bg_done, on_resumed=_on_bg_resumed, should_skip=_skip)
251
+
252
@app.on_event("startup")
async def _start_watcher():
    """Schedule the terminal watcher and keep its task on ``app.state``."""
    watcher_task = asyncio.ensure_future(watcher.run())
    app.state.bg_watcher = watcher_task
255
+
256
@app.on_event("shutdown")
async def _stop_watcher():
    """Cancel the terminal watcher task, if one is recorded on ``app.state``."""
    watcher_task = getattr(app.state, "bg_watcher", None)
    if not watcher_task:
        return
    watcher_task.cancel()
261
+
262
@app.get("/healthz")
async def healthz():
    """Health endpoint: always reports ok."""
    status = {"ok": True}
    return status
265
+
266
@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve the contents of ``index.html`` from the static directory."""
    # Decode explicitly: without an encoding, read_text() uses the locale's
    # default, which differs between hosts.
    # NOTE(review): assumes index.html is stored as UTF-8 — confirm.
    return (STATIC / "index.html").read_text(encoding="utf-8")
269
+
270
@app.websocket("/ws")
async def ws(websocket: WebSocket):
    """Authenticate a phone WebSocket, count it as connected and serve it.

    A bad token closes the socket with code 4401 before it is accepted.
    """
    import hmac

    supplied = websocket.query_params.get("token")
    expected = config.auth_token
    if isinstance(supplied, str) and isinstance(expected, str):
        # Constant-time comparison: do not leak token prefixes through timing.
        authorized = hmac.compare_digest(
            supplied.encode("utf-8", "surrogatepass"),
            expected.encode("utf-8", "surrogatepass"),
        )
    else:
        authorized = supplied == expected
    if not authorized:
        await websocket.close(code=4401)
        return
    await websocket.accept()
    logging.getLogger("voxa").info("ws: phone connected")
    # Count live connections so background/hook events know the app is OPEN
    # (don't call, surface on the line) vs CLOSED (place a call).
    app.state.phone_clients = getattr(app.state, "phone_clients", 0) + 1
    try:
        await _serve_ws(websocket)
    finally:
        # Always give the slot back, and never let the counter go negative.
        app.state.phone_clients = max(0, getattr(app.state, "phone_clients", 1) - 1)
284
+
285
async def _serve_ws(websocket: WebSocket):
    """Serve one accepted phone connection from free setup through hang-up.

    Phases: (1) build or reuse the controller and hub kept on ``app.state``;
    (2) a pre-session loop that handles control messages until the phone sends
    "begin" or audio; (3) the operator session, which runs until the operator,
    the receive loop or the idle watchdog finishes. The hub line is always
    detached on exit from phase 3.
    """
    # The controller (and the Claude session it owns) persists across
    # connections via the hub. Build it once, then reuse it so Claude keeps
    # running when the phone hangs up.
    hub = getattr(app.state, "hub", None)
    if hub is None:
        if mode == "drive":
            watch_path = os.path.join(
                tempfile.gettempdir(), f"loop-watch-{os.getpid()}.log"
            )
            controller = ClaudeController(
                watch_log_path=watch_path, launch_terminal=True
            )
        else:
            controller = TmuxController(
                launch_terminal=True,
                terminal_app=os.environ.get("VOXA_TERMINAL_APP", "auto"),
            )
        from server.session_hub import SessionHub
        hub = SessionHub(controller, call_manager)
        if getattr(app.state, "hooks_live", False):
            hub.set_offline_ring(False)  # hooks already drive offline rings
        app.state.hub = hub
        app.state.controller = controller
    else:
        controller = app.state.controller

    # Conversational-activity clock for the idle auto-disconnect (below). Raw mic
    # frames don't count (the phone streams continuously); only real speech /
    # spoken replies / a working task do.
    activity = {"t": time.monotonic()}

    def touch():
        activity["t"] = time.monotonic()

    async def speak(text):
        # `operator` is bound later by the `async with` below; this is only
        # called once the session is open.
        await operator.speak(text)

    async def notify(msg):
        if isinstance(msg, dict) and msg.get("type") == "transcript":
            touch()
        await websocket.send_json(msg)

    orchestrator = Orchestrator(controller, speak, notify)
    # Route finals through the hub (spoken when a line is attached, ring via the
    # call manager otherwise). set_final keeps this wired across controller swaps
    # when the user attaches to a different open terminal.
    orchestrator.set_final(hub.on_final)

    _log = logging.getLogger("voxa")
    _counts = {"in": 0, "out": 0}
    voice = websocket.query_params.get("voice", "")
    # The bridge appends ?account=<paired phone's id> so the metered session
    # bills that balance. Only pass it to factories that accept it.
    import inspect
    _kwargs = {"voice": voice}
    _account = websocket.query_params.get("account", "")
    if "account" in inspect.signature(operator_factory).parameters:
        _kwargs["account"] = _account
    # Remember who's paired so the background watcher can ring this account's
    # phone (via the cloud) when a terminal finishes while they're away.
    if _account:
        app.state.last_account = _account

    # Pre-session gate: do NOT open the (metered) /live voice session until the
    # user taps Start ("begin") or starts talking. Until then we only do FREE
    # setup, listing terminals and setting the folder/terminal, so idle pairing
    # and choosing a project never cost the user a minute.
    await websocket.send_json(
        {"type": "status", "status": "ready",
         "working_dir": getattr(controller, "working_dir", "") or ""})

    async def _push_terminals():
        try:
            from server.terminals import discover_claude_sessions
            sessions = await asyncio.to_thread(discover_claude_sessions)
            # Seed the orchestrator with the SAME list the phone is shown, so
            # tapping a terminal (attach_terminal by id) resolves correctly.
            orchestrator.remember_terminals(sessions)
            await websocket.send_json({"type": "terminals", "terminals": sessions})
        except Exception:
            # Best effort: the list is a convenience. Leave a trace instead of
            # failing silently, but never let it break the connection.
            _log.debug("pushing the terminal list failed", exc_info=True)

    # Hold a reference for the lifetime of this connection: the event loop only
    # keeps weak references to tasks, so an unreferenced one may be collected
    # before it finishes.
    terminals_task = asyncio.ensure_future(_push_terminals())

    first_audio = None
    while True:
        msg = await websocket.receive()
        if msg["type"] == "websocket.disconnect":
            return  # hung up during setup; nothing was attached or metered
        if msg.get("bytes") is not None:
            first_audio = msg["bytes"]  # talking implicitly begins the session
            break
        if msg.get("text"):
            try:
                data = json.loads(msg["text"])
            except ValueError:
                continue
            if not isinstance(data, dict):
                # Valid JSON that is not an object (list, string, number) has no
                # "type" to dispatch on; ignore it instead of failing on .get().
                continue
            if data.get("type") == "begin":
                break
            # Folder/terminal selection is free, handle it before metering.
            await handle_client_control(msg["text"], orchestrator, websocket, None)

    async with _as_cm(operator_factory(config, orchestrator.handle_tool_call, **_kwargs)) as operator:
        async def audio_out(pcm):
            touch()  # Gemini is speaking -> the line is active
            _counts["out"] += 1
            if _counts["out"] % 50 == 1:
                _log.info("ws: sent %d audio chunks -> phone", _counts["out"])
            await websocket.send_bytes(pcm)

        operator.set_audio_out(audio_out)
        operator.set_text_out(notify)

        # Attach this operator's voice to the line and speak anything that
        # queued up while no phone was connected.
        pending_updates = hub.attach(lambda t: operator.speak(t))
        # Everything from here runs with the line attached; detach() MUST run even on
        # cancellation (uvicorn reload/shutdown) or an exception, otherwise line_open
        # stays True and every later background finish is silently dropped.
        try:
            # If this call was triggered by a specific Claude terminal, attach to THAT
            # terminal so Voxa continues that session instead of an empty default one,
            # and OPEN with that context so it knows where you are.
            source = getattr(app.state, "pending_source", None)
            app.state.pending_source = None
            attached_folder = None
            if source and source.get("cwd"):
                try:
                    res = await orchestrator.attach_source(source["cwd"])
                    if "attached" in res:
                        controller = orchestrator.controller  # follow the swap
                        app.state.controller = controller  # persist for reconnects
                        attached_folder = (os.path.basename(source["cwd"].rstrip("/"))
                                           or source["cwd"])
                        await websocket.send_json(
                            {"type": "status",
                             "working_dir": res.get("working_dir", source["cwd"])})
                    else:
                        _log.info("auto-attach to %s skipped: %s",
                                  source["cwd"], res.get("error"))
                except Exception:
                    logging.exception("auto-attach on answer failed")

            # Voxa's opening is ALWAYS driven from here. Suppress the operator's own
            # auto-greet: on the metered path the cloud brain would otherwise race
            # ahead and speak a generic "what would you like to do?" the instant the
            # /live socket opens, before this contextual opening arrives.
            suppress_greeting_if_supported(operator)
            if attached_folder:
                opening = compose_opening(attached_folder, pending_updates)
            elif pending_updates:
                opening = compose_opening("", pending_updates)
            else:
                opening = "Hi! What would you like to work on?"
            await operator.speak(opening, immediate=True)

            # If the session began because the user started talking, don't drop
            # that first frame.
            if first_audio is not None:
                await operator.send_audio(first_audio)

            paused = False

            async def recv_loop():
                nonlocal paused
                try:
                    while True:
                        msg = await websocket.receive()
                        if msg["type"] == "websocket.disconnect":
                            break
                        if msg.get("bytes") is not None:
                            _counts["in"] += 1
                            # Read the CURRENTLY-driven controller (it swaps when the
                            # user attaches to another terminal mid-call); the loop's
                            # local `controller` would be stale after a swap.
                            cur = orchestrator.controller
                            if _counts["in"] % 50 == 1:
                                _log.info(
                                    "ws: recv %d mic frames (status=%s)",
                                    _counts["in"], getattr(cur, "status", "?"),
                                )
                            # Cost saver: while Claude is working, stop forwarding mic
                            # audio to Gemini. Gemini bills per audio token, so no
                            # audio in == no charge during the wait. Resume when idle.
                            if getattr(cur, "status", "idle") == "working":
                                paused = True
                            else:
                                if paused:
                                    paused = False
                                    await websocket.send_json(
                                        {"type": "status", "status": "listening"}
                                    )
                                await operator.send_audio(msg["bytes"])
                        elif msg.get("text"):
                            await handle_client_control(
                                msg["text"], orchestrator, websocket, operator
                            )
                except (WebSocketDisconnect, RuntimeError):
                    pass

            # Idle auto-disconnect: hang up after a quiet stretch (no speech, not
            # working) so an idle line stops burning V2V minutes. Off if 0.
            idle_timeout = float(os.environ.get("VOXA_IDLE_TIMEOUT", "180"))

            async def idle_watchdog():
                if idle_timeout <= 0:
                    return await asyncio.Event().wait()  # disabled
                while True:
                    await asyncio.sleep(5)
                    # Follow controller swaps: a mid-call attach must not leave the
                    # watchdog reading the old (idle) controller and hang up mid-task.
                    if getattr(orchestrator.controller, "status", "idle") == "working":
                        touch()  # an active task is not idle
                        continue
                    if time.monotonic() - activity["t"] > idle_timeout:
                        try:
                            await websocket.send_json(
                                {"type": "status",
                                 "status": "idle, disconnecting to save minutes"})
                        except Exception:
                            pass
                        return

            run_task = asyncio.ensure_future(operator.run())
            recv_task = asyncio.ensure_future(recv_loop())
            idle_task = asyncio.ensure_future(idle_watchdog())
            done, pending = await asyncio.wait(
                [run_task, recv_task, idle_task],
                return_when=asyncio.FIRST_COMPLETED,
            )
            for task in pending:
                task.cancel()
                try:
                    await task
                except asyncio.CancelledError:
                    pass
                except Exception:
                    logging.exception("loop task raised during cancellation")
            for task in done:
                # Task.exception() raises CancelledError for a cancelled task
                # instead of returning it, so skip those before asking.
                if task.cancelled():
                    continue
                exc = task.exception()
                if exc is not None:
                    logging.error("loop task raised: %r", exc)
        finally:
            # Detach the line but keep Claude running. Only an explicit stop_claude
            # (via the orchestrator) tears the session down. Persist the
            # currently-driven controller so the next connection reuses it.
            app.state.controller = orchestrator.controller
            _log.info("ws: phone disconnected (mic_in=%d audio_out=%d)",
                      _counts["in"], _counts["out"])
            hub.detach()
529
+
530
+ # mount static assets (js/worklet) under /static
531
+ from fastapi.staticfiles import StaticFiles
532
+ app.mount("/static", StaticFiles(directory=str(STATIC)), name="static")
533
+ return app
534
+
535
+
536
async def handle_client_control(raw: str, orchestrator, websocket, operator=None) -> None:
    """Handle a JSON control message sent by the phone (e.g. setting the folder).

    ``raw`` is the text frame. Anything that is not a JSON object, or whose
    ``type`` is not recognised, is ignored. ``operator`` is None before the
    voice session has started, in which case "say" messages are ignored too.
    """
    try:
        data = json.loads(raw)
    except (ValueError, TypeError):
        return
    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, string, number, null) carries
        # no "type"; drop it rather than raise AttributeError on .get().
        return
    mtype = data.get("type")
    if mtype == "say" and data.get("text") and operator is not None:
        await operator.send_text(data["text"])
    elif mtype == "claude_input" and data.get("text"):
        # Raw terminal chat from the phone's full-screen view: type straight into the
        # live Claude session, bypassing the voice operator.
        await orchestrator.send_direct(data["text"])
    elif mtype == "claude_scrollback":
        # Full-screen terminal view wants the whole scrollback (not just the pane).
        await orchestrator.send_scrollback()
    elif mtype == "set_terminal" and data.get("app"):
        orchestrator.set_terminal_app(data["app"])
    elif mtype == "set_dir" and data.get("path"):
        result = await orchestrator.handle_tool_call(
            "set_working_dir", {"path": data["path"]}
        )
        if "error" in result:
            await websocket.send_json(
                {"type": "status", "status": f"folder error: {result['error']}"}
            )
    elif mtype == "stop":
        # Cancel the running Claude task (works even while the mic stream is paused).
        await orchestrator.handle_tool_call("stop_claude", {})
        await websocket.send_json({"type": "status", "status": "stopped"})
    elif mtype == "list_dirs":
        # The phone's folder browser asks for the subdirectories of a path. Resolve the
        # deepest existing ancestor (so a half-typed/nonexistent path still lists
        # something sensible) and send its subfolders back for navigation.
        from server.orchestrator import suggest_dirs
        base, options = suggest_dirs(data.get("path") or "~", limit=500)
        await websocket.send_json({"type": "dirs", "path": base, "dirs": options})
    elif mtype == "list_terminals":
        # The tool pushes a {"type":"terminals",...} message to the phone itself.
        await orchestrator.handle_tool_call("list_terminals", {})
    elif mtype == "attach_terminal" and data.get("id"):
        res = await orchestrator.handle_tool_call("attach_terminal", {"id": data["id"]})
        if "error" in res:
            await websocket.send_json({"type": "status", "status": res["error"]})
580
+
581
+
582
+ @asynccontextmanager
583
+ async def _as_cm(obj):
584
+ """Accept either an async context manager or a plain object."""
585
+ if hasattr(obj, "__aenter__"):
586
+ async with obj as entered:
587
+ yield entered
588
+ else:
589
+ yield obj