voxa-code 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- server/__init__.py +0 -0
- server/apns.py +89 -0
- server/app.py +589 -0
- server/appattest.py +310 -0
- server/appstore.py +141 -0
- server/attested_store.py +60 -0
- server/auth.py +70 -0
- server/ax_controller.py +202 -0
- server/billing.py +177 -0
- server/call_manager.py +91 -0
- server/certs/AppleRootCA-G3.pem +15 -0
- server/certs/Apple_App_Attestation_Root_CA.pem +14 -0
- server/claude_controller.py +156 -0
- server/cli.py +365 -0
- server/cloud_app.py +345 -0
- server/config.py +56 -0
- server/device_registry.py +52 -0
- server/gemini_operator.py +677 -0
- server/hooks.py +202 -0
- server/orchestrator.py +315 -0
- server/push_routes.py +50 -0
- server/ratelimit.py +41 -0
- server/relay.py +157 -0
- server/relay_client.py +89 -0
- server/remote_operator.py +128 -0
- server/session_hub.py +33 -0
- server/terminal_watcher.py +241 -0
- server/terminals.py +510 -0
- server/tmux_controller.py +580 -0
- server/transcript_monitor.py +134 -0
- server/transcripts.py +143 -0
- server/users.py +90 -0
- server/voxa_cloud.py +132 -0
- server/waitlist.py +130 -0
- static/app.js +388 -0
- static/favicon.svg +1 -0
- static/index.html +253 -0
- static/pcm-worklet.js +69 -0
- static/pro.html +29 -0
- static/pro2.html +33 -0
- static/voxa-mark-white.svg +1 -0
- voxa_code-0.1.0.dist-info/METADATA +227 -0
- voxa_code-0.1.0.dist-info/RECORD +47 -0
- voxa_code-0.1.0.dist-info/WHEEL +5 -0
- voxa_code-0.1.0.dist-info/entry_points.txt +2 -0
- voxa_code-0.1.0.dist-info/licenses/LICENSE +21 -0
- voxa_code-0.1.0.dist-info/top_level.txt +2 -0
server/__init__.py
ADDED
|
File without changes
|
server/apns.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import jwt
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def build_apns_jwt(key_pem: str, key_id: str, team_id: str, issued_at: int) -> str:
    """Return an ES256-signed provider token for APNs.

    The claims carry the team id (``iss``) and issue time (``iat``); the JOSE
    header carries the signing key id (``kid``). ``key_pem`` is the private
    key handed to ``jwt.encode`` as-is.
    """
    claims = {"iss": team_id, "iat": issued_at}
    jose_header = {"alg": "ES256", "kid": key_id}
    return jwt.encode(claims, key_pem, algorithm="ES256", headers=jose_header)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_voip_payload(call_id: str, summary: str) -> dict:
    """Build the JSON body of a VoIP "ring" push for ``call_id``."""
    payload = {"call_id": call_id, "summary": summary}
    payload["aps"] = {"content-available": 1}
    return payload
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def build_cancel_payload(call_id: str) -> dict:
    """Build the JSON body of a push that cancels the ring for ``call_id``."""
    payload = {"call_id": call_id, "type": "cancel"}
    payload["aps"] = {"content-available": 1}
    return payload
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ApnsClient:
    """Sends VoIP pushes via APNs HTTP/2. One per server process.

    The provider JWT is cached on the instance and re-signed once it is older
    than ``TOKEN_MAX_AGE`` seconds.
    """

    PROD_HOST = "https://api.push.apple.com"
    SANDBOX_HOST = "https://api.sandbox.push.apple.com"
    # Age (seconds) after which the cached provider token is re-signed.
    TOKEN_MAX_AGE = 50 * 60

    def __init__(self, config, now_fn=None):
        """Bind the client to ``config``.

        ``now_fn`` is an optional zero-argument callable returning the current
        time as integer seconds; it defaults to the wall clock.
        """
        import time  # kept local: this module's import block has no `time`

        self._cfg = config
        # Xcode/dev-signed builds get sandbox push tokens, which only work
        # against the sandbox host; TestFlight/App Store builds use production.
        self._host = self.SANDBOX_HOST if getattr(config, "apns_sandbox", False) else self.PROD_HOST
        self._now = now_fn or (lambda: int(time.time()))
        self._jwt = ""
        self._jwt_at = 0

    def _token(self) -> str:
        """Return the cached provider JWT, signing a fresh one when it is missing or stale."""
        now = self._now()
        if not self._jwt or now - self._jwt_at > self.TOKEN_MAX_AGE:
            # Prefer the key contents (set as a secret on container hosts); fall
            # back to a file path for local/dev use.
            key_pem = getattr(self._cfg, "apns_key", "")
            if not key_pem:
                # Context manager so the key file handle is always closed.
                with open(self._cfg.apns_key_path, encoding="utf-8") as key_file:
                    key_pem = key_file.read()
            self._jwt = build_apns_jwt(
                key_pem, self._cfg.apns_key_id, self._cfg.apns_team_id, now
            )
            self._jwt_at = now
        return self._jwt

    async def _post(self, device_token: str, payload: dict):
        """POST ``payload`` to the device endpoint as a VoIP push and return the response."""
        url = f"{self._host}/3/device/{device_token}"
        headers = {
            "apns-topic": f"{self._cfg.apns_bundle_id}.voip",
            "apns-push-type": "voip",
            "apns-priority": "10",
            "authorization": f"bearer {self._token()}",
        }
        async with httpx.AsyncClient(http2=True, timeout=10) as client:
            return await client.post(url, headers=headers, content=json.dumps(payload))

    async def send_voip(self, device_token: str, call_id: str, summary: str) -> bool | int:
        """Send a VoIP ring. Returns True on success, or the HTTP status code on
        failure (so the caller can prune a 410 Gone / dead token)."""
        resp = await self._post(device_token, build_voip_payload(call_id, summary))
        if resp.status_code != 200:
            # 410 = the token is dead (app deleted/reinstalled); other codes are
            # transient/config. Log the reason so silent no-rings are diagnosable.
            logger.warning("APNs voip push failed: status=%s body=%s token=%s",
                           resp.status_code, resp.text[:200], device_token[:8])
            return resp.status_code
        return True

    async def send_voip_cancel(self, device_token: str, call_id: str) -> bool:
        """Send a cancel push for ``call_id``. Returns True only on HTTP 200."""
        resp = await self._post(device_token, build_cancel_payload(call_id))
        if resp.status_code != 200:
            # Mirror send_voip's diagnostics so a phone that keeps ringing is traceable.
            logger.warning("APNs voip cancel failed: status=%s body=%s token=%s",
                           resp.status_code, resp.text[:200], device_token[:8])
            return False
        return True
|
server/app.py
ADDED
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import tempfile
|
|
8
|
+
import time
|
|
9
|
+
from contextlib import asynccontextmanager
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
|
|
13
|
+
from fastapi.responses import HTMLResponse, JSONResponse
|
|
14
|
+
|
|
15
|
+
from dotenv import load_dotenv
|
|
16
|
+
|
|
17
|
+
from server.config import Config, load_config
|
|
18
|
+
from server.claude_controller import ClaudeController
|
|
19
|
+
from server.tmux_controller import TmuxController
|
|
20
|
+
from server.orchestrator import Orchestrator
|
|
21
|
+
|
|
22
|
+
STATIC = Path(__file__).resolve().parent.parent / "static"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def should_suppress_greeting(pending_updates: list) -> bool:
    """Report whether the generic opening should be skipped.

    True when at least one update is queued to be relayed on answer, so the
    contextual update is the only opening that gets spoken.
    """
    if pending_updates:
        return True
    return False
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _strip_finished_prefix(summary: str) -> str:
|
|
32
|
+
"""Turn a finish summary ('<project> finished: <result>') into just the result,
|
|
33
|
+
since the opening phrases the 'finished' part itself. '<project> finished' with no
|
|
34
|
+
result becomes ''. Other summaries (e.g. 'needs input: ...') pass through."""
|
|
35
|
+
s = (summary or "").strip()
|
|
36
|
+
low = s.lower()
|
|
37
|
+
i = low.find("finished:")
|
|
38
|
+
if i != -1:
|
|
39
|
+
return s[i + len("finished:"):].strip()
|
|
40
|
+
if low.endswith(" finished") or low == "finished":
|
|
41
|
+
return ""
|
|
42
|
+
return s
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def compose_opening(project: str, updates: list) -> str:
    """Build the sentence spoken when a call is answered.

    Leads with the project and what its last task did, then asks what to do
    next. ``project`` is '' when no folder could be attached; blank or empty
    entries in ``updates`` are ignored.
    """
    results = [
        _strip_finished_prefix(item)
        for item in (updates or [])
        if item and item.strip()
    ]
    detail = " ".join(results).strip()
    if not detail:
        if project:
            body = f"You're back in {project} — your last task there just finished."
        else:
            body = "You're back."
    elif project:
        body = f"Your last task in {project} just finished. Here's what it did: {detail}."
    else:
        body = f"Your last task just finished. Here's what it did: {detail}."
    return f"Hi. {body} What would you like to do next?"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def suppress_greeting_if_supported(operator) -> bool:
    """Call ``operator.suppress_greeting()`` when the operator provides one.

    Returns True if the hook was called, False if the operator has no callable
    ``suppress_greeting`` (in which case nothing happens and nothing is raised).
    """
    hook = getattr(operator, "suppress_greeting", None)
    if not callable(hook):
        return False
    hook()
    return True
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def apply_greeting_suppression(operator, pending_updates: list) -> bool:
    """Suppress the operator's generic opening only when an update is queued.

    Returns True when suppression was actually applied; False when nothing is
    queued or the operator has no ``suppress_greeting`` to call.
    """
    return should_suppress_greeting(pending_updates) and suppress_greeting_if_supported(operator)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _default_operator_factory(config, handle_tool_call, voice="", account=""):
    """Create the voice operator for one connection.

    With ``VOXA_LIVE_PROXY`` set, a ``RemoteOperator`` pointed at that proxy is
    returned (metered mode); otherwise a local ``GeminiOperator`` (direct mode).
    """
    proxy_url = os.environ.get("VOXA_LIVE_PROXY", "").strip()
    if not proxy_url:
        from server.gemini_operator import GeminiOperator
        return GeminiOperator(config, handle_tool_call, voice=voice)

    from server.remote_operator import RemoteOperator
    # Account precedence: the per-connection id wins, then the VOXA_ACCOUNT
    # environment variable, then the configured auth token.
    billed_account = account or os.environ.get("VOXA_ACCOUNT", "") or config.auth_token
    proxy_token = os.environ.get("VOXA_PROXY_TOKEN", "")
    return RemoteOperator(
        config,
        handle_tool_call,
        proxy_url=proxy_url,
        account=billed_account,
        token=proxy_token,
        voice=voice,
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def create_app(config: Config | None = None, operator_factory=None) -> FastAPI:
|
|
98
|
+
if config is None:
|
|
99
|
+
load_dotenv()
|
|
100
|
+
config = load_config()
|
|
101
|
+
operator_factory = operator_factory or _default_operator_factory
|
|
102
|
+
# "attach" = visible interactive claude in a tmux/Terminal you can also type in;
|
|
103
|
+
# "drive" = headless SDK session with a read-only watch log.
|
|
104
|
+
mode = os.environ.get("VOXA_MODE", "attach").strip().lower()
|
|
105
|
+
app = FastAPI()
|
|
106
|
+
|
|
107
|
+
from server.device_registry import DeviceRegistry
|
|
108
|
+
from server.call_manager import CallManager
|
|
109
|
+
registry = DeviceRegistry(os.environ.get("VOXA_DEVICES_FILE", "devices.json"))
|
|
110
|
+
if config.push_enabled:
|
|
111
|
+
from server.apns import ApnsClient
|
|
112
|
+
pusher = ApnsClient(config)
|
|
113
|
+
else:
|
|
114
|
+
class _NoPush:
|
|
115
|
+
async def send_voip(self, *a, **k):
|
|
116
|
+
logging.warning("push disabled; dropping call %r", a)
|
|
117
|
+
return False
|
|
118
|
+
pusher = _NoPush()
|
|
119
|
+
call_manager = CallManager(pusher, registry)
|
|
120
|
+
app.state.registry = registry
|
|
121
|
+
app.state.call_manager = call_manager
|
|
122
|
+
|
|
123
|
+
def _check(request: Request):
    """Return True when the request's ``token`` query parameter matches the configured auth token."""
    import hmac  # function-scope: this module's import block has no `hmac`

    supplied = request.query_params.get("token")
    expected = config.auth_token
    if not isinstance(supplied, str) or not isinstance(expected, str):
        # Missing parameter or non-string config: keep plain equality semantics.
        return supplied == expected
    # Constant-time comparison so response timing does not reveal how much of
    # a guessed token was correct. The result is the same as `==`.
    return hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8"))
|
|
125
|
+
|
|
126
|
+
from server.push_routes import add_push_routes
|
|
127
|
+
add_push_routes(app, registry, call_manager, _check)
|
|
128
|
+
|
|
129
|
+
app.state.turn_start = {} # session_id -> turn start (Claude Code UserPromptSubmit)
|
|
130
|
+
app.state.hook_last = {} # session_id -> last announced time (debounce)
|
|
131
|
+
app.state.hooks_live = False # flips true once a real Claude Code hook arrives
|
|
132
|
+
|
|
133
|
+
async def _ring_via_cloud(summary):
    """Ask the cloud relay to ring the last-paired account's phone.

    Does nothing unless both ``VOXA_RELAY_URL`` and a remembered account are
    present. Failures are logged and never raised.
    """
    # The laptop holds no APNs key (zero-config); the cloud has the key and
    # the device registry.
    relay_base = os.environ.get("VOXA_RELAY_URL", "").strip().rstrip("/")
    paired_account = getattr(app.state, "last_account", "")
    if not (relay_base and paired_account):
        return
    # Account-scoped: the unguessable account id is the authorization
    # (the cloud has no per-laptop token it could verify here).
    notify_body = {"account": paired_account, "summary": summary}
    try:
        import httpx
        async with httpx.AsyncClient(timeout=10) as client:
            await client.post(f"{relay_base}/notify", json=notify_body)
    except Exception:
        logging.exception("ring via cloud failed")
|
|
148
|
+
|
|
149
|
+
async def report(summary: str):
    """Surface a background/hook update.

    - Line attached: stay silent, so we don't talk over the live session.
    - App open but no line yet: queue the update without ringing.
    - App closed: hand the update to the call manager, and additionally ring
      through the cloud when local push is disabled.
    """
    if call_manager.line_open:
        return
    app_is_open = getattr(app.state, "phone_clients", 0) > 0
    if app_is_open:
        # Spoken on begin/attach; no ring while the app is open.
        call_manager.queue(summary)
        return
    await call_manager.on_update(summary)
    if config.push_enabled:
        # Avoid a double call: with a local APNs key, on_update already rang.
        return
    await _ring_via_cloud(summary)
|
|
166
|
+
|
|
167
|
+
def _stand_down_watcher():
    """Mark hooks as live, cancel the screen-scraper task, and turn off the hub's offline ring.

    Called when a hook request arrives, so the hook path and the scraper do
    not both report the same finish.
    """
    app.state.hooks_live = True
    scraper_task = getattr(app.state, "bg_watcher", None)
    still_running = scraper_task is not None and not scraper_task.done()
    if still_running:
        scraper_task.cancel()
        app.state.bg_watcher = None
    session_hub = getattr(app.state, "hub", None)
    if session_hub is None:
        return
    session_hub.set_offline_ring(False)
|
|
179
|
+
|
|
180
|
+
@app.post("/hook")
async def claude_hook(request: Request):
    """Receive a Claude Code hook event and report it when it warrants an update.

    Claude Code Stop / Notification / UserPromptSubmit hooks POST their stdin
    JSON here. Responds 401 on a bad token and ``{"ok": True}`` otherwise, even
    when the body is not valid JSON.
    """
    if request.query_params.get("token") != config.auth_token:
        return JSONResponse({"ok": False}, status_code=401)
    try:
        body = await request.json()
    except Exception:
        return {"ok": True}
    # The code below calls .get on the body, so valid JSON that is not an
    # object (null, list, number, ...) is normalized to an empty event.
    if not isinstance(body, dict):
        body = {}
    from server.hooks import route_hook
    _stand_down_watcher()
    # Default 0 = call on EVERY finish (matches "call me when Claude finishes"). Set
    # VOXA_HOOK_MIN_SECONDS to a positive value to suppress quick interactive turns
    # (only call for tasks that took at least that many seconds).
    try:
        min_seconds = float(os.environ.get("VOXA_HOOK_MIN_SECONDS", "0"))
    except ValueError:
        # A malformed setting must not turn every hook into a server error.
        logging.warning("ignoring invalid VOXA_HOOK_MIN_SECONDS; using 0")
        min_seconds = 0.0
    msg = route_hook(
        body,
        turn_start=app.state.turn_start,
        hook_last=app.state.hook_last,
        now=time.monotonic(),
        min_seconds=min_seconds,
    )
    if msg:
        # Remember WHICH session triggered this call so answering attaches to it
        # and continues that work (instead of opening an empty default session).
        cwd = body.get("cwd", "")
        if cwd:
            app.state.pending_source = {"cwd": cwd}
        await report(msg)
    return {"ok": True}
|
|
211
|
+
|
|
212
|
+
# Background watcher: ring the phone when ANY open Claude terminal finishes,
|
|
213
|
+
# not just the one Voxa is attached to. Routes through the hub so it speaks on
|
|
214
|
+
# the line when a phone is connected and rings (CallKit) when it isn't. Off by
|
|
215
|
+
# setting VOXA_WATCH_TERMINALS=0.
|
|
216
|
+
if os.environ.get("VOXA_WATCH_TERMINALS", "1").strip() not in ("0", "false", ""):
|
|
217
|
+
from server.terminal_watcher import TerminalWatcher
|
|
218
|
+
|
|
219
|
+
# Fallback screen-scraper for terminals without hooks. It stands down (see
|
|
220
|
+
# _stand_down_watcher) the moment a real Claude Code hook arrives, so the hook
|
|
221
|
+
# path and the scraper never double-report the same finish.
|
|
222
|
+
async def _on_bg_done(label, cwd, summary):
    """Report that a watched terminal finished, appending its summary when there is one."""
    announcement = f"{label or 'a terminal'} finished"
    if summary:
        announcement += f": {summary}"
    await report(announcement)
|
|
225
|
+
|
|
226
|
+
async def _on_bg_resumed(label, cwd):
    """Cancel a pending ring because the task was picked back up on the laptop.

    With local push enabled the call manager cancels directly; otherwise the
    cloud relay is asked to, when a relay URL and account are known. Relay
    failures are logged and never raised.
    """
    paired_account = getattr(app.state, "last_account", "")
    if config.push_enabled:
        await call_manager.cancel(paired_account or None)
        return
    relay_base = os.environ.get("VOXA_RELAY_URL", "").strip().rstrip("/")
    if not (relay_base and paired_account):
        return
    cancel_body = {"account": paired_account, "cancel": True}
    try:
        import httpx
        async with httpx.AsyncClient(timeout=10) as client:
            await client.post(f"{relay_base}/notify", json=cancel_body)
    except Exception:
        logging.exception("cancel via cloud failed")
|
|
242
|
+
|
|
243
|
+
def _skip(session):
    """Tell the watcher whether to ignore ``session``.

    True only while a phone line is open and the session's ``cwd`` equals the
    working directory of the controller currently being driven, since the main
    loop already reports that terminal.
    """
    active = getattr(app.state, "controller", None)
    if not call_manager.line_open or active is None:
        return False
    return bool(getattr(active, "working_dir", None) == session.get("cwd"))
|
|
249
|
+
|
|
250
|
+
watcher = TerminalWatcher(_on_bg_done, on_resumed=_on_bg_resumed, should_skip=_skip)
|
|
251
|
+
|
|
252
|
+
@app.on_event("startup")
async def _start_watcher():
    """Schedule the terminal watcher and keep its task on ``app.state.bg_watcher``."""
    watcher_task = asyncio.ensure_future(watcher.run())
    app.state.bg_watcher = watcher_task
|
|
255
|
+
|
|
256
|
+
@app.on_event("shutdown")
async def _stop_watcher():
    """Cancel the terminal watcher task on shutdown, if one is recorded."""
    watcher_task = getattr(app.state, "bg_watcher", None)
    if not watcher_task:
        return
    watcher_task.cancel()
|
|
261
|
+
|
|
262
|
+
@app.get("/healthz")
|
|
263
|
+
async def healthz():
|
|
264
|
+
return {"ok": True}
|
|
265
|
+
|
|
266
|
+
@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve ``index.html`` from the static directory as the root page."""
    # Decode explicitly rather than with the platform default encoding, so the
    # page renders the same on every host.
    # NOTE(review): assumes the bundled index.html is UTF-8 — confirm.
    return (STATIC / "index.html").read_text(encoding="utf-8")
|
|
269
|
+
|
|
270
|
+
@app.websocket("/ws")
async def ws(websocket: WebSocket):
    """Authenticate a phone connection and serve it, tracking the open-connection count.

    A wrong ``token`` query parameter closes the socket with code 4401 before
    it is accepted.
    """
    supplied = websocket.query_params.get("token")
    if supplied != config.auth_token:
        await websocket.close(code=4401)
        return
    await websocket.accept()
    logging.getLogger("voxa").info("ws: phone connected")
    # Count live connections so background/hook events know the app is OPEN
    # (don't call, surface on the line) vs CLOSED (place a call).
    app.state.phone_clients = getattr(app.state, "phone_clients", 0) + 1
    try:
        await _serve_ws(websocket)
    finally:
        remaining = getattr(app.state, "phone_clients", 1) - 1
        app.state.phone_clients = remaining if remaining > 0 else 0
|
|
284
|
+
|
|
285
|
+
async def _serve_ws(websocket: WebSocket):
|
|
286
|
+
# The controller (and the Claude session it owns) persists across
|
|
287
|
+
# connections via the hub. Build it once, then reuse it so Claude keeps
|
|
288
|
+
# running when the phone hangs up.
|
|
289
|
+
hub = getattr(app.state, "hub", None)
|
|
290
|
+
if hub is None:
|
|
291
|
+
if mode == "drive":
|
|
292
|
+
watch_path = os.path.join(
|
|
293
|
+
tempfile.gettempdir(), f"loop-watch-{os.getpid()}.log"
|
|
294
|
+
)
|
|
295
|
+
controller = ClaudeController(
|
|
296
|
+
watch_log_path=watch_path, launch_terminal=True
|
|
297
|
+
)
|
|
298
|
+
else:
|
|
299
|
+
controller = TmuxController(
|
|
300
|
+
launch_terminal=True,
|
|
301
|
+
terminal_app=os.environ.get("VOXA_TERMINAL_APP", "auto"),
|
|
302
|
+
)
|
|
303
|
+
from server.session_hub import SessionHub
|
|
304
|
+
hub = SessionHub(controller, call_manager)
|
|
305
|
+
if getattr(app.state, "hooks_live", False):
|
|
306
|
+
hub.set_offline_ring(False) # hooks already drive offline rings
|
|
307
|
+
app.state.hub = hub
|
|
308
|
+
app.state.controller = controller
|
|
309
|
+
else:
|
|
310
|
+
controller = app.state.controller
|
|
311
|
+
|
|
312
|
+
# Conversational-activity clock for the idle auto-disconnect (below). Raw mic
|
|
313
|
+
# frames don't count (the phone streams continuously); only real speech /
|
|
314
|
+
# spoken replies / a working task do.
|
|
315
|
+
activity = {"t": time.monotonic()}
|
|
316
|
+
def touch(): activity["t"] = time.monotonic()
|
|
317
|
+
|
|
318
|
+
async def speak(text): await operator.speak(text)
|
|
319
|
+
async def notify(msg):
|
|
320
|
+
if isinstance(msg, dict) and msg.get("type") == "transcript":
|
|
321
|
+
touch()
|
|
322
|
+
await websocket.send_json(msg)
|
|
323
|
+
orchestrator = Orchestrator(controller, speak, notify)
|
|
324
|
+
# Route finals through the hub (spoken when a line is attached, ring via the
|
|
325
|
+
# call manager otherwise). set_final keeps this wired across controller swaps
|
|
326
|
+
# when the user attaches to a different open terminal.
|
|
327
|
+
orchestrator.set_final(hub.on_final)
|
|
328
|
+
|
|
329
|
+
_log = logging.getLogger("voxa")
|
|
330
|
+
_counts = {"in": 0, "out": 0}
|
|
331
|
+
voice = websocket.query_params.get("voice", "")
|
|
332
|
+
# The bridge appends ?account=<paired phone's id> so the metered session
|
|
333
|
+
# bills that balance. Only pass it to factories that accept it.
|
|
334
|
+
import inspect
|
|
335
|
+
_kwargs = {"voice": voice}
|
|
336
|
+
_account = websocket.query_params.get("account", "")
|
|
337
|
+
if "account" in inspect.signature(operator_factory).parameters:
|
|
338
|
+
_kwargs["account"] = _account
|
|
339
|
+
# Remember who's paired so the background watcher can ring this account's
|
|
340
|
+
# phone (via the cloud) when a terminal finishes while they're away.
|
|
341
|
+
if _account:
|
|
342
|
+
app.state.last_account = _account
|
|
343
|
+
|
|
344
|
+
# Pre-session gate: do NOT open the (metered) /live voice session until the
|
|
345
|
+
# user taps Start ("begin") or starts talking. Until then we only do FREE
|
|
346
|
+
# setup, listing terminals and setting the folder/terminal, so idle pairing
|
|
347
|
+
# and choosing a project never cost the user a minute.
|
|
348
|
+
await websocket.send_json(
|
|
349
|
+
{"type": "status", "status": "ready",
|
|
350
|
+
"working_dir": getattr(controller, "working_dir", "") or ""})
|
|
351
|
+
|
|
352
|
+
async def _push_terminals():
    """Discover open Claude sessions and send the list to the phone (best effort).

    Discovery runs in a worker thread. Any failure is logged at debug level
    and otherwise ignored, so setup continues without the list.
    """
    try:
        from server.terminals import discover_claude_sessions
        sessions = await asyncio.to_thread(discover_claude_sessions)
        # Seed the orchestrator with the SAME list the phone is shown, so
        # tapping a terminal (attach_terminal by id) resolves correctly.
        orchestrator.remember_terminals(sessions)
        await websocket.send_json({"type": "terminals", "terminals": sessions})
    except Exception:
        # Still best effort, but leave a trace instead of failing silently.
        logging.getLogger("voxa").debug("pushing terminal list failed", exc_info=True)
|
|
362
|
+
asyncio.ensure_future(_push_terminals())
|
|
363
|
+
|
|
364
|
+
first_audio = None
|
|
365
|
+
while True:
|
|
366
|
+
msg = await websocket.receive()
|
|
367
|
+
if msg["type"] == "websocket.disconnect":
|
|
368
|
+
return # hung up during setup; nothing was attached or metered
|
|
369
|
+
if msg.get("bytes") is not None:
|
|
370
|
+
first_audio = msg["bytes"] # talking implicitly begins the session
|
|
371
|
+
break
|
|
372
|
+
if msg.get("text"):
|
|
373
|
+
try:
|
|
374
|
+
data = json.loads(msg["text"])
|
|
375
|
+
except ValueError:
|
|
376
|
+
continue
|
|
377
|
+
if data.get("type") == "begin":
|
|
378
|
+
break
|
|
379
|
+
# Folder/terminal selection is free, handle it before metering.
|
|
380
|
+
await handle_client_control(msg["text"], orchestrator, websocket, None)
|
|
381
|
+
|
|
382
|
+
async with _as_cm(operator_factory(config, orchestrator.handle_tool_call, **_kwargs)) as operator:
|
|
383
|
+
async def audio_out(pcm):
|
|
384
|
+
touch() # Gemini is speaking -> the line is active
|
|
385
|
+
_counts["out"] += 1
|
|
386
|
+
if _counts["out"] % 50 == 1:
|
|
387
|
+
_log.info("ws: sent %d audio chunks -> phone", _counts["out"])
|
|
388
|
+
await websocket.send_bytes(pcm)
|
|
389
|
+
operator.set_audio_out(audio_out)
|
|
390
|
+
operator.set_text_out(notify)
|
|
391
|
+
|
|
392
|
+
# Attach this operator's voice to the line and speak anything that
|
|
393
|
+
# queued up while no phone was connected.
|
|
394
|
+
pending_updates = hub.attach(lambda t: operator.speak(t))
|
|
395
|
+
# Everything from here runs with the line attached; detach() MUST run even on
|
|
396
|
+
# cancellation (uvicorn reload/shutdown) or an exception, otherwise line_open
|
|
397
|
+
# stays True and every later background finish is silently dropped.
|
|
398
|
+
try:
|
|
399
|
+
# If this call was triggered by a specific Claude terminal, attach to THAT
|
|
400
|
+
# terminal so Voxa continues that session instead of an empty default one,
|
|
401
|
+
# and OPEN with that context so it knows where you are.
|
|
402
|
+
source = getattr(app.state, "pending_source", None)
|
|
403
|
+
app.state.pending_source = None
|
|
404
|
+
attached_folder = None
|
|
405
|
+
if source and source.get("cwd"):
|
|
406
|
+
try:
|
|
407
|
+
res = await orchestrator.attach_source(source["cwd"])
|
|
408
|
+
if "attached" in res:
|
|
409
|
+
controller = orchestrator.controller # follow the swap
|
|
410
|
+
app.state.controller = controller # persist for reconnects
|
|
411
|
+
attached_folder = (os.path.basename(source["cwd"].rstrip("/"))
|
|
412
|
+
or source["cwd"])
|
|
413
|
+
await websocket.send_json(
|
|
414
|
+
{"type": "status",
|
|
415
|
+
"working_dir": res.get("working_dir", source["cwd"])})
|
|
416
|
+
else:
|
|
417
|
+
_log.info("auto-attach to %s skipped: %s",
|
|
418
|
+
source["cwd"], res.get("error"))
|
|
419
|
+
except Exception:
|
|
420
|
+
logging.exception("auto-attach on answer failed")
|
|
421
|
+
|
|
422
|
+
# Voxa's opening is ALWAYS driven from here. Suppress the operator's own
|
|
423
|
+
# auto-greet: on the metered path the cloud brain would otherwise race
|
|
424
|
+
# ahead and speak a generic "what would you like to do?" the instant the
|
|
425
|
+
# /live socket opens, before this contextual opening arrives.
|
|
426
|
+
suppress_greeting_if_supported(operator)
|
|
427
|
+
if attached_folder:
|
|
428
|
+
opening = compose_opening(attached_folder, pending_updates)
|
|
429
|
+
elif pending_updates:
|
|
430
|
+
opening = compose_opening("", pending_updates)
|
|
431
|
+
else:
|
|
432
|
+
opening = "Hi! What would you like to work on?"
|
|
433
|
+
await operator.speak(opening, immediate=True)
|
|
434
|
+
|
|
435
|
+
# If the session began because the user started talking, don't drop
|
|
436
|
+
# that first frame.
|
|
437
|
+
if first_audio is not None:
|
|
438
|
+
await operator.send_audio(first_audio)
|
|
439
|
+
|
|
440
|
+
paused = False
|
|
441
|
+
|
|
442
|
+
async def recv_loop():
    # Pump messages from the phone until it disconnects: binary frames are mic
    # audio for the operator, text frames are JSON control messages.
    nonlocal paused
    try:
        while True:
            msg = await websocket.receive()
            if msg["type"] == "websocket.disconnect":
                break
            if msg.get("bytes") is not None:
                _counts["in"] += 1
                # Read the CURRENTLY-driven controller (it swaps when the
                # user attaches to another terminal mid-call); the loop's
                # local `controller` would be stale after a swap.
                cur = orchestrator.controller
                # Log the first frame and every 50th after it, not every frame.
                if _counts["in"] % 50 == 1:
                    _log.info(
                        "ws: recv %d mic frames (status=%s)",
                        _counts["in"], getattr(cur, "status", "?"),
                    )
                # Cost saver: while Claude is working, stop forwarding mic
                # audio to Gemini. Gemini bills per audio token, so no
                # audio in == no charge during the wait. Resume when idle.
                if getattr(cur, "status", "idle") == "working":
                    paused = True
                else:
                    if paused:
                        # First frame after a pause: tell the phone we are
                        # listening again before forwarding audio.
                        paused = False
                        await websocket.send_json(
                            {"type": "status", "status": "listening"}
                        )
                    await operator.send_audio(msg["bytes"])
            elif msg.get("text"):
                await handle_client_control(
                    msg["text"], orchestrator, websocket, operator
                )
    except (WebSocketDisconnect, RuntimeError):
        # Treated as the phone going away: end the loop quietly.
        pass
|
|
478
|
+
|
|
479
|
+
# Idle auto-disconnect: hang up after a quiet stretch (no speech, not
|
|
480
|
+
# working) so an idle line stops burning V2V minutes. Off if 0.
|
|
481
|
+
idle_timeout = float(os.environ.get("VOXA_IDLE_TIMEOUT", "180"))
|
|
482
|
+
|
|
483
|
+
async def idle_watchdog():
    """Return once the line has been quiet for longer than ``idle_timeout`` seconds.

    Polls every 5 seconds. A controller whose status is "working" counts as
    activity. With a non-positive timeout the watchdog never completes.
    """
    if idle_timeout <= 0:
        return await asyncio.Event().wait()  # disabled
    while True:
        await asyncio.sleep(5)
        # Follow controller swaps: a mid-call attach must not leave the
        # watchdog reading the old (idle) controller and hang up mid-task.
        busy = getattr(orchestrator.controller, "status", "idle") == "working"
        if busy:
            touch()  # an active task is not idle
            continue
        quiet_for = time.monotonic() - activity["t"]
        if quiet_for <= idle_timeout:
            continue
        try:
            await websocket.send_json(
                {"type": "status",
                 "status": "idle, disconnecting to save minutes"})
        except Exception:
            pass
        return
|
|
501
|
+
|
|
502
|
+
run_task = asyncio.ensure_future(operator.run())
|
|
503
|
+
recv_task = asyncio.ensure_future(recv_loop())
|
|
504
|
+
idle_task = asyncio.ensure_future(idle_watchdog())
|
|
505
|
+
done, pending = await asyncio.wait(
|
|
506
|
+
[run_task, recv_task, idle_task],
|
|
507
|
+
return_when=asyncio.FIRST_COMPLETED,
|
|
508
|
+
)
|
|
509
|
+
for task in pending:
|
|
510
|
+
task.cancel()
|
|
511
|
+
try:
|
|
512
|
+
await task
|
|
513
|
+
except asyncio.CancelledError:
|
|
514
|
+
pass
|
|
515
|
+
except Exception:
|
|
516
|
+
logging.exception("loop task raised during cancellation")
|
|
517
|
+
for task in done:
|
|
518
|
+
exc = task.exception()
|
|
519
|
+
if exc is not None:
|
|
520
|
+
logging.error("loop task raised: %r", exc)
|
|
521
|
+
finally:
|
|
522
|
+
# Detach the line but keep Claude running. Only an explicit stop_claude
|
|
523
|
+
# (via the orchestrator) tears the session down. Persist the
|
|
524
|
+
# currently-driven controller so the next connection reuses it.
|
|
525
|
+
app.state.controller = orchestrator.controller
|
|
526
|
+
_log.info("ws: phone disconnected (mic_in=%d audio_out=%d)",
|
|
527
|
+
_counts["in"], _counts["out"])
|
|
528
|
+
hub.detach()
|
|
529
|
+
|
|
530
|
+
# mount static assets (js/worklet) under /static
|
|
531
|
+
from fastapi.staticfiles import StaticFiles
|
|
532
|
+
app.mount("/static", StaticFiles(directory=str(STATIC)), name="static")
|
|
533
|
+
return app
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
async def handle_client_control(raw: str, orchestrator, websocket, operator=None) -> None:
    """Handle a JSON control message sent by the phone (e.g. setting the folder).

    ``raw`` is the text frame. Anything that is not a JSON object is ignored,
    as are unknown message types. ``operator`` may be None before the voice
    session has started; "say" messages are then ignored.
    """
    try:
        data = json.loads(raw)
    except (ValueError, TypeError):
        return
    if not isinstance(data, dict):
        # Valid JSON but not an object (list, number, string, null): there is
        # no "type" to dispatch on, and data.get would raise.
        return
    mtype = data.get("type")
    if mtype == "say" and data.get("text") and operator is not None:
        await operator.send_text(data["text"])
    elif mtype == "claude_input" and data.get("text"):
        # Raw terminal chat from the phone's full-screen view: type straight into the
        # live Claude session, bypassing the voice operator.
        await orchestrator.send_direct(data["text"])
    elif mtype == "claude_scrollback":
        # Full-screen terminal view wants the whole scrollback (not just the pane).
        await orchestrator.send_scrollback()
    elif mtype == "set_terminal" and data.get("app"):
        orchestrator.set_terminal_app(data["app"])
    elif mtype == "set_dir" and data.get("path"):
        result = await orchestrator.handle_tool_call(
            "set_working_dir", {"path": data["path"]}
        )
        if "error" in result:
            await websocket.send_json(
                {"type": "status", "status": f"folder error: {result['error']}"}
            )
    elif mtype == "stop":
        # Cancel the running Claude task (works even while the mic stream is paused).
        await orchestrator.handle_tool_call("stop_claude", {})
        await websocket.send_json({"type": "status", "status": "stopped"})
    elif mtype == "list_dirs":
        # The phone's folder browser asks for the subdirectories of a path. Resolve the
        # deepest existing ancestor (so a half-typed/nonexistent path still lists
        # something sensible) and send its subfolders back for navigation.
        from server.orchestrator import suggest_dirs
        base, options = suggest_dirs(data.get("path") or "~", limit=500)
        await websocket.send_json({"type": "dirs", "path": base, "dirs": options})
    elif mtype == "list_terminals":
        # The tool pushes a {"type":"terminals",...} message to the phone itself.
        await orchestrator.handle_tool_call("list_terminals", {})
    elif mtype == "attach_terminal" and data.get("id"):
        res = await orchestrator.handle_tool_call("attach_terminal", {"id": data["id"]})
        if "error" in res:
            await websocket.send_json({"type": "status", "status": res["error"]})
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
@asynccontextmanager
|
|
583
|
+
async def _as_cm(obj):
|
|
584
|
+
"""Accept either an async context manager or a plain object."""
|
|
585
|
+
if hasattr(obj, "__aenter__"):
|
|
586
|
+
async with obj as entered:
|
|
587
|
+
yield entered
|
|
588
|
+
else:
|
|
589
|
+
yield obj
|