voxa-code 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- server/__init__.py +0 -0
- server/apns.py +89 -0
- server/app.py +589 -0
- server/appattest.py +310 -0
- server/appstore.py +141 -0
- server/attested_store.py +60 -0
- server/auth.py +70 -0
- server/ax_controller.py +202 -0
- server/billing.py +177 -0
- server/call_manager.py +91 -0
- server/certs/AppleRootCA-G3.pem +15 -0
- server/certs/Apple_App_Attestation_Root_CA.pem +14 -0
- server/claude_controller.py +156 -0
- server/cli.py +365 -0
- server/cloud_app.py +345 -0
- server/config.py +56 -0
- server/device_registry.py +52 -0
- server/gemini_operator.py +677 -0
- server/hooks.py +202 -0
- server/orchestrator.py +315 -0
- server/push_routes.py +50 -0
- server/ratelimit.py +41 -0
- server/relay.py +157 -0
- server/relay_client.py +89 -0
- server/remote_operator.py +128 -0
- server/session_hub.py +33 -0
- server/terminal_watcher.py +241 -0
- server/terminals.py +510 -0
- server/tmux_controller.py +580 -0
- server/transcript_monitor.py +134 -0
- server/transcripts.py +143 -0
- server/users.py +90 -0
- server/voxa_cloud.py +132 -0
- server/waitlist.py +130 -0
- static/app.js +388 -0
- static/favicon.svg +1 -0
- static/index.html +253 -0
- static/pcm-worklet.js +69 -0
- static/pro.html +29 -0
- static/pro2.html +33 -0
- static/voxa-mark-white.svg +1 -0
- voxa_code-0.1.0.dist-info/METADATA +227 -0
- voxa_code-0.1.0.dist-info/RECORD +47 -0
- voxa_code-0.1.0.dist-info/WHEEL +5 -0
- voxa_code-0.1.0.dist-info/entry_points.txt +2 -0
- voxa_code-0.1.0.dist-info/licenses/LICENSE +21 -0
- voxa_code-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,677 @@
|
|
|
1
|
+
"""Gemini Live operator bridge for Loop.
|
|
2
|
+
|
|
3
|
+
Verified against google-genai 2.10.0. Confirmed Live API names:
|
|
4
|
+
- client.aio.live.connect(model=..., config=...) -> async context manager yielding AsyncSession
|
|
5
|
+
- session.send_realtime_input(audio=types.Blob(...)) -> sends mic PCM
|
|
6
|
+
- session.receive() -> AsyncIterator[types.LiveServerMessage]
|
|
7
|
+
- session.send_tool_response(function_responses=[types.FunctionResponse(...)]) -> sends tool result
|
|
8
|
+
- session.send_client_content(turns=types.Content(...), turn_complete=True) -> inject text turn
|
|
9
|
+
|
|
10
|
+
Deviations from brief:
|
|
11
|
+
- The brief says self._cm = client.aio.live.connect(...) then self._session = await self._cm.__aenter__().
  In 2.10.0, connect() is an @asynccontextmanager. Rather than calling __aenter__/__aexit__ on the
  returned context manager by hand, _open() enters it through contextlib.AsyncExitStack. The stack
  keeps the session open after _open() returns and lets it be closed and reopened on its own
  (used for resume), without a separate context manager wrapper. The public GeminiOperator
  interface (async with / __aenter__/__aexit__) is unchanged for Task 5.
|
|
18
|
+
- response.data: confirmed to exist as a property on LiveServerMessage (concatenates inline_data bytes
|
|
19
|
+
from all parts). Brief's description matches the actual 2.10.0 implementation.
|
|
20
|
+
- send_realtime_input in 2.10.0 takes keyword-only args; `audio=types.Blob(...)` is valid.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import asyncio
|
|
26
|
+
import contextlib
|
|
27
|
+
import difflib
|
|
28
|
+
import logging
|
|
29
|
+
import time
|
|
30
|
+
from collections.abc import Awaitable, Callable
|
|
31
|
+
from typing import Optional
|
|
32
|
+
|
|
33
|
+
from google import genai
|
|
34
|
+
from google.genai import types
|
|
35
|
+
|
|
36
|
+
from .config import Config
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# System instruction
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
SYSTEM_INSTRUCTION = (
|
|
45
|
+
"You are Voxa, a concise voice operator that drives Claude Code on the user's machine. "
|
|
46
|
+
"Keep all spoken responses short and natural. "
|
|
47
|
+
"\n\nCHOOSING THE FOLDER: the user can either type it in the 'Working folder' field on their "
|
|
48
|
+
"phone, OR just tell you by voice. When they say it by voice, convert it to an absolute path "
|
|
49
|
+
"(expand the home directory to ~, e.g. 'documents folder' -> '~/Documents') and call "
|
|
50
|
+
"set_working_dir with your best guess. If it returns an error, it also returns 'searched_in' and a "
|
|
51
|
+
"list of 'suggestions': tell the user that folder wasn't found and read a few of the suggestions, "
|
|
52
|
+
"then try again with their choice. You can also call list_dirs(parent) to browse what's inside a "
|
|
53
|
+
"folder and read the options aloud. If the user wants to CREATE a new folder, call make_dir with "
|
|
54
|
+
"the full path; it makes the folder and starts the session there. "
|
|
55
|
+
"\n\nAFTER OPENING A SESSION, DO NOTHING ON YOUR OWN: once a session opens (or you switch folders), do NOT "
|
|
56
|
+
"call send_to_claude or run any command by yourself. In particular, do NOT list the folder's contents, "
|
|
57
|
+
"summarise, or 'take a look' unasked. Just say the session is ready and ASK what they'd like to do, then wait. "
|
|
58
|
+
"Only call send_to_claude when the user has actually asked for something. "
|
|
59
|
+
"\n\nFULL ACCESS: Claude has FULL read/write/execute access to the ENTIRE machine (permissions are "
|
|
60
|
+
"bypassed); it is NOT limited to the project folder. To look at, list, open, read, edit, run, or move "
|
|
61
|
+
"files ANYWHERE, do NOT call set_working_dir, just call send_to_claude with the request including an "
|
|
62
|
+
"absolute path (expand ~). Examples: 'open my Documents and list the files' -> send_to_claude('List all "
|
|
63
|
+
"files and folders in ~/Documents'); 'open that file on my desktop' -> send_to_claude('Open ~/Desktop/<name>'). "
|
|
64
|
+
"This keeps Claude's context and works for any location. Use set_working_dir ONLY when the user explicitly "
|
|
65
|
+
"wants to SWITCH the project they are working in to a different folder (it RESTARTS Claude there and loses "
|
|
66
|
+
"the current chat). NEVER claim you switched, opened, or changed a folder unless the tool actually confirmed it. "
|
|
67
|
+
"\n\nRUNNING TASKS: once a folder is set, call send_to_claude with the user's request EXACTLY as they "
|
|
68
|
+
"said it, WORD FOR WORD. Do NOT add, expand, rephrase, or infer ANYTHING they did not say: no extra "
|
|
69
|
+
"technologies, frameworks, languages, libraries, file names, or details. If the request sounds CUT OFF or "
|
|
70
|
+
"incomplete (the user trailed off, e.g. 'can you create a'), do NOT guess, complete, or invent it (never make "
|
|
71
|
+
"up a file name or its contents); wait and ASK them to finish the request before you call any tool. If they say 'create a browser "
|
|
72
|
+
"game named test_one', send EXACTLY 'create a browser game named test_one', NOT 'create a browser game "
|
|
73
|
+
"named test_one with HTML, CSS, and JavaScript'. Then say something brief like 'On it' while Claude "
|
|
74
|
+
"works. Send the user's request as ONE message and let Claude do ALL of it (Claude creates the files, "
|
|
75
|
+
"writes the code, runs things itself). NEVER split a request into multiple send_to_claude calls and NEVER "
|
|
76
|
+
"send a follow-up step on your own (e.g. 'create the files', 'create the javascript file'). When Claude "
|
|
77
|
+
"finishes, you ONLY speak the result to the user and WAIT; do not call send_to_claude again until the user "
|
|
78
|
+
"asks for the next thing. "
|
|
79
|
+
"The Claude session is persistent for the whole call, so it remembers previous turns: follow-ups "
|
|
80
|
+
"like 'open it' or 'now add a test' work; pass them through verbatim. If send_to_claude reports no "
|
|
81
|
+
"session has started, help the user pick a folder first (by voice or the phone field). "
|
|
82
|
+
"\n\nCRITICAL - NEVER PUT YOUR OWN WORDS INTO CLAUDE: send_to_claude is ONLY for the user's own "
|
|
83
|
+
"requests. NEVER send your own narration, summaries, confirmations, or descriptions of what Claude did "
|
|
84
|
+
"into send_to_claude. When Claude finishes, you RELAY the result to the USER by SPEAKING it, you do NOT "
|
|
85
|
+
"type it back into Claude. Lines like 'I've created index.html...' are things you SAY to the user, "
|
|
86
|
+
"never things you send to Claude. And never invent or assume what Claude built (e.g. the game type or "
|
|
87
|
+
"which files exist), only state what the screen update actually shows. "
|
|
88
|
+
"\n\nLIVE SCREEN UPDATES: you automatically receive messages describing what is currently on Claude's "
|
|
89
|
+
"terminal whenever it stops or pauses. ALWAYS relay these to the user, and NEVER answer on their behalf. "
|
|
90
|
+
"If it is a question, menu, or permission/trust prompt (e.g. 'Do you trust this folder? 1. Yes 2. No', "
|
|
91
|
+
"or 'Allow edit? y/n'), read the options aloud and ASK the user what they want to do. When they answer, "
|
|
92
|
+
"translate their words into the exact input Claude expects and send it with send_to_claude: for a numbered "
|
|
93
|
+
"menu send the number (e.g. user says 'yes, trust it' -> send '1'); for a yes/no send 'y' or 'n'; for a "
|
|
94
|
+
"free-form question send their answer. If it is just Claude's finished result, summarise it in a few "
|
|
95
|
+
"sentences. When unsure whether something needs a decision, ask the user rather than guessing. "
|
|
96
|
+
"IGNORE Claude Code's own interface noise: MCP server status or warnings, tool/status lines, tips, "
|
|
97
|
+
"'what's new', spinners, and the cost/token bar are NOT messages for the user. This includes Claude's "
|
|
98
|
+
"status bar / footer: the model name, the EFFORT level (e.g. 'high', 'xhigh'), usage percentages, and "
|
|
99
|
+
"slash-command hints like '/effort' or '/model'. NEVER read, repeat, comment on, or ASK THE USER ABOUT any "
|
|
100
|
+
"of this UI text. If the screen shows only such chrome and no real answer or question, say nothing about it "
|
|
101
|
+
"and just wait. Never read them aloud or comment on them; only relay Claude's actual answer to the request "
|
|
102
|
+
"or a real question Claude is asking. "
|
|
103
|
+
"\n\nEXISTING TERMINALS: if the user wants to work on a terminal/Claude they ALREADY have open "
|
|
104
|
+
"(e.g. 'use my open terminal', 'attach to the one in veil', 'pick from my terminals'), call "
|
|
105
|
+
"list_terminals, read out the controllable ones by their folder, and when they choose call "
|
|
106
|
+
"attach_terminal (by id, or 'match' the folder name, or 'index'). When attach_terminal returns a "
|
|
107
|
+
"'recap' field, it is the recent conversation from THAT terminal's Claude session: use it to briefly "
|
|
108
|
+
"tell the user what they were working on in that terminal and what the last thing was, THEN ask what "
|
|
109
|
+
"they want to do next. Do not read the recap verbatim, summarise it in a sentence or two. After "
|
|
110
|
+
"attaching, drive it exactly like a normal session. If a terminal is reported not controllable, tell "
|
|
111
|
+
"the user it can't be driven unless Claude runs inside tmux. "
|
|
112
|
+
"\n\nSESSION DETAILS: when the user asks about something that happened earlier in the "
|
|
113
|
+
"attached session (what files changed, why a test failed, what was decided), call "
|
|
114
|
+
"read_session (last=N or search='keyword') and answer from what it returns. Summarise "
|
|
115
|
+
"in a few sentences; never read raw transcript dumps, code, or long paths aloud. If "
|
|
116
|
+
"read_session errors, say you could not find that session's history. "
|
|
117
|
+
"\n\nYou may call get_claude_status to check progress. If the user says stop or cancel, call stop_claude. "
|
|
118
|
+
"Never read out long raw file paths or code blocks verbatim unless asked."
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
# Tool declarations (must match orchestrator's handle_tool_call names exactly)
|
|
123
|
+
# ---------------------------------------------------------------------------
|
|
124
|
+
|
|
125
|
+
def _declare_tool(name, description, properties=None, required=None):
    """Return one function declaration dict for the Live session's tool list.

    ``properties`` maps an argument name to its JSON-schema fragment (omitted
    means the tool takes no arguments). ``required`` lists the mandatory
    argument names; the key is left out of the schema entirely when empty.
    """
    schema = {"type": "object", "properties": properties or {}}
    if required:
        schema["required"] = list(required)
    return {"name": name, "description": description, "parameters": schema}


# Every tool the model is allowed to call, in the order it is advertised.
TOOL_DECLARATIONS = [
    _declare_tool(
        "start_claude_session",
        "Start a Claude Code session in a working directory.",
        {
            "working_dir": {
                "type": "string",
                "description": "Absolute or ~-relative path to the project folder.",
            },
        },
        required=["working_dir"],
    ),
    _declare_tool(
        "send_to_claude",
        "Send a prompt to the active Claude session. "
        "Returns immediately; the result is spoken later.",
        {"text": {"type": "string"}},
        required=["text"],
    ),
    _declare_tool(
        "get_claude_status",
        "Check whether Claude is idle, working, finished, or errored.",
    ),
    _declare_tool(
        "set_working_dir",
        "Set/Change the working directory for the Claude session (accepts ~-relative paths). "
        "On failure returns 'searched_in' and 'suggestions' to read back to the user.",
        {"path": {"type": "string"}},
        required=["path"],
    ),
    _declare_tool(
        "list_dirs",
        "List the subdirectories inside a folder, to help the user choose by voice.",
        {"parent": {"type": "string", "description": "Folder to list (~-relative ok)."}},
        required=["parent"],
    ),
    _declare_tool(
        "make_dir",
        "Create a new folder (and parents) then start the Claude session inside it.",
        {"path": {"type": "string", "description": "Full path of the new folder (~-relative ok)."}},
        required=["path"],
    ),
    _declare_tool(
        "stop_claude",
        "Cancel the current Claude run.",
    ),
    _declare_tool(
        "list_terminals",
        "List the Claude sessions the user already has open in their terminals "
        "(iTerm2, tmux, ...). Returns each with a label (its folder) and whether it "
        "is controllable. Also shows them on the phone as a tappable list.",
    ),
    _declare_tool(
        "attach_terminal",
        "Attach to one of the open Claude terminals from list_terminals and drive it. "
        "Identify it by 'id', or by 'match' (part of its folder name), or 'index' (1-based).",
        {
            "id": {"type": "string"},
            "match": {"type": "string", "description": "Part of the folder/label to match."},
            "index": {"type": "integer", "description": "1-based position in the last list."},
        },
    ),
    _declare_tool(
        "read_session",
        "Read the attached Claude session's full transcript on demand. "
        "Use when the user asks about details of past work in this session "
        "(what changed, why something failed, what was decided). "
        "Pass last=N for the most recent N messages, or search='text' to "
        "find messages mentioning something.",
        {
            "last": {"type": "integer", "description": "How many recent messages (max 40)."},
            "search": {"type": "string", "description": "Find messages containing this text."},
        },
    ),
]
|
|
248
|
+
|
|
249
|
+
# ---------------------------------------------------------------------------
|
|
250
|
+
# GeminiOperator
|
|
251
|
+
# ---------------------------------------------------------------------------
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class GeminiOperator:
|
|
255
|
+
"""Bridges a phone call to a Gemini Live realtime voice session.
|
|
256
|
+
|
|
257
|
+
Usage::
|
|
258
|
+
|
|
259
|
+
async with GeminiOperator(config, handle_tool_call) as op:
|
|
260
|
+
op.set_audio_out(send_to_phone)
|
|
261
|
+
await asyncio.gather(op.run(), mic_pump(op))
|
|
262
|
+
"""
|
|
263
|
+
|
|
264
|
+
def __init__(
    self,
    config: Config,
    handle_tool_call: Callable[[str, dict], Awaitable[dict]],
    voice: str = "",
) -> None:
    """Record the collaborators and reset every piece of per-call state.

    Nothing is opened here; the Live connection is made in ``__aenter__``.

    Args:
        config: Settings object; ``gemini_api_key`` and ``gemini_live_model``
            are read from it when connecting.
        handle_tool_call: Async callable taking a tool name and an argument
            dict and returning a result dict; stored as ``self._handle``.
        voice: Prebuilt voice name. Empty (the default) leaves the speech
            config unset.
    """
    # -- collaborators and output sinks --------------------------------
    self._voice = voice
    self._handle = handle_tool_call
    self._config = config
    self._text_out: Optional[Callable[[dict], Awaitable[None]]] = None
    self._audio_out: Optional[Callable[[bytes], Awaitable[None]]] = None

    # -- connection ----------------------------------------------------
    self._client: Optional[genai.Client] = None  # type: ignore[name-defined]
    self._session: Optional[genai.live.AsyncSession] = None  # type: ignore[name-defined]
    # The live session is owned by its own exit stack, so it can be closed
    # and reopened (resume) while the operator object itself stays alive.
    self._session_stack: Optional[contextlib.AsyncExitStack] = None
    # Set only while a session is open and usable. It is cleared around a
    # (re)connect so senders wait or drop instead of writing to a half-open
    # socket.
    self._ready = asyncio.Event()
    self._closing = False  # flips once __aexit__ begins; suppresses resume
    # Most recent session-resumption handle issued by the server. It is
    # handed back on (re)connect so a dropped Live connection can continue
    # mid-call; stays None until the server marks a checkpoint resumable.
    self._resume_handle: Optional[str] = None

    # -- half-duplex echo guard ----------------------------------------
    # While Voxa is talking, the phone's playback timeline is modelled here
    # and mic audio is dropped until playback ends (plus a margin), so the
    # speaker's output is never fed back to Gemini as user input.
    self._echo_margin = 0.7  # seconds of extra guard after playback ends
    self._play_until = 0.0  # monotonic time at which the current reply ends

    # -- relay dedupe ---------------------------------------------------
    # The same finished-task confirmation can arrive several times in a row;
    # near-identical repeats inside the window are skipped (see speak()).
    self._speak_dedupe_window = 90.0
    self._last_spoken_at = 0.0
    self._last_spoken = ""

    # -- relay debounce -------------------------------------------------
    # One user action makes Claude's screen settle in stages, so updates
    # arrive as a burst. The burst is accumulated and spoken once, after a
    # short quiet period.
    self._speak_debounce = 2.5  # seconds (the monitor idles ~3.6s apart)
    self._speak_task: Optional[asyncio.Task] = None
    self._pending_speak = ""

    # -- loop guard -----------------------------------------------------
    # send_to_claude is only allowed after a genuine new user turn (spoken
    # or typed). A relayed result is injected as a user turn but is not a
    # real utterance, so it must not license another dispatch.
    self._user_spoke = False
    # Whether the opening greeting has been handled, so Voxa talks first
    # exactly once.
    self._greeted = False
|
|
313
|
+
|
|
314
|
+
# ------------------------------------------------------------------
|
|
315
|
+
# Async context manager: opens the Live session
|
|
316
|
+
# ------------------------------------------------------------------
|
|
317
|
+
|
|
318
|
+
async def __aenter__(self) -> "GeminiOperator":
    """Create the genai client, open the first Live session, and return self.

    Client selection is driven by the environment:

    * ``GOOGLE_GENAI_USE_VERTEXAI`` set to ``1``/``true``/``yes`` (any case,
      surrounding whitespace ignored) selects Vertex AI, using
      ``GOOGLE_CLOUD_PROJECT`` and ``GOOGLE_CLOUD_LOCATION`` (default
      ``us-central1``). This mode is meant for hosts whose IP the Developer
      API geo-blocks.
    * Otherwise the Developer API is used with the configured API key.
    """
    import os

    env = os.environ
    vertex_flag = env.get("GOOGLE_GENAI_USE_VERTEXAI", "").strip().lower()
    if vertex_flag not in ("1", "true", "yes"):
        self._client = genai.Client(api_key=self._config.gemini_api_key)
    else:
        self._client = genai.Client(
            vertexai=True,
            project=env.get("GOOGLE_CLOUD_PROJECT"),
            location=env.get("GOOGLE_CLOUD_LOCATION", "us-central1"),
        )
    await self._open()
    return self
|
|
334
|
+
|
|
335
|
+
def _build_config(self) -> types.LiveConnectConfig:
    """Assemble the Live session config for the next connect.

    Called on every (re)connect so the config always carries the newest
    session-resumption handle.
    """
    # Incoming audio (mic bleed, noise, our own tail) must never cut a reply
    # off mid-sentence; only the phone's "interrupt" button stops playback.
    no_barge_in = types.RealtimeInputConfig(
        activity_handling=types.ActivityHandling.NO_INTERRUPTION,
    )
    # Sliding-window compression prunes the oldest turns rather than ending
    # the audio session at its length cap (~15 min per the original notes),
    # so long calls keep going.
    compression = types.ContextWindowCompressionConfig(
        sliding_window=types.SlidingWindow(),
    )
    # Turning resumption on makes the server emit resume handles. The handle
    # is None on the first connect; `transparent` is deliberately not set
    # (noted in the original as Vertex-only).
    resumption = types.SessionResumptionConfig(handle=self._resume_handle)

    live_config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        system_instruction=SYSTEM_INSTRUCTION,
        tools=[{"function_declarations": TOOL_DECLARATIONS}],
        # Captions: transcribe both the user's speech and Gemini's output.
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        realtime_input_config=no_barge_in,
        context_window_compression=compression,
        session_resumption=resumption,
    )
    if not self._voice:
        return live_config

    # A voice was requested: attach it after construction, leaving the
    # speech config untouched otherwise.
    prebuilt = types.PrebuiltVoiceConfig(voice_name=self._voice)
    live_config.speech_config = types.SpeechConfig(
        voice_config=types.VoiceConfig(prebuilt_voice_config=prebuilt),
    )
    return live_config
|
|
371
|
+
|
|
372
|
+
async def _open(self) -> None:
    """Connect a new Live session and flag the operator as ready.

    The config comes from ``_build_config()``, so a stored resumption handle
    is sent along when there is one. The session's context manager is entered
    through a dedicated ``AsyncExitStack`` which is kept on the instance for
    ``_close_session`` to unwind later.
    """
    connection = self._client.aio.live.connect(
        model=self._config.gemini_live_model,
        config=self._build_config(),
    )
    exit_stack = contextlib.AsyncExitStack()
    self._session = await exit_stack.enter_async_context(connection)
    # Only publish the stack and raise the ready flag once the connection
    # has actually been established.
    self._session_stack = exit_stack
    self._ready.set()
|
|
384
|
+
|
|
385
|
+
async def _close_session(self) -> None:
    """Drop the current Live connection; used before a resume and on exit.

    The ready flag and the session references are cleared first, so nothing
    else picks up the session while it is being unwound. Errors raised while
    closing are deliberately ignored (best-effort teardown).
    """
    self._ready.clear()
    old_stack = self._session_stack
    self._session_stack = None
    self._session = None
    if old_stack is None:
        return  # nothing was open
    with contextlib.suppress(Exception):
        await old_stack.aclose()
|
|
393
|
+
|
|
394
|
+
async def _reconnect(self) -> None:
    """Resume the session on a fresh connection using the latest handle.

    The current connection is torn down, then ``_open()`` is tried up to five
    times with exponential backoff (0.5s, 1s, 2s, 4s between attempts).

    Raises:
        RuntimeError: if every attempt fails, or if the operator starts
            closing (``__aexit__``) while the reconnect is in progress.
    """
    await self._close_session()
    max_attempts = 5
    delay = 0.5
    for attempt in range(1, max_attempts + 1):
        if self._closing:
            break  # shutting down: do not open a connection nobody will close
        try:
            await self._open()
        except Exception as exc:
            logger.warning("Gemini resume attempt %d failed: %s", attempt, exc)
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the error.
            if attempt < max_attempts:
                await asyncio.sleep(delay)
                delay = min(8.0, delay * 2)
            continue
        if self._closing:
            # __aexit__ ran while we were connecting; it has already done its
            # teardown, so release the connection we just opened ourselves.
            await self._close_session()
            break
        logger.info("Gemini session resumed (handle=%s…)", (self._resume_handle or "")[:8])
        return
    if self._closing:
        raise RuntimeError("Gemini session resume aborted: operator is closing")
    raise RuntimeError("Gemini session resume failed after retries")
|
|
409
|
+
|
|
410
|
+
async def _await_ready(self, timeout: float = 10.0) -> bool:
    """Block until a session is open, e.g. across a short resume.

    Returns True only when the ready flag was raised within ``timeout``
    seconds and a session object is present. Returns False straight away
    when the operator is already closing, and False on timeout.
    """
    if not self._closing:
        try:
            await asyncio.wait_for(self._ready.wait(), timeout)
        except asyncio.TimeoutError:
            pass  # fall through to the shared False below
        else:
            return self._session is not None
    return False
|
|
420
|
+
|
|
421
|
+
async def __aexit__(self, *exc) -> bool:
    """Shut the operator down: stop any queued relay and close the session.

    Always returns False, so an exception raised inside the ``async with``
    body keeps propagating.
    """
    self._closing = True  # from here on, waiters give up instead of resuming
    queued_relay = self._speak_task
    if queued_relay and not queued_relay.done():
        queued_relay.cancel()
    await self._close_session()
    return False
|
|
427
|
+
|
|
428
|
+
# ------------------------------------------------------------------
|
|
429
|
+
# Public interface
|
|
430
|
+
# ------------------------------------------------------------------
|
|
431
|
+
|
|
432
|
+
def set_audio_out(self, cb: Callable[[bytes], Awaitable[None]]) -> None:
    """Register the callback that receives 24 kHz PCM audio from Gemini.

    ``cb`` is an async callable taking one ``bytes`` chunk. It replaces any
    callback registered earlier; nothing is invoked by this method itself.
    """
    self._audio_out = cb
|
|
435
|
+
|
|
436
|
+
def set_text_out(self, cb: Callable[[dict], Awaitable[None]]) -> None:
    """Register the callback that receives JSON control/caption messages.

    ``cb`` is an async callable taking one ``dict`` message. It replaces any
    callback registered earlier; nothing is invoked by this method itself.
    """
    self._text_out = cb
|
|
439
|
+
|
|
440
|
+
async def send_audio(self, pcm16k: bytes) -> None:
    """Pass one mic frame (16 kHz mono PCM) on to Gemini, unless it must be dropped.

    A frame is silently discarded when:

    * no session is open or a (re)connect is under way (mic frames are
      continuous, so losing some is harmless);
    * Voxa's reply is still playing, including the echo margin, so the
      speaker output picked up by the mic is not mistaken for the user;
    * the send itself fails (the connection is dropping; the receive loop in
      ``run()`` is responsible for resuming).
    """
    session = self._session
    if session is None or not self._ready.is_set():
        return
    mic_reopens_at = self._play_until + self._echo_margin
    if time.monotonic() < mic_reopens_at:
        return  # half-duplex window still active
    with contextlib.suppress(Exception):
        frame = types.Blob(data=pcm16k, mime_type="audio/pcm;rate=16000")
        await session.send_realtime_input(audio=frame)
|
|
454
|
+
|
|
455
|
+
async def speak(self, text: str, immediate: bool = False) -> None:
    """Queue ``text`` for Gemini to read aloud, with dedupe and debounce.

    Claude's screen settles in stages after one user action, so the same
    finished-update tends to arrive several times in a burst. To avoid
    speaking each one:

    * an update that is near-identical (similarity >= 0.7, ignoring case and
      whitespace) to the last spoken text is skipped while inside the dedupe
      window;
    * everything else is appended to the pending burst and the flush timer is
      restarted, so the burst is spoken once after a quiet period.

    ``immediate`` uses a zero delay instead of the debounce interval (used
    for the on-answer opening so Voxa speaks right away in its own voice).
    Blank or whitespace-only text is ignored.
    """
    collapsed = " ".join((text or "").split())
    if not collapsed:
        return

    previous = self._last_spoken
    if previous:
        age = time.monotonic() - self._last_spoken_at
        if age < self._speak_dedupe_window:
            similarity = difflib.SequenceMatcher(
                None, collapsed.lower(), previous.lower()
            ).ratio()
            if similarity >= 0.7:
                logger.info("speak: skipped near-duplicate update")
                return

    # Fold this update into the burst.
    if self._pending_speak:
        self._pending_speak = f"{self._pending_speak}\n{text}".strip()
    else:
        self._pending_speak = text

    # A relay (greeting, recap, result) is not a user request, so any pending
    # user turn is consumed here, at queue time. Clearing it at flush time
    # instead could wipe out a genuine user turn that arrives while the
    # debounce timer is running.
    self._user_spoke = False

    # Restart the flush timer.
    timer = self._speak_task
    if timer and not timer.done():
        timer.cancel()
    wait = 0.0 if immediate else self._speak_debounce
    self._speak_task = asyncio.create_task(self._flush_speak(wait))
|
|
485
|
+
|
|
486
|
+
async def _flush_speak(self, delay: float | None = None) -> None:
    """After the relays go quiet, speak the accumulated burst as one message.

    Args:
        delay: Seconds to wait before flushing; ``None`` means the configured
            debounce interval. Being cancelled during this wait (a newer relay
            re-armed the timer) ends the flush quietly.

    The pending text is only consumed once the session is ready, and it is
    put back if the send fails, so a reconnect or a dropped socket does not
    lose the update: it stays queued and the next relay re-arms the flush.
    """
    if delay is None:
        delay = self._speak_debounce
    try:
        await asyncio.sleep(delay)
    except asyncio.CancelledError:
        return  # superseded by a newer relay
    if not self._pending_speak:
        return
    # Check readiness BEFORE consuming the queue (see docstring).
    if not await self._await_ready():
        logger.warning("speak deferred; Gemini session not ready (kept pending)")
        return
    text = self._pending_speak
    self._pending_speak = ""

    # Collapse duplicate and near-duplicate (reworded) lines across the burst:
    # one Claude turn settles in stages, each a paraphrase of the same result.
    # A line >= 0.7 similar to one already kept is dropped; new lines survive.
    kept: list[str] = []
    seen: list[str] = []
    for ln in text.split("\n"):
        norm_ln = " ".join(ln.split()).lower()
        if not norm_ln:
            continue
        if any(difflib.SequenceMatcher(None, norm_ln, s).ratio() >= 0.7 for s in seen):
            continue
        kept.append(ln)
        seen.append(norm_ln)
    text = "\n".join(kept)

    # Record the text as spoken before sending, so a duplicate arriving while
    # the send is in flight is still deduped; remember the old marker so it
    # can be restored if the send fails.
    previous_marker = (self._last_spoken, self._last_spoken_at)
    self._last_spoken = " ".join(text.split())
    self._last_spoken_at = time.monotonic()
    try:
        await self._session.send_client_content(
            turns=types.Content(
                role="user",
                parts=[types.Part(text=f"Tell the user: {text}")],
            ),
            turn_complete=True,
        )
    except Exception as exc:
        # Nothing was spoken: undo the dedupe marker (otherwise retries of the
        # same update would be skipped as "already said") and re-queue the
        # text ahead of anything that arrived in the meantime.
        self._last_spoken, self._last_spoken_at = previous_marker
        newer = self._pending_speak
        self._pending_speak = f"{text}\n{newer}" if newer else text
        logger.warning("speak failed; update kept pending: %s", exc)
|
|
528
|
+
|
|
529
|
+
def suppress_greeting(self) -> None:
    """Mark the opening greeting as already done so it is never spoken.

    Intended for calls that are answered with an update already queued for
    relay: Voxa then opens with that ONE contextual message rather than
    greeting first and re-reading the update afterwards.
    """
    # run() only greets while this flag is still false.
    self._greeted = True
|
|
534
|
+
|
|
535
|
+
async def greet(self) -> None:
    """Have Voxa open the call with a brief greeting instead of waiting for the user.

    The greeting is requested by injecting a one-off bracketed directive as a
    user turn at session start. Does nothing when no session is open, and any
    error while sending is deliberately ignored (best effort).
    """
    session = self._session
    if session is None:
        return
    directive = (
        "[The call just connected. Greet the user warmly in ONE short "
        "sentence and ask what they'd like to work on. Speak now; do not "
        "call any tool.]"
    )
    with contextlib.suppress(Exception):
        opening = types.Content(role="user", parts=[types.Part(text=directive)])
        await session.send_client_content(turns=opening, turn_complete=True)
|
|
548
|
+
|
|
549
|
+
def _allow_tool(self, name: str) -> bool:
|
|
550
|
+
"""Loop guard. send_to_claude requires a fresh user turn and consumes it, so
|
|
551
|
+
the agent can't dispatch work to Claude on its own (e.g. after a finished-task
|
|
552
|
+
relay) or split one request into multiple steps. All other tools are free."""
|
|
553
|
+
if name != "send_to_claude":
|
|
554
|
+
return True
|
|
555
|
+
if not self._user_spoke:
|
|
556
|
+
return False
|
|
557
|
+
self._user_spoke = False # consume this user turn
|
|
558
|
+
return True
|
|
559
|
+
|
|
560
|
+
async def send_text(self, text: str) -> None:
    """Send the user's typed message as a normal user turn (like speaking it).

    Args:
        text: The message the user typed.

    If the session is not ready the message is dropped with a warning. A
    failure while sending is logged rather than raised (best effort).
    """
    if not await self._await_ready():
        logger.warning("send_text dropped; Gemini session not ready")
        return
    # A typed command is a real user request, so it arms the send_to_claude
    # guard — but only once we know the message will actually be sent. Arming
    # it before the readiness check would let a DROPPED message authorize a
    # later self-initiated dispatch the user never got to ask for.
    self._user_spoke = True
    try:
        await self._session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text=text)]),
            turn_complete=True,
        )
    except Exception as exc:
        # Keep the call alive, but don't hide the fact that the message was lost.
        logger.warning("send_text failed; message not delivered: %s", exc)
|
|
571
|
+
|
|
572
|
+
async def run(self) -> None:
    """Receive loop: dispatch audio, stream captions, route tool calls.

    ``session.receive()`` yields the messages for a single model turn and then
    ends (it breaks on ``turn_complete``). The outer ``while True`` re-enters it
    to keep listening across turns; it blocks on the socket each call, so this
    does not busy-loop. If the connection drops (GoAway / ~10-min cap), it is
    resumed transparently via the stored handle and the loop continues.

    Raises:
        RuntimeError: If the operator has no open session.
        Exception: Whatever the receive loop raised, when the operator is
            closing or there is no resume handle to reconnect with.
            ``asyncio.CancelledError`` always propagates.
    """
    if self._session is None:
        raise RuntimeError("GeminiOperator is not open; use 'async with'.")

    if not self._greeted:  # Voxa speaks first, once, at session start
        self._greeted = True
        await self.greet()

    while True:
        try:
            async for response in self._session.receive():
                # Session resumption: remember the latest resumable checkpoint so a
                # dropped connection can be reopened mid-call (handle fed back in
                # __aenter__). Only update when the server marks it resumable.
                sru = getattr(response, "session_resumption_update", None)
                if sru is not None and getattr(sru, "resumable", False) and sru.new_handle:
                    self._resume_handle = sru.new_handle

                # GoAway: the server will close this connection shortly (it caps the
                # connection lifetime at ~10 min). Log it; the stored handle is what
                # _reconnect() uses to continue without losing the session.
                ga = getattr(response, "go_away", None)
                if ga is not None:
                    logger.info("Gemini go_away: time_left=%s", getattr(ga, "time_left", "?"))

                # Audio data from Gemini (24 kHz mono PCM)
                if response.data is not None:
                    # Advance the playback timeline by this chunk's real duration
                    # (24kHz, 16-bit mono = 48000 bytes/sec) so the mic stays gated
                    # for as long as the phone will actually be playing it.
                    dur = len(response.data) / 48000.0
                    self._play_until = max(self._play_until, time.monotonic()) + dur
                    if self._audio_out is not None:
                        await self._audio_out(response.data)

                # Live captions: transcripts of the user's speech and Gemini's output
                sc = response.server_content
                if sc is not None:
                    captions = self._text_out
                    if captions is not None:
                        # Barge-in: Gemini stopped its current reply to start a new one.
                        # Tell the phone to drop any buffered audio so the old and new
                        # replies don't play over each other ("multiple things at once").
                        if getattr(sc, "interrupted", False):
                            await captions({"type": "flush_audio"})
                        if sc.output_transcription and sc.output_transcription.text:
                            await captions({
                                "type": "transcript",
                                "role": "agent",
                                "text": sc.output_transcription.text,
                            })
                    heard = sc.input_transcription
                    if heard and heard.text:
                        # A real spoken request just came in. Arm the send_to_claude
                        # guard whether or not a caption sink is attached; otherwise
                        # a caption-less call could never dispatch anything.
                        self._user_spoke = True
                        if captions is not None:
                            await captions({
                                "type": "transcript",
                                "role": "user",
                                "text": heard.text,
                            })

                # Tool/function calls from Gemini
                if response.tool_call is not None:
                    for fc in response.tool_call.function_calls:
                        if not self._allow_tool(fc.name):
                            # Self-initiated dispatch with no new user request: refuse
                            # and tell the model to relay + wait instead of looping.
                            logger.info("suppressed self-initiated %s (no new user turn)", fc.name)
                            result = {
                                "ignored": True,
                                "reason": "No new request from the user since the last "
                                "one. Do NOT send another instruction to Claude or "
                                "split the task into steps yourself; Claude does the "
                                "whole job. Relay Claude's result to the user and ASK "
                                "what they want next; only call send_to_claude after "
                                "the user actually asks for something.",
                            }
                        else:
                            try:
                                result = await self._handle(fc.name, dict(fc.args or {}))
                            except Exception as exc:
                                # Report the failure to the model as the tool result
                                # instead of tearing down the receive loop.
                                logger.exception("handle_tool_call(%s) raised: %s", fc.name, exc)
                                result = {"error": str(exc)}
                        # Every call gets an answer, refused or not, matched by id.
                        await self._session.send_tool_response(
                            function_responses=[
                                types.FunctionResponse(
                                    id=fc.id,
                                    name=fc.name,
                                    response=result,
                                )
                            ]
                        )
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            # The Live connection dropped (GoAway / ~10-min cap / network). Resume
            # on a fresh connection using the stored handle and keep going, so the
            # call survives transparently. With no handle (or while closing) there's
            # nothing to resume — propagate as before and let the call end.
            if self._closing or not self._resume_handle:
                raise
            logger.info("Gemini connection lost (%s); resuming", exc)
            await self._reconnect()
|