voxa-code 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,677 @@
1
+ """Gemini Live operator bridge for Loop.
2
+
3
+ Verified against google-genai 2.10.0. Confirmed Live API names:
4
+ - client.aio.live.connect(model=..., config=...) -> async context manager yielding AsyncSession
5
+ - session.send_realtime_input(audio=types.Blob(...)) -> sends mic PCM
6
+ - session.receive() -> AsyncIterator[types.LiveServerMessage]
7
+ - session.send_tool_response(function_responses=[types.FunctionResponse(...)]) -> sends tool result
8
+ - session.send_client_content(turns=types.Content(...), turn_complete=True) -> inject text turn
9
+
10
+ Deviations from brief:
11
+ - The brief says self._cm = client.aio.live.connect(...) then self._session = await self._cm.__aenter__().
12
+ In 2.10.0, connect() is an @asynccontextmanager (not a plain coroutine returning a CM), so it cannot
13
+ be stored and manually __aenter__'d in the usual way. Instead, the context manager returned by
14
+ connect() is entered through contextlib.AsyncExitStack (see _open), which also owns its teardown.
15
+ Practically: this is equivalent to `async with client.aio.live.connect(...) as session`, held open via
16
+ contextlib.AsyncExitStack so the public GeminiOperator interface (async with / __aenter__/__aexit__)
17
+ is unchanged for Task 5.
18
+ - response.data: confirmed to exist as a property on LiveServerMessage (concatenates inline_data bytes
19
+ from all parts). Brief's description matches the actual 2.10.0 implementation.
20
+ - send_realtime_input in 2.10.0 takes keyword-only args; `audio=types.Blob(...)` is valid.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import asyncio
26
+ import contextlib
27
+ import difflib
28
+ import logging
29
+ import time
30
+ from collections.abc import Awaitable, Callable
31
+ from typing import Optional
32
+
33
+ from google import genai
34
+ from google.genai import types
35
+
36
+ from .config import Config
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # System instruction
42
+ # ---------------------------------------------------------------------------
43
+
44
# The system prompt for the Live session: GeminiOperator._build_config() passes this
# string as ``system_instruction``. It is ONE string built by implicit concatenation of
# adjacent literals, so every fragment must keep its trailing space and the "\n\n"
# prefixes that separate the sections. The tool names mentioned here are the ones the
# model is expected to call; keep them in sync with TOOL_DECLARATIONS.
SYSTEM_INSTRUCTION = (
    # Persona and speaking style.
    "You are Voxa, a concise voice operator that drives Claude Code on the user's machine. "
    "Keep all spoken responses short and natural. "
    # Picking / creating the working folder (set_working_dir, list_dirs, make_dir).
    "\n\nCHOOSING THE FOLDER: the user can either type it in the 'Working folder' field on their "
    "phone, OR just tell you by voice. When they say it by voice, convert it to an absolute path "
    "(expand the home directory to ~, e.g. 'documents folder' -> '~/Documents') and call "
    "set_working_dir with your best guess. If it returns an error, it also returns 'searched_in' and a "
    "list of 'suggestions': tell the user that folder wasn't found and read a few of the suggestions, "
    "then try again with their choice. You can also call list_dirs(parent) to browse what's inside a "
    "folder and read the options aloud. If the user wants to CREATE a new folder, call make_dir with "
    "the full path; it makes the folder and starts the session there. "
    # No self-initiated work right after a session opens.
    "\n\nAFTER OPENING A SESSION, DO NOTHING ON YOUR OWN: once a session opens (or you switch folders), do NOT "
    "call send_to_claude or run any command by yourself. In particular, do NOT list the folder's contents, "
    "summarise, or 'take a look' unasked. Just say the session is ready and ASK what they'd like to do, then wait. "
    "Only call send_to_claude when the user has actually asked for something. "
    # Work outside the project folder goes through send_to_claude, not set_working_dir.
    "\n\nFULL ACCESS: Claude has FULL read/write/execute access to the ENTIRE machine (permissions are "
    "bypassed); it is NOT limited to the project folder. To look at, list, open, read, edit, run, or move "
    "files ANYWHERE, do NOT call set_working_dir, just call send_to_claude with the request including an "
    "absolute path (expand ~). Examples: 'open my Documents and list the files' -> send_to_claude('List all "
    "files and folders in ~/Documents'); 'open that file on my desktop' -> send_to_claude('Open ~/Desktop/<name>'). "
    "This keeps Claude's context and works for any location. Use set_working_dir ONLY when the user explicitly "
    "wants to SWITCH the project they are working in to a different folder (it RESTARTS Claude there and loses "
    "the current chat). NEVER claim you switched, opened, or changed a folder unless the tool actually confirmed it. "
    # Pass the user's request through verbatim, as a single message.
    "\n\nRUNNING TASKS: once a folder is set, call send_to_claude with the user's request EXACTLY as they "
    "said it, WORD FOR WORD. Do NOT add, expand, rephrase, or infer ANYTHING they did not say: no extra "
    "technologies, frameworks, languages, libraries, file names, or details. If the request sounds CUT OFF or "
    "incomplete (the user trailed off, e.g. 'can you create a'), do NOT guess, complete, or invent it (never make "
    "up a file name or its contents); wait and ASK them to finish the request before you call any tool. If they say 'create a browser "
    "game named test_one', send EXACTLY 'create a browser game named test_one', NOT 'create a browser game "
    "named test_one with HTML, CSS, and JavaScript'. Then say something brief like 'On it' while Claude "
    "works. Send the user's request as ONE message and let Claude do ALL of it (Claude creates the files, "
    "writes the code, runs things itself). NEVER split a request into multiple send_to_claude calls and NEVER "
    "send a follow-up step on your own (e.g. 'create the files', 'create the javascript file'). When Claude "
    "finishes, you ONLY speak the result to the user and WAIT; do not call send_to_claude again until the user "
    "asks for the next thing. "
    "The Claude session is persistent for the whole call, so it remembers previous turns: follow-ups "
    "like 'open it' or 'now add a test' work; pass them through verbatim. If send_to_claude reports no "
    "session has started, help the user pick a folder first (by voice or the phone field). "
    # The model's own narration must never be sent to Claude.
    "\n\nCRITICAL - NEVER PUT YOUR OWN WORDS INTO CLAUDE: send_to_claude is ONLY for the user's own "
    "requests. NEVER send your own narration, summaries, confirmations, or descriptions of what Claude did "
    "into send_to_claude. When Claude finishes, you RELAY the result to the USER by SPEAKING it, you do NOT "
    "type it back into Claude. Lines like 'I've created index.html...' are things you SAY to the user, "
    "never things you send to Claude. And never invent or assume what Claude built (e.g. the game type or "
    "which files exist), only state what the screen update actually shows. "
    # How to relay terminal updates and prompts, and which UI chrome to ignore.
    "\n\nLIVE SCREEN UPDATES: you automatically receive messages describing what is currently on Claude's "
    "terminal whenever it stops or pauses. ALWAYS relay these to the user, and NEVER answer on their behalf. "
    "If it is a question, menu, or permission/trust prompt (e.g. 'Do you trust this folder? 1. Yes 2. No', "
    "or 'Allow edit? y/n'), read the options aloud and ASK the user what they want to do. When they answer, "
    "translate their words into the exact input Claude expects and send it with send_to_claude: for a numbered "
    "menu send the number (e.g. user says 'yes, trust it' -> send '1'); for a yes/no send 'y' or 'n'; for a "
    "free-form question send their answer. If it is just Claude's finished result, summarise it in a few "
    "sentences. When unsure whether something needs a decision, ask the user rather than guessing. "
    "IGNORE Claude Code's own interface noise: MCP server status or warnings, tool/status lines, tips, "
    "'what's new', spinners, and the cost/token bar are NOT messages for the user. This includes Claude's "
    "status bar / footer: the model name, the EFFORT level (e.g. 'high', 'xhigh'), usage percentages, and "
    "slash-command hints like '/effort' or '/model'. NEVER read, repeat, comment on, or ASK THE USER ABOUT any "
    "of this UI text. If the screen shows only such chrome and no real answer or question, say nothing about it "
    "and just wait. Never read them aloud or comment on them; only relay Claude's actual answer to the request "
    "or a real question Claude is asking. "
    # Attaching to an already-open terminal (list_terminals, attach_terminal).
    "\n\nEXISTING TERMINALS: if the user wants to work on a terminal/Claude they ALREADY have open "
    "(e.g. 'use my open terminal', 'attach to the one in veil', 'pick from my terminals'), call "
    "list_terminals, read out the controllable ones by their folder, and when they choose call "
    "attach_terminal (by id, or 'match' the folder name, or 'index'). When attach_terminal returns a "
    "'recap' field, it is the recent conversation from THAT terminal's Claude session: use it to briefly "
    "tell the user what they were working on in that terminal and what the last thing was, THEN ask what "
    "they want to do next. Do not read the recap verbatim, summarise it in a sentence or two. After "
    "attaching, drive it exactly like a normal session. If a terminal is reported not controllable, tell "
    "the user it can't be driven unless Claude runs inside tmux. "
    # Answering questions about earlier work (read_session).
    "\n\nSESSION DETAILS: when the user asks about something that happened earlier in the "
    "attached session (what files changed, why a test failed, what was decided), call "
    "read_session (last=N or search='keyword') and answer from what it returns. Summarise "
    "in a few sentences; never read raw transcript dumps, code, or long paths aloud. If "
    "read_session errors, say you could not find that session's history. "
    # Status checks and cancellation (get_claude_status, stop_claude).
    "\n\nYou may call get_claude_status to check progress. If the user says stop or cancel, call stop_claude. "
    "Never read out long raw file paths or code blocks verbatim unless asked."
)
120
+
121
+ # ---------------------------------------------------------------------------
122
+ # Tool declarations (must match orchestrator's handle_tool_call names exactly)
123
+ # ---------------------------------------------------------------------------
124
+
125
def _tool_decl(name, description, properties=None, required=None):
    """Build one function declaration in the JSON-schema shape the Live API expects.

    ``properties`` maps argument name -> schema dict; ``required`` lists mandatory
    argument names and is omitted from the schema entirely when empty.
    """
    schema = {"type": "object", "properties": dict(properties or {})}
    if required:
        schema["required"] = list(required)
    return {"name": name, "description": description, "parameters": schema}


# Function declarations offered to the model (wired in through
# LiveConnectConfig.tools). Each name is forwarded verbatim to the
# handle_tool_call callback, so it must match the orchestrator's tool names.
TOOL_DECLARATIONS = [
    _tool_decl(
        "start_claude_session",
        "Start a Claude Code session in a working directory.",
        properties={
            "working_dir": {
                "type": "string",
                "description": "Absolute or ~-relative path to the project folder.",
            },
        },
        required=["working_dir"],
    ),
    _tool_decl(
        "send_to_claude",
        "Send a prompt to the active Claude session. "
        "Returns immediately; the result is spoken later.",
        properties={"text": {"type": "string"}},
        required=["text"],
    ),
    _tool_decl(
        "get_claude_status",
        "Check whether Claude is idle, working, finished, or errored.",
    ),
    _tool_decl(
        "set_working_dir",
        "Set/Change the working directory for the Claude session (accepts ~-relative paths). "
        "On failure returns 'searched_in' and 'suggestions' to read back to the user.",
        properties={"path": {"type": "string"}},
        required=["path"],
    ),
    _tool_decl(
        "list_dirs",
        "List the subdirectories inside a folder, to help the user choose by voice.",
        properties={
            "parent": {"type": "string", "description": "Folder to list (~-relative ok)."},
        },
        required=["parent"],
    ),
    _tool_decl(
        "make_dir",
        "Create a new folder (and parents) then start the Claude session inside it.",
        properties={
            "path": {"type": "string", "description": "Full path of the new folder (~-relative ok)."},
        },
        required=["path"],
    ),
    _tool_decl(
        "stop_claude",
        "Cancel the current Claude run.",
    ),
    _tool_decl(
        "list_terminals",
        "List the Claude sessions the user already has open in their terminals "
        "(iTerm2, tmux, ...). Returns each with a label (its folder) and whether it "
        "is controllable. Also shows them on the phone as a tappable list.",
    ),
    _tool_decl(
        "attach_terminal",
        "Attach to one of the open Claude terminals from list_terminals and drive it. "
        "Identify it by 'id', or by 'match' (part of its folder name), or 'index' (1-based).",
        properties={
            "id": {"type": "string"},
            "match": {"type": "string", "description": "Part of the folder/label to match."},
            "index": {"type": "integer", "description": "1-based position in the last list."},
        },
    ),
    _tool_decl(
        "read_session",
        "Read the attached Claude session's full transcript on demand. "
        "Use when the user asks about details of past work in this session "
        "(what changed, why something failed, what was decided). "
        "Pass last=N for the most recent N messages, or search='text' to "
        "find messages mentioning something.",
        properties={
            "last": {"type": "integer", "description": "How many recent messages (max 40)."},
            "search": {"type": "string", "description": "Find messages containing this text."},
        },
    ),
]
248
+
249
+ # ---------------------------------------------------------------------------
250
+ # GeminiOperator
251
+ # ---------------------------------------------------------------------------
252
+
253
+
254
class GeminiOperator:
    """Bridges a phone call to a Gemini Live realtime voice session.

    The operator owns one Live connection at a time. It forwards mic audio to
    Gemini, hands Gemini's audio and captions to the registered callbacks, routes
    tool calls to ``handle_tool_call`` and, when the connection drops, reopens it
    with the latest session-resumption handle.

    Usage::

        async with GeminiOperator(config, handle_tool_call) as op:
            op.set_audio_out(send_to_phone)
            await asyncio.gather(op.run(), mic_pump(op))
    """

    def __init__(
        self,
        config: Config,
        handle_tool_call: Callable[[str, dict], Awaitable[dict]],
        voice: str = "",
    ) -> None:
        """Store configuration and initialise all per-call state (no I/O here).

        Args:
            config: Supplies ``gemini_api_key`` and ``gemini_live_model``.
            handle_tool_call: Awaited as ``handle_tool_call(name, args)`` for every
                permitted tool call; its dict result is sent back to the model.
            voice: Optional prebuilt voice name; empty keeps the API default.
        """
        self._config = config
        self._handle = handle_tool_call
        self._voice = voice
        self._audio_out: Optional[Callable[[bytes], Awaitable[None]]] = None
        self._text_out: Optional[Callable[[dict], Awaitable[None]]] = None
        self._session: Optional[genai.live.AsyncSession] = None  # type: ignore[name-defined]
        self._client: Optional[genai.Client] = None  # type: ignore[name-defined]
        # The active session lives in its own stack so it can be torn down and
        # reopened (resume) independently of the operator's lifetime.
        self._session_stack: Optional[contextlib.AsyncExitStack] = None
        # Set while a session is open and usable; cleared during a (re)connect so
        # senders drop/await instead of writing to a half-open socket.
        self._ready = asyncio.Event()
        self._closing = False  # True once __aexit__ starts (suppress resume)
        # Server-side half-duplex: while Voxa is speaking we model the phone's
        # realtime playback timeline and DROP mic audio until it finishes (+margin),
        # so Voxa's own voice off the speaker is never fed back to Gemini as "user
        # input". Robust regardless of the app build.
        self._play_until = 0.0  # monotonic time the current reply finishes playing
        self._echo_margin = 0.7  # extra guard after playback ends (s)
        # Latest session-resumption handle from the server (see run()). Passed back on
        # (re)connect so a dropped Live connection can resume mid-call. None until the
        # server first marks a checkpoint resumable.
        self._resume_handle: Optional[str] = None
        # Dedupe relayed updates (see speak): the same finished-task confirmation can
        # be pushed several times in a row (a self-interruption/echo loop re-triggers
        # the task), which reads aloud as a stutter. Skip near-identical repeats.
        self._last_spoken = ""
        self._last_spoken_at = 0.0
        self._speak_dedupe_window = 90.0
        # Debounce relays: one user action makes Claude's screen settle in stages, so
        # the finished-update fires several times in a burst. Without coalescing,
        # Gemini speaks a confirmation for EACH (the "again and again" repetition). We
        # accumulate a burst and speak ONE summary after a brief quiet window.
        self._pending_speak = ""
        self._speak_task: Optional[asyncio.Task] = None
        self._speak_debounce = 2.5  # coalesce a settling burst (monitor idles ~3.6s apart)
        # Loop guard: send_to_claude may only fire after a genuine NEW user turn
        # (spoken or typed). A finished-task relay is injected as a user turn but is
        # NOT a real utterance, so it can't license another dispatch. This stops the
        # agent from auto-continuing / decomposing a task into repeated send_to_claude.
        self._user_spoke = False
        self._greeted = False  # speak an opening greeting once, so Voxa talks first

    # ------------------------------------------------------------------
    # Async context manager: opens the Live session
    # ------------------------------------------------------------------

    async def __aenter__(self) -> "GeminiOperator":
        """Create the API client and open the first Live connection."""
        # Vertex AI mode (set on hosts whose IP the Developer API geo-blocks): auth
        # by service account, no IP-location check. Falls back to the Developer API
        # (api key) everywhere else. Env: GOOGLE_GENAI_USE_VERTEXAI + project/location.
        import os
        if os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "").strip().lower() in ("1", "true", "yes"):
            client = genai.Client(
                vertexai=True,
                project=os.environ.get("GOOGLE_CLOUD_PROJECT"),
                location=os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1"),
            )
        else:
            client = genai.Client(api_key=self._config.gemini_api_key)
        self._client = client
        await self._open()
        return self

    def _build_config(self) -> types.LiveConnectConfig:
        """The Live session config. Rebuilt on every (re)connect so it carries the
        latest session-resumption handle."""
        cfg = types.LiveConnectConfig(
            response_modalities=["AUDIO"],
            system_instruction=SYSTEM_INSTRUCTION,
            tools=[{"function_declarations": TOOL_DECLARATIONS}],
            # Live captions: transcribe both the user's speech and Gemini's spoken output.
            input_audio_transcription=types.AudioTranscriptionConfig(),
            output_audio_transcription=types.AudioTranscriptionConfig(),
            # Never let incoming audio (mic bleed, noise, our own tail) cut off a
            # reply mid-sentence. Each reply finishes fully -> no overlap, no
            # accidental interruptions. The phone's "interrupt" button is the only
            # way to stop playback.
            realtime_input_config=types.RealtimeInputConfig(
                activity_handling=types.ActivityHandling.NO_INTERRUPTION,
            ),
            # Survive Gemini Live's caps on long calls. Context-window compression
            # (sliding window) prunes the oldest turns instead of ending the audio
            # session at its ~15-min limit, so a long conversation keeps going.
            context_window_compression=types.ContextWindowCompressionConfig(
                sliding_window=types.SlidingWindow(),
            ),
            # Enable session resumption so the server emits resume handles (captured in
            # run()). Gemini refreshes the underlying connection roughly every ~10 min;
            # the handle lets _reconnect() pick the session back up. `transparent` is
            # left off (Vertex-only); `handle` is None on the first connect.
            session_resumption=types.SessionResumptionConfig(handle=self._resume_handle),
        )
        if self._voice:
            cfg.speech_config = types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice)
                )
            )
        return cfg

    async def _open(self) -> None:
        """Open a fresh Live connection (resuming via the stored handle if present)
        and mark the session ready."""
        stack = contextlib.AsyncExitStack()
        self._session = await stack.enter_async_context(
            self._client.aio.live.connect(
                model=self._config.gemini_live_model,
                config=self._build_config(),
            )
        )
        # Only publish the stack once the connection is actually entered.
        self._session_stack = stack
        self._ready.set()

    async def _close_session(self) -> None:
        """Tear down the current Live connection (used before a resume and on exit)."""
        self._ready.clear()
        stack, self._session_stack = self._session_stack, None
        self._session = None
        if stack is not None:
            # Best effort: the socket may already be dead, which is why we're here.
            with contextlib.suppress(Exception):
                await stack.aclose()

    async def _reconnect(self) -> None:
        """Resume the session on a fresh connection using the latest handle.

        Retries up to five times with exponential backoff (0.5 s doubling, capped at
        8 s). No backoff is taken after the final attempt, and the loop gives up as
        soon as the operator starts closing so no connection is opened (and leaked)
        after ``__aexit__``.

        Raises:
            RuntimeError: if the session could not be resumed (chained to the last
                connection error) or the operator is closing.
        """
        await self._close_session()
        max_attempts = 5
        delay = 0.5
        last_exc: Optional[Exception] = None
        for attempt in range(1, max_attempts + 1):
            if self._closing:
                raise RuntimeError("Gemini session resume aborted; operator is closing")
            try:
                await self._open()
            except Exception as exc:
                last_exc = exc
                logger.warning("Gemini resume attempt %d failed: %s", attempt, exc)
            else:
                if self._closing:
                    # __aexit__ ran while we were connecting: don't leave this open.
                    await self._close_session()
                    raise RuntimeError("Gemini session resume aborted; operator is closing")
                logger.info("Gemini session resumed (handle=%s…)", (self._resume_handle or "")[:8])
                return
            if attempt < max_attempts:
                await asyncio.sleep(delay)
                delay = min(8.0, delay * 2)
        raise RuntimeError("Gemini session resume failed after retries") from last_exc

    async def _await_ready(self, timeout: float = 10.0) -> bool:
        """Wait for an open session (e.g. through a brief resume). False if we're
        closing or it didn't come back in time."""
        if self._closing:
            return False
        try:
            await asyncio.wait_for(self._ready.wait(), timeout)
        except asyncio.TimeoutError:
            return False
        return self._session is not None

    async def __aexit__(self, *exc) -> bool:
        """Stop any pending relay and close the Live connection.

        Returns False so an exception raised inside the ``async with`` body propagates.
        """
        self._closing = True
        if self._speak_task and not self._speak_task.done():
            self._speak_task.cancel()
        await self._close_session()
        return False

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def set_audio_out(self, cb: Callable[[bytes], Awaitable[None]]) -> None:
        """Register the callback that receives 24 kHz PCM audio from Gemini."""
        self._audio_out = cb

    def set_text_out(self, cb: Callable[[dict], Awaitable[None]]) -> None:
        """Register the callback that receives JSON control/caption messages."""
        self._text_out = cb

    async def send_audio(self, pcm16k: bytes) -> None:
        """Forward a mic audio frame (16 kHz mono PCM) to Gemini, EXCEPT while Voxa
        is still speaking (so the speaker's output captured by the mic isn't fed
        back and mistaken for the user)."""
        if self._session is None or not self._ready.is_set():
            return  # dropped during a (re)connect; mic frames are continuous, safe to drop
        if time.monotonic() < self._play_until + self._echo_margin:
            return  # half-duplex: drop mic while the reply is still playing
        try:
            await self._session.send_realtime_input(
                audio=types.Blob(data=pcm16k, mime_type="audio/pcm;rate=16000")
            )
        except Exception:
            return  # connection dropping; run()'s receive loop handles the resume

    async def speak(self, text: str, immediate: bool = False) -> None:
        """Relay text for Gemini to read aloud, DEBOUNCED and DEDUPED.

        ``immediate`` skips the debounce window (used for the on-answer opening, so
        Voxa speaks in its own voice right away instead of the phone's fallback voice).

        One user action makes Claude's screen settle in stages, firing the
        finished-update several times in a burst; speaking each one is the "again and
        again" repetition. So we accumulate the burst and speak ONE summary after a
        brief quiet window, and skip a relay near-identical to the last thing we spoke
        (a cross-action duplicate)."""
        norm = " ".join((text or "").split())
        if not norm:
            return
        now = time.monotonic()
        if self._last_spoken and now - self._last_spoken_at < self._speak_dedupe_window:
            if difflib.SequenceMatcher(None, norm.lower(), self._last_spoken.lower()).ratio() >= 0.7:
                logger.info("speak: skipped near-duplicate update")
                return
        # Accumulate this update and (re)arm the debounce timer; the burst becomes one.
        self._pending_speak = f"{self._pending_speak}\n{text}".strip() if self._pending_speak else text
        # A relay (greeting/recap/result) is NOT a user request: consume any pending
        # user turn NOW, at queue time. Doing it later in _flush_speak could clear a
        # genuine user turn that arrives during the debounce window (blocking the loop
        # guard from dispatching the user's real request).
        self._user_spoke = False
        if self._speak_task and not self._speak_task.done():
            self._speak_task.cancel()
        delay = 0.0 if immediate else self._speak_debounce
        self._speak_task = asyncio.create_task(self._flush_speak(delay))

    async def _flush_speak(self, delay: float | None = None) -> None:
        """After the relays go quiet, speak the accumulated burst as one message.

        Args:
            delay: Quiet window in seconds; ``None`` uses the default debounce.

        A failed send is logged and the dedupe state is rolled back, so the relay is
        not remembered as "spoken" and an identical retry is not suppressed.
        """
        if delay is None:
            delay = self._speak_debounce
        try:
            await asyncio.sleep(delay)
        except asyncio.CancelledError:
            return  # superseded by a newer speak(); that call re-armed the timer
        if not self._pending_speak:
            return
        # Check readiness BEFORE consuming _pending_speak, so a reconnect mid-debounce
        # doesn't lose the message — it stays queued and the next relay re-arms the flush.
        if not await self._await_ready():
            logger.warning("speak deferred; Gemini session not ready (kept pending)")
            return
        session = self._session
        text = self._pending_speak
        self._pending_speak = ""
        # Collapse duplicate AND near-duplicate (reworded) lines across the burst: one
        # Claude turn settles in stages, each a paraphrased re-narration of the same
        # result, which reads aloud as the "repeats the same thing" stutter. Drop a line
        # that is >=0.7 similar to one already kept; genuinely new lines survive.
        out: list[str] = []
        seen: list[str] = []
        for ln in text.split("\n"):
            norm_ln = " ".join(ln.split()).lower()
            if not norm_ln:
                continue
            if any(difflib.SequenceMatcher(None, norm_ln, s).ratio() >= 0.7 for s in seen):
                continue
            out.append(ln)
            seen.append(norm_ln)
        text = "\n".join(out)
        # Record the relay up front so a speak() racing with the send below is deduped
        # against it, but remember the previous state in case the send fails.
        previous = (self._last_spoken, self._last_spoken_at)
        self._last_spoken = " ".join(text.split())
        self._last_spoken_at = time.monotonic()
        try:
            await session.send_client_content(
                turns=types.Content(
                    role="user",
                    parts=[types.Part(text=f"Tell the user: {text}")],
                ),
                turn_complete=True,
            )
        except Exception as exc:
            # Still best-effort (never crash the call), but don't pretend it was spoken.
            self._last_spoken, self._last_spoken_at = previous
            logger.warning("speak: could not send relay to Gemini: %s", exc)

    def suppress_greeting(self) -> None:
        """Skip the automatic opening greeting. Used when there is a queued update to
        relay on answer, so Voxa speaks ONE contextual opening instead of greeting and
        then re-reading the update."""
        self._greeted = True

    async def greet(self) -> None:
        """Speak a short opening greeting so Voxa talks first, without waiting for the
        user. Injected as a one-off directive at session start."""
        if self._session is None:
            return
        with contextlib.suppress(Exception):
            await self._session.send_client_content(
                turns=types.Content(role="user", parts=[types.Part(text=(
                    "[The call just connected. Greet the user warmly in ONE short "
                    "sentence and ask what they'd like to work on. Speak now; do not "
                    "call any tool.]"))]),
                turn_complete=True,
            )

    def _allow_tool(self, name: str) -> bool:
        """Loop guard. send_to_claude requires a fresh user turn and consumes it, so
        the agent can't dispatch work to Claude on its own (e.g. after a finished-task
        relay) or split one request into multiple steps. All other tools are free."""
        if name != "send_to_claude":
            return True
        if not self._user_spoke:
            return False
        self._user_spoke = False  # consume this user turn
        return True

    async def send_text(self, text: str) -> None:
        """Send the user's typed message as a normal user turn (like speaking it)."""
        self._user_spoke = True  # a typed command is a real user request
        if not await self._await_ready():
            logger.warning("send_text dropped; Gemini session not ready")
            return
        with contextlib.suppress(Exception):
            await self._session.send_client_content(
                turns=types.Content(role="user", parts=[types.Part(text=text)]),
                turn_complete=True,
            )

    async def run(self) -> None:
        """Receive loop: dispatch audio, stream captions, route tool calls.

        ``session.receive()`` yields the messages for a single model turn and then
        ends (it breaks on ``turn_complete``). The outer ``while True`` re-enters it
        to keep listening across turns; it blocks on the socket each call, so this
        does not busy-loop. If the connection drops (GoAway / ~10-min cap), it is
        resumed transparently via the stored handle and the loop continues.

        Raises:
            RuntimeError: if the operator was never opened, or a resume ultimately
                failed.
            Exception: the original connection error when there is no resume handle
                yet or the operator is closing.
        """
        if self._session is None:
            raise RuntimeError("GeminiOperator is not open; use 'async with'.")

        if not self._greeted:  # Voxa speaks first, once, at session start
            self._greeted = True
            await self.greet()

        while True:
            try:
                async for response in self._session.receive():
                    await self._handle_response(response)
            except asyncio.CancelledError:
                raise
            except Exception as exc:
                # The Live connection dropped (GoAway / ~10-min cap / network). Resume
                # on a fresh connection using the stored handle and keep going, so the
                # call survives transparently. With no handle (or while closing) there's
                # nothing to resume — propagate as before and let the call end.
                # NOTE(review): an exception raised by the audio/text callbacks also
                # lands here and is handled as if the connection had dropped.
                if self._closing or not self._resume_handle:
                    raise
                logger.info("Gemini connection lost (%s); resuming", exc)
                await self._reconnect()

    async def _handle_response(self, response) -> None:
        """Process one server message: resume handle, GoAway, audio, captions, tools."""
        # Session resumption: remember the latest resumable checkpoint so a dropped
        # connection can be reopened mid-call (the handle is fed back to the server by
        # _build_config() on the next _open()). Only update when marked resumable.
        sru = getattr(response, "session_resumption_update", None)
        if sru is not None and getattr(sru, "resumable", False) and sru.new_handle:
            self._resume_handle = sru.new_handle
        # GoAway: the server will close this connection shortly (it caps the
        # connection lifetime at ~10 min). Log it; the stored handle is what
        # _reconnect() uses to continue without losing the session.
        ga = getattr(response, "go_away", None)
        if ga is not None:
            logger.info("Gemini go_away: time_left=%s", getattr(ga, "time_left", "?"))

        # Audio data from Gemini (24 kHz mono PCM)
        audio = response.data
        if audio is not None:
            # Advance the playback timeline by this chunk's real duration
            # (24kHz, 16-bit mono = 48000 bytes/sec) so the mic stays gated
            # for as long as the phone will actually be playing it.
            dur = len(audio) / 48000.0
            self._play_until = max(self._play_until, time.monotonic()) + dur
            if self._audio_out is not None:
                await self._audio_out(audio)

        sc = response.server_content
        if sc is not None:
            await self._handle_server_content(sc)

        # Tool/function calls from Gemini
        if response.tool_call is not None:
            await self._handle_tool_calls(response.tool_call)

    async def _handle_server_content(self, sc) -> None:
        """Emit live captions and record that the user really spoke.

        The loop-guard flag is set whether or not a caption callback is registered:
        the callback is optional, and without this a spoken request would never
        license ``send_to_claude``.
        """
        emit = self._text_out
        if emit is not None:
            # Barge-in: Gemini stopped its current reply to start a new one.
            # Tell the phone to drop any buffered audio so the old and new
            # replies don't play over each other ("multiple things at once").
            if getattr(sc, "interrupted", False):
                await emit({"type": "flush_audio"})
            if sc.output_transcription and sc.output_transcription.text:
                await emit({
                    "type": "transcript",
                    "role": "agent",
                    "text": sc.output_transcription.text,
                })
        if sc.input_transcription and sc.input_transcription.text:
            self._user_spoke = True  # a real spoken request just came in
            if emit is not None:
                await emit({
                    "type": "transcript",
                    "role": "user",
                    "text": sc.input_transcription.text,
                })

    async def _handle_tool_calls(self, tool_call) -> None:
        """Run (or refuse) each requested function call and send its result back."""
        # function_calls can be absent on the message; treat that as "no calls".
        for fc in tool_call.function_calls or ():
            if not self._allow_tool(fc.name):
                # Self-initiated dispatch with no new user request: refuse
                # and tell the model to relay + wait instead of looping.
                logger.info("suppressed self-initiated %s (no new user turn)", fc.name)
                result = {
                    "ignored": True,
                    "reason": "No new request from the user since the last "
                              "one. Do NOT send another instruction to Claude or "
                              "split the task into steps yourself; Claude does the "
                              "whole job. Relay Claude's result to the user and ASK "
                              "what they want next; only call send_to_claude after "
                              "the user actually asks for something.",
                }
            else:
                try:
                    result = await self._handle(fc.name, dict(fc.args or {}))
                except Exception as exc:
                    # Report the failure to the model instead of killing the call.
                    logger.exception("handle_tool_call(%s) raised: %s", fc.name, exc)
                    result = {"error": str(exc)}
            await self._session.send_tool_response(
                function_responses=[
                    types.FunctionResponse(
                        id=fc.id,
                        name=fc.name,
                        response=result,
                    )
                ]
            )