ai-interview-assistant 2.2.2__tar.gz → 2.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/PKG-INFO +1 -1
  2. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/pyproject.toml +1 -1
  3. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/__init__.py +1 -1
  4. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/capture.py +173 -38
  5. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/transcriber.py +51 -11
  6. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/daemon.py +33 -0
  7. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/PKG-INFO +1 -1
  8. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/SOURCES.txt +2 -1
  9. ai_interview_assistant-2.2.4/tests/test_transcription_phase.py +149 -0
  10. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/README.md +0 -0
  11. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/setup.cfg +0 -0
  12. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/__main__.py +0 -0
  13. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/ai_client.py +0 -0
  14. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/__init__.py +0 -0
  15. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/buffer.py +0 -0
  16. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/cli.py +0 -0
  17. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/config.py +0 -0
  18. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/__init__.py +0 -0
  19. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/__main__.py +0 -0
  20. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/app.py +0 -0
  21. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/__init__.py +0 -0
  22. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/dashboard.py +0 -0
  23. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/hotkeys.py +0 -0
  24. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/scripts.py +0 -0
  25. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/settings.py +0 -0
  26. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/hotkey_config.py +0 -0
  27. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/hotkeys.py +0 -0
  28. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/i18n.py +0 -0
  29. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/llm_clients.py +0 -0
  30. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/menubar.py +0 -0
  31. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/metrics.py +0 -0
  32. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/ollama_utils.py +0 -0
  33. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/overlay.py +0 -0
  34. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/screenshot.py +0 -0
  35. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/__init__.py +0 -0
  36. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/app.py +0 -0
  37. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/routes.py +0 -0
  38. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/websocket.py +0 -0
  39. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/state.py +0 -0
  40. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/utils.py +0 -0
  41. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/watchdog.py +0 -0
  42. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/dependency_links.txt +0 -0
  43. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/entry_points.txt +0 -0
  44. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/requires.txt +0 -0
  45. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/top_level.txt +0 -0
  46. {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/tests/test_llm_clients.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-interview-assistant
3
- Version: 2.2.2
3
+ Version: 2.2.4
4
4
  Summary: Ghost background AI assistant for live code challenges
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: click>=8.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ai-interview-assistant"
7
- version = "2.2.2"
7
+ version = "2.2.4"
8
8
  description = "Ghost background AI assistant for live code challenges"
9
9
  requires-python = ">=3.11"
10
10
  dependencies = [
@@ -1,3 +1,3 @@
1
1
  """AI Interview Assistant — ghost background tool for live code challenges."""
2
2
 
3
- __version__ = "2.2.2"
3
+ __version__ = "2.2.4"
@@ -285,12 +285,50 @@ class ScreenCaptureAudio:
285
285
  # Combined capture: mic + system audio
286
286
  # ---------------------------------------------------------------------------
287
287
 
288
+ class _QueueAudioSource:
289
+ """Minimal capture-like adapter over a single Queue.
290
+
291
+ Exposes the same ``get_audio`` / ``flush`` surface a DeepgramTranscriber pulls
292
+ from, so a second transcriber can consume the candidate (mic) stream from its
293
+ own queue without knowing about CombinedAudioCapture internals.
294
+ """
295
+
296
+ def __init__(self, queue: "Queue") -> None:
297
+ self._queue = queue
298
+
299
+ def get_audio(self, timeout: float = 0.1) -> Optional[np.ndarray]:
300
+ try:
301
+ return self._queue.get(timeout=timeout)
302
+ except Empty:
303
+ return None
304
+
305
+ def flush(self) -> int:
306
+ dropped = 0
307
+ while True:
308
+ try:
309
+ self._queue.get_nowait()
310
+ dropped += 1
311
+ except Empty:
312
+ return dropped
313
+
314
+
288
315
  class CombinedAudioCapture:
289
316
  """Mixes microphone + system audio and feeds a Queue for the transcriber."""
290
317
 
318
+ # Each queued item is one 10ms chunk (see _mix_loop). Keep the transcriber in
319
+ # real-time phase: if the consumer (Deepgram feed loop) falls behind, drop the
320
+ # oldest chunks so emitted transcripts stay live instead of drifting late.
321
+ _STALL_TRIM_CHUNKS = 200 # ~2.0s of backlog triggers a trim
322
+ _STALL_TAIL_CHUNKS = 50 # leave ~0.5s of live tail after trimming
323
+
291
324
  def __init__(self, sample_rate: int = SAMPLE_RATE) -> None:
292
325
  self.sample_rate = sample_rate
293
326
  self.audio_queue: Queue[np.ndarray] = Queue()
327
+ # Separate queue carrying the microphone (candidate) stream when system
328
+ # audio is the primary source. Only fed while a mic transcriber is active
329
+ # (see _mic_routing) so it never grows unbounded when unused.
330
+ self.mic_queue: Queue[np.ndarray] = Queue()
331
+ self._mic_routing = False
294
332
  self._stop_event = Event()
295
333
  self._mic: Optional[MicrophoneCapture] = None
296
334
  self._sys: Optional[ScreenCaptureAudio] = None
@@ -323,58 +361,155 @@ class CombinedAudioCapture:
323
361
  only the interviewer's voice should be transcribed automatically.
324
362
  The microphone is only used for manual recordings (2x ESC hold).
325
363
  Falls back to mic-only if system audio is unavailable.
364
+
365
+ Latency design: each iteration drains the capture queues **fully** and
366
+ **non-blocking**, then emits **every** complete chunk. The previous design
367
+ blocked up to 20ms on the (discarded) mic queue and emitted only one chunk
368
+ per iteration — so whenever the loop dipped below the capture rate, audio
369
+ piled up in ``sys_buf`` (capped at 1s by its maxlen) and produced a steady,
370
+ invisible transcription lag. Draining fully keeps ``sys_buf`` near-empty so
371
+ audio reaches Deepgram in real time.
326
372
  """
327
373
  chunk_size = int(self.sample_rate * 0.01) # 10ms — forward audio to Deepgram faster
328
374
  mic_buf: deque = deque(maxlen=self.sample_rate)
329
375
  sys_buf: deque = deque(maxlen=self.sample_rate)
330
- _MAX_QUEUE_SIZE = self.sample_rate * 30 # ~30s of audio samples
331
376
  _last_drain_warn = 0.0
377
+ _trim_events = 0 # trims since last log — distinguishes a one-off spike
378
+ _trim_chunks = 0 # chunks dropped since last log from a chronic stall
332
379
 
333
380
  while not self._stop_event.is_set():
334
- # Always drain mic queue to prevent buffer buildup, but discard when system audio is active
335
- mic_chunk = self._mic.get_audio(timeout=0.02) if self._mic else None
336
- if mic_chunk is not None and not self._has_system_audio:
337
- mic_buf.extend(mic_chunk.flatten())
381
+ # Drain the mic queue fully (non-blocking) so it never backs up. Keep
382
+ # the samples when mic is the fallback primary, or when a mic
383
+ # transcriber is routing the candidate stream; otherwise drain-and-
384
+ # discard.
385
+ if self._mic:
386
+ keep_mic = (not self._has_system_audio) or self._mic_routing
387
+ while True:
388
+ mic_chunk = self._mic.get_audio(timeout=0)
389
+ if mic_chunk is None:
390
+ break
391
+ if keep_mic:
392
+ mic_buf.extend(mic_chunk.flatten())
338
393
 
394
+ # Drain the system-audio queue fully (non-blocking).
339
395
  if self._sys and self._sys.is_running:
340
- sys_chunk = self._sys.get_audio(timeout=0.02)
341
- if sys_chunk is not None:
396
+ while True:
397
+ sys_chunk = self._sys.get_audio(timeout=0)
398
+ if sys_chunk is None:
399
+ break
342
400
  sys_buf.extend(sys_chunk.flatten())
343
401
 
402
+ # Emit EVERY complete chunk this iteration (system audio decoupled from
403
+ # mic — it never waits on the mic having a chunk ready).
344
404
  if self._has_system_audio:
345
- # System audio only — speaker voice, no mic bleed
346
- if len(sys_buf) >= chunk_size:
347
- sys_data = np.array(
348
- [sys_buf.popleft() for _ in range(min(chunk_size, len(sys_buf)))],
349
- dtype=np.float32,
350
- )
351
- self.audio_queue.put(sys_data)
405
+ produced = self._emit_all_chunks(sys_buf, chunk_size, self.audio_queue)
406
+ # Candidate's own voice → separate queue for the mic transcriber.
407
+ if self._mic_routing:
408
+ produced += self._emit_all_chunks(mic_buf, chunk_size, self.mic_queue)
352
409
  else:
353
- # Fallback: mic only (no system audio available)
354
- if len(mic_buf) >= chunk_size:
355
- mic_data = np.array(
356
- [mic_buf.popleft() for _ in range(min(chunk_size, len(mic_buf)))],
357
- dtype=np.float32,
358
- )
359
- self.audio_queue.put(mic_data)
410
+ produced = self._emit_all_chunks(mic_buf, chunk_size, self.audio_queue)
411
+
412
+ # Stall-recovery: if the downstream consumer (Deepgram feed loop) has
413
+ # fallen behind real-time, trim oldest queued chunks so transcripts stay
414
+ # live. Aggregate over a 30s window so a chronic stall (many trims) is
415
+ # distinguishable from a one-off spike and escalates to WARNING.
416
+ trimmed = self._trim_stale_backlog()
417
+ if trimmed:
418
+ _trim_events += 1
419
+ _trim_chunks += trimmed
420
+ # Bound the secondary (mic) queue too — if its transcriber dies (no
421
+ # Whisper fallback) routing stays on, so without this it would grow
422
+ # without limit. Same drop-oldest-to-stay-live policy as the primary.
423
+ if self._mic_routing:
424
+ self._trim_queue(self.mic_queue)
425
+ now = time.time()
426
+ if _trim_events and now - _last_drain_warn > 30:
427
+ import logging
428
+ _log = logging.getLogger(__name__)
429
+ _msg = ("Audio backlog trimmed %d times (~%.1fs dropped) in last 30s "
430
+ "— transcriber feed behind real-time")
431
+ if _trim_events >= 50:
432
+ _log.warning(_msg, _trim_events, _trim_chunks * 0.01)
433
+ else:
434
+ _log.info(_msg, _trim_events, _trim_chunks * 0.01)
435
+ _last_drain_warn = now
436
+ _trim_events = 0
437
+ _trim_chunks = 0
438
+
439
+ # Yield the CPU only when there was nothing to forward — the gets above
440
+ # no longer block, so this prevents a busy-spin during silence while
441
+ # keeping zero added latency whenever audio is actively flowing.
442
+ if not produced:
443
+ time.sleep(0.005)
444
+
445
+ def _emit_all_chunks(self, buf: "deque", chunk_size: int, queue: "Queue") -> int:
446
+ """Pop every complete ``chunk_size`` window from ``buf`` and forward it to
447
+ ``queue``. Returns the number of chunks emitted.
448
+
449
+ Draining all complete chunks (not just one per call) is what keeps the
450
+ local buffer from accumulating a hidden backlog when the producer briefly
451
+ outpaces a single emit.
452
+ """
453
+ produced = 0
454
+ while len(buf) >= chunk_size:
455
+ data = np.array([buf.popleft() for _ in range(chunk_size)], dtype=np.float32)
456
+ queue.put(data)
457
+ produced += 1
458
+ return produced
360
459
 
361
- # Watchdog: if output queue is growing too large, consumer is dead — flush it
362
- qsize = self.audio_queue.qsize()
363
- if qsize > _MAX_QUEUE_SIZE:
364
- now = time.time()
365
- if now - _last_drain_warn > 30:
366
- import logging
367
- logging.getLogger(__name__).warning(
368
- "Audio queue backlog: %d chunks (~%ds) — flushing to prevent memory leak",
369
- qsize, qsize // self.sample_rate,
370
- )
371
- _last_drain_warn = now
372
- # Drain all but the last 1s
373
- while self.audio_queue.qsize() > self.sample_rate:
374
- try:
375
- self.audio_queue.get_nowait()
376
- except Exception:
377
- break
460
+ @property
461
+ def has_system_audio(self) -> bool:
462
+ return self._has_system_audio
463
+
464
+ def enable_mic_routing(self) -> None:
465
+ """Begin routing the microphone (candidate) stream to ``mic_queue`` for a
466
+ secondary transcriber. Called once the mic Deepgram connection is up."""
467
+ self._mic_routing = True
468
+
469
+ def mic_source(self) -> "_QueueAudioSource":
470
+ """A capture-like view over ``mic_queue`` (get_audio + flush) to hand to a
471
+ second DeepgramTranscriber for the candidate's own voice."""
472
+ return _QueueAudioSource(self.mic_queue)
473
+
474
+ def _trim_queue(self, queue: "Queue") -> int:
475
+ """Drop oldest chunks from ``queue`` when it exceeds ~2s of backlog, down to
476
+ a ~0.5s tail. No-op at or below the threshold. Returns the count dropped.
477
+
478
+ A growing queue means its consumer (a transcriber feed loop) is behind
479
+ real-time — or, for the mic queue, has died (the secondary transcriber has
480
+ no Whisper fallback, so a dead Deepgram socket leaves no consumer). Either
481
+ way, bounding it keeps transcripts live and prevents unbounded growth.
482
+ """
483
+ if queue.qsize() <= self._STALL_TRIM_CHUNKS:
484
+ return 0
485
+ trimmed = 0
486
+ while queue.qsize() > self._STALL_TAIL_CHUNKS:
487
+ try:
488
+ queue.get_nowait()
489
+ trimmed += 1
490
+ except Empty:
491
+ break
492
+ return trimmed
493
+
494
+ def _trim_stale_backlog(self) -> int:
495
+ """Bound the primary transcriber queue (each item is one 10ms chunk)."""
496
+ return self._trim_queue(self.audio_queue)
497
+
498
+ def flush(self) -> int:
499
+ """Drop every queued audio chunk; return how many were dropped.
500
+
501
+ Called the moment the Deepgram WebSocket opens so audio captured during
502
+ the connect handshake — and during a reconnect's backoff, when the
503
+ producer keeps running while the socket is down — is not replayed as a
504
+ burst of stale, lagging transcripts ahead of live speech.
505
+ """
506
+ dropped = 0
507
+ while True:
508
+ try:
509
+ self.audio_queue.get_nowait()
510
+ dropped += 1
511
+ except Empty:
512
+ return dropped
378
513
 
379
514
  def stop(self) -> None:
380
515
  self._stop_event.set()
@@ -37,11 +37,20 @@ class DeepgramTranscriber:
37
37
  transcript_buffer: "RollingTranscriptBuffer",
38
38
  api_key: str,
39
39
  language: str = "en",
40
+ label: str = "",
41
+ whisper_fallback: bool = True,
40
42
  ) -> None:
41
43
  self._capture = audio_capture
42
44
  self._buffer = transcript_buffer
43
45
  self._language = language
44
46
  self._api_key = api_key
47
+ # Secondary streams (e.g. the candidate's mic) set a label so their text
48
+ # is prefixed in the AI context (so the model can tell who spoke) and so
49
+ # they don't fight the primary stream on the live transcript bar.
50
+ self._label = label
51
+ # Secondary streams skip the Whisper fallback — one local Whisper model is
52
+ # enough; a second would double CPU/memory for little gain.
53
+ self._whisper_fallback = whisper_fallback
45
54
  self._stop_event = threading.Event()
46
55
  self._thread: Optional[threading.Thread] = None
47
56
  self._connection = None
@@ -50,6 +59,11 @@ class DeepgramTranscriber:
50
59
  self._last_transcript_time: float = 0.0
51
60
  self._last_interim = None
52
61
 
62
+ def _buffer_text(self, sentence: str) -> str:
63
+ """Prefix a labeled (secondary) stream's text so the AI context shows who
64
+ spoke, e.g. ``[me] ...`` for the candidate's own voice."""
65
+ return f"[{self._label}] {sentence}" if self._label else sentence
66
+
53
67
  def _on_message(self, sender, result=None, **kwargs):
54
68
  try:
55
69
  if result is None:
@@ -64,9 +78,10 @@ class DeepgramTranscriber:
64
78
 
65
79
  if result.is_final:
66
80
  # Final: add to AI buffer and broadcast; clear pending interim
67
- logger.info("Deepgram final: %s", sentence[:120])
81
+ logger.info("Deepgram final%s: %s",
82
+ f" [{self._label}]" if self._label else "", sentence[:120])
68
83
  self._last_interim = None
69
- self._buffer.append(sentence)
84
+ self._buffer.append(self._buffer_text(sentence))
70
85
  state.last_activity_at = time.time()
71
86
  try:
72
87
  from ai_interview.metrics import metrics
@@ -76,12 +91,14 @@ class DeepgramTranscriber:
76
91
  if loop is not None:
77
92
  import asyncio
78
93
  asyncio.run_coroutine_threadsafe(
79
- self._broadcast_transcript(sentence, interim=False), loop
94
+ self._broadcast_transcript(sentence, interim=False, source=self._label), loop
80
95
  )
81
96
  else:
82
- # Interim: track for utterance_end flush; broadcast to transcript bar
97
+ # Interim: track for utterance_end flush. Only the primary stream
98
+ # broadcasts interims — two streams replacing the same transcript
99
+ # bar in-place would flicker against each other.
83
100
  self._last_interim = sentence
84
- if loop is not None:
101
+ if loop is not None and not self._label:
85
102
  import asyncio
86
103
  asyncio.run_coroutine_threadsafe(
87
104
  self._broadcast_transcript(sentence, interim=True), loop
@@ -90,10 +107,10 @@ class DeepgramTranscriber:
90
107
  logger.warning("Deepgram message parse error: %s", exc)
91
108
 
92
109
  @staticmethod
93
- async def _broadcast_transcript(sentence: str, interim: bool = False) -> None:
110
+ async def _broadcast_transcript(sentence: str, interim: bool = False, source: str = "") -> None:
94
111
  try:
95
112
  from ai_interview.server.websocket import broadcast
96
- await broadcast({"type": "transcript", "text": sentence, "interim": interim})
113
+ await broadcast({"type": "transcript", "text": sentence, "interim": interim, "source": source})
97
114
  except Exception:
98
115
  pass
99
116
 
@@ -103,13 +120,13 @@ class DeepgramTranscriber:
103
120
  last = getattr(self, '_last_interim', None)
104
121
  if last:
105
122
  logger.info("Deepgram utterance_end flush: %s", last[:80])
106
- self._buffer.append(last)
123
+ self._buffer.append(self._buffer_text(last))
107
124
  from ai_interview.state import state
108
125
  loop = state.asyncio_loop
109
126
  if loop is not None:
110
127
  import asyncio
111
128
  asyncio.run_coroutine_threadsafe(
112
- self._broadcast_transcript(last, interim=False), loop
129
+ self._broadcast_transcript(last, interim=False, source=self._label), loop
113
130
  )
114
131
  self._last_interim = None
115
132
  except Exception as exc:
@@ -197,8 +214,11 @@ class DeepgramTranscriber:
197
214
  except Exception:
198
215
  pass
199
216
 
200
- logger.error("Deepgram reconnect failed after 10 attempts — falling back to local Whisper")
201
- self._start_whisper_fallback()
217
+ if self._whisper_fallback:
218
+ logger.error("Deepgram reconnect failed after 10 attempts — falling back to local Whisper")
219
+ self._start_whisper_fallback()
220
+ else:
221
+ logger.error("Deepgram reconnect failed after 10 attempts (label=%s) — giving up, no Whisper fallback for secondary stream", self._label)
202
222
  finally:
203
223
  self._reconnect_lock.release()
204
224
 
@@ -368,6 +388,26 @@ class DeepgramTranscriber:
368
388
 
369
389
  self._connection = conn
370
390
  logger.info("Deepgram connection established")
391
+
392
+ # Drop audio captured during the connect handshake (and any reconnect
393
+ # backoff) so transcripts start aligned to live speech instead of
394
+ # replaying stale backlog. Covers initial connect and reconnect — both
395
+ # route through here. A capture stand-in without flush() is the only
396
+ # "expected" miss; anything else is a real fault and must be logged
397
+ # (a None/broken capture would otherwise crash the feed loop silently).
398
+ dropped = 0
399
+ try:
400
+ dropped = self._capture.flush()
401
+ except AttributeError:
402
+ logger.debug("Capture has no flush() — skipping pre-connect flush")
403
+ except Exception as exc:
404
+ logger.warning("Pre-connect flush failed: %s", exc)
405
+ if dropped:
406
+ logger.info(
407
+ "Flushed %d pre-connect audio chunks (~%.1fs) to stay live",
408
+ dropped, dropped * 0.01,
409
+ )
410
+
371
411
  try:
372
412
  from ai_interview.metrics import metrics
373
413
  metrics.record("deepgram_connect", val=duration_ms, ok=True)
@@ -249,6 +249,37 @@ def run_daemon(config: Config) -> None:
249
249
  state.transcriber_name = transcriber_name
250
250
  state.ai_model = config.model
251
251
 
252
+ # Secondary transcriber: the candidate's own microphone on a SEPARATE Deepgram
253
+ # connection (per-role, like meeting-helper). The primary stream above carries
254
+ # system audio (interviewer); this adds the user's voice to the AI context,
255
+ # prefixed "[me]" so the model can tell the two apart. Only when system audio
256
+ # is the primary source (otherwise the mic already IS the primary), a Deepgram
257
+ # key is present, and not disabled via config.
258
+ mic_transcriber = None
259
+ if (
260
+ getattr(config, "transcribe_mic", True)
261
+ and isinstance(transcriber, DeepgramTranscriber)
262
+ and audio_capture.has_system_audio
263
+ and config.deepgram_api_key
264
+ ):
265
+ try:
266
+ mic_t = DeepgramTranscriber(
267
+ audio_capture.mic_source(),
268
+ state.transcript_buffer,
269
+ config.deepgram_api_key,
270
+ language=config.transcription_language,
271
+ label="me",
272
+ whisper_fallback=False,
273
+ )
274
+ if mic_t.start():
275
+ audio_capture.enable_mic_routing()
276
+ mic_transcriber = mic_t
277
+ logger.info("Mic (candidate) transcriber started on a separate Deepgram connection")
278
+ else:
279
+ logger.warning("Mic transcriber failed to start — continuing with interviewer audio only")
280
+ except Exception as exc:
281
+ logger.warning("Mic transcriber setup failed: %s — continuing without it", exc)
282
+
252
283
  # -----------------------------------------------------------------------
253
284
  # Start Datadog metrics + structured logging (no-op if dd_api_key absent)
254
285
  # -----------------------------------------------------------------------
@@ -363,6 +394,8 @@ def run_daemon(config: Config) -> None:
363
394
  finally:
364
395
  if transcriber is not None:
365
396
  transcriber.stop()
397
+ if mic_transcriber is not None:
398
+ mic_transcriber.stop()
366
399
  audio_capture.stop()
367
400
  # Clean up all screenshots captured during this session
368
401
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-interview-assistant
3
- Version: 2.2.2
3
+ Version: 2.2.4
4
4
  Summary: Ghost background AI assistant for live code challenges
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: click>=8.0
@@ -40,4 +40,5 @@ src/ai_interview_assistant.egg-info/dependency_links.txt
40
40
  src/ai_interview_assistant.egg-info/entry_points.txt
41
41
  src/ai_interview_assistant.egg-info/requires.txt
42
42
  src/ai_interview_assistant.egg-info/top_level.txt
43
- tests/test_llm_clients.py
43
+ tests/test_llm_clients.py
44
+ tests/test_transcription_phase.py
@@ -0,0 +1,149 @@
1
+ """Unit tests for live-transcription phase management (no audio devices).
2
+
3
+ Covers docs/perf/SPEC-transcription-phase.md: flush-on-(re)connect and the
4
+ correctly-scaled stall-recovery trim. CombinedAudioCapture.__init__ only creates
5
+ a Queue, so it is constructed directly without opening any device.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+ import pytest
11
+
12
+ from ai_interview.audio.capture import CombinedAudioCapture
13
+
14
+
15
+ def _chunk():
16
+ # One 10ms chunk at 16kHz = 160 float32 samples (shape mirrors _mix_loop output).
17
+ return np.zeros(160, dtype=np.float32)
18
+
19
+
20
+ def _fill(cap, n):
21
+ for _ in range(n):
22
+ cap.audio_queue.put(_chunk())
23
+
24
+
25
+ @pytest.fixture
26
+ def cap():
27
+ return CombinedAudioCapture(sample_rate=16000)
28
+
29
+
30
+ # --------------------------------------------------------------------------
31
+ # flush()
32
+ # --------------------------------------------------------------------------
33
+
34
+ def test_flush_empties_queue_and_returns_count(cap):
35
+ _fill(cap, 37)
36
+ dropped = cap.flush()
37
+ assert dropped == 37
38
+ assert cap.audio_queue.qsize() == 0
39
+
40
+
41
+ def test_flush_on_empty_queue_returns_zero(cap):
42
+ assert cap.flush() == 0
43
+ assert cap.audio_queue.qsize() == 0
44
+
45
+
46
+ # --------------------------------------------------------------------------
47
+ # _trim_stale_backlog()
48
+ # --------------------------------------------------------------------------
49
+
50
+ def test_trim_noop_below_threshold(cap):
51
+ _fill(cap, cap._STALL_TRIM_CHUNKS - 1)
52
+ assert cap._trim_stale_backlog() == 0
53
+ assert cap.audio_queue.qsize() == cap._STALL_TRIM_CHUNKS - 1
54
+
55
+
56
+ def test_trim_noop_at_exact_threshold(cap):
57
+ _fill(cap, cap._STALL_TRIM_CHUNKS)
58
+ assert cap._trim_stale_backlog() == 0, "trim must not fire at exactly the threshold"
59
+ assert cap.audio_queue.qsize() == cap._STALL_TRIM_CHUNKS
60
+
61
+
62
+ def test_trim_above_threshold_drops_to_tail(cap):
63
+ over = cap._STALL_TRIM_CHUNKS + 120
64
+ _fill(cap, over)
65
+ trimmed = cap._trim_stale_backlog()
66
+ assert cap.audio_queue.qsize() == cap._STALL_TAIL_CHUNKS
67
+ assert trimmed == over - cap._STALL_TAIL_CHUNKS
68
+
69
+
70
+ def test_trim_keeps_newest_tail(cap):
71
+ # Mark chunks with an identifiable value so we can prove the OLDEST were dropped.
72
+ for i in range(cap._STALL_TRIM_CHUNKS + 10):
73
+ cap.audio_queue.put(np.full(1, i, dtype=np.float32))
74
+ cap._trim_stale_backlog()
75
+ remaining_first = cap.audio_queue.get_nowait()[0]
76
+ # The first surviving chunk must be one of the newer ones, not index 0.
77
+ assert remaining_first > 0
78
+
79
+
80
+ # --------------------------------------------------------------------------
81
+ # constants sanity
82
+ # --------------------------------------------------------------------------
83
+
84
+ def test_stall_constants_are_sane():
85
+ c = CombinedAudioCapture(sample_rate=16000)
86
+ assert c._STALL_TRIM_CHUNKS > c._STALL_TAIL_CHUNKS > 0
87
+ # 10ms per chunk: ~2s trigger, ~0.5s tail.
88
+ assert c._STALL_TRIM_CHUNKS * 0.01 == pytest.approx(2.0)
89
+ assert c._STALL_TAIL_CHUNKS * 0.01 == pytest.approx(0.5)
90
+
91
+
92
+ # --------------------------------------------------------------------------
93
+ # _emit_all_chunks — the latency fix: emit ALL complete chunks, not one
94
+ # --------------------------------------------------------------------------
95
+
96
+ def test_emit_all_chunks_drains_every_complete_chunk(cap):
97
+ from collections import deque
98
+ buf = deque([0.0] * (160 * 3 + 30)) # 3 full 10ms chunks + 30 leftover samples
99
+ produced = cap._emit_all_chunks(buf, 160, cap.audio_queue)
100
+ assert produced == 3, "must emit ALL complete chunks in one call, not just one"
101
+ assert cap.audio_queue.qsize() == 3
102
+ assert len(buf) == 30, "sub-chunk remainder stays buffered for next time"
103
+
104
+
105
+ def test_emit_all_chunks_noop_below_chunk_size(cap):
106
+ from collections import deque
107
+ buf = deque([0.0] * 159)
108
+ assert cap._emit_all_chunks(buf, 160, cap.audio_queue) == 0
109
+ assert cap.audio_queue.qsize() == 0
110
+ assert len(buf) == 159
111
+
112
+
113
+ def test_emit_all_chunks_targets_the_given_queue(cap):
114
+ from collections import deque
115
+ buf = deque([0.0] * 320)
116
+ cap._emit_all_chunks(buf, 160, cap.mic_queue)
117
+ assert cap.mic_queue.qsize() == 2
118
+ assert cap.audio_queue.qsize() == 0, "must write to the queue it was handed"
119
+
120
+
121
+ # --------------------------------------------------------------------------
122
+ # Separate mic stream: routing flag + queue adapter
123
+ # --------------------------------------------------------------------------
124
+
125
+ def test_enable_mic_routing_flag(cap):
126
+ assert cap._mic_routing is False
127
+ cap.enable_mic_routing()
128
+ assert cap._mic_routing is True
129
+
130
+
131
+ def test_mic_source_get_and_flush(cap):
132
+ src = cap.mic_source()
133
+ assert src.get_audio(timeout=0.001) is None # empty queue
134
+ cap.mic_queue.put(_chunk())
135
+ cap.mic_queue.put(_chunk())
136
+ assert src.get_audio(timeout=0.001) is not None
137
+ assert src.flush() == 1 # the one remaining chunk dropped
138
+ assert cap.mic_queue.qsize() == 0
139
+
140
+
141
+ def test_trim_queue_bounds_mic_queue(cap):
142
+ # If the mic transcriber dies, routing stays on and mic_queue would grow
143
+ # unbounded — _trim_queue must cap it to the live tail like the primary.
144
+ over = cap._STALL_TRIM_CHUNKS + 100
145
+ for _ in range(over):
146
+ cap.mic_queue.put(_chunk())
147
+ trimmed = cap._trim_queue(cap.mic_queue)
148
+ assert cap.mic_queue.qsize() == cap._STALL_TAIL_CHUNKS
149
+ assert trimmed == over - cap._STALL_TAIL_CHUNKS