ai-interview-assistant 2.2.2__tar.gz → 2.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/PKG-INFO +1 -1
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/pyproject.toml +1 -1
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/__init__.py +1 -1
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/capture.py +173 -38
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/transcriber.py +51 -11
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/daemon.py +33 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/PKG-INFO +1 -1
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/SOURCES.txt +2 -1
- ai_interview_assistant-2.2.4/tests/test_transcription_phase.py +149 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/README.md +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/setup.cfg +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/__main__.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/ai_client.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/__init__.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/buffer.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/cli.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/config.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/__init__.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/__main__.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/app.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/__init__.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/dashboard.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/hotkeys.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/scripts.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/screens/settings.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/hotkey_config.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/hotkeys.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/i18n.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/llm_clients.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/menubar.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/metrics.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/ollama_utils.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/overlay.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/screenshot.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/__init__.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/app.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/routes.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/websocket.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/state.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/utils.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/watchdog.py +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/dependency_links.txt +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/entry_points.txt +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/requires.txt +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview_assistant.egg-info/top_level.txt +0 -0
- {ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/tests/test_llm_clients.py +0 -0
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/capture.py
RENAMED
|
@@ -285,12 +285,50 @@ class ScreenCaptureAudio:
|
|
|
285
285
|
# Combined capture: mic + system audio
|
|
286
286
|
# ---------------------------------------------------------------------------
|
|
287
287
|
|
|
288
|
+
class _QueueAudioSource:
    """Minimal capture-like adapter over a single Queue.

    Exposes the same ``get_audio`` / ``flush`` surface a DeepgramTranscriber pulls
    from, so a second transcriber can consume the candidate (mic) stream from its
    own queue without knowing about CombinedAudioCapture internals.
    """

    def __init__(self, queue: "Queue") -> None:
        # The adapter never owns the queue; it only reads from the one it is given.
        self._queue = queue

    def get_audio(self, timeout: float = 0.1) -> Optional[np.ndarray]:
        """Return the next queued chunk, or ``None`` if none is available in time.

        Args:
            timeout: Seconds to wait for a chunk. Zero or a negative value means
                "poll without blocking"; ``None`` waits until a chunk arrives.

        Returns:
            The oldest queued item, or ``None`` when the wait expired.
        """
        try:
            # Queue.get() rejects negative timeouts with ValueError, so route any
            # non-positive timeout to the non-blocking call instead.
            if timeout is not None and timeout <= 0:
                return self._queue.get_nowait()
            return self._queue.get(timeout=timeout)
        except Empty:
            return None

    def flush(self) -> int:
        """Discard every chunk currently queued and return how many were dropped."""
        dropped = 0
        while True:
            try:
                self._queue.get_nowait()
            except Empty:
                return dropped
            dropped += 1
|
|
313
|
+
|
|
314
|
+
|
|
288
315
|
class CombinedAudioCapture:
|
|
289
316
|
"""Mixes microphone + system audio and feeds a Queue for the transcriber."""
|
|
290
317
|
|
|
318
|
+
# Each queued item is one 10ms chunk (see _mix_loop). Keep the transcriber in
|
|
319
|
+
# real-time phase: if the consumer (Deepgram feed loop) falls behind, drop the
|
|
320
|
+
# oldest chunks so emitted transcripts stay live instead of drifting late.
|
|
321
|
+
_STALL_TRIM_CHUNKS = 200 # ~2.0s of backlog triggers a trim
|
|
322
|
+
_STALL_TAIL_CHUNKS = 50 # leave ~0.5s of live tail after trimming
|
|
323
|
+
|
|
291
324
|
def __init__(self, sample_rate: int = SAMPLE_RATE) -> None:
|
|
292
325
|
self.sample_rate = sample_rate
|
|
293
326
|
self.audio_queue: Queue[np.ndarray] = Queue()
|
|
327
|
+
# Separate queue carrying the microphone (candidate) stream when system
|
|
328
|
+
# audio is the primary source. Only fed while a mic transcriber is active
|
|
329
|
+
# (see _mic_routing) so it never grows unbounded when unused.
|
|
330
|
+
self.mic_queue: Queue[np.ndarray] = Queue()
|
|
331
|
+
self._mic_routing = False
|
|
294
332
|
self._stop_event = Event()
|
|
295
333
|
self._mic: Optional[MicrophoneCapture] = None
|
|
296
334
|
self._sys: Optional[ScreenCaptureAudio] = None
|
|
@@ -323,58 +361,155 @@ class CombinedAudioCapture:
|
|
|
323
361
|
only the interviewer's voice should be transcribed automatically.
|
|
324
362
|
The microphone is only used for manual recordings (2x ESC hold).
|
|
325
363
|
Falls back to mic-only if system audio is unavailable.
|
|
364
|
+
|
|
365
|
+
Latency design: each iteration drains the capture queues **fully** and
|
|
366
|
+
**non-blocking**, then emits **every** complete chunk. The previous design
|
|
367
|
+
blocked up to 20ms on the (discarded) mic queue and emitted only one chunk
|
|
368
|
+
per iteration — so whenever the loop dipped below the capture rate, audio
|
|
369
|
+
piled up in ``sys_buf`` (capped at 1s by its maxlen) and produced a steady,
|
|
370
|
+
invisible transcription lag. Draining fully keeps ``sys_buf`` near-empty so
|
|
371
|
+
audio reaches Deepgram in real time.
|
|
326
372
|
"""
|
|
327
373
|
chunk_size = int(self.sample_rate * 0.01) # 10ms — forward audio to Deepgram faster
|
|
328
374
|
mic_buf: deque = deque(maxlen=self.sample_rate)
|
|
329
375
|
sys_buf: deque = deque(maxlen=self.sample_rate)
|
|
330
|
-
_MAX_QUEUE_SIZE = self.sample_rate * 30 # ~30s of audio samples
|
|
331
376
|
_last_drain_warn = 0.0
|
|
377
|
+
_trim_events = 0 # trims since last log — distinguishes a one-off spike
|
|
378
|
+
_trim_chunks = 0 # chunks dropped since last log from a chronic stall
|
|
332
379
|
|
|
333
380
|
while not self._stop_event.is_set():
|
|
334
|
-
#
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
381
|
+
# Drain the mic queue fully (non-blocking) so it never backs up. Keep
|
|
382
|
+
# the samples when mic is the fallback primary, or when a mic
|
|
383
|
+
# transcriber is routing the candidate stream; otherwise drain-and-
|
|
384
|
+
# discard.
|
|
385
|
+
if self._mic:
|
|
386
|
+
keep_mic = (not self._has_system_audio) or self._mic_routing
|
|
387
|
+
while True:
|
|
388
|
+
mic_chunk = self._mic.get_audio(timeout=0)
|
|
389
|
+
if mic_chunk is None:
|
|
390
|
+
break
|
|
391
|
+
if keep_mic:
|
|
392
|
+
mic_buf.extend(mic_chunk.flatten())
|
|
338
393
|
|
|
394
|
+
# Drain the system-audio queue fully (non-blocking).
|
|
339
395
|
if self._sys and self._sys.is_running:
|
|
340
|
-
|
|
341
|
-
|
|
396
|
+
while True:
|
|
397
|
+
sys_chunk = self._sys.get_audio(timeout=0)
|
|
398
|
+
if sys_chunk is None:
|
|
399
|
+
break
|
|
342
400
|
sys_buf.extend(sys_chunk.flatten())
|
|
343
401
|
|
|
402
|
+
# Emit EVERY complete chunk this iteration (system audio decoupled from
|
|
403
|
+
# mic — it never waits on the mic having a chunk ready).
|
|
344
404
|
if self._has_system_audio:
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
dtype=np.float32,
|
|
350
|
-
)
|
|
351
|
-
self.audio_queue.put(sys_data)
|
|
405
|
+
produced = self._emit_all_chunks(sys_buf, chunk_size, self.audio_queue)
|
|
406
|
+
# Candidate's own voice → separate queue for the mic transcriber.
|
|
407
|
+
if self._mic_routing:
|
|
408
|
+
produced += self._emit_all_chunks(mic_buf, chunk_size, self.mic_queue)
|
|
352
409
|
else:
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
410
|
+
produced = self._emit_all_chunks(mic_buf, chunk_size, self.audio_queue)
|
|
411
|
+
|
|
412
|
+
# Stall-recovery: if the downstream consumer (Deepgram feed loop) has
|
|
413
|
+
# fallen behind real-time, trim oldest queued chunks so transcripts stay
|
|
414
|
+
# live. Aggregate over a 30s window so a chronic stall (many trims) is
|
|
415
|
+
# distinguishable from a one-off spike and escalates to WARNING.
|
|
416
|
+
trimmed = self._trim_stale_backlog()
|
|
417
|
+
if trimmed:
|
|
418
|
+
_trim_events += 1
|
|
419
|
+
_trim_chunks += trimmed
|
|
420
|
+
# Bound the secondary (mic) queue too — if its transcriber dies (no
|
|
421
|
+
# Whisper fallback) routing stays on, so without this it would grow
|
|
422
|
+
# without limit. Same drop-oldest-to-stay-live policy as the primary.
|
|
423
|
+
if self._mic_routing:
|
|
424
|
+
self._trim_queue(self.mic_queue)
|
|
425
|
+
now = time.time()
|
|
426
|
+
if _trim_events and now - _last_drain_warn > 30:
|
|
427
|
+
import logging
|
|
428
|
+
_log = logging.getLogger(__name__)
|
|
429
|
+
_msg = ("Audio backlog trimmed %d times (~%.1fs dropped) in last 30s "
|
|
430
|
+
"— transcriber feed behind real-time")
|
|
431
|
+
if _trim_events >= 50:
|
|
432
|
+
_log.warning(_msg, _trim_events, _trim_chunks * 0.01)
|
|
433
|
+
else:
|
|
434
|
+
_log.info(_msg, _trim_events, _trim_chunks * 0.01)
|
|
435
|
+
_last_drain_warn = now
|
|
436
|
+
_trim_events = 0
|
|
437
|
+
_trim_chunks = 0
|
|
438
|
+
|
|
439
|
+
# Yield the CPU only when there was nothing to forward — the gets above
|
|
440
|
+
# no longer block, so this prevents a busy-spin during silence while
|
|
441
|
+
# keeping zero added latency whenever audio is actively flowing.
|
|
442
|
+
if not produced:
|
|
443
|
+
time.sleep(0.005)
|
|
444
|
+
|
|
445
|
+
def _emit_all_chunks(self, buf: "deque", chunk_size: int, queue: "Queue") -> int:
    """Pop every complete ``chunk_size`` window from ``buf`` and forward it to
    ``queue``. Returns the number of chunks emitted.

    Draining all complete chunks (not just one per call) is what keeps the
    local buffer from accumulating a hidden backlog when the producer briefly
    outpaces a single emit. Samples that do not fill a whole chunk stay in
    ``buf`` for the next call.

    Args:
        buf: Sample buffer consumed from the left.
        chunk_size: Number of samples per emitted chunk; must be positive.
        queue: Destination queue; each emitted item is a 1-D float32 array of
            exactly ``chunk_size`` samples.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Without this guard the
            loop condition would always hold and the method would enqueue
            empty arrays forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    produced = 0
    while len(buf) >= chunk_size:
        # fromiter fills the array straight from the deque, avoiding the
        # intermediate Python list a list comprehension would build per chunk.
        data = np.fromiter(
            (buf.popleft() for _ in range(chunk_size)),
            dtype=np.float32,
            count=chunk_size,
        )
        queue.put(data)
        produced += 1
    return produced
|
|
360
459
|
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
460
|
+
@property
def has_system_audio(self) -> bool:
    """Read-only view of the private ``_has_system_audio`` flag."""
    system_audio_active = self._has_system_audio
    return system_audio_active
|
|
463
|
+
|
|
464
|
+
def enable_mic_routing(self) -> None:
    """Switch on forwarding of the microphone (candidate) stream to ``mic_queue``.

    Meant to be called once the secondary (mic) Deepgram connection is up, so a
    second transcriber has something to consume. This only sets the
    ``_mic_routing`` flag.
    """
    self._mic_routing = True
|
|
468
|
+
|
|
469
|
+
def mic_source(self) -> "_QueueAudioSource":
    """Build a capture-like adapter (``get_audio`` + ``flush``) over ``mic_queue``.

    The returned object is what gets handed to a second DeepgramTranscriber so
    it can transcribe the candidate's own voice. Each call creates a new
    adapter around the same underlying queue.
    """
    candidate_stream = _QueueAudioSource(self.mic_queue)
    return candidate_stream
|
|
473
|
+
|
|
474
|
+
def _trim_queue(self, queue: "Queue") -> int:
    """Drop the oldest chunks from ``queue`` once its backlog passes the trim mark.

    Nothing happens while the queue holds ``_STALL_TRIM_CHUNKS`` items or fewer.
    Above that, items are removed from the front until only
    ``_STALL_TAIL_CHUNKS`` remain (or the queue runs empty first).

    A growing queue means its consumer (a transcriber feed loop) is behind
    real-time — or, for the mic queue, has died (the secondary transcriber has
    no Whisper fallback, so a dead Deepgram socket leaves no consumer). Either
    way, bounding it keeps transcripts live and prevents unbounded growth.

    Returns:
        The number of chunks removed.
    """
    backlog_limit = self._STALL_TRIM_CHUNKS
    if not queue.qsize() > backlog_limit:
        return 0

    live_tail = self._STALL_TAIL_CHUNKS
    dropped = 0
    try:
        # Re-check the size on every pass: a consumer may be draining the
        # queue at the same time.
        while queue.qsize() > live_tail:
            queue.get_nowait()
            dropped += 1
    except Empty:
        pass
    return dropped
|
|
493
|
+
|
|
494
|
+
def _trim_stale_backlog(self) -> int:
    """Apply the stall-recovery trim to the primary transcriber queue.

    Each queued item is one 10ms chunk. Returns whatever ``_trim_queue``
    reports for ``audio_queue``.
    """
    primary_queue = self.audio_queue
    return self._trim_queue(primary_queue)
|
|
497
|
+
|
|
498
|
+
def flush(self) -> int:
    """Empty ``audio_queue`` and report how many chunks were thrown away.

    Called the moment the Deepgram WebSocket opens so audio captured during
    the connect handshake — and during a reconnect's backoff, when the
    producer keeps running while the socket is down — is not replayed as a
    burst of stale, lagging transcripts ahead of live speech.

    Only the primary ``audio_queue`` is drained here.
    """
    discarded = 0
    try:
        while True:
            self.audio_queue.get_nowait()
            discarded += 1
    except Empty:
        # The queue ran dry: everything that was pending has been dropped.
        return discarded
|
|
378
513
|
|
|
379
514
|
def stop(self) -> None:
|
|
380
515
|
self._stop_event.set()
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/transcriber.py
RENAMED
|
@@ -37,11 +37,20 @@ class DeepgramTranscriber:
|
|
|
37
37
|
transcript_buffer: "RollingTranscriptBuffer",
|
|
38
38
|
api_key: str,
|
|
39
39
|
language: str = "en",
|
|
40
|
+
label: str = "",
|
|
41
|
+
whisper_fallback: bool = True,
|
|
40
42
|
) -> None:
|
|
41
43
|
self._capture = audio_capture
|
|
42
44
|
self._buffer = transcript_buffer
|
|
43
45
|
self._language = language
|
|
44
46
|
self._api_key = api_key
|
|
47
|
+
# Secondary streams (e.g. the candidate's mic) set a label so their text
|
|
48
|
+
# is prefixed in the AI context (so the model can tell who spoke) and so
|
|
49
|
+
# they don't fight the primary stream on the live transcript bar.
|
|
50
|
+
self._label = label
|
|
51
|
+
# Secondary streams skip the Whisper fallback — one local Whisper model is
|
|
52
|
+
# enough; a second would double CPU/memory for little gain.
|
|
53
|
+
self._whisper_fallback = whisper_fallback
|
|
45
54
|
self._stop_event = threading.Event()
|
|
46
55
|
self._thread: Optional[threading.Thread] = None
|
|
47
56
|
self._connection = None
|
|
@@ -50,6 +59,11 @@ class DeepgramTranscriber:
|
|
|
50
59
|
self._last_transcript_time: float = 0.0
|
|
51
60
|
self._last_interim = None
|
|
52
61
|
|
|
62
|
+
def _buffer_text(self, sentence: str) -> str:
    """Return ``sentence`` as it should be stored in the transcript buffer.

    A labeled (secondary) stream gets its label prepended in square brackets,
    e.g. ``[me] ...`` for the candidate's own voice, so the AI context shows
    who spoke. An unlabeled stream's text is returned unchanged.
    """
    if not self._label:
        return sentence
    return f"[{self._label}] {sentence}"
|
|
66
|
+
|
|
53
67
|
def _on_message(self, sender, result=None, **kwargs):
|
|
54
68
|
try:
|
|
55
69
|
if result is None:
|
|
@@ -64,9 +78,10 @@ class DeepgramTranscriber:
|
|
|
64
78
|
|
|
65
79
|
if result.is_final:
|
|
66
80
|
# Final: add to AI buffer and broadcast; clear pending interim
|
|
67
|
-
logger.info("Deepgram final: %s",
|
|
81
|
+
logger.info("Deepgram final%s: %s",
|
|
82
|
+
f" [{self._label}]" if self._label else "", sentence[:120])
|
|
68
83
|
self._last_interim = None
|
|
69
|
-
self._buffer.append(sentence)
|
|
84
|
+
self._buffer.append(self._buffer_text(sentence))
|
|
70
85
|
state.last_activity_at = time.time()
|
|
71
86
|
try:
|
|
72
87
|
from ai_interview.metrics import metrics
|
|
@@ -76,12 +91,14 @@ class DeepgramTranscriber:
|
|
|
76
91
|
if loop is not None:
|
|
77
92
|
import asyncio
|
|
78
93
|
asyncio.run_coroutine_threadsafe(
|
|
79
|
-
self._broadcast_transcript(sentence, interim=False), loop
|
|
94
|
+
self._broadcast_transcript(sentence, interim=False, source=self._label), loop
|
|
80
95
|
)
|
|
81
96
|
else:
|
|
82
|
-
# Interim: track for utterance_end flush
|
|
97
|
+
# Interim: track for utterance_end flush. Only the primary stream
|
|
98
|
+
# broadcasts interims — two streams replacing the same transcript
|
|
99
|
+
# bar in-place would flicker against each other.
|
|
83
100
|
self._last_interim = sentence
|
|
84
|
-
if loop is not None:
|
|
101
|
+
if loop is not None and not self._label:
|
|
85
102
|
import asyncio
|
|
86
103
|
asyncio.run_coroutine_threadsafe(
|
|
87
104
|
self._broadcast_transcript(sentence, interim=True), loop
|
|
@@ -90,10 +107,10 @@ class DeepgramTranscriber:
|
|
|
90
107
|
logger.warning("Deepgram message parse error: %s", exc)
|
|
91
108
|
|
|
92
109
|
@staticmethod
async def _broadcast_transcript(sentence: str, interim: bool = False, source: str = "") -> None:
    """Best-effort push of one transcript line to connected WebSocket clients.

    Args:
        sentence: Transcript text to send.
        interim: ``True`` for a provisional result, ``False`` for a final one.
        source: Label of the stream that produced the text; empty for the
            unlabeled (primary) stream.

    The payload is ``{"type": "transcript", "text", "interim", "source"}``.
    Failures never propagate to the caller.
    """
    try:
        # Imported lazily, as in the original, so a missing or failing server
        # module is handled like any other broadcast failure.
        from ai_interview.server.websocket import broadcast
        await broadcast({"type": "transcript", "text": sentence, "interim": interim, "source": source})
    except Exception:
        # Still best-effort, but leave a trace: the coroutine's result is not
        # inspected by callers, so a swallowed error would otherwise be invisible.
        logger.debug("Transcript broadcast failed", exc_info=True)
|
|
99
116
|
|
|
@@ -103,13 +120,13 @@ class DeepgramTranscriber:
|
|
|
103
120
|
last = getattr(self, '_last_interim', None)
|
|
104
121
|
if last:
|
|
105
122
|
logger.info("Deepgram utterance_end flush: %s", last[:80])
|
|
106
|
-
self._buffer.append(last)
|
|
123
|
+
self._buffer.append(self._buffer_text(last))
|
|
107
124
|
from ai_interview.state import state
|
|
108
125
|
loop = state.asyncio_loop
|
|
109
126
|
if loop is not None:
|
|
110
127
|
import asyncio
|
|
111
128
|
asyncio.run_coroutine_threadsafe(
|
|
112
|
-
self._broadcast_transcript(last, interim=False), loop
|
|
129
|
+
self._broadcast_transcript(last, interim=False, source=self._label), loop
|
|
113
130
|
)
|
|
114
131
|
self._last_interim = None
|
|
115
132
|
except Exception as exc:
|
|
@@ -197,8 +214,11 @@ class DeepgramTranscriber:
|
|
|
197
214
|
except Exception:
|
|
198
215
|
pass
|
|
199
216
|
|
|
200
|
-
|
|
201
|
-
|
|
217
|
+
if self._whisper_fallback:
|
|
218
|
+
logger.error("Deepgram reconnect failed after 10 attempts — falling back to local Whisper")
|
|
219
|
+
self._start_whisper_fallback()
|
|
220
|
+
else:
|
|
221
|
+
logger.error("Deepgram reconnect failed after 10 attempts (label=%s) — giving up, no Whisper fallback for secondary stream", self._label)
|
|
202
222
|
finally:
|
|
203
223
|
self._reconnect_lock.release()
|
|
204
224
|
|
|
@@ -368,6 +388,26 @@ class DeepgramTranscriber:
|
|
|
368
388
|
|
|
369
389
|
self._connection = conn
|
|
370
390
|
logger.info("Deepgram connection established")
|
|
391
|
+
|
|
392
|
+
# Drop audio captured during the connect handshake (and any reconnect
|
|
393
|
+
# backoff) so transcripts start aligned to live speech instead of
|
|
394
|
+
# replaying stale backlog. Covers initial connect and reconnect — both
|
|
395
|
+
# route through here. A capture stand-in without flush() is the only
|
|
396
|
+
# "expected" miss; anything else is a real fault and must be logged
|
|
397
|
+
# (a None/broken capture would otherwise crash the feed loop silently).
|
|
398
|
+
dropped = 0
|
|
399
|
+
try:
|
|
400
|
+
dropped = self._capture.flush()
|
|
401
|
+
except AttributeError:
|
|
402
|
+
logger.debug("Capture has no flush() — skipping pre-connect flush")
|
|
403
|
+
except Exception as exc:
|
|
404
|
+
logger.warning("Pre-connect flush failed: %s", exc)
|
|
405
|
+
if dropped:
|
|
406
|
+
logger.info(
|
|
407
|
+
"Flushed %d pre-connect audio chunks (~%.1fs) to stay live",
|
|
408
|
+
dropped, dropped * 0.01,
|
|
409
|
+
)
|
|
410
|
+
|
|
371
411
|
try:
|
|
372
412
|
from ai_interview.metrics import metrics
|
|
373
413
|
metrics.record("deepgram_connect", val=duration_ms, ok=True)
|
|
@@ -249,6 +249,37 @@ def run_daemon(config: Config) -> None:
|
|
|
249
249
|
state.transcriber_name = transcriber_name
|
|
250
250
|
state.ai_model = config.model
|
|
251
251
|
|
|
252
|
+
# Secondary transcriber: the candidate's own microphone on a SEPARATE Deepgram
|
|
253
|
+
# connection (per-role, like meeting-helper). The primary stream above carries
|
|
254
|
+
# system audio (interviewer); this adds the user's voice to the AI context,
|
|
255
|
+
# prefixed "[me]" so the model can tell the two apart. Only when system audio
|
|
256
|
+
# is the primary source (otherwise the mic already IS the primary), a Deepgram
|
|
257
|
+
# key is present, and not disabled via config.
|
|
258
|
+
mic_transcriber = None
|
|
259
|
+
if (
|
|
260
|
+
getattr(config, "transcribe_mic", True)
|
|
261
|
+
and isinstance(transcriber, DeepgramTranscriber)
|
|
262
|
+
and audio_capture.has_system_audio
|
|
263
|
+
and config.deepgram_api_key
|
|
264
|
+
):
|
|
265
|
+
try:
|
|
266
|
+
mic_t = DeepgramTranscriber(
|
|
267
|
+
audio_capture.mic_source(),
|
|
268
|
+
state.transcript_buffer,
|
|
269
|
+
config.deepgram_api_key,
|
|
270
|
+
language=config.transcription_language,
|
|
271
|
+
label="me",
|
|
272
|
+
whisper_fallback=False,
|
|
273
|
+
)
|
|
274
|
+
if mic_t.start():
|
|
275
|
+
audio_capture.enable_mic_routing()
|
|
276
|
+
mic_transcriber = mic_t
|
|
277
|
+
logger.info("Mic (candidate) transcriber started on a separate Deepgram connection")
|
|
278
|
+
else:
|
|
279
|
+
logger.warning("Mic transcriber failed to start — continuing with interviewer audio only")
|
|
280
|
+
except Exception as exc:
|
|
281
|
+
logger.warning("Mic transcriber setup failed: %s — continuing without it", exc)
|
|
282
|
+
|
|
252
283
|
# -----------------------------------------------------------------------
|
|
253
284
|
# Start Datadog metrics + structured logging (no-op if dd_api_key absent)
|
|
254
285
|
# -----------------------------------------------------------------------
|
|
@@ -363,6 +394,8 @@ def run_daemon(config: Config) -> None:
|
|
|
363
394
|
finally:
|
|
364
395
|
if transcriber is not None:
|
|
365
396
|
transcriber.stop()
|
|
397
|
+
if mic_transcriber is not None:
|
|
398
|
+
mic_transcriber.stop()
|
|
366
399
|
audio_capture.stop()
|
|
367
400
|
# Clean up all screenshots captured during this session
|
|
368
401
|
try:
|
|
@@ -40,4 +40,5 @@ src/ai_interview_assistant.egg-info/dependency_links.txt
|
|
|
40
40
|
src/ai_interview_assistant.egg-info/entry_points.txt
|
|
41
41
|
src/ai_interview_assistant.egg-info/requires.txt
|
|
42
42
|
src/ai_interview_assistant.egg-info/top_level.txt
|
|
43
|
-
tests/test_llm_clients.py
|
|
43
|
+
tests/test_llm_clients.py
|
|
44
|
+
tests/test_transcription_phase.py
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Unit tests for live-transcription phase management (no audio devices).
|
|
2
|
+
|
|
3
|
+
Covers docs/perf/SPEC-transcription-phase.md: flush-on-(re)connect and the
|
|
4
|
+
correctly-scaled stall-recovery trim. CombinedAudioCapture.__init__ only creates
|
|
5
|
+
a Queue, so it is constructed directly without opening any device.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections import deque

import numpy as np
import pytest

from ai_interview.audio.capture import CombinedAudioCapture


def _chunk():
    """Return one silent 10ms chunk at 16kHz (160 float32 samples)."""
    # Shape mirrors _mix_loop output.
    return np.zeros(160, dtype=np.float32)


def _fill(cap, n, queue=None):
    """Put ``n`` silent chunks on ``queue`` (``cap.audio_queue`` when omitted)."""
    target = cap.audio_queue if queue is None else queue
    for _ in range(n):
        target.put(_chunk())


@pytest.fixture
def cap():
    """A CombinedAudioCapture built directly; no audio device is opened."""
    return CombinedAudioCapture(sample_rate=16000)


# --------------------------------------------------------------------------
# flush()
# --------------------------------------------------------------------------

def test_flush_empties_queue_and_returns_count(cap):
    """flush() drops everything queued and reports the exact count."""
    _fill(cap, 37)
    dropped = cap.flush()
    assert dropped == 37
    assert cap.audio_queue.qsize() == 0


def test_flush_on_empty_queue_returns_zero(cap):
    """flush() on an empty queue is a harmless no-op."""
    assert cap.flush() == 0
    assert cap.audio_queue.qsize() == 0


# --------------------------------------------------------------------------
# _trim_stale_backlog()
# --------------------------------------------------------------------------

def test_trim_noop_below_threshold(cap):
    """A backlog just under the trim threshold is left untouched."""
    _fill(cap, cap._STALL_TRIM_CHUNKS - 1)
    assert cap._trim_stale_backlog() == 0
    assert cap.audio_queue.qsize() == cap._STALL_TRIM_CHUNKS - 1


def test_trim_noop_at_exact_threshold(cap):
    """The threshold itself is inclusive: only a strictly larger backlog trims."""
    _fill(cap, cap._STALL_TRIM_CHUNKS)
    assert cap._trim_stale_backlog() == 0, "trim must not fire at exactly the threshold"
    assert cap.audio_queue.qsize() == cap._STALL_TRIM_CHUNKS


def test_trim_above_threshold_drops_to_tail(cap):
    """An oversized backlog is cut down to exactly the live tail."""
    over = cap._STALL_TRIM_CHUNKS + 120
    _fill(cap, over)
    trimmed = cap._trim_stale_backlog()
    assert cap.audio_queue.qsize() == cap._STALL_TAIL_CHUNKS
    assert trimmed == over - cap._STALL_TAIL_CHUNKS


def test_trim_keeps_newest_tail(cap):
    """The chunks that survive a trim are exactly the newest ones, in order."""
    # Mark chunks with an identifiable value so we can prove the OLDEST were dropped.
    total = cap._STALL_TRIM_CHUNKS + 10
    for i in range(total):
        cap.audio_queue.put(np.full(1, i, dtype=np.float32))
    cap._trim_stale_backlog()
    survivors = []
    while not cap.audio_queue.empty():
        survivors.append(int(cap.audio_queue.get_nowait()[0]))
    # Checking the whole tail (not just "first index > 0") catches a trim that
    # drops too few chunks or drops from the wrong end.
    assert survivors == list(range(total - cap._STALL_TAIL_CHUNKS, total))


# --------------------------------------------------------------------------
# constants sanity
# --------------------------------------------------------------------------

def test_stall_constants_are_sane():
    """The trim trigger exceeds the tail, and both match their documented durations."""
    c = CombinedAudioCapture(sample_rate=16000)
    assert c._STALL_TRIM_CHUNKS > c._STALL_TAIL_CHUNKS > 0
    # 10ms per chunk: ~2s trigger, ~0.5s tail.
    assert c._STALL_TRIM_CHUNKS * 0.01 == pytest.approx(2.0)
    assert c._STALL_TAIL_CHUNKS * 0.01 == pytest.approx(0.5)


# --------------------------------------------------------------------------
# _emit_all_chunks — the latency fix: emit ALL complete chunks, not one
# --------------------------------------------------------------------------

def test_emit_all_chunks_drains_every_complete_chunk(cap):
    """Every complete chunk is emitted in one call; the remainder stays buffered."""
    buf = deque([0.0] * (160 * 3 + 30))  # 3 full 10ms chunks + 30 leftover samples
    produced = cap._emit_all_chunks(buf, 160, cap.audio_queue)
    assert produced == 3, "must emit ALL complete chunks in one call, not just one"
    assert cap.audio_queue.qsize() == 3
    assert len(buf) == 30, "sub-chunk remainder stays buffered for next time"


def test_emit_all_chunks_noop_below_chunk_size(cap):
    """Fewer samples than one chunk produce nothing and consume nothing."""
    buf = deque([0.0] * 159)
    assert cap._emit_all_chunks(buf, 160, cap.audio_queue) == 0
    assert cap.audio_queue.qsize() == 0
    assert len(buf) == 159


def test_emit_all_chunks_targets_the_given_queue(cap):
    """Chunks land on the queue passed in, not on the primary queue."""
    buf = deque([0.0] * 320)
    cap._emit_all_chunks(buf, 160, cap.mic_queue)
    assert cap.mic_queue.qsize() == 2
    assert cap.audio_queue.qsize() == 0, "must write to the queue it was handed"


# --------------------------------------------------------------------------
# Separate mic stream: routing flag + queue adapter
# --------------------------------------------------------------------------

def test_enable_mic_routing_flag(cap):
    """Routing starts off and enable_mic_routing() turns it on."""
    assert cap._mic_routing is False
    cap.enable_mic_routing()
    assert cap._mic_routing is True


def test_mic_source_get_and_flush(cap):
    """The mic adapter reads from and flushes the capture's mic_queue."""
    src = cap.mic_source()
    assert src.get_audio(timeout=0.001) is None  # empty queue
    _fill(cap, 2, cap.mic_queue)
    assert src.get_audio(timeout=0.001) is not None
    assert src.flush() == 1  # the one remaining chunk dropped
    assert cap.mic_queue.qsize() == 0


def test_trim_queue_bounds_mic_queue(cap):
    """_trim_queue caps the mic queue exactly like the primary queue."""
    # If the mic transcriber dies, routing stays on and mic_queue would grow
    # unbounded — _trim_queue must cap it to the live tail like the primary.
    over = cap._STALL_TRIM_CHUNKS + 100
    _fill(cap, over, cap.mic_queue)
    trimmed = cap._trim_queue(cap.mic_queue)
    assert cap.mic_queue.qsize() == cap._STALL_TAIL_CHUNKS
    assert trimmed == over - cap._STALL_TAIL_CHUNKS
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/audio/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/__init__.py
RENAMED
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/__main__.py
RENAMED
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/flet_gui/app.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/hotkey_config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/llm_clients.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/ollama_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/screenshot.py
RENAMED
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/__init__.py
RENAMED
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/app.py
RENAMED
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/routes.py
RENAMED
|
File without changes
|
{ai_interview_assistant-2.2.2 → ai_interview_assistant-2.2.4}/src/ai_interview/server/websocket.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|