amd-gaia 0.15.0-py3-none-any.whl → 0.15.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/METADATA +223 -223
- amd_gaia-0.15.1.dist-info/RECORD +178 -0
- {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/entry_points.txt +1 -0
- {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/licenses/LICENSE.md +20 -20
- gaia/__init__.py +29 -29
- gaia/agents/__init__.py +19 -19
- gaia/agents/base/__init__.py +9 -9
- gaia/agents/base/agent.py +2177 -2177
- gaia/agents/base/api_agent.py +120 -120
- gaia/agents/base/console.py +1841 -1841
- gaia/agents/base/errors.py +237 -237
- gaia/agents/base/mcp_agent.py +86 -86
- gaia/agents/base/tools.py +83 -83
- gaia/agents/blender/agent.py +556 -556
- gaia/agents/blender/agent_simple.py +133 -135
- gaia/agents/blender/app.py +211 -211
- gaia/agents/blender/app_simple.py +41 -41
- gaia/agents/blender/core/__init__.py +16 -16
- gaia/agents/blender/core/materials.py +506 -506
- gaia/agents/blender/core/objects.py +316 -316
- gaia/agents/blender/core/rendering.py +225 -225
- gaia/agents/blender/core/scene.py +220 -220
- gaia/agents/blender/core/view.py +146 -146
- gaia/agents/chat/__init__.py +9 -9
- gaia/agents/chat/agent.py +835 -835
- gaia/agents/chat/app.py +1058 -1058
- gaia/agents/chat/session.py +508 -508
- gaia/agents/chat/tools/__init__.py +15 -15
- gaia/agents/chat/tools/file_tools.py +96 -96
- gaia/agents/chat/tools/rag_tools.py +1729 -1729
- gaia/agents/chat/tools/shell_tools.py +436 -436
- gaia/agents/code/__init__.py +7 -7
- gaia/agents/code/agent.py +549 -549
- gaia/agents/code/cli.py +377 -0
- gaia/agents/code/models.py +135 -135
- gaia/agents/code/orchestration/__init__.py +24 -24
- gaia/agents/code/orchestration/checklist_executor.py +1763 -1763
- gaia/agents/code/orchestration/checklist_generator.py +713 -713
- gaia/agents/code/orchestration/factories/__init__.py +9 -9
- gaia/agents/code/orchestration/factories/base.py +63 -63
- gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -118
- gaia/agents/code/orchestration/factories/python_factory.py +106 -106
- gaia/agents/code/orchestration/orchestrator.py +841 -841
- gaia/agents/code/orchestration/project_analyzer.py +391 -391
- gaia/agents/code/orchestration/steps/__init__.py +67 -67
- gaia/agents/code/orchestration/steps/base.py +188 -188
- gaia/agents/code/orchestration/steps/error_handler.py +314 -314
- gaia/agents/code/orchestration/steps/nextjs.py +828 -828
- gaia/agents/code/orchestration/steps/python.py +307 -307
- gaia/agents/code/orchestration/template_catalog.py +469 -469
- gaia/agents/code/orchestration/workflows/__init__.py +14 -14
- gaia/agents/code/orchestration/workflows/base.py +80 -80
- gaia/agents/code/orchestration/workflows/nextjs.py +186 -186
- gaia/agents/code/orchestration/workflows/python.py +94 -94
- gaia/agents/code/prompts/__init__.py +11 -11
- gaia/agents/code/prompts/base_prompt.py +77 -77
- gaia/agents/code/prompts/code_patterns.py +2036 -2036
- gaia/agents/code/prompts/nextjs_prompt.py +40 -40
- gaia/agents/code/prompts/python_prompt.py +109 -109
- gaia/agents/code/schema_inference.py +365 -365
- gaia/agents/code/system_prompt.py +41 -41
- gaia/agents/code/tools/__init__.py +42 -42
- gaia/agents/code/tools/cli_tools.py +1138 -1138
- gaia/agents/code/tools/code_formatting.py +319 -319
- gaia/agents/code/tools/code_tools.py +769 -769
- gaia/agents/code/tools/error_fixing.py +1347 -1347
- gaia/agents/code/tools/external_tools.py +180 -180
- gaia/agents/code/tools/file_io.py +845 -845
- gaia/agents/code/tools/prisma_tools.py +190 -190
- gaia/agents/code/tools/project_management.py +1016 -1016
- gaia/agents/code/tools/testing.py +321 -321
- gaia/agents/code/tools/typescript_tools.py +122 -122
- gaia/agents/code/tools/validation_parsing.py +461 -461
- gaia/agents/code/tools/validation_tools.py +806 -806
- gaia/agents/code/tools/web_dev_tools.py +1758 -1758
- gaia/agents/code/validators/__init__.py +16 -16
- gaia/agents/code/validators/antipattern_checker.py +241 -241
- gaia/agents/code/validators/ast_analyzer.py +197 -197
- gaia/agents/code/validators/requirements_validator.py +145 -145
- gaia/agents/code/validators/syntax_validator.py +171 -171
- gaia/agents/docker/__init__.py +7 -7
- gaia/agents/docker/agent.py +642 -642
- gaia/agents/emr/__init__.py +8 -8
- gaia/agents/emr/agent.py +1506 -1506
- gaia/agents/emr/cli.py +1322 -1322
- gaia/agents/emr/constants.py +475 -475
- gaia/agents/emr/dashboard/__init__.py +4 -4
- gaia/agents/emr/dashboard/server.py +1974 -1974
- gaia/agents/jira/__init__.py +11 -11
- gaia/agents/jira/agent.py +894 -894
- gaia/agents/jira/jql_templates.py +299 -299
- gaia/agents/routing/__init__.py +7 -7
- gaia/agents/routing/agent.py +567 -570
- gaia/agents/routing/system_prompt.py +75 -75
- gaia/agents/summarize/__init__.py +11 -0
- gaia/agents/summarize/agent.py +885 -0
- gaia/agents/summarize/prompts.py +129 -0
- gaia/api/__init__.py +23 -23
- gaia/api/agent_registry.py +238 -238
- gaia/api/app.py +305 -305
- gaia/api/openai_server.py +575 -575
- gaia/api/schemas.py +186 -186
- gaia/api/sse_handler.py +373 -373
- gaia/apps/__init__.py +4 -4
- gaia/apps/llm/__init__.py +6 -6
- gaia/apps/llm/app.py +173 -169
- gaia/apps/summarize/app.py +116 -633
- gaia/apps/summarize/html_viewer.py +133 -133
- gaia/apps/summarize/pdf_formatter.py +284 -284
- gaia/audio/__init__.py +2 -2
- gaia/audio/audio_client.py +439 -439
- gaia/audio/audio_recorder.py +269 -269
- gaia/audio/kokoro_tts.py +599 -599
- gaia/audio/whisper_asr.py +432 -432
- gaia/chat/__init__.py +16 -16
- gaia/chat/app.py +430 -430
- gaia/chat/prompts.py +522 -522
- gaia/chat/sdk.py +1228 -1225
- gaia/cli.py +5481 -5632
- gaia/database/__init__.py +10 -10
- gaia/database/agent.py +176 -176
- gaia/database/mixin.py +290 -290
- gaia/database/testing.py +64 -64
- gaia/eval/batch_experiment.py +2332 -2332
- gaia/eval/claude.py +542 -542
- gaia/eval/config.py +37 -37
- gaia/eval/email_generator.py +512 -512
- gaia/eval/eval.py +3179 -3179
- gaia/eval/groundtruth.py +1130 -1130
- gaia/eval/transcript_generator.py +582 -582
- gaia/eval/webapp/README.md +167 -167
- gaia/eval/webapp/package-lock.json +875 -875
- gaia/eval/webapp/package.json +20 -20
- gaia/eval/webapp/public/app.js +3402 -3402
- gaia/eval/webapp/public/index.html +87 -87
- gaia/eval/webapp/public/styles.css +3661 -3661
- gaia/eval/webapp/server.js +415 -415
- gaia/eval/webapp/test-setup.js +72 -72
- gaia/llm/__init__.py +9 -2
- gaia/llm/base_client.py +60 -0
- gaia/llm/exceptions.py +12 -0
- gaia/llm/factory.py +70 -0
- gaia/llm/lemonade_client.py +3236 -3221
- gaia/llm/lemonade_manager.py +294 -294
- gaia/llm/providers/__init__.py +9 -0
- gaia/llm/providers/claude.py +108 -0
- gaia/llm/providers/lemonade.py +120 -0
- gaia/llm/providers/openai_provider.py +79 -0
- gaia/llm/vlm_client.py +382 -382
- gaia/logger.py +189 -189
- gaia/mcp/agent_mcp_server.py +245 -245
- gaia/mcp/blender_mcp_client.py +138 -138
- gaia/mcp/blender_mcp_server.py +648 -648
- gaia/mcp/context7_cache.py +332 -332
- gaia/mcp/external_services.py +518 -518
- gaia/mcp/mcp_bridge.py +811 -550
- gaia/mcp/servers/__init__.py +6 -6
- gaia/mcp/servers/docker_mcp.py +83 -83
- gaia/perf_analysis.py +361 -0
- gaia/rag/__init__.py +10 -10
- gaia/rag/app.py +293 -293
- gaia/rag/demo.py +304 -304
- gaia/rag/pdf_utils.py +235 -235
- gaia/rag/sdk.py +2194 -2194
- gaia/security.py +163 -163
- gaia/talk/app.py +289 -289
- gaia/talk/sdk.py +538 -538
- gaia/testing/__init__.py +87 -87
- gaia/testing/assertions.py +330 -330
- gaia/testing/fixtures.py +333 -333
- gaia/testing/mocks.py +493 -493
- gaia/util.py +46 -46
- gaia/utils/__init__.py +33 -33
- gaia/utils/file_watcher.py +675 -675
- gaia/utils/parsing.py +223 -223
- gaia/version.py +100 -100
- amd_gaia-0.15.0.dist-info/RECORD +0 -168
- gaia/agents/code/app.py +0 -266
- gaia/llm/llm_client.py +0 -723
- {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/WHEEL +0 -0
- {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/top_level.txt +0 -0
gaia/audio/whisper_asr.py
CHANGED
@@ -1,432 +1,432 @@

(The hunk records all 432 lines as removed and re-added; the removed and added text are identical, so the file is reproduced once below.)

```python
# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT

# Standard library imports
import os
import queue
import threading
import time

# Third-party imports
import numpy as np

try:
    import pyaudio
except ImportError:
    pyaudio = None

try:
    import torch
except ImportError:
    torch = None

try:
    import whisper
except ImportError:
    whisper = None

from gaia.audio.audio_recorder import AudioRecorder

# First-party imports
from gaia.logger import get_logger


class WhisperAsr(AudioRecorder):
    log = get_logger(__name__)

    def __init__(
        self,
        model_size="small",
        device_index=None,  # Use default input device
        transcription_queue=None,
        enable_cuda=False,
        silence_threshold=None,  # Custom silence threshold
        min_audio_length=None,  # Custom minimum audio length
    ):
        # Check for required dependencies
        missing = []
        if pyaudio is None:
            missing.append("pyaudio")
        if torch is None:
            missing.append("torch")
        if whisper is None:
            missing.append("openai-whisper")

        if missing:
            error_msg = (
                f"\n❌ Error: Missing required talk dependencies: {', '.join(missing)}\n\n"
                f"Please install the talk dependencies:\n"
                f' uv pip install -e ".[talk]"\n\n'
                f"Or install packages directly:\n"
                f" uv pip install {' '.join(missing)}\n"
            )
            raise ImportError(error_msg)

        super().__init__(device_index)

        # Override thresholds if provided
        if silence_threshold is not None:
            self.SILENCE_THRESHOLD = silence_threshold
        if min_audio_length is not None:
            self.MIN_AUDIO_LENGTH = min_audio_length
        self.log = self.__class__.log

        # Initialize Whisper model with optimized settings
        self.log.debug(f"Loading Whisper model: {model_size}")
        self.model = whisper.load_model(model_size)

        # Add compute type optimization if GPU available
        self.using_cuda = enable_cuda and torch.cuda.is_available()
        if self.using_cuda:
            self.model.to(torch.device("cuda"))
            torch.set_float32_matmul_precision("high")
            # Enable torch compile for better performance
            if hasattr(torch, "compile"):
                self.model = torch.compile(self.model)
            self.log.debug("GPU acceleration enabled with optimizations")

        # Add batch processing capability
        self.batch_size = 3  # Process multiple audio segments at once
        self.audio_buffer = []
        self.last_process_time = time.time()
        self.process_interval = 0.5  # Process every 0.5 seconds

        # Rest of initialization
        self.transcription_queue = transcription_queue

    def _record_audio_streaming(self):
        """Record audio for streaming mode - puts chunks directly into queue."""
        pa = pyaudio.PyAudio()

        try:
            # Log device info
            if self.device_index is not None:
                device_info = pa.get_device_info_by_index(self.device_index)
            else:
                device_info = pa.get_default_input_device_info()
                self.device_index = device_info["index"]

            self.log.debug(
                f"Using audio device [{self.device_index}]: {device_info['name']}"
            )

            self.stream = pa.open(
                format=self.FORMAT,
                channels=self.CHANNELS,
                rate=self.RATE,
                input=True,
                input_device_index=self.device_index,
                frames_per_buffer=self.CHUNK,
            )

            self.log.debug("Streaming recording started...")
            audio_buffer = np.array([], dtype=np.float32)
            chunks_processed = 0

            # Use 3-second chunks for better context (Whisper works better with longer segments)
            chunk_duration = 3.0  # seconds
            overlap_duration = 0.5  # seconds of overlap to avoid cutting words

            chunk_size = int(self.RATE * chunk_duration)
            overlap_size = int(self.RATE * overlap_duration)

            # Simple VAD - only send chunks with sufficient audio energy
            min_energy_threshold = 0.001  # Minimum energy to consider as speech

            while self.is_recording:
                try:
                    data = np.frombuffer(
                        self.stream.read(self.CHUNK, exception_on_overflow=False),
                        dtype=np.float32,
                    )
                    audio_buffer = np.concatenate((audio_buffer, data))

                    # Process when we have enough audio (3 seconds)
                    if len(audio_buffer) >= chunk_size:
                        chunk = audio_buffer[:chunk_size].copy()

                        # Only process if chunk has sufficient audio energy (not silence)
                        energy = np.abs(chunk).mean()
                        chunks_processed += 1

                        if energy > min_energy_threshold:
                            self.audio_queue.put(chunk)
                            self.log.debug(
                                f"Chunk {chunks_processed}: Added to queue (energy: {energy:.6f})"
                            )
                        else:
                            self.log.debug(
                                f"Chunk {chunks_processed}: Skipped - too quiet (energy: {energy:.6f})"
                            )

                        # Keep overlap to maintain context between chunks
                        audio_buffer = audio_buffer[chunk_size - overlap_size :]

                except Exception as e:
                    self.log.error(f"Error reading from stream: {e}")
                    break

            # Process any remaining audio
            if len(audio_buffer) > self.RATE * 0.5:  # At least 0.5 seconds
                self.audio_queue.put(audio_buffer.copy())

        finally:
            if self.stream:
                self.stream.stop_stream()
                self.stream.close()
            pa.terminate()

    def start_recording_streaming(self):
        """Start recording in streaming mode."""
        self.is_recording = True
        self.record_thread = threading.Thread(target=self._record_audio_streaming)
        self.record_thread.start()
        time.sleep(0.1)
        self.process_thread = threading.Thread(target=self._process_audio)
        self.process_thread.start()
        time.sleep(0.1)

    def _process_audio(self):
        """Internal method to process audio with batching and optimizations."""
        self.log.debug("Starting optimized audio processing...")
        processed_count = 0

        while self.is_recording:
            try:
                current_time = time.time()

                # Collect audio segments into buffer
                while len(self.audio_buffer) < self.batch_size:
                    try:
                        audio = self.audio_queue.get_nowait()
                        if len(audio) > 0:
                            self.audio_buffer.append(audio)
                            self.log.debug(
                                f"Added audio to buffer (size: {len(self.audio_buffer)}/{self.batch_size})"
                            )
                    except queue.Empty:
                        break

                # Process batch if enough time has passed or buffer is full
                if len(self.audio_buffer) >= self.batch_size or (
                    len(self.audio_buffer) > 0
                    and current_time - self.last_process_time >= self.process_interval
                ):

                    try:
                        processed_count += 1
                        self.log.debug(
                            f"Processing batch {processed_count} with {len(self.audio_buffer)} segments..."
                        )

                        with torch.inference_mode():
                            # Process batch of audio segments with better quality settings
                            results = [
                                self.model.transcribe(
                                    audio,
                                    language="en",
                                    temperature=0.0,  # Deterministic, no randomness
                                    no_speech_threshold=0.6,  # Higher threshold to filter noise
                                    condition_on_previous_text=False,  # Don't use previous text as it can cause hallucinations
                                    beam_size=5,  # Larger beam for better quality
                                    best_of=5,  # More attempts for better quality
                                    fp16=self.using_cuda,
                                    suppress_blank=True,  # Suppress blank outputs
                                    suppress_tokens=[-1],  # Suppress special tokens
                                    without_timestamps=False,  # Keep timestamps for context
                                )
                                for audio in self.audio_buffer
                            ]

                        # Send transcriptions to queue
                        for i, result in enumerate(results):
                            transcribed_text = result["text"].strip()
                            if transcribed_text and self.transcription_queue:
                                self.transcription_queue.put(transcribed_text)
                                self.log.debug(
                                    f"Transcribed segment {i+1}: {transcribed_text}"
                                )
                            else:
                                self.log.debug(f"Segment {i+1}: No text or empty")

                        self.audio_buffer = []
                        self.last_process_time = current_time

                    except Exception as e:
                        self.log.error(f"Batch transcription error: {e}")
                        self.audio_buffer = []  # Clear buffer on error

                else:
                    # Small sleep to prevent CPU spinning
                    time.sleep(0.01)

            except Exception as e:
                self.log.error(f"Error in audio processing: {e}")
                if not self.is_recording:
                    break

        self.log.debug("Audio processing stopped")

    def transcribe_file(self, file_path):
        """Transcribe an existing audio file."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found: {file_path}")

        result = self.model.transcribe(file_path)
        return result["text"]


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Whisper ASR Demo")
    parser.add_argument(
        "--mode",
        choices=["file", "mic", "both"],
        default="file",
        help="Test mode: file, mic, or both",
    )
    parser.add_argument(
        "--duration",
        type=int,
        default=5,
        help="Recording duration in seconds for mic mode",
    )
    parser.add_argument(
        "--model",
        default="base",
        help="Whisper model size (tiny, base, small, medium, large)",
    )
    parser.add_argument(
        "--cuda", action="store_true", help="Enable CUDA acceleration if available"
    )
    parser.add_argument(
        "--stream",
        action="store_true",
        help="Stream transcriptions in real-time as they arrive",
    )
    args = parser.parse_args()

    print("=== Whisper ASR Demo ===")
    print(f"Model: {args.model}, CUDA: {args.cuda}")

    # Test file transcription
    if args.mode in ["file", "both"]:
        print("\n--- File Transcription Test ---")
        asr = WhisperAsr(model_size=args.model, enable_cuda=args.cuda)
        try:
            test_file = "./data/audio/test.m4a"
            start_time = time.time()
            text = asr.transcribe_file(test_file)
            elapsed = time.time() - start_time
            print(f"Transcription: {text}")
            print(f"Time taken: {elapsed:.2f} seconds")
        except FileNotFoundError:
            print(f"No audio file found at {test_file}")

    # Test microphone transcription
    if args.mode in ["mic", "both"]:
        print("\n--- Microphone Transcription Test ---")
        print(f"Recording for {args.duration} seconds...")
        print(f"Mode: {'Streaming' if args.stream else 'Batch'}")

        # Create a queue to collect transcriptions
        transcription_queue = queue.Queue()
        asr = WhisperAsr(
            model_size=args.model,
            transcription_queue=transcription_queue,
            enable_cuda=args.cuda,
        )

        start_time = time.time()
        transcriptions = []

        if args.stream:
            # Streaming mode - show text as it arrives
            print("Starting recording threads...")
            asr.start_recording_streaming()  # Use streaming-specific method

            print("\n[STREAMING] Transcriptions as they arrive:")
            print("-" * 50)

            # Give recording a moment to start properly
            time.sleep(0.5)

            print(f"Recording status: {asr.is_recording}")
            print(f"Listening for {args.duration} seconds...")

            end_time = start_time + args.duration
            checks = 0

            try:
                while time.time() < end_time:
                    checks += 1
                    # Check for new transcriptions
                    while not transcription_queue.empty():
                        try:
                            text = transcription_queue.get_nowait()
                            if text:
                                transcriptions.append(text)
                                # Stream the text immediately with timestamp
                                time_offset = time.time() - start_time
                                print(f"[{time_offset:5.1f}s] {text}")
                        except queue.Empty:
                            break

                    # Debug: Show we're still checking
                    if checks % 20 == 0:  # Every second (20 * 0.05)
                        print(
                            f" ... still listening (audio_queue size: ~{asr.audio_queue.qsize()})"
                        )

                    # Small sleep to prevent CPU spinning
                    time.sleep(0.05)

            finally:
                # Stop recording
                asr.stop_recording()

                # Collect any remaining transcriptions
                time.sleep(0.5)  # Give a moment for final processing
                while not transcription_queue.empty():
                    try:
                        text = transcription_queue.get_nowait()
                        if text:
                            transcriptions.append(text)
                            time_offset = time.time() - start_time
                            print(f"[{time_offset:5.1f}s] {text}")
                    except queue.Empty:
                        break

            print("-" * 50)

        else:
            # Batch mode - collect all text then display
            asr.start_recording(duration=args.duration)  # Blocking

            # Collect all transcriptions after recording
            while not transcription_queue.empty():
                try:
                    text = transcription_queue.get_nowait()
                    if text:
                        transcriptions.append(text)
                except queue.Empty:
                    break

        elapsed = time.time() - start_time

        # Display results
        print("\nResults:")
        if transcriptions:
            print(f" Transcription segments: {len(transcriptions)}")
            if not args.stream:  # Show individual segments in batch mode
                for i, text in enumerate(transcriptions, 1):
                    print(f" {i}. {text}")
            print(f" Full transcript: {' '.join(transcriptions)}")
        else:
            print(" No transcriptions received (possibly no speech detected)")

        print(f" Total time: {elapsed:.2f} seconds")
        print(f" Processing efficiency: {args.duration/elapsed:.2f}x realtime")

    print("\nDemo completed!")
```
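For orientation, here is a minimal, hypothetical sketch of driving the streaming interface defined in the file above. It is not part of the package diff itself; it only uses names visible in the listing (`WhisperAsr`, `start_recording_streaming`, `transcription_queue`, and the `stop_recording` call inherited from `AudioRecorder`), and it assumes the optional `talk` dependencies (pyaudio, torch, openai-whisper) plus a working input device.

```python
import queue
import time

from gaia.audio.whisper_asr import WhisperAsr

# Queue that WhisperAsr fills with transcribed text segments.
transcripts = queue.Queue()

# "base" is an illustrative model size; any size accepted by whisper.load_model works.
asr = WhisperAsr(model_size="base", transcription_queue=transcripts)

asr.start_recording_streaming()  # starts the record and process threads
try:
    deadline = time.time() + 10  # listen for roughly 10 seconds
    while time.time() < deadline:
        try:
            print(transcripts.get(timeout=0.25))  # print segments as they arrive
        except queue.Empty:
            pass
finally:
    asr.stop_recording()  # inherited from AudioRecorder, as used in the __main__ demo
```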