voice-mode 2.34.2-py3-none-any.whl → 4.0.1-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
- voice_mode/__version__.py +1 -1
- voice_mode/cli.py +5 -0
- voice_mode/cli_commands/transcribe.py +141 -0
- voice_mode/config.py +139 -37
- voice_mode/frontend/.next/BUILD_ID +1 -0
- voice_mode/frontend/.next/app-build-manifest.json +28 -0
- voice_mode/frontend/.next/app-path-routes-manifest.json +1 -0
- voice_mode/frontend/.next/build-manifest.json +32 -0
- voice_mode/frontend/.next/export-marker.json +1 -0
- voice_mode/frontend/.next/images-manifest.json +1 -0
- voice_mode/frontend/.next/next-minimal-server.js.nft.json +1 -0
- voice_mode/frontend/.next/next-server.js.nft.json +1 -0
- voice_mode/frontend/.next/package.json +1 -0
- voice_mode/frontend/.next/prerender-manifest.json +1 -0
- voice_mode/frontend/.next/react-loadable-manifest.json +1 -0
- voice_mode/frontend/.next/required-server-files.json +1 -0
- voice_mode/frontend/.next/routes-manifest.json +1 -0
- voice_mode/frontend/.next/server/app/_not-found/page.js +1 -0
- voice_mode/frontend/.next/server/app/_not-found/page.js.nft.json +1 -0
- voice_mode/frontend/.next/server/app/_not-found/page_client-reference-manifest.js +1 -0
- voice_mode/frontend/.next/server/app/_not-found.html +1 -0
- voice_mode/frontend/.next/server/app/_not-found.meta +6 -0
- voice_mode/frontend/.next/server/app/_not-found.rsc +9 -0
- voice_mode/frontend/.next/server/app/api/connection-details/route.js +12 -0
- voice_mode/frontend/.next/server/app/api/connection-details/route.js.nft.json +1 -0
- voice_mode/frontend/.next/server/app/favicon.ico/route.js +12 -0
- voice_mode/frontend/.next/server/app/favicon.ico/route.js.nft.json +1 -0
- voice_mode/frontend/.next/server/app/favicon.ico.body +0 -0
- voice_mode/frontend/.next/server/app/favicon.ico.meta +1 -0
- voice_mode/frontend/.next/server/app/index.html +1 -0
- voice_mode/frontend/.next/server/app/index.meta +5 -0
- voice_mode/frontend/.next/server/app/index.rsc +7 -0
- voice_mode/frontend/.next/server/app/page.js +11 -0
- voice_mode/frontend/.next/server/app/page.js.nft.json +1 -0
- voice_mode/frontend/.next/server/app/page_client-reference-manifest.js +1 -0
- voice_mode/frontend/.next/server/app-paths-manifest.json +6 -0
- voice_mode/frontend/.next/server/chunks/463.js +1 -0
- voice_mode/frontend/.next/server/chunks/682.js +6 -0
- voice_mode/frontend/.next/server/chunks/948.js +2 -0
- voice_mode/frontend/.next/server/chunks/994.js +2 -0
- voice_mode/frontend/.next/server/chunks/font-manifest.json +1 -0
- voice_mode/frontend/.next/server/font-manifest.json +1 -0
- voice_mode/frontend/.next/server/functions-config-manifest.json +1 -0
- voice_mode/frontend/.next/server/interception-route-rewrite-manifest.js +1 -0
- voice_mode/frontend/.next/server/middleware-build-manifest.js +1 -0
- voice_mode/frontend/.next/server/middleware-manifest.json +6 -0
- voice_mode/frontend/.next/server/middleware-react-loadable-manifest.js +1 -0
- voice_mode/frontend/.next/server/next-font-manifest.js +1 -0
- voice_mode/frontend/.next/server/next-font-manifest.json +1 -0
- voice_mode/frontend/.next/server/pages/404.html +1 -0
- voice_mode/frontend/.next/server/pages/500.html +1 -0
- voice_mode/frontend/.next/server/pages/_app.js +1 -0
- voice_mode/frontend/.next/server/pages/_app.js.nft.json +1 -0
- voice_mode/frontend/.next/server/pages/_document.js +1 -0
- voice_mode/frontend/.next/server/pages/_document.js.nft.json +1 -0
- voice_mode/frontend/.next/server/pages/_error.js +1 -0
- voice_mode/frontend/.next/server/pages/_error.js.nft.json +1 -0
- voice_mode/frontend/.next/server/pages-manifest.json +1 -0
- voice_mode/frontend/.next/server/server-reference-manifest.js +1 -0
- voice_mode/frontend/.next/server/server-reference-manifest.json +1 -0
- voice_mode/frontend/.next/server/webpack-runtime.js +1 -0
- voice_mode/frontend/.next/standalone/.next/BUILD_ID +1 -0
- voice_mode/frontend/.next/standalone/.next/app-build-manifest.json +28 -0
- voice_mode/frontend/.next/standalone/.next/app-path-routes-manifest.json +1 -0
- voice_mode/frontend/.next/standalone/.next/build-manifest.json +32 -0
- voice_mode/frontend/.next/standalone/.next/package.json +1 -0
- voice_mode/frontend/.next/standalone/.next/prerender-manifest.json +1 -0
- voice_mode/frontend/.next/standalone/.next/react-loadable-manifest.json +1 -0
- voice_mode/frontend/.next/standalone/.next/required-server-files.json +1 -0
- voice_mode/frontend/.next/standalone/.next/routes-manifest.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page.js.nft.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page_client-reference-manifest.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found.html +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found.meta +6 -0
- voice_mode/frontend/.next/standalone/.next/server/app/_not-found.rsc +9 -0
- voice_mode/frontend/.next/standalone/.next/server/app/api/connection-details/route.js +12 -0
- voice_mode/frontend/.next/standalone/.next/server/app/api/connection-details/route.js.nft.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico/route.js +12 -0
- voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico/route.js.nft.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico.body +0 -0
- voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico.meta +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/index.html +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/index.meta +5 -0
- voice_mode/frontend/.next/standalone/.next/server/app/index.rsc +7 -0
- voice_mode/frontend/.next/standalone/.next/server/app/page.js +11 -0
- voice_mode/frontend/.next/standalone/.next/server/app/page.js.nft.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app/page_client-reference-manifest.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/app-paths-manifest.json +6 -0
- voice_mode/frontend/.next/standalone/.next/server/chunks/463.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/chunks/682.js +6 -0
- voice_mode/frontend/.next/standalone/.next/server/chunks/948.js +2 -0
- voice_mode/frontend/.next/standalone/.next/server/chunks/994.js +2 -0
- voice_mode/frontend/.next/standalone/.next/server/font-manifest.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/middleware-build-manifest.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/middleware-manifest.json +6 -0
- voice_mode/frontend/.next/standalone/.next/server/middleware-react-loadable-manifest.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages/404.html +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages/500.html +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages/_app.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages/_app.js.nft.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages/_document.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages/_document.js.nft.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages/_error.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages/_error.js.nft.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/pages-manifest.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/server-reference-manifest.js +1 -0
- voice_mode/frontend/.next/standalone/.next/server/server-reference-manifest.json +1 -0
- voice_mode/frontend/.next/standalone/.next/server/webpack-runtime.js +1 -0
- voice_mode/frontend/.next/standalone/package.json +40 -0
- voice_mode/frontend/.next/standalone/server.js +38 -0
- voice_mode/frontend/.next/static/c5TIe90lGzrESrqJkkXQa/_buildManifest.js +1 -0
- voice_mode/frontend/.next/static/c5TIe90lGzrESrqJkkXQa/_ssgManifest.js +1 -0
- voice_mode/frontend/.next/static/chunks/117-40bc79a2b97edb21.js +2 -0
- voice_mode/frontend/.next/static/chunks/144d3bae-2d5f122b82426d88.js +1 -0
- voice_mode/frontend/.next/static/chunks/471-bd4b96a33883dfa2.js +3 -0
- voice_mode/frontend/.next/static/chunks/app/_not-found/page-5011050e402ab9c8.js +1 -0
- voice_mode/frontend/.next/static/chunks/app/layout-0074dd8ab91cdbe0.js +1 -0
- voice_mode/frontend/.next/static/chunks/app/page-ae5f3aa9d9ba5993.js +1 -0
- voice_mode/frontend/.next/static/chunks/fd9d1056-af324d327b243cf1.js +1 -0
- voice_mode/frontend/.next/static/chunks/framework-f66176bb897dc684.js +1 -0
- voice_mode/frontend/.next/static/chunks/main-3163eca598b76a9f.js +1 -0
- voice_mode/frontend/.next/static/chunks/main-app-233f6c633f73ae84.js +1 -0
- voice_mode/frontend/.next/static/chunks/pages/_app-72b849fbd24ac258.js +1 -0
- voice_mode/frontend/.next/static/chunks/pages/_error-7ba65e1336b92748.js +1 -0
- voice_mode/frontend/.next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
- voice_mode/frontend/.next/static/chunks/webpack-0ea9b80f19935b70.js +1 -0
- voice_mode/frontend/.next/static/css/a2f49a47752b5010.css +3 -0
- voice_mode/frontend/.next/static/media/01099be941da1820-s.woff2 +0 -0
- voice_mode/frontend/.next/static/media/39883d31a7792467-s.p.woff2 +0 -0
- voice_mode/frontend/.next/static/media/6368404d2e8d66fe-s.woff2 +0 -0
- voice_mode/frontend/.next/trace +43 -0
- voice_mode/frontend/.next/types/app/api/connection-details/route.ts +343 -0
- voice_mode/frontend/.next/types/app/layout.ts +79 -0
- voice_mode/frontend/.next/types/app/page.ts +79 -0
- voice_mode/frontend/.next/types/package.json +1 -0
- voice_mode/frontend/package-lock.json +154 -1
- voice_mode/providers.py +7 -8
- voice_mode/resources/configuration.py +2 -2
- voice_mode/tools/configuration_management.py +106 -5
- voice_mode/tools/converse.py +98 -0
- voice_mode/tools/service.py +1 -7
- voice_mode/tools/transcription/__init__.py +14 -0
- voice_mode/tools/transcription/backends.py +287 -0
- voice_mode/tools/transcription/core.py +136 -0
- voice_mode/tools/transcription/formats.py +144 -0
- voice_mode/tools/transcription/types.py +52 -0
- voice_mode/utils/services/kokoro_helpers.py +16 -3
- {voice_mode-2.34.2.dist-info → voice_mode-4.0.1.dist-info}/METADATA +5 -2
- voice_mode-4.0.1.dist-info/RECORD +255 -0
- voice_mode/voice_preferences.py +0 -125
- voice_mode-2.34.2.dist-info/RECORD +0 -116
- {voice_mode-2.34.2.dist-info → voice_mode-4.0.1.dist-info}/WHEEL +0 -0
- {voice_mode-2.34.2.dist-info → voice_mode-4.0.1.dist-info}/entry_points.txt +0 -0
voice_mode/tools/transcription/backends.py

@@ -0,0 +1,287 @@
+"""Backend implementations for transcription."""
+
+import os
+import json
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Dict, Any, Optional, List
+import httpx
+
+from voice_mode.config import OPENAI_API_KEY
+from .types import TranscriptionResult
+
+
+async def transcribe_with_openai(
+    audio_path: Path,
+    word_timestamps: bool = False,
+    language: Optional[str] = None,
+    model: str = "whisper-1"
+) -> TranscriptionResult:
+    """
+    Transcribe using OpenAI API with optional word-level timestamps.
+    """
+
+    # Import OpenAI client
+    from openai import AsyncOpenAI
+
+    # Get API key from VoiceMode config
+    api_key = OPENAI_API_KEY or os.environ.get("OPENAI_API_KEY")
+
+    if not api_key:
+        return TranscriptionResult(
+            text="",
+            language="",
+            segments=[],
+            backend="openai",
+            success=False,
+            error="OpenAI API key not configured. Set OPENAI_API_KEY environment variable."
+        )
+
+    # Initialize async client (automatically respects OPENAI_BASE_URL env var)
+    client = AsyncOpenAI(api_key=api_key)
+
+    # Prepare timestamp granularities
+    timestamp_granularities = ["segment"]
+    if word_timestamps:
+        timestamp_granularities.append("word")
+
+    try:
+        # Open and transcribe the audio file
+        with open(audio_path, "rb") as audio_file:
+            transcription = await client.audio.transcriptions.create(
+                model=model,
+                file=audio_file,
+                response_format="verbose_json",
+                timestamp_granularities=timestamp_granularities,
+                language=language
+            )
+
+        # Convert response to dictionary
+        result = transcription.model_dump() if hasattr(transcription, 'model_dump') else transcription.dict()
+
+        # Format response
+        formatted = TranscriptionResult(
+            text=result.get("text", ""),
+            language=result.get("language", ""),
+            duration=result.get("duration", 0),
+            segments=[],
+            backend="openai",
+            model=model,
+            success=True
+        )
+
+        # Process segments
+        for segment in result.get("segments", []):
+            seg_data = {
+                "id": segment.get("id"),
+                "text": segment.get("text", "").strip(),
+                "start": segment.get("start", 0),
+                "end": segment.get("end", 0)
+            }
+            formatted["segments"].append(seg_data)
+
+        # Handle word timestamps - OpenAI returns them at the top level
+        if word_timestamps and "words" in result:
+            formatted["words"] = [
+                {
+                    "word": w.get("word", ""),
+                    "start": w.get("start", 0),
+                    "end": w.get("end", 0)
+                }
+                for w in result.get("words", [])
+            ]
+        else:
+            formatted["words"] = []
+
+        return formatted
+
+    except Exception as e:
+        return TranscriptionResult(
+            text="",
+            language="",
+            segments=[],
+            backend="openai",
+            success=False,
+            error=str(e)
+        )
+
+
+async def transcribe_with_whisperx(
+    audio_path: Path,
+    word_timestamps: bool = True,
+    language: Optional[str] = None
+) -> TranscriptionResult:
+    """
+    Transcribe using WhisperX for enhanced word-level alignment.
+    """
+
+    try:
+        # Try importing WhisperX
+        import whisperx
+        import torch
+    except ImportError:
+        return TranscriptionResult(
+            text="",
+            language="",
+            segments=[],
+            backend="whisperx",
+            success=False,
+            error="WhisperX not installed. Install with: pip install git+https://github.com/m-bain/whisperX.git"
+        )
+
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        compute_type = "float16" if device == "cuda" else "int8"
+
+        # Load model
+        model = whisperx.load_model("large-v3", device, compute_type=compute_type)
+
+        # Load audio
+        audio = whisperx.load_audio(str(audio_path))
+
+        # Transcribe
+        result = model.transcribe(audio, batch_size=16, language=language)
+
+        # Align for word timestamps if requested
+        if word_timestamps:
+            # Load alignment model
+            model_a, metadata = whisperx.load_align_model(
+                language_code=result.get("language", language or "en"),
+                device=device
+            )
+
+            # Align
+            result = whisperx.align(
+                result["segments"],
+                model_a,
+                metadata,
+                audio,
+                device,
+                return_char_alignments=False
+            )
+
+        # Format response
+        formatted = TranscriptionResult(
+            text=" ".join(s.get("text", "") for s in result.get("segments", [])),
+            language=result.get("language", ""),
+            segments=result.get("segments", []),
+            backend="whisperx",
+            success=True
+        )
+
+        # Add enhanced_alignment flag
+        if word_timestamps:
+            formatted["enhanced_alignment"] = True
+
+        # Flatten words if available
+        if word_timestamps:
+            formatted["words"] = []
+            for segment in formatted["segments"]:
+                if "words" in segment:
+                    formatted["words"].extend(segment["words"])
+
+        return formatted
+
+    except Exception as e:
+        return TranscriptionResult(
+            text="",
+            language="",
+            segments=[],
+            backend="whisperx",
+            success=False,
+            error=str(e)
+        )
+
+
+async def transcribe_with_whisper_cpp(
+    audio_path: Path,
+    word_timestamps: bool = False,
+    language: Optional[str] = None
+) -> TranscriptionResult:
+    """
+    Transcribe using local whisper.cpp server.
+    """
+
+    # Check if whisper-server is running (using localhost:2022 as configured)
+    server_url = "http://localhost:2022/v1/audio/transcriptions"
+
+    # Convert audio to WAV if needed
+    if audio_path.suffix.lower() != ".wav":
+        # Use ffmpeg to convert
+        wav_path = Path(tempfile.mktemp(suffix=".wav"))
+        try:
+            subprocess.run([
+                "ffmpeg", "-i", str(audio_path),
+                "-ar", "16000", "-ac", "1", "-f", "wav",
+                str(wav_path)
+            ], check=True, capture_output=True)
+        except subprocess.CalledProcessError as e:
+            return TranscriptionResult(
+                text="",
+                language="",
+                segments=[],
+                backend="whisper-cpp",
+                success=False,
+                error=f"Failed to convert audio to WAV: {e.stderr.decode() if e.stderr else str(e)}"
+            )
+    else:
+        wav_path = audio_path
+
+    try:
+        # Read audio file
+        with open(wav_path, "rb") as f:
+            audio_data = f.read()
+
+        # Prepare request
+        files = {"file": ("audio.wav", audio_data, "audio/wav")}
+        data = {
+            "response_format": "verbose_json" if word_timestamps else "json",
+            "word_timestamps": "true" if word_timestamps else "false"
+        }
+        if language:
+            data["language"] = language
+
+        # Send request
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                server_url,
+                files=files,
+                data=data,
+                timeout=120.0
+            )
+
+        if response.status_code != 200:
+            raise Exception(f"Whisper server error: {response.text}")
+
+        result = response.json()
+
+        # Format response
+        formatted = TranscriptionResult(
+            text=result.get("text", ""),
+            language=result.get("language", ""),
+            segments=result.get("segments", []),
+            backend="whisper-cpp",
+            success=True
+        )
+
+        # Add word timestamps if available
+        if word_timestamps and "words" in result:
+            formatted["words"] = result["words"]
+
+        return formatted
+
+    except Exception as e:
+        return TranscriptionResult(
+            text="",
+            language="",
+            segments=[],
+            backend="whisper-cpp",
+            success=False,
+            error=str(e)
+        )
+
+    finally:
+        # Clean up temp file if created
+        if wav_path != audio_path and wav_path.exists():
+            wav_path.unlink()
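For reference, a minimal sketch of calling one of these backends directly; the audio path is a placeholder, and it assumes OPENAI_API_KEY is available via the environment or the VoiceMode config, as the function above requires.

```python
import asyncio
from pathlib import Path

from voice_mode.tools.transcription.backends import transcribe_with_openai


async def main() -> None:
    # "sample.wav" is a placeholder; OPENAI_API_KEY must be configured
    # for transcribe_with_openai to return success=True.
    result = await transcribe_with_openai(
        Path("sample.wav"),
        word_timestamps=True,
        language="en",
    )
    if result["success"]:
        for word in result.get("words", []):
            print(f'{word["start"]:7.2f}s  {word["word"]}')
    else:
        print(f'{result["backend"]} error: {result["error"]}')


asyncio.run(main())
```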
voice_mode/tools/transcription/core.py

@@ -0,0 +1,136 @@
+"""Core transcription functionality."""
+
+import asyncio
+from pathlib import Path
+from typing import Optional, Union, BinaryIO, Dict, Any
+
+from .types import TranscriptionResult, TranscriptionBackend, OutputFormat
+from .backends import (
+    transcribe_with_openai,
+    transcribe_with_whisperx,
+    transcribe_with_whisper_cpp
+)
+from .formats import convert_to_format
+
+
+async def transcribe_audio(
+    audio_file: Union[str, Path, BinaryIO],
+    word_timestamps: bool = False,
+    backend: TranscriptionBackend = TranscriptionBackend.OPENAI,
+    output_format: OutputFormat = OutputFormat.JSON,
+    language: Optional[str] = None,
+    model: str = "whisper-1"
+) -> TranscriptionResult:
+    """
+    Transcribe audio with optional word-level timestamps.
+
+    This is the main API entry point for VoiceMode transcription.
+
+    Args:
+        audio_file: Path to audio file or file-like object
+        word_timestamps: Include word-level timestamps
+        backend: Which transcription backend to use
+        output_format: Output format for transcription
+        language: Language code (e.g., 'en', 'es', 'fr')
+        model: Model to use (for OpenAI backend)
+
+    Returns:
+        TranscriptionResult with transcription data
+    """
+    # Convert path to Path object
+    if isinstance(audio_file, str):
+        audio_path = Path(audio_file)
+    elif isinstance(audio_file, Path):
+        audio_path = audio_file
+    else:
+        # Handle BinaryIO case
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            tmp.write(audio_file.read())
+            audio_path = Path(tmp.name)
+
+    # Validate file exists
+    if not audio_path.exists():
+        return TranscriptionResult(
+            text="",
+            language="",
+            segments=[],
+            backend=backend.value,
+            success=False,
+            error=f"Audio file not found: {audio_path}"
+        )
+
+    # Call appropriate backend
+    try:
+        if backend == TranscriptionBackend.OPENAI:
+            result = await transcribe_with_openai(
+                audio_path,
+                word_timestamps=word_timestamps,
+                language=language,
+                model=model
+            )
+        elif backend == TranscriptionBackend.WHISPERX:
+            result = await transcribe_with_whisperx(
+                audio_path,
+                word_timestamps=word_timestamps,
+                language=language
+            )
+        elif backend == TranscriptionBackend.WHISPER_CPP:
+            result = await transcribe_with_whisper_cpp(
+                audio_path,
+                word_timestamps=word_timestamps,
+                language=language
+            )
+        else:
+            return TranscriptionResult(
+                text="",
+                language="",
+                segments=[],
+                backend=backend.value,
+                success=False,
+                error=f"Unknown backend: {backend}"
+            )
+
+        # Convert format if needed
+        if output_format != OutputFormat.JSON and result.get("success", False):
+            formatted_content = convert_to_format(result, output_format)
+            result["formatted_content"] = formatted_content
+
+        return result
+
+    except Exception as e:
+        return TranscriptionResult(
+            text="",
+            language="",
+            segments=[],
+            backend=backend.value,
+            success=False,
+            error=str(e)
+        )
+    finally:
+        # Clean up temp file if created from BinaryIO
+        if not isinstance(audio_file, (str, Path)) and audio_path.exists():
+            audio_path.unlink()
+
+
+def transcribe_audio_sync(
+    audio_file: Union[str, Path, BinaryIO],
+    word_timestamps: bool = False,
+    backend: TranscriptionBackend = TranscriptionBackend.OPENAI,
+    output_format: OutputFormat = OutputFormat.JSON,
+    language: Optional[str] = None,
+    model: str = "whisper-1"
+) -> TranscriptionResult:
+    """
+    Synchronous wrapper for transcribe_audio.
+
+    Useful for CLI and non-async contexts.
+    """
+    return asyncio.run(transcribe_audio(
+        audio_file=audio_file,
+        word_timestamps=word_timestamps,
+        backend=backend,
+        output_format=output_format,
+        language=language,
+        model=model
+    ))
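A usage sketch of the entry point added above, via the synchronous wrapper; the file name is a placeholder, and the whisper-cpp backend assumes a local whisper.cpp server on localhost:2022 as described in backends.py.

```python
from voice_mode.tools.transcription.core import transcribe_audio_sync
from voice_mode.tools.transcription.types import TranscriptionBackend, OutputFormat

# "meeting.wav" is a placeholder recording.
result = transcribe_audio_sync(
    "meeting.wav",
    word_timestamps=True,
    backend=TranscriptionBackend.WHISPER_CPP,
    output_format=OutputFormat.SRT,
)

if result["success"]:
    # Non-JSON formats are attached as formatted_content by transcribe_audio.
    print(result.get("formatted_content", result["text"]))
else:
    print(f"Transcription failed: {result['error']}")
```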
voice_mode/tools/transcription/formats.py

@@ -0,0 +1,144 @@
+"""Format converters for transcription output."""
+
+import csv
+import io
+from typing import Dict, Any, List
+
+from .types import TranscriptionResult, OutputFormat
+
+
+def format_timestamp_srt(seconds: float) -> str:
+    """Format timestamp for SRT (HH:MM:SS,mmm)"""
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = seconds % 60
+    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}".replace(".", ",")
+
+
+def format_timestamp_vtt(seconds: float) -> str:
+    """Format timestamp for WebVTT (HH:MM:SS.mmm)"""
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = seconds % 60
+    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"
+
+
+def convert_to_srt(transcription: Dict[str, Any]) -> str:
+    """
+    Convert transcription to SRT subtitle format.
+    """
+    srt_lines = []
+
+    for i, segment in enumerate(transcription.get("segments", []), 1):
+        start = format_timestamp_srt(segment.get("start", 0))
+        end = format_timestamp_srt(segment.get("end", 0))
+        text = segment.get("text", "").strip()
+
+        # Add speaker if available
+        if "speaker" in segment:
+            text = f"[{segment['speaker']}] {text}"
+
+        srt_lines.append(str(i))
+        srt_lines.append(f"{start} --> {end}")
+        srt_lines.append(text)
+        srt_lines.append("")
+
+    return "\n".join(srt_lines)
+
+
+def convert_to_vtt(transcription: Dict[str, Any]) -> str:
+    """
+    Convert transcription to WebVTT format.
+    """
+    vtt_lines = ["WEBVTT", ""]
+
+    for segment in transcription.get("segments", []):
+        start = format_timestamp_vtt(segment.get("start", 0))
+        end = format_timestamp_vtt(segment.get("end", 0))
+        text = segment.get("text", "").strip()
+
+        # Add speaker if available
+        if "speaker" in segment:
+            text = f"<v {segment['speaker']}>{text}"
+
+        vtt_lines.append(f"{start} --> {end}")
+        vtt_lines.append(text)
+        vtt_lines.append("")
+
+    return "\n".join(vtt_lines)
+
+
+def convert_to_csv(transcription: Dict[str, Any]) -> str:
+    """
+    Convert transcription to CSV format with word-level data.
+    """
+    output = io.StringIO()
+
+    # Determine columns based on available data
+    has_words = "words" in transcription and transcription["words"]
+    has_speakers = any("speaker" in w for w in transcription.get("words", []))
+    has_probability = any("probability" in w for w in transcription.get("words", []))
+
+    # Write header
+    if has_words:
+        headers = ["word", "start", "end"]
+        if has_speakers:
+            headers.append("speaker")
+        if has_probability:
+            headers.append("probability")
+    else:
+        headers = ["text", "start", "end"]
+        if has_speakers:
+            headers.append("speaker")
+
+    writer = csv.DictWriter(output, fieldnames=headers)
+    writer.writeheader()
+
+    # Write data
+    if has_words:
+        for word in transcription.get("words", []):
+            row = {
+                "word": word.get("word", ""),
+                "start": word.get("start", 0),
+                "end": word.get("end", 0)
+            }
+            if has_speakers:
+                row["speaker"] = word.get("speaker", "")
+            if has_probability:
+                row["probability"] = word.get("probability", "")
+            writer.writerow(row)
+    else:
+        for segment in transcription.get("segments", []):
+            row = {
+                "text": segment.get("text", "").strip(),
+                "start": segment.get("start", 0),
+                "end": segment.get("end", 0)
+            }
+            if has_speakers:
+                row["speaker"] = segment.get("speaker", "")
+            writer.writerow(row)
+
+    return output.getvalue()
+
+
+def convert_to_format(transcription: TranscriptionResult, format: OutputFormat) -> str:
+    """
+    Convert transcription to specified format.
+
+    Args:
+        transcription: The transcription result
+        format: Target output format
+
+    Returns:
+        Formatted string representation
+    """
+    if format == OutputFormat.SRT:
+        return convert_to_srt(transcription)
+    elif format == OutputFormat.VTT:
+        return convert_to_vtt(transcription)
+    elif format == OutputFormat.CSV:
+        return convert_to_csv(transcription)
+    else:
+        # Default to JSON (handled elsewhere)
+        import json
+        return json.dumps(transcription, indent=2)
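To illustrate the converters above, a small hand-built transcription dict run through convert_to_srt; the segment timings and speaker label are invented for the example.

```python
from voice_mode.tools.transcription.formats import convert_to_srt

# Invented example segments; not output from a real backend.
transcription = {
    "text": "Hello there. Nice to meet you.",
    "segments": [
        {"id": 0, "text": "Hello there.", "start": 0.0, "end": 1.4},
        {"id": 1, "text": "Nice to meet you.", "start": 1.6, "end": 3.0, "speaker": "SPEAKER_01"},
    ],
}

print(convert_to_srt(transcription))
# 1
# 00:00:00,000 --> 00:00:01,400
# Hello there.
#
# 2
# 00:00:01,600 --> 00:00:03,000
# [SPEAKER_01] Nice to meet you.
```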
voice_mode/tools/transcription/types.py

@@ -0,0 +1,52 @@
+"""Type definitions for transcription module."""
+
+from typing import TypedDict, List, Optional, Literal
+from enum import Enum
+
+
+class TranscriptionBackend(str, Enum):
+    """Available transcription backends."""
+    OPENAI = "openai"
+    WHISPERX = "whisperx"
+    WHISPER_CPP = "whisper-cpp"
+
+
+class OutputFormat(str, Enum):
+    """Available output formats."""
+    JSON = "json"
+    SRT = "srt"
+    VTT = "vtt"
+    CSV = "csv"
+
+
+class WordData(TypedDict, total=False):
+    """Word-level timestamp data."""
+    word: str
+    start: float
+    end: float
+    probability: Optional[float]
+    speaker: Optional[str]
+
+
+class SegmentData(TypedDict, total=False):
+    """Segment-level timestamp data."""
+    id: Optional[int]
+    text: str
+    start: float
+    end: float
+    words: Optional[List[WordData]]
+    speaker: Optional[str]
+
+
+class TranscriptionResult(TypedDict, total=False):
+    """Complete transcription result."""
+    text: str
+    language: str
+    duration: Optional[float]
+    segments: List[SegmentData]
+    words: Optional[List[WordData]]
+    backend: str
+    model: Optional[str]
+    success: bool
+    error: Optional[str]
+    formatted_content: Optional[str]  # For non-JSON output formats
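Because these are TypedDicts and string enums, results are plain dicts at runtime; a small sketch of what a successful whisper-cpp result might look like when built by hand.

```python
from voice_mode.tools.transcription.types import (
    OutputFormat,
    TranscriptionBackend,
    TranscriptionResult,
)

# TypedDict(total=False) makes every key optional; this literal is an
# illustrative value, not output captured from a real backend.
result: TranscriptionResult = {
    "text": "hello world",
    "language": "en",
    "segments": [{"id": 0, "text": "hello world", "start": 0.0, "end": 1.2}],
    "backend": TranscriptionBackend.WHISPER_CPP.value,
    "success": True,
}

assert result["success"]
assert OutputFormat("srt") is OutputFormat.SRT
```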
voice_mode/utils/services/kokoro_helpers.py

@@ -24,9 +24,22 @@ def find_kokoro_fastapi() -> Optional[str]:
         if platform.system() == "Darwin":
             start_script = path / "start-gpu_mac.sh"
         else:
-
+            # Check for appropriate start script
+            if has_gpu_support():
+                # Prefer GPU script, fallback to general start
+                possible_scripts = [
+                    path / "start-gpu.sh"
+                ]
+            else:
+                # Prefer CPU script, fallback to general start
+                possible_scripts = [
+                    path / "start-cpu.sh"
+                ]
+
+            # Find first existing script
+            start_script = next((script for script in possible_scripts if script.exists()), None)
 
-        if start_script.exists():
+        if start_script and start_script.exists():
             return str(path)
 
     return None
@@ -37,4 +50,4 @@ def has_gpu_support() -> bool:
 
     This is a wrapper around the shared GPU detection utility.
     """
-    return _has_gpu_support()
+    return _has_gpu_support()
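The change above amounts to picking the first start script that actually exists and tolerating the case where none do; a standalone sketch of that selection pattern, with a hypothetical helper name and install path.

```python
from pathlib import Path
from typing import Optional


def pick_start_script(install_dir: Path, gpu: bool) -> Optional[Path]:
    """Hypothetical helper mirroring the selection logic above: return the
    first existing candidate start script, or None when none exist."""
    candidates = [install_dir / ("start-gpu.sh" if gpu else "start-cpu.sh")]
    return next((script for script in candidates if script.exists()), None)


# Illustrative install path, not necessarily where VoiceMode puts kokoro-fastapi.
script = pick_start_script(Path.home() / "kokoro-fastapi", gpu=False)
if script and script.exists():
    print(f"Would launch Kokoro via {script}")
```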
{voice_mode-2.34.2.dist-info → voice_mode-4.0.1.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: voice-mode
-Version: 2.34.2
+Version: 4.0.1
 Summary: VoiceMode - Voice interaction capabilities for AI assistants (formerly voice-mcp)
 Project-URL: Homepage, https://github.com/mbailey/voicemode
 Project-URL: Repository, https://github.com/mbailey/voicemode
@@ -66,9 +66,12 @@ Requires-Dist: pandas>=2.0.0; extra == 'notebooks'
 Provides-Extra: scripts
 Requires-Dist: flask>=3.0.0; extra == 'scripts'
 Provides-Extra: test
+Requires-Dist: coverage[toml]>=7.4.0; extra == 'test'
 Requires-Dist: pytest-asyncio>=0.21.0; extra == 'test'
-Requires-Dist: pytest-cov>=4.
+Requires-Dist: pytest-cov>=4.1.0; extra == 'test'
 Requires-Dist: pytest-mock>=3.10.0; extra == 'test'
+Requires-Dist: pytest-timeout>=2.2.0; extra == 'test'
+Requires-Dist: pytest-xdist>=3.5.0; extra == 'test'
 Requires-Dist: pytest>=7.0.0; extra == 'test'
 Description-Content-Type: text/markdown
 