PyPI - converse-framework - Versions diffs - 0.2.0__py3-none-any.whl - Mend

converse-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

converse_framework/__init__.py +108 -0
converse_framework/audio_utils.py +412 -0
converse_framework/cuda_utils.py +176 -0
converse_framework/events.py +94 -0
converse_framework/examples/__init__.py +20 -0
converse_framework/examples/subprocess_provider.py +439 -0
converse_framework/examples/text_chat.py +308 -0
converse_framework/examples/voice_chat.py +223 -0
converse_framework/examples/websocket_voice_chat.py +174 -0
converse_framework/js/browser-voice-client.js +248 -0
converse_framework/js/mic-frame-sender.js +445 -0
converse_framework/js/speaker-echo-guard.js +308 -0
converse_framework/js/tts-audio-player.js +237 -0
converse_framework/pipeline.py +620 -0
converse_framework/protocols.py +382 -0
converse_framework/provider_events.py +159 -0
converse_framework/providers/__init__.py +28 -0
converse_framework/providers/faster_whisper.py +290 -0
converse_framework/providers/kokoro_onnx.py +391 -0
converse_framework/providers/llamacpp.py +264 -0
converse_framework/providers/mock.py +171 -0
converse_framework/providers/pocket_tts.py +409 -0
converse_framework/providers/silero.py +161 -0
converse_framework/providers/unavailable.py +137 -0
converse_framework/providers/whisper_cpp.py +322 -0
converse_framework/registry.py +397 -0
converse_framework/session.py +315 -0
converse_framework/transport.py +54 -0
converse_framework/utterance_collector.py +336 -0
converse_framework-0.2.0.dist-info/METADATA +992 -0
converse_framework-0.2.0.dist-info/RECORD +33 -0
converse_framework-0.2.0.dist-info/WHEEL +4 -0
converse_framework-0.2.0.dist-info/licenses/LICENSE +21 -0

converse_framework/protocols.py ADDED Viewed

@@ -0,0 +1,382 @@
+"""Provider interfaces and shared dataclasses for the speech stack."""
+from __future__ import annotations
+from collections.abc import AsyncIterator, Awaitable, Callable
+from dataclasses import dataclass
+from typing import Protocol, runtime_checkable
+@dataclass(frozen=True)
+class ProviderCapabilities:
+    """Static capability flags advertised by a provider implementation.
+    The pipeline and transport layers consult these flags to decide
+    which features (partial transcripts, streaming TTS, barge-in, GPU
+    requirements, supported languages) are available without
+    instantiating the provider.
+    Attributes:
+        supports_partials: Provider can emit non-final transcript
+            chunks while audio is still arriving.
+        supports_streaming_tts: TTS can start streaming audio chunks
+            before the full text is known.
+        supports_barge_in: Provider can detect user speech while TTS
+            is still playing and signal cancellation.
+        requires_gpu: Provider needs a GPU at runtime. UI layers use
+            this to warn the user before they select the provider.
+        languages: ISO language codes the provider can handle.
+            Defaults to ``("en",)``.
+    """
+    # Every flag defaults to False and ``languages`` to English only, so
+    # a provider has to opt in to each capability explicitly.
+    # ``frozen=True`` makes instances immutable after construction.
+    supports_partials: bool = False
+    supports_streaming_tts: bool = False
+    supports_barge_in: bool = False
+    requires_gpu: bool = False
+    languages: tuple[str, ...] = ("en",)
+@dataclass(frozen=True)
+class ProviderStatus:
+    """Runtime status snapshot of a provider.
+    Returned by the ``status`` property / ``check_status`` coroutine of
+    every provider protocol. ``ready`` is the headline boolean the UI
+    uses to enable / disable a provider row. ``message`` carries the
+    human-readable explanation (missing dependency, model not loaded,
+    GPU absent, ...).
+    Attributes:
+        name: Provider name as registered in :mod:`registry` (e.g.
+            ``"mock"``, ``"silero"``, ``"faster-whisper"``).
+        kind: Provider category, one of ``"vad"``, ``"asr"``,
+            ``"llm"``, ``"tts"``.
+        ready: True if the provider can be used right now.
+        message: Human-readable status, surfaced verbatim in the UI.
+        install_hint: Optional package spec to install when this
+            provider is unavailable because an optional dependency is
+            missing, e.g. ``"converse-framework[silero]"``.
+        missing_extra: Optional extra name for UI display when the
+            framework knows which optional dependency group is missing.
+        capabilities: Static feature flags for this provider.
+        provider_id: Stable identifier for UI selection when the
+            registered ``name`` is aliased.
+        selected: True if this provider is the one currently bound
+            into the active :class:`ProviderBundle`.
+        loaded: True if the heavy backend has been initialised.
+        managed_externally: Provider lifecycle is owned by another
+            runtime (e.g. a TTS preset manager) and the framework
+            should not call :meth:`load` / :meth:`unload` on it.
+        supports_model_management: Provider exposes model hot-swap.
+        supports_voice_selection: Provider exposes voice selection.
+        voices: Tuple of voice metadata dicts (``id``, ``label``, …)
+            supported by this provider, if discoverable.
+        active_voice: Currently selected voice identifier, if any.
+        models: Tuple of model metadata dicts (``id``, ``label``, …)
+            supported by this provider, if discoverable.
+        active_model: Currently selected model identifier, if any.
+        status_level: Categorical readiness level. One of
+            ``"ready"`` | ``"configured"`` | ``"loading"`` |
+            ``"error"`` | ``"unavailable"``.
+    """
+    # Required fields: these have no defaults and must be supplied for
+    # every snapshot. Note that ``capabilities`` comes fifth in field
+    # order even though the docstring lists it after ``missing_extra``.
+    name: str
+    kind: str
+    ready: bool
+    message: str
+    capabilities: ProviderCapabilities
+    # Optional fields. ``kind`` above and ``status_level`` below are plain
+    # ``str``; the value sets named in the docstring are not enforced by
+    # this class.
+    install_hint: str | None = None
+    missing_extra: str | None = None
+    provider_id: str | None = None
+    selected: bool = False
+    # ``loaded`` is the one boolean flag that defaults to True.
+    loaded: bool = True
+    managed_externally: bool = False
+    supports_model_management: bool = False
+    supports_voice_selection: bool = False
+    # ``voices`` / ``models`` hold dicts, which are unhashable: the
+    # ``__hash__`` generated for a frozen dataclass raises ``TypeError``
+    # for any instance where either tuple is non-empty.
+    voices: tuple[dict[str, str], ...] = ()
+    active_voice: str | None = None
+    models: tuple[dict[str, str], ...] = ()
+    active_model: str | None = None
+    status_level: str = "ready"
+@dataclass(frozen=True)
+class VoiceInfo:
+    """Structured metadata for a voice supported by a TTS provider.
+    Attributes:
+        id: Machine-readable voice identifier (e.g. ``"azelma"``).
+        label: Human-readable voice display name (e.g. ``"Azelma"``).
+        language: ISO language code for the voice (e.g. ``"en"``, ``"fr"``).
+        description: Optional human-readable description.
+        gender: Optional gender hint (``"male"``, ``"female"``, ``"neutral"``).
+    """
+    # Only ``id`` and ``label`` are required; the remaining fields default
+    # to English, an empty description and ``"neutral"``. ``gender`` is a
+    # plain ``str`` -- the listed values are not enforced here.
+    # NOTE(review): ``ProviderStatus.voices`` carries plain dicts while
+    # ``TTSProvider.list_voices`` returns ``VoiceInfo`` objects; presumably
+    # one is converted to the other elsewhere -- confirm.
+    id: str
+    label: str
+    language: str = "en"
+    description: str = ""
+    gender: str = "neutral"
+@dataclass(frozen=True)
+class ProviderConfigResult:
+    """Result of a provider :meth:`~TTSProvider.configure` call.
+    Attributes:
+        status: The provider's status after applying the change.
+        changed: True if at least one option value actually changed.
+        requires_reload: True if the change invalidated cached model
+            or voice state and the provider needs a :meth:`load` call
+            before it can be used again.
+        message: Human-readable summary of what was changed.
+    """
+    # Only ``status`` is required. The defaults describe a call that
+    # changed nothing: ``changed=False``, ``requires_reload=False`` and
+    # an empty message.
+    status: ProviderStatus
+    changed: bool = False
+    requires_reload: bool = False
+    message: str = ""
+@dataclass(frozen=True)
+class TranscriptEvent:
+    """A single incremental transcript chunk produced by an ASR provider.
+    ASR providers stream a sequence of these events for every audio
+    turn. Non-final events (``final=False``) are hypothesis updates
+    that may still change; only the last ``final=True`` event in a
+    stream is authoritative for the utterance.
+    Attributes:
+        text: Transcript text for this chunk. For non-final chunks
+            this is the running hypothesis; for the final chunk it is
+            the committed utterance text.
+        final: True iff this is the closing, committed transcript for
+            the current utterance.
+    """
+    # Both fields are required: ``final`` has no default, so every
+    # producer must state explicitly whether a chunk is committed.
+    text: str
+    final: bool
+@dataclass(frozen=True)
+class AudioChunk:
+    """A single chunk of encoded audio emitted by a TTS provider.
+    TTS providers yield a stream of these chunks. The framework does
+    not interpret the audio bytes directly -- it forwards them to
+    transports -- but it does attach enough metadata for downstream
+    consumers to render or persist the audio correctly.
+    Attributes:
+        data: Raw encoded audio bytes (the encoding is described by
+            ``encoding`` / ``mime_type``).
+        mime_type: Optional MIME hint (``"audio/wav"``,
+            ``"audio/mpeg"``, ...). ``None`` if the provider cannot
+            name the encoding.
+        sample_rate: Samples per second of the decoded audio, or
+            ``None`` if not applicable (e.g. compressed formats
+            served whole).
+        channels: Channel count of the decoded audio. Defaults to 1.
+        encoding: Encoding name (``"pcm_s16le"``, ``"mp3"``,
+            ``"wav"`` ...). Matches the value the
+            :mod:`audio_utils` helpers expect.
+        duration_ms: Duration of this chunk in milliseconds, when the
+            provider can compute it. ``None`` for the first chunk of
+            streaming codecs.
+        final: True if this is the last chunk for the current
+            synthesis request.
+    """
+    # Only ``data`` is required. Every metadata field is optional and
+    # defaults to "unknown" (``None``), a single channel, or non-final.
+    data: bytes
+    mime_type: str | None = None
+    sample_rate: int | None = None
+    channels: int = 1
+    encoding: str | None = None
+    duration_ms: int | None = None
+    final: bool = False
+@dataclass(frozen=True)
+class VADEvent:
+    """A single VAD decision produced by a VAD provider.
+    The :class:`AudioUtteranceCollector` state machine consumes a
+    stream of these events per :class:`AudioFrame` to drive the
+    recording lifecycle.
+    Attributes:
+        type: Event kind. ``"vad.speech_start"`` marks the leading
+            edge of detected speech; ``"vad.speech_end"`` marks the
+            trailing edge; ``"vad.probability"`` is an intermediate
+            level readout that does not change the recording state.
+        probability: Confidence of the decision, in ``[0.0, 1.0]``.
+        audio_ms: Position in the current utterance, in milliseconds
+            from the first frame of the turn.
+    """
+    # All three fields are required. Nothing is validated here: ``type``
+    # is a plain ``str`` and the documented ``probability`` range is a
+    # convention for producers, not a check performed by this class.
+    type: str
+    probability: float
+    audio_ms: int
+# Async progress hook accepted by ``ASRProvider.transcribe_audio`` and
+# ``TTSProvider.stream_audio_with_progress``: called with a ``str`` and a
+# ``dict`` and awaited for completion.
+# NOTE(review): presumably (event/stage name, payload) -- confirm against
+# the pipeline code that supplies the callback.
+ProgressCallback = Callable[[str, dict], Awaitable[None]]
+@runtime_checkable
+class VADProvider(Protocol):
+    """Voice-activity-detection provider.
+    Implementations consume a stream of parsed :class:`AudioFrame`
+    objects and emit :class:`VADEvent` decisions that the utterance
+    collector turns into utterance boundaries.
+    The ``status`` property exposes the current
+    :class:`ProviderStatus`; :meth:`check_status` is the async form
+    that performs a real probe (file existence, model loaded, ...).
+    ``probe_status`` is a cheap, no-model-load variant;
+    ``load_status`` may load heavy resources.
+    Unlike the ASR and TTS protocols, no ``load`` method is declared
+    here, so the default ``load_status`` falls back to
+    ``probe_status`` (and therefore to ``check_status``).
+    """
+    # The default bodies below are only inherited by classes that
+    # subclass this protocol explicitly; purely structural
+    # implementations must define every method themselves.
+    @property
+    def status(self) -> ProviderStatus: ...
+    async def check_status(self) -> ProviderStatus: ...
+    async def probe_status(self) -> ProviderStatus:
+        """Cheap readiness probe, does not load heavy resources."""
+        return await self.check_status()
+    async def load_status(self) -> ProviderStatus:
+        """May load or initialise heavy resources."""
+        return await self.probe_status()
+    # ``frame`` is unannotated; the class docstring describes it as a
+    # parsed ``AudioFrame``, a type that is not defined in this module.
+    async def process_frame(self, frame) -> list[VADEvent]: ...
+    async def unload(self) -> ProviderStatus: ...
+@runtime_checkable
+class ASRProvider(Protocol):
+    """Automatic-speech-recognition provider.
+    Implementations accept either raw 16-bit signed-LE mono PCM
+    bytes (audio path) or a transcript seed string (text path) and
+    stream :class:`TranscriptEvent` chunks. The text-input path is
+    used by the pipeline to keep the public API symmetric between
+    audio and chat front-ends.
+    """
+    # The default bodies below are only inherited by classes that
+    # subclass this protocol explicitly; purely structural
+    # implementations must define every method themselves.
+    @property
+    def status(self) -> ProviderStatus: ...
+    async def check_status(self) -> ProviderStatus: ...
+    async def probe_status(self) -> ProviderStatus:
+        """Cheap readiness probe, does not load heavy resources."""
+        return await self.check_status()
+    async def load_status(self) -> ProviderStatus:
+        """May load or initialise heavy resources."""
+        # The default simply delegates to ``load`` and returns its status.
+        return await self.load()
+    async def load(self) -> ProviderStatus: ...
+    # Both transcribe methods are declared with plain ``def`` and return
+    # an ``AsyncIterator`` -- presumably implemented as async generators;
+    # confirm against the concrete providers.
+    def transcribe_text_input(self, text: str) -> AsyncIterator[TranscriptEvent]: ...
+    def transcribe_audio(
+        self,
+        pcm_s16le: bytes,
+        sample_rate: int,
+        progress: ProgressCallback | None = None,
+    ) -> AsyncIterator[TranscriptEvent]: ...
+    async def unload(self) -> ProviderStatus: ...
+@runtime_checkable
+class LLMProvider(Protocol):
+    """Large-language-model provider.
+    Implementations take an OpenAI-style ``messages`` list
+    (``[{"role": ..., "content": ...}, ...]``) and stream token
+    strings. The pipeline feeds these tokens into the TTS chunker,
+    so implementations do not need to do their own sentence
+    splitting -- a simple token stream is the contract.
+    This protocol declares neither ``load`` nor ``unload``; both
+    default status helpers resolve to ``check_status``.
+    """
+    # The default bodies below are only inherited by classes that
+    # subclass this protocol explicitly; purely structural
+    # implementations must define every method themselves.
+    @property
+    def status(self) -> ProviderStatus: ...
+    async def check_status(self) -> ProviderStatus: ...
+    async def probe_status(self) -> ProviderStatus:
+        """Cheap readiness probe, does not load heavy resources."""
+        return await self.check_status()
+    async def load_status(self) -> ProviderStatus:
+        """May load or initialise heavy resources."""
+        return await self.check_status()
+    # Declared with plain ``def`` returning an ``AsyncIterator`` of token
+    # strings -- presumably implemented as an async generator; confirm
+    # against the concrete providers.
+    def stream_response(self, messages: list[dict[str, str]]) -> AsyncIterator[str]: ...
+@runtime_checkable
+class TTSProvider(Protocol):
+    """Text-to-speech provider.
+    Implementations accept a single text string and stream
+    :class:`AudioChunk` objects back. The :meth:`stream_audio` form
+    is the simple contract; :meth:`stream_audio_with_progress` adds
+    an optional progress callback the pipeline uses to emit
+    ``tts.progress`` events to the transport layer.
+    """
+    @property
+    def status(self) -> ProviderStatus: ...
+    async def check_status(self) -> ProviderStatus: ...
+    async def probe_status(self) -> ProviderStatus:
+        """Cheap readiness probe, does not load heavy resources."""
+        return await self.check_status()
+    async def load_status(self) -> ProviderStatus:
+        """May load or initialise heavy resources."""
+        return await self.load()
+    async def load(self) -> ProviderStatus: ...
+    async def unload(self) -> ProviderStatus: ...
+    async def configure(self, **options) -> ProviderConfigResult:
+        """Apply configuration changes.
+        Supported options depend on the provider implementation.
+        Returns a :class:`ProviderConfigResult` describing whether
+        the change was applied and whether a reload is required.
+        """
+        from .protocols import ProviderConfigResult
+        return ProviderConfigResult(
+            status=await self.check_status(),
+            changed=False,
+            requires_reload=False,
+            message="configure() is not implemented by this provider.",
+        )
+    def list_voices(self) -> tuple[VoiceInfo, ...]:
+        """Return structured metadata for voices this provider supports.
+        Implementations should return static metadata where possible
+        rather than importing the heavy backend just to enumerate
+        voices.
+        """
+        return ()
+    def stream_audio(self, text: str) -> AsyncIterator[AudioChunk]: ...
+    def stream_audio_with_progress(
+        self, text: str, progress: ProgressCallback | None = None
+    ) -> AsyncIterator[AudioChunk]: ...

converse_framework/provider_events.py ADDED Viewed

@@ -0,0 +1,159 @@
+"""Standardised provider lifecycle event helpers.
+These helpers produce a consistent event shape for provider loading,
+loaded, and error events across VAD, ASR, LLM, and TTS providers.
+The pipeline and provider code emit these alongside existing
+``asr.progress`` and ``tts.progress`` events for backward compatibility.
+Event types emitted:
+* ``provider.loading`` — a provider has begun loading a model.
+* ``provider.loaded`` — a provider has finished loading.
+* ``provider.error`` — a provider encountered a non-recoverable error.
+Payload fields:
+* ``kind`` — ``"vad"`` | ``"asr"`` | ``"llm"`` | ``"tts"``
+* ``provider`` — provider name (:attr:`ProviderStatus.name`)
+* ``provider_id`` — stable identifier (:attr:`ProviderStatus.provider_id`)
+* ``stage`` — substage description (``"loading"``, ``"loaded"``, …)
+* ``message`` — human-readable detail
+* ``error_type`` — exception class name for error events
+* ``loaded`` — bool, whether the provider reports loaded after the event
+* ``latency_ms`` — elapsed milliseconds when available
+* ``turn_id`` and ``mode`` — tied to a turn when emitted from pipeline
+Typical usage::
+    from converse_framework.provider_events import provider_loading_event
+    await sink.emit(**provider_loading_event(
+        kind="asr",
+        provider="faster-whisper",
+        stage="loading",
+        message="Loading model...",
+    ))
+Or with latency and turn context::
+    await sink.emit(
+        **provider_error_event(
+            kind="tts",
+            provider="pocket-tts",
+            stage="synthesis",
+            message=str(exc),
+            error_type=type(exc).__name__,
+        ),
+        turn_id=turn_id,
+        mode=turn_mode,
+        latency_ms=elapsed_ms(started),
+    )
+"""
+from __future__ import annotations
+from typing import Any
+def provider_loading_event(
+    *,
+    kind: str,
+    provider: str,
+    stage: str = "loading",
+    message: str = "",
+    **extra: Any,
+) -> dict[str, Any]:
+    """Build a ``provider.loading`` event payload.
+    Args:
+        kind: Provider category (``"vad"``, ``"asr"``, ``"llm"``, ``"tts"``).
+        provider: Provider name from :attr:`ProviderStatus.name`.
+        stage: Sub-stage label (e.g. ``"loading"``, ``"downloading"``).
+        message: Human-readable description.
+        **extra: Additional fields forwarded verbatim. A key that
+            collides with a built-in one (``event_type``, ``loaded``)
+            replaces it.
+    Returns:
+        A flat, keyword-expandable dict -- ``event_type`` plus the
+        payload fields at the same level, with ``loaded`` set to
+        ``False`` -- suitable for ``await sink.emit(**result)``.
+    """
+    # ``**extra`` is unpacked last so caller-supplied keys win on collision.
+    return {
+        "event_type": "provider.loading",
+        "kind": kind,
+        "provider": provider,
+        "stage": stage,
+        "message": message,
+        "loaded": False,
+        **extra,
+    }
+def provider_loaded_event(
+    *,
+    kind: str,
+    provider: str,
+    stage: str = "loaded",
+    message: str = "",
+    latency_ms: int | None = None,
+    **extra: Any,
+) -> dict[str, Any]:
+    """Build a ``provider.loaded`` event payload.
+    Args:
+        kind: Provider category.
+        provider: Provider name.
+        stage: Sub-stage label (e.g. ``"loaded"``).
+        message: Human-readable description.
+        latency_ms: Elapsed milliseconds for the load operation. The
+            key is omitted from the result when this is ``None``.
+        **extra: Additional fields forwarded verbatim. A key that
+            collides with a built-in one (``event_type``, ``loaded``)
+            replaces it.
+    Returns:
+        A flat, keyword-expandable dict (``event_type`` plus the
+        payload fields, ``loaded`` set to ``True``) for
+        ``await sink.emit(**result)``.
+    """
+    payload: dict[str, Any] = {
+        "event_type": "provider.loaded",
+        "kind": kind,
+        "provider": provider,
+        "stage": stage,
+        "message": message,
+        "loaded": True,
+    }
+    # Only report a latency when one was measured, so consumers must not
+    # assume the key is present.
+    if latency_ms is not None:
+        payload["latency_ms"] = latency_ms
+    # Applied last so caller-supplied keys win on collision.
+    payload.update(extra)
+    return payload
+def provider_error_event(
+    *,
+    kind: str,
+    provider: str,
+    stage: str = "",
+    message: str = "",
+    error_type: str = "Exception",
+    loaded: bool = False,
+    **extra: Any,
+) -> dict[str, Any]:
+    """Build a ``provider.error`` event payload.
+    Args:
+        kind: Provider category.
+        provider: Provider name.
+        stage: Sub-stage where the error occurred. Defaults to an
+            empty string.
+        message: Human-readable error description.
+        error_type: Exception class name (e.g. ``"RuntimeError"``).
+        loaded: Whether the provider was loaded at the time of error.
+        **extra: Additional fields forwarded verbatim. An
+            ``event_type`` key supplied here replaces the built-in one.
+    Returns:
+        A flat, keyword-expandable dict (``event_type`` plus the
+        payload fields) for ``await sink.emit(**result)``.
+    """
+    # ``**extra`` is unpacked last so caller-supplied keys win on collision.
+    return {
+        "event_type": "provider.error",
+        "kind": kind,
+        "provider": provider,
+        "stage": stage,
+        "message": message,
+        "error_type": error_type,
+        "loaded": loaded,
+        **extra,
+    }

converse_framework/providers/__init__.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""Built-in provider implementations.
+Mock and unavailable providers are imported eagerly because they have no
+heavy dependencies. The concrete providers (``silero``, ``faster-whisper``,
+``llamacpp``, ``kokoro-onnx``, ``pocket-tts``) are not imported here --
+they are registered with :func:`converse_framework.registry.register_provider`
+by import string and loaded lazily on first use.
+"""
+from converse_framework.providers.mock import (
+    MockASRProvider,
+    MockLLMProvider,
+    MockTTSProvider,
+    MockVADProvider,
+)
+from converse_framework.providers.unavailable import (
+    UnavailableProvider,
+    extra_hint_for,
+)
+# Public API of the providers package: exactly the six names imported
+# eagerly above (four mock providers plus the unavailable-provider helpers).
+__all__ = [
+    "MockASRProvider",
+    "MockLLMProvider",
+    "MockTTSProvider",
+    "MockVADProvider",
+    "UnavailableProvider",
+    "extra_hint_for",
+]