bithuman 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bithuman/__init__.py +13 -0
- bithuman/_version.py +1 -0
- bithuman/api.py +164 -0
- bithuman/audio/__init__.py +19 -0
- bithuman/audio/audio.py +396 -0
- bithuman/audio/hparams.py +108 -0
- bithuman/audio/utils.py +255 -0
- bithuman/config.py +88 -0
- bithuman/engine/__init__.py +15 -0
- bithuman/engine/auth.py +335 -0
- bithuman/engine/compression.py +257 -0
- bithuman/engine/enums.py +16 -0
- bithuman/engine/image_ops.py +192 -0
- bithuman/engine/inference.py +108 -0
- bithuman/engine/knn.py +58 -0
- bithuman/engine/video_data.py +391 -0
- bithuman/engine/video_reader.py +168 -0
- bithuman/lib/__init__.py +1 -0
- bithuman/lib/audio_encoder.onnx +45631 -28
- bithuman/lib/generator.py +763 -0
- bithuman/lib/pth2h5.py +106 -0
- bithuman/plugins/__init__.py +0 -0
- bithuman/plugins/stt.py +185 -0
- bithuman/runtime.py +1004 -0
- bithuman/runtime_async.py +469 -0
- bithuman/service/__init__.py +9 -0
- bithuman/service/client.py +788 -0
- bithuman/service/messages.py +210 -0
- bithuman/service/server.py +759 -0
- bithuman/utils/__init__.py +43 -0
- bithuman/utils/agent.py +359 -0
- bithuman/utils/fps_controller.py +90 -0
- bithuman/utils/image.py +41 -0
- bithuman/utils/unzip.py +38 -0
- bithuman/video_graph/__init__.py +16 -0
- bithuman/video_graph/action_trigger.py +83 -0
- bithuman/video_graph/driver_video.py +482 -0
- bithuman/video_graph/navigator.py +736 -0
- bithuman/video_graph/trigger.py +90 -0
- bithuman/video_graph/video_script.py +344 -0
- bithuman-1.0.2.dist-info/METADATA +37 -0
- bithuman-1.0.2.dist-info/RECORD +44 -0
- bithuman-1.0.2.dist-info/WHEEL +5 -0
- bithuman-1.0.2.dist-info/top_level.txt +1 -0

bithuman/utils/__init__.py
ADDED

@@ -0,0 +1,43 @@
import hashlib
from pathlib import Path
from typing import Optional

from loguru import logger

from .fps_controller import FPSController

__all__ = ["FPSController"]


def calculate_file_hash(file_path: str) -> Optional[str]:
    """Calculate an MD5 hash of a file.

    This function reads the file in chunks to efficiently handle large files
    and calculates an MD5 hash, which is returned as a hexadecimal string.

    Args:
        file_path: Path to the file to be hashed

    Returns:
        A hexadecimal string representing the file hash, or None if the file doesn't exist

    Raises:
        IOError: If there's an error reading the file
    """
    try:
        path = Path(file_path)
        if not path.is_file():
            logger.warning(f"Cannot calculate hash for non-file: {file_path}")
            return None

        md5_hash = hashlib.md5()

        # Read the file in chunks of 4K to avoid loading large files into memory
        with open(path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                md5_hash.update(byte_block)

        return md5_hash.hexdigest()
    except Exception as e:
        logger.error(f"Error calculating file hash for {file_path}: {e}")
        raise
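For reference, a minimal usage sketch of calculate_file_hash (not part of the package; it hashes this script's own file so it runs anywhere):

from bithuman.utils import calculate_file_hash

# Returns a hex digest string, or None if the path is not a regular file.
digest = calculate_file_hash(__file__)
print(f"MD5: {digest}")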
bithuman/utils/agent.py
ADDED

@@ -0,0 +1,359 @@
from __future__ import annotations

import asyncio
import time
from abc import ABC, abstractmethod
from typing import Any, Optional

import cv2
import numpy as np

try:
    from livekit import rtc
    from livekit.agents import utils
    from livekit.agents.voice import AgentSession, io
    from livekit.agents.voice.avatar import (
        AudioReceiver,
        AudioSegmentEnd,
        AvatarOptions,
    )
    from livekit.agents.voice.chat_cli import ChatCLI
except ImportError:
    raise ImportError(
        "livekit-agents is required, please install it with `pip install livekit-agents[openai,silero,deepgram,cartesia]~=1.0rc`"
    )
from loguru import logger

from bithuman import AsyncBithuman, AudioChunk, VideoFrame
from bithuman.utils import FPSController


class AudioOutput(ABC):
    @abstractmethod
    async def capture_frame(self, audio_chunk: AudioChunk) -> None:
        pass

    @abstractmethod
    def clear_buffer(self) -> None:
        pass


class VideoOutput(ABC):
    @abstractmethod
    async def capture_frame(
        self, frame: VideoFrame, fps: float, exp_time: float
    ) -> None:
        pass

    @abstractmethod
    def buffer_empty(self) -> bool:
        pass


class LocalAudioIO(ChatCLI, AudioOutput):
    """Chat interface that redirects audio output to a custom destination."""

    def __init__(
        self,
        session: AgentSession,
        agent_audio_output: io.AudioOutput,
        *,
        buffer_size: int = 0,
        loop: Optional[asyncio.AbstractEventLoop] = None,
    ) -> None:
        super().__init__(agent_session=session, loop=loop)
        self._redirected_audio_output = agent_audio_output
        self._input_buffer = utils.aio.Chan[rtc.AudioFrame](maxsize=buffer_size)
        self._forward_audio_atask: Optional[asyncio.Task] = None

        self._sample_rate = self._audio_sink.sample_rate
        self._resampler: Optional[rtc.AudioResampler] = None

    async def start(self) -> None:
        await super().start()
        self._forward_audio_atask = asyncio.create_task(self._forward_audio())

    async def capture_frame(self, audio_chunk: AudioChunk) -> None:
        audio_frame = rtc.AudioFrame(
            data=audio_chunk.bytes,
            sample_rate=audio_chunk.sample_rate,
            num_channels=1,
            samples_per_channel=len(audio_chunk.array),
        )

        if not self._resampler and self._sample_rate != audio_chunk.sample_rate:
            self._resampler = rtc.AudioResampler(
                input_rate=audio_chunk.sample_rate,
                output_rate=self._sample_rate,
                num_channels=1,
            )

        if self._resampler:
            for f in self._resampler.push(audio_frame):
                await self._input_buffer.send(f)
        else:
            await self._input_buffer.send(audio_frame)

    def clear_buffer(self) -> None:
        while not self._input_buffer.empty():
            self._input_buffer.recv_nowait()
        with self._audio_sink.lock:
            self._audio_sink.audio_buffer.clear()

    @utils.log_exceptions(logger=logger)
    async def _forward_audio(self) -> None:
        async for frame in self._input_buffer:
            await self._audio_sink.capture_frame(frame)

    def _update_speaker(self, *, enable: bool) -> None:
        super()._update_speaker(enable=enable)

        # redirect the agent's audio output
        if enable:
            self._session.output.audio = self._redirected_audio_output
        else:
            self._session.output.audio = None

    async def aclose(self) -> None:
        if not self._done_fut.done():
            self._done_fut.set_result(None)
        if self._main_atask:
            await utils.aio.cancel_and_wait(self._main_atask)

        self._input_buffer.close()
        if self._forward_audio_atask:
            await utils.aio.cancel_and_wait(self._forward_audio_atask)


class LocalVideoPlayer(VideoOutput):
    """Video display for rendering avatar frames with debug information."""

    def __init__(
        self,
        window_size: tuple[int, int],
        window_name: str = "BitHuman Avatar",
        buffer_size: int = 0,
    ) -> None:
        self.window_name: str = window_name
        self.start_time: Optional[float] = None
        self._input_buffer = utils.aio.Chan[tuple[VideoFrame, float, float]](
            maxsize=buffer_size
        )
        self._display_atask: Optional[asyncio.Task] = None

        cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL)
        cv2.resizeWindow(self.window_name, window_size[0], window_size[1])

        self.start_time = asyncio.get_event_loop().time()
        self._display_atask = asyncio.create_task(self._display_frame())

    async def aclose(self) -> None:
        cv2.destroyAllWindows()
        if self._display_atask:
            await utils.aio.cancel_and_wait(self._display_atask)

    async def capture_frame(
        self, frame: VideoFrame, fps: float = 0.0, exp_time: float = 0.0
    ) -> None:
        if not frame.has_image:
            return
        await self._input_buffer.send((frame, fps, exp_time))

    def buffer_empty(self) -> bool:
        return self._input_buffer.empty()

    @utils.log_exceptions(logger=logger)
    async def _display_frame(self) -> None:
        async for frame, fps, exp_time in self._input_buffer:
            image = await self.render_image(frame, fps, exp_time)
            cv2.imshow(self.window_name, image)
            cv2.waitKey(1)

    async def render_image(
        self, frame: VideoFrame, fps: float = 0.0, exp_time: float = 0.0
    ) -> np.ndarray:
        image = frame.bgr_image.copy()

        # Add overlay information
        self._add_debug_info(image, fps, exp_time)

        return image

    def _add_debug_info(self, image: np.ndarray, fps: float, exp_time: float) -> None:
        # Add FPS information
        cv2.putText(
            image,
            f"FPS: {fps:.1f}",
            (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0),
            2,
        )

        # Add elapsed time
        current_time = asyncio.get_event_loop().time()
        if self.start_time is not None:
            elapsed = current_time - self.start_time
            cv2.putText(
                image,
                f"Time: {elapsed:.1f}s",
                (10, 70),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0, 255, 0),
                2,
            )

        # Add expiration time if available
        if exp_time > 0:
            exp_in_seconds = exp_time - time.time()
            cv2.putText(
                image,
                f"Exp in: {exp_in_seconds:.1f}s",
                (10, 110),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0, 255, 0),
                2,
            )


class LocalAvatarRunner:
    """Controls and synchronizes avatar audio and video playback."""

    def __init__(
        self,
        *,
        bithuman_runtime: AsyncBithuman,
        audio_input: AudioReceiver,
        audio_output: AudioOutput,
        video_output: VideoOutput,
        options: AvatarOptions,
        runtime_kwargs: dict[str, Any] | None = None,
    ) -> None:
        self._bithuman_runtime = bithuman_runtime
        self._runtime_kwargs = runtime_kwargs or {}
        self._options = options

        self._audio_recv = audio_input
        self._audio_output = audio_output
        self._video_output = video_output
        self._stop_event = asyncio.Event()

        # State management
        self._playback_position: float = 0.0
        self._audio_playing: bool = False
        self._tasks: set[asyncio.Task] = set()
        self._read_audio_atask: Optional[asyncio.Task] = None
        self._publish_video_atask: Optional[asyncio.Task] = None

        # FPS control
        self._fps_controller = FPSController(target_fps=options.video_fps)

    async def start(self) -> None:
        await self._audio_recv.start()

        # Setup event handler
        self._audio_recv.on("clear_buffer", self._create_clear_buffer_task)

        # Start processing tasks
        self._read_audio_atask = asyncio.create_task(self._read_audio())
        self._publish_video_atask = asyncio.create_task(self._publish_video())

    def _create_clear_buffer_task(self) -> None:
        """Create a task to handle clear buffer events."""
        task = asyncio.create_task(self._handle_clear_buffer())
        self._tasks.add(task)
        task.add_done_callback(self._tasks.discard)

    @utils.log_exceptions(logger=logger)
    async def _read_audio(self) -> None:
        """Process incoming audio frames."""
        async for frame in self._audio_recv:
            if self._stop_event.is_set():
                break

            if not self._audio_playing and isinstance(frame, rtc.AudioFrame):
                self._audio_playing = True
            if isinstance(frame, AudioSegmentEnd):
                await self._bithuman_runtime.flush()
                continue
            await self._bithuman_runtime.push_audio(
                bytes(frame.data), frame.sample_rate, last_chunk=False
            )

    @utils.log_exceptions(logger=logger)
    async def _publish_video(self) -> None:
        """Process and display video frames."""
        async for frame in self._bithuman_runtime.run(
            out_buffer_empty=self._video_output.buffer_empty,
            **self._runtime_kwargs,
        ):
            # Control frame rate
            sleep_time = self._fps_controller.wait_next_frame(sleep=False)
            if sleep_time > 0:
                await asyncio.sleep(sleep_time)

            # Send video frame
            if frame.has_image:
                await self._video_output.capture_frame(
                    frame,
                    fps=self._fps_controller.average_fps,
                    exp_time=self._bithuman_runtime.get_expiration_time(),
                )

            # Send audio chunk
            audio_chunk = frame.audio_chunk
            if audio_chunk is not None:
                await self._audio_output.capture_frame(audio_chunk)
                self._playback_position += audio_chunk.duration

            # Handle end of speech
            if frame.end_of_speech:
                await self._handle_end_of_speech()

            self._fps_controller.update()

    async def _handle_end_of_speech(self) -> None:
        """Handle end of speech event."""
        if self._audio_playing:
            notify_task = self._audio_recv.notify_playback_finished(
                playback_position=self._playback_position,
                interrupted=False,
            )
            if asyncio.iscoroutine(notify_task):
                await notify_task

            self._playback_position = 0.0
            self._audio_playing = False

    async def _handle_clear_buffer(self) -> None:
        """Handle clearing the buffer and notify about interrupted playback."""
        tasks = []
        self._bithuman_runtime.interrupt()
        self._audio_output.clear_buffer()

        # Handle interrupted playback
        if self._audio_playing:
            notify_task = self._audio_recv.notify_playback_finished(
                playback_position=self._playback_position,
                interrupted=True,
            )
            if asyncio.iscoroutine(notify_task):
                tasks.append(notify_task)
            self._playback_position = 0.0
            self._audio_playing = False

        await asyncio.gather(*tasks)

    async def aclose(self) -> None:
        """Close the avatar controller and clean up resources."""
        if self._read_audio_atask:
            await utils.aio.cancel_and_wait(self._read_audio_atask)
        if self._publish_video_atask:
            await utils.aio.cancel_and_wait(self._publish_video_atask)
        await utils.aio.cancel_and_wait(*self._tasks)

    def stop(self) -> None:
        self._stop_event.set()
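For orientation, a minimal wiring sketch of LocalAvatarRunner (not part of the package). It assumes the AsyncBithuman runtime, the AudioReceiver, the AvatarOptions, and AudioOutput/VideoOutput implementations are constructed elsewhere; only the constructor keywords shown above are used:

import asyncio

# Sketch under assumptions: `runtime` (AsyncBithuman), `audio_receiver`
# (AudioReceiver), `options` (AvatarOptions), and the `audio_out` /
# `video_out` implementations are built by the caller.
async def run_avatar(runtime, audio_receiver, audio_out, video_out, options) -> None:
    runner = LocalAvatarRunner(
        bithuman_runtime=runtime,
        audio_input=audio_receiver,
        audio_output=audio_out,
        video_output=video_out,
        options=options,
    )
    await runner.start()  # spawns the audio-reading and video-publishing tasks
    try:
        await asyncio.Event().wait()  # run until this coroutine is cancelled
    finally:
        runner.stop()
        await runner.aclose()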

bithuman/utils/fps_controller.py
ADDED

@@ -0,0 +1,90 @@
from __future__ import annotations

import time
from collections import deque

from loguru import logger


class FPSController:
    """Controls frame rate for synchronous processing.

    Maintains target FPS by calculating appropriate sleep times and adjusting
    for processing delays.

    Attributes:
        target_fps: Target frames per second
        frame_interval: Time interval between frames in seconds
        average_fps: Current average FPS
    """

    def __init__(
        self, target_fps: int, max_frame_count: int = 10, disabled: bool = False
    ) -> None:
        """Initialize FPS controller.

        Args:
            target_fps: Target frames per second
            max_frame_count: Number of frames to keep for FPS calculation
            disabled: If True, the FPS controller will be disabled.
        """
        self.target_fps = target_fps
        self.frame_interval = 1.0 / target_fps
        self.max_frame_count = max_frame_count
        self.disabled = disabled

        # Timing control
        self.next_frame_time = None
        self.display_ts: deque[float] = deque(maxlen=max_frame_count)
        self.average_fps = 0

    def wait_next_frame(self, *, sleep: bool = True) -> float:
        """Wait until it's time for the next frame.

        Adjusts sleep time based on actual FPS to maintain target rate.
        """
        current_time = time.time()

        # Initialize next_frame_time if needed
        if self.next_frame_time is None:
            self.next_frame_time = current_time
            self.display_ts.clear()

        # Calculate sleep time to maintain target FPS
        sleep_time = self.next_frame_time - current_time

        if sleep_time > 0 and not self.disabled:
            # Adjust sleep time based on actual FPS
            if len(self.display_ts) >= 2:
                self.average_fps = (len(self.display_ts) - 1) / (
                    self.display_ts[-1] - self.display_ts[0]
                )
                scale = min(1.1, max(0.9, self.average_fps / self.target_fps))
                sleep_time *= scale
            if sleep:
                time.sleep(sleep_time)
            return sleep_time
        else:
            # Check if significantly behind schedule
            if -sleep_time > self.frame_interval * 8:
                logger.warning(
                    f"Frame processing was behind schedule for "
                    f"{-sleep_time * 1000:.2f} ms"
                )
                self.next_frame_time = time.time()
            return sleep_time

    def update(self) -> None:
        """Update timing information after processing a frame."""
        current_time = time.time()

        # Update timing information (deque auto-evicts oldest when maxlen exceeded)
        self.display_ts.append(current_time)

        # Calculate next frame time
        self.next_frame_time += self.frame_interval

    @property
    def fps(self) -> float:
        """Get current average FPS."""
        return self.average_fps
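A minimal synchronous usage sketch (the 25 fps target and the simulated workload are illustrative, not from the package):

import time
from bithuman.utils import FPSController

controller = FPSController(target_fps=25)  # illustrative target rate
for _ in range(50):
    controller.wait_next_frame()  # sleeps as needed to hold ~25 fps
    time.sleep(0.01)              # simulated per-frame work
    controller.update()           # record the frame timestamp
print(f"average fps: {controller.fps:.1f}")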
bithuman/utils/image.py
ADDED

@@ -0,0 +1,41 @@
from __future__ import annotations

import cv2
import numpy as np

try:
    from turbojpeg import TurboJPEG

    jpeg_encoder = TurboJPEG()
except (ImportError, ModuleNotFoundError, RuntimeError):
    jpeg_encoder = None


def encode_image(image: np.ndarray, quality: int = 85) -> bytes:
    """Encode the image to bytes."""
    if jpeg_encoder is not None:
        return jpeg_encoder.encode(image, quality=quality)
    return cv2.imencode(".jpg", image, [int(cv2.IMWRITE_JPEG_QUALITY), quality])[
        1
    ].tobytes()


def decode_image(image_bytes: bytes) -> np.ndarray:
    """Decode the image from bytes."""
    if jpeg_encoder is not None:
        return jpeg_encoder.decode(image_bytes)
    return cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)


class CompressedImage:
    """A compressed image."""

    def __init__(self, data: bytes | np.ndarray) -> None:
        """Initialize the compressed image."""
        if isinstance(data, np.ndarray):
            data = encode_image(data)
        self.data = data

    def as_numpy(self) -> np.ndarray:
        """Get the image data as a numpy array."""
        return decode_image(self.data)
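A short round-trip sketch of CompressedImage (the synthetic 64x64 BGR frame is illustrative):

import numpy as np
from bithuman.utils.image import CompressedImage

frame = np.zeros((64, 64, 3), dtype=np.uint8)  # synthetic BGR frame
compressed = CompressedImage(frame)            # encodes to JPEG bytes
print(len(compressed.data), "bytes")
restored = compressed.as_numpy()               # decodes back to a BGR array
print(restored.shape)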
bithuman/utils/unzip.py
ADDED

@@ -0,0 +1,38 @@
from __future__ import annotations

import tarfile
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional


def unzip_tarfile(
    file_path: str, extract_to_local: bool = False
) -> tuple[str, Optional[TemporaryDirectory]]:
    """Unzip the workspace directory if it is a file."""
    file_path: Path = Path(file_path)
    if file_path.is_dir():
        return str(file_path), None

    # Extract the workspace
    if not extract_to_local:
        temp_dir_handle = TemporaryDirectory()
        dest_dir = temp_dir_handle.name
    else:
        temp_dir_handle = None
        dest_dir = str(file_path.parent / file_path.stem)
        if dest_dir.endswith(".tar"):
            dest_dir = dest_dir[:-4]  # Remove .tar suffix

    if temp_dir_handle is not None or not Path(dest_dir).exists():
        Path(dest_dir).mkdir(parents=True, exist_ok=True)
        mode = "r:gz" if file_path.name.endswith("gz") else "r"
        with tarfile.open(file_path, mode) as tar:
            tar.extractall(dest_dir)
    file_path = dest_dir

    # Enter the dir if there is only one directory in the tar file
    files = list(Path(dest_dir).iterdir())
    if len(files) == 1 and files[0].is_dir():
        file_path = str(files[0])
    return file_path, temp_dir_handle
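A usage sketch (the archive path "workspace.tar.gz" is a hypothetical example):

from bithuman.utils.unzip import unzip_tarfile

# Extracts into a TemporaryDirectory by default; the handle is None
# when the input is already a directory or extract_to_local=True.
workspace_dir, tmp_handle = unzip_tarfile("workspace.tar.gz")
try:
    print("extracted to:", workspace_dir)
finally:
    if tmp_handle is not None:
        tmp_handle.cleanup()  # remove the TemporaryDirectory when done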

bithuman/video_graph/__init__.py
ADDED

@@ -0,0 +1,16 @@
from . import trigger
from .driver_video import DriverVideo, Frame, LoopingVideo, SingleActionVideo
from .navigator import VideoGraphNavigator
from .video_script import VideoConfig, VideoConfigs, VideoScript

__all__ = [
    "DriverVideo",
    "LoopingVideo",
    "SingleActionVideo",
    "VideoConfigs",
    "VideoConfig",
    "VideoScript",
    "VideoGraphNavigator",
    "Frame",
    "trigger",
]

bithuman/video_graph/action_trigger.py
ADDED

@@ -0,0 +1,83 @@
from __future__ import annotations

import json
from typing import Any, List, Literal, Optional

from loguru import logger
from pydantic import BaseModel, Field


class TriggerData(BaseModel):
    """Data to be sent when a trigger is activated"""

    target_video: Optional[str] = None
    actions: List[str] | str = Field(default_factory=list)
    description: str = ""


class VideoActionTrigger(BaseModel):
    """Base class for video action triggers"""

    trigger_data: TriggerData = Field(
        description="Data to be sent when trigger conditions are met"
    )

    def check_trigger(self, condition: Any) -> Optional[TriggerData]:
        """
        Base method to check if trigger conditions are met

        Args:
            condition: The condition to check against (type varies by trigger type)

        Returns:
            TriggerData if triggered, None otherwise
        """
        return None

    @classmethod
    def from_json(cls, json_str: str) -> List["VideoActionTrigger"]:
        """
        Create trigger instances from a JSON string using Pydantic validation

        Args:
            json_str: JSON string containing trigger configurations

        Returns:
            List of validated trigger instances
        """
        if not json_str:
            return []
        try:
            triggers_data = json.loads(json_str)
            return [
                cls.model_validate_json(json.dumps(trigger))
                for trigger in triggers_data
            ]
        except Exception as e:
            logger.exception(f"Error parsing triggers: {e}")
            return []


class KeywordTrigger(VideoActionTrigger):
    """Trigger that activates when specific keywords are detected"""

    keywords: List[str] = Field(
        description="List of keywords that can trigger this action"
    )
    trigger_source: Literal["user", "agent", "both"] = Field(
        default="both", description="Who can trigger this action - user, agent, or both"
    )

    def check_trigger(
        self, text: str, source: Literal["user", "agent"]
    ) -> Optional[TriggerData]:
        """
        Check if the given text and source triggers this keyword

        Args:
            text: The text to check
            source: The source of the text - either "user" or "agent"

        Returns:
            TriggerData if triggered, None otherwise
        """
        if self.trigger_source != "both" and source != self.trigger_source:
            return None

        if any(keyword.lower() in text.lower() for keyword in self.keywords):
            return self.trigger_data
        return None
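A short KeywordTrigger usage sketch (the video and action names are illustrative):

from bithuman.video_graph.action_trigger import KeywordTrigger, TriggerData

trigger = KeywordTrigger(
    trigger_data=TriggerData(target_video="wave", actions=["wave_hand"]),
    keywords=["hello", "hi"],
    trigger_source="user",
)

print(trigger.check_trigger("Hi there!", source="user"))   # -> the TriggerData
print(trigger.check_trigger("Hi there!", source="agent"))  # -> None (wrong source)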