bithuman 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bithuman might be problematic. Click here for more details.

Files changed (44) hide show
  1. bithuman/__init__.py +13 -0
  2. bithuman/_version.py +1 -0
  3. bithuman/api.py +164 -0
  4. bithuman/audio/__init__.py +19 -0
  5. bithuman/audio/audio.py +396 -0
  6. bithuman/audio/hparams.py +108 -0
  7. bithuman/audio/utils.py +255 -0
  8. bithuman/config.py +88 -0
  9. bithuman/engine/__init__.py +15 -0
  10. bithuman/engine/auth.py +335 -0
  11. bithuman/engine/compression.py +257 -0
  12. bithuman/engine/enums.py +16 -0
  13. bithuman/engine/image_ops.py +192 -0
  14. bithuman/engine/inference.py +108 -0
  15. bithuman/engine/knn.py +58 -0
  16. bithuman/engine/video_data.py +391 -0
  17. bithuman/engine/video_reader.py +168 -0
  18. bithuman/lib/__init__.py +1 -0
  19. bithuman/lib/audio_encoder.onnx +45631 -28
  20. bithuman/lib/generator.py +763 -0
  21. bithuman/lib/pth2h5.py +106 -0
  22. bithuman/plugins/__init__.py +0 -0
  23. bithuman/plugins/stt.py +185 -0
  24. bithuman/runtime.py +1004 -0
  25. bithuman/runtime_async.py +469 -0
  26. bithuman/service/__init__.py +9 -0
  27. bithuman/service/client.py +788 -0
  28. bithuman/service/messages.py +210 -0
  29. bithuman/service/server.py +759 -0
  30. bithuman/utils/__init__.py +43 -0
  31. bithuman/utils/agent.py +359 -0
  32. bithuman/utils/fps_controller.py +90 -0
  33. bithuman/utils/image.py +41 -0
  34. bithuman/utils/unzip.py +38 -0
  35. bithuman/video_graph/__init__.py +16 -0
  36. bithuman/video_graph/action_trigger.py +83 -0
  37. bithuman/video_graph/driver_video.py +482 -0
  38. bithuman/video_graph/navigator.py +736 -0
  39. bithuman/video_graph/trigger.py +90 -0
  40. bithuman/video_graph/video_script.py +344 -0
  41. bithuman-1.0.2.dist-info/METADATA +37 -0
  42. bithuman-1.0.2.dist-info/RECORD +44 -0
  43. bithuman-1.0.2.dist-info/WHEEL +5 -0
  44. bithuman-1.0.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,391 @@
1
+ """Video data management and blending pipeline — replaces video_data.cpp.
2
+
3
+ Manages per-video frame storage, HDF5 metadata (face_coords, face_masks),
4
+ avatar lip-sync data, and the compositing pipeline.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import threading
10
+ from dataclasses import dataclass
11
+ from typing import Dict, List, Optional
12
+
13
+ import h5py
14
+ import numpy as np
15
+
16
+ from .compression import (
17
+ CompressionType,
18
+ cleanup_temp_dir,
19
+ create_temp_dir,
20
+ decode_image,
21
+ decode_jpeg,
22
+ encode_image,
23
+ )
24
+ from .enums import LoadingMode
25
+ from .image_ops import blend_face_region, resize_image
26
+ from .video_reader import ENCRYPT_KEY, VideoReader, is_encrypted_file, iter_video_frames
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Data structures matching C++ VideoInferenceData / BoundingBox
31
+ # ---------------------------------------------------------------------------
32
@dataclass
class BoundingBox:
    """Axis-aligned face rectangle in pixel coordinates.

    The blending pipeline derives the region size as ``x2 - x1`` and
    ``y2 - y1``, so ``(x1, y1)`` is the top-left corner and ``(x2, y2)``
    the bottom-right corner.
    """

    x1: int
    y1: int
    x2: int
    y2: int
38
+
39
+
40
@dataclass
class VideoInferenceData:
    """Per-video inference metadata loaded from the HDF5 sidecar file.

    Mirrors the C++ ``VideoInferenceData`` struct (see module header).
    """

    # One face bounding box per video frame.
    face_coords: List[BoundingBox]
    # One entry per frame; may be empty bytes when a frame has no mask.
    face_masks: List[bytes]  # JPEG-encoded bytes per frame
    # Size of the frames the coords/masks were computed on.
    frame_wh: tuple  # (width, height)
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # LipFormat — matches VideoData::LipFormat enum in C++
49
+ # ---------------------------------------------------------------------------
50
class LipFormat:
    """String constants describing how avatar lip frames are laid out.

    Matches the ``VideoData::LipFormat`` enum in the C++ implementation.
    """

    TIME_FIRST = "time-first"
    FEATURE_FIRST = "feature-first"
    NONE = "none"

    @staticmethod
    def detect(avatar_data_path: str) -> str:
        """Infer the lip format from markers embedded in the data path.

        Matches video_data.cpp VideoData::detectFormat (lines 232-243).

        Returns ``NONE`` for an empty path; otherwise the first matching
        marker wins ("feature-first" is checked before "time-first").

        Raises:
            RuntimeError: If the path contains neither marker.
        """
        if not avatar_data_path:
            return LipFormat.NONE
        for marker in (LipFormat.FEATURE_FIRST, LipFormat.TIME_FIRST):
            if marker in avatar_data_path:
                return marker
        raise RuntimeError(f"Unknown avatar data format: {avatar_data_path}")
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # HDF5 loading — matches loadVideoInferenceData in video_data.cpp
72
+ # ---------------------------------------------------------------------------
73
def load_video_inference_data(h5_path: str) -> VideoInferenceData:
    """Load face_coords, face_masks, frame_wh from HDF5 file.

    Matches video_data.cpp loadVideoInferenceData (lines 12-95).

    Args:
        h5_path: Path to the HDF5 sidecar file.

    Returns:
        A populated :class:`VideoInferenceData`.
    """
    with h5py.File(h5_path, "r") as h5:
        # frame_wh attribute holds (width, height)
        width, height = h5.attrs["frame_wh"]

        # face_coords dataset: (num_frames, 4) int32 rows -> BoundingBox
        boxes = [BoundingBox(*row) for row in h5["face_coords"][:]]

        # face_masks dataset: variable-length uint8 arrays (JPEG bytes)
        masks_ds = h5["face_masks"]
        jpeg_masks = [bytes(masks_ds[i]) for i in range(len(masks_ds))]

    return VideoInferenceData(
        face_coords=boxes,
        face_masks=jpeg_masks,
        frame_wh=(int(width), int(height)),
    )
96
+
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # VideoData — matches VideoData class in video_data.cpp
100
+ # ---------------------------------------------------------------------------
101
class VideoData:
    """Manages frame data, avatar lip-sync, and compositing for one video.

    Supports three loading modes:

    * ``SYNC``      — decode, resize, and compress every frame in the
      constructor.
    * ``ASYNC``     — same work on a daemon thread; frame accessors block
      until loading finishes.
    * ``ON_DEMAND`` — only metadata is loaded up front; frames are decoded
      lazily on first access and cached compressed.

    NOTE(review): the ON_DEMAND cache is not lock-protected; concurrent
    callers may decode the same frame twice (wasteful but not corrupting) —
    confirm whether multi-threaded access is expected before adding a lock.
    """

    def __init__(
        self,
        video_path: str,
        video_data_path: str,
        avatar_data_path: str,
        lip_format: str,
        compression_type: CompressionType,
        loading_mode: LoadingMode,
        thread_count: int = 0,
    ) -> None:
        """Create the manager and kick off loading per *loading_mode*.

        Args:
            video_path: Source video file.
            video_data_path: HDF5 sidecar with face coords/masks; "" skips it.
            avatar_data_path: Avatar lip-sync video; "" disables lip data.
            lip_format: One of the :class:`LipFormat` constants.
            compression_type: How decoded frames are stored.
            loading_mode: SYNC, ASYNC, or ON_DEMAND.
            thread_count: Decoder thread hint forwarded to VideoReader.
        """
        self._video_path = video_path
        self._video_data_path = video_data_path
        self._avatar_data_path = avatar_data_path
        self._lip_format = lip_format
        self._compression_type = compression_type
        self._loading_mode = loading_mode
        self._thread_count = thread_count

        # Frame storage (compressed blobs, one per frame, SYNC/ASYNC modes)
        self._frames: List[bytes] = []
        self._data: Optional[VideoInferenceData] = None
        self._avatar_reader: Optional[VideoReader] = None

        # ON_DEMAND state. Declared unconditionally so _get_frame_data can
        # never hit an AttributeError (previously these were created inside
        # _load_data_on_demand only).
        self._on_demand_reader: Optional[VideoReader] = None
        self._on_demand_cache: Dict[int, bytes] = {}

        # Mask decoding cache (one-shot decode, matches ensureMasksDecoded)
        self._decoded_masks: Optional[List[np.ndarray]] = None
        self._masks_lock = threading.Lock()

        # Temp dir for TEMP_FILE compression
        self._temp_dir = ""
        if compression_type == CompressionType.TEMP_FILE:
            self._temp_dir = create_temp_dir()

        # Async loading state
        self._loading_thread: Optional[threading.Thread] = None
        self._loaded = threading.Event()

        # Load based on mode
        if loading_mode == LoadingMode.SYNC:
            self._load_data()
            self._loaded.set()
        elif loading_mode == LoadingMode.ASYNC:
            self._loading_thread = threading.Thread(
                target=self._load_data_async, daemon=True
            )
            self._loading_thread.start()
        elif loading_mode == LoadingMode.ON_DEMAND:
            self._load_data_on_demand()
            self._loaded.set()

    def _load_data(self) -> None:
        """Load all data synchronously. Matches video_data.cpp loadData.

        Raises:
            RuntimeError: If the video's frame count disagrees with the
                number of face coordinates in the sidecar data.
        """
        frame_width = -1
        frame_height = -1
        num_frames = -1

        if self._video_data_path:
            self._data = load_video_inference_data(self._video_data_path)
            frame_width, frame_height = self._data.frame_wh
            num_frames = len(self._data.face_coords)

        # Load and compress video frames
        self._frames = self._read_video_frames(frame_width, frame_height, num_frames)

        # Verify frame count against the sidecar metadata
        if (
            self._data
            and self._data.face_coords
            and len(self._frames) != len(self._data.face_coords)
        ):
            raise RuntimeError(
                f"Video contains {len(self._frames)} frames, but video data "
                f"contains {len(self._data.face_coords)} face coordinates"
            )

        # Initialize avatar reader for lip-sync data
        if self._avatar_data_path:
            self._avatar_reader = VideoReader(
                self._avatar_data_path,
                ENCRYPT_KEY.decode("utf-8"),
                self._thread_count,
            )

    def _load_data_async(self) -> None:
        """Load data in background thread. Matches video_data.cpp loadDataAsync.

        The event is set even on failure so waiting accessors don't block
        forever (they will then surface the missing-data error instead).
        """
        try:
            self._load_data()
        finally:
            self._loaded.set()

    def _load_data_on_demand(self) -> None:
        """Initialize metadata only; frames loaded lazily.

        Matches video_data.cpp loadDataOnDemand.
        """
        if self._video_data_path:
            self._data = load_video_inference_data(self._video_data_path)

        # Initialize avatar reader
        if self._avatar_data_path:
            self._avatar_reader = VideoReader(
                self._avatar_data_path,
                ENCRYPT_KEY.decode("utf-8"),
                self._thread_count,
            )

    def _read_video_frames(
        self, frame_width: int, frame_height: int, num_frames: int
    ) -> List[bytes]:
        """Read video frames, resize if needed, and compress.

        Uses streaming decode: each frame is decoded, compressed, and then
        the raw frame is discarded. Peak memory is ~1 raw frame + compressed
        frames (instead of all raw frames + compressed frames).

        Matches image.cpp readVideoFrames (lines 85-112).

        Args:
            frame_width: Target width, or -1 to keep the source size.
            frame_height: Target height, or -1 to keep the source size.
            num_frames: Maximum frames to read, or <= 0 for all.

        Returns:
            Compressed frame blobs in decode order.
        """
        max_frames = num_frames if num_frames > 0 else -1
        frames: List[bytes] = []
        for frame in iter_video_frames(self._video_path, "", 0, max_frames):
            if frame.size == 0:
                continue
            if (
                frame_width != -1
                and frame_height != -1
                and (frame.shape[1] != frame_width or frame.shape[0] != frame_height)
            ):
                frame = resize_image(frame, frame_width, frame_height)
            frames.append(
                encode_image(frame, self._compression_type, temp_dir=self._temp_dir)
            )
        return frames

    def _get_frame_data(self, frame_idx: int) -> bytes:
        """Get compressed frame data by index.

        Matches video_data.cpp getFrameData (lines 201-215).

        Blocks until ASYNC loading finishes; lazily decodes-and-caches in
        ON_DEMAND mode.

        Raises:
            RuntimeError: If *frame_idx* is outside the loaded frame range.
        """
        if self._loading_mode in (LoadingMode.ASYNC, LoadingMode.ON_DEMAND):
            # Wait for async loading to complete (returns at once if done)
            self._loaded.wait()

        if self._loading_mode == LoadingMode.ON_DEMAND and frame_idx not in self._on_demand_cache:
            # Lazy load this frame
            if self._on_demand_reader is None:
                self._on_demand_reader = VideoReader(self._video_path, "", 1)
            frame = self._on_demand_reader.get(frame_idx)
            if (
                self._data
                and self._data.frame_wh[0] > 0
                and (
                    frame.shape[1] != self._data.frame_wh[0]
                    or frame.shape[0] != self._data.frame_wh[1]
                )
            ):
                frame = resize_image(frame, self._data.frame_wh[0], self._data.frame_wh[1])
            compressed = encode_image(
                frame, self._compression_type, temp_dir=self._temp_dir
            )
            self._on_demand_cache[frame_idx] = compressed
            return compressed

        if self._loading_mode == LoadingMode.ON_DEMAND:
            return self._on_demand_cache[frame_idx]

        if frame_idx < 0 or frame_idx >= len(self._frames):
            raise RuntimeError(f"Frame index out of range: {frame_idx}")
        return self._frames[frame_idx]

    def get_original_frame(self, frame_idx: int) -> np.ndarray:
        """Decode compressed frame to BGR image.

        Matches video_data.cpp getOriginalFrame (lines 245-248).
        """
        data = self._get_frame_data(frame_idx)
        return decode_image(data, self._compression_type)

    def get_avatar_frame(
        self, frame_idx: int, cluster_idx: int, num_clusters: int
    ) -> np.ndarray:
        """Get lip overlay frame using TIME_FIRST or FEATURE_FIRST indexing.

        Matches video_data.cpp getAvatarFrame (lines 250-265).

        Raises:
            RuntimeError: If no avatar data path was configured.
        """
        if not self._avatar_reader:
            raise RuntimeError("Avatar reader not initialized")

        if self._lip_format == LipFormat.TIME_FIRST:
            # Frames grouped by cluster: all of cluster 0, then cluster 1, ...
            index = cluster_idx * self._avatar_reader.size() // num_clusters + frame_idx
        else:
            # FEATURE_FIRST: frames interleaved per time step
            index = frame_idx * num_clusters + cluster_idx

        return self._avatar_reader.get(index)

    def _ensure_masks_decoded(self) -> None:
        """Decode all JPEG face masks on first access, then cache.

        Matches video_data.cpp ensureMasksDecoded (lines 267-279).
        Uses call_once pattern via double-checked threading lock.
        """
        if self._decoded_masks is not None:
            return
        with self._masks_lock:
            if self._decoded_masks is not None:
                return
            if not self._data or not self._data.face_masks:
                self._decoded_masks = []
                return
            masks = []
            for mask_bytes in self._data.face_masks:
                if mask_bytes:
                    masks.append(decode_jpeg(mask_bytes))
                else:
                    # Placeholder for frames without a mask
                    masks.append(np.empty((0, 0, 3), dtype=np.uint8))
            self._decoded_masks = masks
            # Release compressed data (matches C++ clear+shrink_to_fit)
            self._data.face_masks = []

    def get_blended_frame(
        self, frame_idx: int, cluster_idx: int, num_clusters: int
    ) -> np.ndarray:
        """Full compositing pipeline: original + resize + lip + mask blend.

        Matches video_data.cpp getBlendedFrame (lines 281-316):
        1. Get original frame, resize to frame_wh if needed
        2. Get avatar lip frame
        3. Get face bounding box
        4. Decode face mask (cached)
        5. Alpha blend lip into face region

        Raises:
            RuntimeError: If no video data (face coords/masks) is loaded, or
                the decoded mask's size does not match the face region.
        """
        # Blending requires face coords and masks; fail with a clear error
        # instead of an AttributeError on self._data below.
        if self._data is None:
            raise RuntimeError("Video inference data not loaded; cannot blend")

        # 1. Get original frame, resize if needed
        frame = self.get_original_frame(frame_idx)
        if (
            frame.shape[1] != self._data.frame_wh[0]
            or frame.shape[0] != self._data.frame_wh[1]
        ):
            frame = resize_image(frame, self._data.frame_wh[0], self._data.frame_wh[1])

        # Make frame writable (in case it came from a read-only buffer)
        if not frame.flags.writeable:
            frame = frame.copy()

        # 2. Get lip frame
        lip = self.get_avatar_frame(frame_idx, cluster_idx, num_clusters)

        # 3. Get face bounding box
        box = self._data.face_coords[frame_idx]
        face_width = box.x2 - box.x1
        face_height = box.y2 - box.y1

        # 4. Decode face mask (cached after first call)
        self._ensure_masks_decoded()
        mask = self._decoded_masks[frame_idx]
        if mask.shape[1] != face_width or mask.shape[0] != face_height:
            raise RuntimeError(
                f"Mask size does not match face region size: "
                f"{mask.shape[1]} != {face_width} or "
                f"{mask.shape[0]} != {face_height}"
            )

        # 5. Blend lip into face region (in place)
        blend_face_region(frame, box.x1, box.y1, face_width, face_height, lip, mask)

        return frame

    def has_lip_data(self) -> bool:
        """Return True when an avatar lip-sync reader is available."""
        return self._avatar_reader is not None

    @property
    def num_frames(self) -> int:
        """Number of frames, from loaded frames or ON_DEMAND metadata."""
        if self._frames:
            return len(self._frames)
        if self._loading_mode == LoadingMode.ON_DEMAND and self._data:
            return len(self._data.face_coords)
        return 0

    def __del__(self) -> None:
        # getattr guards: __del__ runs even when __init__ raised before
        # these attributes were assigned.
        reader = getattr(self, "_avatar_reader", None)
        if reader:
            reader.close()
        temp_dir = getattr(self, "_temp_dir", "")
        if temp_dir:
            cleanup_temp_dir(temp_dir)
@@ -0,0 +1,168 @@
1
+ """Video decoding with PyAV — replaces video_decode.cpp.
2
+
3
+ Handles both regular video files and XOR-encrypted .bhtensor files.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import io
9
+ from typing import Iterator, List, Optional
10
+
11
+ import numpy as np
12
+
13
ENCRYPT_KEY = b"bithuman_video_data_key"  # 23 bytes — matches C++ video_data_key
14
+
15
+
16
def is_encrypted_file(path: str) -> bool:
    """Return True when *path* names an XOR-encrypted ``.bhtensor`` file.

    Matches video_decode.cpp isEncryptedFile. Detection is purely by
    filename extension; the file contents are not inspected.
    """
    suffix = ".bhtensor"
    return path[-len(suffix):] == suffix
+
23
+
24
def xor_decrypt_bytes(data: bytes, key: bytes = ENCRYPT_KEY) -> bytes:
    """XOR *data* with a repeating *key*, returning a new bytes object.

    Matches video_decode.cpp decryptDataInPlace: byte-wise XOR with repeating
    key. Implemented with numpy's vectorized XOR (much faster than a Python
    byte loop). Since XOR is its own inverse, the same call encrypts and
    decrypts.
    """
    payload = np.frombuffer(data, dtype=np.uint8)
    # np.resize repeats the key cyclically to exactly the payload length.
    repeated_key = np.resize(np.frombuffer(key, dtype=np.uint8), payload.size)
    return (payload ^ repeated_key).tobytes()
+ return data_arr.tobytes()
38
+
39
+
40
def read_and_decrypt_file(path: str, key: bytes = ENCRYPT_KEY) -> bytes:
    """Read the whole file at *path* and return its XOR-decrypted contents.

    Matches video_decode.cpp readAndDecryptFile.
    """
    with open(path, "rb") as fh:
        encrypted = fh.read()
    return xor_decrypt_bytes(encrypted, key)
+ return xor_decrypt_bytes(data, key)
48
+
49
+
50
class VideoReader:
    """PyAV-based video reader replacing the FFmpeg C wrapper.

    Decodes all frames into memory for random access. Avatar data videos
    are typically small, so this is acceptable and matches SYNC loading.

    For encrypted .bhtensor files, the file is fully decrypted into memory
    then passed to PyAV via BytesIO.
    """

    def __init__(
        self,
        video_path: str,
        key: str = "",
        thread_count: int = 0,
    ) -> None:
        """Open *video_path* and eagerly decode every frame.

        Args:
            video_path: Video file (.mp4 or encrypted .bhtensor).
            key: Decryption key for .bhtensor files; "" uses ENCRYPT_KEY.
            thread_count: Decoder threads; 0 leaves PyAV's default.
        """
        import av

        self._path = video_path
        self._is_encrypted = is_encrypted_file(video_path)
        # Set before av.open so close() is safe even if opening fails.
        self._container = None

        # Open container
        if self._is_encrypted:
            decrypt_key = key.encode("utf-8") if key else ENCRYPT_KEY
            decrypted = read_and_decrypt_file(video_path, decrypt_key)
            self._container = av.open(io.BytesIO(decrypted))
        else:
            self._container = av.open(video_path)

        stream = self._container.streams.video[0]
        stream.thread_type = "AUTO"
        if thread_count > 0:
            # Honor the requested decoder thread count (previously this
            # parameter was accepted but silently ignored).
            stream.codec_context.thread_count = thread_count

        self._width: int = stream.codec_context.width
        self._height: int = stream.codec_context.height

        # Decode all frames into memory for random access
        self._frames: List[np.ndarray] = []
        self._container.seek(0)
        for frame in self._container.decode(video=0):
            bgr = frame.to_ndarray(format="bgr24")
            self._frames.append(bgr)

        self._frame_count = len(self._frames)

    def get(self, index: int) -> np.ndarray:
        """Get a decoded frame by index (returns a copy).

        Negative indices count from the end, Python-style.
        Matches VideoReader::get in video_decode.cpp.

        Raises:
            IndexError: If *index* is out of range after normalization.
        """
        if index < 0:
            index = self._frame_count + index
        if index < 0 or index >= self._frame_count:
            raise IndexError(
                f"Frame index {index} out of range [0, {self._frame_count})"
            )
        return self._frames[index].copy()

    def size(self) -> int:
        """Return total number of decoded frames."""
        return self._frame_count

    @property
    def width(self) -> int:
        """Frame width in pixels, from the codec context."""
        return self._width

    @property
    def height(self) -> int:
        """Frame height in pixels, from the codec context."""
        return self._height

    def close(self) -> None:
        """Close the underlying container. Safe to call more than once."""
        container = getattr(self, "_container", None)
        if container:
            container.close()
            self._container = None
123
+
124
+
125
def iter_video_frames(
    video_path: str,
    key: str = "",
    thread_count: int = 0,
    max_frames: int = -1,
) -> Iterator[np.ndarray]:
    """Yield decoded BGR24 frames one at a time from a video file.

    Unlike VideoReader (which decodes all frames into memory), this function
    yields frames sequentially and discards each one after yielding. Peak
    memory is ~1 frame instead of all frames.

    Args:
        video_path: Path to video file (.mp4 or .bhtensor).
        key: Decryption key for .bhtensor files. Empty string uses default.
        thread_count: Unused (kept for API symmetry with VideoReader).
        max_frames: Maximum number of frames to yield. -1 means all frames.

    Yields:
        BGR24 numpy arrays of shape (height, width, 3).
    """
    import av

    if is_encrypted_file(video_path):
        secret = key.encode("utf-8") if key else ENCRYPT_KEY
        plaintext = read_and_decrypt_file(video_path, secret)
        container = av.open(io.BytesIO(plaintext))
    else:
        container = av.open(video_path)

    try:
        video_stream = container.streams.video[0]
        video_stream.thread_type = "AUTO"
        container.seek(0)
        emitted = 0
        for decoded in container.decode(video=0):
            # A non-negative limit stops the stream once reached; with
            # max_frames == 0 nothing is yielded at all.
            if max_frames >= 0 and emitted >= max_frames:
                break
            yield decoded.to_ndarray(format="bgr24")
            emitted += 1
    finally:
        container.close()
@@ -0,0 +1 @@
1
+