bithuman 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bithuman might be problematic. Click here for more details.

Files changed (44) hide show
  1. bithuman/__init__.py +13 -0
  2. bithuman/_version.py +1 -0
  3. bithuman/api.py +164 -0
  4. bithuman/audio/__init__.py +19 -0
  5. bithuman/audio/audio.py +396 -0
  6. bithuman/audio/hparams.py +108 -0
  7. bithuman/audio/utils.py +255 -0
  8. bithuman/config.py +88 -0
  9. bithuman/engine/__init__.py +15 -0
  10. bithuman/engine/auth.py +335 -0
  11. bithuman/engine/compression.py +257 -0
  12. bithuman/engine/enums.py +16 -0
  13. bithuman/engine/image_ops.py +192 -0
  14. bithuman/engine/inference.py +108 -0
  15. bithuman/engine/knn.py +58 -0
  16. bithuman/engine/video_data.py +391 -0
  17. bithuman/engine/video_reader.py +168 -0
  18. bithuman/lib/__init__.py +1 -0
  19. bithuman/lib/audio_encoder.onnx +45631 -28
  20. bithuman/lib/generator.py +763 -0
  21. bithuman/lib/pth2h5.py +106 -0
  22. bithuman/plugins/__init__.py +0 -0
  23. bithuman/plugins/stt.py +185 -0
  24. bithuman/runtime.py +1004 -0
  25. bithuman/runtime_async.py +469 -0
  26. bithuman/service/__init__.py +9 -0
  27. bithuman/service/client.py +788 -0
  28. bithuman/service/messages.py +210 -0
  29. bithuman/service/server.py +759 -0
  30. bithuman/utils/__init__.py +43 -0
  31. bithuman/utils/agent.py +359 -0
  32. bithuman/utils/fps_controller.py +90 -0
  33. bithuman/utils/image.py +41 -0
  34. bithuman/utils/unzip.py +38 -0
  35. bithuman/video_graph/__init__.py +16 -0
  36. bithuman/video_graph/action_trigger.py +83 -0
  37. bithuman/video_graph/driver_video.py +482 -0
  38. bithuman/video_graph/navigator.py +736 -0
  39. bithuman/video_graph/trigger.py +90 -0
  40. bithuman/video_graph/video_script.py +344 -0
  41. bithuman-1.0.2.dist-info/METADATA +37 -0
  42. bithuman-1.0.2.dist-info/RECORD +44 -0
  43. bithuman-1.0.2.dist-info/WHEEL +5 -0
  44. bithuman-1.0.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,391 @@
1
+ """Video data management and blending pipeline — replaces video_data.cpp.
2
+
3
+ Manages per-video frame storage, HDF5 metadata (face_coords, face_masks),
4
+ avatar lip-sync data, and the compositing pipeline.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import threading
10
+ from dataclasses import dataclass
11
+ from typing import Dict, List, Optional
12
+
13
+ import h5py
14
+ import numpy as np
15
+
16
+ from .compression import (
17
+ CompressionType,
18
+ cleanup_temp_dir,
19
+ create_temp_dir,
20
+ decode_image,
21
+ decode_jpeg,
22
+ encode_image,
23
+ )
24
+ from .enums import LoadingMode
25
+ from .image_ops import blend_face_region, resize_image
26
+ from .video_reader import ENCRYPT_KEY, VideoReader, is_encrypted_file, iter_video_frames
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Data structures matching C++ VideoInferenceData / BoundingBox
31
+ # ---------------------------------------------------------------------------
32
@dataclass
class BoundingBox:
    """Axis-aligned face rectangle in pixel coordinates.

    The blending pipeline derives the region size as ``x2 - x1`` and
    ``y2 - y1``, so ``(x1, y1)`` is the top-left corner and ``(x2, y2)``
    the bottom-right corner.
    """

    x1: int
    y1: int
    x2: int
    y2: int
38
+
39
+
40
@dataclass
class VideoInferenceData:
    """Per-video inference metadata loaded from the HDF5 sidecar file.

    Mirrors the C++ ``VideoInferenceData`` struct (see module header).
    """

    # One face bounding box per video frame.
    face_coords: List[BoundingBox]
    # One entry per frame; may be empty bytes when a frame has no mask.
    face_masks: List[bytes]  # JPEG-encoded bytes per frame
    # Size of the frames the coords/masks were computed on.
    frame_wh: tuple  # (width, height)
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # LipFormat — matches VideoData::LipFormat enum in C++
49
+ # ---------------------------------------------------------------------------
50
class LipFormat:
    """String constants describing how avatar lip frames are laid out.

    Matches the ``VideoData::LipFormat`` enum in the C++ implementation.
    """

    TIME_FIRST = "time-first"
    FEATURE_FIRST = "feature-first"
    NONE = "none"

    @staticmethod
    def detect(avatar_data_path: str) -> str:
        """Infer the lip format from markers embedded in the data path.

        Matches video_data.cpp VideoData::detectFormat (lines 232-243).

        Returns ``NONE`` for an empty path; otherwise the first matching
        marker wins ("feature-first" is checked before "time-first").

        Raises:
            RuntimeError: If the path contains neither marker.
        """
        if not avatar_data_path:
            return LipFormat.NONE
        for marker in (LipFormat.FEATURE_FIRST, LipFormat.TIME_FIRST):
            if marker in avatar_data_path:
                return marker
        raise RuntimeError(f"Unknown avatar data format: {avatar_data_path}")
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # HDF5 loading — matches loadVideoInferenceData in video_data.cpp
72
+ # ---------------------------------------------------------------------------
73
def load_video_inference_data(h5_path: str) -> VideoInferenceData:
    """Load face_coords, face_masks, frame_wh from HDF5 file.

    Matches video_data.cpp loadVideoInferenceData (lines 12-95).

    Args:
        h5_path: Path to the HDF5 sidecar file.

    Returns:
        A populated :class:`VideoInferenceData`.
    """
    with h5py.File(h5_path, "r") as h5:
        # frame_wh attribute holds (width, height)
        width, height = h5.attrs["frame_wh"]

        # face_coords dataset: (num_frames, 4) int32 rows -> BoundingBox
        boxes = [BoundingBox(*row) for row in h5["face_coords"][:]]

        # face_masks dataset: variable-length uint8 arrays (JPEG bytes)
        masks_ds = h5["face_masks"]
        jpeg_masks = [bytes(masks_ds[i]) for i in range(len(masks_ds))]

    return VideoInferenceData(
        face_coords=boxes,
        face_masks=jpeg_masks,
        frame_wh=(int(width), int(height)),
    )
96
+
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # VideoData — matches VideoData class in video_data.cpp
100
+ # ---------------------------------------------------------------------------
101
class VideoData:
    """Manages frame data, avatar lip-sync, and compositing for one video.

    Supports three loading modes:

    * ``SYNC``      — decode, resize, and compress every frame in the
      constructor.
    * ``ASYNC``     — same work on a daemon thread; frame accessors block
      until loading finishes.
    * ``ON_DEMAND`` — only metadata is loaded up front; frames are decoded
      lazily on first access and cached compressed.

    NOTE(review): the ON_DEMAND cache is not lock-protected; concurrent
    callers may decode the same frame twice (wasteful but not corrupting) —
    confirm whether multi-threaded access is expected before adding a lock.
    """

    def __init__(
        self,
        video_path: str,
        video_data_path: str,
        avatar_data_path: str,
        lip_format: str,
        compression_type: CompressionType,
        loading_mode: LoadingMode,
        thread_count: int = 0,
    ) -> None:
        """Create the manager and kick off loading per *loading_mode*.

        Args:
            video_path: Source video file.
            video_data_path: HDF5 sidecar with face coords/masks; "" skips it.
            avatar_data_path: Avatar lip-sync video; "" disables lip data.
            lip_format: One of the :class:`LipFormat` constants.
            compression_type: How decoded frames are stored.
            loading_mode: SYNC, ASYNC, or ON_DEMAND.
            thread_count: Decoder thread hint forwarded to VideoReader.
        """
        self._video_path = video_path
        self._video_data_path = video_data_path
        self._avatar_data_path = avatar_data_path
        self._lip_format = lip_format
        self._compression_type = compression_type
        self._loading_mode = loading_mode
        self._thread_count = thread_count

        # Frame storage (compressed blobs, one per frame, SYNC/ASYNC modes)
        self._frames: List[bytes] = []
        self._data: Optional[VideoInferenceData] = None
        self._avatar_reader: Optional[VideoReader] = None

        # ON_DEMAND state. Declared unconditionally so _get_frame_data can
        # never hit an AttributeError (previously these were created inside
        # _load_data_on_demand only).
        self._on_demand_reader: Optional[VideoReader] = None
        self._on_demand_cache: Dict[int, bytes] = {}

        # Mask decoding cache (one-shot decode, matches ensureMasksDecoded)
        self._decoded_masks: Optional[List[np.ndarray]] = None
        self._masks_lock = threading.Lock()

        # Temp dir for TEMP_FILE compression
        self._temp_dir = ""
        if compression_type == CompressionType.TEMP_FILE:
            self._temp_dir = create_temp_dir()

        # Async loading state
        self._loading_thread: Optional[threading.Thread] = None
        self._loaded = threading.Event()

        # Load based on mode
        if loading_mode == LoadingMode.SYNC:
            self._load_data()
            self._loaded.set()
        elif loading_mode == LoadingMode.ASYNC:
            self._loading_thread = threading.Thread(
                target=self._load_data_async, daemon=True
            )
            self._loading_thread.start()
        elif loading_mode == LoadingMode.ON_DEMAND:
            self._load_data_on_demand()
            self._loaded.set()

    def _load_data(self) -> None:
        """Load all data synchronously. Matches video_data.cpp loadData.

        Raises:
            RuntimeError: If the video's frame count disagrees with the
                number of face coordinates in the sidecar data.
        """
        frame_width = -1
        frame_height = -1
        num_frames = -1

        if self._video_data_path:
            self._data = load_video_inference_data(self._video_data_path)
            frame_width, frame_height = self._data.frame_wh
            num_frames = len(self._data.face_coords)

        # Load and compress video frames
        self._frames = self._read_video_frames(frame_width, frame_height, num_frames)

        # Verify frame count against the sidecar metadata
        if (
            self._data
            and self._data.face_coords
            and len(self._frames) != len(self._data.face_coords)
        ):
            raise RuntimeError(
                f"Video contains {len(self._frames)} frames, but video data "
                f"contains {len(self._data.face_coords)} face coordinates"
            )

        # Initialize avatar reader for lip-sync data
        if self._avatar_data_path:
            self._avatar_reader = VideoReader(
                self._avatar_data_path,
                ENCRYPT_KEY.decode("utf-8"),
                self._thread_count,
            )

    def _load_data_async(self) -> None:
        """Load data in background thread. Matches video_data.cpp loadDataAsync.

        The event is set even on failure so waiting accessors don't block
        forever (they will then surface the missing-data error instead).
        """
        try:
            self._load_data()
        finally:
            self._loaded.set()

    def _load_data_on_demand(self) -> None:
        """Initialize metadata only; frames loaded lazily.

        Matches video_data.cpp loadDataOnDemand.
        """
        if self._video_data_path:
            self._data = load_video_inference_data(self._video_data_path)

        # Initialize avatar reader
        if self._avatar_data_path:
            self._avatar_reader = VideoReader(
                self._avatar_data_path,
                ENCRYPT_KEY.decode("utf-8"),
                self._thread_count,
            )

    def _read_video_frames(
        self, frame_width: int, frame_height: int, num_frames: int
    ) -> List[bytes]:
        """Read video frames, resize if needed, and compress.

        Uses streaming decode: each frame is decoded, compressed, and then
        the raw frame is discarded. Peak memory is ~1 raw frame + compressed
        frames (instead of all raw frames + compressed frames).

        Matches image.cpp readVideoFrames (lines 85-112).

        Args:
            frame_width: Target width, or -1 to keep the source size.
            frame_height: Target height, or -1 to keep the source size.
            num_frames: Maximum frames to read, or <= 0 for all.

        Returns:
            Compressed frame blobs in decode order.
        """
        max_frames = num_frames if num_frames > 0 else -1
        frames: List[bytes] = []
        for frame in iter_video_frames(self._video_path, "", 0, max_frames):
            if frame.size == 0:
                continue
            if (
                frame_width != -1
                and frame_height != -1
                and (frame.shape[1] != frame_width or frame.shape[0] != frame_height)
            ):
                frame = resize_image(frame, frame_width, frame_height)
            frames.append(
                encode_image(frame, self._compression_type, temp_dir=self._temp_dir)
            )
        return frames

    def _get_frame_data(self, frame_idx: int) -> bytes:
        """Get compressed frame data by index.

        Matches video_data.cpp getFrameData (lines 201-215).

        Blocks until ASYNC loading finishes; lazily decodes-and-caches in
        ON_DEMAND mode.

        Raises:
            RuntimeError: If *frame_idx* is outside the loaded frame range.
        """
        if self._loading_mode in (LoadingMode.ASYNC, LoadingMode.ON_DEMAND):
            # Wait for async loading to complete (returns at once if done)
            self._loaded.wait()

        if self._loading_mode == LoadingMode.ON_DEMAND and frame_idx not in self._on_demand_cache:
            # Lazy load this frame
            if self._on_demand_reader is None:
                self._on_demand_reader = VideoReader(self._video_path, "", 1)
            frame = self._on_demand_reader.get(frame_idx)
            if (
                self._data
                and self._data.frame_wh[0] > 0
                and (
                    frame.shape[1] != self._data.frame_wh[0]
                    or frame.shape[0] != self._data.frame_wh[1]
                )
            ):
                frame = resize_image(frame, self._data.frame_wh[0], self._data.frame_wh[1])
            compressed = encode_image(
                frame, self._compression_type, temp_dir=self._temp_dir
            )
            self._on_demand_cache[frame_idx] = compressed
            return compressed

        if self._loading_mode == LoadingMode.ON_DEMAND:
            return self._on_demand_cache[frame_idx]

        if frame_idx < 0 or frame_idx >= len(self._frames):
            raise RuntimeError(f"Frame index out of range: {frame_idx}")
        return self._frames[frame_idx]

    def get_original_frame(self, frame_idx: int) -> np.ndarray:
        """Decode compressed frame to BGR image.

        Matches video_data.cpp getOriginalFrame (lines 245-248).
        """
        data = self._get_frame_data(frame_idx)
        return decode_image(data, self._compression_type)

    def get_avatar_frame(
        self, frame_idx: int, cluster_idx: int, num_clusters: int
    ) -> np.ndarray:
        """Get lip overlay frame using TIME_FIRST or FEATURE_FIRST indexing.

        Matches video_data.cpp getAvatarFrame (lines 250-265).

        Raises:
            RuntimeError: If no avatar data path was configured.
        """
        if not self._avatar_reader:
            raise RuntimeError("Avatar reader not initialized")

        if self._lip_format == LipFormat.TIME_FIRST:
            # Frames grouped by cluster: all of cluster 0, then cluster 1, ...
            index = cluster_idx * self._avatar_reader.size() // num_clusters + frame_idx
        else:
            # FEATURE_FIRST: frames interleaved per time step
            index = frame_idx * num_clusters + cluster_idx

        return self._avatar_reader.get(index)

    def _ensure_masks_decoded(self) -> None:
        """Decode all JPEG face masks on first access, then cache.

        Matches video_data.cpp ensureMasksDecoded (lines 267-279).
        Uses call_once pattern via double-checked threading lock.
        """
        if self._decoded_masks is not None:
            return
        with self._masks_lock:
            if self._decoded_masks is not None:
                return
            if not self._data or not self._data.face_masks:
                self._decoded_masks = []
                return
            masks = []
            for mask_bytes in self._data.face_masks:
                if mask_bytes:
                    masks.append(decode_jpeg(mask_bytes))
                else:
                    # Placeholder for frames without a mask
                    masks.append(np.empty((0, 0, 3), dtype=np.uint8))
            self._decoded_masks = masks
            # Release compressed data (matches C++ clear+shrink_to_fit)
            self._data.face_masks = []

    def get_blended_frame(
        self, frame_idx: int, cluster_idx: int, num_clusters: int
    ) -> np.ndarray:
        """Full compositing pipeline: original + resize + lip + mask blend.

        Matches video_data.cpp getBlendedFrame (lines 281-316):
        1. Get original frame, resize to frame_wh if needed
        2. Get avatar lip frame
        3. Get face bounding box
        4. Decode face mask (cached)
        5. Alpha blend lip into face region

        Raises:
            RuntimeError: If no video data (face coords/masks) is loaded, or
                the decoded mask's size does not match the face region.
        """
        # Blending requires face coords and masks; fail with a clear error
        # instead of an AttributeError on self._data below.
        if self._data is None:
            raise RuntimeError("Video inference data not loaded; cannot blend")

        # 1. Get original frame, resize if needed
        frame = self.get_original_frame(frame_idx)
        if (
            frame.shape[1] != self._data.frame_wh[0]
            or frame.shape[0] != self._data.frame_wh[1]
        ):
            frame = resize_image(frame, self._data.frame_wh[0], self._data.frame_wh[1])

        # Make frame writable (in case it came from a read-only buffer)
        if not frame.flags.writeable:
            frame = frame.copy()

        # 2. Get lip frame
        lip = self.get_avatar_frame(frame_idx, cluster_idx, num_clusters)

        # 3. Get face bounding box
        box = self._data.face_coords[frame_idx]
        face_width = box.x2 - box.x1
        face_height = box.y2 - box.y1

        # 4. Decode face mask (cached after first call)
        self._ensure_masks_decoded()
        mask = self._decoded_masks[frame_idx]
        if mask.shape[1] != face_width or mask.shape[0] != face_height:
            raise RuntimeError(
                f"Mask size does not match face region size: "
                f"{mask.shape[1]} != {face_width} or "
                f"{mask.shape[0]} != {face_height}"
            )

        # 5. Blend lip into face region (in place)
        blend_face_region(frame, box.x1, box.y1, face_width, face_height, lip, mask)

        return frame

    def has_lip_data(self) -> bool:
        """Return True when an avatar lip-sync reader is available."""
        return self._avatar_reader is not None

    @property
    def num_frames(self) -> int:
        """Number of frames, from loaded frames or ON_DEMAND metadata."""
        if self._frames:
            return len(self._frames)
        if self._loading_mode == LoadingMode.ON_DEMAND and self._data:
            return len(self._data.face_coords)
        return 0

    def __del__(self) -> None:
        # getattr guards: __del__ runs even when __init__ raised before
        # these attributes were assigned.
        reader = getattr(self, "_avatar_reader", None)
        if reader:
            reader.close()
        temp_dir = getattr(self, "_temp_dir", "")
        if temp_dir:
            cleanup_temp_dir(temp_dir)
@@ -0,0 +1,168 @@
1
+ """Video decoding with PyAV — replaces video_decode.cpp.
2
+
3
+ Handles both regular video files and XOR-encrypted .bhtensor files.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import io
9
+ from typing import Iterator, List, Optional
10
+
11
+ import numpy as np
12
+
13
ENCRYPT_KEY = b"bithuman_video_data_key"  # 23 bytes — matches C++ video_data_key
14
+
15
+
16
def is_encrypted_file(path: str) -> bool:
    """Return True when *path* names an XOR-encrypted ``.bhtensor`` file.

    Matches video_decode.cpp isEncryptedFile. Detection is purely by
    filename extension; the file contents are not inspected.
    """
    suffix = ".bhtensor"
    return path[-len(suffix):] == suffix
+
23
+
24
def xor_decrypt_bytes(data: bytes, key: bytes = ENCRYPT_KEY) -> bytes:
    """XOR *data* with a repeating *key*, returning a new bytes object.

    Matches video_decode.cpp decryptDataInPlace: byte-wise XOR with repeating
    key. Implemented with numpy's vectorized XOR (much faster than a Python
    byte loop). Since XOR is its own inverse, the same call encrypts and
    decrypts.
    """
    payload = np.frombuffer(data, dtype=np.uint8)
    # np.resize repeats the key cyclically to exactly the payload length.
    repeated_key = np.resize(np.frombuffer(key, dtype=np.uint8), payload.size)
    return (payload ^ repeated_key).tobytes()
+ return data_arr.tobytes()
38
+
39
+
40
def read_and_decrypt_file(path: str, key: bytes = ENCRYPT_KEY) -> bytes:
    """Read the whole file at *path* and return its XOR-decrypted contents.

    Matches video_decode.cpp readAndDecryptFile.
    """
    with open(path, "rb") as fh:
        encrypted = fh.read()
    return xor_decrypt_bytes(encrypted, key)
+ return xor_decrypt_bytes(data, key)
48
+
49
+
50
class VideoReader:
    """PyAV-based video reader replacing the FFmpeg C wrapper.

    Decodes all frames into memory for random access. Avatar data videos
    are typically small, so this is acceptable and matches SYNC loading.

    For encrypted .bhtensor files, the file is fully decrypted into memory
    then passed to PyAV via BytesIO.
    """

    def __init__(
        self,
        video_path: str,
        key: str = "",
        thread_count: int = 0,
    ) -> None:
        """Open *video_path* and eagerly decode every frame.

        Args:
            video_path: Video file (.mp4 or encrypted .bhtensor).
            key: Decryption key for .bhtensor files; "" uses ENCRYPT_KEY.
            thread_count: Decoder threads; 0 leaves PyAV's default.
        """
        import av

        self._path = video_path
        self._is_encrypted = is_encrypted_file(video_path)
        # Set before av.open so close() is safe even if opening fails.
        self._container = None

        # Open container
        if self._is_encrypted:
            decrypt_key = key.encode("utf-8") if key else ENCRYPT_KEY
            decrypted = read_and_decrypt_file(video_path, decrypt_key)
            self._container = av.open(io.BytesIO(decrypted))
        else:
            self._container = av.open(video_path)

        stream = self._container.streams.video[0]
        stream.thread_type = "AUTO"
        if thread_count > 0:
            # Honor the requested decoder thread count (previously this
            # parameter was accepted but silently ignored).
            stream.codec_context.thread_count = thread_count

        self._width: int = stream.codec_context.width
        self._height: int = stream.codec_context.height

        # Decode all frames into memory for random access
        self._frames: List[np.ndarray] = []
        self._container.seek(0)
        for frame in self._container.decode(video=0):
            bgr = frame.to_ndarray(format="bgr24")
            self._frames.append(bgr)

        self._frame_count = len(self._frames)

    def get(self, index: int) -> np.ndarray:
        """Get a decoded frame by index (returns a copy).

        Negative indices count from the end, Python-style.
        Matches VideoReader::get in video_decode.cpp.

        Raises:
            IndexError: If *index* is out of range after normalization.
        """
        if index < 0:
            index = self._frame_count + index
        if index < 0 or index >= self._frame_count:
            raise IndexError(
                f"Frame index {index} out of range [0, {self._frame_count})"
            )
        return self._frames[index].copy()

    def size(self) -> int:
        """Return total number of decoded frames."""
        return self._frame_count

    @property
    def width(self) -> int:
        """Frame width in pixels, from the codec context."""
        return self._width

    @property
    def height(self) -> int:
        """Frame height in pixels, from the codec context."""
        return self._height

    def close(self) -> None:
        """Close the underlying container. Safe to call more than once."""
        container = getattr(self, "_container", None)
        if container:
            container.close()
            self._container = None
123
+
124
+
125
def iter_video_frames(
    video_path: str,
    key: str = "",
    thread_count: int = 0,
    max_frames: int = -1,
) -> Iterator[np.ndarray]:
    """Yield decoded BGR24 frames one at a time from a video file.

    Unlike VideoReader (which decodes all frames into memory), this function
    yields frames sequentially and discards each one after yielding. Peak
    memory is ~1 frame instead of all frames.

    Args:
        video_path: Path to video file (.mp4 or .bhtensor).
        key: Decryption key for .bhtensor files. Empty string uses default.
        thread_count: Unused (kept for API symmetry with VideoReader).
        max_frames: Maximum number of frames to yield. -1 means all frames.

    Yields:
        BGR24 numpy arrays of shape (height, width, 3).
    """
    import av

    if is_encrypted_file(video_path):
        secret = key.encode("utf-8") if key else ENCRYPT_KEY
        plaintext = read_and_decrypt_file(video_path, secret)
        container = av.open(io.BytesIO(plaintext))
    else:
        container = av.open(video_path)

    try:
        video_stream = container.streams.video[0]
        video_stream.thread_type = "AUTO"
        container.seek(0)
        emitted = 0
        for decoded in container.decode(video=0):
            # A non-negative limit stops the stream once reached; with
            # max_frames == 0 nothing is yielded at all.
            if max_frames >= 0 and emitted >= max_frames:
                break
            yield decoded.to_ndarray(format="bgr24")
            emitted += 1
    finally:
        container.close()
@@ -0,0 +1 @@
1
+