livekit-plugins-humanlike 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit_plugins_humanlike-0.1.0/PKG-INFO +110 -0
- livekit_plugins_humanlike-0.1.0/README.md +83 -0
- livekit_plugins_humanlike-0.1.0/livekit/__init__.py +0 -0
- livekit_plugins_humanlike-0.1.0/livekit/plugins/__init__.py +0 -0
- livekit_plugins_humanlike-0.1.0/livekit/plugins/humanlike/__init__.py +22 -0
- livekit_plugins_humanlike-0.1.0/livekit/plugins/humanlike/avatar.py +399 -0
- livekit_plugins_humanlike-0.1.0/livekit_plugins_humanlike.egg-info/PKG-INFO +110 -0
- livekit_plugins_humanlike-0.1.0/livekit_plugins_humanlike.egg-info/SOURCES.txt +11 -0
- livekit_plugins_humanlike-0.1.0/livekit_plugins_humanlike.egg-info/dependency_links.txt +1 -0
- livekit_plugins_humanlike-0.1.0/livekit_plugins_humanlike.egg-info/requires.txt +8 -0
- livekit_plugins_humanlike-0.1.0/livekit_plugins_humanlike.egg-info/top_level.txt +1 -0
- livekit_plugins_humanlike-0.1.0/pyproject.toml +41 -0
- livekit_plugins_humanlike-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: livekit-plugins-humanlike
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Humanlike avatar plugin for LiveKit Agents — real-time talking-head video with expression control
|
|
5
|
+
Author: Humanlike AI
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/HumanlikeAI/livekit-plugins-humanlike
|
|
8
|
+
Project-URL: Repository, https://github.com/HumanlikeAI/livekit-plugins-humanlike
|
|
9
|
+
Keywords: livekit,avatar,talking-head,real-time,humanlike
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Multimedia :: Video
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: livekit-agents>=1.0
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: websockets
|
|
23
|
+
Requires-Dist: Pillow
|
|
24
|
+
Provides-Extra: fast
|
|
25
|
+
Requires-Dist: turbojpeg; extra == "fast"
|
|
26
|
+
Requires-Dist: soxr; extra == "fast"
|
|
27
|
+
|
|
28
|
+
# livekit-plugins-humanlike
|
|
29
|
+
|
|
30
|
+
Real-time talking-head avatar plugin for [LiveKit Agents](https://docs.livekit.io/agents/).
|
|
31
|
+
|
|
32
|
+
Streams TTS audio to a GPU-backed avatar orchestrator that returns lip-synced video frames with facial expressions guided by a natural-language prompt.
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install livekit-plugins-humanlike
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick start
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from livekit.plugins.humanlike import AvatarSession
|
|
44
|
+
|
|
45
|
+
avatar = AvatarSession(
|
|
46
|
+
orchestrator_url="ws://your-gpu-server:8000/ws/stream",
|
|
47
|
+
image="/path/to/face.png",
|
|
48
|
+
avatar_model="humanlike-homo",
|
|
49
|
+
prompt="warm, friendly, subtly smiling, occasional nods",
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Wire into your LiveKit agent
|
|
53
|
+
await avatar.start(agent_session, room=ctx.room)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Parameters
|
|
57
|
+
|
|
58
|
+
| Parameter | Default | Description |
|
|
59
|
+
|-----------|---------|-------------|
|
|
60
|
+
| `orchestrator_url` | `ws://localhost:8000/ws/stream` | Avatar server WebSocket URL |
|
|
61
|
+
| `image` | `./face.png` | Face image (file path or raw bytes) |
|
|
62
|
+
| `avatar_model` | `humanlike-homo` | Model identifier |
|
|
63
|
+
| `prompt` | `""` | Expression prompt (e.g. "warm, smiling, nods") |
|
|
64
|
+
| `seed` | `42` | Random seed for reproducibility |
|
|
65
|
+
| `video_width` | `512` | Output video width |
|
|
66
|
+
| `video_height` | `512` | Output video height |
|
|
67
|
+
| `tts_sample_rate` | `16000` | Must match your TTS output rate |
|
|
68
|
+
|
|
69
|
+
## Live expression updates
|
|
70
|
+
|
|
71
|
+
Update the avatar's expression mid-conversation:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
await avatar.set_prompt("excited, wide eyes, big smile")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Full agent example
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from livekit.agents import Agent, AgentSession, JobContext, RoomOutputOptions, WorkerOptions, cli
|
|
81
|
+
from livekit.plugins import cartesia, openai, silero
|
|
82
|
+
from livekit.plugins.humanlike import AvatarSession
|
|
83
|
+
|
|
84
|
+
async def entrypoint(ctx: JobContext):
|
|
85
|
+
await ctx.connect()
|
|
86
|
+
|
|
87
|
+
session = AgentSession(
|
|
88
|
+
stt=openai.STT(),
|
|
89
|
+
llm=openai.LLM(model="gpt-4o"),
|
|
90
|
+
tts=cartesia.TTS(model="sonic-3", sample_rate=16000),
|
|
91
|
+
vad=silero.VAD.load(),
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
avatar = AvatarSession(
|
|
95
|
+
orchestrator_url="ws://localhost:8000/ws/stream",
|
|
96
|
+
image="./face.png",
|
|
97
|
+
avatar_model="humanlike-homo",
|
|
98
|
+
prompt="warm, friendly, natural eye movement",
|
|
99
|
+
)
|
|
100
|
+
await avatar.start(session, room=ctx.room)
|
|
101
|
+
|
|
102
|
+
await session.start(
|
|
103
|
+
room=ctx.room,
|
|
104
|
+
agent=Agent(instructions="You are a helpful assistant."),
|
|
105
|
+
room_output_options=RoomOutputOptions(audio_enabled=False),
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
if __name__ == "__main__":
|
|
109
|
+
cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
|
|
110
|
+
```
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# livekit-plugins-humanlike
|
|
2
|
+
|
|
3
|
+
Real-time talking-head avatar plugin for [LiveKit Agents](https://docs.livekit.io/agents/).
|
|
4
|
+
|
|
5
|
+
Streams TTS audio to a GPU-backed avatar orchestrator that returns lip-synced video frames with facial expressions guided by a natural-language prompt.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install livekit-plugins-humanlike
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Quick start
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from livekit.plugins.humanlike import AvatarSession
|
|
17
|
+
|
|
18
|
+
avatar = AvatarSession(
|
|
19
|
+
orchestrator_url="ws://your-gpu-server:8000/ws/stream",
|
|
20
|
+
image="/path/to/face.png",
|
|
21
|
+
avatar_model="humanlike-homo",
|
|
22
|
+
prompt="warm, friendly, subtly smiling, occasional nods",
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Wire into your LiveKit agent
|
|
26
|
+
await avatar.start(agent_session, room=ctx.room)
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Parameters
|
|
30
|
+
|
|
31
|
+
| Parameter | Default | Description |
|
|
32
|
+
|-----------|---------|-------------|
|
|
33
|
+
| `orchestrator_url` | `ws://localhost:8000/ws/stream` | Avatar server WebSocket URL |
|
|
34
|
+
| `image` | `./face.png` | Face image (file path or raw bytes) |
|
|
35
|
+
| `avatar_model` | `humanlike-homo` | Model identifier |
|
|
36
|
+
| `prompt` | `""` | Expression prompt (e.g. "warm, smiling, nods") |
|
|
37
|
+
| `seed` | `42` | Random seed for reproducibility |
|
|
38
|
+
| `video_width` | `512` | Output video width |
|
|
39
|
+
| `video_height` | `512` | Output video height |
|
|
40
|
+
| `tts_sample_rate` | `16000` | Must match your TTS output rate |
|
|
41
|
+
|
|
42
|
+
## Live expression updates
|
|
43
|
+
|
|
44
|
+
Update the avatar's expression mid-conversation:
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
await avatar.set_prompt("excited, wide eyes, big smile")
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Full agent example
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from livekit.agents import Agent, AgentSession, JobContext, RoomOutputOptions, WorkerOptions, cli
|
|
54
|
+
from livekit.plugins import cartesia, openai, silero
|
|
55
|
+
from livekit.plugins.humanlike import AvatarSession
|
|
56
|
+
|
|
57
|
+
async def entrypoint(ctx: JobContext):
|
|
58
|
+
await ctx.connect()
|
|
59
|
+
|
|
60
|
+
session = AgentSession(
|
|
61
|
+
stt=openai.STT(),
|
|
62
|
+
llm=openai.LLM(model="gpt-4o"),
|
|
63
|
+
tts=cartesia.TTS(model="sonic-3", sample_rate=16000),
|
|
64
|
+
vad=silero.VAD.load(),
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
avatar = AvatarSession(
|
|
68
|
+
orchestrator_url="ws://localhost:8000/ws/stream",
|
|
69
|
+
image="./face.png",
|
|
70
|
+
avatar_model="humanlike-homo",
|
|
71
|
+
prompt="warm, friendly, natural eye movement",
|
|
72
|
+
)
|
|
73
|
+
await avatar.start(session, room=ctx.room)
|
|
74
|
+
|
|
75
|
+
await session.start(
|
|
76
|
+
room=ctx.room,
|
|
77
|
+
agent=Agent(instructions="You are a helpful assistant."),
|
|
78
|
+
room_output_options=RoomOutputOptions(audio_enabled=False),
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
|
|
83
|
+
```
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LiveKit Agents plugin — Humanlike Avatar.
|
|
3
|
+
|
|
4
|
+
Provides a real-time talking-head avatar with expression-aware generation.
|
|
5
|
+
Streams TTS audio to a GPU-backed avatar orchestrator that returns lip-synced
|
|
6
|
+
video frames with facial expressions guided by a natural-language prompt.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from livekit.plugins.humanlike import AvatarSession
|
|
10
|
+
|
|
11
|
+
avatar = AvatarSession(
|
|
12
|
+
orchestrator_url="ws://localhost:8000/ws/stream",
|
|
13
|
+
image="/path/to/face.png", # or raw bytes
|
|
14
|
+
avatar_model="humanlike-homo",
|
|
15
|
+
prompt="warm, friendly, subtly smiling, occasional nods",
|
|
16
|
+
)
|
|
17
|
+
await avatar.start(agent_session, room=ctx.room)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from .avatar import AvatarSession, VideoGenerator
|
|
21
|
+
|
|
22
|
+
__all__ = ["AvatarSession", "VideoGenerator"]
|
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Humanlike avatar — LiveKit Agents VideoGenerator + AvatarSession.
|
|
3
|
+
|
|
4
|
+
Connects to the avatar orchestrator WebSocket and streams audio in / video
|
|
5
|
+
frames out, with an expression prompt that guides facial behaviour.
|
|
6
|
+
|
|
7
|
+
Orchestrator WS protocol (extended):
|
|
8
|
+
1. Client sends JSON config:
|
|
9
|
+
{
|
|
10
|
+
"model": "humanlike-homo",
|
|
11
|
+
"image": "<base64>",
|
|
12
|
+
"seed": 42,
|
|
13
|
+
"prompt": "warm, friendly, subtly smiling"
|
|
14
|
+
}
|
|
15
|
+
2. Server responds: {"status": "ready", "fps": 25, "chunk_samples": N, ...}
|
|
16
|
+
3. Client streams binary PCM int16 LE 16 kHz mono chunks
|
|
17
|
+
4. Server streams binary: [4-byte frame_idx LE uint32][JPEG bytes]
|
|
18
|
+
5. Client sends empty bytes to end the stream
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import asyncio
|
|
24
|
+
import base64
|
|
25
|
+
import json
|
|
26
|
+
import logging
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
import numpy as np
|
|
30
|
+
import websockets
|
|
31
|
+
|
|
32
|
+
from livekit import rtc
|
|
33
|
+
from livekit.agents.voice import avatar as av
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger("humanlike-avatar")

# Plugin registration name.
PLUGIN_NAME = "humanlike"
# Default LiveKit participant identity under which the avatar joins the room.
AVATAR_IDENTITY = "humanlike-avatar"
# Orchestrator expects 16 kHz mono PCM; all incoming audio is resampled to this.
TARGET_SR = 16000
40
|
+
|
|
41
|
+
|
|
42
|
+
def _resample(samples: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
|
|
43
|
+
if src_sr == dst_sr:
|
|
44
|
+
return samples
|
|
45
|
+
try:
|
|
46
|
+
import soxr
|
|
47
|
+
return soxr.resample(samples, src_sr, dst_sr).astype(np.float32)
|
|
48
|
+
except ImportError:
|
|
49
|
+
# Fallback: linear interpolation (lower quality but no extra dep)
|
|
50
|
+
ratio = dst_sr / src_sr
|
|
51
|
+
n_out = int(len(samples) * ratio)
|
|
52
|
+
indices = np.linspace(0, len(samples) - 1, n_out)
|
|
53
|
+
return np.interp(indices, np.arange(len(samples)), samples).astype(np.float32)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _jpeg_to_video_frame(jpeg_bytes: bytes) -> rtc.VideoFrame:
    """Decode a JPEG image into an RGBA :class:`rtc.VideoFrame`.

    Prefers ``turbojpeg`` (C-accelerated) when installed, falling back to
    Pillow otherwise.

    Fix: the original constructed a fresh ``TurboJPEG()`` decoder on every
    call. This function runs once per video frame (~25 fps), and TurboJPEG
    instantiation loads the native library handle each time — the decoder is
    now created once and cached on the function object for reuse.
    """
    try:
        from turbojpeg import TurboJPEG, TJPF_RGBA

        # Reuse a single decoder instance across calls (hot path, per-frame).
        tj = getattr(_jpeg_to_video_frame, "_tj", None)
        if tj is None:
            tj = TurboJPEG()
            _jpeg_to_video_frame._tj = tj
        rgba = tj.decode(jpeg_bytes, pixel_format=TJPF_RGBA)
    except ImportError:
        import io

        from PIL import Image

        img = Image.open(io.BytesIO(jpeg_bytes)).convert("RGBA")
        rgba = np.array(img)

    h, w = rgba.shape[:2]
    return rtc.VideoFrame(
        width=w,
        height=h,
        type=rtc.VideoBufferType.RGBA,
        data=bytes(rgba),
    )
|
|
76
|
+
|
|
77
|
+
class VideoGenerator(av.VideoGenerator):
    """
    LiveKit VideoGenerator that streams audio to the Humanlike avatar
    orchestrator and yields video frames.

    Lifecycle: call :meth:`connect` once (handshake + background tasks),
    then LiveKit's AvatarRunner drives :meth:`push_audio` and consumes
    frames by async-iterating this object. Until the first audio chunk is
    sent, an "idle loop" repeats the reference image at the orchestrator's
    frame rate so the avatar is visible before speech starts.

    Parameters
    ----------
    ws_url : str
        WebSocket URL of the avatar orchestrator.
    image : bytes
        Reference face image (PNG/JPEG bytes).
    avatar_model : str
        Avatar model identifier, e.g. "humanlike-homo".
    prompt : str
        Natural-language expression prompt that guides facial behaviour
        during generation, e.g. "expressive, warm smile, occasional nods".
    seed : int
        Random seed for reproducible generation.
    """

    def __init__(
        self,
        ws_url: str,
        image: bytes,
        avatar_model: str = "humanlike-homo",
        prompt: str = "",
        seed: int = 42,
    ) -> None:
        self._ws_url = ws_url
        # Image is kept both base64-encoded (for the JSON handshake) and
        # raw (to build the idle frame locally).
        self._image_b64 = base64.b64encode(image).decode()
        self._image_bytes = image
        self._avatar_model = avatar_model
        self._prompt = prompt
        self._seed = seed

        self._ws: websockets.WebSocketClientProtocol | None = None
        # fps / chunk_samples are defaults until overwritten by the
        # orchestrator's "ready" handshake message in connect().
        self._fps: float = 25.0
        self._chunk_samples: int = 0

        # Accumulates resampled float32 PCM until a full chunk is available.
        self._audio_buf = np.array([], dtype=np.float32)
        # Output queue consumed by __aiter__: VideoFrames and
        # AudioSegmentEnd markers, in order.
        self._out_q: asyncio.Queue = asyncio.Queue()
        self._recv_task: asyncio.Task | None = None
        self._idle_task: asyncio.Task | None = None
        self._idle_frame: rtc.VideoFrame | None = None
        # Flips to True on the first chunk sent; stops the idle loop.
        self._generating = False

    # ------------------------------------------------------------------
    # Setup
    # ------------------------------------------------------------------

    async def connect(self) -> None:
        """Open WS, perform handshake, start background recv loop.

        Sends the JSON config, blocks until the orchestrator replies with
        ``{"status": "ready", ...}`` (raising RuntimeError on an "error"
        reply), then starts the frame-receive and idle-frame tasks.
        """
        self._ws = await websockets.connect(self._ws_url, max_size=10 * 1024 * 1024)

        config = {
            "model": self._avatar_model,
            "image": self._image_b64,
            "seed": self._seed,
            "prompt": self._prompt,
        }
        await self._ws.send(json.dumps(config))
        logger.info(
            "Sent config: model=%s prompt=%r seed=%d",
            self._avatar_model, self._prompt[:60], self._seed,
        )

        # Wait for ready — non-text messages are ignored until the JSON
        # "ready" (or "error") handshake message arrives.
        while True:
            raw = await self._ws.recv()
            if isinstance(raw, str):
                msg = json.loads(raw)
                if msg.get("status") == "ready":
                    self._fps = float(msg.get("fps", 25))
                    self._chunk_samples = int(msg.get("chunk_samples", 0))
                    logger.info(
                        "Orchestrator ready: model=%s fps=%.0f chunk_samples=%d",
                        self._avatar_model, self._fps, self._chunk_samples,
                    )
                    break
                if "error" in msg:
                    raise RuntimeError(f"Orchestrator error: {msg['error']}")

        self._recv_task = asyncio.ensure_future(self._recv_loop())

        # Build idle frame from reference image
        self._idle_frame = _jpeg_to_video_frame(self._image_bytes)
        self._idle_task = asyncio.ensure_future(self._idle_loop())

    async def _idle_loop(self) -> None:
        """Push the reference image at fps until the first real chunk arrives."""
        interval = 1.0 / self._fps
        while not self._generating and self._idle_frame is not None:
            await self._out_q.put(self._idle_frame)
            await asyncio.sleep(interval)

    async def _recv_loop(self) -> None:
        """Background: read JPEG frames from WS, enqueue as VideoFrames."""
        try:
            assert self._ws is not None
            async for message in self._ws:
                # Server frames are binary: [4-byte frame_idx][JPEG]; skip
                # anything else (e.g. stray text messages).
                if not isinstance(message, bytes) or len(message) < 4:
                    continue
                jpeg_data = message[4:]  # strip 4-byte frame_idx header
                try:
                    frame = _jpeg_to_video_frame(jpeg_data)
                    await self._out_q.put(frame)
                except Exception as exc:
                    # A single bad frame should not kill the stream.
                    logger.warning("Failed to decode frame: %s", exc)
        except websockets.ConnectionClosed:
            logger.info("Orchestrator WS closed")
        except Exception as exc:
            logger.exception("Recv loop error: %s", exc)

    # ------------------------------------------------------------------
    # VideoGenerator protocol
    # ------------------------------------------------------------------

    async def push_audio(self, frame: rtc.AudioFrame | av.AudioSegmentEnd) -> None:
        """Accept TTS audio from the runner; forward full chunks to the WS.

        Converts int16 PCM to mono float32, resamples to TARGET_SR, and
        sends fixed-size chunks (size dictated by the orchestrator's
        handshake). An AudioSegmentEnd flushes the remainder and is
        propagated downstream through the output queue.
        """
        if isinstance(frame, av.AudioSegmentEnd):
            await self._flush_audio()
            await self._out_q.put(av.AudioSegmentEnd())
            return

        pcm_f32 = np.frombuffer(frame.data, dtype=np.int16).astype(np.float32) / 32768.0
        if frame.num_channels > 1:
            # Downmix interleaved multi-channel audio to mono by averaging.
            pcm_f32 = pcm_f32.reshape(-1, frame.num_channels).mean(axis=1)

        pcm_f32 = _resample(pcm_f32, frame.sample_rate, TARGET_SR)
        self._audio_buf = np.concatenate([self._audio_buf, pcm_f32])

        # NOTE: if the orchestrator reported chunk_samples == 0, audio only
        # accumulates here and is never sent — presumably the handshake
        # always supplies a positive value; confirm against the server.
        while self._chunk_samples > 0 and len(self._audio_buf) >= self._chunk_samples:
            chunk = self._audio_buf[: self._chunk_samples]
            self._audio_buf = self._audio_buf[self._chunk_samples :]
            await self._send_chunk(chunk)

    def clear_buffer(self) -> None:
        """Drop buffered audio and any queued (not-yet-consumed) frames.

        Called by the runner on interruption; pending items are discarded
        so stale video does not play after the user barges in.
        """
        self._audio_buf = np.array([], dtype=np.float32)
        while not self._out_q.empty():
            try:
                self._out_q.get_nowait()
            except asyncio.QueueEmpty:
                break

    def __aiter__(self):
        # The runner consumes frames by async-iterating this object.
        return self._output_stream()

    async def _output_stream(self):
        # Infinite stream: yields VideoFrames and AudioSegmentEnd markers
        # as they arrive; terminated externally via task cancellation.
        while True:
            item = await self._out_q.get()
            yield item

    # ------------------------------------------------------------------
    # Prompt update (live)
    # ------------------------------------------------------------------

    async def set_prompt(self, prompt: str) -> None:
        """Update the expression prompt mid-session (if orchestrator supports it)."""
        self._prompt = prompt
        if self._ws is not None:
            try:
                await self._ws.send(json.dumps({"update_prompt": prompt}))
                logger.info("Expression prompt updated: %r", prompt[:60])
            except Exception as exc:
                # Best-effort: an unsupported/failed update must not break
                # the running session.
                logger.warning("Failed to send prompt update: %s", exc)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    async def _flush_audio(self) -> None:
        """Send any partial chunk (zero-padded) and the end-of-stream marker."""
        if self._chunk_samples > 0 and len(self._audio_buf) > 0:
            chunk = self._audio_buf
            self._audio_buf = np.array([], dtype=np.float32)
            if len(chunk) < self._chunk_samples:
                # Pad the tail with silence so the server gets a full chunk.
                chunk = np.pad(chunk, (0, self._chunk_samples - len(chunk)))
            await self._send_chunk(chunk)
        if self._ws is not None:
            try:
                # Protocol: empty bytes signal end of the audio stream.
                await self._ws.send(b"")
            except Exception:
                pass

    async def _send_chunk(self, chunk: np.ndarray) -> None:
        """Send one float32 chunk as int16 PCM; stop the idle loop on first send."""
        if self._ws is None:
            return
        if not self._generating:
            self._generating = True
            if self._idle_task is not None:
                self._idle_task.cancel()
                self._idle_task = None
        # Clip to [-1, 1] before int16 conversion to avoid wraparound.
        pcm_int16 = (np.clip(chunk, -1.0, 1.0) * 32767).astype(np.int16)
        await self._ws.send(pcm_int16.tobytes())

    async def aclose(self) -> None:
        """Cancel background tasks and close the WebSocket."""
        if self._idle_task is not None:
            self._idle_task.cancel()
        if self._recv_task is not None:
            self._recv_task.cancel()
        if self._ws is not None:
            await self._ws.close()

    @property
    def fps(self) -> float:
        # Frame rate as reported by the orchestrator handshake (default 25).
        return self._fps
281
|
+
|
|
282
|
+
|
|
283
|
+
class AvatarSession:
    """
    High-level Humanlike avatar session for LiveKit Agents.

    Wires up the VideoGenerator with LiveKit's AvatarRunner, DataStream
    audio routing, and video publishing.

    Parameters
    ----------
    orchestrator_url : str
        WebSocket URL, e.g. "ws://localhost:8000/ws/stream".
    image : str | bytes
        Path to a face image, or raw image bytes.
    avatar_model : str
        Model identifier. Default "humanlike-homo".
    prompt : str
        Expression guidance prompt, e.g. "warm, friendly, subtly smiling".
        Can be updated live via set_prompt().
    seed : int
        Random seed.
    video_width / video_height : int
        Output dimensions (should match orchestrator output, default 512).
    avatar_identity : str
        LiveKit participant identity for the avatar.
    tts_sample_rate : int
        TTS output sample rate (default 16000 to match pipeline).
    """

    def __init__(
        self,
        orchestrator_url: str = "ws://localhost:8000/ws/stream",
        image: str | bytes = "./face.png",
        avatar_model: str = "humanlike-homo",
        prompt: str = "",
        seed: int = 42,
        video_width: int = 512,
        video_height: int = 512,
        avatar_identity: str = AVATAR_IDENTITY,
        tts_sample_rate: int = TARGET_SR,
    ) -> None:
        self._orchestrator_url = orchestrator_url
        self._image = image
        self._avatar_model = avatar_model
        self._prompt = prompt
        self._seed = seed
        self._video_width = video_width
        self._video_height = video_height
        self._avatar_identity = avatar_identity
        self._tts_sample_rate = tts_sample_rate
        # Populated by start(); None until then.
        self._runner: av.AvatarRunner | None = None
        self._gen: VideoGenerator | None = None

    def _load_image(self) -> bytes:
        """Return the face image as bytes, reading from disk if a path was given."""
        if isinstance(self._image, bytes):
            return self._image
        return Path(self._image).read_bytes()

    async def start(
        self,
        agent_session,
        room: rtc.Room,
    ) -> None:
        """
        Connect to the orchestrator and wire up the LiveKit room.
        Must be called before agent_session.start().
        """
        image_bytes = self._load_image()

        gen = VideoGenerator(
            ws_url=self._orchestrator_url,
            image=image_bytes,
            avatar_model=self._avatar_model,
            prompt=self._prompt,
            seed=self._seed,
        )
        # Blocks until the orchestrator handshake completes (raises on error).
        await gen.connect()
        self._gen = gen

        # Receives the agent's TTS audio over a LiveKit DataStream.
        audio_recv = av.DataStreamAudioReceiver(room)

        # Redirect the agent session's audio output into the DataStream so
        # it reaches the avatar instead of being published directly.
        agent_session.output.audio = av.DataStreamAudioOutput(
            room=room,
            destination_identity=self._avatar_identity,
            sample_rate=self._tts_sample_rate,
        )

        opts = av.AvatarOptions(
            video_width=self._video_width,
            video_height=self._video_height,
            # Use the fps reported by the orchestrator, not a hardcoded value.
            video_fps=gen.fps,
            audio_sample_rate=self._tts_sample_rate,
            audio_channels=1,
        )

        self._runner = av.AvatarRunner(
            room=room,
            audio_recv=audio_recv,
            video_gen=gen,
            avatar_opts=opts,
        )
        await self._runner.start()
        logger.info(
            "Humanlike avatar started: model=%s prompt=%r identity=%s",
            self._avatar_model, self._prompt[:60], self._avatar_identity,
        )

    async def set_prompt(self, prompt: str) -> None:
        """Update the expression prompt on a live session."""
        self._prompt = prompt
        if self._gen is not None:
            await self._gen.set_prompt(prompt)

    async def aclose(self) -> None:
        """Shut down the runner first, then the generator's WS connection."""
        if self._runner is not None:
            await self._runner.aclose()
        if self._gen is not None:
            await self._gen.aclose()
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: livekit-plugins-humanlike
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Humanlike avatar plugin for LiveKit Agents — real-time talking-head video with expression control
|
|
5
|
+
Author: Humanlike AI
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/HumanlikeAI/livekit-plugins-humanlike
|
|
8
|
+
Project-URL: Repository, https://github.com/HumanlikeAI/livekit-plugins-humanlike
|
|
9
|
+
Keywords: livekit,avatar,talking-head,real-time,humanlike
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Multimedia :: Video
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: livekit-agents>=1.0
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: websockets
|
|
23
|
+
Requires-Dist: Pillow
|
|
24
|
+
Provides-Extra: fast
|
|
25
|
+
Requires-Dist: turbojpeg; extra == "fast"
|
|
26
|
+
Requires-Dist: soxr; extra == "fast"
|
|
27
|
+
|
|
28
|
+
# livekit-plugins-humanlike
|
|
29
|
+
|
|
30
|
+
Real-time talking-head avatar plugin for [LiveKit Agents](https://docs.livekit.io/agents/).
|
|
31
|
+
|
|
32
|
+
Streams TTS audio to a GPU-backed avatar orchestrator that returns lip-synced video frames with facial expressions guided by a natural-language prompt.
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install livekit-plugins-humanlike
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick start
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from livekit.plugins.humanlike import AvatarSession
|
|
44
|
+
|
|
45
|
+
avatar = AvatarSession(
|
|
46
|
+
orchestrator_url="ws://your-gpu-server:8000/ws/stream",
|
|
47
|
+
image="/path/to/face.png",
|
|
48
|
+
avatar_model="humanlike-homo",
|
|
49
|
+
prompt="warm, friendly, subtly smiling, occasional nods",
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Wire into your LiveKit agent
|
|
53
|
+
await avatar.start(agent_session, room=ctx.room)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Parameters
|
|
57
|
+
|
|
58
|
+
| Parameter | Default | Description |
|
|
59
|
+
|-----------|---------|-------------|
|
|
60
|
+
| `orchestrator_url` | `ws://localhost:8000/ws/stream` | Avatar server WebSocket URL |
|
|
61
|
+
| `image` | `./face.png` | Face image (file path or raw bytes) |
|
|
62
|
+
| `avatar_model` | `humanlike-homo` | Model identifier |
|
|
63
|
+
| `prompt` | `""` | Expression prompt (e.g. "warm, smiling, nods") |
|
|
64
|
+
| `seed` | `42` | Random seed for reproducibility |
|
|
65
|
+
| `video_width` | `512` | Output video width |
|
|
66
|
+
| `video_height` | `512` | Output video height |
|
|
67
|
+
| `tts_sample_rate` | `16000` | Must match your TTS output rate |
|
|
68
|
+
|
|
69
|
+
## Live expression updates
|
|
70
|
+
|
|
71
|
+
Update the avatar's expression mid-conversation:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
await avatar.set_prompt("excited, wide eyes, big smile")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Full agent example
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from livekit.agents import Agent, AgentSession, JobContext, RoomOutputOptions, WorkerOptions, cli
|
|
81
|
+
from livekit.plugins import cartesia, openai, silero
|
|
82
|
+
from livekit.plugins.humanlike import AvatarSession
|
|
83
|
+
|
|
84
|
+
async def entrypoint(ctx: JobContext):
|
|
85
|
+
await ctx.connect()
|
|
86
|
+
|
|
87
|
+
session = AgentSession(
|
|
88
|
+
stt=openai.STT(),
|
|
89
|
+
llm=openai.LLM(model="gpt-4o"),
|
|
90
|
+
tts=cartesia.TTS(model="sonic-3", sample_rate=16000),
|
|
91
|
+
vad=silero.VAD.load(),
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
avatar = AvatarSession(
|
|
95
|
+
orchestrator_url="ws://localhost:8000/ws/stream",
|
|
96
|
+
image="./face.png",
|
|
97
|
+
avatar_model="humanlike-homo",
|
|
98
|
+
prompt="warm, friendly, natural eye movement",
|
|
99
|
+
)
|
|
100
|
+
await avatar.start(session, room=ctx.room)
|
|
101
|
+
|
|
102
|
+
await session.start(
|
|
103
|
+
room=ctx.room,
|
|
104
|
+
agent=Agent(instructions="You are a helpful assistant."),
|
|
105
|
+
room_output_options=RoomOutputOptions(audio_enabled=False),
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
if __name__ == "__main__":
|
|
109
|
+
cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
|
|
110
|
+
```
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
livekit/__init__.py
|
|
4
|
+
livekit/plugins/__init__.py
|
|
5
|
+
livekit/plugins/humanlike/__init__.py
|
|
6
|
+
livekit/plugins/humanlike/avatar.py
|
|
7
|
+
livekit_plugins_humanlike.egg-info/PKG-INFO
|
|
8
|
+
livekit_plugins_humanlike.egg-info/SOURCES.txt
|
|
9
|
+
livekit_plugins_humanlike.egg-info/dependency_links.txt
|
|
10
|
+
livekit_plugins_humanlike.egg-info/requires.txt
|
|
11
|
+
livekit_plugins_humanlike.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
livekit
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "livekit-plugins-humanlike"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Humanlike avatar plugin for LiveKit Agents — real-time talking-head video with expression control"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Humanlike AI" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["livekit", "avatar", "talking-head", "real-time", "humanlike"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Topic :: Multimedia :: Video",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"livekit-agents>=1.0",
|
|
28
|
+
"numpy",
|
|
29
|
+
"websockets",
|
|
30
|
+
"Pillow",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
fast = ["turbojpeg", "soxr"]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/HumanlikeAI/livekit-plugins-humanlike"
|
|
38
|
+
Repository = "https://github.com/HumanlikeAI/livekit-plugins-humanlike"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
include = ["livekit*"]
|