gemilive 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gemilive-0.1.0/LICENSE +21 -0
- gemilive-0.1.0/PKG-INFO +146 -0
- gemilive-0.1.0/README.md +130 -0
- gemilive-0.1.0/gemilive/__init__.py +3 -0
- gemilive-0.1.0/gemilive/config.py +37 -0
- gemilive-0.1.0/gemilive/mount.py +69 -0
- gemilive-0.1.0/gemilive/py.typed +1 -0
- gemilive-0.1.0/gemilive/router.py +147 -0
- gemilive-0.1.0/gemilive.egg-info/PKG-INFO +146 -0
- gemilive-0.1.0/gemilive.egg-info/SOURCES.txt +13 -0
- gemilive-0.1.0/gemilive.egg-info/dependency_links.txt +1 -0
- gemilive-0.1.0/gemilive.egg-info/requires.txt +5 -0
- gemilive-0.1.0/gemilive.egg-info/top_level.txt +2 -0
- gemilive-0.1.0/pyproject.toml +18 -0
- gemilive-0.1.0/setup.cfg +4 -0
gemilive-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Saidur Rahman Pulok
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
gemilive-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gemilive
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Minimal Gemini Live AI WebSocket proxy for FastAPI — plug and play voice + video AI.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Keywords: gemini,live,ai,voice,video,fastapi,websocket
|
|
7
|
+
Requires-Python: >=3.14
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: fastapi[standard]>=0.115.0
|
|
11
|
+
Requires-Dist: google-genai>=1.70.0
|
|
12
|
+
Requires-Dist: pydantic-settings>=2.13.1
|
|
13
|
+
Requires-Dist: python-dotenv>=1.2.2
|
|
14
|
+
Requires-Dist: websockets>=16.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# gemilive
|
|
18
|
+
|
|
19
|
+
Plug-and-play **Gemini Live AI** (voice + video) for your FastAPI app.
|
|
20
|
+
|
|
21
|
+
`gemilive` provides a seamless bridge between a web-based frontend and Google's Gemini Multimodal Live API. It handles the heavy lifting of WebSockets, bidirectional audio streams (16kHz up / 24kHz down), gapless browser PCM playback, and live video framing — allowing you to add conversational AI to your project in just **six lines of code**.
|
|
22
|
+
|
|
23
|
+
This repo contains both the Python backend plugin (`gemilive`) and the companion JavaScript client (`gemilive-js`).
|
|
24
|
+
|
|
25
|
+
## 🚀 Features
|
|
26
|
+
|
|
27
|
+
- **Bidirectional Voice AI**: Real-time PCM audio streaming for natural, fluid conversations. No laggy turn-by-turn.
|
|
28
|
+
- **Multimodal Vision**: The AI can see what your camera sees via 1fps JPEG snapshots.
|
|
29
|
+
- **Zero-Boilerplate Backend**: Just wrap your existing FastAPI app with `mount_gemilive()`.
|
|
30
|
+
- **Lightweight JS SDK**: A clean browser `GemiliveClient` handling media capture and resampling.
|
|
31
|
+
- **Toggleable Media**: Turn your camera off/on mid-session seamlessly.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 🛠️ Installation & Quickstart
|
|
36
|
+
|
|
37
|
+
Integration requires two pieces: the Python server endpoint and the JavaScript browser client.
|
|
38
|
+
|
|
39
|
+
### Backend (Python)
|
|
40
|
+
|
|
41
|
+
Install the pip package:
|
|
42
|
+
```bash
|
|
43
|
+
uv add gemilive
|
|
44
|
+
# or pip install gemilive
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Setup requires an API key. You can provide it in code or grab it from your `.env`:
|
|
48
|
+
```env
|
|
49
|
+
GOOGLE_API_KEY=your_gemini_api_key_here
|
|
50
|
+
MODEL_NAME=gemini-3.1-flash-live-preview
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Mount it into any FastAPI app:
|
|
54
|
+
```python
|
|
55
|
+
from fastapi import FastAPI
|
|
56
|
+
from gemilive import mount_gemilive
|
|
57
|
+
|
|
58
|
+
app = FastAPI()
|
|
59
|
+
|
|
60
|
+
# Mounts the WebSocket route at /ws/live
|
|
61
|
+
mount_gemilive(app, system_prompt="You are a helpful assistant. Keep answers brief.")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Frontend (JavaScript)
|
|
65
|
+
|
|
66
|
+
Install the npm package:
|
|
67
|
+
```bash
|
|
68
|
+
npm install gemilive-js
|
|
69
|
+
```
|
|
70
|
+
*Or use via CDN in plain HTML:*
|
|
71
|
+
```html
|
|
72
|
+
<script src="https://cdn.jsdelivr.net/npm/gemilive-js/dist/gemilive.min.js"></script>
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Initialize the client, connect, and start talking:
|
|
76
|
+
```javascript
|
|
77
|
+
import { GemiliveClient } from 'gemilive-js';
|
|
78
|
+
|
|
79
|
+
// Point it to your FastAPI server's mount path
|
|
80
|
+
const client = new GemiliveClient("ws://localhost:8000/ws/live");
|
|
81
|
+
|
|
82
|
+
client.onMessage = (text) => console.log("Gemini:", text);
|
|
83
|
+
client.onError = (err) => console.error("Error:", err);
|
|
84
|
+
|
|
85
|
+
// Start the connection (prompts user for Mic & Camera)
|
|
86
|
+
await client.start();
|
|
87
|
+
|
|
88
|
+
// Disable video mid-session (audio continues)
|
|
89
|
+
// client.toggleVideo(false);
|
|
90
|
+
|
|
91
|
+
// Stop and disconnect
|
|
92
|
+
// client.stop();
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## ⚙️ Advanced Configuration
|
|
98
|
+
|
|
99
|
+
### Python `mount_gemilive()` Overrides
|
|
100
|
+
You can override environment variables dynamically when mounting the API:
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
mount_gemilive(
|
|
104
|
+
app,
|
|
105
|
+
google_api_key="...", # Overrides GOOGLE_API_KEY env
|
|
106
|
+
model="gemini-3.1-flash-live-preview",# Overrides MODEL_NAME env
|
|
107
|
+
voice="Aoede", # Optional Gemini Voice ("Aoede", "Charon", etc.)
|
|
108
|
+
allow_origins=["https://myapp.com"], # Essential if your frontend is on a different domain
|
|
109
|
+
debug_mode=True # Console logging of message flow
|
|
110
|
+
)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### The System Prompt
|
|
114
|
+
You can set system prompts on the **server-side** (via `mount_gemilive`) or the **client-side** (via `new GemiliveClient(url, { systemPrompt: "..." })`).
|
|
115
|
+
If both are provided, the server-side prompt takes precedence, and the client-side prompt is appended securely as "Additional context".
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## 📂 Project Structure (For Contributors)
|
|
120
|
+
|
|
121
|
+
`gemilive` is developed as a monorepo containing two packages:
|
|
122
|
+
|
|
123
|
+
```text
|
|
124
|
+
├── gemilive/ # PyPI package source
|
|
125
|
+
│ ├── mount.py # Public FastAPI installer
|
|
126
|
+
│ ├── config.py # Pydantic env validation
|
|
127
|
+
│ └── router.py # Internal WebSocket / GenAI flow
|
|
128
|
+
├── gemilive-js/ # npm package source
|
|
129
|
+
│ ├── src/index.js # Browser SDK (Web Audio API logic)
|
|
130
|
+
│ └── package.json
|
|
131
|
+
└── main.py # Sandbox FastAPI app for testing and local dev
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
For guidelines on local development and how to publish to PyPI and npm, read `PUBLISHING.md`.
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## ⚠️ Important Considerations
|
|
139
|
+
|
|
140
|
+
1. **Browser Security**: Browsers restrict microphone/camera access to secure contexts. `getUserMedia` requires **HTTPS** in production. `localhost` works for development.
|
|
141
|
+
2. **Audio Resampling**: Browsers typically record audio at 44.1kHz or 48kHz. The `gemilive-js` SDK seamlessly resamples microphone inputs to **16kHz PCM** to meet Gemini's strict API requirements. Responses from Gemini are returned as 24kHz PCM and gaplessly played back using Javascript time-scheduling.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 📄 License
|
|
146
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
gemilive-0.1.0/README.md
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# gemilive
|
|
2
|
+
|
|
3
|
+
Plug-and-play **Gemini Live AI** (voice + video) for your FastAPI app.
|
|
4
|
+
|
|
5
|
+
`gemilive` provides a seamless bridge between a web-based frontend and Google's Gemini Multimodal Live API. It handles the heavy lifting of WebSockets, bidirectional audio streams (16kHz up / 24kHz down), gapless browser PCM playback, and live video framing — allowing you to add conversational AI to your project in just **six lines of code**.
|
|
6
|
+
|
|
7
|
+
This repo contains both the Python backend plugin (`gemilive`) and the companion JavaScript client (`gemilive-js`).
|
|
8
|
+
|
|
9
|
+
## 🚀 Features
|
|
10
|
+
|
|
11
|
+
- **Bidirectional Voice AI**: Real-time PCM audio streaming for natural, fluid conversations. No laggy turn-by-turn.
|
|
12
|
+
- **Multimodal Vision**: The AI can see what your camera sees via 1fps JPEG snapshots.
|
|
13
|
+
- **Zero-Boilerplate Backend**: Just wrap your existing FastAPI app with `mount_gemilive()`.
|
|
14
|
+
- **Lightweight JS SDK**: A clean browser `GemiliveClient` handling media capture and resampling.
|
|
15
|
+
- **Toggleable Media**: Turn your camera off/on mid-session seamlessly.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 🛠️ Installation & Quickstart
|
|
20
|
+
|
|
21
|
+
Integration requires two pieces: the Python server endpoint and the JavaScript browser client.
|
|
22
|
+
|
|
23
|
+
### Backend (Python)
|
|
24
|
+
|
|
25
|
+
Install the pip package:
|
|
26
|
+
```bash
|
|
27
|
+
uv add gemilive
|
|
28
|
+
# or pip install gemilive
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Setup requires an API key. You can provide it in code or grab it from your `.env`:
|
|
32
|
+
```env
|
|
33
|
+
GOOGLE_API_KEY=your_gemini_api_key_here
|
|
34
|
+
MODEL_NAME=gemini-3.1-flash-live-preview
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Mount it into any FastAPI app:
|
|
38
|
+
```python
|
|
39
|
+
from fastapi import FastAPI
|
|
40
|
+
from gemilive import mount_gemilive
|
|
41
|
+
|
|
42
|
+
app = FastAPI()
|
|
43
|
+
|
|
44
|
+
# Mounts the WebSocket route at /ws/live
|
|
45
|
+
mount_gemilive(app, system_prompt="You are a helpful assistant. Keep answers brief.")
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Frontend (JavaScript)
|
|
49
|
+
|
|
50
|
+
Install the npm package:
|
|
51
|
+
```bash
|
|
52
|
+
npm install gemilive-js
|
|
53
|
+
```
|
|
54
|
+
*Or use via CDN in plain HTML:*
|
|
55
|
+
```html
|
|
56
|
+
<script src="https://cdn.jsdelivr.net/npm/gemilive-js/dist/gemilive.min.js"></script>
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Initialize the client, connect, and start talking:
|
|
60
|
+
```javascript
|
|
61
|
+
import { GemiliveClient } from 'gemilive-js';
|
|
62
|
+
|
|
63
|
+
// Point it to your FastAPI server's mount path
|
|
64
|
+
const client = new GemiliveClient("ws://localhost:8000/ws/live");
|
|
65
|
+
|
|
66
|
+
client.onMessage = (text) => console.log("Gemini:", text);
|
|
67
|
+
client.onError = (err) => console.error("Error:", err);
|
|
68
|
+
|
|
69
|
+
// Start the connection (prompts user for Mic & Camera)
|
|
70
|
+
await client.start();
|
|
71
|
+
|
|
72
|
+
// Disable video mid-session (audio continues)
|
|
73
|
+
// client.toggleVideo(false);
|
|
74
|
+
|
|
75
|
+
// Stop and disconnect
|
|
76
|
+
// client.stop();
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## ⚙️ Advanced Configuration
|
|
82
|
+
|
|
83
|
+
### Python `mount_gemilive()` Overrides
|
|
84
|
+
You can override environment variables dynamically when mounting the API:
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
mount_gemilive(
|
|
88
|
+
app,
|
|
89
|
+
google_api_key="...", # Overrides GOOGLE_API_KEY env
|
|
90
|
+
model="gemini-3.1-flash-live-preview",# Overrides MODEL_NAME env
|
|
91
|
+
voice="Aoede", # Optional Gemini Voice ("Aoede", "Charon", etc.)
|
|
92
|
+
allow_origins=["https://myapp.com"], # Essential if your frontend is on a different domain
|
|
93
|
+
debug_mode=True # Console logging of message flow
|
|
94
|
+
)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### The System Prompt
|
|
98
|
+
You can set system prompts on the **server-side** (via `mount_gemilive`) or the **client-side** (via `new GemiliveClient(url, { systemPrompt: "..." })`).
|
|
99
|
+
If both are provided, the server-side prompt takes precedence, and the client-side prompt is appended securely as "Additional context".
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## 📂 Project Structure (For Contributors)
|
|
104
|
+
|
|
105
|
+
`gemilive` is developed as a monorepo containing two packages:
|
|
106
|
+
|
|
107
|
+
```text
|
|
108
|
+
├── gemilive/ # PyPI package source
|
|
109
|
+
│ ├── mount.py # Public FastAPI installer
|
|
110
|
+
│ ├── config.py # Pydantic env validation
|
|
111
|
+
│ └── router.py # Internal WebSocket / GenAI flow
|
|
112
|
+
├── gemilive-js/ # npm package source
|
|
113
|
+
│ ├── src/index.js # Browser SDK (Web Audio API logic)
|
|
114
|
+
│ └── package.json
|
|
115
|
+
└── main.py # Sandbox FastAPI app for testing and local dev
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
For guidelines on local development and how to publish to PyPI and npm, read `PUBLISHING.md`.
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## ⚠️ Important Considerations
|
|
123
|
+
|
|
124
|
+
1. **Browser Security**: Browsers restrict microphone/camera access to secure contexts. `getUserMedia` requires **HTTPS** in production. `localhost` works for development.
|
|
125
|
+
2. **Audio Resampling**: Browsers typically record audio at 44.1kHz or 48kHz. The `gemilive-js` SDK seamlessly resamples microphone inputs to **16kHz PCM** to meet Gemini's strict API requirements. Responses from Gemini are returned as 24kHz PCM and gaplessly played back using Javascript time-scheduling.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## 📄 License
|
|
130
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class GemiliveSettings(BaseSettings):
    """
    Environment-backed configuration for the gemilive package.

    Every field can be supplied through an environment variable (or a local
    ``.env`` file). The variable name for each field is its
    ``validation_alias``:

    - ``GOOGLE_API_KEY``         -> ``google_api_key``
    - ``MODEL_NAME``             -> ``model``
    - ``GEMILIVE_VOICE``         -> ``voice``
    - ``GEMILIVE_SYSTEM_PROMPT`` -> ``system_prompt``
    - ``GEMILIVE_DEBUG``         -> ``debug_mode``

    Note that only the last three carry the ``GEMILIVE_`` prefix.
    """

    # Read a UTF-8 ".env" file, ignore unrelated variables found there, and
    # allow fields to be populated by field name as well as by alias.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
        populate_by_name=True,
    )

    # Gemini API key; defaults to an empty string when the variable is unset.
    google_api_key: str = Field(default="", validation_alias="GOOGLE_API_KEY")

    # Gemini Live model name.
    model: str = Field(default="gemini-3.1-flash-live-preview", validation_alias="MODEL_NAME")

    # Optional prebuilt voice name; None leaves the choice to Gemini.
    voice: str | None = Field(
        default=None,
        validation_alias="GEMILIVE_VOICE",
        description="Gemini voice name (e.g. 'Aoede', 'Charon'). Omit to use Gemini default.",
    )

    # Optional server-side system prompt.
    system_prompt: str | None = Field(
        default=None,
        validation_alias="GEMILIVE_SYSTEM_PROMPT",
        description="Server-side system prompt baked in at mount time.",
    )

    # Enables the package's debug print statements.
    debug_mode: bool = Field(default=False, validation_alias="GEMILIVE_DEBUG")
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
from fastapi import FastAPI
|
|
4
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
5
|
+
|
|
6
|
+
from .config import GemiliveSettings
|
|
7
|
+
from .router import create_router
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def mount_gemilive(
    app: FastAPI,
    *,
    google_api_key: Optional[str] = None,
    model: Optional[str] = None,
    voice: Optional[str] = None,
    system_prompt: Optional[str] = None,
    allow_origins: Optional[List[str]] = None,
    debug_mode: bool = False,
) -> None:
    """
    Mount the Gemini Live AI WebSocket endpoint onto a FastAPI app.

    All keyword arguments are optional overrides. When omitted, values are
    read from environment variables (GOOGLE_API_KEY, MODEL_NAME, etc.) via
    ``GemiliveSettings``.

    Args:
        app: The FastAPI application instance. It is modified in place: a
            CORS middleware is added and the gemilive router is included.
        google_api_key: Gemini API key. Falls back to GOOGLE_API_KEY env var.
        model: Gemini Live model name. Falls back to MODEL_NAME env var.
        voice: Gemini voice (e.g. "Aoede", "Charon"). None = keep the value
            from settings (Gemini default when that is unset too).
        system_prompt: Server-side system prompt baked in at mount time.
            Client-supplied prompts are appended as extra context.
        allow_origins: Origins passed to the CORS middleware.
            None (the default) means ["*"]. Restrict in production.
        debug_mode: Print verbose logs. Passing True forces debug mode on;
            False leaves the GEMILIVE_DEBUG env var in charge.

    Returns:
        None.

    Example::

        from fastapi import FastAPI
        from gemilive import mount_gemilive

        app = FastAPI()
        mount_gemilive(app, system_prompt="You are a helpful assistant.")
    """
    # A None sentinel replaces the former mutable list default, so every call
    # gets its own fresh ["*"] list instead of one shared across calls.
    if allow_origins is None:
        allow_origins = ["*"]

    # Collect only the overrides the caller actually supplied; anything left
    # as None keeps the value resolved from the environment.
    overrides = {
        field: value
        for field, value in (
            ("google_api_key", google_api_key),
            ("model", model),
            ("voice", voice),
            ("system_prompt", system_prompt),
        )
        if value is not None
    }
    # debug_mode can only be switched on from code, never forced off.
    if debug_mode:
        overrides["debug_mode"] = True

    # Build settings from env vars, then apply explicit overrides in one copy.
    settings = GemiliveSettings()
    if overrides:
        settings = settings.model_copy(update=overrides)

    # Add CORS middleware for separate-origin frontends.
    # NOTE(review): Starlette's CORSMiddleware appears to act on HTTP requests
    # only; confirm whether WebSocket handshakes need a separate Origin check.
    # NOTE(review): the default pairs a wildcard origin with
    # allow_credentials=True; confirm this is acceptable for production use.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allow_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Register the WebSocket router
    router = create_router(settings)
    app.include_router(router)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# PEP 561 marker file
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import base64
|
|
3
|
+
import traceback
|
|
4
|
+
|
|
5
|
+
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, status
|
|
6
|
+
from google import genai
|
|
7
|
+
from google.genai import types
|
|
8
|
+
|
|
9
|
+
from .config import GemiliveSettings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def create_router(settings: GemiliveSettings) -> APIRouter:
    """
    Build and return the FastAPI router for the /ws/live endpoint.

    Called once at mount time with a resolved config. A single Gemini client
    is created here and shared by every WebSocket connection the router
    serves.

    Args:
        settings: Resolved gemilive configuration (API key, model, voice,
            system prompt, debug flag).

    Returns:
        An ``APIRouter`` exposing one WebSocket route, ``/ws/live``.
    """
    router = APIRouter()

    _client = genai.Client(
        api_key=settings.google_api_key,
        http_options={"api_version": "v1beta"},
    )

    # Ensure model name has the required "models/" prefix
    _model = settings.model if settings.model.startswith("models/") else f"models/{settings.model}"

    async def _receive_from_client(ws: WebSocket, session) -> None:
        """
        Relay browser messages to the Gemini session until the socket closes.

        Two JSON message shapes are handled:
        ``{"text": ...}`` is sent as a completed user turn, and
        ``{"realtimeInput": {"mediaChunks": [...]}}`` is sent chunk by chunk
        as realtime audio or video input. Other messages are ignored.
        """
        try:
            while True:
                data = await ws.receive_json()

                if "text" in data:
                    if settings.debug_mode:
                        print(f"[gemilive] text → Gemini: {data['text']}")
                    await session.send_client_content(
                        turns=types.Content(
                            role="user",
                            parts=[types.Part.from_text(text=data["text"])],
                        ),
                        turn_complete=True,
                    )

                elif "realtimeInput" in data:
                    for chunk in data["realtimeInput"].get("mediaChunks", []):
                        mime_type = chunk.get("mimeType")
                        b64_data = chunk.get("data")
                        # Skip chunks missing either the type or the payload.
                        if not mime_type or not b64_data:
                            continue
                        decoded = base64.b64decode(b64_data)
                        if mime_type.startswith("audio/"):
                            # Any audio/* chunk is forwarded as "audio/pcm".
                            await session.send_realtime_input(
                                audio=types.Blob(mime_type="audio/pcm", data=decoded)
                            )
                        elif mime_type.startswith("image/") or mime_type.startswith("video/"):
                            # Images and video keep the client's MIME type.
                            await session.send_realtime_input(
                                video=types.Blob(mime_type=mime_type, data=decoded)
                            )

        except WebSocketDisconnect:
            # Normal end of session: the browser went away.
            pass
        except Exception as e:
            # Any other failure ends this relay; details only in debug mode.
            if settings.debug_mode:
                print(f"[gemilive] client receive error: {e}")
                traceback.print_exc()

    async def _receive_from_gemini(ws: WebSocket, session) -> None:
        """
        Relay Gemini responses to the browser as JSON messages.

        Emits ``{"type": "audio", ...}`` (base64 payload) and
        ``{"type": "text", ...}`` messages, followed by
        ``{"type": "turn_complete"}`` each time one ``session.receive()``
        iteration is exhausted.
        """
        try:
            while True:
                # presumably one receive() iterator covers a single model
                # turn — confirm against the google-genai Live API docs
                turn = session.receive()
                async for response in turn:
                    # Session-resumption bookkeeping is not forwarded.
                    if response.session_resumption_update is not None:
                        continue
                    if getattr(response, "data", None):
                        await ws.send_json({
                            "type": "audio",
                            "mimeType": "audio/pcm",
                            "data": base64.b64encode(response.data).decode("utf-8"),
                        })
                    if getattr(response, "text", None):
                        await ws.send_json({"type": "text", "text": response.text})
                await ws.send_json({"type": "turn_complete"})

        except asyncio.CancelledError:
            # Cancelled by live_endpoint once the client side has finished.
            pass
        except Exception as e:
            if settings.debug_mode:
                print(f"[gemilive] Gemini receive error: {e}")
                traceback.print_exc()

    @router.websocket("/ws/live")
    async def live_endpoint(websocket: WebSocket) -> None:
        """
        Serve one live session: handshake, Gemini connect, bidirectional relay.

        The first client message must be JSON; its optional
        ``setup.system_prompt`` is merged with the server-side prompt. The
        socket is closed with 1003 if that message cannot be processed and
        with 1011 if the Gemini connection fails.
        """
        await websocket.accept()

        # --- Setup phase: read system prompt from client, merge with server-side ---
        try:
            setup_msg = await websocket.receive_json()
            client_prompt = setup_msg.get("setup", {}).get("system_prompt", "")

            # Server-side prompt wins; client prompt is appended as extra context
            if settings.system_prompt and client_prompt:
                full_prompt = f"{settings.system_prompt}\n\nAdditional context: {client_prompt}"
            else:
                full_prompt = settings.system_prompt or client_prompt or ""

        except Exception as e:
            if settings.debug_mode:
                print(f"[gemilive] setup error: {e}")
            await websocket.close(code=status.WS_1003_UNSUPPORTED_DATA)
            return

        # --- Build Gemini session config ---
        speech_config = None
        if settings.voice:
            speech_config = types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=settings.voice)
                )
            )

        # Optional keys are only passed when they have a value.
        config = types.LiveConnectConfig(
            response_modalities=[types.Modality.AUDIO],
            **({"system_instruction": types.Content(parts=[types.Part.from_text(text=full_prompt)])} if full_prompt else {}),
            **({"speech_config": speech_config} if speech_config else {}),
        )

        # --- Connect to Gemini and run bidirectional streaming ---
        try:
            print(f"[gemilive] connecting → {_model}")
            async with _client.aio.live.connect(model=_model, config=config) as session:
                print("[gemilive] connected")
                client_task = asyncio.create_task(_receive_from_client(websocket, session))
                gemini_task = asyncio.create_task(_receive_from_gemini(websocket, session))

                # The session ends as soon as either direction stops.
                _, pending = await asyncio.wait(
                    [client_task, gemini_task],
                    return_when=asyncio.FIRST_COMPLETED,
                )
                for task in pending:
                    task.cancel()
                # Wait for the cancelled relay to unwind before the Gemini
                # session is closed by leaving the `async with` block;
                # return_exceptions=True absorbs the resulting CancelledError.
                if pending:
                    await asyncio.gather(*pending, return_exceptions=True)

        except Exception as e:
            if settings.debug_mode:
                print(f"[gemilive] Gemini connection error: {e}")
            # Only close if the socket is still open on our side.
            if websocket.client_state.name == "CONNECTED":
                await websocket.close(code=status.WS_1011_INTERNAL_ERROR)

    return router
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gemilive
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Minimal Gemini Live AI WebSocket proxy for FastAPI — plug and play voice + video AI.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Keywords: gemini,live,ai,voice,video,fastapi,websocket
|
|
7
|
+
Requires-Python: >=3.14
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: fastapi[standard]>=0.115.0
|
|
11
|
+
Requires-Dist: google-genai>=1.70.0
|
|
12
|
+
Requires-Dist: pydantic-settings>=2.13.1
|
|
13
|
+
Requires-Dist: python-dotenv>=1.2.2
|
|
14
|
+
Requires-Dist: websockets>=16.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# gemilive
|
|
18
|
+
|
|
19
|
+
Plug-and-play **Gemini Live AI** (voice + video) for your FastAPI app.
|
|
20
|
+
|
|
21
|
+
`gemilive` provides a seamless bridge between a web-based frontend and Google's Gemini Multimodal Live API. It handles the heavy lifting of WebSockets, bidirectional audio streams (16kHz up / 24kHz down), gapless browser PCM playback, and live video framing — allowing you to add conversational AI to your project in just **six lines of code**.
|
|
22
|
+
|
|
23
|
+
This repo contains both the Python backend plugin (`gemilive`) and the companion JavaScript client (`gemilive-js`).
|
|
24
|
+
|
|
25
|
+
## 🚀 Features
|
|
26
|
+
|
|
27
|
+
- **Bidirectional Voice AI**: Real-time PCM audio streaming for natural, fluid conversations. No laggy turn-by-turn.
|
|
28
|
+
- **Multimodal Vision**: The AI can see what your camera sees via 1fps JPEG snapshots.
|
|
29
|
+
- **Zero-Boilerplate Backend**: Just wrap your existing FastAPI app with `mount_gemilive()`.
|
|
30
|
+
- **Lightweight JS SDK**: A clean browser `GemiliveClient` handling media capture and resampling.
|
|
31
|
+
- **Toggleable Media**: Turn your camera off/on mid-session seamlessly.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 🛠️ Installation & Quickstart
|
|
36
|
+
|
|
37
|
+
Integration requires two pieces: the Python server endpoint and the JavaScript browser client.
|
|
38
|
+
|
|
39
|
+
### Backend (Python)
|
|
40
|
+
|
|
41
|
+
Install the pip package:
|
|
42
|
+
```bash
|
|
43
|
+
uv add gemilive
|
|
44
|
+
# or pip install gemilive
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Setup requires an API key. You can provide it in code or grab it from your `.env`:
|
|
48
|
+
```env
|
|
49
|
+
GOOGLE_API_KEY=your_gemini_api_key_here
|
|
50
|
+
MODEL_NAME=gemini-3.1-flash-live-preview
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Mount it into any FastAPI app:
|
|
54
|
+
```python
|
|
55
|
+
from fastapi import FastAPI
|
|
56
|
+
from gemilive import mount_gemilive
|
|
57
|
+
|
|
58
|
+
app = FastAPI()
|
|
59
|
+
|
|
60
|
+
# Mounts the WebSocket route at /ws/live
|
|
61
|
+
mount_gemilive(app, system_prompt="You are a helpful assistant. Keep answers brief.")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Frontend (JavaScript)
|
|
65
|
+
|
|
66
|
+
Install the npm package:
|
|
67
|
+
```bash
|
|
68
|
+
npm install gemilive-js
|
|
69
|
+
```
|
|
70
|
+
*Or use via CDN in plain HTML:*
|
|
71
|
+
```html
|
|
72
|
+
<script src="https://cdn.jsdelivr.net/npm/gemilive-js/dist/gemilive.min.js"></script>
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Initialize the client, connect, and start talking:
|
|
76
|
+
```javascript
|
|
77
|
+
import { GemiliveClient } from 'gemilive-js';
|
|
78
|
+
|
|
79
|
+
// Point it to your FastAPI server's mount path
|
|
80
|
+
const client = new GemiliveClient("ws://localhost:8000/ws/live");
|
|
81
|
+
|
|
82
|
+
client.onMessage = (text) => console.log("Gemini:", text);
|
|
83
|
+
client.onError = (err) => console.error("Error:", err);
|
|
84
|
+
|
|
85
|
+
// Start the connection (prompts user for Mic & Camera)
|
|
86
|
+
await client.start();
|
|
87
|
+
|
|
88
|
+
// Disable video mid-session (audio continues)
|
|
89
|
+
// client.toggleVideo(false);
|
|
90
|
+
|
|
91
|
+
// Stop and disconnect
|
|
92
|
+
// client.stop();
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## ⚙️ Advanced Configuration
|
|
98
|
+
|
|
99
|
+
### Python `mount_gemilive()` Overrides
|
|
100
|
+
You can override environment variables dynamically when mounting the API:
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
mount_gemilive(
|
|
104
|
+
app,
|
|
105
|
+
google_api_key="...", # Overrides GOOGLE_API_KEY env
|
|
106
|
+
model="gemini-3.1-flash-live-preview",# Overrides MODEL_NAME env
|
|
107
|
+
voice="Aoede", # Optional Gemini Voice ("Aoede", "Charon", etc.)
|
|
108
|
+
allow_origins=["https://myapp.com"], # Essential if your frontend is on a different domain
|
|
109
|
+
debug_mode=True # Console logging of message flow
|
|
110
|
+
)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### The System Prompt
|
|
114
|
+
You can set system prompts on the **server-side** (via `mount_gemilive`) or the **client-side** (via `new GemiliveClient(url, { systemPrompt: "..." })`).
|
|
115
|
+
If both are provided, the server-side prompt takes precedence, and the client-side prompt is appended securely as "Additional context".
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## 📂 Project Structure (For Contributors)
|
|
120
|
+
|
|
121
|
+
`gemilive` is developed as a monorepo containing two packages:
|
|
122
|
+
|
|
123
|
+
```text
|
|
124
|
+
├── gemilive/ # PyPI package source
|
|
125
|
+
│ ├── mount.py # Public FastAPI installer
|
|
126
|
+
│ ├── config.py # Pydantic env validation
|
|
127
|
+
│ └── router.py # Internal WebSocket / GenAI flow
|
|
128
|
+
├── gemilive-js/ # npm package source
|
|
129
|
+
│ ├── src/index.js # Browser SDK (Web Audio API logic)
|
|
130
|
+
│ └── package.json
|
|
131
|
+
└── main.py # Sandbox FastAPI app for testing and local dev
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
For guidelines on local development and how to publish to PyPI and npm, read `PUBLISHING.md`.
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## ⚠️ Important Considerations
|
|
139
|
+
|
|
140
|
+
1. **Browser Security**: Browsers restrict microphone/camera access to secure contexts. `getUserMedia` requires **HTTPS** in production. `localhost` works for development.
|
|
141
|
+
2. **Audio Resampling**: Browsers typically record audio at 44.1kHz or 48kHz. The `gemilive-js` SDK seamlessly resamples microphone inputs to **16kHz PCM** to meet Gemini's strict API requirements. Responses from Gemini are returned as 24kHz PCM and gaplessly played back using Javascript time-scheduling.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 📄 License
|
|
146
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
gemilive/__init__.py
|
|
5
|
+
gemilive/config.py
|
|
6
|
+
gemilive/mount.py
|
|
7
|
+
gemilive/py.typed
|
|
8
|
+
gemilive/router.py
|
|
9
|
+
gemilive.egg-info/PKG-INFO
|
|
10
|
+
gemilive.egg-info/SOURCES.txt
|
|
11
|
+
gemilive.egg-info/dependency_links.txt
|
|
12
|
+
gemilive.egg-info/requires.txt
|
|
13
|
+
gemilive.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "gemilive"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Minimal Gemini Live AI WebSocket proxy for FastAPI — plug and play voice + video AI."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.14"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
keywords = ["gemini", "live", "ai", "voice", "video", "fastapi", "websocket"]
|
|
9
|
+
dependencies = [
|
|
10
|
+
"fastapi[standard]>=0.115.0",
|
|
11
|
+
"google-genai>=1.70.0",
|
|
12
|
+
"pydantic-settings>=2.13.1",
|
|
13
|
+
"python-dotenv>=1.2.2",
|
|
14
|
+
"websockets>=16.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[tool.setuptools.packages.find]
|
|
18
|
+
include = ["gemilive*"]
|
gemilive-0.1.0/setup.cfg
ADDED