gemilive 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gemilive-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Saidur Rahman Pulok
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: gemilive
3
+ Version: 0.1.0
4
+ Summary: Minimal Gemini Live AI WebSocket proxy for FastAPI — plug and play voice + video AI.
5
+ License-Expression: MIT
6
+ Keywords: gemini,live,ai,voice,video,fastapi,websocket
7
+ Requires-Python: >=3.14
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: fastapi[standard]>=0.115.0
11
+ Requires-Dist: google-genai>=1.70.0
12
+ Requires-Dist: pydantic-settings>=2.13.1
13
+ Requires-Dist: python-dotenv>=1.2.2
14
+ Requires-Dist: websockets>=16.0
15
+ Dynamic: license-file
16
+
17
+ # gemilive
18
+
19
+ Plug-and-play **Gemini Live AI** (voice + video) for your FastAPI app.
20
+
21
+ `gemilive` provides a seamless bridge between a web-based frontend and Google's Gemini Multimodal Live API. It handles the heavy lifting of WebSockets, bidirectional audio streams (16kHz up / 24kHz down), gapless browser PCM playback, and live video framing — allowing you to add conversational AI to your project in just **six lines of code**.
22
+
23
+ This repo contains both the Python backend plugin (`gemilive`) and the companion JavaScript client (`gemilive-js`).
24
+
25
+ ## 🚀 Features
26
+
27
+ - **Bidirectional Voice AI**: Real-time PCM audio streaming for natural, fluid conversations. No laggy turn-by-turn.
28
+ - **Multimodal Vision**: The AI can see what your camera sees via 1fps JPEG snapshots.
29
+ - **Zero-Boilerplate Backend**: Just wrap your existing FastAPI app with `mount_gemilive()`.
30
+ - **Lightweight JS SDK**: A clean browser `GemiliveClient` handling media capture and resampling.
31
+ - **Toggleable Media**: Turn your camera off/on mid-session seamlessly.
32
+
33
+ ---
34
+
35
+ ## 🛠️ Installation & Quickstart
36
+
37
+ Integration requires two pieces: the Python server endpoint and the JavaScript browser client.
38
+
39
+ ### Backend (Python)
40
+
41
+ Install the pip package:
42
+ ```bash
43
+ uv add gemilive
44
+ # or pip install gemilive
45
+ ```
46
+
47
+ Setup requires an API key. You can provide it in code or grab it from your `.env`:
48
+ ```env
49
+ GOOGLE_API_KEY=your_gemini_api_key_here
50
+ MODEL_NAME=gemini-3.1-flash-live-preview
51
+ ```
52
+
53
+ Mount it into any FastAPI app:
54
+ ```python
55
+ from fastapi import FastAPI
56
+ from gemilive import mount_gemilive
57
+
58
+ app = FastAPI()
59
+
60
+ # Mounts the WebSocket route at /ws/live
61
+ mount_gemilive(app, system_prompt="You are a helpful assistant. Keep answers brief.")
62
+ ```
63
+
64
+ ### Frontend (JavaScript)
65
+
66
+ Install the npm package:
67
+ ```bash
68
+ npm install gemilive-js
69
+ ```
70
+ *Or use via CDN in plain HTML:*
71
+ ```html
72
+ <script src="https://cdn.jsdelivr.net/npm/gemilive-js/dist/gemilive.min.js"></script>
73
+ ```
74
+
75
+ Initialize the client, connect, and start talking:
76
+ ```javascript
77
+ import { GemiliveClient } from 'gemilive-js';
78
+
79
+ // Point it to your FastAPI server's mount path
80
+ const client = new GemiliveClient("ws://localhost:8000/ws/live");
81
+
82
+ client.onMessage = (text) => console.log("Gemini:", text);
83
+ client.onError = (err) => console.error("Error:", err);
84
+
85
+ // Start the connection (prompts user for Mic & Camera)
86
+ await client.start();
87
+
88
+ // Disable video mid-session (audio continues)
89
+ // client.toggleVideo(false);
90
+
91
+ // Stop and disconnect
92
+ // client.stop();
93
+ ```
94
+
95
+ ---
96
+
97
+ ## ⚙️ Advanced Configuration
98
+
99
+ ### Python `mount_gemilive()` Overrides
100
+ You can override environment variables dynamically when mounting the API:
101
+
102
+ ```python
103
+ mount_gemilive(
104
+ app,
105
+ google_api_key="...", # Overrides GOOGLE_API_KEY env
106
+ model="gemini-3.1-flash-live-preview",# Overrides MODEL_NAME env
107
+ voice="Aoede", # Optional Gemini Voice ("Aoede", "Charon", etc.)
108
+ allow_origins=["https://myapp.com"], # Essential if your frontend is on a different domain
109
+ debug_mode=True # Console logging of message flow
110
+ )
111
+ ```
112
+
113
+ ### The System Prompt
114
+ You can set system prompts on the **server-side** (via `mount_gemilive`) or the **client-side** (via `new GemiliveClient(url, { systemPrompt: "..." })`).
115
+ If both are provided, the server-side prompt takes precedence, and the client-side prompt is appended securely as "Additional context".
116
+
117
+ ---
118
+
119
+ ## 📂 Project Structure (For Contributors)
120
+
121
+ `gemilive` is developed as a monorepo containing two packages:
122
+
123
+ ```text
124
+ ├── gemilive/ # PyPI package source
125
+ │ ├── mount.py # Public FastAPI installer
126
+ │ ├── config.py # Pydantic env validation
127
+ │ └── router.py # Internal WebSocket / GenAI flow
128
+ ├── gemilive-js/ # npm package source
129
+ │ ├── src/index.js # Browser SDK (Web Audio API logic)
130
+ │ └── package.json
131
+ └── main.py # Sandbox FastAPI app for testing and local dev
132
+ ```
133
+
134
+ For guidelines on local development and how to publish to PyPI and npm, read `PUBLISHING.md`.
135
+
136
+ ---
137
+
138
+ ## ⚠️ Important Considerations
139
+
140
+ 1. **Browser Security**: Browsers restrict microphone/camera access to secure contexts. `getUserMedia` requires **HTTPS** in production. `localhost` works for development.
141
+ 2. **Audio Resampling**: Browsers typically record audio at 44.1kHz or 48kHz. The `gemilive-js` SDK seamlessly resamples microphone input to **16kHz PCM** to meet Gemini's strict API requirements. Responses from Gemini are returned as 24kHz PCM and played back gaplessly using JavaScript time-scheduling.
142
+
143
+ ---
144
+
145
+ ## 📄 License
146
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,130 @@
1
+ # gemilive
2
+
3
+ Plug-and-play **Gemini Live AI** (voice + video) for your FastAPI app.
4
+
5
+ `gemilive` provides a seamless bridge between a web-based frontend and Google's Gemini Multimodal Live API. It handles the heavy lifting of WebSockets, bidirectional audio streams (16kHz up / 24kHz down), gapless browser PCM playback, and live video framing — allowing you to add conversational AI to your project in just **six lines of code**.
6
+
7
+ This repo contains both the Python backend plugin (`gemilive`) and the companion JavaScript client (`gemilive-js`).
8
+
9
+ ## 🚀 Features
10
+
11
+ - **Bidirectional Voice AI**: Real-time PCM audio streaming for natural, fluid conversations. No laggy turn-by-turn.
12
+ - **Multimodal Vision**: The AI can see what your camera sees via 1fps JPEG snapshots.
13
+ - **Zero-Boilerplate Backend**: Just wrap your existing FastAPI app with `mount_gemilive()`.
14
+ - **Lightweight JS SDK**: A clean browser `GemiliveClient` handling media capture and resampling.
15
+ - **Toggleable Media**: Turn your camera off/on mid-session seamlessly.
16
+
17
+ ---
18
+
19
+ ## 🛠️ Installation & Quickstart
20
+
21
+ Integration requires two pieces: the Python server endpoint and the JavaScript browser client.
22
+
23
+ ### Backend (Python)
24
+
25
+ Install the pip package:
26
+ ```bash
27
+ uv add gemilive
28
+ # or pip install gemilive
29
+ ```
30
+
31
+ Setup requires an API key. You can provide it in code or grab it from your `.env`:
32
+ ```env
33
+ GOOGLE_API_KEY=your_gemini_api_key_here
34
+ MODEL_NAME=gemini-3.1-flash-live-preview
35
+ ```
36
+
37
+ Mount it into any FastAPI app:
38
+ ```python
39
+ from fastapi import FastAPI
40
+ from gemilive import mount_gemilive
41
+
42
+ app = FastAPI()
43
+
44
+ # Mounts the WebSocket route at /ws/live
45
+ mount_gemilive(app, system_prompt="You are a helpful assistant. Keep answers brief.")
46
+ ```
47
+
48
+ ### Frontend (JavaScript)
49
+
50
+ Install the npm package:
51
+ ```bash
52
+ npm install gemilive-js
53
+ ```
54
+ *Or use via CDN in plain HTML:*
55
+ ```html
56
+ <script src="https://cdn.jsdelivr.net/npm/gemilive-js/dist/gemilive.min.js"></script>
57
+ ```
58
+
59
+ Initialize the client, connect, and start talking:
60
+ ```javascript
61
+ import { GemiliveClient } from 'gemilive-js';
62
+
63
+ // Point it to your FastAPI server's mount path
64
+ const client = new GemiliveClient("ws://localhost:8000/ws/live");
65
+
66
+ client.onMessage = (text) => console.log("Gemini:", text);
67
+ client.onError = (err) => console.error("Error:", err);
68
+
69
+ // Start the connection (prompts user for Mic & Camera)
70
+ await client.start();
71
+
72
+ // Disable video mid-session (audio continues)
73
+ // client.toggleVideo(false);
74
+
75
+ // Stop and disconnect
76
+ // client.stop();
77
+ ```
78
+
79
+ ---
80
+
81
+ ## ⚙️ Advanced Configuration
82
+
83
+ ### Python `mount_gemilive()` Overrides
84
+ You can override environment variables dynamically when mounting the API:
85
+
86
+ ```python
87
+ mount_gemilive(
88
+ app,
89
+ google_api_key="...", # Overrides GOOGLE_API_KEY env
90
+ model="gemini-3.1-flash-live-preview",# Overrides MODEL_NAME env
91
+ voice="Aoede", # Optional Gemini Voice ("Aoede", "Charon", etc.)
92
+ allow_origins=["https://myapp.com"], # Essential if your frontend is on a different domain
93
+ debug_mode=True # Console logging of message flow
94
+ )
95
+ ```
96
+
97
+ ### The System Prompt
98
+ You can set system prompts on the **server-side** (via `mount_gemilive`) or the **client-side** (via `new GemiliveClient(url, { systemPrompt: "..." })`).
99
+ If both are provided, the server-side prompt takes precedence, and the client-side prompt is appended securely as "Additional context".
100
+
101
+ ---
102
+
103
+ ## 📂 Project Structure (For Contributors)
104
+
105
+ `gemilive` is developed as a monorepo containing two packages:
106
+
107
+ ```text
108
+ ├── gemilive/ # PyPI package source
109
+ │ ├── mount.py # Public FastAPI installer
110
+ │ ├── config.py # Pydantic env validation
111
+ │ └── router.py # Internal WebSocket / GenAI flow
112
+ ├── gemilive-js/ # npm package source
113
+ │ ├── src/index.js # Browser SDK (Web Audio API logic)
114
+ │ └── package.json
115
+ └── main.py # Sandbox FastAPI app for testing and local dev
116
+ ```
117
+
118
+ For guidelines on local development and how to publish to PyPI and npm, read `PUBLISHING.md`.
119
+
120
+ ---
121
+
122
+ ## ⚠️ Important Considerations
123
+
124
+ 1. **Browser Security**: Browsers restrict microphone/camera access to secure contexts. `getUserMedia` requires **HTTPS** in production. `localhost` works for development.
125
+ 2. **Audio Resampling**: Browsers typically record audio at 44.1kHz or 48kHz. The `gemilive-js` SDK seamlessly resamples microphone input to **16kHz PCM** to meet Gemini's strict API requirements. Responses from Gemini are returned as 24kHz PCM and played back gaplessly using JavaScript time-scheduling.
126
+
127
+ ---
128
+
129
+ ## 📄 License
130
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,3 @@
1
+ from .mount import mount_gemilive
2
+
3
+ __all__ = ["mount_gemilive"]
@@ -0,0 +1,37 @@
1
+ from typing import Optional
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+ from pydantic import Field
4
+
5
+
6
class GemiliveSettings(BaseSettings):
    """
    Configuration for the gemilive package.

    Every field can be supplied through an environment variable or through a
    ``.env`` file (see ``model_config`` below). The variable name for each
    field is fixed by its ``validation_alias``:

    ================  ==========================
    Field             Environment variable
    ================  ==========================
    google_api_key    GOOGLE_API_KEY
    model             MODEL_NAME
    voice             GEMILIVE_VOICE
    system_prompt     GEMILIVE_SYSTEM_PROMPT
    debug_mode        GEMILIVE_DEBUG
    ================  ==========================

    Note that only ``voice``, ``system_prompt`` and ``debug_mode`` use the
    ``GEMILIVE_`` prefix; the API key and the model name do not.
    """

    # Defaults to an empty string when GOOGLE_API_KEY is unset; no validation
    # of the key is performed in this class.
    google_api_key: str = Field(default="", validation_alias="GOOGLE_API_KEY")
    model: str = Field(
        default="gemini-3.1-flash-live-preview",
        validation_alias="MODEL_NAME",
    )
    voice: Optional[str] = Field(
        default=None,
        validation_alias="GEMILIVE_VOICE",
        description="Gemini voice name (e.g. 'Aoede', 'Charon'). Omit to use Gemini default.",
    )
    system_prompt: Optional[str] = Field(
        default=None,
        validation_alias="GEMILIVE_SYSTEM_PROMPT",
        description="Server-side system prompt baked in at mount time.",
    )
    debug_mode: bool = Field(default=False, validation_alias="GEMILIVE_DEBUG")

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        # Unrelated variables found in the environment / .env are ignored
        # instead of raising a validation error.
        extra="ignore",
        # Allows fields to be populated by their Python names in addition to
        # the aliases above.
        populate_by_name=True,
    )
@@ -0,0 +1,69 @@
1
+ from typing import List, Optional
2
+
3
+ from fastapi import FastAPI
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+
6
+ from .config import GemiliveSettings
7
+ from .router import create_router
8
+
9
+
10
def mount_gemilive(
    app: FastAPI,
    *,
    google_api_key: Optional[str] = None,
    model: Optional[str] = None,
    voice: Optional[str] = None,
    system_prompt: Optional[str] = None,
    allow_origins: Optional[List[str]] = None,
    debug_mode: bool = False,
) -> None:
    """
    Mount the Gemini Live AI WebSocket endpoint onto a FastAPI app.

    All keyword arguments are optional overrides. When omitted, values are
    read from environment variables (GOOGLE_API_KEY, MODEL_NAME, etc.) via
    ``GemiliveSettings``.

    Args:
        app:            The FastAPI application instance. It is modified in
                        place: a CORS middleware is added and the gemilive
                        router is included.
        google_api_key: Gemini API key. Falls back to GOOGLE_API_KEY env var.
        model:          Gemini Live model name. Falls back to MODEL_NAME env var.
        voice:          Gemini voice (e.g. "Aoede", "Charon"). None = value
                        from GEMILIVE_VOICE, or the Gemini default.
        system_prompt:  Server-side system prompt baked in at mount time.
                        Client-supplied prompts are appended as extra context.
        allow_origins:  CORS origins to allow. ``None`` (the default) means
                        ``["*"]``. Restrict in production.
        debug_mode:     Print verbose logs. Passing True forces debug on;
                        passing False (the default) leaves the GEMILIVE_DEBUG
                        env value in effect.

    Returns:
        None.

    Example::

        from fastapi import FastAPI
        from gemilive import mount_gemilive

        app = FastAPI()
        mount_gemilive(app, system_prompt="You are a helpful assistant.")
    """
    # A None sentinel replaces the former mutable default list; a private
    # copy is taken so the caller's list is never shared with the middleware.
    origins = ["*"] if allow_origins is None else list(allow_origins)

    # Collect only the overrides that were explicitly supplied, then apply
    # them on top of the env-derived settings in a single copy.
    overrides = {
        name: value
        for name, value in (
            ("google_api_key", google_api_key),
            ("model", model),
            ("voice", voice),
            ("system_prompt", system_prompt),
        )
        if value is not None
    }
    if debug_mode:
        overrides["debug_mode"] = True

    settings = GemiliveSettings()
    if overrides:
        settings = settings.model_copy(update=overrides)

    # The middleware is installed on the whole host app, not only on the
    # gemilive route. Credentialed CORS is therefore enabled only when the
    # caller listed explicit origins: combining a wildcard origin with
    # credentials would let any site make cookie-bearing requests to every
    # HTTP route of the host application.
    # NOTE(review): browsers do not apply CORS checks to WebSocket
    # handshakes, so this middleware presumably matters only for the host
    # app's HTTP routes — confirm it is still wanted here.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=origins,
        allow_credentials="*" not in origins,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Register the WebSocket router
    router = create_router(settings)
    app.include_router(router)
@@ -0,0 +1 @@
1
+ # PEP 561 marker file
@@ -0,0 +1,147 @@
1
+ import asyncio
2
+ import base64
3
+ import traceback
4
+
5
+ from fastapi import APIRouter, WebSocket, WebSocketDisconnect, status
6
+ from google import genai
7
+ from google.genai import types
8
+
9
+ from .config import GemiliveSettings
10
+
11
+
12
def create_router(settings: GemiliveSettings) -> APIRouter:
    """
    Build and return the FastAPI router for the /ws/live endpoint.
    Called once at mount time with a resolved config.

    Wire protocol, as implemented below:

    * First client message: JSON whose optional ``setup.system_prompt`` is
      merged with the server-side prompt.
    * Later client messages: either ``{"text": ...}`` (sent to Gemini as a
      complete user turn) or ``{"realtimeInput": {"mediaChunks": [...]}}``
      where each chunk has ``mimeType`` and base64 ``data``.
    * Server messages: ``{"type": "audio", "mimeType": "audio/pcm", "data":
      <base64>}``, ``{"type": "text", "text": ...}`` and
      ``{"type": "turn_complete"}``.

    Args:
        settings: Resolved gemilive configuration (API key, model, voice,
            system prompt, debug flag).

    Returns:
        An ``APIRouter`` with a single WebSocket route at ``/ws/live``.
    """
    router = APIRouter()

    # One client is created per router and shared by every connection.
    _client = genai.Client(
        api_key=settings.google_api_key,
        http_options={"api_version": "v1beta"},
    )

    # Ensure model name has the required "models/" prefix
    _model = settings.model if settings.model.startswith("models/") else f"models/{settings.model}"

    async def _receive_from_client(ws: WebSocket, session) -> None:
        """Forward browser messages (text or media chunks) to the Gemini session."""
        try:
            while True:
                data = await ws.receive_json()

                if "text" in data:
                    if settings.debug_mode:
                        print(f"[gemilive] text → Gemini: {data['text']}")
                    await session.send_client_content(
                        turns=types.Content(
                            role="user",
                            parts=[types.Part.from_text(text=data["text"])],
                        ),
                        turn_complete=True,
                    )

                elif "realtimeInput" in data:
                    for chunk in data["realtimeInput"].get("mediaChunks", []):
                        mime_type = chunk.get("mimeType")
                        b64_data = chunk.get("data")
                        if not mime_type or not b64_data:
                            continue
                        try:
                            decoded = base64.b64decode(b64_data)
                        except (ValueError, TypeError) as e:
                            # binascii.Error is a ValueError subclass. A single
                            # malformed chunk is dropped instead of ending the
                            # whole streaming session.
                            if settings.debug_mode:
                                print(f"[gemilive] skipping undecodable media chunk: {e}")
                            continue
                        if mime_type.startswith("audio/"):
                            # Audio is always forwarded as "audio/pcm",
                            # whatever audio/* subtype the client declared.
                            await session.send_realtime_input(
                                audio=types.Blob(mime_type="audio/pcm", data=decoded)
                            )
                        elif mime_type.startswith("image/") or mime_type.startswith("video/"):
                            await session.send_realtime_input(
                                video=types.Blob(mime_type=mime_type, data=decoded)
                            )

        except WebSocketDisconnect:
            # Normal termination: the browser went away.
            pass
        except Exception as e:
            if settings.debug_mode:
                print(f"[gemilive] client receive error: {e}")
                traceback.print_exc()

    async def _receive_from_gemini(ws: WebSocket, session) -> None:
        """Relay Gemini responses (audio and text) back to the browser."""
        try:
            while True:
                turn = session.receive()
                async for response in turn:
                    # Session-resumption bookkeeping is not forwarded.
                    if response.session_resumption_update is not None:
                        continue
                    if getattr(response, "data", None):
                        await ws.send_json({
                            "type": "audio",
                            "mimeType": "audio/pcm",
                            "data": base64.b64encode(response.data).decode("utf-8"),
                        })
                    if getattr(response, "text", None):
                        await ws.send_json({"type": "text", "text": response.text})
                # The async iterator finishing marks the end of one turn.
                await ws.send_json({"type": "turn_complete"})

        except asyncio.CancelledError:
            pass
        except Exception as e:
            if settings.debug_mode:
                print(f"[gemilive] Gemini receive error: {e}")
                traceback.print_exc()

    @router.websocket("/ws/live")
    async def live_endpoint(websocket: WebSocket) -> None:
        """Accept a browser connection and bridge it to a Gemini Live session."""
        await websocket.accept()

        # --- Setup phase: read system prompt from client, merge with server-side ---
        try:
            setup_msg = await websocket.receive_json()
            client_prompt = setup_msg.get("setup", {}).get("system_prompt", "")

            # Server-side prompt wins; client prompt is appended as extra context
            if settings.system_prompt and client_prompt:
                full_prompt = f"{settings.system_prompt}\n\nAdditional context: {client_prompt}"
            else:
                full_prompt = settings.system_prompt or client_prompt or ""

        except Exception as e:
            if settings.debug_mode:
                print(f"[gemilive] setup error: {e}")
            await websocket.close(code=status.WS_1003_UNSUPPORTED_DATA)
            return

        # --- Build Gemini session config ---
        speech_config = None
        if settings.voice:
            speech_config = types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=settings.voice)
                )
            )

        # Optional keys are only passed when they carry a value.
        config = types.LiveConnectConfig(
            response_modalities=[types.Modality.AUDIO],
            **({"system_instruction": types.Content(parts=[types.Part.from_text(text=full_prompt)])} if full_prompt else {}),
            **({"speech_config": speech_config} if speech_config else {}),
        )

        # --- Connect to Gemini and run bidirectional streaming ---
        try:
            print(f"[gemilive] connecting → {_model}")
            async with _client.aio.live.connect(model=_model, config=config) as session:
                print("[gemilive] connected")
                client_task = asyncio.create_task(_receive_from_client(websocket, session))
                gemini_task = asyncio.create_task(_receive_from_gemini(websocket, session))
                tasks = [client_task, gemini_task]

                try:
                    # Whichever direction ends first ends the whole bridge.
                    await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                finally:
                    # cancel() only *requests* cancellation, so the tasks are
                    # awaited as well: neither may still be using `session`
                    # or `websocket` once the `async with` block exits. The
                    # `finally` also covers this handler being cancelled.
                    for task in tasks:
                        if not task.done():
                            task.cancel()
                    await asyncio.gather(*tasks, return_exceptions=True)

        except Exception as e:
            if settings.debug_mode:
                print(f"[gemilive] Gemini connection error: {e}")
            if websocket.client_state.name == "CONNECTED":
                await websocket.close(code=status.WS_1011_INTERNAL_ERROR)

    return router
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: gemilive
3
+ Version: 0.1.0
4
+ Summary: Minimal Gemini Live AI WebSocket proxy for FastAPI — plug and play voice + video AI.
5
+ License-Expression: MIT
6
+ Keywords: gemini,live,ai,voice,video,fastapi,websocket
7
+ Requires-Python: >=3.14
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: fastapi[standard]>=0.115.0
11
+ Requires-Dist: google-genai>=1.70.0
12
+ Requires-Dist: pydantic-settings>=2.13.1
13
+ Requires-Dist: python-dotenv>=1.2.2
14
+ Requires-Dist: websockets>=16.0
15
+ Dynamic: license-file
16
+
17
+ # gemilive
18
+
19
+ Plug-and-play **Gemini Live AI** (voice + video) for your FastAPI app.
20
+
21
+ `gemilive` provides a seamless bridge between a web-based frontend and Google's Gemini Multimodal Live API. It handles the heavy lifting of WebSockets, bidirectional audio streams (16kHz up / 24kHz down), gapless browser PCM playback, and live video framing — allowing you to add conversational AI to your project in just **six lines of code**.
22
+
23
+ This repo contains both the Python backend plugin (`gemilive`) and the companion JavaScript client (`gemilive-js`).
24
+
25
+ ## 🚀 Features
26
+
27
+ - **Bidirectional Voice AI**: Real-time PCM audio streaming for natural, fluid conversations. No laggy turn-by-turn.
28
+ - **Multimodal Vision**: The AI can see what your camera sees via 1fps JPEG snapshots.
29
+ - **Zero-Boilerplate Backend**: Just wrap your existing FastAPI app with `mount_gemilive()`.
30
+ - **Lightweight JS SDK**: A clean browser `GemiliveClient` handling media capture and resampling.
31
+ - **Toggleable Media**: Turn your camera off/on mid-session seamlessly.
32
+
33
+ ---
34
+
35
+ ## 🛠️ Installation & Quickstart
36
+
37
+ Integration requires two pieces: the Python server endpoint and the JavaScript browser client.
38
+
39
+ ### Backend (Python)
40
+
41
+ Install the pip package:
42
+ ```bash
43
+ uv add gemilive
44
+ # or pip install gemilive
45
+ ```
46
+
47
+ Setup requires an API key. You can provide it in code or grab it from your `.env`:
48
+ ```env
49
+ GOOGLE_API_KEY=your_gemini_api_key_here
50
+ MODEL_NAME=gemini-3.1-flash-live-preview
51
+ ```
52
+
53
+ Mount it into any FastAPI app:
54
+ ```python
55
+ from fastapi import FastAPI
56
+ from gemilive import mount_gemilive
57
+
58
+ app = FastAPI()
59
+
60
+ # Mounts the WebSocket route at /ws/live
61
+ mount_gemilive(app, system_prompt="You are a helpful assistant. Keep answers brief.")
62
+ ```
63
+
64
+ ### Frontend (JavaScript)
65
+
66
+ Install the npm package:
67
+ ```bash
68
+ npm install gemilive-js
69
+ ```
70
+ *Or use via CDN in plain HTML:*
71
+ ```html
72
+ <script src="https://cdn.jsdelivr.net/npm/gemilive-js/dist/gemilive.min.js"></script>
73
+ ```
74
+
75
+ Initialize the client, connect, and start talking:
76
+ ```javascript
77
+ import { GemiliveClient } from 'gemilive-js';
78
+
79
+ // Point it to your FastAPI server's mount path
80
+ const client = new GemiliveClient("ws://localhost:8000/ws/live");
81
+
82
+ client.onMessage = (text) => console.log("Gemini:", text);
83
+ client.onError = (err) => console.error("Error:", err);
84
+
85
+ // Start the connection (prompts user for Mic & Camera)
86
+ await client.start();
87
+
88
+ // Disable video mid-session (audio continues)
89
+ // client.toggleVideo(false);
90
+
91
+ // Stop and disconnect
92
+ // client.stop();
93
+ ```
94
+
95
+ ---
96
+
97
+ ## ⚙️ Advanced Configuration
98
+
99
+ ### Python `mount_gemilive()` Overrides
100
+ You can override environment variables dynamically when mounting the API:
101
+
102
+ ```python
103
+ mount_gemilive(
104
+ app,
105
+ google_api_key="...", # Overrides GOOGLE_API_KEY env
106
+ model="gemini-3.1-flash-live-preview",# Overrides MODEL_NAME env
107
+ voice="Aoede", # Optional Gemini Voice ("Aoede", "Charon", etc.)
108
+ allow_origins=["https://myapp.com"], # Essential if your frontend is on a different domain
109
+ debug_mode=True # Console logging of message flow
110
+ )
111
+ ```
112
+
113
+ ### The System Prompt
114
+ You can set system prompts on the **server-side** (via `mount_gemilive`) or the **client-side** (via `new GemiliveClient(url, { systemPrompt: "..." })`).
115
+ If both are provided, the server-side prompt takes precedence, and the client-side prompt is appended securely as "Additional context".
116
+
117
+ ---
118
+
119
+ ## 📂 Project Structure (For Contributors)
120
+
121
+ `gemilive` is developed as a monorepo containing two packages:
122
+
123
+ ```text
124
+ ├── gemilive/ # PyPI package source
125
+ │ ├── mount.py # Public FastAPI installer
126
+ │ ├── config.py # Pydantic env validation
127
+ │ └── router.py # Internal WebSocket / GenAI flow
128
+ ├── gemilive-js/ # npm package source
129
+ │ ├── src/index.js # Browser SDK (Web Audio API logic)
130
+ │ └── package.json
131
+ └── main.py # Sandbox FastAPI app for testing and local dev
132
+ ```
133
+
134
+ For guidelines on local development and how to publish to PyPI and npm, read `PUBLISHING.md`.
135
+
136
+ ---
137
+
138
+ ## ⚠️ Important Considerations
139
+
140
+ 1. **Browser Security**: Browsers restrict microphone/camera access to secure contexts. `getUserMedia` requires **HTTPS** in production. `localhost` works for development.
141
+ 2. **Audio Resampling**: Browsers typically record audio at 44.1kHz or 48kHz. The `gemilive-js` SDK seamlessly resamples microphone input to **16kHz PCM** to meet Gemini's strict API requirements. Responses from Gemini are returned as 24kHz PCM and played back gaplessly using JavaScript time-scheduling.
142
+
143
+ ---
144
+
145
+ ## 📄 License
146
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ gemilive/__init__.py
5
+ gemilive/config.py
6
+ gemilive/mount.py
7
+ gemilive/py.typed
8
+ gemilive/router.py
9
+ gemilive.egg-info/PKG-INFO
10
+ gemilive.egg-info/SOURCES.txt
11
+ gemilive.egg-info/dependency_links.txt
12
+ gemilive.egg-info/requires.txt
13
+ gemilive.egg-info/top_level.txt
@@ -0,0 +1,5 @@
1
+ fastapi[standard]>=0.115.0
2
+ google-genai>=1.70.0
3
+ pydantic-settings>=2.13.1
4
+ python-dotenv>=1.2.2
5
+ websockets>=16.0
@@ -0,0 +1,2 @@
1
+ gemilive
2
+ gemilive-js
@@ -0,0 +1,18 @@
1
+ [project]
2
+ name = "gemilive"
3
+ version = "0.1.0"
4
+ description = "Minimal Gemini Live AI WebSocket proxy for FastAPI — plug and play voice + video AI."
5
+ readme = "README.md"
6
+ requires-python = ">=3.14"
7
+ license = "MIT"
8
+ keywords = ["gemini", "live", "ai", "voice", "video", "fastapi", "websocket"]
9
+ dependencies = [
10
+ "fastapi[standard]>=0.115.0",
11
+ "google-genai>=1.70.0",
12
+ "pydantic-settings>=2.13.1",
13
+ "python-dotenv>=1.2.2",
14
+ "websockets>=16.0",
15
+ ]
16
+
17
# Match only the Python package and its subpackages. The former glob
# "gemilive*" also matched the sibling "gemilive-js" directory (the npm
# client), which then appeared as a top-level package in the built metadata.
[tool.setuptools.packages.find]
include = ["gemilive", "gemilive.*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+