vllmd-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
src/vllmd/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """vllmd — run local models via vLLM in Docker containers."""
+
+ from .runner import RunConfig, logs, start, status, stop
+
+ __all__ = ["RunConfig", "logs", "start", "status", "stop"]
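
The package exposes this small Python API alongside the CLI. A minimal foreground-usage sketch, assuming a hypothetical local model directory at ./models/my-model:

    from pathlib import Path

    from vllmd import RunConfig, start

    # Hypothetical model directory; any local vLLM-compatible model dir works.
    config = RunConfig(model_path=Path("./models/my-model"), port=8001)
    start(config)  # blocks, serving http://localhost:8001 until interrupted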
src/vllmd/cli.py ADDED
@@ -0,0 +1,292 @@
+ """CLI entry point."""
+
+ from __future__ import annotations
+
+ import subprocess
+ from pathlib import Path
+ from typing import Annotated
+
+ import typer
+ from rich.console import Console
+ from rich.table import Table
+
+ from .runner import (
+     RunConfig,
+     build_docker_run_cmd,
+     list_containers,
+     logs,
+     status,
+     stop,
+     stop_all,
+     wait_ready,
+ )
+ from .sessions.cli import session_app
+ from .vectordb.cli import db_app
+
+ app = typer.Typer(
+     name="vllmd",
+     help="Run local models via vLLM in Docker containers.",
+     no_args_is_help=True,
+ )
+ app.add_typer(db_app, name="db")
+ app.add_typer(session_app, name="session")
+ console = Console()
+
+ _NAME_HELP = "Container name (default: vllmd-<model-dir-name>)"
+
+
+ @app.command()
+ def run(
+     model: Annotated[
+         Path,
+         typer.Option("--model", "-m", help="Path to the model directory on disk"),
+     ],
+     port: Annotated[
+         int,
+         typer.Option("--port", "-p", help="Host port to expose the vLLM API on"),
+     ] = 8000,
+     name: Annotated[
+         str | None,
+         typer.Option("--name", "-n", help=_NAME_HELP),
+     ] = None,
+     gpu: Annotated[
+         bool,
+         typer.Option("--gpu/--no-gpu", help="Pass --gpus all to the container"),
+     ] = True,
+     dtype: Annotated[
+         str,
+         typer.Option("--dtype", help="Model dtype (auto, float16, bfloat16, float32)"),
+     ] = "auto",
+     max_model_len: Annotated[
+         int | None,
+         typer.Option("--max-model-len", help="Override max context length"),
+     ] = None,
+     detach: Annotated[
+         bool,
+         typer.Option(
+             "--detach",
+             "-d",
+             help="Start container in background; wait for API ready, then return.",
+         ),
+     ] = False,
+     wait: Annotated[
+         bool,
+         typer.Option(
+             "--wait/--no-wait",
+             help="With --detach: wait for the API to be ready before returning.",
+         ),
+     ] = True,
+     extra: Annotated[
+         list[str] | None,
+         typer.Argument(help="Extra args forwarded verbatim to vLLM"),
+     ] = None,
+ ) -> None:
+     """Start a vLLM container serving MODEL on PORT."""
+     config = RunConfig(
+         model_path=model,
+         port=port,
+         name=name,
+         gpu=gpu,
+         dtype=dtype,
+         max_model_len=max_model_len,
+         extra_args=extra or [],
+     )
+
+     model_path = model.resolve()
+     console.print(f"[bold]Starting vLLM container[/bold] '{config.container_name}'")
+     console.print(f" Model: {model_path}")
+     console.print(f" Model ID: {config.model_id}")
+     console.print(f" Port: {port}")
+     console.print(f" GPU: {'yes' if gpu else 'no'}")
+     console.print()
+
+     docker_cmd = build_docker_run_cmd(config)
+
+     if detach:
+         subprocess.Popen(
+             docker_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+         )
+         if wait:
+             console.print("[dim]Waiting for API to become ready…[/dim]")
+             if wait_ready(config):
+                 console.print(f"[green]Ready.[/green] Endpoint: {config.endpoint}")
+                 console.print(f" Model ID: [bold]{config.model_id}[/bold]")
+             else:
+                 console.print(
+                     "[yellow]Timed out waiting for API. "
+                     "Container may still be loading.[/yellow]"
+                 )
+                 console.print(f" Endpoint: {config.endpoint}")
+         else:
+             console.print(f"Container started. Endpoint: {config.endpoint}")
+     else:
+         try:
+             subprocess.run(docker_cmd, check=True)
+         except FileNotFoundError as e:
+             console.print(f"[red]{e}[/red]")
+             raise typer.Exit(1) from e
+         except subprocess.CalledProcessError as e:
+             console.print(f"[red]Docker exited with code {e.returncode}[/red]")
+             raise typer.Exit(e.returncode) from e
+
+
+ @app.command(name="stop")
+ def stop_cmd(
+     name: Annotated[
+         str | None,
+         typer.Option("--name", "-n", help="Container name to stop"),
+     ] = None,
+     all_containers: Annotated[
+         bool,
+         typer.Option("--all", "-a", help="Stop all vllmd-managed containers"),
+     ] = False,
+ ) -> None:
+     """Stop one or all vllmd containers."""
+     if all_containers:
+         stopped = stop_all()
+         if stopped:
+             for n in stopped:
+                 console.print(f"[green]Stopped '{n}'.[/green]")
+         else:
+             console.print("[yellow]No running vllmd containers found.[/yellow]")
+         return
+
+     if name is None:
+         running = list_containers()
+         if len(running) == 1:
+             name = running[0]["name"]
+         elif len(running) == 0:
+             console.print("[yellow]No running vllmd containers found.[/yellow]")
+             raise typer.Exit(1)
+         else:
+             console.print(
+                 "[red]Multiple containers running — specify --name or use --all:[/red]"
+             )
+             for c in running:
+                 console.print(f" {c['name']}")
+             raise typer.Exit(1)
+
+     try:
+         stop(name)
+         console.print(f"[green]Stopped '{name}'.[/green]")
+     except RuntimeError as e:
+         console.print(f"[red]{e}[/red]")
+         raise typer.Exit(1) from e
+     except subprocess.CalledProcessError as e:
+         console.print(f"[red]Docker exited with code {e.returncode}[/red]")
+         raise typer.Exit(e.returncode) from e
+
+
+ @app.command(name="ps")
+ def ps_cmd() -> None:
+     """List all running vllmd-managed containers."""
+     containers = list_containers()
+     if not containers:
+         console.print("[dim]No vllmd containers running.[/dim]")
+         return
+
+     table = Table("Name", "Model", "Port", "Endpoint", "Status", show_header=True)
+     for c in containers:
+         table.add_row(
+             c["name"],
+             c["model_id"],
+             str(c["port"] or "?"),
+             c["endpoint"],
+             c["status"],
+         )
+     console.print(table)
+
+
+ @app.command(name="status")
+ def status_cmd(
+     name: Annotated[
+         str | None,
+         typer.Option("--name", "-n", help="Container name (omit to show all)"),
+     ] = None,
+ ) -> None:
+     """Show container and API status. Omit --name to show all containers."""
+     if name is None:
+         # Show summary table for all managed containers
+         containers = list_containers()
+         if not containers:
+             console.print("[dim]No vllmd containers running.[/dim]")
+             raise typer.Exit(1)
+
+         table = Table("Name", "Model", "Port", "API", "Status", show_header=True)
+         all_healthy = True
+         for c in containers:
+             info = status(c["name"])
+             api_str = (
+                 "[green]healthy[/green]"
+                 if info["api_healthy"]
+                 else "[yellow]unreachable[/yellow]"
+             )
+             if not info["api_healthy"]:
+                 all_healthy = False
+             table.add_row(
+                 c["name"],
+                 c["model_id"],
+                 str(c["port"] or "?"),
+                 api_str,
+                 c["status"],
+             )
+         console.print(table)
+         if not all_healthy:
+             raise typer.Exit(1)
+         return
+
+     info = status(name)
+
+     table = Table(show_header=False, box=None, padding=(0, 2))
+     table.add_column("Key", style="bold")
+     table.add_column("Value")
+
+     running_str = "[green]running[/green]" if info["running"] else "[red]stopped[/red]"
+     api_str = (
+         "[green]healthy[/green]"
+         if info["api_healthy"]
+         else "[yellow]unreachable[/yellow]"
+     )
+
+     table.add_row("Container", running_str)
+     table.add_row("API", api_str)
+     if info["container"]:
+         table.add_row("Started", info["container"].get("StartedAt", "—"))
+
+     console.print(table)
+     if not info["running"]:
+         raise typer.Exit(1)
+
+
+ @app.command(name="logs")
+ def logs_cmd(
+     name: Annotated[
+         str | None,
+         typer.Option(
+             "--name", "-n", help="Container name (auto-resolved if only one is running)"
+         ),
+     ] = None,
+     follow: Annotated[
+         bool,
+         typer.Option("--follow", "-f", help="Follow log output"),
+     ] = False,
+ ) -> None:
+     """Print logs from a vllmd container."""
+     if name is None:
+         running = list_containers()
+         if len(running) == 1:
+             name = running[0]["name"]
+         elif len(running) == 0:
+             console.print("[yellow]No running vllmd containers found.[/yellow]")
+             raise typer.Exit(1)
+         else:
+             console.print("[red]Multiple containers running — specify --name:[/red]")
+             for c in running:
+                 console.print(f" {c['name']}")
+             raise typer.Exit(1)
+
+     try:
+         logs(name, follow=follow)
+     except subprocess.CalledProcessError as e:
+         console.print(f"[red]Docker exited with code {e.returncode}[/red]")
+         raise typer.Exit(e.returncode) from e
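
The --detach branch above is essentially a thin wrapper over the runner primitives. A rough sketch of the equivalent Python, using only functions shown in this diff (the model path is a hypothetical example):

    import subprocess
    from pathlib import Path

    from vllmd.runner import RunConfig, build_docker_run_cmd, wait_ready

    config = RunConfig(model_path=Path("./models/my-model"), port=8000)
    # Launch docker in the background, discarding its output.
    subprocess.Popen(
        build_docker_run_cmd(config),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    if wait_ready(config):  # polls GET /v1/models for up to HEALTH_TIMEOUT seconds
        print(f"Ready at {config.endpoint} as model '{config.model_id}'")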
src/vllmd/runner.py ADDED
@@ -0,0 +1,239 @@
+ """Docker management for vLLM model containers."""
+
+ from __future__ import annotations
+
+ import contextlib
+ import json
+ import re
+ import subprocess
+ import time
+ import urllib.request
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ VLLM_IMAGE = "vllm/vllm-openai:latest"
+ HEALTH_TIMEOUT = 300
+ HEALTH_INTERVAL = 3
+
+ MANAGED_LABEL = "com.vllmd.managed"
+ MODEL_LABEL = "com.vllmd.model"
+ MODEL_PATH_LABEL = "com.vllmd.model_path"
+
+
+ @dataclass
+ class RunConfig:
+     model_path: Path
+     port: int = 8000
+     name: str | None = None  # None → derived from model dir name
+     gpu: bool = True
+     dtype: str = "auto"
+     max_model_len: int | None = None
+     extra_args: list[str] = field(default_factory=list)
+
+     @property
+     def model_id(self) -> str:
+         return self.model_path.resolve().name
+
+     @property
+     def container_name(self) -> str:
+         return self.name or f"vllmd-{self.model_id}"
+
+     @property
+     def endpoint(self) -> str:
+         return f"http://localhost:{self.port}"
+
+
+ def _docker(*args: str, capture: bool = False) -> subprocess.CompletedProcess:
+     return subprocess.run(
+         ["docker", *args],
+         check=True,
+         capture_output=capture,
+         text=True,
+     )
+
+
+ def _container_exists(name: str) -> bool:
+     result = subprocess.run(
+         ["docker", "ps", "-a", "--filter", f"name=^{name}$", "--format", "{{.Names}}"],
+         capture_output=True,
+         text=True,
+     )
+     return name in result.stdout.splitlines()
+
+
+ def _wait_ready(endpoint: str, timeout: int = HEALTH_TIMEOUT) -> bool:
+     deadline = time.monotonic() + timeout
+     while time.monotonic() < deadline:
+         try:
+             with urllib.request.urlopen(f"{endpoint}/v1/models", timeout=5):
+                 return True
+         except Exception:
+             time.sleep(HEALTH_INTERVAL)
+     return False
+
+
+ def _parse_host_port(ports_str: str) -> int | None:
+     """Extract the host port from a Ports string like '0.0.0.0:8001->8000/tcp'."""
+     m = re.search(r":(\d+)->8000", ports_str)
+     return int(m.group(1)) if m else None
+
+
+ def _parse_labels(labels_str: str) -> dict[str, str]:
+     """Parse Docker's comma-separated 'key=value,key=value' label string."""
+     result: dict[str, str] = {}
+     for part in labels_str.split(","):
+         part = part.strip()
+         if "=" in part:
+             k, _, v = part.partition("=")
+             result[k.strip()] = v.strip()
+     return result
+
+
+ def build_docker_run_cmd(config: RunConfig) -> list[str]:
+     model_path = config.model_path.resolve()
+     cmd = [
+         "docker",
+         "run",
+         "--rm",
+         "--name",
+         config.container_name,
+         "-p",
+         f"{config.port}:8000",
+         "-v",
+         f"{model_path}:/model:ro",
+         f"--label={MANAGED_LABEL}=true",
+         f"--label={MODEL_LABEL}={config.model_id}",
+         f"--label={MODEL_PATH_LABEL}={model_path}",
+     ]
+     if config.gpu:
+         cmd += ["--gpus", "all"]
+     cmd += [
+         VLLM_IMAGE,
+         "--model",
+         "/model",
+         "--served-model-name",
+         config.model_id,
+         "--dtype",
+         config.dtype,
+         "--host",
+         "0.0.0.0",
+         "--port",
+         "8000",
+     ]
+     if config.max_model_len is not None:
+         cmd += ["--max-model-len", str(config.max_model_len)]
+     cmd += config.extra_args
+     return cmd
+
+
+ def start(config: RunConfig) -> None:
+     """Start a vLLM container (foreground, blocking)."""
+     model_path = config.model_path.resolve()
+     if not model_path.exists():
+         raise FileNotFoundError(f"Model path not found: {model_path}")
+     if _container_exists(config.container_name):
+         raise RuntimeError(
+             f"Container '{config.container_name}' already exists. "
+             "Stop it first or use a different --name."
+         )
+     subprocess.run(build_docker_run_cmd(config), check=True)
+
+
+ def stop(name: str) -> None:
+     """Stop and remove a named container."""
+     if not _container_exists(name):
+         raise RuntimeError(f"No container named '{name}' found.")
+     _docker("stop", name)
+
+
+ def stop_all() -> list[str]:
+     """Stop all vllmd-managed containers. Returns list of stopped names."""
+     containers = list_containers()
+     stopped = []
+     for c in containers:
+         _docker("stop", c["name"])
+         stopped.append(c["name"])
+     return stopped
+
+
+ def list_containers() -> list[dict]:
+     """Return info for all running vllmd-managed containers."""
+     result = subprocess.run(
+         [
+             "docker",
+             "ps",
+             "--filter",
+             f"label={MANAGED_LABEL}=true",
+             "--format",
+             "{{json .}}",
+         ],
+         capture_output=True,
+         text=True,
+     )
+     containers = []
+     for line in result.stdout.strip().splitlines():
+         if not line:
+             continue
+         data = json.loads(line)
+         labels = _parse_labels(data.get("Labels", ""))
+         host_port = _parse_host_port(data.get("Ports", ""))
+         model_id = labels.get(MODEL_LABEL, "?")
+         model_path = labels.get(MODEL_PATH_LABEL, "?")
+         containers.append(
+             {
+                 "name": data["Names"],
+                 "model_id": model_id,
+                 "model_path": model_path,
+                 "port": host_port,
+                 "endpoint": f"http://localhost:{host_port}" if host_port else "?",
+                 "status": data.get("Status", "?"),
+             }
+         )
+     return containers
+
+
+ def status(name: str) -> dict:
+     """Return container state and API health for a named container."""
+     result = subprocess.run(
+         ["docker", "inspect", name, "--format", "{{json .State}}"],
+         capture_output=True,
+         text=True,
+     )
+     if result.returncode != 0:
+         return {"running": False, "api_healthy": False, "container": None}
+
+     state = json.loads(result.stdout.strip())
+     running = state.get("Running", False)
+     api_healthy = False
+
+     if running:
+         port_result = subprocess.run(
+             ["docker", "port", name, "8000"],
+             capture_output=True,
+             text=True,
+         )
+         port = 8000
+         if port_result.returncode == 0:
+             binding = port_result.stdout.strip().split(":")[-1]
+             with contextlib.suppress(ValueError):
+                 port = int(binding)
+         with contextlib.suppress(Exception):
+             url = f"http://localhost:{port}/v1/models"
+             with urllib.request.urlopen(url, timeout=3):
+                 api_healthy = True
+
+     return {"running": running, "api_healthy": api_healthy, "container": state}
+
+
+ def wait_ready(config: RunConfig) -> bool:
+     """Block until the vLLM API is reachable, or timeout."""
+     return _wait_ready(config.endpoint)
+
+
+ def logs(name: str, follow: bool = False) -> None:
+     """Stream or print container logs."""
+     cmd = ["logs"]
+     if follow:
+         cmd.append("-f")
+     cmd.append(name)
+     _docker(*cmd)
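
For reference, build_docker_run_cmd assembles a docker invocation like the following (a sketch for a hypothetical /models/my-model directory, GPU enabled, defaults otherwise):

    from pathlib import Path

    from vllmd.runner import RunConfig, build_docker_run_cmd

    cfg = RunConfig(model_path=Path("/models/my-model"), port=8001, max_model_len=8192)
    print(" ".join(build_docker_run_cmd(cfg)))
    # docker run --rm --name vllmd-my-model -p 8001:8000 \
    #   -v /models/my-model:/model:ro --label=com.vllmd.managed=true \
    #   --label=com.vllmd.model=my-model --label=com.vllmd.model_path=/models/my-model \
    #   --gpus all vllm/vllm-openai:latest --model /model --served-model-name my-model \
    #   --dtype auto --host 0.0.0.0 --port 8000 --max-model-len 8192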
src/vllmd/sessions/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Session management for persistent model conversations."""
+
+ from .session import Message, Session
+
+ __all__ = ["Message", "Session"]
src/vllmd/sessions/chat.py ADDED
@@ -0,0 +1,123 @@
+ """Chat function: assembles context and calls the vLLM completions endpoint."""
+
+ from __future__ import annotations
+
+ import json
+ import urllib.request
+ from pathlib import Path
+
+ from ..vectordb.embeddings import embed
+ from ..vectordb.store import COLLECTION_CODE, COLLECTION_DOCUMENTS, VectorStore
+ from .session import Session
+
+ MAX_HISTORY = 20
+ N_CONTEXT_CHUNKS = 3
+ MAX_TOKENS = 2048
+ TIMEOUT = 120
+
+
+ def chat(
+     session: Session,
+     user_message: str,
+     *,
+     max_history: int = MAX_HISTORY,
+     n_context: int = N_CONTEXT_CHUNKS,
+     max_tokens: int = MAX_TOKENS,
+ ) -> str:
+     """Send *user_message* within *session*, returning the assistant reply.
+
+     Retrieves semantic context from the session's vector store if an
+     embedding_model is configured. Falls back silently to history-only context
+     if the embedding endpoint is unavailable.
+     """
+     messages: list[dict] = []
+
+     if session.system_prompt:
+         messages.append({"role": "system", "content": session.system_prompt})
+
+     # Semantic context retrieval (skip entirely if no embedding model configured)
+     context_text = (
+         _retrieve_context(session, user_message, n_context)
+         if session.embedding_model
+         else ""
+     )
+     if context_text:
+         messages.append(
+             {"role": "system", "content": f"Relevant context:\n\n{context_text}"}
+         )
+
+     # Recent sequential history
+     for msg in session.messages[-max_history:]:
+         messages.append({"role": msg.role, "content": msg.content})
+
+     messages.append({"role": "user", "content": user_message})
+
+     response = _complete(session.endpoint, session.model_id, messages, max_tokens)
+
+     # Persist the exchange
+     from .session import Message
+
+     session.messages.append(Message(role="user", content=user_message))
+     session.messages.append(Message(role="assistant", content=response))
+
+     # Store embeddings in vector DB for future retrieval
+     if session.embedding_model:
+         _store_history(session, user_message, response)
+
+     return response
+
+
+ def retrieve_context(session: Session, query: str, n: int = N_CONTEXT_CHUNKS) -> str:
+     """Public helper to show what context would be injected for *query*."""
+     return _retrieve_context(session, query, n)
+
+
+ # ------------------------------------------------------------------
+ # Internals
+ # ------------------------------------------------------------------
+
+
+ def _retrieve_context(session: Session, query: str, n: int) -> str:
+     if not session.embedding_model:
+         return ""
+     try:
+         store = VectorStore(Path(session.db_path))
+         query_vec = embed(session.endpoint, session.embedding_model, [query])[0]
+         chunks: list[str] = []
+         for collection in (COLLECTION_DOCUMENTS, COLLECTION_CODE):
+             results = store.search(query_vec, collection, n_results=n)
+             chunks.extend(r["content"] for r in results)
+         if not chunks:
+             return ""
+         return "\n\n---\n\n".join(chunks[:n])
+     except Exception:
+         return ""
+
+
+ def _complete(
+     endpoint: str, model_id: str, messages: list[dict], max_tokens: int
+ ) -> str:
+     payload = json.dumps(
+         {"model": model_id, "messages": messages, "max_tokens": max_tokens}
+     ).encode()
+     req = urllib.request.Request(
+         f"{endpoint.rstrip('/')}/v1/chat/completions",
+         data=payload,
+         headers={"Content-Type": "application/json"},
+     )
+     with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
+         data = json.loads(resp.read())
+     return data["choices"][0]["message"]["content"]
+
+
+ def _store_history(session: Session, user_message: str, response: str) -> None:
+     try:
+         store = VectorStore(Path(session.db_path))
+
+         def embedder(texts: list[str]) -> list[list[float]]:
+             return embed(session.endpoint, session.embedding_model, texts)
+
+         store.add_history(session.id, "user", user_message, embedder)
+         store.add_history(session.id, "assistant", response, embedder)
+     except Exception:
+         pass
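
Putting the chat module together, a usage sketch (the Session class itself comes from session.py, which is not part of this diff, so its construction here is assumed):

    from vllmd.sessions import Session
    from vllmd.sessions.chat import chat, retrieve_context

    session = ...  # an existing Session with endpoint, model_id, and optionally
                   # embedding_model/db_path configured; constructor not shown here
    print(retrieve_context(session, "deployment notes"))  # preview injected context
    print(chat(session, "Summarize the last deployment notes."))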