vllmd-0.2.0-py3-none-any.whl
- src/vllmd/__init__.py +5 -0
- src/vllmd/cli.py +292 -0
- src/vllmd/runner.py +239 -0
- src/vllmd/sessions/__init__.py +5 -0
- src/vllmd/sessions/chat.py +123 -0
- src/vllmd/sessions/cli.py +367 -0
- src/vllmd/sessions/session.py +101 -0
- src/vllmd/vectordb/__init__.py +5 -0
- src/vllmd/vectordb/cli.py +223 -0
- src/vllmd/vectordb/embeddings.py +24 -0
- src/vllmd/vectordb/store.py +259 -0
- src/vllmd/vectordb/sync.py +24 -0
- vllmd-0.2.0.dist-info/METADATA +238 -0
- vllmd-0.2.0.dist-info/RECORD +17 -0
- vllmd-0.2.0.dist-info/WHEEL +4 -0
- vllmd-0.2.0.dist-info/entry_points.txt +2 -0
- vllmd-0.2.0.dist-info/licenses/LICENSE +21 -0
src/vllmd/__init__.py
ADDED
src/vllmd/cli.py
ADDED
@@ -0,0 +1,292 @@
"""CLI entry point."""

from __future__ import annotations

import subprocess
from pathlib import Path
from typing import Annotated

import typer
from rich.console import Console
from rich.table import Table

from .runner import (
    RunConfig,
    build_docker_run_cmd,
    list_containers,
    logs,
    status,
    stop,
    stop_all,
    wait_ready,
)
from .sessions.cli import session_app
from .vectordb.cli import db_app

app = typer.Typer(
    name="vllmd",
    help="Run local models via vLLM in Docker containers.",
    no_args_is_help=True,
)
app.add_typer(db_app, name="db")
app.add_typer(session_app, name="session")
console = Console()

_NAME_HELP = "Container name (default: vllmd-<model-dir-name>)"


@app.command()
def run(
    model: Annotated[
        Path,
        typer.Option("--model", "-m", help="Path to the model directory on disk"),
    ],
    port: Annotated[
        int,
        typer.Option("--port", "-p", help="Host port to expose the vLLM API on"),
    ] = 8000,
    name: Annotated[
        str | None,
        typer.Option("--name", "-n", help=_NAME_HELP),
    ] = None,
    gpu: Annotated[
        bool,
        typer.Option("--gpu/--no-gpu", help="Pass --gpus all to the container"),
    ] = True,
    dtype: Annotated[
        str,
        typer.Option("--dtype", help="Model dtype (auto, float16, bfloat16, float32)"),
    ] = "auto",
    max_model_len: Annotated[
        int | None,
        typer.Option("--max-model-len", help="Override max context length"),
    ] = None,
    detach: Annotated[
        bool,
        typer.Option(
            "--detach",
            "-d",
            help="Start container in background; wait for API ready, then return.",
        ),
    ] = False,
    wait: Annotated[
        bool,
        typer.Option(
            "--wait/--no-wait",
            help="With --detach: wait for the API to be ready before returning.",
        ),
    ] = True,
    extra: Annotated[
        list[str] | None,
        typer.Argument(help="Extra args forwarded verbatim to vLLM"),
    ] = None,
) -> None:
    """Start a vLLM container serving MODEL on PORT."""
    config = RunConfig(
        model_path=model,
        port=port,
        name=name,
        gpu=gpu,
        dtype=dtype,
        max_model_len=max_model_len,
        extra_args=extra or [],
    )

    model_path = model.resolve()
    console.print(f"[bold]Starting vLLM container[/bold] '{config.container_name}'")
    console.print(f" Model: {model_path}")
    console.print(f" Model ID: {config.model_id}")
    console.print(f" Port: {port}")
    console.print(f" GPU: {'yes' if gpu else 'no'}")
    console.print()

    docker_cmd = build_docker_run_cmd(config)

    if detach:
        subprocess.Popen(
            docker_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
        if wait:
            console.print("[dim]Waiting for API to become ready…[/dim]")
            if wait_ready(config):
                console.print(f"[green]Ready.[/green] Endpoint: {config.endpoint}")
                console.print(f" Model ID: [bold]{config.model_id}[/bold]")
            else:
                console.print(
                    "[yellow]Timed out waiting for API. "
                    "Container may still be loading.[/yellow]"
                )
                console.print(f" Endpoint: {config.endpoint}")
        else:
            console.print(f"Container started. Endpoint: {config.endpoint}")
    else:
        try:
            subprocess.run(docker_cmd, check=True)
        except FileNotFoundError as e:
            console.print(f"[red]{e}[/red]")
            raise typer.Exit(1) from e
        except subprocess.CalledProcessError as e:
            console.print(f"[red]Docker exited with code {e.returncode}[/red]")
            raise typer.Exit(e.returncode) from e


@app.command(name="stop")
def stop_cmd(
    name: Annotated[
        str | None,
        typer.Option("--name", "-n", help="Container name to stop"),
    ] = None,
    all_containers: Annotated[
        bool,
        typer.Option("--all", "-a", help="Stop all vllmd-managed containers"),
    ] = False,
) -> None:
    """Stop one or all vllmd containers."""
    if all_containers:
        stopped = stop_all()
        if stopped:
            for n in stopped:
                console.print(f"[green]Stopped '{n}'.[/green]")
        else:
            console.print("[yellow]No running vllmd containers found.[/yellow]")
        return

    if name is None:
        running = list_containers()
        if len(running) == 1:
            name = running[0]["name"]
        elif len(running) == 0:
            console.print("[yellow]No running vllmd containers found.[/yellow]")
            raise typer.Exit(1)
        else:
            console.print(
                "[red]Multiple containers running — specify --name or use --all:[/red]"
            )
            for c in running:
                console.print(f" {c['name']}")
            raise typer.Exit(1)

    try:
        stop(name)
        console.print(f"[green]Stopped '{name}'.[/green]")
    except RuntimeError as e:
        console.print(f"[red]{e}[/red]")
        raise typer.Exit(1) from e
    except subprocess.CalledProcessError as e:
        console.print(f"[red]Docker exited with code {e.returncode}[/red]")
        raise typer.Exit(e.returncode) from e


@app.command(name="ps")
def ps_cmd() -> None:
    """List all running vllmd-managed containers."""
    containers = list_containers()
    if not containers:
        console.print("[dim]No vllmd containers running.[/dim]")
        return

    table = Table("Name", "Model", "Port", "Endpoint", "Status", show_header=True)
    for c in containers:
        table.add_row(
            c["name"],
            c["model_id"],
            str(c["port"] or "?"),
            c["endpoint"],
            c["status"],
        )
    console.print(table)


@app.command(name="status")
def status_cmd(
    name: Annotated[
        str | None,
        typer.Option("--name", "-n", help="Container name (omit to show all)"),
    ] = None,
) -> None:
    """Show container and API status. Omit --name to show all containers."""
    if name is None:
        # Show summary table for all managed containers
        containers = list_containers()
        if not containers:
            console.print("[dim]No vllmd containers running.[/dim]")
            raise typer.Exit(1)

        table = Table("Name", "Model", "Port", "API", "Status", show_header=True)
        all_healthy = True
        for c in containers:
            info = status(c["name"])
            api_str = (
                "[green]healthy[/green]"
                if info["api_healthy"]
                else "[yellow]unreachable[/yellow]"
            )
            if not info["api_healthy"]:
                all_healthy = False
            table.add_row(
                c["name"],
                c["model_id"],
                str(c["port"] or "?"),
                api_str,
                c["status"],
            )
        console.print(table)
        if not all_healthy:
            raise typer.Exit(1)
        return

    info = status(name)

    table = Table(show_header=False, box=None, padding=(0, 2))
    table.add_column("Key", style="bold")
    table.add_column("Value")

    running_str = "[green]running[/green]" if info["running"] else "[red]stopped[/red]"
    api_str = (
        "[green]healthy[/green]"
        if info["api_healthy"]
        else "[yellow]unreachable[/yellow]"
    )

    table.add_row("Container", running_str)
    table.add_row("API", api_str)
    if info["container"]:
        table.add_row("Started", info["container"].get("StartedAt", "—"))

    console.print(table)
    if not info["running"]:
        raise typer.Exit(1)


@app.command(name="logs")
def logs_cmd(
    name: Annotated[
        str | None,
        typer.Option(
            "--name", "-n", help="Container name (auto-resolved if only one is running)"
        ),
    ] = None,
    follow: Annotated[
        bool,
        typer.Option("--follow", "-f", help="Follow log output"),
    ] = False,
) -> None:
    """Print logs from a vllmd container."""
    if name is None:
        running = list_containers()
        if len(running) == 1:
            name = running[0]["name"]
        elif len(running) == 0:
            console.print("[yellow]No running vllmd containers found.[/yellow]")
            raise typer.Exit(1)
        else:
            console.print("[red]Multiple containers running — specify --name:[/red]")
            for c in running:
                console.print(f" {c['name']}")
            raise typer.Exit(1)

    try:
        logs(name, follow=follow)
    except subprocess.CalledProcessError as e:
        console.print(f"[red]Docker exited with code {e.returncode}[/red]")
        raise typer.Exit(e.returncode) from e
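
A quick way to exercise the commands defined above is Typer's bundled test harness. The sketch below is illustrative and not part of the wheel: the app and its options come from src/vllmd/cli.py, the rest is scaffolding.

# Hypothetical smoke test for the CLI above (illustrative, not shipped).
from typer.testing import CliRunner

from vllmd.cli import app

runner = CliRunner()

# "ps" takes no arguments; with no managed containers it prints a dim notice.
result = runner.invoke(app, ["ps"])
print(result.output)

# "run --help" surfaces the options declared via Annotated/typer.Option.
result = runner.invoke(app, ["run", "--help"])
assert "--model" in result.output
assert "--detach" in result.output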
src/vllmd/runner.py
ADDED
@@ -0,0 +1,239 @@
"""Docker management for vLLM model containers."""

from __future__ import annotations

import contextlib
import json
import re
import subprocess
import time
import urllib.request
from dataclasses import dataclass, field
from pathlib import Path

VLLM_IMAGE = "vllm/vllm-openai:latest"
HEALTH_TIMEOUT = 300
HEALTH_INTERVAL = 3

MANAGED_LABEL = "com.vllmd.managed"
MODEL_LABEL = "com.vllmd.model"
MODEL_PATH_LABEL = "com.vllmd.model_path"


@dataclass
class RunConfig:
    model_path: Path
    port: int = 8000
    name: str | None = None  # None → derived from model dir name
    gpu: bool = True
    dtype: str = "auto"
    max_model_len: int | None = None
    extra_args: list[str] = field(default_factory=list)

    @property
    def model_id(self) -> str:
        return self.model_path.resolve().name

    @property
    def container_name(self) -> str:
        return self.name or f"vllmd-{self.model_id}"

    @property
    def endpoint(self) -> str:
        return f"http://localhost:{self.port}"


def _docker(*args: str, capture: bool = False) -> subprocess.CompletedProcess:
    return subprocess.run(
        ["docker", *args],
        check=True,
        capture_output=capture,
        text=True,
    )


def _container_exists(name: str) -> bool:
    result = subprocess.run(
        ["docker", "ps", "-a", "--filter", f"name=^{name}$", "--format", "{{.Names}}"],
        capture_output=True,
        text=True,
    )
    return name in result.stdout.splitlines()


def _wait_ready(endpoint: str, timeout: int = HEALTH_TIMEOUT) -> bool:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"{endpoint}/v1/models", timeout=5):
                return True
        except Exception:
            time.sleep(HEALTH_INTERVAL)
    return False


def _parse_host_port(ports_str: str) -> int | None:
    """Extract the host port from a Ports string like '0.0.0.0:8001->8000/tcp'."""
    m = re.search(r":(\d+)->8000", ports_str)
    return int(m.group(1)) if m else None


def _parse_labels(labels_str: str) -> dict[str, str]:
    """Parse Docker's comma-separated 'key=value,key=value' label string."""
    result: dict[str, str] = {}
    for part in labels_str.split(","):
        part = part.strip()
        if "=" in part:
            k, _, v = part.partition("=")
            result[k.strip()] = v.strip()
    return result


def build_docker_run_cmd(config: RunConfig) -> list[str]:
    model_path = config.model_path.resolve()
    cmd = [
        "docker",
        "run",
        "--rm",
        "--name",
        config.container_name,
        "-p",
        f"{config.port}:8000",
        "-v",
        f"{model_path}:/model:ro",
        f"--label={MANAGED_LABEL}=true",
        f"--label={MODEL_LABEL}={config.model_id}",
        f"--label={MODEL_PATH_LABEL}={model_path}",
    ]
    if config.gpu:
        cmd += ["--gpus", "all"]
    cmd += [
        VLLM_IMAGE,
        "--model",
        "/model",
        "--served-model-name",
        config.model_id,
        "--dtype",
        config.dtype,
        "--host",
        "0.0.0.0",
        "--port",
        "8000",
    ]
    if config.max_model_len is not None:
        cmd += ["--max-model-len", str(config.max_model_len)]
    cmd += config.extra_args
    return cmd


def start(config: RunConfig) -> None:
    """Start a vLLM container (foreground, blocking)."""
    model_path = config.model_path.resolve()
    if not model_path.exists():
        raise FileNotFoundError(f"Model path not found: {model_path}")
    if _container_exists(config.container_name):
        raise RuntimeError(
            f"Container '{config.container_name}' already exists. "
            "Stop it first or use a different --name."
        )
    subprocess.run(build_docker_run_cmd(config), check=True)


def stop(name: str) -> None:
    """Stop and remove a named container."""
    if not _container_exists(name):
        raise RuntimeError(f"No container named '{name}' found.")
    _docker("stop", name)


def stop_all() -> list[str]:
    """Stop all vllmd-managed containers. Returns list of stopped names."""
    containers = list_containers()
    stopped = []
    for c in containers:
        _docker("stop", c["name"])
        stopped.append(c["name"])
    return stopped


def list_containers() -> list[dict]:
    """Return info for all running vllmd-managed containers."""
    result = subprocess.run(
        [
            "docker",
            "ps",
            "--filter",
            f"label={MANAGED_LABEL}=true",
            "--format",
            "{{json .}}",
        ],
        capture_output=True,
        text=True,
    )
    containers = []
    for line in result.stdout.strip().splitlines():
        if not line:
            continue
        data = json.loads(line)
        labels = _parse_labels(data.get("Labels", ""))
        host_port = _parse_host_port(data.get("Ports", ""))
        model_id = labels.get(MODEL_LABEL, "?")
        model_path = labels.get(MODEL_PATH_LABEL, "?")
        containers.append(
            {
                "name": data["Names"],
                "model_id": model_id,
                "model_path": model_path,
                "port": host_port,
                "endpoint": f"http://localhost:{host_port}" if host_port else "?",
                "status": data.get("Status", "?"),
            }
        )
    return containers


def status(name: str) -> dict:
    """Return container state and API health for a named container."""
    result = subprocess.run(
        ["docker", "inspect", name, "--format", "{{json .State}}"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return {"running": False, "api_healthy": False, "container": None}

    state = json.loads(result.stdout.strip())
    running = state.get("Running", False)
    api_healthy = False

    if running:
        port_result = subprocess.run(
            ["docker", "port", name, "8000"],
            capture_output=True,
            text=True,
        )
        port = 8000
        if port_result.returncode == 0:
            binding = port_result.stdout.strip().split(":")[-1]
            with contextlib.suppress(ValueError):
                port = int(binding)
        with contextlib.suppress(Exception):
            url = f"http://localhost:{port}/v1/models"
            with urllib.request.urlopen(url, timeout=3):
                api_healthy = True

    return {"running": running, "api_healthy": api_healthy, "container": state}


def wait_ready(config: RunConfig) -> bool:
    """Block until the vLLM API is reachable, or timeout."""
    return _wait_ready(config.endpoint)


def logs(name: str, follow: bool = False) -> None:
    """Stream or print container logs."""
    cmd = ["logs"]
    if follow:
        cmd.append("-f")
    cmd.append(name)
    _docker(*cmd)
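
As a sanity check on the command assembly in build_docker_run_cmd, a small sketch of what it produces for a hypothetical model directory. The path, port, and extra args here are placeholders, not values from the package.

# Illustrative only: prints the docker invocation runner.py would assemble.
from pathlib import Path

from vllmd.runner import RunConfig, build_docker_run_cmd

config = RunConfig(
    model_path=Path("/srv/models/llama-3-8b"),  # hypothetical local model dir
    port=8001,
    max_model_len=8192,
    extra_args=["--gpu-memory-utilization", "0.90"],  # forwarded verbatim to vLLM
)
print(" ".join(build_docker_run_cmd(config)))
# Roughly:
#   docker run --rm --name vllmd-llama-3-8b -p 8001:8000 \
#     -v /srv/models/llama-3-8b:/model:ro --label=com.vllmd.managed=true \
#     --label=com.vllmd.model=llama-3-8b \
#     --label=com.vllmd.model_path=/srv/models/llama-3-8b --gpus all \
#     vllm/vllm-openai:latest --model /model --served-model-name llama-3-8b \
#     --dtype auto --host 0.0.0.0 --port 8000 --max-model-len 8192 \
#     --gpu-memory-utilization 0.90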
src/vllmd/sessions/chat.py
ADDED
@@ -0,0 +1,123 @@
"""Chat function: assembles context and calls the vLLM completions endpoint."""

from __future__ import annotations

import json
import urllib.request
from pathlib import Path

from ..vectordb.embeddings import embed
from ..vectordb.store import COLLECTION_CODE, COLLECTION_DOCUMENTS, VectorStore
from .session import Session

MAX_HISTORY = 20
N_CONTEXT_CHUNKS = 3
MAX_TOKENS = 2048
TIMEOUT = 120


def chat(
    session: Session,
    user_message: str,
    *,
    max_history: int = MAX_HISTORY,
    n_context: int = N_CONTEXT_CHUNKS,
    max_tokens: int = MAX_TOKENS,
) -> str:
    """Send *user_message* within *session*, returning the assistant reply.

    Retrieves semantic context from the session's vector store if an
    embedding_model is configured. Falls back silently to history-only context
    if the embedding endpoint is unavailable.
    """
    messages: list[dict] = []

    if session.system_prompt:
        messages.append({"role": "system", "content": session.system_prompt})

    # Semantic context retrieval (skip entirely if no embedding model configured)
    context_text = (
        _retrieve_context(session, user_message, n_context)
        if session.embedding_model
        else ""
    )
    if context_text:
        messages.append(
            {"role": "system", "content": f"Relevant context:\n\n{context_text}"}
        )

    # Recent sequential history
    for msg in session.messages[-max_history:]:
        messages.append({"role": msg.role, "content": msg.content})

    messages.append({"role": "user", "content": user_message})

    response = _complete(session.endpoint, session.model_id, messages, max_tokens)

    # Persist the exchange
    from .session import Message

    session.messages.append(Message(role="user", content=user_message))
    session.messages.append(Message(role="assistant", content=response))

    # Store embeddings in vector DB for future retrieval
    if session.embedding_model:
        _store_history(session, user_message, response)

    return response


def retrieve_context(session: Session, query: str, n: int = N_CONTEXT_CHUNKS) -> str:
    """Public helper to show what context would be injected for *query*."""
    return _retrieve_context(session, query, n)


# ------------------------------------------------------------------
# Internals
# ------------------------------------------------------------------


def _retrieve_context(session: Session, query: str, n: int) -> str:
    if not session.embedding_model:
        return ""
    try:
        store = VectorStore(Path(session.db_path))
        query_vec = embed(session.endpoint, session.embedding_model, [query])[0]
        chunks: list[str] = []
        for collection in (COLLECTION_DOCUMENTS, COLLECTION_CODE):
            results = store.search(query_vec, collection, n_results=n)
            chunks.extend(r["content"] for r in results)
        if not chunks:
            return ""
        return "\n\n---\n\n".join(chunks[:n])
    except Exception:
        return ""


def _complete(
    endpoint: str, model_id: str, messages: list[dict], max_tokens: int
) -> str:
    payload = json.dumps(
        {"model": model_id, "messages": messages, "max_tokens": max_tokens}
    ).encode()
    req = urllib.request.Request(
        f"{endpoint.rstrip('/')}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
        data = json.loads(resp.read())
        return data["choices"][0]["message"]["content"]


def _store_history(session: Session, user_message: str, response: str) -> None:
    try:
        store = VectorStore(Path(session.db_path))

        def embedder(texts: list[str]) -> list[list[float]]:
            return embed(session.endpoint, session.embedding_model, texts)

        store.add_history(session.id, "user", user_message, embedder)
        store.add_history(session.id, "assistant", response, embedder)
    except Exception:
        pass
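
_complete speaks the standard OpenAI-compatible chat schema that vLLM serves, so the same request can be reproduced standalone. The endpoint below matches RunConfig's default; the model name is a hypothetical --served-model-name, not a value shipped in the package.

# Standalone request in the same shape _complete() builds (illustrative).
import json
import urllib.request

endpoint = "http://localhost:8000"  # RunConfig's default endpoint
model_id = "llama-3-8b"  # hypothetical served model name

payload = json.dumps(
    {
        "model": model_id,
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 64,
    }
).encode()
req = urllib.request.Request(
    f"{endpoint}/v1/chat/completions",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=120) as resp:
    data = json.loads(resp.read())
print(data["choices"][0]["message"]["content"])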