loreguard-cli 0.15.2__tar.gz → 0.20.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/PKG-INFO +2 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/pyproject.toml +2 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/__main__.py +23 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/chunk_detector.py +1 -1
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/cli.py +27 -5
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/config.py +57 -8
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/http_server.py +128 -1
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/llama_server.py +18 -8
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/llm.py +27 -23
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/main.py +3 -3
- loreguard_cli-0.20.2/src/model_families.py +121 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/models_registry.py +12 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/nli.py +53 -5
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/npc_chat.py +7 -5
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/steam.py +4 -3
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/app.py +1 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/auth_menu.py +3 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/token_input.py +1 -1
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/unified_palette.py +28 -1
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/auth.py +2 -1
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/main.py +9 -8
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/running.py +13 -4
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/npc_chat.py +4 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tunnel.py +8 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/wizard.py +4 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/uv.lock +8 -8
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/.claude/skills/llama-cpp-troubleshooting/SKILL.md +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/.env.example +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/.github/workflows/release.yml +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/.gitignore +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/LICENSE +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/README.md +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/THIRD_PARTY_NOTICES.md +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/loreguard.spec +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/loreguard_entry.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/scripts/build.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/API.md +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/csharp/LoreguardSDK.cs +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/gdscript/LoreguardSDK.gd +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/javascript/loreguard-sdk.js +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/python/loreguard_sdk.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/dialogue_act_classifier.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/hf_discovery.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/intent_classifier.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/runtime.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/term_ui.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/npc_chat.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/model_select.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/nli_setup.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/styles.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/banner.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/footer.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/hardware_info.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/server_monitor.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/status_panel.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/templates/llama31-no-tools.jinja +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/tests/test_intent_classifier.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/tests/test_nli_hhem.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/tests/test_websocket_timeout.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: loreguard-cli
|
|
3
|
-
Version: 0.15.2
|
|
3
|
+
Version: 0.20.2
|
|
4
4
|
Summary: Local inference client for Loreguard NPCs
|
|
5
5
|
Project-URL: Homepage, https://loreguard.com
|
|
6
6
|
Project-URL: Documentation, https://github.com/beyond-logic-labs/loreguard-cli#readme
|
|
@@ -29,7 +29,7 @@ Requires-Dist: rich>=13.0.0
|
|
|
29
29
|
Requires-Dist: textual>=0.47.0
|
|
30
30
|
Requires-Dist: tf-keras>=2.16.0
|
|
31
31
|
Requires-Dist: torch>=2.0.0
|
|
32
|
-
Requires-Dist: transformers
|
|
32
|
+
Requires-Dist: transformers<5,>=4.36.0
|
|
33
33
|
Requires-Dist: uvicorn>=0.27.0
|
|
34
34
|
Requires-Dist: websockets>=12.0
|
|
35
35
|
Provides-Extra: build
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "loreguard-cli"
|
|
7
|
-
version = "0.15.2"
|
|
7
|
+
version = "0.20.2"
|
|
8
8
|
description = "Local inference client for Loreguard NPCs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -28,7 +28,7 @@ dependencies = [
|
|
|
28
28
|
"aiofiles>=24.1.0",
|
|
29
29
|
"rich>=13.0.0",
|
|
30
30
|
"textual>=0.47.0",
|
|
31
|
-
"transformers>=
|
|
31
|
+
"transformers>=4.36.0,<5",
|
|
32
32
|
"torch>=2.0.0",
|
|
33
33
|
"fastapi>=0.109.0",
|
|
34
34
|
"uvicorn>=0.27.0",
|
|
@@ -28,6 +28,29 @@ def main():
|
|
|
28
28
|
print(json.dumps(status, indent=2))
|
|
29
29
|
sys.exit(0 if status.get("running") else 1)
|
|
30
30
|
|
|
31
|
+
# Handle 'download-llama-server' command - for bundle tool delegation (ADR-0027)
|
|
32
|
+
if args and args[0] == "download-llama-server":
|
|
33
|
+
import asyncio
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from .llama_server import download_llama_server
|
|
36
|
+
|
|
37
|
+
output_dir = None
|
|
38
|
+
for i, a in enumerate(args):
|
|
39
|
+
if a == "--output-dir" and i + 1 < len(args):
|
|
40
|
+
output_dir = Path(args[i + 1])
|
|
41
|
+
|
|
42
|
+
if not output_dir:
|
|
43
|
+
print("Usage: loreguard download-llama-server --output-dir <path>", file=sys.stderr)
|
|
44
|
+
sys.exit(1)
|
|
45
|
+
|
|
46
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
47
|
+
|
|
48
|
+
def on_progress(msg, progress=None):
|
|
49
|
+
print(f" {msg}")
|
|
50
|
+
|
|
51
|
+
asyncio.run(download_llama_server(progress_callback=on_progress, target_dir=output_dir))
|
|
52
|
+
sys.exit(0)
|
|
53
|
+
|
|
31
54
|
# Filter out help flags - these should show CLI help
|
|
32
55
|
if any(a in ('-h', '--help') for a in args):
|
|
33
56
|
from .cli import main as cli_main
|
|
@@ -11,7 +11,8 @@ Environment variables (alternative to args):
|
|
|
11
11
|
LOREGUARD_MODEL Path to model file
|
|
12
12
|
LOREGUARD_MODEL_ID Model ID to download (if not using custom model)
|
|
13
13
|
LOREGUARD_PORT Local llama-server port (default: 8080)
|
|
14
|
-
LOREGUARD_BACKEND Backend URL (default: wss://
|
|
14
|
+
LOREGUARD_BACKEND Backend WebSocket URL (default: wss://console.loreguard.com/workers)
|
|
15
|
+
LOREGUARD_API API base URL (default: https://console.loreguard.com)
|
|
15
16
|
LOREGUARD_WORKER_ID Worker ID (default: hostname)
|
|
16
17
|
"""
|
|
17
18
|
|
|
@@ -26,6 +27,8 @@ from datetime import datetime
|
|
|
26
27
|
from pathlib import Path
|
|
27
28
|
from typing import Optional
|
|
28
29
|
|
|
30
|
+
from .config import DEFAULT_API_URL, DEFAULT_BACKEND_URL
|
|
31
|
+
|
|
29
32
|
# Setup logging
|
|
30
33
|
logging.basicConfig(
|
|
31
34
|
level=logging.INFO,
|
|
@@ -44,14 +47,16 @@ class LoreguardCLI:
|
|
|
44
47
|
model_path: Optional[Path] = None,
|
|
45
48
|
model_id: Optional[str] = None,
|
|
46
49
|
port: int = 8080,
|
|
47
|
-
backend_url: str =
|
|
50
|
+
backend_url: str = DEFAULT_BACKEND_URL,
|
|
48
51
|
worker_id: Optional[str] = None,
|
|
52
|
+
model_family: str = "llama3",
|
|
49
53
|
):
|
|
50
54
|
self.token = token
|
|
51
55
|
self.model_path = model_path
|
|
52
56
|
self.model_id = model_id
|
|
53
57
|
self.port = port
|
|
54
58
|
self.backend_url = backend_url
|
|
59
|
+
self.model_family = model_family
|
|
55
60
|
# Worker ID: use provided value, or default to sanitized hostname.
|
|
56
61
|
# Validator requires ^[a-zA-Z0-9_-]{1,64}$ — replace dots with hyphens.
|
|
57
62
|
raw_id = worker_id or socket.gethostname() or "worker"
|
|
@@ -209,7 +214,7 @@ class LoreguardCLI:
|
|
|
209
214
|
# Start server
|
|
210
215
|
log.info(f"Starting llama-server on port {self.port}...")
|
|
211
216
|
try:
|
|
212
|
-
self._llama = LlamaServerProcess(self.model_path, port=self.port)
|
|
217
|
+
self._llama = LlamaServerProcess(self.model_path, port=self.port, model_family=self.model_family)
|
|
213
218
|
self._llama.start()
|
|
214
219
|
|
|
215
220
|
# Wait for ready
|
|
@@ -241,7 +246,7 @@ class LoreguardCLI:
|
|
|
241
246
|
log.info(f"Worker ID: {self.worker_id}")
|
|
242
247
|
|
|
243
248
|
try:
|
|
244
|
-
llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}")
|
|
249
|
+
llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}", model_family=self.model_family)
|
|
245
250
|
|
|
246
251
|
# ADR-0027: Load all ML services — the client is the sole provider
|
|
247
252
|
# of NLI, intent, dialogue act, and chunk capabilities.
|
|
@@ -323,6 +328,11 @@ class LoreguardCLI:
|
|
|
323
328
|
port=sdk_port,
|
|
324
329
|
)
|
|
325
330
|
log.info(f"SDK server listening on 127.0.0.1:{self._sdk_port}")
|
|
331
|
+
|
|
332
|
+
# Wire llama process for runtime model switching
|
|
333
|
+
from .http_server import set_llama_process
|
|
334
|
+
models_dir = self.model_path.parent if self.model_path else None
|
|
335
|
+
set_llama_process(self._llama, models_dir)
|
|
326
336
|
except Exception as e:
|
|
327
337
|
log.error(f"Failed to start SDK server: {e}")
|
|
328
338
|
return False
|
|
@@ -447,9 +457,14 @@ Available model IDs:
|
|
|
447
457
|
)
|
|
448
458
|
parser.add_argument(
|
|
449
459
|
"--backend",
|
|
450
|
-
default=os.getenv("LOREGUARD_BACKEND",
|
|
460
|
+
default=os.getenv("LOREGUARD_BACKEND", DEFAULT_BACKEND_URL),
|
|
451
461
|
help="Backend WebSocket URL",
|
|
452
462
|
)
|
|
463
|
+
parser.add_argument(
|
|
464
|
+
"--api-url",
|
|
465
|
+
default=os.getenv("LOREGUARD_API", DEFAULT_API_URL),
|
|
466
|
+
help=f"API base URL (default: {DEFAULT_API_URL})",
|
|
467
|
+
)
|
|
453
468
|
parser.add_argument(
|
|
454
469
|
"-v", "--verbose",
|
|
455
470
|
action="store_true",
|
|
@@ -460,6 +475,12 @@ Available model IDs:
|
|
|
460
475
|
default=os.getenv("LOREGUARD_BUNDLE_DIR", ""),
|
|
461
476
|
help="Loreguard bundle directory. Auto-discovers models from manifest.txt.",
|
|
462
477
|
)
|
|
478
|
+
parser.add_argument(
|
|
479
|
+
"--model-family",
|
|
480
|
+
default=os.getenv("LOREGUARD_MODEL_FAMILY", "auto"),
|
|
481
|
+
choices=["auto", "llama3", "qwen3", "gemma", "chatml"],
|
|
482
|
+
help="Model family profile for chat template/stop sequences (default: auto)",
|
|
483
|
+
)
|
|
463
484
|
parser.add_argument(
|
|
464
485
|
"--dev",
|
|
465
486
|
action="store_true",
|
|
@@ -531,6 +552,7 @@ Available model IDs:
|
|
|
531
552
|
port=args.port,
|
|
532
553
|
backend_url=args.backend,
|
|
533
554
|
worker_id=args.worker_id or None, # None will use hostname
|
|
555
|
+
model_family=args.model_family,
|
|
534
556
|
)
|
|
535
557
|
|
|
536
558
|
exit_code = asyncio.run(cli.run())
|
|
@@ -49,6 +49,8 @@ class LoreguardConfig:
|
|
|
49
49
|
dev_mode: bool = False
|
|
50
50
|
context_size: int = 16384 # llama-server context window size (configurable per game)
|
|
51
51
|
max_speech_tokens: int = 50 # Max tokens for NPC speech output (Pass 4). Default: 50 (~40 words)
|
|
52
|
+
model_family: str = "auto" # Model family profile (auto, llama3, qwen3, gemma, chatml)
|
|
53
|
+
dialogue_act_enabled: bool = False # Dialogue act classifier for filler selection
|
|
52
54
|
|
|
53
55
|
def save(self) -> None:
|
|
54
56
|
"""Save configuration to disk."""
|
|
@@ -71,6 +73,8 @@ class LoreguardConfig:
|
|
|
71
73
|
dev_mode=data.get("dev_mode", False),
|
|
72
74
|
context_size=data.get("context_size", 16384),
|
|
73
75
|
max_speech_tokens=data.get("max_speech_tokens", 50),
|
|
76
|
+
model_family=data.get("model_family", "auto"),
|
|
77
|
+
dialogue_act_enabled=data.get("dialogue_act_enabled", False),
|
|
74
78
|
)
|
|
75
79
|
except (json.JSONDecodeError, KeyError):
|
|
76
80
|
pass
|
|
@@ -119,6 +123,14 @@ class LoreguardConfig:
|
|
|
119
123
|
# Environment Variable Configuration
|
|
120
124
|
# =============================================================================
|
|
121
125
|
|
|
126
|
+
DEFAULT_API_URL = "https://console.loreguard.com"
|
|
127
|
+
DEFAULT_BACKEND_URL = "wss://console.loreguard.com/workers"
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_api_url() -> str:
|
|
131
|
+
"""Get the Loreguard API base URL (configurable via LOREGUARD_API env var)."""
|
|
132
|
+
return os.getenv("LOREGUARD_API", DEFAULT_API_URL)
|
|
133
|
+
|
|
122
134
|
|
|
123
135
|
@lru_cache(maxsize=1)
|
|
124
136
|
def load_config() -> dict:
|
|
@@ -131,12 +143,13 @@ def load_config() -> dict:
|
|
|
131
143
|
return {
|
|
132
144
|
# Server settings
|
|
133
145
|
"LLM_ENDPOINT": os.getenv("LLM_ENDPOINT", "http://localhost:8080"),
|
|
134
|
-
"BACKEND_URL": os.getenv("LOREGUARD_BACKEND",
|
|
146
|
+
"BACKEND_URL": os.getenv("LOREGUARD_BACKEND", DEFAULT_BACKEND_URL),
|
|
147
|
+
"API_URL": os.getenv("LOREGUARD_API", DEFAULT_API_URL),
|
|
135
148
|
"HOST": os.getenv("HOST", "127.0.0.1"),
|
|
136
149
|
"PORT": os.getenv("PORT", "8081"),
|
|
137
150
|
|
|
138
151
|
# Worker authentication (required for backend connection)
|
|
139
|
-
# Get API token from loreguard.com
|
|
152
|
+
# Get API token from console.loreguard.com
|
|
140
153
|
"WORKER_ID": os.getenv("LOREGUARD_WORKER_ID", os.getenv("WORKER_ID", "")),
|
|
141
154
|
# LOREGUARD_TOKEN is preferred, WORKER_TOKEN kept for backwards compatibility
|
|
142
155
|
"LOREGUARD_TOKEN": os.getenv("LOREGUARD_TOKEN", os.getenv("WORKER_TOKEN", "")),
|
|
@@ -230,20 +243,21 @@ def get_models_dir() -> Optional[Path]:
|
|
|
230
243
|
|
|
231
244
|
|
|
232
245
|
def resolve_model_path(model_name: str, subdir: str = "") -> str:
|
|
233
|
-
"""Resolve a model path, preferring
|
|
246
|
+
"""Resolve a model path, preferring local models over HF downloads.
|
|
234
247
|
|
|
235
248
|
Resolution order:
|
|
236
249
|
1. LOREGUARD_MODELS_DIR/<subdir> (explicit override)
|
|
237
|
-
2.
|
|
238
|
-
3. Bundle models dir using HF name →
|
|
239
|
-
4.
|
|
250
|
+
2. Application Support models dir/<subdir> (standard install location)
|
|
251
|
+
3. Bundle models dir using manifest.txt (HF name → manifest key → local dir)
|
|
252
|
+
4. Bundle models dir using HF name → org--model convention (fallback)
|
|
253
|
+
5. Download from HuggingFace to Application Support models dir
|
|
240
254
|
|
|
241
255
|
Args:
|
|
242
256
|
model_name: HuggingFace model name (e.g., 'vectara/hallucination_evaluation_model')
|
|
243
257
|
subdir: Subdirectory within MODELS_DIR to check (e.g., 'hhem', 'deberta')
|
|
244
258
|
|
|
245
259
|
Returns:
|
|
246
|
-
Local path
|
|
260
|
+
Local path to the model directory.
|
|
247
261
|
"""
|
|
248
262
|
# 1. Explicit LOREGUARD_MODELS_DIR/<subdir>
|
|
249
263
|
explicit_dir = get_config_value("MODELS_DIR")
|
|
@@ -252,7 +266,14 @@ def resolve_model_path(model_name: str, subdir: str = "") -> str:
|
|
|
252
266
|
if local_path.exists() and any(local_path.iterdir()):
|
|
253
267
|
return str(local_path)
|
|
254
268
|
|
|
255
|
-
# 2
|
|
269
|
+
# 2. Application Support models dir/<subdir>
|
|
270
|
+
app_models = get_data_dir() / "models"
|
|
271
|
+
if subdir:
|
|
272
|
+
local_path = app_models / subdir
|
|
273
|
+
if local_path.exists() and any(local_path.iterdir()):
|
|
274
|
+
return str(local_path)
|
|
275
|
+
|
|
276
|
+
# 3 & 4. Bundle directory resolution
|
|
256
277
|
bundle_dir = get_bundle_dir()
|
|
257
278
|
if bundle_dir:
|
|
258
279
|
bundle_models = bundle_dir / "models"
|
|
@@ -273,9 +294,37 @@ def resolve_model_path(model_name: str, subdir: str = "") -> str:
|
|
|
273
294
|
if local_path.exists() and any(local_path.iterdir()):
|
|
274
295
|
return str(local_path)
|
|
275
296
|
|
|
297
|
+
# 5. Download from HuggingFace to Application Support models dir
|
|
298
|
+
if subdir:
|
|
299
|
+
return _download_hf_model(model_name, app_models / subdir)
|
|
300
|
+
|
|
276
301
|
return model_name
|
|
277
302
|
|
|
278
303
|
|
|
304
|
+
def _download_hf_model(model_name: str, target_dir: Path) -> str:
|
|
305
|
+
"""Download a HuggingFace model to the loreguard models directory.
|
|
306
|
+
|
|
307
|
+
Returns:
|
|
308
|
+
Path to the downloaded model directory.
|
|
309
|
+
"""
|
|
310
|
+
import logging
|
|
311
|
+
logger = logging.getLogger(__name__)
|
|
312
|
+
try:
|
|
313
|
+
from huggingface_hub import snapshot_download
|
|
314
|
+
target_dir.mkdir(parents=True, exist_ok=True)
|
|
315
|
+
logger.info(f"Downloading {model_name} to {target_dir}")
|
|
316
|
+
snapshot_download(
|
|
317
|
+
model_name,
|
|
318
|
+
local_dir=str(target_dir),
|
|
319
|
+
local_dir_use_symlinks=False,
|
|
320
|
+
)
|
|
321
|
+
logger.info(f"Downloaded {model_name} to {target_dir}")
|
|
322
|
+
return str(target_dir)
|
|
323
|
+
except Exception as e:
|
|
324
|
+
logger.warning(f"Failed to download {model_name}: {e}")
|
|
325
|
+
return model_name
|
|
326
|
+
|
|
327
|
+
|
|
279
328
|
def get_config_value(key: str, default: Optional[str] = None) -> Optional[str]:
|
|
280
329
|
"""Get a single configuration value."""
|
|
281
330
|
config = load_config()
|
|
@@ -8,6 +8,8 @@ HTTP endpoints:
|
|
|
8
8
|
GET /api/capabilities - Feature discovery (streaming, chunk modes)
|
|
9
9
|
GET /api/characters - List available NPCs (proxied from engine)
|
|
10
10
|
POST /api/chat - Chat with an NPC (streaming SSE or JSON)
|
|
11
|
+
GET /api/models - List available GGUF models
|
|
12
|
+
POST /api/admin/reload-model - Hot-swap LLM model at runtime
|
|
11
13
|
|
|
12
14
|
The server shares the existing tunnel connection instead of creating
|
|
13
15
|
a new one, ensuring a single WebSocket connection per worker.
|
|
@@ -17,10 +19,12 @@ Uses uvicorn with socket-first binding for race-condition-free port allocation.
|
|
|
17
19
|
|
|
18
20
|
import asyncio
|
|
19
21
|
import json
|
|
22
|
+
import os
|
|
20
23
|
import threading
|
|
21
24
|
import time
|
|
22
25
|
import uuid
|
|
23
26
|
from concurrent.futures import Future
|
|
27
|
+
from pathlib import Path
|
|
24
28
|
from typing import Any, Callable, Optional
|
|
25
29
|
|
|
26
30
|
from .runtime import write_runtime_info, RuntimeInfo, get_runtime_path, get_version
|
|
@@ -63,6 +67,8 @@ class EmbeddedHTTPServer:
|
|
|
63
67
|
self._running = False
|
|
64
68
|
self._bound_socket: Optional[Any] = None
|
|
65
69
|
self._ready_event = threading.Event()
|
|
70
|
+
self.llama_process: Optional[Any] = None # LlamaServerProcess — set by RunningScreen
|
|
71
|
+
self.models_dir: Optional[Path] = None # Path to models/ directory
|
|
66
72
|
|
|
67
73
|
def start(self) -> int:
|
|
68
74
|
"""Start the HTTP server in a background thread.
|
|
@@ -361,7 +367,7 @@ class EmbeddedHTTPServer:
|
|
|
361
367
|
|
|
362
368
|
# Derive HTTP base URL from WebSocket URL
|
|
363
369
|
# ws://localhost:8090/workers → http://localhost:8090
|
|
364
|
-
# wss://
|
|
370
|
+
# wss://console.loreguard.com/workers → https://console.loreguard.com
|
|
365
371
|
backend_ws = server.tunnel.backend_url
|
|
366
372
|
if backend_ws.startswith("wss://"):
|
|
367
373
|
base_url = "https://" + backend_ws[6:].split("/")[0]
|
|
@@ -483,6 +489,118 @@ class EmbeddedHTTPServer:
|
|
|
483
489
|
return JSONResponse(status_code=500, content=result)
|
|
484
490
|
return result
|
|
485
491
|
|
|
492
|
+
@app.get("/api/models")
|
|
493
|
+
async def list_models():
|
|
494
|
+
"""List available GGUF models in the models directory."""
|
|
495
|
+
if not server.models_dir or not server.models_dir.exists():
|
|
496
|
+
return JSONResponse(
|
|
497
|
+
status_code=404,
|
|
498
|
+
content={"error": "Models directory not configured"},
|
|
499
|
+
)
|
|
500
|
+
|
|
501
|
+
models = []
|
|
502
|
+
active_model = None
|
|
503
|
+
if server.llama_process and hasattr(server.llama_process, "model_path"):
|
|
504
|
+
active_model = server.llama_process.model_path.name
|
|
505
|
+
|
|
506
|
+
for f in sorted(server.models_dir.iterdir()):
|
|
507
|
+
if f.suffix == ".gguf" and f.is_file():
|
|
508
|
+
models.append({
|
|
509
|
+
"name": f.name,
|
|
510
|
+
"size": f.stat().st_size,
|
|
511
|
+
"active": f.name == active_model,
|
|
512
|
+
})
|
|
513
|
+
|
|
514
|
+
return {"models": models, "activeModel": active_model}
|
|
515
|
+
|
|
516
|
+
@app.post("/api/admin/reload-model")
|
|
517
|
+
async def reload_model(request: Request):
|
|
518
|
+
"""Hot-swap the LLM model by restarting llama-server."""
|
|
519
|
+
if not server.llama_process:
|
|
520
|
+
return JSONResponse(
|
|
521
|
+
status_code=503,
|
|
522
|
+
content={"error": "LLM server not available"},
|
|
523
|
+
)
|
|
524
|
+
if not server.models_dir:
|
|
525
|
+
return JSONResponse(
|
|
526
|
+
status_code=503,
|
|
527
|
+
content={"error": "Models directory not configured"},
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
body = await request.json()
|
|
531
|
+
model_name = body.get("model", "")
|
|
532
|
+
if not model_name:
|
|
533
|
+
return JSONResponse(
|
|
534
|
+
status_code=400,
|
|
535
|
+
content={"error": "Missing 'model' field"},
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
# Security: resolve and verify path stays inside models_dir
|
|
539
|
+
model_path = (server.models_dir / model_name).resolve()
|
|
540
|
+
if model_path.parent != server.models_dir.resolve():
|
|
541
|
+
return JSONResponse(
|
|
542
|
+
status_code=400,
|
|
543
|
+
content={"error": "Invalid model name"},
|
|
544
|
+
)
|
|
545
|
+
|
|
546
|
+
if not model_path.exists() or model_path.suffix != ".gguf":
|
|
547
|
+
return JSONResponse(
|
|
548
|
+
status_code=404,
|
|
549
|
+
content={"error": f"Model '{model_name}' not found"},
|
|
550
|
+
)
|
|
551
|
+
|
|
552
|
+
# Check if already active
|
|
553
|
+
if hasattr(server.llama_process, "model_path") and server.llama_process.model_path.name == model_name:
|
|
554
|
+
return {"status": "already_active", "model": model_name}
|
|
555
|
+
|
|
556
|
+
# Save original model_path for rollback on failure
|
|
557
|
+
original_model_path = server.llama_process.model_path
|
|
558
|
+
|
|
559
|
+
try:
|
|
560
|
+
# Stop current llama-server
|
|
561
|
+
server.llama_process.stop()
|
|
562
|
+
|
|
563
|
+
# Update model path and restart
|
|
564
|
+
server.llama_process.model_path = model_path
|
|
565
|
+
server.llama_process.start()
|
|
566
|
+
|
|
567
|
+
# Wait for health check (llama-server takes a few seconds to load model)
|
|
568
|
+
import httpx
|
|
569
|
+
llama_url = f"http://127.0.0.1:{server.llama_process.port}/health"
|
|
570
|
+
async with httpx.AsyncClient(timeout=2.0) as client:
|
|
571
|
+
for attempt in range(60): # 60 attempts × 0.5s = 30s timeout
|
|
572
|
+
await asyncio.sleep(0.5)
|
|
573
|
+
try:
|
|
574
|
+
resp = await client.get(llama_url)
|
|
575
|
+
if resp.status_code == 200:
|
|
576
|
+
# Persist selection so it survives restarts
|
|
577
|
+
try:
|
|
578
|
+
from .config import LoreguardConfig
|
|
579
|
+
cfg = LoreguardConfig.load()
|
|
580
|
+
cfg.set_model_path(model_path)
|
|
581
|
+
cfg.save()
|
|
582
|
+
except Exception:
|
|
583
|
+
pass # Best-effort persistence
|
|
584
|
+
return {"status": "ok", "model": model_name}
|
|
585
|
+
except Exception:
|
|
586
|
+
continue
|
|
587
|
+
|
|
588
|
+
return JSONResponse(
|
|
589
|
+
status_code=500,
|
|
590
|
+
content={"error": "Model loaded but health check timed out after 30s"},
|
|
591
|
+
)
|
|
592
|
+
except Exception as e:
|
|
593
|
+
# Rollback: restore original model path and try to restart
|
|
594
|
+
server.llama_process.model_path = original_model_path
|
|
595
|
+
try:
|
|
596
|
+
server.llama_process.start()
|
|
597
|
+
except Exception:
|
|
598
|
+
pass # Best-effort rollback
|
|
599
|
+
return JSONResponse(
|
|
600
|
+
status_code=500,
|
|
601
|
+
content={"error": f"Failed to reload model: {e}"},
|
|
602
|
+
)
|
|
603
|
+
|
|
486
604
|
# Write runtime info
|
|
487
605
|
with open(debug_path, "a") as f:
|
|
488
606
|
f.write(f"[SDK Server] Writing runtime info for port {self.actual_port}...\n")
|
|
@@ -610,6 +728,15 @@ def force_stop_sdk_server() -> None:
|
|
|
610
728
|
_server = None
|
|
611
729
|
|
|
612
730
|
|
|
731
|
+
def set_llama_process(llama_process: Any, models_dir: Optional[Path] = None) -> None:
|
|
732
|
+
"""Set the LlamaServerProcess reference on the SDK server for model management."""
|
|
733
|
+
global _server
|
|
734
|
+
if _server:
|
|
735
|
+
_server.llama_process = llama_process
|
|
736
|
+
if models_dir:
|
|
737
|
+
_server.models_dir = models_dir
|
|
738
|
+
|
|
739
|
+
|
|
613
740
|
def update_backend_status(connected: bool) -> None:
|
|
614
741
|
"""Update backend connection status in runtime.json."""
|
|
615
742
|
global _server
|
|
@@ -30,7 +30,7 @@ def _get_templates_dir() -> Path:
|
|
|
30
30
|
return Path(__file__).parent.parent / "templates"
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
LLAMA_VERSION = "
|
|
33
|
+
LLAMA_VERSION = "b8467" # Must match loreguard-engine bundle version
|
|
34
34
|
|
|
35
35
|
# Download URLs for each platform
|
|
36
36
|
BINARIES = {
|
|
@@ -265,18 +265,21 @@ def make_executable(path: Path) -> None:
|
|
|
265
265
|
|
|
266
266
|
async def download_llama_server(
|
|
267
267
|
progress_callback: Optional[Callable[[str, DownloadProgress | None], None]] = None,
|
|
268
|
+
target_dir: Optional[Path] = None,
|
|
268
269
|
) -> Path:
|
|
269
270
|
"""Download and install llama-server for the current platform.
|
|
270
271
|
|
|
271
272
|
Args:
|
|
272
273
|
progress_callback: Called with (status_message, progress_or_none)
|
|
274
|
+
target_dir: If provided, install into this directory instead of the default.
|
|
275
|
+
Used by the bundle tool to pre-ship llama-server.
|
|
273
276
|
|
|
274
277
|
Returns:
|
|
275
278
|
Path to the installed llama-server binary
|
|
276
279
|
"""
|
|
277
280
|
plat = get_platform()
|
|
278
281
|
config = BINARIES[plat]
|
|
279
|
-
bin_dir = get_bin_dir()
|
|
282
|
+
bin_dir = target_dir or get_bin_dir()
|
|
280
283
|
|
|
281
284
|
def notify(msg: str, progress: DownloadProgress | None = None):
|
|
282
285
|
if progress_callback:
|
|
@@ -355,12 +358,12 @@ async def download_llama_server(
|
|
|
355
358
|
make_executable(lib)
|
|
356
359
|
|
|
357
360
|
# Write version marker file for future version checks
|
|
358
|
-
version_file = get_version_file_path()
|
|
361
|
+
version_file = bin_dir / ".llama_version" if target_dir else get_version_file_path()
|
|
359
362
|
version_file.write_text(LLAMA_VERSION)
|
|
360
363
|
|
|
361
364
|
notify(f"llama-server {LLAMA_VERSION} installed successfully!")
|
|
362
365
|
|
|
363
|
-
return
|
|
366
|
+
return bin_dir / config["binary_name"]
|
|
364
367
|
|
|
365
368
|
|
|
366
369
|
class LlamaServerProcess:
|
|
@@ -372,11 +375,13 @@ class LlamaServerProcess:
|
|
|
372
375
|
port: int = 8080,
|
|
373
376
|
lora_path: Optional[Path] = None,
|
|
374
377
|
context_size: int = 16384,
|
|
378
|
+
model_family: str = "llama3",
|
|
375
379
|
):
|
|
376
380
|
self.model_path = model_path
|
|
377
381
|
self.port = port
|
|
378
382
|
self.lora_path = lora_path
|
|
379
383
|
self.context_size = context_size
|
|
384
|
+
self.model_family = model_family
|
|
380
385
|
self.process: Optional[subprocess.Popen] = None
|
|
381
386
|
self._output_lines: list[str] = []
|
|
382
387
|
|
|
@@ -406,13 +411,18 @@ class LlamaServerProcess:
|
|
|
406
411
|
# Without this, llama-server may allocate multiple slots, each consuming
|
|
407
412
|
# KV cache memory proportional to context_size * model_hidden_dim.
|
|
408
413
|
"-np", "1",
|
|
409
|
-
#
|
|
410
|
-
# Llama 3.1's built-in template forces tool-calling format even without tools,
|
|
411
|
-
# so we use a stripped-down template that only handles chat messages.
|
|
414
|
+
# Enable Jinja template processing (required for both custom and embedded templates)
|
|
412
415
|
"--jinja",
|
|
413
|
-
"--chat-template-file", str(_get_templates_dir() / "llama31-no-tools.jinja"),
|
|
414
416
|
]
|
|
415
417
|
|
|
418
|
+
# Apply model-family-specific chat template override.
|
|
419
|
+
# Llama 3.1 requires a custom template to avoid the tool-calling bug;
|
|
420
|
+
# other families use their GGUF-embedded template (--jinja alone).
|
|
421
|
+
from .model_families import get_model_family
|
|
422
|
+
family = get_model_family(self.model_family)
|
|
423
|
+
if family.chat_template_file:
|
|
424
|
+
cmd.extend(["--chat-template-file", str(_get_templates_dir() / family.chat_template_file)])
|
|
425
|
+
|
|
416
426
|
# Add LoRA adapter if specified
|
|
417
427
|
if self.lora_path and self.lora_path.exists():
|
|
418
428
|
cmd.extend(["--lora", str(self.lora_path)])
|
|
@@ -37,16 +37,16 @@ class SamplingConfig:
|
|
|
37
37
|
presence_penalty: float = 0.0
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
"
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
40
|
+
from .model_families import get_model_family, ALL_STOP_MARKERS, DEFAULT_MODEL_FAMILY
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_stop_sequences(model_family: str = DEFAULT_MODEL_FAMILY) -> list[str]:
|
|
44
|
+
"""Get stop sequences for the given model family."""
|
|
45
|
+
return list(get_model_family(model_family).stop_sequences)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Backward-compatible default (Llama 3 stop sequences)
|
|
49
|
+
DEFAULT_STOP_SEQUENCES = get_stop_sequences(DEFAULT_MODEL_FAMILY)
|
|
50
50
|
|
|
51
51
|
|
|
52
52
|
@dataclass
|
|
@@ -61,7 +61,9 @@ class LLMRequest:
|
|
|
61
61
|
stop: list[str] = field(default_factory=lambda: DEFAULT_STOP_SEQUENCES.copy())
|
|
62
62
|
|
|
63
63
|
# Thinking mode control (for Qwen3)
|
|
64
|
-
|
|
64
|
+
# Defaults to True: thinking wastes tokens and breaks pipelines.
|
|
65
|
+
# Only enable explicitly when extended reasoning is desired.
|
|
66
|
+
disable_thinking: bool = True
|
|
65
67
|
|
|
66
68
|
# If true, error if content is empty instead of falling back to reasoning_content
|
|
67
69
|
require_content: bool = False
|
|
@@ -94,11 +96,13 @@ class LLMProxy:
|
|
|
94
96
|
sampling configuration, stop sequences, and JSON mode support.
|
|
95
97
|
"""
|
|
96
98
|
|
|
97
|
-
def __init__(self, endpoint: str, timeout: float = 120.0):
|
|
99
|
+
def __init__(self, endpoint: str, timeout: float = 120.0, model_family: str = DEFAULT_MODEL_FAMILY):
|
|
98
100
|
if not endpoint:
|
|
99
101
|
raise ValueError("LLM endpoint is required")
|
|
100
102
|
self.endpoint = endpoint.rstrip("/")
|
|
101
103
|
self.default_timeout = timeout
|
|
104
|
+
self.model_family = model_family
|
|
105
|
+
self._stop_sequences = get_stop_sequences(model_family)
|
|
102
106
|
self.client = httpx.AsyncClient(
|
|
103
107
|
timeout=timeout,
|
|
104
108
|
limits=httpx.Limits(
|
|
@@ -255,9 +259,10 @@ class LLMProxy:
|
|
|
255
259
|
payload["id_slot"] = 0
|
|
256
260
|
logger.info("KV cache: cache_prompt=true, id_slot=0 (verify -np 1 on server)")
|
|
257
261
|
|
|
258
|
-
# Disable thinking mode
|
|
262
|
+
# Disable thinking mode (for Qwen3/3.5).
|
|
263
|
+
# Must use chat_template_kwargs — top-level enable_thinking is ignored by llama.cpp b8467+.
|
|
259
264
|
if req.disable_thinking:
|
|
260
|
-
payload["enable_thinking"] = False
|
|
265
|
+
payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
|
|
261
266
|
|
|
262
267
|
# Note: JSON mode is not compatible with streaming in llama.cpp
|
|
263
268
|
# If force_json is requested, fall back to non-streaming
|
|
@@ -524,7 +529,7 @@ class LLMProxy:
|
|
|
524
529
|
max_tokens=d.get("max_tokens", 512),
|
|
525
530
|
timeout=timeout,
|
|
526
531
|
sampling=sampling,
|
|
527
|
-
stop=d.get("stop",
|
|
532
|
+
stop=d.get("stop", self._stop_sequences.copy()),
|
|
528
533
|
disable_thinking=d.get("disable_thinking", False),
|
|
529
534
|
require_content=d.get("require_content", False),
|
|
530
535
|
force_json=d.get("force_json", False),
|
|
@@ -571,9 +576,10 @@ class LLMProxy:
|
|
|
571
576
|
payload["id_slot"] = 0
|
|
572
577
|
logger.info("KV cache: cache_prompt=true, id_slot=0 (verify -np 1 on server)")
|
|
573
578
|
|
|
574
|
-
# Disable thinking mode
|
|
579
|
+
# Disable thinking mode (for Qwen3/3.5).
|
|
580
|
+
# Must use chat_template_kwargs — top-level enable_thinking is ignored by llama.cpp b8467+.
|
|
575
581
|
if req.disable_thinking:
|
|
576
|
-
payload["enable_thinking"] = False
|
|
582
|
+
payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
|
|
577
583
|
|
|
578
584
|
# Force JSON output if requested
|
|
579
585
|
if req.force_json:
|
|
@@ -717,14 +723,12 @@ class LLMProxy:
|
|
|
717
723
|
return -1
|
|
718
724
|
|
|
719
725
|
def _strip_chat_markers(self, content: str) -> str:
|
|
720
|
-
"""Remove content after
|
|
721
|
-
markers = [
|
|
722
|
-
"<|im_end|>", "<|im_start|>", "<|endoftext|>",
|
|
723
|
-
"</s>", "<|end|>", "<|user|>", "<|assistant|>",
|
|
724
|
-
]
|
|
726
|
+
"""Remove content after chat markers that indicate hallucinated turns.
|
|
725
727
|
|
|
728
|
+
Uses a superset of all model families' tokens as a safety net.
|
|
729
|
+
"""
|
|
726
730
|
result = content
|
|
727
|
-
for marker in
|
|
731
|
+
for marker in ALL_STOP_MARKERS:
|
|
728
732
|
if marker in result:
|
|
729
733
|
idx = result.index(marker)
|
|
730
734
|
result = result[:idx]
|