loreguard-cli 0.15.2__tar.gz → 0.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/PKG-INFO +1 -1
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/pyproject.toml +1 -1
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/chunk_detector.py +1 -1
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/cli.py +16 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/config.py +2 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/http_server.py +110 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/llama_server.py +11 -4
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/llm.py +18 -18
- loreguard_cli-0.16.0/src/model_families.py +121 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/app.py +1 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/main.py +3 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/running.py +10 -2
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.claude/skills/llama-cpp-troubleshooting/SKILL.md +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.env.example +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.github/workflows/release.yml +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.gitignore +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/LICENSE +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/README.md +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/THIRD_PARTY_NOTICES.md +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/loreguard.spec +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/loreguard_entry.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/scripts/build.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/API.md +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/csharp/LoreguardSDK.cs +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/gdscript/LoreguardSDK.gd +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/javascript/loreguard-sdk.js +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/python/loreguard_sdk.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/__main__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/dialogue_act_classifier.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/hf_discovery.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/intent_classifier.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/main.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/models_registry.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/nli.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/npc_chat.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/runtime.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/steam.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/term_ui.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/auth_menu.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/npc_chat.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/token_input.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/unified_palette.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/auth.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/model_select.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/nli_setup.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/styles.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/__init__.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/banner.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/footer.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/hardware_info.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/npc_chat.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/server_monitor.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/status_panel.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tunnel.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/wizard.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/templates/llama31-no-tools.jinja +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/tests/test_intent_classifier.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/tests/test_nli_hhem.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/tests/test_websocket_timeout.py +0 -0
- {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/uv.lock +0 -0
|
@@ -46,12 +46,14 @@ class LoreguardCLI:
|
|
|
46
46
|
port: int = 8080,
|
|
47
47
|
backend_url: str = "wss://api.loreguard.com/workers",
|
|
48
48
|
worker_id: Optional[str] = None,
|
|
49
|
+
model_family: str = "llama3",
|
|
49
50
|
):
|
|
50
51
|
self.token = token
|
|
51
52
|
self.model_path = model_path
|
|
52
53
|
self.model_id = model_id
|
|
53
54
|
self.port = port
|
|
54
55
|
self.backend_url = backend_url
|
|
56
|
+
self.model_family = model_family
|
|
55
57
|
# Worker ID: use provided value, or default to sanitized hostname.
|
|
56
58
|
# Validator requires ^[a-zA-Z0-9_-]{1,64}$ — replace dots with hyphens.
|
|
57
59
|
raw_id = worker_id or socket.gethostname() or "worker"
|
|
@@ -209,7 +211,7 @@ class LoreguardCLI:
|
|
|
209
211
|
# Start server
|
|
210
212
|
log.info(f"Starting llama-server on port {self.port}...")
|
|
211
213
|
try:
|
|
212
|
-
self._llama = LlamaServerProcess(self.model_path, port=self.port)
|
|
214
|
+
self._llama = LlamaServerProcess(self.model_path, port=self.port, model_family=self.model_family)
|
|
213
215
|
self._llama.start()
|
|
214
216
|
|
|
215
217
|
# Wait for ready
|
|
@@ -241,7 +243,7 @@ class LoreguardCLI:
|
|
|
241
243
|
log.info(f"Worker ID: {self.worker_id}")
|
|
242
244
|
|
|
243
245
|
try:
|
|
244
|
-
llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}")
|
|
246
|
+
llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}", model_family=self.model_family)
|
|
245
247
|
|
|
246
248
|
# ADR-0027: Load all ML services — the client is the sole provider
|
|
247
249
|
# of NLI, intent, dialogue act, and chunk capabilities.
|
|
@@ -323,6 +325,11 @@ class LoreguardCLI:
|
|
|
323
325
|
port=sdk_port,
|
|
324
326
|
)
|
|
325
327
|
log.info(f"SDK server listening on 127.0.0.1:{self._sdk_port}")
|
|
328
|
+
|
|
329
|
+
# Wire llama process for runtime model switching
|
|
330
|
+
from .http_server import set_llama_process
|
|
331
|
+
models_dir = self.model_path.parent if self.model_path else None
|
|
332
|
+
set_llama_process(self._llama, models_dir)
|
|
326
333
|
except Exception as e:
|
|
327
334
|
log.error(f"Failed to start SDK server: {e}")
|
|
328
335
|
return False
|
|
@@ -460,6 +467,12 @@ Available model IDs:
|
|
|
460
467
|
default=os.getenv("LOREGUARD_BUNDLE_DIR", ""),
|
|
461
468
|
help="Loreguard bundle directory. Auto-discovers models from manifest.txt.",
|
|
462
469
|
)
|
|
470
|
+
parser.add_argument(
|
|
471
|
+
"--model-family",
|
|
472
|
+
default=os.getenv("LOREGUARD_MODEL_FAMILY", "auto"),
|
|
473
|
+
choices=["auto", "llama3", "qwen3", "gemma", "chatml"],
|
|
474
|
+
help="Model family profile for chat template/stop sequences (default: auto)",
|
|
475
|
+
)
|
|
463
476
|
parser.add_argument(
|
|
464
477
|
"--dev",
|
|
465
478
|
action="store_true",
|
|
@@ -531,6 +544,7 @@ Available model IDs:
|
|
|
531
544
|
port=args.port,
|
|
532
545
|
backend_url=args.backend,
|
|
533
546
|
worker_id=args.worker_id or None, # None will use hostname
|
|
547
|
+
model_family=args.model_family,
|
|
534
548
|
)
|
|
535
549
|
|
|
536
550
|
exit_code = asyncio.run(cli.run())
|
|
@@ -49,6 +49,7 @@ class LoreguardConfig:
|
|
|
49
49
|
dev_mode: bool = False
|
|
50
50
|
context_size: int = 16384 # llama-server context window size (configurable per game)
|
|
51
51
|
max_speech_tokens: int = 50 # Max tokens for NPC speech output (Pass 4). Default: 50 (~40 words)
|
|
52
|
+
model_family: str = "auto" # Model family profile (auto, llama3, qwen3, gemma, chatml)
|
|
52
53
|
|
|
53
54
|
def save(self) -> None:
|
|
54
55
|
"""Save configuration to disk."""
|
|
@@ -71,6 +72,7 @@ class LoreguardConfig:
|
|
|
71
72
|
dev_mode=data.get("dev_mode", False),
|
|
72
73
|
context_size=data.get("context_size", 16384),
|
|
73
74
|
max_speech_tokens=data.get("max_speech_tokens", 50),
|
|
75
|
+
model_family=data.get("model_family", "auto"),
|
|
74
76
|
)
|
|
75
77
|
except (json.JSONDecodeError, KeyError):
|
|
76
78
|
pass
|
|
@@ -8,6 +8,8 @@ HTTP endpoints:
|
|
|
8
8
|
GET /api/capabilities - Feature discovery (streaming, chunk modes)
|
|
9
9
|
GET /api/characters - List available NPCs (proxied from engine)
|
|
10
10
|
POST /api/chat - Chat with an NPC (streaming SSE or JSON)
|
|
11
|
+
GET /api/models - List available GGUF models
|
|
12
|
+
POST /api/admin/reload-model - Hot-swap LLM model at runtime
|
|
11
13
|
|
|
12
14
|
The server shares the existing tunnel connection instead of creating
|
|
13
15
|
a new one, ensuring a single WebSocket connection per worker.
|
|
@@ -17,10 +19,12 @@ Uses uvicorn with socket-first binding for race-condition-free port allocation.
|
|
|
17
19
|
|
|
18
20
|
import asyncio
|
|
19
21
|
import json
|
|
22
|
+
import os
|
|
20
23
|
import threading
|
|
21
24
|
import time
|
|
22
25
|
import uuid
|
|
23
26
|
from concurrent.futures import Future
|
|
27
|
+
from pathlib import Path
|
|
24
28
|
from typing import Any, Callable, Optional
|
|
25
29
|
|
|
26
30
|
from .runtime import write_runtime_info, RuntimeInfo, get_runtime_path, get_version
|
|
@@ -63,6 +67,8 @@ class EmbeddedHTTPServer:
|
|
|
63
67
|
self._running = False
|
|
64
68
|
self._bound_socket: Optional[Any] = None
|
|
65
69
|
self._ready_event = threading.Event()
|
|
70
|
+
self.llama_process: Optional[Any] = None # LlamaServerProcess — set by RunningScreen
|
|
71
|
+
self.models_dir: Optional[Path] = None # Path to models/ directory
|
|
66
72
|
|
|
67
73
|
def start(self) -> int:
|
|
68
74
|
"""Start the HTTP server in a background thread.
|
|
@@ -483,6 +489,101 @@ class EmbeddedHTTPServer:
|
|
|
483
489
|
return JSONResponse(status_code=500, content=result)
|
|
484
490
|
return result
|
|
485
491
|
|
|
492
|
+
@app.get("/api/models")
|
|
493
|
+
async def list_models():
|
|
494
|
+
"""List available GGUF models in the models directory."""
|
|
495
|
+
if not server.models_dir or not server.models_dir.exists():
|
|
496
|
+
return JSONResponse(
|
|
497
|
+
status_code=404,
|
|
498
|
+
content={"error": "Models directory not configured"},
|
|
499
|
+
)
|
|
500
|
+
|
|
501
|
+
models = []
|
|
502
|
+
active_model = None
|
|
503
|
+
if server.llama_process and hasattr(server.llama_process, "model_path"):
|
|
504
|
+
active_model = server.llama_process.model_path.name
|
|
505
|
+
|
|
506
|
+
for f in sorted(server.models_dir.iterdir()):
|
|
507
|
+
if f.suffix == ".gguf" and f.is_file():
|
|
508
|
+
models.append({
|
|
509
|
+
"name": f.name,
|
|
510
|
+
"size": f.stat().st_size,
|
|
511
|
+
"active": f.name == active_model,
|
|
512
|
+
})
|
|
513
|
+
|
|
514
|
+
return {"models": models, "activeModel": active_model}
|
|
515
|
+
|
|
516
|
+
@app.post("/api/admin/reload-model")
|
|
517
|
+
async def reload_model(request: Request):
|
|
518
|
+
"""Hot-swap the LLM model by restarting llama-server."""
|
|
519
|
+
if not server.llama_process:
|
|
520
|
+
return JSONResponse(
|
|
521
|
+
status_code=503,
|
|
522
|
+
content={"error": "LLM server not available"},
|
|
523
|
+
)
|
|
524
|
+
if not server.models_dir:
|
|
525
|
+
return JSONResponse(
|
|
526
|
+
status_code=503,
|
|
527
|
+
content={"error": "Models directory not configured"},
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
body = await request.json()
|
|
531
|
+
model_name = body.get("model", "")
|
|
532
|
+
if not model_name:
|
|
533
|
+
return JSONResponse(
|
|
534
|
+
status_code=400,
|
|
535
|
+
content={"error": "Missing 'model' field"},
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
# Security: prevent path traversal
|
|
539
|
+
if "/" in model_name or "\\" in model_name or ".." in model_name:
|
|
540
|
+
return JSONResponse(
|
|
541
|
+
status_code=400,
|
|
542
|
+
content={"error": "Invalid model name"},
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
model_path = server.models_dir / model_name
|
|
546
|
+
if not model_path.exists() or not model_path.suffix == ".gguf":
|
|
547
|
+
return JSONResponse(
|
|
548
|
+
status_code=404,
|
|
549
|
+
content={"error": f"Model '{model_name}' not found"},
|
|
550
|
+
)
|
|
551
|
+
|
|
552
|
+
# Check if already active
|
|
553
|
+
if hasattr(server.llama_process, "model_path") and server.llama_process.model_path.name == model_name:
|
|
554
|
+
return {"status": "already_active", "model": model_name}
|
|
555
|
+
|
|
556
|
+
try:
|
|
557
|
+
# Stop current llama-server
|
|
558
|
+
server.llama_process.stop()
|
|
559
|
+
|
|
560
|
+
# Update model path and restart
|
|
561
|
+
server.llama_process.model_path = model_path
|
|
562
|
+
server.llama_process.start()
|
|
563
|
+
|
|
564
|
+
# Wait for health check (llama-server takes a few seconds to load model)
|
|
565
|
+
import httpx
|
|
566
|
+
llama_url = f"http://127.0.0.1:{server.llama_process.port}/health"
|
|
567
|
+
for attempt in range(60): # 60 attempts × 0.5s = 30s timeout
|
|
568
|
+
await asyncio.sleep(0.5)
|
|
569
|
+
try:
|
|
570
|
+
async with httpx.AsyncClient(timeout=2.0) as client:
|
|
571
|
+
resp = await client.get(llama_url)
|
|
572
|
+
if resp.status_code == 200:
|
|
573
|
+
return {"status": "ok", "model": model_name}
|
|
574
|
+
except Exception:
|
|
575
|
+
continue
|
|
576
|
+
|
|
577
|
+
return JSONResponse(
|
|
578
|
+
status_code=500,
|
|
579
|
+
content={"error": "Model loaded but health check timed out after 30s"},
|
|
580
|
+
)
|
|
581
|
+
except Exception as e:
|
|
582
|
+
return JSONResponse(
|
|
583
|
+
status_code=500,
|
|
584
|
+
content={"error": f"Failed to reload model: {e}"},
|
|
585
|
+
)
|
|
586
|
+
|
|
486
587
|
# Write runtime info
|
|
487
588
|
with open(debug_path, "a") as f:
|
|
488
589
|
f.write(f"[SDK Server] Writing runtime info for port {self.actual_port}...\n")
|
|
@@ -610,6 +711,15 @@ def force_stop_sdk_server() -> None:
|
|
|
610
711
|
_server = None
|
|
611
712
|
|
|
612
713
|
|
|
714
|
+
def set_llama_process(llama_process: Any, models_dir: Optional[Path] = None) -> None:
|
|
715
|
+
"""Set the LlamaServerProcess reference on the SDK server for model management."""
|
|
716
|
+
global _server
|
|
717
|
+
if _server:
|
|
718
|
+
_server.llama_process = llama_process
|
|
719
|
+
if models_dir:
|
|
720
|
+
_server.models_dir = models_dir
|
|
721
|
+
|
|
722
|
+
|
|
613
723
|
def update_backend_status(connected: bool) -> None:
|
|
614
724
|
"""Update backend connection status in runtime.json."""
|
|
615
725
|
global _server
|
|
@@ -372,11 +372,13 @@ class LlamaServerProcess:
|
|
|
372
372
|
port: int = 8080,
|
|
373
373
|
lora_path: Optional[Path] = None,
|
|
374
374
|
context_size: int = 16384,
|
|
375
|
+
model_family: str = "llama3",
|
|
375
376
|
):
|
|
376
377
|
self.model_path = model_path
|
|
377
378
|
self.port = port
|
|
378
379
|
self.lora_path = lora_path
|
|
379
380
|
self.context_size = context_size
|
|
381
|
+
self.model_family = model_family
|
|
380
382
|
self.process: Optional[subprocess.Popen] = None
|
|
381
383
|
self._output_lines: list[str] = []
|
|
382
384
|
|
|
@@ -406,13 +408,18 @@ class LlamaServerProcess:
|
|
|
406
408
|
# Without this, llama-server may allocate multiple slots, each consuming
|
|
407
409
|
# KV cache memory proportional to context_size * model_hidden_dim.
|
|
408
410
|
"-np", "1",
|
|
409
|
-
#
|
|
410
|
-
# Llama 3.1's built-in template forces tool-calling format even without tools,
|
|
411
|
-
# so we use a stripped-down template that only handles chat messages.
|
|
411
|
+
# Enable Jinja template processing (required for both custom and embedded templates)
|
|
412
412
|
"--jinja",
|
|
413
|
-
"--chat-template-file", str(_get_templates_dir() / "llama31-no-tools.jinja"),
|
|
414
413
|
]
|
|
415
414
|
|
|
415
|
+
# Apply model-family-specific chat template override.
|
|
416
|
+
# Llama 3.1 requires a custom template to avoid the tool-calling bug;
|
|
417
|
+
# other families use their GGUF-embedded template (--jinja alone).
|
|
418
|
+
from .model_families import get_model_family
|
|
419
|
+
family = get_model_family(self.model_family)
|
|
420
|
+
if family.chat_template_file:
|
|
421
|
+
cmd.extend(["--chat-template-file", str(_get_templates_dir() / family.chat_template_file)])
|
|
422
|
+
|
|
416
423
|
# Add LoRA adapter if specified
|
|
417
424
|
if self.lora_path and self.lora_path.exists():
|
|
418
425
|
cmd.extend(["--lora", str(self.lora_path)])
|
|
@@ -37,16 +37,16 @@ class SamplingConfig:
|
|
|
37
37
|
presence_penalty: float = 0.0
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
"
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
40
|
+
from .model_families import get_model_family, ALL_STOP_MARKERS, DEFAULT_MODEL_FAMILY
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_stop_sequences(model_family: str = DEFAULT_MODEL_FAMILY) -> list[str]:
|
|
44
|
+
"""Get stop sequences for the given model family."""
|
|
45
|
+
return list(get_model_family(model_family).stop_sequences)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Backward-compatible default (Llama 3 stop sequences)
|
|
49
|
+
DEFAULT_STOP_SEQUENCES = get_stop_sequences(DEFAULT_MODEL_FAMILY)
|
|
50
50
|
|
|
51
51
|
|
|
52
52
|
@dataclass
|
|
@@ -94,11 +94,13 @@ class LLMProxy:
|
|
|
94
94
|
sampling configuration, stop sequences, and JSON mode support.
|
|
95
95
|
"""
|
|
96
96
|
|
|
97
|
-
def __init__(self, endpoint: str, timeout: float = 120.0):
|
|
97
|
+
def __init__(self, endpoint: str, timeout: float = 120.0, model_family: str = DEFAULT_MODEL_FAMILY):
|
|
98
98
|
if not endpoint:
|
|
99
99
|
raise ValueError("LLM endpoint is required")
|
|
100
100
|
self.endpoint = endpoint.rstrip("/")
|
|
101
101
|
self.default_timeout = timeout
|
|
102
|
+
self.model_family = model_family
|
|
103
|
+
self._stop_sequences = get_stop_sequences(model_family)
|
|
102
104
|
self.client = httpx.AsyncClient(
|
|
103
105
|
timeout=timeout,
|
|
104
106
|
limits=httpx.Limits(
|
|
@@ -524,7 +526,7 @@ class LLMProxy:
|
|
|
524
526
|
max_tokens=d.get("max_tokens", 512),
|
|
525
527
|
timeout=timeout,
|
|
526
528
|
sampling=sampling,
|
|
527
|
-
stop=d.get("stop", DEFAULT_STOP_SEQUENCES.copy()),
|
|
529
|
+
stop=d.get("stop", self._stop_sequences.copy()),
|
|
528
530
|
disable_thinking=d.get("disable_thinking", False),
|
|
529
531
|
require_content=d.get("require_content", False),
|
|
530
532
|
force_json=d.get("force_json", False),
|
|
@@ -717,14 +719,12 @@ class LLMProxy:
|
|
|
717
719
|
return -1
|
|
718
720
|
|
|
719
721
|
def _strip_chat_markers(self, content: str) -> str:
|
|
720
|
-
"""Remove content after
|
|
721
|
-
markers = [
|
|
722
|
-
"<|im_end|>", "<|im_start|>", "<|endoftext|>",
|
|
723
|
-
"</s>", "<|end|>", "<|user|>", "<|assistant|>",
|
|
724
|
-
]
|
|
722
|
+
"""Remove content after chat markers that indicate hallucinated turns.
|
|
725
723
|
|
|
724
|
+
Uses a superset of all model families' tokens as a safety net.
|
|
725
|
+
"""
|
|
726
726
|
result = content
|
|
727
|
-
for marker in markers:
|
|
727
|
+
for marker in ALL_STOP_MARKERS:
|
|
728
728
|
if marker in result:
|
|
729
729
|
idx = result.index(marker)
|
|
730
730
|
result = result[:idx]
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Model family profiles for chat template and stop sequence configuration.
|
|
2
|
+
|
|
3
|
+
Different model families (Llama, Qwen, Gemma, etc.) use different chat template
|
|
4
|
+
formats and stop tokens. This module provides preconfigured profiles so users
|
|
5
|
+
can switch models without manually adjusting server flags.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
|
|
16
|
+
class ModelFamilyProfile:
|
|
17
|
+
"""Preconfigured settings for a model family.
|
|
18
|
+
|
|
19
|
+
Attributes:
|
|
20
|
+
id: Unique identifier (used in config.json).
|
|
21
|
+
name: Human-readable display name.
|
|
22
|
+
chat_template_file: Jinja template filename (relative to templates/).
|
|
23
|
+
None means use the model's GGUF-embedded template via --jinja.
|
|
24
|
+
stop_sequences: Model-family-specific stop tokens for generation.
|
|
25
|
+
description: Short description for UI display.
|
|
26
|
+
"""
|
|
27
|
+
id: str
|
|
28
|
+
name: str
|
|
29
|
+
chat_template_file: Optional[str]
|
|
30
|
+
stop_sequences: tuple[str, ...]
|
|
31
|
+
description: str
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Registry of known model family profiles.
|
|
35
|
+
# Key = profile ID (stored in config.json as model_family).
|
|
36
|
+
MODEL_FAMILIES: dict[str, ModelFamilyProfile] = {
|
|
37
|
+
"auto": ModelFamilyProfile(
|
|
38
|
+
id="auto",
|
|
39
|
+
name="Auto (Model Embedded)",
|
|
40
|
+
chat_template_file=None,
|
|
41
|
+
stop_sequences=(
|
|
42
|
+
# Superset — works for any model, extra tokens are inert
|
|
43
|
+
"<|im_end|>", "<|im_start|>", "<|endoftext|>",
|
|
44
|
+
"<|eot_id|>", "<|end_of_text|>",
|
|
45
|
+
"<end_of_turn>", "<start_of_turn>",
|
|
46
|
+
"</s>", "<|end|>",
|
|
47
|
+
),
|
|
48
|
+
description="Uses model's embedded chat template. Works for most models.",
|
|
49
|
+
),
|
|
50
|
+
"llama3": ModelFamilyProfile(
|
|
51
|
+
id="llama3",
|
|
52
|
+
name="Llama 3 / 3.1",
|
|
53
|
+
chat_template_file="llama31-no-tools.jinja",
|
|
54
|
+
stop_sequences=(
|
|
55
|
+
"<|eot_id|>",
|
|
56
|
+
"<|end_of_text|>",
|
|
57
|
+
),
|
|
58
|
+
description="Meta Llama 3.x series. Uses custom template to disable tool-calling.",
|
|
59
|
+
),
|
|
60
|
+
"qwen3": ModelFamilyProfile(
|
|
61
|
+
id="qwen3",
|
|
62
|
+
name="Qwen 3 / 3.5",
|
|
63
|
+
chat_template_file=None,
|
|
64
|
+
stop_sequences=(
|
|
65
|
+
"<|im_end|>",
|
|
66
|
+
"<|im_start|>",
|
|
67
|
+
"<|endoftext|>",
|
|
68
|
+
),
|
|
69
|
+
description="Alibaba Qwen 3.x series. ChatML format with thinking support.",
|
|
70
|
+
),
|
|
71
|
+
"gemma": ModelFamilyProfile(
|
|
72
|
+
id="gemma",
|
|
73
|
+
name="Google Gemma",
|
|
74
|
+
chat_template_file=None,
|
|
75
|
+
stop_sequences=(
|
|
76
|
+
"<end_of_turn>",
|
|
77
|
+
"<start_of_turn>",
|
|
78
|
+
),
|
|
79
|
+
description="Google Gemma models. Uses model-embedded template.",
|
|
80
|
+
),
|
|
81
|
+
"chatml": ModelFamilyProfile(
|
|
82
|
+
id="chatml",
|
|
83
|
+
name="ChatML (Generic)",
|
|
84
|
+
chat_template_file=None,
|
|
85
|
+
stop_sequences=(
|
|
86
|
+
"<|im_end|>",
|
|
87
|
+
"<|im_start|>",
|
|
88
|
+
"<|endoftext|>",
|
|
89
|
+
"</s>",
|
|
90
|
+
),
|
|
91
|
+
description="Generic ChatML-compatible models (Nous Hermes, OpenChat, etc.).",
|
|
92
|
+
),
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
DEFAULT_MODEL_FAMILY = "auto"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def get_model_family(family_id: str) -> ModelFamilyProfile:
|
|
99
|
+
"""Get a model family profile by ID.
|
|
100
|
+
|
|
101
|
+
Falls back to DEFAULT_MODEL_FAMILY if the ID is unknown.
|
|
102
|
+
"""
|
|
103
|
+
profile = MODEL_FAMILIES.get(family_id)
|
|
104
|
+
if profile is None:
|
|
105
|
+
logger.warning(
|
|
106
|
+
"Unknown model family '%s', falling back to '%s'. Valid: %s",
|
|
107
|
+
family_id, DEFAULT_MODEL_FAMILY, ", ".join(MODEL_FAMILIES.keys()),
|
|
108
|
+
)
|
|
109
|
+
profile = MODEL_FAMILIES[DEFAULT_MODEL_FAMILY]
|
|
110
|
+
return profile
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Superset of all stop markers across all families.
|
|
114
|
+
# Used for _strip_chat_markers() safety net — catches markers from ANY model family.
|
|
115
|
+
ALL_STOP_MARKERS: tuple[str, ...] = tuple(sorted(set(
|
|
116
|
+
marker
|
|
117
|
+
for profile in MODEL_FAMILIES.values()
|
|
118
|
+
for marker in profile.stop_sequences
|
|
119
|
+
) | {
|
|
120
|
+
"</s>", "<|end|>", "<|user|>", "<|assistant|>",
|
|
121
|
+
}))
|
|
@@ -55,6 +55,7 @@ class LoreguardApp(App):
|
|
|
55
55
|
worker_id: str = ""
|
|
56
56
|
model_path: Optional[Path] = None
|
|
57
57
|
adapter_path: Optional[Path] = None # Optional LoRA adapter
|
|
58
|
+
model_family: str = "auto" # Model family profile (auto, llama3, qwen3, gemma, chatml)
|
|
58
59
|
hardware: Optional[HardwareData] = None
|
|
59
60
|
dev_mode: bool = False
|
|
60
61
|
verbose: bool = False
|
|
@@ -156,6 +156,7 @@ class MainScreen(Screen):
|
|
|
156
156
|
app.api_token = config.api_token
|
|
157
157
|
app.model_path = config.get_model_path_obj()
|
|
158
158
|
app.adapter_path = config.get_adapter_path_obj()
|
|
159
|
+
app.model_family = config.model_family
|
|
159
160
|
app.dev_mode = config.dev_mode
|
|
160
161
|
|
|
161
162
|
model_name = app.model_path.name if app.model_path else 'unknown'
|
|
@@ -420,7 +421,7 @@ class MainScreen(Screen):
|
|
|
420
421
|
|
|
421
422
|
# Start llama-server (with optional LoRA adapter)
|
|
422
423
|
self._update_status("Starting llama-server...")
|
|
423
|
-
app._llama_process = LlamaServerProcess(app.model_path, port=8080, lora_path=app.adapter_path)
|
|
424
|
+
app._llama_process = LlamaServerProcess(app.model_path, port=8080, lora_path=app.adapter_path, model_family=app.model_family)
|
|
424
425
|
app._llama_process.start()
|
|
425
426
|
|
|
426
427
|
# Wait for model to load with progress updates
|
|
@@ -507,7 +508,7 @@ class MainScreen(Screen):
|
|
|
507
508
|
self._update_connection_status("connecting")
|
|
508
509
|
|
|
509
510
|
try:
|
|
510
|
-
llm_proxy = LLMProxy("http://127.0.0.1:8080")
|
|
511
|
+
llm_proxy = LLMProxy("http://127.0.0.1:8080", model_family=app.model_family)
|
|
511
512
|
|
|
512
513
|
# Load NLI service (run in thread pool to not block event loop)
|
|
513
514
|
nli_service = None
|
|
@@ -138,7 +138,7 @@ class RunningScreen(Screen):
|
|
|
138
138
|
self._update_status("model", "Model", app.model_path.name)
|
|
139
139
|
self._log(f"Starting llama-server with {app.model_path.name}", "info")
|
|
140
140
|
|
|
141
|
-
self._llama_process = LlamaServerProcess(app.model_path, port=8080)
|
|
141
|
+
self._llama_process = LlamaServerProcess(app.model_path, port=8080, model_family=app.model_family)
|
|
142
142
|
self._llama_process.start()
|
|
143
143
|
|
|
144
144
|
# Wait for model to load with progress updates
|
|
@@ -185,6 +185,14 @@ class RunningScreen(Screen):
|
|
|
185
185
|
self._update_status("server", "llama-server", f"Running on :8080 ({elapsed}s)", "success")
|
|
186
186
|
self._log(f"LLM ready in {elapsed}s", "success")
|
|
187
187
|
|
|
188
|
+
# Wire llama process to SDK server for runtime model switching
|
|
189
|
+
try:
|
|
190
|
+
from ...http_server import set_llama_process
|
|
191
|
+
models_dir = app.model_path.parent if app.model_path else None
|
|
192
|
+
set_llama_process(self._llama_process, models_dir)
|
|
193
|
+
except Exception:
|
|
194
|
+
pass # SDK server may not be running yet in all modes
|
|
195
|
+
|
|
188
196
|
# Connect backend
|
|
189
197
|
if not app.dev_mode:
|
|
190
198
|
self._update_status("backend", "Backend", "Connecting...", "info")
|
|
@@ -200,7 +208,7 @@ class RunningScreen(Screen):
|
|
|
200
208
|
get_dialogue_act_model_info,
|
|
201
209
|
)
|
|
202
210
|
|
|
203
|
-
llm_proxy = LLMProxy("http://127.0.0.1:8080")
|
|
211
|
+
llm_proxy = LLMProxy("http://127.0.0.1:8080", model_family=app.model_family)
|
|
204
212
|
|
|
205
213
|
# Load NLI service (run in thread pool to not block event loop)
|
|
206
214
|
nli_service = None
|
{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.claude/skills/llama-cpp-troubleshooting/SKILL.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|