PyPI - loreguard-cli - Versions diffs - 0.15.2__tar.gz → 0.16.0__tar.gz - Mend

loreguard-cli 0.15.2.tar.gz → 0.16.0.tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: loreguard-cli
-Version: 0.15.2
+Version: 0.16.0
 Summary: Local inference client for Loreguard NPCs
 Project-URL: Homepage, https://loreguard.com
 Project-URL: Documentation, https://github.com/beyond-logic-labs/loreguard-cli#readme

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "loreguard-cli"
-version = "0.15.2"
+version = "0.16.0"
 description = "Local inference client for Loreguard NPCs"
 readme = "README.md"
 license = "MIT"

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/chunk_detector.py RENAMED Viewed

@@ -44,7 +44,7 @@ CHUNK_HYPOTHESES = {
 # Threshold for "starts new thought" classification
 # If confidence > threshold, we create a new chunk
-NEW_THOUGHT_THRESHOLD = 0.55
+NEW_THOUGHT_THRESHOLD = 0.38
 class ChunkDetector:

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/cli.py RENAMED Viewed

@@ -46,12 +46,14 @@ class LoreguardCLI:
         port: int = 8080,
         backend_url: str = "wss://api.loreguard.com/workers",
         worker_id: Optional[str] = None,
+        model_family: str = "llama3",
     ):
         self.token = token
         self.model_path = model_path
         self.model_id = model_id
         self.port = port
         self.backend_url = backend_url
+        self.model_family = model_family
         # Worker ID: use provided value, or default to sanitized hostname.
         # Validator requires ^[a-zA-Z0-9_-]{1,64}$ — replace dots with hyphens.
         raw_id = worker_id or socket.gethostname() or "worker"
@@ -209,7 +211,7 @@ class LoreguardCLI:
         # Start server
         log.info(f"Starting llama-server on port {self.port}...")
         try:
-            self._llama = LlamaServerProcess(self.model_path, port=self.port)
+            self._llama = LlamaServerProcess(self.model_path, port=self.port, model_family=self.model_family)
             self._llama.start()
             # Wait for ready
@@ -241,7 +243,7 @@ class LoreguardCLI:
         log.info(f"Worker ID: {self.worker_id}")
         try:
-            llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}")
+            llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}", model_family=self.model_family)
             # ADR-0027: Load all ML services — the client is the sole provider
             # of NLI, intent, dialogue act, and chunk capabilities.
@@ -323,6 +325,11 @@ class LoreguardCLI:
                     port=sdk_port,
                 )
                 log.info(f"SDK server listening on 127.0.0.1:{self._sdk_port}")
+                # Wire llama process for runtime model switching
+                from .http_server import set_llama_process
+                models_dir = self.model_path.parent if self.model_path else None
+                set_llama_process(self._llama, models_dir)
             except Exception as e:
                 log.error(f"Failed to start SDK server: {e}")
                 return False
@@ -460,6 +467,12 @@ Available model IDs:
         default=os.getenv("LOREGUARD_BUNDLE_DIR", ""),
         help="Loreguard bundle directory. Auto-discovers models from manifest.txt.",
     )
+    parser.add_argument(
+        "--model-family",
+        default=os.getenv("LOREGUARD_MODEL_FAMILY", "auto"),
+        choices=["auto", "llama3", "qwen3", "gemma", "chatml"],
+        help="Model family profile for chat template/stop sequences (default: auto)",
+    )
     parser.add_argument(
         "--dev",
         action="store_true",
@@ -531,6 +544,7 @@ Available model IDs:
         port=args.port,
         backend_url=args.backend,
         worker_id=args.worker_id or None,  # None will use hostname
+        model_family=args.model_family,
     )
     exit_code = asyncio.run(cli.run())

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/config.py RENAMED Viewed

@@ -49,6 +49,7 @@ class LoreguardConfig:
     dev_mode: bool = False
     context_size: int = 16384  # llama-server context window size (configurable per game)
     max_speech_tokens: int = 50  # Max tokens for NPC speech output (Pass 4). Default: 50 (~40 words)
+    model_family: str = "auto"  # Model family profile (auto, llama3, qwen3, gemma, chatml)
     def save(self) -> None:
         """Save configuration to disk."""
@@ -71,6 +72,7 @@ class LoreguardConfig:
                     dev_mode=data.get("dev_mode", False),
                     context_size=data.get("context_size", 16384),
                     max_speech_tokens=data.get("max_speech_tokens", 50),
+                    model_family=data.get("model_family", "auto"),
                 )
             except (json.JSONDecodeError, KeyError):
                 pass

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/http_server.py RENAMED Viewed

@@ -8,6 +8,8 @@ HTTP endpoints:
   GET  /api/capabilities  - Feature discovery (streaming, chunk modes)
   GET  /api/characters    - List available NPCs (proxied from engine)
   POST /api/chat          - Chat with an NPC (streaming SSE or JSON)
+  GET  /api/models        - List available GGUF models
+  POST /api/admin/reload-model - Hot-swap LLM model at runtime
 The server shares the existing tunnel connection instead of creating
 a new one, ensuring a single WebSocket connection per worker.
@@ -17,10 +19,12 @@ Uses uvicorn with socket-first binding for race-condition-free port allocation.
 import asyncio
 import json
+import os
 import threading
 import time
 import uuid
 from concurrent.futures import Future
+from pathlib import Path
 from typing import Any, Callable, Optional
 from .runtime import write_runtime_info, RuntimeInfo, get_runtime_path, get_version
@@ -63,6 +67,8 @@ class EmbeddedHTTPServer:
         self._running = False
         self._bound_socket: Optional[Any] = None
         self._ready_event = threading.Event()
+        self.llama_process: Optional[Any] = None  # LlamaServerProcess — set by RunningScreen
+        self.models_dir: Optional[Path] = None     # Path to models/ directory
     def start(self) -> int:
         """Start the HTTP server in a background thread.
@@ -483,6 +489,101 @@ class EmbeddedHTTPServer:
                     return JSONResponse(status_code=500, content=result)
                 return result
+        @app.get("/api/models")
+        async def list_models():
+            """List available GGUF models in the models directory."""
+            if not server.models_dir or not server.models_dir.exists():
+                return JSONResponse(
+                    status_code=404,
+                    content={"error": "Models directory not configured"},
+                )
+            models = []
+            active_model = None
+            if server.llama_process and hasattr(server.llama_process, "model_path"):
+                active_model = server.llama_process.model_path.name
+            for f in sorted(server.models_dir.iterdir()):
+                if f.suffix == ".gguf" and f.is_file():
+                    models.append({
+                        "name": f.name,
+                        "size": f.stat().st_size,
+                        "active": f.name == active_model,
+                    })
+            return {"models": models, "activeModel": active_model}
+        @app.post("/api/admin/reload-model")
+        async def reload_model(request: Request):
+            """Hot-swap the LLM model by restarting llama-server."""
+            if not server.llama_process:
+                return JSONResponse(
+                    status_code=503,
+                    content={"error": "LLM server not available"},
+                )
+            if not server.models_dir:
+                return JSONResponse(
+                    status_code=503,
+                    content={"error": "Models directory not configured"},
+                )
+            body = await request.json()
+            model_name = body.get("model", "")
+            if not model_name:
+                return JSONResponse(
+                    status_code=400,
+                    content={"error": "Missing 'model' field"},
+                )
+            # Security: prevent path traversal
+            if "/" in model_name or "\\" in model_name or ".." in model_name:
+                return JSONResponse(
+                    status_code=400,
+                    content={"error": "Invalid model name"},
+                )
+            model_path = server.models_dir / model_name
+            if not model_path.exists() or not model_path.suffix == ".gguf":
+                return JSONResponse(
+                    status_code=404,
+                    content={"error": f"Model '{model_name}' not found"},
+                )
+            # Check if already active
+            if hasattr(server.llama_process, "model_path") and server.llama_process.model_path.name == model_name:
+                return {"status": "already_active", "model": model_name}
+            try:
+                # Stop current llama-server
+                server.llama_process.stop()
+                # Update model path and restart
+                server.llama_process.model_path = model_path
+                server.llama_process.start()
+                # Wait for health check (llama-server takes a few seconds to load model)
+                import httpx
+                llama_url = f"http://127.0.0.1:{server.llama_process.port}/health"
+                for attempt in range(60):  # 60 attempts × 0.5s = 30s timeout
+                    await asyncio.sleep(0.5)
+                    try:
+                        async with httpx.AsyncClient(timeout=2.0) as client:
+                            resp = await client.get(llama_url)
+                            if resp.status_code == 200:
+                                return {"status": "ok", "model": model_name}
+                    except Exception:
+                        continue
+                return JSONResponse(
+                    status_code=500,
+                    content={"error": "Model loaded but health check timed out after 30s"},
+                )
+            except Exception as e:
+                return JSONResponse(
+                    status_code=500,
+                    content={"error": f"Failed to reload model: {e}"},
+                )
         # Write runtime info
         with open(debug_path, "a") as f:
             f.write(f"[SDK Server] Writing runtime info for port {self.actual_port}...\n")
@@ -610,6 +711,15 @@ def force_stop_sdk_server() -> None:
         _server = None
+def set_llama_process(llama_process: Any, models_dir: Optional[Path] = None) -> None:
+    """Set the LlamaServerProcess reference on the SDK server for model management."""
+    global _server
+    if _server:
+        _server.llama_process = llama_process
+        if models_dir:
+            _server.models_dir = models_dir
 def update_backend_status(connected: bool) -> None:
     """Update backend connection status in runtime.json."""
     global _server

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/llama_server.py RENAMED Viewed

@@ -372,11 +372,13 @@ class LlamaServerProcess:
         port: int = 8080,
         lora_path: Optional[Path] = None,
         context_size: int = 16384,
+        model_family: str = "llama3",
     ):
         self.model_path = model_path
         self.port = port
         self.lora_path = lora_path
         self.context_size = context_size
+        self.model_family = model_family
         self.process: Optional[subprocess.Popen] = None
         self._output_lines: list[str] = []
@@ -406,13 +408,18 @@ class LlamaServerProcess:
             # Without this, llama-server may allocate multiple slots, each consuming
             # KV cache memory proportional to context_size * model_hidden_dim.
             "-np", "1",
-            # Use custom Jinja template without tool-calling logic.
-            # Llama 3.1's built-in template forces tool-calling format even without tools,
-            # so we use a stripped-down template that only handles chat messages.
+            # Enable Jinja template processing (required for both custom and embedded templates)
             "--jinja",
-            "--chat-template-file", str(_get_templates_dir() / "llama31-no-tools.jinja"),
         ]
+        # Apply model-family-specific chat template override.
+        # Llama 3.1 requires a custom template to avoid the tool-calling bug;
+        # other families use their GGUF-embedded template (--jinja alone).
+        from .model_families import get_model_family
+        family = get_model_family(self.model_family)
+        if family.chat_template_file:
+            cmd.extend(["--chat-template-file", str(_get_templates_dir() / family.chat_template_file)])
         # Add LoRA adapter if specified
         if self.lora_path and self.lora_path.exists():
             cmd.extend(["--lora", str(self.lora_path)])

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/llm.py RENAMED Viewed

@@ -37,16 +37,16 @@ class SamplingConfig:
     presence_penalty: float = 0.0
-# Default stop sequences - ChatML/instruction markers that signal end of turn
-DEFAULT_STOP_SEQUENCES = [
-    "<|im_end|>",
-    "<|im_start|>",
-    "<|endoftext|>",
-    "</s>",
-    "<|end|>",
-    "<|user|>",
-    "<|assistant|>",
-]
+from .model_families import get_model_family, ALL_STOP_MARKERS, DEFAULT_MODEL_FAMILY
+def get_stop_sequences(model_family: str = DEFAULT_MODEL_FAMILY) -> list[str]:
+    """Get stop sequences for the given model family."""
+    return list(get_model_family(model_family).stop_sequences)
+# Backward-compatible default (Llama 3 stop sequences)
+DEFAULT_STOP_SEQUENCES = get_stop_sequences(DEFAULT_MODEL_FAMILY)
 @dataclass
@@ -94,11 +94,13 @@ class LLMProxy:
     sampling configuration, stop sequences, and JSON mode support.
     """
-    def __init__(self, endpoint: str, timeout: float = 120.0):
+    def __init__(self, endpoint: str, timeout: float = 120.0, model_family: str = DEFAULT_MODEL_FAMILY):
         if not endpoint:
             raise ValueError("LLM endpoint is required")
         self.endpoint = endpoint.rstrip("/")
         self.default_timeout = timeout
+        self.model_family = model_family
+        self._stop_sequences = get_stop_sequences(model_family)
         self.client = httpx.AsyncClient(
             timeout=timeout,
             limits=httpx.Limits(
@@ -524,7 +526,7 @@ class LLMProxy:
             max_tokens=d.get("max_tokens", 512),
             timeout=timeout,
             sampling=sampling,
-            stop=d.get("stop", DEFAULT_STOP_SEQUENCES.copy()),
+            stop=d.get("stop", self._stop_sequences.copy()),
             disable_thinking=d.get("disable_thinking", False),
             require_content=d.get("require_content", False),
             force_json=d.get("force_json", False),
@@ -717,14 +719,12 @@ class LLMProxy:
         return -1
     def _strip_chat_markers(self, content: str) -> str:
-        """Remove content after ChatML markers that indicate hallucinated turns."""
-        markers = [
-            "<|im_end|>", "<|im_start|>", "<|endoftext|>",
-            "</s>", "<|end|>", "<|user|>", "<|assistant|>",
-        ]
+        """Remove content after chat markers that indicate hallucinated turns.
+        Uses a superset of all model families' tokens as a safety net.
+        """
         result = content
-        for marker in markers:
+        for marker in ALL_STOP_MARKERS:
             if marker in result:
                 idx = result.index(marker)
                 result = result[:idx]

loreguard_cli-0.16.0/src/model_families.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""Model family profiles for chat template and stop sequence configuration.
+Different model families (Llama, Qwen, Gemma, etc.) use different chat template
+formats and stop tokens. This module provides preconfigured profiles so users
+can switch models without manually adjusting server flags.
+"""
+import logging
+from dataclasses import dataclass
+from typing import Optional
+logger = logging.getLogger(__name__)
+@dataclass(frozen=True)
+class ModelFamilyProfile:
+    """Preconfigured settings for a model family.
+    Attributes:
+        id: Unique identifier (used in config.json).
+        name: Human-readable display name.
+        chat_template_file: Jinja template filename (relative to templates/).
+            None means use the model's GGUF-embedded template via --jinja.
+        stop_sequences: Model-family-specific stop tokens for generation.
+        description: Short description for UI display.
+    """
+    id: str
+    name: str
+    chat_template_file: Optional[str]
+    stop_sequences: tuple[str, ...]
+    description: str
+# Registry of known model family profiles.
+# Key = profile ID (stored in config.json as model_family).
+MODEL_FAMILIES: dict[str, ModelFamilyProfile] = {
+    "auto": ModelFamilyProfile(
+        id="auto",
+        name="Auto (Model Embedded)",
+        chat_template_file=None,
+        stop_sequences=(
+            # Superset — works for any model, extra tokens are inert
+            "<|im_end|>", "<|im_start|>", "<|endoftext|>",
+            "<|eot_id|>", "<|end_of_text|>",
+            "<end_of_turn>", "<start_of_turn>",
+            "</s>", "<|end|>",
+        ),
+        description="Uses model's embedded chat template. Works for most models.",
+    ),
+    "llama3": ModelFamilyProfile(
+        id="llama3",
+        name="Llama 3 / 3.1",
+        chat_template_file="llama31-no-tools.jinja",
+        stop_sequences=(
+            "<|eot_id|>",
+            "<|end_of_text|>",
+        ),
+        description="Meta Llama 3.x series. Uses custom template to disable tool-calling.",
+    ),
+    "qwen3": ModelFamilyProfile(
+        id="qwen3",
+        name="Qwen 3 / 3.5",
+        chat_template_file=None,
+        stop_sequences=(
+            "<|im_end|>",
+            "<|im_start|>",
+            "<|endoftext|>",
+        ),
+        description="Alibaba Qwen 3.x series. ChatML format with thinking support.",
+    ),
+    "gemma": ModelFamilyProfile(
+        id="gemma",
+        name="Google Gemma",
+        chat_template_file=None,
+        stop_sequences=(
+            "<end_of_turn>",
+            "<start_of_turn>",
+        ),
+        description="Google Gemma models. Uses model-embedded template.",
+    ),
+    "chatml": ModelFamilyProfile(
+        id="chatml",
+        name="ChatML (Generic)",
+        chat_template_file=None,
+        stop_sequences=(
+            "<|im_end|>",
+            "<|im_start|>",
+            "<|endoftext|>",
+            "</s>",
+        ),
+        description="Generic ChatML-compatible models (Nous Hermes, OpenChat, etc.).",
+    ),
+}
+DEFAULT_MODEL_FAMILY = "auto"
+def get_model_family(family_id: str) -> ModelFamilyProfile:
+    """Get a model family profile by ID.
+    Falls back to DEFAULT_MODEL_FAMILY if the ID is unknown.
+    """
+    profile = MODEL_FAMILIES.get(family_id)
+    if profile is None:
+        logger.warning(
+            "Unknown model family '%s', falling back to '%s'. Valid: %s",
+            family_id, DEFAULT_MODEL_FAMILY, ", ".join(MODEL_FAMILIES.keys()),
+        )
+        profile = MODEL_FAMILIES[DEFAULT_MODEL_FAMILY]
+    return profile
+# Superset of all stop markers across all families.
+# Used for _strip_chat_markers() safety net — catches markers from ANY model family.
+ALL_STOP_MARKERS: tuple[str, ...] = tuple(sorted(set(
+    marker
+    for profile in MODEL_FAMILIES.values()
+    for marker in profile.stop_sequences
+) | {
+    "</s>", "<|end|>", "<|user|>", "<|assistant|>",
+}))

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/app.py RENAMED Viewed

@@ -55,6 +55,7 @@ class LoreguardApp(App):
     worker_id: str = ""
     model_path: Optional[Path] = None
     adapter_path: Optional[Path] = None  # Optional LoRA adapter
+    model_family: str = "auto"  # Model family profile (auto, llama3, qwen3, gemma, chatml)
     hardware: Optional[HardwareData] = None
     dev_mode: bool = False
     verbose: bool = False

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/main.py RENAMED Viewed

@@ -156,6 +156,7 @@ class MainScreen(Screen):
             app.api_token = config.api_token
             app.model_path = config.get_model_path_obj()
             app.adapter_path = config.get_adapter_path_obj()
+            app.model_family = config.model_family
             app.dev_mode = config.dev_mode
             model_name = app.model_path.name if app.model_path else 'unknown'
@@ -420,7 +421,7 @@ class MainScreen(Screen):
         # Start llama-server (with optional LoRA adapter)
         self._update_status("Starting llama-server...")
-        app._llama_process = LlamaServerProcess(app.model_path, port=8080, lora_path=app.adapter_path)
+        app._llama_process = LlamaServerProcess(app.model_path, port=8080, lora_path=app.adapter_path, model_family=app.model_family)
         app._llama_process.start()
         # Wait for model to load with progress updates
@@ -507,7 +508,7 @@ class MainScreen(Screen):
         self._update_connection_status("connecting")
         try:
-            llm_proxy = LLMProxy("http://127.0.0.1:8080")
+            llm_proxy = LLMProxy("http://127.0.0.1:8080", model_family=app.model_family)
             # Load NLI service (run in thread pool to not block event loop)
             nli_service = None

{loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/running.py RENAMED Viewed

@@ -138,7 +138,7 @@ class RunningScreen(Screen):
         self._update_status("model", "Model", app.model_path.name)
         self._log(f"Starting llama-server with {app.model_path.name}", "info")
-        self._llama_process = LlamaServerProcess(app.model_path, port=8080)
+        self._llama_process = LlamaServerProcess(app.model_path, port=8080, model_family=app.model_family)
         self._llama_process.start()
         # Wait for model to load with progress updates
@@ -185,6 +185,14 @@ class RunningScreen(Screen):
         self._update_status("server", "llama-server", f"Running on :8080 ({elapsed}s)", "success")
         self._log(f"LLM ready in {elapsed}s", "success")
+        # Wire llama process to SDK server for runtime model switching
+        try:
+            from ...http_server import set_llama_process
+            models_dir = app.model_path.parent if app.model_path else None
+            set_llama_process(self._llama_process, models_dir)
+        except Exception:
+            pass  # SDK server may not be running yet in all modes
         # Connect backend
         if not app.dev_mode:
             self._update_status("backend", "Backend", "Connecting...", "info")
@@ -200,7 +208,7 @@ class RunningScreen(Screen):
                     get_dialogue_act_model_info,
                 )
-                llm_proxy = LLMProxy("http://127.0.0.1:8080")
+                llm_proxy = LLMProxy("http://127.0.0.1:8080", model_family=app.model_family)
                 # Load NLI service (run in thread pool to not block event loop)
                 nli_service = None