PyPI - loreguard-cli - Versions diffs - 0.15.2__tar.gz → 0.20.2__tar.gz - Mend

loreguard-cli 0.15.2__tar.gz → 0.20.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: loreguard-cli
-Version: 0.15.2
+Version: 0.20.2
 Summary: Local inference client for Loreguard NPCs
 Project-URL: Homepage, https://loreguard.com
 Project-URL: Documentation, https://github.com/beyond-logic-labs/loreguard-cli#readme
@@ -29,7 +29,7 @@ Requires-Dist: rich>=13.0.0
 Requires-Dist: textual>=0.47.0
 Requires-Dist: tf-keras>=2.16.0
 Requires-Dist: torch>=2.0.0
-Requires-Dist: transformers>=5.0.0
+Requires-Dist: transformers<5,>=4.36.0
 Requires-Dist: uvicorn>=0.27.0
 Requires-Dist: websockets>=12.0
 Provides-Extra: build

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "loreguard-cli"
-version = "0.15.2"
+version = "0.20.2"
 description = "Local inference client for Loreguard NPCs"
 readme = "README.md"
 license = "MIT"
@@ -28,7 +28,7 @@ dependencies = [
     "aiofiles>=24.1.0",
     "rich>=13.0.0",
     "textual>=0.47.0",
-    "transformers>=5.0.0",
+    "transformers>=4.36.0,<5",
     "torch>=2.0.0",
     "fastapi>=0.109.0",
     "uvicorn>=0.27.0",

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/__main__.py RENAMED Viewed

@@ -28,6 +28,29 @@ def main():
         print(json.dumps(status, indent=2))
         sys.exit(0 if status.get("running") else 1)
+    # Handle 'download-llama-server' command - for bundle tool delegation (ADR-0027)
+    if args and args[0] == "download-llama-server":
+        import asyncio
+        from pathlib import Path
+        from .llama_server import download_llama_server
+        output_dir = None
+        for i, a in enumerate(args):
+            if a == "--output-dir" and i + 1 < len(args):
+                output_dir = Path(args[i + 1])
+        if not output_dir:
+            print("Usage: loreguard download-llama-server --output-dir <path>", file=sys.stderr)
+            sys.exit(1)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        def on_progress(msg, progress=None):
+            print(f"       {msg}")
+        asyncio.run(download_llama_server(progress_callback=on_progress, target_dir=output_dir))
+        sys.exit(0)
     # Filter out help flags - these should show CLI help
     if any(a in ('-h', '--help') for a in args):
         from .cli import main as cli_main

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/chunk_detector.py RENAMED Viewed

@@ -44,7 +44,7 @@ CHUNK_HYPOTHESES = {
 # Threshold for "starts new thought" classification
 # If confidence > threshold, we create a new chunk
-NEW_THOUGHT_THRESHOLD = 0.55
+NEW_THOUGHT_THRESHOLD = 0.38
 class ChunkDetector:

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/cli.py RENAMED Viewed

@@ -11,7 +11,8 @@ Environment variables (alternative to args):
     LOREGUARD_MODEL     Path to model file
     LOREGUARD_MODEL_ID  Model ID to download (if not using custom model)
     LOREGUARD_PORT      Local llama-server port (default: 8080)
-    LOREGUARD_BACKEND   Backend URL (default: wss://api.loreguard.com/workers)
+    LOREGUARD_BACKEND   Backend WebSocket URL (default: wss://console.loreguard.com/workers)
+    LOREGUARD_API       API base URL (default: https://console.loreguard.com)
     LOREGUARD_WORKER_ID Worker ID (default: hostname)
 """
@@ -26,6 +27,8 @@ from datetime import datetime
 from pathlib import Path
 from typing import Optional
+from .config import DEFAULT_API_URL, DEFAULT_BACKEND_URL
 # Setup logging
 logging.basicConfig(
     level=logging.INFO,
@@ -44,14 +47,16 @@ class LoreguardCLI:
         model_path: Optional[Path] = None,
         model_id: Optional[str] = None,
         port: int = 8080,
-        backend_url: str = "wss://api.loreguard.com/workers",
+        backend_url: str = DEFAULT_BACKEND_URL,
         worker_id: Optional[str] = None,
+        model_family: str = "llama3",
     ):
         self.token = token
         self.model_path = model_path
         self.model_id = model_id
         self.port = port
         self.backend_url = backend_url
+        self.model_family = model_family
         # Worker ID: use provided value, or default to sanitized hostname.
         # Validator requires ^[a-zA-Z0-9_-]{1,64}$ — replace dots with hyphens.
         raw_id = worker_id or socket.gethostname() or "worker"
@@ -209,7 +214,7 @@ class LoreguardCLI:
         # Start server
         log.info(f"Starting llama-server on port {self.port}...")
         try:
-            self._llama = LlamaServerProcess(self.model_path, port=self.port)
+            self._llama = LlamaServerProcess(self.model_path, port=self.port, model_family=self.model_family)
             self._llama.start()
             # Wait for ready
@@ -241,7 +246,7 @@ class LoreguardCLI:
         log.info(f"Worker ID: {self.worker_id}")
         try:
-            llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}")
+            llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}", model_family=self.model_family)
             # ADR-0027: Load all ML services — the client is the sole provider
             # of NLI, intent, dialogue act, and chunk capabilities.
@@ -323,6 +328,11 @@ class LoreguardCLI:
                     port=sdk_port,
                 )
                 log.info(f"SDK server listening on 127.0.0.1:{self._sdk_port}")
+                # Wire llama process for runtime model switching
+                from .http_server import set_llama_process
+                models_dir = self.model_path.parent if self.model_path else None
+                set_llama_process(self._llama, models_dir)
             except Exception as e:
                 log.error(f"Failed to start SDK server: {e}")
                 return False
@@ -447,9 +457,14 @@ Available model IDs:
     )
     parser.add_argument(
         "--backend",
-        default=os.getenv("LOREGUARD_BACKEND", "wss://api.loreguard.com/workers"),
+        default=os.getenv("LOREGUARD_BACKEND", DEFAULT_BACKEND_URL),
         help="Backend WebSocket URL",
     )
+    parser.add_argument(
+        "--api-url",
+        default=os.getenv("LOREGUARD_API", DEFAULT_API_URL),
+        help=f"API base URL (default: {DEFAULT_API_URL})",
+    )
     parser.add_argument(
         "-v", "--verbose",
         action="store_true",
@@ -460,6 +475,12 @@ Available model IDs:
         default=os.getenv("LOREGUARD_BUNDLE_DIR", ""),
         help="Loreguard bundle directory. Auto-discovers models from manifest.txt.",
     )
+    parser.add_argument(
+        "--model-family",
+        default=os.getenv("LOREGUARD_MODEL_FAMILY", "auto"),
+        choices=["auto", "llama3", "qwen3", "gemma", "chatml"],
+        help="Model family profile for chat template/stop sequences (default: auto)",
+    )
     parser.add_argument(
         "--dev",
         action="store_true",
@@ -531,6 +552,7 @@ Available model IDs:
         port=args.port,
         backend_url=args.backend,
         worker_id=args.worker_id or None,  # None will use hostname
+        model_family=args.model_family,
     )
     exit_code = asyncio.run(cli.run())

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/config.py RENAMED Viewed

@@ -49,6 +49,8 @@ class LoreguardConfig:
     dev_mode: bool = False
     context_size: int = 16384  # llama-server context window size (configurable per game)
     max_speech_tokens: int = 50  # Max tokens for NPC speech output (Pass 4). Default: 50 (~40 words)
+    model_family: str = "auto"  # Model family profile (auto, llama3, qwen3, gemma, chatml)
+    dialogue_act_enabled: bool = False  # Dialogue act classifier for filler selection
     def save(self) -> None:
         """Save configuration to disk."""
@@ -71,6 +73,8 @@ class LoreguardConfig:
                     dev_mode=data.get("dev_mode", False),
                     context_size=data.get("context_size", 16384),
                     max_speech_tokens=data.get("max_speech_tokens", 50),
+                    model_family=data.get("model_family", "auto"),
+                    dialogue_act_enabled=data.get("dialogue_act_enabled", False),
                 )
             except (json.JSONDecodeError, KeyError):
                 pass
@@ -119,6 +123,14 @@ class LoreguardConfig:
 # Environment Variable Configuration
 # =============================================================================
+DEFAULT_API_URL = "https://console.loreguard.com"
+DEFAULT_BACKEND_URL = "wss://console.loreguard.com/workers"
+def get_api_url() -> str:
+    """Get the Loreguard API base URL (configurable via LOREGUARD_API env var)."""
+    return os.getenv("LOREGUARD_API", DEFAULT_API_URL)
 @lru_cache(maxsize=1)
 def load_config() -> dict:
@@ -131,12 +143,13 @@ def load_config() -> dict:
     return {
         # Server settings
         "LLM_ENDPOINT": os.getenv("LLM_ENDPOINT", "http://localhost:8080"),
-        "BACKEND_URL": os.getenv("LOREGUARD_BACKEND", "wss://api.loreguard.com/workers"),
+        "BACKEND_URL": os.getenv("LOREGUARD_BACKEND", DEFAULT_BACKEND_URL),
+        "API_URL": os.getenv("LOREGUARD_API", DEFAULT_API_URL),
         "HOST": os.getenv("HOST", "127.0.0.1"),
         "PORT": os.getenv("PORT", "8081"),
         # Worker authentication (required for backend connection)
-        # Get API token from loreguard.com dashboard
+        # Get API token from console.loreguard.com
         "WORKER_ID": os.getenv("LOREGUARD_WORKER_ID", os.getenv("WORKER_ID", "")),
         # LOREGUARD_TOKEN is preferred, WORKER_TOKEN kept for backwards compatibility
         "LOREGUARD_TOKEN": os.getenv("LOREGUARD_TOKEN", os.getenv("WORKER_TOKEN", "")),
@@ -230,20 +243,21 @@ def get_models_dir() -> Optional[Path]:
 def resolve_model_path(model_name: str, subdir: str = "") -> str:
-    """Resolve a model path, preferring pre-shipped models over HF downloads.
+    """Resolve a model path, preferring local models over HF downloads.
     Resolution order:
     1. LOREGUARD_MODELS_DIR/<subdir>  (explicit override)
-    2. Bundle models dir using manifest.txt  (HF name → manifest key → local dir)
-    3. Bundle models dir using HF name → org--model convention  (fallback)
-    4. Original HF model name  (download from HuggingFace)
+    2. Application Support models dir/<subdir>  (standard install location)
+    3. Bundle models dir using manifest.txt  (HF name → manifest key → local dir)
+    4. Bundle models dir using HF name → org--model convention  (fallback)
+    5. Download from HuggingFace to Application Support models dir
     Args:
         model_name: HuggingFace model name (e.g., 'vectara/hallucination_evaluation_model')
         subdir: Subdirectory within MODELS_DIR to check (e.g., 'hhem', 'deberta')
     Returns:
-        Local path if pre-shipped model found, otherwise the original HF model name.
+        Local path to the model directory.
     """
     # 1. Explicit LOREGUARD_MODELS_DIR/<subdir>
     explicit_dir = get_config_value("MODELS_DIR")
@@ -252,7 +266,14 @@ def resolve_model_path(model_name: str, subdir: str = "") -> str:
         if local_path.exists() and any(local_path.iterdir()):
             return str(local_path)
-    # 2 & 3. Bundle directory resolution
+    # 2. Application Support models dir/<subdir>
+    app_models = get_data_dir() / "models"
+    if subdir:
+        local_path = app_models / subdir
+        if local_path.exists() and any(local_path.iterdir()):
+            return str(local_path)
+    # 3 & 4. Bundle directory resolution
     bundle_dir = get_bundle_dir()
     if bundle_dir:
         bundle_models = bundle_dir / "models"
@@ -273,9 +294,37 @@ def resolve_model_path(model_name: str, subdir: str = "") -> str:
         if local_path.exists() and any(local_path.iterdir()):
             return str(local_path)
+    # 5. Download from HuggingFace to Application Support models dir
+    if subdir:
+        return _download_hf_model(model_name, app_models / subdir)
     return model_name
+def _download_hf_model(model_name: str, target_dir: Path) -> str:
+    """Download a HuggingFace model to the loreguard models directory.
+    Returns:
+        Path to the downloaded model directory.
+    """
+    import logging
+    logger = logging.getLogger(__name__)
+    try:
+        from huggingface_hub import snapshot_download
+        target_dir.mkdir(parents=True, exist_ok=True)
+        logger.info(f"Downloading {model_name} to {target_dir}")
+        snapshot_download(
+            model_name,
+            local_dir=str(target_dir),
+            local_dir_use_symlinks=False,
+        )
+        logger.info(f"Downloaded {model_name} to {target_dir}")
+        return str(target_dir)
+    except Exception as e:
+        logger.warning(f"Failed to download {model_name}: {e}")
+        return model_name
 def get_config_value(key: str, default: Optional[str] = None) -> Optional[str]:
     """Get a single configuration value."""
     config = load_config()

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/http_server.py RENAMED Viewed

@@ -8,6 +8,8 @@ HTTP endpoints:
   GET  /api/capabilities  - Feature discovery (streaming, chunk modes)
   GET  /api/characters    - List available NPCs (proxied from engine)
   POST /api/chat          - Chat with an NPC (streaming SSE or JSON)
+  GET  /api/models        - List available GGUF models
+  POST /api/admin/reload-model - Hot-swap LLM model at runtime
 The server shares the existing tunnel connection instead of creating
 a new one, ensuring a single WebSocket connection per worker.
@@ -17,10 +19,12 @@ Uses uvicorn with socket-first binding for race-condition-free port allocation.
 import asyncio
 import json
+import os
 import threading
 import time
 import uuid
 from concurrent.futures import Future
+from pathlib import Path
 from typing import Any, Callable, Optional
 from .runtime import write_runtime_info, RuntimeInfo, get_runtime_path, get_version
@@ -63,6 +67,8 @@ class EmbeddedHTTPServer:
         self._running = False
         self._bound_socket: Optional[Any] = None
         self._ready_event = threading.Event()
+        self.llama_process: Optional[Any] = None  # LlamaServerProcess — set by RunningScreen
+        self.models_dir: Optional[Path] = None     # Path to models/ directory
     def start(self) -> int:
         """Start the HTTP server in a background thread.
@@ -361,7 +367,7 @@ class EmbeddedHTTPServer:
             # Derive HTTP base URL from WebSocket URL
             # ws://localhost:8090/workers → http://localhost:8090
-            # wss://api.loreguard.com/workers → https://api.loreguard.com
+            # wss://console.loreguard.com/workers → https://console.loreguard.com
             backend_ws = server.tunnel.backend_url
             if backend_ws.startswith("wss://"):
                 base_url = "https://" + backend_ws[6:].split("/")[0]
@@ -483,6 +489,118 @@ class EmbeddedHTTPServer:
                     return JSONResponse(status_code=500, content=result)
                 return result
+        @app.get("/api/models")
+        async def list_models():
+            """List available GGUF models in the models directory."""
+            if not server.models_dir or not server.models_dir.exists():
+                return JSONResponse(
+                    status_code=404,
+                    content={"error": "Models directory not configured"},
+                )
+            models = []
+            active_model = None
+            if server.llama_process and hasattr(server.llama_process, "model_path"):
+                active_model = server.llama_process.model_path.name
+            for f in sorted(server.models_dir.iterdir()):
+                if f.suffix == ".gguf" and f.is_file():
+                    models.append({
+                        "name": f.name,
+                        "size": f.stat().st_size,
+                        "active": f.name == active_model,
+                    })
+            return {"models": models, "activeModel": active_model}
+        @app.post("/api/admin/reload-model")
+        async def reload_model(request: Request):
+            """Hot-swap the LLM model by restarting llama-server."""
+            if not server.llama_process:
+                return JSONResponse(
+                    status_code=503,
+                    content={"error": "LLM server not available"},
+                )
+            if not server.models_dir:
+                return JSONResponse(
+                    status_code=503,
+                    content={"error": "Models directory not configured"},
+                )
+            body = await request.json()
+            model_name = body.get("model", "")
+            if not model_name:
+                return JSONResponse(
+                    status_code=400,
+                    content={"error": "Missing 'model' field"},
+                )
+            # Security: resolve and verify path stays inside models_dir
+            model_path = (server.models_dir / model_name).resolve()
+            if model_path.parent != server.models_dir.resolve():
+                return JSONResponse(
+                    status_code=400,
+                    content={"error": "Invalid model name"},
+                )
+            if not model_path.exists() or model_path.suffix != ".gguf":
+                return JSONResponse(
+                    status_code=404,
+                    content={"error": f"Model '{model_name}' not found"},
+                )
+            # Check if already active
+            if hasattr(server.llama_process, "model_path") and server.llama_process.model_path.name == model_name:
+                return {"status": "already_active", "model": model_name}
+            # Save original model_path for rollback on failure
+            original_model_path = server.llama_process.model_path
+            try:
+                # Stop current llama-server
+                server.llama_process.stop()
+                # Update model path and restart
+                server.llama_process.model_path = model_path
+                server.llama_process.start()
+                # Wait for health check (llama-server takes a few seconds to load model)
+                import httpx
+                llama_url = f"http://127.0.0.1:{server.llama_process.port}/health"
+                async with httpx.AsyncClient(timeout=2.0) as client:
+                    for attempt in range(60):  # 60 attempts × 0.5s = 30s timeout
+                        await asyncio.sleep(0.5)
+                        try:
+                            resp = await client.get(llama_url)
+                            if resp.status_code == 200:
+                                # Persist selection so it survives restarts
+                                try:
+                                    from .config import LoreguardConfig
+                                    cfg = LoreguardConfig.load()
+                                    cfg.set_model_path(model_path)
+                                    cfg.save()
+                                except Exception:
+                                    pass  # Best-effort persistence
+                                return {"status": "ok", "model": model_name}
+                        except Exception:
+                            continue
+                return JSONResponse(
+                    status_code=500,
+                    content={"error": "Model loaded but health check timed out after 30s"},
+                )
+            except Exception as e:
+                # Rollback: restore original model path and try to restart
+                server.llama_process.model_path = original_model_path
+                try:
+                    server.llama_process.start()
+                except Exception:
+                    pass  # Best-effort rollback
+                return JSONResponse(
+                    status_code=500,
+                    content={"error": f"Failed to reload model: {e}"},
+                )
         # Write runtime info
         with open(debug_path, "a") as f:
             f.write(f"[SDK Server] Writing runtime info for port {self.actual_port}...\n")
@@ -610,6 +728,15 @@ def force_stop_sdk_server() -> None:
         _server = None
+def set_llama_process(llama_process: Any, models_dir: Optional[Path] = None) -> None:
+    """Set the LlamaServerProcess reference on the SDK server for model management."""
+    global _server
+    if _server:
+        _server.llama_process = llama_process
+        if models_dir:
+            _server.models_dir = models_dir
 def update_backend_status(connected: bool) -> None:
     """Update backend connection status in runtime.json."""
     global _server

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/llama_server.py RENAMED Viewed

@@ -30,7 +30,7 @@ def _get_templates_dir() -> Path:
     return Path(__file__).parent.parent / "templates"
-LLAMA_VERSION = "b7789"  # Must match loreguard-engine bundle version
+LLAMA_VERSION = "b8467"  # Must match loreguard-engine bundle version
 # Download URLs for each platform
 BINARIES = {
@@ -265,18 +265,21 @@ def make_executable(path: Path) -> None:
 async def download_llama_server(
     progress_callback: Optional[Callable[[str, DownloadProgress | None], None]] = None,
+    target_dir: Optional[Path] = None,
 ) -> Path:
     """Download and install llama-server for the current platform.
     Args:
         progress_callback: Called with (status_message, progress_or_none)
+        target_dir: If provided, install into this directory instead of the default.
+                    Used by the bundle tool to pre-ship llama-server.
     Returns:
         Path to the installed llama-server binary
     """
     plat = get_platform()
     config = BINARIES[plat]
-    bin_dir = get_bin_dir()
+    bin_dir = target_dir or get_bin_dir()
     def notify(msg: str, progress: DownloadProgress | None = None):
         if progress_callback:
@@ -355,12 +358,12 @@ async def download_llama_server(
                 make_executable(lib)
         # Write version marker file for future version checks
-        version_file = get_version_file_path()
+        version_file = bin_dir / ".llama_version" if target_dir else get_version_file_path()
         version_file.write_text(LLAMA_VERSION)
         notify(f"llama-server {LLAMA_VERSION} installed successfully!")
-    return get_llama_server_path()
+    return bin_dir / config["binary_name"]
 class LlamaServerProcess:
@@ -372,11 +375,13 @@ class LlamaServerProcess:
         port: int = 8080,
         lora_path: Optional[Path] = None,
         context_size: int = 16384,
+        model_family: str = "llama3",
     ):
         self.model_path = model_path
         self.port = port
         self.lora_path = lora_path
         self.context_size = context_size
+        self.model_family = model_family
         self.process: Optional[subprocess.Popen] = None
         self._output_lines: list[str] = []
@@ -406,13 +411,18 @@ class LlamaServerProcess:
             # Without this, llama-server may allocate multiple slots, each consuming
             # KV cache memory proportional to context_size * model_hidden_dim.
             "-np", "1",
-            # Use custom Jinja template without tool-calling logic.
-            # Llama 3.1's built-in template forces tool-calling format even without tools,
-            # so we use a stripped-down template that only handles chat messages.
+            # Enable Jinja template processing (required for both custom and embedded templates)
             "--jinja",
-            "--chat-template-file", str(_get_templates_dir() / "llama31-no-tools.jinja"),
         ]
+        # Apply model-family-specific chat template override.
+        # Llama 3.1 requires a custom template to avoid the tool-calling bug;
+        # other families use their GGUF-embedded template (--jinja alone).
+        from .model_families import get_model_family
+        family = get_model_family(self.model_family)
+        if family.chat_template_file:
+            cmd.extend(["--chat-template-file", str(_get_templates_dir() / family.chat_template_file)])
         # Add LoRA adapter if specified
         if self.lora_path and self.lora_path.exists():
             cmd.extend(["--lora", str(self.lora_path)])

{loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/llm.py RENAMED Viewed

@@ -37,16 +37,16 @@ class SamplingConfig:
     presence_penalty: float = 0.0
-# Default stop sequences - ChatML/instruction markers that signal end of turn
-DEFAULT_STOP_SEQUENCES = [
-    "<|im_end|>",
-    "<|im_start|>",
-    "<|endoftext|>",
-    "</s>",
-    "<|end|>",
-    "<|user|>",
-    "<|assistant|>",
-]
+from .model_families import get_model_family, ALL_STOP_MARKERS, DEFAULT_MODEL_FAMILY
+def get_stop_sequences(model_family: str = DEFAULT_MODEL_FAMILY) -> list[str]:
+    """Get stop sequences for the given model family."""
+    return list(get_model_family(model_family).stop_sequences)
+# Backward-compatible default (Llama 3 stop sequences)
+DEFAULT_STOP_SEQUENCES = get_stop_sequences(DEFAULT_MODEL_FAMILY)
 @dataclass
@@ -61,7 +61,9 @@ class LLMRequest:
     stop: list[str] = field(default_factory=lambda: DEFAULT_STOP_SEQUENCES.copy())
     # Thinking mode control (for Qwen3)
-    disable_thinking: bool = False
+    # Defaults to True: thinking wastes tokens and breaks pipelines.
+    # Only enable explicitly when extended reasoning is desired.
+    disable_thinking: bool = True
     # If true, error if content is empty instead of falling back to reasoning_content
     require_content: bool = False
@@ -94,11 +96,13 @@ class LLMProxy:
     sampling configuration, stop sequences, and JSON mode support.
     """
-    def __init__(self, endpoint: str, timeout: float = 120.0):
+    def __init__(self, endpoint: str, timeout: float = 120.0, model_family: str = DEFAULT_MODEL_FAMILY):
         if not endpoint:
             raise ValueError("LLM endpoint is required")
         self.endpoint = endpoint.rstrip("/")
         self.default_timeout = timeout
+        self.model_family = model_family
+        self._stop_sequences = get_stop_sequences(model_family)
         self.client = httpx.AsyncClient(
             timeout=timeout,
             limits=httpx.Limits(
@@ -255,9 +259,10 @@ class LLMProxy:
             payload["id_slot"] = 0
             logger.info("KV cache: cache_prompt=true, id_slot=0 (verify -np 1 on server)")
-        # Disable thinking mode if requested (for Qwen3)
+        # Disable thinking mode (for Qwen3/3.5).
+        # Must use chat_template_kwargs — top-level enable_thinking is ignored by llama.cpp b8467+.
         if req.disable_thinking:
-            payload["enable_thinking"] = False
+            payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
         # Note: JSON mode is not compatible with streaming in llama.cpp
         # If force_json is requested, fall back to non-streaming
@@ -524,7 +529,7 @@ class LLMProxy:
             max_tokens=d.get("max_tokens", 512),
             timeout=timeout,
             sampling=sampling,
-            stop=d.get("stop", DEFAULT_STOP_SEQUENCES.copy()),
+            stop=d.get("stop", self._stop_sequences.copy()),
             disable_thinking=d.get("disable_thinking", False),
             require_content=d.get("require_content", False),
             force_json=d.get("force_json", False),
@@ -571,9 +576,10 @@ class LLMProxy:
             payload["id_slot"] = 0
             logger.info("KV cache: cache_prompt=true, id_slot=0 (verify -np 1 on server)")
-        # Disable thinking mode if requested (for Qwen3)
+        # Disable thinking mode (for Qwen3/3.5).
+        # Must use chat_template_kwargs — top-level enable_thinking is ignored by llama.cpp b8467+.
         if req.disable_thinking:
-            payload["enable_thinking"] = False
+            payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
         # Force JSON output if requested
         if req.force_json:
@@ -717,14 +723,12 @@ class LLMProxy:
         return -1
     def _strip_chat_markers(self, content: str) -> str:
-        """Remove content after ChatML markers that indicate hallucinated turns."""
-        markers = [
-            "<|im_end|>", "<|im_start|>", "<|endoftext|>",
-            "</s>", "<|end|>", "<|user|>", "<|assistant|>",
-        ]
+        """Remove content after chat markers that indicate hallucinated turns.
+        Uses a superset of all model families' tokens as a safety net.
+        """
         result = content
-        for marker in markers:
+        for marker in ALL_STOP_MARKERS:
             if marker in result:
                 idx = result.index(marker)
                 result = result[:idx]

loreguard-cli 0.15.2__tar.gz → 0.20.2__tar.gz

loreguard-cli 0.15.2__tar.gz → 0.20.2__tar.gz