loreguard-cli 0.15.2__tar.gz → 0.20.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/PKG-INFO +2 -2
  2. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/pyproject.toml +2 -2
  3. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/__main__.py +23 -0
  4. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/chunk_detector.py +1 -1
  5. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/cli.py +27 -5
  6. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/config.py +57 -8
  7. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/http_server.py +128 -1
  8. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/llama_server.py +18 -8
  9. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/llm.py +27 -23
  10. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/main.py +3 -3
  11. loreguard_cli-0.20.2/src/model_families.py +121 -0
  12. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/models_registry.py +12 -0
  13. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/nli.py +53 -5
  14. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/npc_chat.py +7 -5
  15. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/steam.py +4 -3
  16. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/app.py +1 -0
  17. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/auth_menu.py +3 -2
  18. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/token_input.py +1 -1
  19. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/unified_palette.py +28 -1
  20. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/auth.py +2 -1
  21. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/main.py +9 -8
  22. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/running.py +13 -4
  23. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/npc_chat.py +4 -2
  24. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tunnel.py +8 -2
  25. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/wizard.py +4 -2
  26. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/uv.lock +8 -8
  27. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/.claude/skills/llama-cpp-troubleshooting/SKILL.md +0 -0
  28. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/.env.example +0 -0
  29. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/.github/workflows/release.yml +0 -0
  30. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/.gitignore +0 -0
  31. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/LICENSE +0 -0
  32. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/README.md +0 -0
  33. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/THIRD_PARTY_NOTICES.md +0 -0
  34. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/loreguard.spec +0 -0
  35. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/loreguard_entry.py +0 -0
  36. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/scripts/build.py +0 -0
  37. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/API.md +0 -0
  38. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/csharp/LoreguardSDK.cs +0 -0
  39. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/gdscript/LoreguardSDK.gd +0 -0
  40. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/javascript/loreguard-sdk.js +0 -0
  41. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/sdk/python/loreguard_sdk.py +0 -0
  42. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/__init__.py +0 -0
  43. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/dialogue_act_classifier.py +0 -0
  44. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/hf_discovery.py +0 -0
  45. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/intent_classifier.py +0 -0
  46. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/runtime.py +0 -0
  47. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/term_ui.py +0 -0
  48. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/__init__.py +0 -0
  49. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/__init__.py +0 -0
  50. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/modals/npc_chat.py +0 -0
  51. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/__init__.py +0 -0
  52. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/model_select.py +0 -0
  53. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/screens/nli_setup.py +0 -0
  54. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/styles.py +0 -0
  55. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/__init__.py +0 -0
  56. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/banner.py +0 -0
  57. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/footer.py +0 -0
  58. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/hardware_info.py +0 -0
  59. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/server_monitor.py +0 -0
  60. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/src/tui/widgets/status_panel.py +0 -0
  61. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/templates/llama31-no-tools.jinja +0 -0
  62. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/tests/test_intent_classifier.py +0 -0
  63. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/tests/test_nli_hhem.py +0 -0
  64. {loreguard_cli-0.15.2 → loreguard_cli-0.20.2}/tests/test_websocket_timeout.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: loreguard-cli
3
- Version: 0.15.2
3
+ Version: 0.20.2
4
4
  Summary: Local inference client for Loreguard NPCs
5
5
  Project-URL: Homepage, https://loreguard.com
6
6
  Project-URL: Documentation, https://github.com/beyond-logic-labs/loreguard-cli#readme
@@ -29,7 +29,7 @@ Requires-Dist: rich>=13.0.0
29
29
  Requires-Dist: textual>=0.47.0
30
30
  Requires-Dist: tf-keras>=2.16.0
31
31
  Requires-Dist: torch>=2.0.0
32
- Requires-Dist: transformers>=5.0.0
32
+ Requires-Dist: transformers<5,>=4.36.0
33
33
  Requires-Dist: uvicorn>=0.27.0
34
34
  Requires-Dist: websockets>=12.0
35
35
  Provides-Extra: build
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "loreguard-cli"
7
- version = "0.15.2"
7
+ version = "0.20.2"
8
8
  description = "Local inference client for Loreguard NPCs"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -28,7 +28,7 @@ dependencies = [
28
28
  "aiofiles>=24.1.0",
29
29
  "rich>=13.0.0",
30
30
  "textual>=0.47.0",
31
- "transformers>=5.0.0",
31
+ "transformers>=4.36.0,<5",
32
32
  "torch>=2.0.0",
33
33
  "fastapi>=0.109.0",
34
34
  "uvicorn>=0.27.0",
@@ -28,6 +28,29 @@ def main():
28
28
  print(json.dumps(status, indent=2))
29
29
  sys.exit(0 if status.get("running") else 1)
30
30
 
31
+ # Handle 'download-llama-server' command - for bundle tool delegation (ADR-0027)
32
+ if args and args[0] == "download-llama-server":
33
+ import asyncio
34
+ from pathlib import Path
35
+ from .llama_server import download_llama_server
36
+
37
+ output_dir = None
38
+ for i, a in enumerate(args):
39
+ if a == "--output-dir" and i + 1 < len(args):
40
+ output_dir = Path(args[i + 1])
41
+
42
+ if not output_dir:
43
+ print("Usage: loreguard download-llama-server --output-dir <path>", file=sys.stderr)
44
+ sys.exit(1)
45
+
46
+ output_dir.mkdir(parents=True, exist_ok=True)
47
+
48
+ def on_progress(msg, progress=None):
49
+ print(f" {msg}")
50
+
51
+ asyncio.run(download_llama_server(progress_callback=on_progress, target_dir=output_dir))
52
+ sys.exit(0)
53
+
31
54
  # Filter out help flags - these should show CLI help
32
55
  if any(a in ('-h', '--help') for a in args):
33
56
  from .cli import main as cli_main
@@ -44,7 +44,7 @@ CHUNK_HYPOTHESES = {
44
44
 
45
45
  # Threshold for "starts new thought" classification
46
46
  # If confidence > threshold, we create a new chunk
47
- NEW_THOUGHT_THRESHOLD = 0.55
47
+ NEW_THOUGHT_THRESHOLD = 0.38
48
48
 
49
49
 
50
50
  class ChunkDetector:
@@ -11,7 +11,8 @@ Environment variables (alternative to args):
11
11
  LOREGUARD_MODEL Path to model file
12
12
  LOREGUARD_MODEL_ID Model ID to download (if not using custom model)
13
13
  LOREGUARD_PORT Local llama-server port (default: 8080)
14
- LOREGUARD_BACKEND Backend URL (default: wss://api.loreguard.com/workers)
14
+ LOREGUARD_BACKEND Backend WebSocket URL (default: wss://console.loreguard.com/workers)
15
+ LOREGUARD_API API base URL (default: https://console.loreguard.com)
15
16
  LOREGUARD_WORKER_ID Worker ID (default: hostname)
16
17
  """
17
18
 
@@ -26,6 +27,8 @@ from datetime import datetime
26
27
  from pathlib import Path
27
28
  from typing import Optional
28
29
 
30
+ from .config import DEFAULT_API_URL, DEFAULT_BACKEND_URL
31
+
29
32
  # Setup logging
30
33
  logging.basicConfig(
31
34
  level=logging.INFO,
@@ -44,14 +47,16 @@ class LoreguardCLI:
44
47
  model_path: Optional[Path] = None,
45
48
  model_id: Optional[str] = None,
46
49
  port: int = 8080,
47
- backend_url: str = "wss://api.loreguard.com/workers",
50
+ backend_url: str = DEFAULT_BACKEND_URL,
48
51
  worker_id: Optional[str] = None,
52
+ model_family: str = "llama3",
49
53
  ):
50
54
  self.token = token
51
55
  self.model_path = model_path
52
56
  self.model_id = model_id
53
57
  self.port = port
54
58
  self.backend_url = backend_url
59
+ self.model_family = model_family
55
60
  # Worker ID: use provided value, or default to sanitized hostname.
56
61
  # Validator requires ^[a-zA-Z0-9_-]{1,64}$ — replace dots with hyphens.
57
62
  raw_id = worker_id or socket.gethostname() or "worker"
@@ -209,7 +214,7 @@ class LoreguardCLI:
209
214
  # Start server
210
215
  log.info(f"Starting llama-server on port {self.port}...")
211
216
  try:
212
- self._llama = LlamaServerProcess(self.model_path, port=self.port)
217
+ self._llama = LlamaServerProcess(self.model_path, port=self.port, model_family=self.model_family)
213
218
  self._llama.start()
214
219
 
215
220
  # Wait for ready
@@ -241,7 +246,7 @@ class LoreguardCLI:
241
246
  log.info(f"Worker ID: {self.worker_id}")
242
247
 
243
248
  try:
244
- llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}")
249
+ llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}", model_family=self.model_family)
245
250
 
246
251
  # ADR-0027: Load all ML services — the client is the sole provider
247
252
  # of NLI, intent, dialogue act, and chunk capabilities.
@@ -323,6 +328,11 @@ class LoreguardCLI:
323
328
  port=sdk_port,
324
329
  )
325
330
  log.info(f"SDK server listening on 127.0.0.1:{self._sdk_port}")
331
+
332
+ # Wire llama process for runtime model switching
333
+ from .http_server import set_llama_process
334
+ models_dir = self.model_path.parent if self.model_path else None
335
+ set_llama_process(self._llama, models_dir)
326
336
  except Exception as e:
327
337
  log.error(f"Failed to start SDK server: {e}")
328
338
  return False
@@ -447,9 +457,14 @@ Available model IDs:
447
457
  )
448
458
  parser.add_argument(
449
459
  "--backend",
450
- default=os.getenv("LOREGUARD_BACKEND", "wss://api.loreguard.com/workers"),
460
+ default=os.getenv("LOREGUARD_BACKEND", DEFAULT_BACKEND_URL),
451
461
  help="Backend WebSocket URL",
452
462
  )
463
+ parser.add_argument(
464
+ "--api-url",
465
+ default=os.getenv("LOREGUARD_API", DEFAULT_API_URL),
466
+ help=f"API base URL (default: {DEFAULT_API_URL})",
467
+ )
453
468
  parser.add_argument(
454
469
  "-v", "--verbose",
455
470
  action="store_true",
@@ -460,6 +475,12 @@ Available model IDs:
460
475
  default=os.getenv("LOREGUARD_BUNDLE_DIR", ""),
461
476
  help="Loreguard bundle directory. Auto-discovers models from manifest.txt.",
462
477
  )
478
+ parser.add_argument(
479
+ "--model-family",
480
+ default=os.getenv("LOREGUARD_MODEL_FAMILY", "auto"),
481
+ choices=["auto", "llama3", "qwen3", "gemma", "chatml"],
482
+ help="Model family profile for chat template/stop sequences (default: auto)",
483
+ )
463
484
  parser.add_argument(
464
485
  "--dev",
465
486
  action="store_true",
@@ -531,6 +552,7 @@ Available model IDs:
531
552
  port=args.port,
532
553
  backend_url=args.backend,
533
554
  worker_id=args.worker_id or None, # None will use hostname
555
+ model_family=args.model_family,
534
556
  )
535
557
 
536
558
  exit_code = asyncio.run(cli.run())
@@ -49,6 +49,8 @@ class LoreguardConfig:
49
49
  dev_mode: bool = False
50
50
  context_size: int = 16384 # llama-server context window size (configurable per game)
51
51
  max_speech_tokens: int = 50 # Max tokens for NPC speech output (Pass 4). Default: 50 (~40 words)
52
+ model_family: str = "auto" # Model family profile (auto, llama3, qwen3, gemma, chatml)
53
+ dialogue_act_enabled: bool = False # Dialogue act classifier for filler selection
52
54
 
53
55
  def save(self) -> None:
54
56
  """Save configuration to disk."""
@@ -71,6 +73,8 @@ class LoreguardConfig:
71
73
  dev_mode=data.get("dev_mode", False),
72
74
  context_size=data.get("context_size", 16384),
73
75
  max_speech_tokens=data.get("max_speech_tokens", 50),
76
+ model_family=data.get("model_family", "auto"),
77
+ dialogue_act_enabled=data.get("dialogue_act_enabled", False),
74
78
  )
75
79
  except (json.JSONDecodeError, KeyError):
76
80
  pass
@@ -119,6 +123,14 @@ class LoreguardConfig:
119
123
  # Environment Variable Configuration
120
124
  # =============================================================================
121
125
 
126
+ DEFAULT_API_URL = "https://console.loreguard.com"
127
+ DEFAULT_BACKEND_URL = "wss://console.loreguard.com/workers"
128
+
129
+
130
+ def get_api_url() -> str:
131
+ """Get the Loreguard API base URL (configurable via LOREGUARD_API env var)."""
132
+ return os.getenv("LOREGUARD_API", DEFAULT_API_URL)
133
+
122
134
 
123
135
  @lru_cache(maxsize=1)
124
136
  def load_config() -> dict:
@@ -131,12 +143,13 @@ def load_config() -> dict:
131
143
  return {
132
144
  # Server settings
133
145
  "LLM_ENDPOINT": os.getenv("LLM_ENDPOINT", "http://localhost:8080"),
134
- "BACKEND_URL": os.getenv("LOREGUARD_BACKEND", "wss://api.loreguard.com/workers"),
146
+ "BACKEND_URL": os.getenv("LOREGUARD_BACKEND", DEFAULT_BACKEND_URL),
147
+ "API_URL": os.getenv("LOREGUARD_API", DEFAULT_API_URL),
135
148
  "HOST": os.getenv("HOST", "127.0.0.1"),
136
149
  "PORT": os.getenv("PORT", "8081"),
137
150
 
138
151
  # Worker authentication (required for backend connection)
139
- # Get API token from loreguard.com dashboard
152
+ # Get API token from console.loreguard.com
140
153
  "WORKER_ID": os.getenv("LOREGUARD_WORKER_ID", os.getenv("WORKER_ID", "")),
141
154
  # LOREGUARD_TOKEN is preferred, WORKER_TOKEN kept for backwards compatibility
142
155
  "LOREGUARD_TOKEN": os.getenv("LOREGUARD_TOKEN", os.getenv("WORKER_TOKEN", "")),
@@ -230,20 +243,21 @@ def get_models_dir() -> Optional[Path]:
230
243
 
231
244
 
232
245
  def resolve_model_path(model_name: str, subdir: str = "") -> str:
233
- """Resolve a model path, preferring pre-shipped models over HF downloads.
246
+ """Resolve a model path, preferring local models over HF downloads.
234
247
 
235
248
  Resolution order:
236
249
  1. LOREGUARD_MODELS_DIR/<subdir> (explicit override)
237
- 2. Bundle models dir using manifest.txt (HF name → manifest key → local dir)
238
- 3. Bundle models dir using HF name → org--model convention (fallback)
239
- 4. Original HF model name (download from HuggingFace)
250
+ 2. Application Support models dir/<subdir> (standard install location)
251
+ 3. Bundle models dir using manifest.txt (HF name → manifest key → local dir)
252
+ 4. Bundle models dir using HF name → org--model convention (fallback)
253
+ 5. Download from HuggingFace to Application Support models dir
240
254
 
241
255
  Args:
242
256
  model_name: HuggingFace model name (e.g., 'vectara/hallucination_evaluation_model')
243
257
  subdir: Subdirectory within MODELS_DIR to check (e.g., 'hhem', 'deberta')
244
258
 
245
259
  Returns:
246
- Local path if pre-shipped model found, otherwise the original HF model name.
260
+ Local path to the model directory.
247
261
  """
248
262
  # 1. Explicit LOREGUARD_MODELS_DIR/<subdir>
249
263
  explicit_dir = get_config_value("MODELS_DIR")
@@ -252,7 +266,14 @@ def resolve_model_path(model_name: str, subdir: str = "") -> str:
252
266
  if local_path.exists() and any(local_path.iterdir()):
253
267
  return str(local_path)
254
268
 
255
- # 2 & 3. Bundle directory resolution
269
+ # 2. Application Support models dir/<subdir>
270
+ app_models = get_data_dir() / "models"
271
+ if subdir:
272
+ local_path = app_models / subdir
273
+ if local_path.exists() and any(local_path.iterdir()):
274
+ return str(local_path)
275
+
276
+ # 3 & 4. Bundle directory resolution
256
277
  bundle_dir = get_bundle_dir()
257
278
  if bundle_dir:
258
279
  bundle_models = bundle_dir / "models"
@@ -273,9 +294,37 @@ def resolve_model_path(model_name: str, subdir: str = "") -> str:
273
294
  if local_path.exists() and any(local_path.iterdir()):
274
295
  return str(local_path)
275
296
 
297
+ # 5. Download from HuggingFace to Application Support models dir
298
+ if subdir:
299
+ return _download_hf_model(model_name, app_models / subdir)
300
+
276
301
  return model_name
277
302
 
278
303
 
304
+ def _download_hf_model(model_name: str, target_dir: Path) -> str:
305
+ """Download a HuggingFace model to the loreguard models directory.
306
+
307
+ Returns:
308
+ Path to the downloaded model directory.
309
+ """
310
+ import logging
311
+ logger = logging.getLogger(__name__)
312
+ try:
313
+ from huggingface_hub import snapshot_download
314
+ target_dir.mkdir(parents=True, exist_ok=True)
315
+ logger.info(f"Downloading {model_name} to {target_dir}")
316
+ snapshot_download(
317
+ model_name,
318
+ local_dir=str(target_dir),
319
+ local_dir_use_symlinks=False,
320
+ )
321
+ logger.info(f"Downloaded {model_name} to {target_dir}")
322
+ return str(target_dir)
323
+ except Exception as e:
324
+ logger.warning(f"Failed to download {model_name}: {e}")
325
+ return model_name
326
+
327
+
279
328
  def get_config_value(key: str, default: Optional[str] = None) -> Optional[str]:
280
329
  """Get a single configuration value."""
281
330
  config = load_config()
@@ -8,6 +8,8 @@ HTTP endpoints:
8
8
  GET /api/capabilities - Feature discovery (streaming, chunk modes)
9
9
  GET /api/characters - List available NPCs (proxied from engine)
10
10
  POST /api/chat - Chat with an NPC (streaming SSE or JSON)
11
+ GET /api/models - List available GGUF models
12
+ POST /api/admin/reload-model - Hot-swap LLM model at runtime
11
13
 
12
14
  The server shares the existing tunnel connection instead of creating
13
15
  a new one, ensuring a single WebSocket connection per worker.
@@ -17,10 +19,12 @@ Uses uvicorn with socket-first binding for race-condition-free port allocation.
17
19
 
18
20
  import asyncio
19
21
  import json
22
+ import os
20
23
  import threading
21
24
  import time
22
25
  import uuid
23
26
  from concurrent.futures import Future
27
+ from pathlib import Path
24
28
  from typing import Any, Callable, Optional
25
29
 
26
30
  from .runtime import write_runtime_info, RuntimeInfo, get_runtime_path, get_version
@@ -63,6 +67,8 @@ class EmbeddedHTTPServer:
63
67
  self._running = False
64
68
  self._bound_socket: Optional[Any] = None
65
69
  self._ready_event = threading.Event()
70
+ self.llama_process: Optional[Any] = None # LlamaServerProcess — set by RunningScreen
71
+ self.models_dir: Optional[Path] = None # Path to models/ directory
66
72
 
67
73
  def start(self) -> int:
68
74
  """Start the HTTP server in a background thread.
@@ -361,7 +367,7 @@ class EmbeddedHTTPServer:
361
367
 
362
368
  # Derive HTTP base URL from WebSocket URL
363
369
  # ws://localhost:8090/workers → http://localhost:8090
364
- # wss://api.loreguard.com/workers → https://api.loreguard.com
370
+ # wss://console.loreguard.com/workers → https://console.loreguard.com
365
371
  backend_ws = server.tunnel.backend_url
366
372
  if backend_ws.startswith("wss://"):
367
373
  base_url = "https://" + backend_ws[6:].split("/")[0]
@@ -483,6 +489,118 @@ class EmbeddedHTTPServer:
483
489
  return JSONResponse(status_code=500, content=result)
484
490
  return result
485
491
 
492
+ @app.get("/api/models")
493
+ async def list_models():
494
+ """List available GGUF models in the models directory."""
495
+ if not server.models_dir or not server.models_dir.exists():
496
+ return JSONResponse(
497
+ status_code=404,
498
+ content={"error": "Models directory not configured"},
499
+ )
500
+
501
+ models = []
502
+ active_model = None
503
+ if server.llama_process and hasattr(server.llama_process, "model_path"):
504
+ active_model = server.llama_process.model_path.name
505
+
506
+ for f in sorted(server.models_dir.iterdir()):
507
+ if f.suffix == ".gguf" and f.is_file():
508
+ models.append({
509
+ "name": f.name,
510
+ "size": f.stat().st_size,
511
+ "active": f.name == active_model,
512
+ })
513
+
514
+ return {"models": models, "activeModel": active_model}
515
+
516
+ @app.post("/api/admin/reload-model")
517
+ async def reload_model(request: Request):
518
+ """Hot-swap the LLM model by restarting llama-server."""
519
+ if not server.llama_process:
520
+ return JSONResponse(
521
+ status_code=503,
522
+ content={"error": "LLM server not available"},
523
+ )
524
+ if not server.models_dir:
525
+ return JSONResponse(
526
+ status_code=503,
527
+ content={"error": "Models directory not configured"},
528
+ )
529
+
530
+ body = await request.json()
531
+ model_name = body.get("model", "")
532
+ if not model_name:
533
+ return JSONResponse(
534
+ status_code=400,
535
+ content={"error": "Missing 'model' field"},
536
+ )
537
+
538
+ # Security: resolve and verify path stays inside models_dir
539
+ model_path = (server.models_dir / model_name).resolve()
540
+ if model_path.parent != server.models_dir.resolve():
541
+ return JSONResponse(
542
+ status_code=400,
543
+ content={"error": "Invalid model name"},
544
+ )
545
+
546
+ if not model_path.exists() or model_path.suffix != ".gguf":
547
+ return JSONResponse(
548
+ status_code=404,
549
+ content={"error": f"Model '{model_name}' not found"},
550
+ )
551
+
552
+ # Check if already active
553
+ if hasattr(server.llama_process, "model_path") and server.llama_process.model_path.name == model_name:
554
+ return {"status": "already_active", "model": model_name}
555
+
556
+ # Save original model_path for rollback on failure
557
+ original_model_path = server.llama_process.model_path
558
+
559
+ try:
560
+ # Stop current llama-server
561
+ server.llama_process.stop()
562
+
563
+ # Update model path and restart
564
+ server.llama_process.model_path = model_path
565
+ server.llama_process.start()
566
+
567
+ # Wait for health check (llama-server takes a few seconds to load model)
568
+ import httpx
569
+ llama_url = f"http://127.0.0.1:{server.llama_process.port}/health"
570
+ async with httpx.AsyncClient(timeout=2.0) as client:
571
+ for attempt in range(60): # 60 attempts × 0.5s = 30s timeout
572
+ await asyncio.sleep(0.5)
573
+ try:
574
+ resp = await client.get(llama_url)
575
+ if resp.status_code == 200:
576
+ # Persist selection so it survives restarts
577
+ try:
578
+ from .config import LoreguardConfig
579
+ cfg = LoreguardConfig.load()
580
+ cfg.set_model_path(model_path)
581
+ cfg.save()
582
+ except Exception:
583
+ pass # Best-effort persistence
584
+ return {"status": "ok", "model": model_name}
585
+ except Exception:
586
+ continue
587
+
588
+ return JSONResponse(
589
+ status_code=500,
590
+ content={"error": "Model loaded but health check timed out after 30s"},
591
+ )
592
+ except Exception as e:
593
+ # Rollback: restore original model path and try to restart
594
+ server.llama_process.model_path = original_model_path
595
+ try:
596
+ server.llama_process.start()
597
+ except Exception:
598
+ pass # Best-effort rollback
599
+ return JSONResponse(
600
+ status_code=500,
601
+ content={"error": f"Failed to reload model: {e}"},
602
+ )
603
+
486
604
  # Write runtime info
487
605
  with open(debug_path, "a") as f:
488
606
  f.write(f"[SDK Server] Writing runtime info for port {self.actual_port}...\n")
@@ -610,6 +728,15 @@ def force_stop_sdk_server() -> None:
610
728
  _server = None
611
729
 
612
730
 
731
+ def set_llama_process(llama_process: Any, models_dir: Optional[Path] = None) -> None:
732
+ """Set the LlamaServerProcess reference on the SDK server for model management."""
733
+ global _server
734
+ if _server:
735
+ _server.llama_process = llama_process
736
+ if models_dir:
737
+ _server.models_dir = models_dir
738
+
739
+
613
740
  def update_backend_status(connected: bool) -> None:
614
741
  """Update backend connection status in runtime.json."""
615
742
  global _server
@@ -30,7 +30,7 @@ def _get_templates_dir() -> Path:
30
30
  return Path(__file__).parent.parent / "templates"
31
31
 
32
32
 
33
- LLAMA_VERSION = "b7789" # Must match loreguard-engine bundle version
33
+ LLAMA_VERSION = "b8467" # Must match loreguard-engine bundle version
34
34
 
35
35
  # Download URLs for each platform
36
36
  BINARIES = {
@@ -265,18 +265,21 @@ def make_executable(path: Path) -> None:
265
265
 
266
266
  async def download_llama_server(
267
267
  progress_callback: Optional[Callable[[str, DownloadProgress | None], None]] = None,
268
+ target_dir: Optional[Path] = None,
268
269
  ) -> Path:
269
270
  """Download and install llama-server for the current platform.
270
271
 
271
272
  Args:
272
273
  progress_callback: Called with (status_message, progress_or_none)
274
+ target_dir: If provided, install into this directory instead of the default.
275
+ Used by the bundle tool to pre-ship llama-server.
273
276
 
274
277
  Returns:
275
278
  Path to the installed llama-server binary
276
279
  """
277
280
  plat = get_platform()
278
281
  config = BINARIES[plat]
279
- bin_dir = get_bin_dir()
282
+ bin_dir = target_dir or get_bin_dir()
280
283
 
281
284
  def notify(msg: str, progress: DownloadProgress | None = None):
282
285
  if progress_callback:
@@ -355,12 +358,12 @@ async def download_llama_server(
355
358
  make_executable(lib)
356
359
 
357
360
  # Write version marker file for future version checks
358
- version_file = get_version_file_path()
361
+ version_file = bin_dir / ".llama_version" if target_dir else get_version_file_path()
359
362
  version_file.write_text(LLAMA_VERSION)
360
363
 
361
364
  notify(f"llama-server {LLAMA_VERSION} installed successfully!")
362
365
 
363
- return get_llama_server_path()
366
+ return bin_dir / config["binary_name"]
364
367
 
365
368
 
366
369
  class LlamaServerProcess:
@@ -372,11 +375,13 @@ class LlamaServerProcess:
372
375
  port: int = 8080,
373
376
  lora_path: Optional[Path] = None,
374
377
  context_size: int = 16384,
378
+ model_family: str = "llama3",
375
379
  ):
376
380
  self.model_path = model_path
377
381
  self.port = port
378
382
  self.lora_path = lora_path
379
383
  self.context_size = context_size
384
+ self.model_family = model_family
380
385
  self.process: Optional[subprocess.Popen] = None
381
386
  self._output_lines: list[str] = []
382
387
 
@@ -406,13 +411,18 @@ class LlamaServerProcess:
406
411
  # Without this, llama-server may allocate multiple slots, each consuming
407
412
  # KV cache memory proportional to context_size * model_hidden_dim.
408
413
  "-np", "1",
409
- # Use custom Jinja template without tool-calling logic.
410
- # Llama 3.1's built-in template forces tool-calling format even without tools,
411
- # so we use a stripped-down template that only handles chat messages.
414
+ # Enable Jinja template processing (required for both custom and embedded templates)
412
415
  "--jinja",
413
- "--chat-template-file", str(_get_templates_dir() / "llama31-no-tools.jinja"),
414
416
  ]
415
417
 
418
+ # Apply model-family-specific chat template override.
419
+ # Llama 3.1 requires a custom template to avoid the tool-calling bug;
420
+ # other families use their GGUF-embedded template (--jinja alone).
421
+ from .model_families import get_model_family
422
+ family = get_model_family(self.model_family)
423
+ if family.chat_template_file:
424
+ cmd.extend(["--chat-template-file", str(_get_templates_dir() / family.chat_template_file)])
425
+
416
426
  # Add LoRA adapter if specified
417
427
  if self.lora_path and self.lora_path.exists():
418
428
  cmd.extend(["--lora", str(self.lora_path)])
@@ -37,16 +37,16 @@ class SamplingConfig:
37
37
  presence_penalty: float = 0.0
38
38
 
39
39
 
40
- # Default stop sequences - ChatML/instruction markers that signal end of turn
41
- DEFAULT_STOP_SEQUENCES = [
42
- "<|im_end|>",
43
- "<|im_start|>",
44
- "<|endoftext|>",
45
- "</s>",
46
- "<|end|>",
47
- "<|user|>",
48
- "<|assistant|>",
49
- ]
40
+ from .model_families import get_model_family, ALL_STOP_MARKERS, DEFAULT_MODEL_FAMILY
41
+
42
+
43
+ def get_stop_sequences(model_family: str = DEFAULT_MODEL_FAMILY) -> list[str]:
44
+ """Get stop sequences for the given model family."""
45
+ return list(get_model_family(model_family).stop_sequences)
46
+
47
+
48
+ # Backward-compatible default (Llama 3 stop sequences)
49
+ DEFAULT_STOP_SEQUENCES = get_stop_sequences(DEFAULT_MODEL_FAMILY)
50
50
 
51
51
 
52
52
  @dataclass
@@ -61,7 +61,9 @@ class LLMRequest:
61
61
  stop: list[str] = field(default_factory=lambda: DEFAULT_STOP_SEQUENCES.copy())
62
62
 
63
63
  # Thinking mode control (for Qwen3)
64
- disable_thinking: bool = False
64
+ # Defaults to True: thinking wastes tokens and breaks pipelines.
65
+ # Only enable explicitly when extended reasoning is desired.
66
+ disable_thinking: bool = True
65
67
 
66
68
  # If true, error if content is empty instead of falling back to reasoning_content
67
69
  require_content: bool = False
@@ -94,11 +96,13 @@ class LLMProxy:
94
96
  sampling configuration, stop sequences, and JSON mode support.
95
97
  """
96
98
 
97
- def __init__(self, endpoint: str, timeout: float = 120.0):
99
+ def __init__(self, endpoint: str, timeout: float = 120.0, model_family: str = DEFAULT_MODEL_FAMILY):
98
100
  if not endpoint:
99
101
  raise ValueError("LLM endpoint is required")
100
102
  self.endpoint = endpoint.rstrip("/")
101
103
  self.default_timeout = timeout
104
+ self.model_family = model_family
105
+ self._stop_sequences = get_stop_sequences(model_family)
102
106
  self.client = httpx.AsyncClient(
103
107
  timeout=timeout,
104
108
  limits=httpx.Limits(
@@ -255,9 +259,10 @@ class LLMProxy:
255
259
  payload["id_slot"] = 0
256
260
  logger.info("KV cache: cache_prompt=true, id_slot=0 (verify -np 1 on server)")
257
261
 
258
- # Disable thinking mode if requested (for Qwen3)
262
+ # Disable thinking mode (for Qwen3/3.5).
263
+ # Must use chat_template_kwargs — top-level enable_thinking is ignored by llama.cpp b8467+.
259
264
  if req.disable_thinking:
260
- payload["enable_thinking"] = False
265
+ payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
261
266
 
262
267
  # Note: JSON mode is not compatible with streaming in llama.cpp
263
268
  # If force_json is requested, fall back to non-streaming
@@ -524,7 +529,7 @@ class LLMProxy:
524
529
  max_tokens=d.get("max_tokens", 512),
525
530
  timeout=timeout,
526
531
  sampling=sampling,
527
- stop=d.get("stop", DEFAULT_STOP_SEQUENCES.copy()),
532
+ stop=d.get("stop", self._stop_sequences.copy()),
528
533
  disable_thinking=d.get("disable_thinking", False),
529
534
  require_content=d.get("require_content", False),
530
535
  force_json=d.get("force_json", False),
@@ -571,9 +576,10 @@ class LLMProxy:
571
576
  payload["id_slot"] = 0
572
577
  logger.info("KV cache: cache_prompt=true, id_slot=0 (verify -np 1 on server)")
573
578
 
574
- # Disable thinking mode if requested (for Qwen3)
579
+ # Disable thinking mode (for Qwen3/3.5).
580
+ # Must use chat_template_kwargs — top-level enable_thinking is ignored by llama.cpp b8467+.
575
581
  if req.disable_thinking:
576
- payload["enable_thinking"] = False
582
+ payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
577
583
 
578
584
  # Force JSON output if requested
579
585
  if req.force_json:
@@ -717,14 +723,12 @@ class LLMProxy:
717
723
  return -1
718
724
 
719
725
  def _strip_chat_markers(self, content: str) -> str:
720
- """Remove content after ChatML markers that indicate hallucinated turns."""
721
- markers = [
722
- "<|im_end|>", "<|im_start|>", "<|endoftext|>",
723
- "</s>", "<|end|>", "<|user|>", "<|assistant|>",
724
- ]
726
+ """Remove content after chat markers that indicate hallucinated turns.
725
727
 
728
+ Uses a superset of all model families' tokens as a safety net.
729
+ """
726
730
  result = content
727
- for marker in markers:
731
+ for marker in ALL_STOP_MARKERS:
728
732
  if marker in result:
729
733
  idx = result.index(marker)
730
734
  result = result[:idx]