loreguard-cli 0.15.2__tar.gz → 0.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/PKG-INFO +1 -1
  2. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/pyproject.toml +1 -1
  3. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/chunk_detector.py +1 -1
  4. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/cli.py +16 -2
  5. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/config.py +2 -0
  6. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/http_server.py +110 -0
  7. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/llama_server.py +11 -4
  8. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/llm.py +18 -18
  9. loreguard_cli-0.16.0/src/model_families.py +121 -0
  10. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/app.py +1 -0
  11. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/main.py +3 -2
  12. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/running.py +10 -2
  13. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.claude/skills/llama-cpp-troubleshooting/SKILL.md +0 -0
  14. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.env.example +0 -0
  15. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.github/workflows/release.yml +0 -0
  16. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/.gitignore +0 -0
  17. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/LICENSE +0 -0
  18. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/README.md +0 -0
  19. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/THIRD_PARTY_NOTICES.md +0 -0
  20. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/loreguard.spec +0 -0
  21. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/loreguard_entry.py +0 -0
  22. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/scripts/build.py +0 -0
  23. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/API.md +0 -0
  24. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/csharp/LoreguardSDK.cs +0 -0
  25. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/gdscript/LoreguardSDK.gd +0 -0
  26. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/javascript/loreguard-sdk.js +0 -0
  27. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/sdk/python/loreguard_sdk.py +0 -0
  28. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/__init__.py +0 -0
  29. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/__main__.py +0 -0
  30. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/dialogue_act_classifier.py +0 -0
  31. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/hf_discovery.py +0 -0
  32. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/intent_classifier.py +0 -0
  33. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/main.py +0 -0
  34. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/models_registry.py +0 -0
  35. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/nli.py +0 -0
  36. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/npc_chat.py +0 -0
  37. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/runtime.py +0 -0
  38. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/steam.py +0 -0
  39. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/term_ui.py +0 -0
  40. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/__init__.py +0 -0
  41. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/__init__.py +0 -0
  42. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/auth_menu.py +0 -0
  43. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/npc_chat.py +0 -0
  44. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/token_input.py +0 -0
  45. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/modals/unified_palette.py +0 -0
  46. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/__init__.py +0 -0
  47. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/auth.py +0 -0
  48. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/model_select.py +0 -0
  49. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/screens/nli_setup.py +0 -0
  50. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/styles.py +0 -0
  51. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/__init__.py +0 -0
  52. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/banner.py +0 -0
  53. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/footer.py +0 -0
  54. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/hardware_info.py +0 -0
  55. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/npc_chat.py +0 -0
  56. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/server_monitor.py +0 -0
  57. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tui/widgets/status_panel.py +0 -0
  58. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/tunnel.py +0 -0
  59. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/src/wizard.py +0 -0
  60. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/templates/llama31-no-tools.jinja +0 -0
  61. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/tests/test_intent_classifier.py +0 -0
  62. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/tests/test_nli_hhem.py +0 -0
  63. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/tests/test_websocket_timeout.py +0 -0
  64. {loreguard_cli-0.15.2 → loreguard_cli-0.16.0}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: loreguard-cli
3
- Version: 0.15.2
3
+ Version: 0.16.0
4
4
  Summary: Local inference client for Loreguard NPCs
5
5
  Project-URL: Homepage, https://loreguard.com
6
6
  Project-URL: Documentation, https://github.com/beyond-logic-labs/loreguard-cli#readme
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "loreguard-cli"
7
- version = "0.15.2"
7
+ version = "0.16.0"
8
8
  description = "Local inference client for Loreguard NPCs"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -44,7 +44,7 @@ CHUNK_HYPOTHESES = {
44
44
 
45
45
  # Threshold for "starts new thought" classification
46
46
  # If confidence > threshold, we create a new chunk
47
- NEW_THOUGHT_THRESHOLD = 0.55
47
+ NEW_THOUGHT_THRESHOLD = 0.38
48
48
 
49
49
 
50
50
  class ChunkDetector:
@@ -46,12 +46,14 @@ class LoreguardCLI:
46
46
  port: int = 8080,
47
47
  backend_url: str = "wss://api.loreguard.com/workers",
48
48
  worker_id: Optional[str] = None,
49
+ model_family: str = "llama3",
49
50
  ):
50
51
  self.token = token
51
52
  self.model_path = model_path
52
53
  self.model_id = model_id
53
54
  self.port = port
54
55
  self.backend_url = backend_url
56
+ self.model_family = model_family
55
57
  # Worker ID: use provided value, or default to sanitized hostname.
56
58
  # Validator requires ^[a-zA-Z0-9_-]{1,64}$ — replace dots with hyphens.
57
59
  raw_id = worker_id or socket.gethostname() or "worker"
@@ -209,7 +211,7 @@ class LoreguardCLI:
209
211
  # Start server
210
212
  log.info(f"Starting llama-server on port {self.port}...")
211
213
  try:
212
- self._llama = LlamaServerProcess(self.model_path, port=self.port)
214
+ self._llama = LlamaServerProcess(self.model_path, port=self.port, model_family=self.model_family)
213
215
  self._llama.start()
214
216
 
215
217
  # Wait for ready
@@ -241,7 +243,7 @@ class LoreguardCLI:
241
243
  log.info(f"Worker ID: {self.worker_id}")
242
244
 
243
245
  try:
244
- llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}")
246
+ llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}", model_family=self.model_family)
245
247
 
246
248
  # ADR-0027: Load all ML services — the client is the sole provider
247
249
  # of NLI, intent, dialogue act, and chunk capabilities.
@@ -323,6 +325,11 @@ class LoreguardCLI:
323
325
  port=sdk_port,
324
326
  )
325
327
  log.info(f"SDK server listening on 127.0.0.1:{self._sdk_port}")
328
+
329
+ # Wire llama process for runtime model switching
330
+ from .http_server import set_llama_process
331
+ models_dir = self.model_path.parent if self.model_path else None
332
+ set_llama_process(self._llama, models_dir)
326
333
  except Exception as e:
327
334
  log.error(f"Failed to start SDK server: {e}")
328
335
  return False
@@ -460,6 +467,12 @@ Available model IDs:
460
467
  default=os.getenv("LOREGUARD_BUNDLE_DIR", ""),
461
468
  help="Loreguard bundle directory. Auto-discovers models from manifest.txt.",
462
469
  )
470
+ parser.add_argument(
471
+ "--model-family",
472
+ default=os.getenv("LOREGUARD_MODEL_FAMILY", "auto"),
473
+ choices=["auto", "llama3", "qwen3", "gemma", "chatml"],
474
+ help="Model family profile for chat template/stop sequences (default: auto)",
475
+ )
463
476
  parser.add_argument(
464
477
  "--dev",
465
478
  action="store_true",
@@ -531,6 +544,7 @@ Available model IDs:
531
544
  port=args.port,
532
545
  backend_url=args.backend,
533
546
  worker_id=args.worker_id or None, # None will use hostname
547
+ model_family=args.model_family,
534
548
  )
535
549
 
536
550
  exit_code = asyncio.run(cli.run())
@@ -49,6 +49,7 @@ class LoreguardConfig:
49
49
  dev_mode: bool = False
50
50
  context_size: int = 16384 # llama-server context window size (configurable per game)
51
51
  max_speech_tokens: int = 50 # Max tokens for NPC speech output (Pass 4). Default: 50 (~40 words)
52
+ model_family: str = "auto" # Model family profile (auto, llama3, qwen3, gemma, chatml)
52
53
 
53
54
  def save(self) -> None:
54
55
  """Save configuration to disk."""
@@ -71,6 +72,7 @@ class LoreguardConfig:
71
72
  dev_mode=data.get("dev_mode", False),
72
73
  context_size=data.get("context_size", 16384),
73
74
  max_speech_tokens=data.get("max_speech_tokens", 50),
75
+ model_family=data.get("model_family", "auto"),
74
76
  )
75
77
  except (json.JSONDecodeError, KeyError):
76
78
  pass
@@ -8,6 +8,8 @@ HTTP endpoints:
8
8
  GET /api/capabilities - Feature discovery (streaming, chunk modes)
9
9
  GET /api/characters - List available NPCs (proxied from engine)
10
10
  POST /api/chat - Chat with an NPC (streaming SSE or JSON)
11
+ GET /api/models - List available GGUF models
12
+ POST /api/admin/reload-model - Hot-swap LLM model at runtime
11
13
 
12
14
  The server shares the existing tunnel connection instead of creating
13
15
  a new one, ensuring a single WebSocket connection per worker.
@@ -17,10 +19,12 @@ Uses uvicorn with socket-first binding for race-condition-free port allocation.
17
19
 
18
20
  import asyncio
19
21
  import json
22
+ import os
20
23
  import threading
21
24
  import time
22
25
  import uuid
23
26
  from concurrent.futures import Future
27
+ from pathlib import Path
24
28
  from typing import Any, Callable, Optional
25
29
 
26
30
  from .runtime import write_runtime_info, RuntimeInfo, get_runtime_path, get_version
@@ -63,6 +67,8 @@ class EmbeddedHTTPServer:
63
67
  self._running = False
64
68
  self._bound_socket: Optional[Any] = None
65
69
  self._ready_event = threading.Event()
70
+ self.llama_process: Optional[Any] = None # LlamaServerProcess — set by RunningScreen
71
+ self.models_dir: Optional[Path] = None # Path to models/ directory
66
72
 
67
73
  def start(self) -> int:
68
74
  """Start the HTTP server in a background thread.
@@ -483,6 +489,101 @@ class EmbeddedHTTPServer:
483
489
  return JSONResponse(status_code=500, content=result)
484
490
  return result
485
491
 
492
+ @app.get("/api/models")
493
+ async def list_models():
494
+ """List available GGUF models in the models directory."""
495
+ if not server.models_dir or not server.models_dir.exists():
496
+ return JSONResponse(
497
+ status_code=404,
498
+ content={"error": "Models directory not configured"},
499
+ )
500
+
501
+ models = []
502
+ active_model = None
503
+ if server.llama_process and hasattr(server.llama_process, "model_path"):
504
+ active_model = server.llama_process.model_path.name
505
+
506
+ for f in sorted(server.models_dir.iterdir()):
507
+ if f.suffix == ".gguf" and f.is_file():
508
+ models.append({
509
+ "name": f.name,
510
+ "size": f.stat().st_size,
511
+ "active": f.name == active_model,
512
+ })
513
+
514
+ return {"models": models, "activeModel": active_model}
515
+
516
+ @app.post("/api/admin/reload-model")
517
+ async def reload_model(request: Request):
518
+ """Hot-swap the LLM model by restarting llama-server."""
519
+ if not server.llama_process:
520
+ return JSONResponse(
521
+ status_code=503,
522
+ content={"error": "LLM server not available"},
523
+ )
524
+ if not server.models_dir:
525
+ return JSONResponse(
526
+ status_code=503,
527
+ content={"error": "Models directory not configured"},
528
+ )
529
+
530
+ body = await request.json()
531
+ model_name = body.get("model", "")
532
+ if not model_name:
533
+ return JSONResponse(
534
+ status_code=400,
535
+ content={"error": "Missing 'model' field"},
536
+ )
537
+
538
+ # Security: prevent path traversal
539
+ if "/" in model_name or "\\" in model_name or ".." in model_name:
540
+ return JSONResponse(
541
+ status_code=400,
542
+ content={"error": "Invalid model name"},
543
+ )
544
+
545
+ model_path = server.models_dir / model_name
546
+ if not model_path.exists() or not model_path.suffix == ".gguf":
547
+ return JSONResponse(
548
+ status_code=404,
549
+ content={"error": f"Model '{model_name}' not found"},
550
+ )
551
+
552
+ # Check if already active
553
+ if hasattr(server.llama_process, "model_path") and server.llama_process.model_path.name == model_name:
554
+ return {"status": "already_active", "model": model_name}
555
+
556
+ try:
557
+ # Stop current llama-server
558
+ server.llama_process.stop()
559
+
560
+ # Update model path and restart
561
+ server.llama_process.model_path = model_path
562
+ server.llama_process.start()
563
+
564
+ # Wait for health check (llama-server takes a few seconds to load model)
565
+ import httpx
566
+ llama_url = f"http://127.0.0.1:{server.llama_process.port}/health"
567
+ for attempt in range(60): # 60 attempts × 0.5s = 30s timeout
568
+ await asyncio.sleep(0.5)
569
+ try:
570
+ async with httpx.AsyncClient(timeout=2.0) as client:
571
+ resp = await client.get(llama_url)
572
+ if resp.status_code == 200:
573
+ return {"status": "ok", "model": model_name}
574
+ except Exception:
575
+ continue
576
+
577
+ return JSONResponse(
578
+ status_code=500,
579
+ content={"error": "Model loaded but health check timed out after 30s"},
580
+ )
581
+ except Exception as e:
582
+ return JSONResponse(
583
+ status_code=500,
584
+ content={"error": f"Failed to reload model: {e}"},
585
+ )
586
+
486
587
  # Write runtime info
487
588
  with open(debug_path, "a") as f:
488
589
  f.write(f"[SDK Server] Writing runtime info for port {self.actual_port}...\n")
@@ -610,6 +711,15 @@ def force_stop_sdk_server() -> None:
610
711
  _server = None
611
712
 
612
713
 
714
+ def set_llama_process(llama_process: Any, models_dir: Optional[Path] = None) -> None:
715
+ """Set the LlamaServerProcess reference on the SDK server for model management."""
716
+ global _server
717
+ if _server:
718
+ _server.llama_process = llama_process
719
+ if models_dir:
720
+ _server.models_dir = models_dir
721
+
722
+
613
723
  def update_backend_status(connected: bool) -> None:
614
724
  """Update backend connection status in runtime.json."""
615
725
  global _server
@@ -372,11 +372,13 @@ class LlamaServerProcess:
372
372
  port: int = 8080,
373
373
  lora_path: Optional[Path] = None,
374
374
  context_size: int = 16384,
375
+ model_family: str = "llama3",
375
376
  ):
376
377
  self.model_path = model_path
377
378
  self.port = port
378
379
  self.lora_path = lora_path
379
380
  self.context_size = context_size
381
+ self.model_family = model_family
380
382
  self.process: Optional[subprocess.Popen] = None
381
383
  self._output_lines: list[str] = []
382
384
 
@@ -406,13 +408,18 @@ class LlamaServerProcess:
406
408
  # Without this, llama-server may allocate multiple slots, each consuming
407
409
  # KV cache memory proportional to context_size * model_hidden_dim.
408
410
  "-np", "1",
409
- # Use custom Jinja template without tool-calling logic.
410
- # Llama 3.1's built-in template forces tool-calling format even without tools,
411
- # so we use a stripped-down template that only handles chat messages.
411
+ # Enable Jinja template processing (required for both custom and embedded templates)
412
412
  "--jinja",
413
- "--chat-template-file", str(_get_templates_dir() / "llama31-no-tools.jinja"),
414
413
  ]
415
414
 
415
+ # Apply model-family-specific chat template override.
416
+ # Llama 3.1 requires a custom template to avoid the tool-calling bug;
417
+ # other families use their GGUF-embedded template (--jinja alone).
418
+ from .model_families import get_model_family
419
+ family = get_model_family(self.model_family)
420
+ if family.chat_template_file:
421
+ cmd.extend(["--chat-template-file", str(_get_templates_dir() / family.chat_template_file)])
422
+
416
423
  # Add LoRA adapter if specified
417
424
  if self.lora_path and self.lora_path.exists():
418
425
  cmd.extend(["--lora", str(self.lora_path)])
@@ -37,16 +37,16 @@ class SamplingConfig:
37
37
  presence_penalty: float = 0.0
38
38
 
39
39
 
40
- # Default stop sequences - ChatML/instruction markers that signal end of turn
41
- DEFAULT_STOP_SEQUENCES = [
42
- "<|im_end|>",
43
- "<|im_start|>",
44
- "<|endoftext|>",
45
- "</s>",
46
- "<|end|>",
47
- "<|user|>",
48
- "<|assistant|>",
49
- ]
40
+ from .model_families import get_model_family, ALL_STOP_MARKERS, DEFAULT_MODEL_FAMILY
41
+
42
+
43
+ def get_stop_sequences(model_family: str = DEFAULT_MODEL_FAMILY) -> list[str]:
44
+ """Get stop sequences for the given model family."""
45
+ return list(get_model_family(model_family).stop_sequences)
46
+
47
+
48
+ # Backward-compatible default (Llama 3 stop sequences)
49
+ DEFAULT_STOP_SEQUENCES = get_stop_sequences(DEFAULT_MODEL_FAMILY)
50
50
 
51
51
 
52
52
  @dataclass
@@ -94,11 +94,13 @@ class LLMProxy:
94
94
  sampling configuration, stop sequences, and JSON mode support.
95
95
  """
96
96
 
97
- def __init__(self, endpoint: str, timeout: float = 120.0):
97
+ def __init__(self, endpoint: str, timeout: float = 120.0, model_family: str = DEFAULT_MODEL_FAMILY):
98
98
  if not endpoint:
99
99
  raise ValueError("LLM endpoint is required")
100
100
  self.endpoint = endpoint.rstrip("/")
101
101
  self.default_timeout = timeout
102
+ self.model_family = model_family
103
+ self._stop_sequences = get_stop_sequences(model_family)
102
104
  self.client = httpx.AsyncClient(
103
105
  timeout=timeout,
104
106
  limits=httpx.Limits(
@@ -524,7 +526,7 @@ class LLMProxy:
524
526
  max_tokens=d.get("max_tokens", 512),
525
527
  timeout=timeout,
526
528
  sampling=sampling,
527
- stop=d.get("stop", DEFAULT_STOP_SEQUENCES.copy()),
529
+ stop=d.get("stop", self._stop_sequences.copy()),
528
530
  disable_thinking=d.get("disable_thinking", False),
529
531
  require_content=d.get("require_content", False),
530
532
  force_json=d.get("force_json", False),
@@ -717,14 +719,12 @@ class LLMProxy:
717
719
  return -1
718
720
 
719
721
  def _strip_chat_markers(self, content: str) -> str:
720
- """Remove content after ChatML markers that indicate hallucinated turns."""
721
- markers = [
722
- "<|im_end|>", "<|im_start|>", "<|endoftext|>",
723
- "</s>", "<|end|>", "<|user|>", "<|assistant|>",
724
- ]
722
+ """Remove content after chat markers that indicate hallucinated turns.
725
723
 
724
+ Uses a superset of all model families' tokens as a safety net.
725
+ """
726
726
  result = content
727
- for marker in markers:
727
+ for marker in ALL_STOP_MARKERS:
728
728
  if marker in result:
729
729
  idx = result.index(marker)
730
730
  result = result[:idx]
@@ -0,0 +1,121 @@
1
+ """Model family profiles for chat template and stop sequence configuration.
2
+
3
+ Different model families (Llama, Qwen, Gemma, etc.) use different chat template
4
+ formats and stop tokens. This module provides preconfigured profiles so users
5
+ can switch models without manually adjusting server flags.
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass
10
+ from typing import Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ @dataclass(frozen=True)
16
+ class ModelFamilyProfile:
17
+ """Preconfigured settings for a model family.
18
+
19
+ Attributes:
20
+ id: Unique identifier (used in config.json).
21
+ name: Human-readable display name.
22
+ chat_template_file: Jinja template filename (relative to templates/).
23
+ None means use the model's GGUF-embedded template via --jinja.
24
+ stop_sequences: Model-family-specific stop tokens for generation.
25
+ description: Short description for UI display.
26
+ """
27
+ id: str
28
+ name: str
29
+ chat_template_file: Optional[str]
30
+ stop_sequences: tuple[str, ...]
31
+ description: str
32
+
33
+
34
+ # Registry of known model family profiles.
35
+ # Key = profile ID (stored in config.json as model_family).
36
+ MODEL_FAMILIES: dict[str, ModelFamilyProfile] = {
37
+ "auto": ModelFamilyProfile(
38
+ id="auto",
39
+ name="Auto (Model Embedded)",
40
+ chat_template_file=None,
41
+ stop_sequences=(
42
+ # Superset — works for any model, extra tokens are inert
43
+ "<|im_end|>", "<|im_start|>", "<|endoftext|>",
44
+ "<|eot_id|>", "<|end_of_text|>",
45
+ "<end_of_turn>", "<start_of_turn>",
46
+ "</s>", "<|end|>",
47
+ ),
48
+ description="Uses model's embedded chat template. Works for most models.",
49
+ ),
50
+ "llama3": ModelFamilyProfile(
51
+ id="llama3",
52
+ name="Llama 3 / 3.1",
53
+ chat_template_file="llama31-no-tools.jinja",
54
+ stop_sequences=(
55
+ "<|eot_id|>",
56
+ "<|end_of_text|>",
57
+ ),
58
+ description="Meta Llama 3.x series. Uses custom template to disable tool-calling.",
59
+ ),
60
+ "qwen3": ModelFamilyProfile(
61
+ id="qwen3",
62
+ name="Qwen 3 / 3.5",
63
+ chat_template_file=None,
64
+ stop_sequences=(
65
+ "<|im_end|>",
66
+ "<|im_start|>",
67
+ "<|endoftext|>",
68
+ ),
69
+ description="Alibaba Qwen 3.x series. ChatML format with thinking support.",
70
+ ),
71
+ "gemma": ModelFamilyProfile(
72
+ id="gemma",
73
+ name="Google Gemma",
74
+ chat_template_file=None,
75
+ stop_sequences=(
76
+ "<end_of_turn>",
77
+ "<start_of_turn>",
78
+ ),
79
+ description="Google Gemma models. Uses model-embedded template.",
80
+ ),
81
+ "chatml": ModelFamilyProfile(
82
+ id="chatml",
83
+ name="ChatML (Generic)",
84
+ chat_template_file=None,
85
+ stop_sequences=(
86
+ "<|im_end|>",
87
+ "<|im_start|>",
88
+ "<|endoftext|>",
89
+ "</s>",
90
+ ),
91
+ description="Generic ChatML-compatible models (Nous Hermes, OpenChat, etc.).",
92
+ ),
93
+ }
94
+
95
+ DEFAULT_MODEL_FAMILY = "auto"
96
+
97
+
98
+ def get_model_family(family_id: str) -> ModelFamilyProfile:
99
+ """Get a model family profile by ID.
100
+
101
+ Falls back to DEFAULT_MODEL_FAMILY if the ID is unknown.
102
+ """
103
+ profile = MODEL_FAMILIES.get(family_id)
104
+ if profile is None:
105
+ logger.warning(
106
+ "Unknown model family '%s', falling back to '%s'. Valid: %s",
107
+ family_id, DEFAULT_MODEL_FAMILY, ", ".join(MODEL_FAMILIES.keys()),
108
+ )
109
+ profile = MODEL_FAMILIES[DEFAULT_MODEL_FAMILY]
110
+ return profile
111
+
112
+
113
+ # Superset of all stop markers across all families.
114
+ # Used for _strip_chat_markers() safety net — catches markers from ANY model family.
115
+ ALL_STOP_MARKERS: tuple[str, ...] = tuple(sorted(set(
116
+ marker
117
+ for profile in MODEL_FAMILIES.values()
118
+ for marker in profile.stop_sequences
119
+ ) | {
120
+ "</s>", "<|end|>", "<|user|>", "<|assistant|>",
121
+ }))
@@ -55,6 +55,7 @@ class LoreguardApp(App):
55
55
  worker_id: str = ""
56
56
  model_path: Optional[Path] = None
57
57
  adapter_path: Optional[Path] = None # Optional LoRA adapter
58
+ model_family: str = "auto" # Model family profile (auto, llama3, qwen3, gemma, chatml)
58
59
  hardware: Optional[HardwareData] = None
59
60
  dev_mode: bool = False
60
61
  verbose: bool = False
@@ -156,6 +156,7 @@ class MainScreen(Screen):
156
156
  app.api_token = config.api_token
157
157
  app.model_path = config.get_model_path_obj()
158
158
  app.adapter_path = config.get_adapter_path_obj()
159
+ app.model_family = config.model_family
159
160
  app.dev_mode = config.dev_mode
160
161
 
161
162
  model_name = app.model_path.name if app.model_path else 'unknown'
@@ -420,7 +421,7 @@ class MainScreen(Screen):
420
421
 
421
422
  # Start llama-server (with optional LoRA adapter)
422
423
  self._update_status("Starting llama-server...")
423
- app._llama_process = LlamaServerProcess(app.model_path, port=8080, lora_path=app.adapter_path)
424
+ app._llama_process = LlamaServerProcess(app.model_path, port=8080, lora_path=app.adapter_path, model_family=app.model_family)
424
425
  app._llama_process.start()
425
426
 
426
427
  # Wait for model to load with progress updates
@@ -507,7 +508,7 @@ class MainScreen(Screen):
507
508
  self._update_connection_status("connecting")
508
509
 
509
510
  try:
510
- llm_proxy = LLMProxy("http://127.0.0.1:8080")
511
+ llm_proxy = LLMProxy("http://127.0.0.1:8080", model_family=app.model_family)
511
512
 
512
513
  # Load NLI service (run in thread pool to not block event loop)
513
514
  nli_service = None
@@ -138,7 +138,7 @@ class RunningScreen(Screen):
138
138
  self._update_status("model", "Model", app.model_path.name)
139
139
  self._log(f"Starting llama-server with {app.model_path.name}", "info")
140
140
 
141
- self._llama_process = LlamaServerProcess(app.model_path, port=8080)
141
+ self._llama_process = LlamaServerProcess(app.model_path, port=8080, model_family=app.model_family)
142
142
  self._llama_process.start()
143
143
 
144
144
  # Wait for model to load with progress updates
@@ -185,6 +185,14 @@ class RunningScreen(Screen):
185
185
  self._update_status("server", "llama-server", f"Running on :8080 ({elapsed}s)", "success")
186
186
  self._log(f"LLM ready in {elapsed}s", "success")
187
187
 
188
+ # Wire llama process to SDK server for runtime model switching
189
+ try:
190
+ from ...http_server import set_llama_process
191
+ models_dir = app.model_path.parent if app.model_path else None
192
+ set_llama_process(self._llama_process, models_dir)
193
+ except Exception:
194
+ pass # SDK server may not be running yet in all modes
195
+
188
196
  # Connect backend
189
197
  if not app.dev_mode:
190
198
  self._update_status("backend", "Backend", "Connecting...", "info")
@@ -200,7 +208,7 @@ class RunningScreen(Screen):
200
208
  get_dialogue_act_model_info,
201
209
  )
202
210
 
203
- llm_proxy = LLMProxy("http://127.0.0.1:8080")
211
+ llm_proxy = LLMProxy("http://127.0.0.1:8080", model_family=app.model_family)
204
212
 
205
213
  # Load NLI service (run in thread pool to not block event loop)
206
214
  nli_service = None
File without changes
File without changes
File without changes