loreguard-cli 0.12.2__tar.gz → 0.14.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/.github/workflows/release.yml +28 -8
  2. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/PKG-INFO +1 -1
  3. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/pyproject.toml +1 -1
  4. loreguard_cli-0.14.0rc1/sdk/API.md +249 -0
  5. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/sdk/python/loreguard_sdk.py +59 -3
  6. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/cli.py +45 -2
  7. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/config.py +40 -0
  8. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/http_server.py +82 -6
  9. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/llama_server.py +22 -2
  10. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/llm.py +16 -15
  11. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/main.py +5 -3
  12. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/widgets/banner.py +3 -2
  13. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tunnel.py +8 -0
  14. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/.claude/skills/llama-cpp-troubleshooting/SKILL.md +0 -0
  15. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/.env.example +0 -0
  16. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/.gitignore +0 -0
  17. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/LICENSE +0 -0
  18. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/README.md +0 -0
  19. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/THIRD_PARTY_NOTICES.md +0 -0
  20. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/scripts/build.py +0 -0
  21. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/sdk/csharp/LoreguardSDK.cs +0 -0
  22. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/sdk/gdscript/LoreguardSDK.gd +0 -0
  23. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/sdk/javascript/loreguard-sdk.js +0 -0
  24. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/__init__.py +0 -0
  25. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/__main__.py +0 -0
  26. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/chunk_detector.py +0 -0
  27. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/dialogue_act_classifier.py +0 -0
  28. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/hf_discovery.py +0 -0
  29. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/intent_classifier.py +0 -0
  30. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/models_registry.py +0 -0
  31. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/nli.py +0 -0
  32. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/npc_chat.py +0 -0
  33. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/runtime.py +0 -0
  34. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/steam.py +0 -0
  35. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/term_ui.py +0 -0
  36. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/__init__.py +0 -0
  37. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/app.py +0 -0
  38. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/modals/__init__.py +0 -0
  39. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/modals/auth_menu.py +0 -0
  40. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/modals/npc_chat.py +0 -0
  41. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/modals/token_input.py +0 -0
  42. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/modals/unified_palette.py +0 -0
  43. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/screens/__init__.py +0 -0
  44. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/screens/auth.py +0 -0
  45. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/screens/main.py +0 -0
  46. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/screens/model_select.py +0 -0
  47. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/screens/nli_setup.py +0 -0
  48. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/screens/running.py +0 -0
  49. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/styles.py +0 -0
  50. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/widgets/__init__.py +0 -0
  51. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/widgets/footer.py +0 -0
  52. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/widgets/hardware_info.py +0 -0
  53. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/widgets/npc_chat.py +0 -0
  54. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/widgets/server_monitor.py +0 -0
  55. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/tui/widgets/status_panel.py +0 -0
  56. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/src/wizard.py +0 -0
  57. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/templates/llama31-no-tools.jinja +0 -0
  58. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/tests/test_nli_hhem.py +0 -0
  59. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/tests/test_websocket_timeout.py +0 -0
  60. {loreguard_cli-0.12.2 → loreguard_cli-0.14.0rc1}/uv.lock +0 -0
@@ -7,8 +7,21 @@ on:
7
7
  workflow_dispatch:
8
8
 
9
9
  jobs:
10
- build-windows:
11
- runs-on: windows-latest
10
+ build:
11
+ strategy:
12
+ matrix:
13
+ include:
14
+ - os: windows-latest
15
+ artifact-name: loreguard-windows-amd64.exe
16
+ build-output: dist/loreguard.exe
17
+ - os: macos-latest
18
+ artifact-name: loreguard-darwin-arm64
19
+ build-output: dist/loreguard
20
+ - os: ubuntu-latest
21
+ artifact-name: loreguard-linux-amd64
22
+ build-output: dist/loreguard
23
+
24
+ runs-on: ${{ matrix.os }}
12
25
 
13
26
  steps:
14
27
  - uses: actions/checkout@v4
@@ -26,30 +39,37 @@ jobs:
26
39
  - name: Build with PyInstaller
27
40
  run: python scripts/build.py
28
41
 
42
+ - name: Rename artifact
43
+ shell: bash
44
+ run: mv ${{ matrix.build-output }} dist/${{ matrix.artifact-name }}
45
+
29
46
  - name: Upload artifact
30
47
  uses: actions/upload-artifact@v4
31
48
  with:
32
- name: loreguard-windows.exe
33
- path: dist/loreguard.exe
49
+ name: ${{ matrix.artifact-name }}
50
+ path: dist/${{ matrix.artifact-name }}
34
51
 
35
52
  release:
36
- needs: build-windows
53
+ needs: build
37
54
  runs-on: ubuntu-latest
38
55
  if: startsWith(github.ref, 'refs/tags/')
39
56
  permissions:
40
57
  contents: write
41
58
 
42
59
  steps:
43
- - name: Download Windows artifact
60
+ - name: Download all artifacts
44
61
  uses: actions/download-artifact@v4
45
62
  with:
46
- name: loreguard-windows.exe
47
63
  path: artifacts
64
+ merge-multiple: true
48
65
 
49
66
  - name: Create Release
50
67
  uses: softprops/action-gh-release@v1
51
68
  with:
52
- files: artifacts/loreguard.exe
69
+ files: |
70
+ artifacts/loreguard-windows-amd64.exe
71
+ artifacts/loreguard-darwin-arm64
72
+ artifacts/loreguard-linux-amd64
53
73
  generate_release_notes: true
54
74
  env:
55
75
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: loreguard-cli
3
- Version: 0.12.2
3
+ Version: 0.14.0rc1
4
4
  Summary: Local inference client for Loreguard NPCs
5
5
  Project-URL: Homepage, https://loreguard.com
6
6
  Project-URL: Documentation, https://github.com/beyond-logic-labs/loreguard-cli#readme
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "loreguard-cli"
7
- version = "0.12.2"
7
+ version = "0.14.0-rc.1"
8
8
  description = "Local inference client for Loreguard NPCs"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -0,0 +1,249 @@
1
+ # Loreguard Client API Reference
2
+
3
+ Loreguard Client exposes a local HTTP API that any game can call. The server runs on `127.0.0.1` with a dynamic port written to `runtime.json`.
4
+
5
+ ## Service Discovery
6
+
7
+ On startup, loreguard-client writes a `runtime.json` file:
8
+
9
+ | Platform | Path |
10
+ |----------|------|
11
+ | macOS | `~/Library/Application Support/loreguard/runtime.json` |
12
+ | Linux | `~/.local/share/loreguard/runtime.json` (or `$XDG_DATA_HOME/loreguard/`) |
13
+ | Windows | `%APPDATA%/loreguard/runtime.json` |
14
+
15
+ ```json
16
+ {
17
+ "port": 52341,
18
+ "pid": 12345,
19
+ "url": "http://127.0.0.1:52341",
20
+ "started_at": "2026-02-20T10:30:00Z",
21
+ "version": "0.7.0",
22
+ "backend_connected": true
23
+ }
24
+ ```
25
+
26
+ Read this file to discover the port, then make HTTP calls to `http://127.0.0.1:{port}`.
27
+
28
+ ---
29
+
30
+ ## Endpoints
31
+
32
+ ### `GET /health`
33
+
34
+ Check if loreguard-client is running and connected to the backend.
35
+
36
+ **Response:**
37
+
38
+ ```json
39
+ {
40
+ "status": "ok",
41
+ "backend_connected": true
42
+ }
43
+ ```
44
+
45
+ Returns `500` with `{"status": "error", "error": "<message>"}` if the health check itself raises an error.
46
+
47
+ ---
48
+
49
+ ### `GET /api/capabilities`
50
+
51
+ Discover what features this bundle supports. Use this to feature-detect before sending requests with optional fields.
52
+
53
+ **Response:**
54
+
55
+ ```json
56
+ {
57
+ "streaming": true,
58
+ "chunk_modes": ["deberta", "sentence"],
59
+ "manages_history": false
60
+ }
61
+ ```
62
+
63
+ | Field | Type | Description |
64
+ |-------|------|-------------|
65
+ | `streaming` | bool | Whether SSE streaming is supported |
66
+ | `chunk_modes` | string[] | Available chunk detection modes (e.g. `"deberta"`, `"sentence"`) |
67
+ | `manages_history` | bool | Whether the bundle can manage conversation history internally |
68
+
69
+ ---
70
+
71
+ ### `POST /api/chat`
72
+
73
+ Send a player message and get an NPC response. Supports both blocking JSON and SSE streaming.
74
+
75
+ **Request Headers:**
76
+
77
+ | Header | Value | Effect |
78
+ |--------|-------|--------|
79
+ | `Content-Type` | `application/json` | Required |
80
+ | `Accept` | `text/event-stream` | Enables SSE streaming (optional) |
81
+ | `Authorization` | `Bearer <token>` | API token for character access (optional) |
82
+
83
+ **Request Body:**
84
+
85
+ ```json
86
+ {
87
+ "character_id": "merchant-npc",
88
+ "message": "What do you have for sale?",
89
+ "player_handle": "player1",
90
+ "player_id": "uuid-here",
91
+ "current_context": "player is in the marketplace",
92
+ "scenario_id": "main-quest",
93
+ "history": [
94
+ {"role": "user", "content": "Hello!"},
95
+ {"role": "assistant", "content": "Welcome, traveler!"}
96
+ ],
97
+ "chunk_mode": "deberta",
98
+ "manage_history": false,
99
+ "max_speech_tokens": 150,
100
+ "verbose": false,
101
+ "enable_thinking": false
102
+ }
103
+ ```
104
+
105
+ | Field | Type | Required | Description |
106
+ |-------|------|----------|-------------|
107
+ | `character_id` | string | Yes | NPC identifier |
108
+ | `message` | string | Yes | Player's message |
109
+ | `player_handle` | string | No | Player's display name |
110
+ | `player_id` | string | No | Player's unique ID for per-player state. If empty, backend uses the developer's owner ID |
111
+ | `current_context` | string | No | Game context (location, situation) |
112
+ | `scenario_id` | string | No | Scenario identifier |
113
+ | `history` | array | No | Conversation history. Omit if `manage_history` is true |
114
+ | `chunk_mode` | string | No | `"deberta"` for ML-based chunk splitting, `"sentence"` for regex sentence splitting, `""` or omit for none |
115
+ | `manage_history` | bool | No | If true, the backend manages conversation history per character+player pair |
116
+ | `max_speech_tokens` | int | No | Maximum tokens in NPC speech (0 = default) |
117
+ | `verbose` | bool | No | Include pipeline pass updates in response (for debugging) |
118
+ | `enable_thinking` | bool | No | Include NPC internal monologue |
119
+
120
+ All field names accept both `snake_case` and `camelCase` (e.g. `character_id` or `characterId`).
121
+
122
+ #### JSON Response (default)
123
+
124
+ When the `Accept` header does not include `text/event-stream`:
125
+
126
+ ```json
127
+ {
128
+ "response": "I have potions, swords, and shields. What interests you?",
129
+ "verified": true,
130
+ "citations": ["knowledge/inventory.md:5"],
131
+ "chunks": [
132
+ "I have potions, swords, and shields.",
133
+ "What interests you?"
134
+ ],
135
+ "pipeline_trace": []
136
+ }
137
+ ```
138
+
139
+ | Field | Type | Description |
140
+ |-------|------|-------------|
141
+ | `response` | string | Full NPC speech |
142
+ | `verified` | bool | Whether NeMo verification passed |
143
+ | `citations` | string[] | Knowledge sources used |
144
+ | `chunks` | string[] | Sentence/chunk boundaries (only present when `chunk_mode` was set) |
145
+ | `pipeline_trace` | array | Pipeline pass details (only present when `verbose` was true) |
146
+
147
+ #### SSE Streaming Response
148
+
149
+ When `Accept: text/event-stream` is set, the response is a stream of Server-Sent Events:
150
+
151
+ ```
152
+ event: filler
153
+ data: {"text": "Hmm...", "dialogueAct": "wh-question"}
154
+
155
+ event: pass_update
156
+ data: {"pass": "retrieval", "status": "complete", "latencyMs": 120}
157
+
158
+ event: token
159
+ data: {"t": "I"}
160
+
161
+ event: token
162
+ data: {"t": " have"}
163
+
164
+ event: token
165
+ data: {"t": " potions"}
166
+
167
+ event: done
168
+ data: {"speech": "I have potions...", "verified": true, "citations": [...], "chunks": [...]}
169
+
170
+ event: follow_up
171
+ data: {"speech": "By the way, new stock arrives tomorrow.", ...}
172
+ ```
173
+
174
+ | Event | Data | Description |
175
+ |-------|------|-------------|
176
+ | `filler` | `{text, dialogueAct}` | Contextual filler message. Sent early (~100ms) before the pipeline completes. Display immediately for perceived responsiveness |
177
+ | `token` | `{t}` | Single token from the LLM. Append to build the response incrementally |
178
+ | `pass_update` | `{pass, status, ...}` | Pipeline pass progress (only when `verbose` is true). For debugging |
179
+ | `done` | `{speech, verified, citations, chunks, ...}` | Final verified response. Carries the same information as the JSON response; note that in this event the NPC text is in the `speech` field |
180
+ | `follow_up` | `{speech, ...}` | Unsolicited follow-up message from the NPC (may arrive after `done`) |
181
+ | `error` | `{error}` | Error message. Stream ends after this |
182
+
183
+ ---
184
+
185
+ ## Integration Patterns
186
+
187
+ ### Minimal (any language)
188
+
189
+ Just POST JSON and read the response:
190
+
191
+ ```
192
+ POST http://127.0.0.1:{port}/api/chat
193
+ Content-Type: application/json
194
+
195
+ {"character_id": "merchant", "message": "Hello!"}
196
+ ```
197
+
198
+ ### With Chunks (for staggered display)
199
+
200
+ Request chunked responses and display each chunk separately with delays:
201
+
202
+ ```json
203
+ {
204
+ "character_id": "merchant",
205
+ "message": "Hello!",
206
+ "chunk_mode": "deberta"
207
+ }
208
+ ```
209
+
210
+ Response includes `chunks` array. Display each chunk as a separate message/bubble with ~500-700ms delays between them.
211
+
212
+ ### With Server-Managed History
213
+
214
+ Let the backend track conversation history so your game doesn't have to:
215
+
216
+ ```json
217
+ {
218
+ "character_id": "merchant",
219
+ "message": "Hello!",
220
+ "player_id": "player-uuid",
221
+ "manage_history": true
222
+ }
223
+ ```
224
+
225
+ No `history` field needed. The backend maintains a rolling conversation buffer per `(character_id, player_id)` pair.
226
+
227
+ ### With Streaming + Filler
228
+
229
+ For games with real-time UX (speech bubbles, typing indicators):
230
+
231
+ 1. Set `Accept: text/event-stream`
232
+ 2. On `filler` event: show typing indicator or filler text ("Hmm...")
233
+ 3. On `token` events: append tokens to display
234
+ 4. On `done` event: finalize the response, show verified status
235
+
236
+ ---
237
+
238
+ ## SDK Files
239
+
240
+ Pre-built SDK files for common engines are in the `sdk/` directory. Copy the relevant file into your project:
241
+
242
+ | Engine | File | Notes |
243
+ |--------|------|-------|
244
+ | Python | `sdk/python/loreguard_sdk.py` | Requires `httpx`. Async + sync support |
245
+ | JavaScript / Electron | `sdk/javascript/loreguard-sdk.js` | Node.js CommonJS. Uses `fetch` |
246
+ | Unity / C# | `sdk/csharp/LoreguardSDK.cs` | Uses `UnityWebRequest`. Coroutine-based |
247
+ | Godot 4 | `sdk/gdscript/LoreguardSDK.gd` | Signal-based. Supports streaming |
248
+
249
+ These are thin HTTP wrappers around the endpoints documented above. You can also call the API directly from any language that supports HTTP.
@@ -145,7 +145,12 @@ async def chat(
145
145
  character_id: str,
146
146
  message: str,
147
147
  player_handle: str = "",
148
+ player_id: str = "",
148
149
  current_context: str = "",
150
+ history: list[dict[str, Any]] | None = None,
151
+ chunk_mode: str = "",
152
+ manage_history: bool = False,
153
+ max_speech_tokens: int = 0,
149
154
  stream: bool = True,
150
155
  ) -> AsyncIterator[dict[str, Any]]:
151
156
  """Chat with an NPC via loreguard-client.
@@ -154,12 +159,17 @@ async def chat(
154
159
  character_id: The NPC's ID
155
160
  message: Player's message to the NPC
156
161
  player_handle: Player's display name (optional)
162
+ player_id: Player's unique ID for per-player state (optional)
157
163
  current_context: Game context like "in a dark cave" (optional)
164
+ history: Conversation history as [{"role": "user"|"assistant", "content": "..."}] (optional)
165
+ chunk_mode: Chunk detection mode — "deberta" for ML-based, "sentence" for regex sentence splitting, "" for none (optional)
166
+ manage_history: If True, bundle manages history internally per character+player (optional)
167
+ max_speech_tokens: Max tokens in NPC speech (optional, 0 = default)
158
168
  stream: If True, yields tokens as they arrive. If False, yields final response.
159
169
 
160
170
  Yields:
161
171
  For streaming: {"t": "token"} for each token, then {"speech": "...", "verified": True, ...}
162
- For non-streaming: Single dict with complete response
172
+ For non-streaming: Single dict with complete response (includes "chunks" if chunk_mode set)
163
173
 
164
174
  Raises:
165
175
  RuntimeError: If loreguard-client is not running
@@ -181,12 +191,22 @@ async def chat(
181
191
  if stream:
182
192
  headers["Accept"] = "text/event-stream"
183
193
 
184
- body = {
194
+ body: dict[str, Any] = {
185
195
  "character_id": character_id,
186
196
  "message": message,
187
197
  "player_handle": player_handle,
188
198
  "current_context": current_context,
189
199
  }
200
+ if player_id:
201
+ body["player_id"] = player_id
202
+ if history:
203
+ body["history"] = history
204
+ if chunk_mode:
205
+ body["chunk_mode"] = chunk_mode
206
+ if manage_history:
207
+ body["manage_history"] = True
208
+ if max_speech_tokens > 0:
209
+ body["max_speech_tokens"] = max_speech_tokens
190
210
 
191
211
  async with httpx.AsyncClient() as client:
192
212
  if stream:
@@ -214,6 +234,26 @@ async def chat(
214
234
  yield response.json()
215
235
 
216
236
 
237
+ async def get_capabilities() -> dict[str, Any]:
238
+ """Get bundle capabilities.
239
+
240
+ Returns:
241
+ Capabilities dict with streaming, chunk_modes, manages_history.
242
+
243
+ Raises:
244
+ RuntimeError: If loreguard-client is not running
245
+ ImportError: If httpx is not installed
246
+ """
247
+ if httpx is None:
248
+ raise ImportError("httpx is required. Install with: pip install httpx")
249
+
250
+ url = f"{get_base_url()}/api/capabilities"
251
+ async with httpx.AsyncClient() as client:
252
+ response = await client.get(url, timeout=5.0)
253
+ response.raise_for_status()
254
+ return response.json()
255
+
256
+
217
257
  async def health_check() -> dict[str, Any]:
218
258
  """Check loreguard-client health.
219
259
 
@@ -239,7 +279,12 @@ def chat_sync(
239
279
  character_id: str,
240
280
  message: str,
241
281
  player_handle: str = "",
282
+ player_id: str = "",
242
283
  current_context: str = "",
284
+ history: list[dict[str, Any]] | None = None,
285
+ chunk_mode: str = "",
286
+ manage_history: bool = False,
287
+ max_speech_tokens: int = 0,
243
288
  ) -> dict[str, Any]:
244
289
  """Synchronous chat (non-streaming).
245
290
 
@@ -247,17 +292,28 @@ def chat_sync(
247
292
 
248
293
  Returns:
249
294
  Complete response dict with speech, verified, citations, etc.
295
+ Includes "chunks" list if chunk_mode was set.
250
296
  """
251
297
  if httpx is None:
252
298
  raise ImportError("httpx is required. Install with: pip install httpx")
253
299
 
254
300
  url = f"{get_base_url()}/api/chat"
255
- body = {
301
+ body: dict[str, Any] = {
256
302
  "character_id": character_id,
257
303
  "message": message,
258
304
  "player_handle": player_handle,
259
305
  "current_context": current_context,
260
306
  }
307
+ if player_id:
308
+ body["player_id"] = player_id
309
+ if history:
310
+ body["history"] = history
311
+ if chunk_mode:
312
+ body["chunk_mode"] = chunk_mode
313
+ if manage_history:
314
+ body["manage_history"] = True
315
+ if max_speech_tokens > 0:
316
+ body["max_speech_tokens"] = max_speech_tokens
261
317
 
262
318
  with httpx.Client() as client:
263
319
  response = client.post(url, json=body, timeout=120.0)
@@ -240,12 +240,33 @@ class LoreguardCLI:
240
240
  try:
241
241
  llm_proxy = LLMProxy(f"http://127.0.0.1:{self.port}")
242
242
 
243
+ # ADR-0027: Load all ML services — the client is the sole provider
244
+ # of NLI, intent, dialogue act, and chunk capabilities.
245
+ # Use resolve_model_path() to prefer pre-shipped models (enterprise bundles).
246
+ from .config import resolve_model_path
247
+
248
+ # Initialize NLI service (HHEM grounding model)
249
+ nli_service = None
250
+ try:
251
+ from .nli import NLIService
252
+ nli_model = resolve_model_path("vectara/hallucination_evaluation_model", "hhem")
253
+ log.info(f"Loading NLI model ({nli_model})...")
254
+ nli_service = NLIService(model_path=nli_model)
255
+ if nli_service.load_model():
256
+ log.info(f"NLI ready (device: {nli_service.device})")
257
+ else:
258
+ log.warning("NLI model failed to load")
259
+ nli_service = None
260
+ except Exception as e:
261
+ log.warning(f"NLI error: {e}")
262
+
243
263
  # Initialize intent classifier (ADR-0010)
244
264
  intent_classifier = None
245
265
  try:
246
266
  from .intent_classifier import IntentClassifier
247
- log.info("Loading intent classifier...")
248
- intent_classifier = IntentClassifier()
267
+ intent_model = resolve_model_path("MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0", "deberta")
268
+ log.info(f"Loading intent classifier ({intent_model})...")
269
+ intent_classifier = IntentClassifier(model_path=intent_model)
249
270
  if intent_classifier.load_model():
250
271
  log.info(f"Intent classifier ready (device: {intent_classifier.device})")
251
272
  else:
@@ -254,6 +275,26 @@ class LoreguardCLI:
254
275
  except Exception as e:
255
276
  log.warning(f"Intent classifier error: {e}")
256
277
 
278
+ # Initialize dialogue act classifier
279
+ dialogue_act_classifier = None
280
+ try:
281
+ from .dialogue_act_classifier import (
282
+ DialogueActClassifier,
283
+ is_dialogue_act_model_available,
284
+ )
285
+ if is_dialogue_act_model_available():
286
+ log.info("Loading dialogue act classifier...")
287
+ dialogue_act_classifier = DialogueActClassifier()
288
+ if dialogue_act_classifier.load_model():
289
+ log.info(f"Dialogue act classifier ready (device: {dialogue_act_classifier.device})")
290
+ else:
291
+ log.warning("Dialogue act classifier failed to load")
292
+ dialogue_act_classifier = None
293
+ else:
294
+ log.info("Dialogue act model not available, skipping")
295
+ except Exception as e:
296
+ log.warning(f"Dialogue act classifier error: {e}")
297
+
257
298
  # Initialize chunk detector (ADR-0023) - shares model with intent classifier
258
299
  chunk_detector = None
259
300
  try:
@@ -278,7 +319,9 @@ class LoreguardCLI:
278
319
  worker_id=self.worker_id,
279
320
  worker_token=self.token,
280
321
  model_id=self.model_path.stem if self.model_path else "unknown",
322
+ nli_service=nli_service,
281
323
  intent_classifier=intent_classifier,
324
+ dialogue_act_classifier=dialogue_act_classifier,
282
325
  chunk_detector=chunk_detector,
283
326
  )
284
327
 
@@ -152,9 +152,49 @@ def load_config() -> dict:
152
152
 
153
153
  # Context compaction: if True, truncate old messages instead of erroring
154
154
  "CONTEXT_COMPACTION": os.getenv("CONTEXT_COMPACTION", "true").lower() == "true",
155
+
156
+ # ADR-0027: Pre-shipped models directory for enterprise bundles.
157
+ # When set, model loaders check this directory first before downloading from HF.
158
+ # Expected subdirectories: hhem/, deberta/, distilbert/, llm/
159
+ "MODELS_DIR": os.getenv("LOREGUARD_MODELS_DIR", ""),
160
+
161
+ # Pre-shipped llama-server binary path (enterprise bundles).
162
+ # When set, skips auto-download and uses this binary directly.
163
+ "LLAMA_SERVER_PATH": os.getenv("LOREGUARD_LLAMA_SERVER_PATH", ""),
155
164
  }
156
165
 
157
166
 
167
+ def get_models_dir() -> Optional[Path]:
168
+ """Get the pre-shipped models directory, if configured (ADR-0027).
169
+
170
+ Returns None if not set or if the configured path is not an existing directory, meaning models should be auto-downloaded from HF.
171
+ """
172
+ models_dir = get_config_value("MODELS_DIR")
173
+ if models_dir:
174
+ path = Path(models_dir)
175
+ if path.exists() and path.is_dir():
176
+ return path
177
+ return None
178
+
179
+
180
+ def resolve_model_path(model_name: str, subdir: str = "") -> str:
181
+ """Resolve a model path, preferring pre-shipped models over HF downloads.
182
+
183
+ Args:
184
+ model_name: HuggingFace model name (e.g., 'vectara/hallucination_evaluation_model')
185
+ subdir: Subdirectory within MODELS_DIR to check (e.g., 'hhem', 'deberta')
186
+
187
+ Returns:
188
+ Local path if pre-shipped model found, otherwise the original HF model name.
189
+ """
190
+ models_dir = get_models_dir()
191
+ if models_dir and subdir:
192
+ local_path = models_dir / subdir
193
+ if local_path.exists() and any(local_path.iterdir()):
194
+ return str(local_path)
195
+ return model_name
196
+
197
+
158
198
  def get_config_value(key: str, default: Optional[str] = None) -> Optional[str]:
159
199
  """Get a single configuration value."""
160
200
  config = load_config()
@@ -1,12 +1,18 @@
1
- """Embedded HTTP server for game SDK connections.
1
+ """Embedded HTTP server for game SDK connections (ADR-0027).
2
2
 
3
- This module provides a lightweight HTTP server that can be started
4
- alongside the TUI or CLI to handle game client requests via SSE.
3
+ This module provides the official SDK HTTP interface for games to interact
4
+ with loreguard. Games discover this server via runtime.json and call its
5
+ HTTP endpoints:
6
+
7
+ GET /health - Health check with backend connection status
8
+ GET /api/capabilities - Feature discovery (streaming, chunk modes)
9
+ GET /api/characters - List available NPCs (proxied from engine)
10
+ POST /api/chat - Chat with an NPC (streaming SSE or JSON)
5
11
 
6
12
  The server shares the existing tunnel connection instead of creating
7
- a new one, ensuring a single WebSocket connection per player.
13
+ a new one, ensuring a single WebSocket connection per worker.
8
14
 
9
- Uses hypercorn with socket-first binding for race-condition-free port allocation.
15
+ Uses uvicorn with socket-first binding for race-condition-free port allocation.
10
16
  """
11
17
 
12
18
  import asyncio
@@ -266,6 +272,8 @@ class EmbeddedHTTPServer:
266
272
  "verified": data.get("verified", False),
267
273
  "citations": data.get("citations", []),
268
274
  }
275
+ if data.get("chunks"):
276
+ result["chunks"] = data["chunks"]
269
277
  if pipeline_trace:
270
278
  result["pipeline_trace"] = pipeline_trace
271
279
  return result
@@ -313,16 +321,78 @@ class EmbeddedHTTPServer:
313
321
  async def health():
314
322
  try:
315
323
  backend_connected = server.tunnel.connected if server.tunnel else False
316
- return {
324
+ result = {
317
325
  "status": "ok",
318
326
  "backend_connected": backend_connected,
319
327
  }
328
+ # Include capabilities for game clients to check readiness
329
+ if server.tunnel and hasattr(server.tunnel, "capabilities"):
330
+ result["capabilities"] = server.tunnel.capabilities
331
+ return result
320
332
  except Exception as e:
321
333
  return JSONResponse(
322
334
  status_code=500,
323
335
  content={"status": "error", "error": str(e)},
324
336
  )
325
337
 
338
+ @app.get("/api/capabilities")
339
+ async def capabilities():
340
+ caps = {
341
+ "streaming": True,
342
+ "chunk_modes": ["sentence"],
343
+ "manages_history": False,
344
+ }
345
+ if server.tunnel:
346
+ if getattr(server.tunnel, "chunk_detector", None) and server.tunnel.chunk_detector.is_loaded:
347
+ caps["chunk_modes"].append("deberta")
348
+ return caps
349
+
350
+ @app.get("/api/characters")
351
+ async def characters(request: Request):
352
+ """Proxy character listing from the engine (ADR-0027).
353
+
354
+ Games discover NPCs through the client SDK, not by calling the engine directly.
355
+ """
356
+ if not server.tunnel or not server.tunnel.connected:
357
+ return JSONResponse(
358
+ status_code=503,
359
+ content={"error": "Not connected to backend. Start the engine first."},
360
+ )
361
+
362
+ # Derive HTTP base URL from WebSocket URL
363
+ # ws://localhost:8090/workers → http://localhost:8090
364
+ # wss://api.loreguard.com/workers → https://api.loreguard.com
365
+ backend_ws = server.tunnel.backend_url
366
+ if backend_ws.startswith("wss://"):
367
+ base_url = "https://" + backend_ws[6:].split("/")[0]
368
+ elif backend_ws.startswith("ws://"):
369
+ base_url = "http://" + backend_ws[5:].split("/")[0]
370
+ else:
371
+ return JSONResponse(
372
+ status_code=500,
373
+ content={"error": f"Cannot derive HTTP URL from backend: {backend_ws}"},
374
+ )
375
+
376
+ try:
377
+ import httpx
378
+ # Forward Authorization header if present
379
+ headers = {}
380
+ auth_header = request.headers.get("authorization", "")
381
+ if auth_header:
382
+ headers["Authorization"] = auth_header
383
+
384
+ async with httpx.AsyncClient(timeout=10.0) as client:
385
+ resp = await client.get(f"{base_url}/api/characters", headers=headers)
386
+ return JSONResponse(
387
+ status_code=resp.status_code,
388
+ content=resp.json(),
389
+ )
390
+ except Exception as e:
391
+ return JSONResponse(
392
+ status_code=502,
393
+ content={"error": f"Failed to reach engine: {e}"},
394
+ )
395
+
326
396
  @app.post("/api/chat")
327
397
  async def chat(request: Request):
328
398
  if not server.tunnel or not server.tunnel.connected:
@@ -340,6 +410,8 @@ class EmbeddedHTTPServer:
340
410
  scenario_id = body.get("scenario_id", body.get("scenarioId", ""))
341
411
  enable_thinking = body.get("enable_thinking", body.get("enableThinking", False))
342
412
  max_speech_tokens = body.get("max_speech_tokens", body.get("maxSpeechTokens", 0))
413
+ chunk_mode = body.get("chunk_mode", body.get("chunkMode", ""))
414
+ manage_history = body.get("manage_history", body.get("manageHistory", False))
343
415
  accept = request.headers.get("accept", "")
344
416
  streaming = "text/event-stream" in accept
345
417
 
@@ -366,6 +438,8 @@ class EmbeddedHTTPServer:
366
438
  verbose=body.get("verbose", False),
367
439
  api_token=api_token,
368
440
  max_speech_tokens=max_speech_tokens,
441
+ chunk_mode=chunk_mode,
442
+ manage_history=manage_history,
369
443
  )
370
444
  )
371
445
  # Wait for the result
@@ -389,6 +463,8 @@ class EmbeddedHTTPServer:
389
463
  verbose=body.get("verbose", False),
390
464
  api_token=api_token,
391
465
  max_speech_tokens=max_speech_tokens,
466
+ chunk_mode=chunk_mode,
467
+ manage_history=manage_history,
392
468
  )
393
469
 
394
470
  if streaming:
@@ -106,7 +106,18 @@ def get_slot_cache_dir() -> Path:
106
106
 
107
107
 
108
108
  def get_llama_server_path() -> Path:
109
- """Get the path to llama-server binary."""
109
+ """Get the path to llama-server binary.
110
+
111
+ ADR-0027: Checks LOREGUARD_LLAMA_SERVER_PATH first for pre-shipped binaries
112
+ (enterprise bundles), then falls back to the default bin directory.
113
+ """
114
+ # Check for pre-shipped binary (enterprise bundle)
115
+ override = os.environ.get("LOREGUARD_LLAMA_SERVER_PATH", "")
116
+ if override:
117
+ p = Path(override)
118
+ if p.exists() and p.is_file():
119
+ return p
120
+
110
121
  plat = get_platform()
111
122
  binary_name = BINARIES[plat]["binary_name"]
112
123
  return get_bin_dir() / binary_name
@@ -129,7 +140,16 @@ def get_installed_version() -> Optional[str]:
129
140
 
130
141
 
131
142
  def is_llama_server_installed() -> bool:
132
- """Check if llama-server is installed with the correct version."""
143
+ """Check if llama-server is installed with the correct version.
144
+
145
+ ADR-0027: If LOREGUARD_LLAMA_SERVER_PATH is set and the binary exists,
146
+ always returns True (pre-shipped binary, skip version check).
147
+ """
148
+ # Pre-shipped binary always counts as installed
149
+ override = os.environ.get("LOREGUARD_LLAMA_SERVER_PATH", "")
150
+ if override and Path(override).exists():
151
+ return True
152
+
133
153
  server_path = get_llama_server_path()
134
154
  if not (server_path.exists() and server_path.is_file()):
135
155
  return False
@@ -572,25 +572,26 @@ class LLMProxy:
572
572
  payload["enable_thinking"] = False
573
573
 
574
574
  # Force JSON output if requested
575
- # Use json_object type with schema field - this is supported in llama.cpp server
576
- # (server-common.cpp extracts schema from response_format.schema for json_object type)
577
- # Note: json_schema type has a bug (issue #10732, PR #18963 pending)
578
575
  if req.force_json:
579
- # Merge system prompt into user message for "Content-only" template
580
- if len(req.messages) >= 2 and req.messages[0].get("role") == "system":
581
- system_content = req.messages[0]["content"]
582
- user_content = req.messages[-1]["content"]
583
- merged = f"INSTRUCTIONS:\n{system_content}\n\nREQUEST:\n{user_content}"
584
- payload["messages"] = [{"role": "user", "content": merged}]
585
- logger.debug("JSON MODE: Merged system into user message")
586
-
587
- # Use json_object with schema for proper constraint enforcement
588
576
  if req.json_schema:
589
- payload["response_format"] = {"type": "json_object", "schema": req.json_schema}
590
- logger.info(f"JSON MODE: response_format=json_object with schema")
577
+ # Use top-level json_schema field for grammar-constrained JSON.
578
+ # This bypasses the chat template (which corrupts schemas on Llama 3.1
579
+ # by routing them through function-calling) and goes directly to GBNF
580
+ # grammar conversion.
581
+ payload["json_schema"] = req.json_schema
582
+ logger.info("JSON MODE: top-level json_schema for grammar constraint")
591
583
  else:
584
+ # Generic JSON object mode (no schema constraint).
585
+ # response_format changes chat template to "Content-only" which loses
586
+ # system prompt context, so merge system into user message.
587
+ if len(req.messages) >= 2 and req.messages[0].get("role") == "system":
588
+ system_content = req.messages[0]["content"]
589
+ user_content = req.messages[-1]["content"]
590
+ merged = f"INSTRUCTIONS:\n{system_content}\n\nREQUEST:\n{user_content}"
591
+ payload["messages"] = [{"role": "user", "content": merged}]
592
+ logger.debug("JSON MODE: Merged system into user message")
592
593
  payload["response_format"] = {"type": "json_object"}
593
- logger.info(f"JSON MODE: response_format=json_object (no schema)")
594
+ logger.info("JSON MODE: response_format=json_object (no schema)")
594
595
 
595
596
  # Use per-request timeout if specified
596
597
  timeout = req.timeout or self.default_timeout
@@ -31,7 +31,7 @@ from rich.console import Console
31
31
 
32
32
  from .tunnel import BackendTunnel
33
33
  from .llm import LLMProxy
34
- from .config import get_config_value
34
+ from .config import get_config_value, resolve_model_path
35
35
  from .nli import NLIService, is_nli_model_available
36
36
  from .intent_classifier import IntentClassifier, is_intent_model_available
37
37
  from .dialogue_act_classifier import (
@@ -91,7 +91,8 @@ async def startup():
91
91
  enable_nli = os.getenv("LOREGUARD_NLI_ENABLED", "true").lower() == "true"
92
92
  if enable_nli:
93
93
  console.print("[cyan]Initializing NLI service...[/cyan]")
94
- nli_service = NLIService()
94
+ nli_model = resolve_model_path("vectara/hallucination_evaluation_model", "hhem")
95
+ nli_service = NLIService(model_path=nli_model)
95
96
  if nli_service.load_model():
96
97
  console.print(f"[green]NLI service ready (device: {nli_service.device})[/green]")
97
98
  else:
@@ -105,7 +106,8 @@ async def startup():
105
106
  enable_intent = os.getenv("LOREGUARD_INTENT_ENABLED", "true").lower() == "true"
106
107
  if enable_intent:
107
108
  console.print("[cyan]Initializing intent classifier...[/cyan]")
108
- intent_classifier = IntentClassifier()
109
+ intent_model = resolve_model_path("MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0", "deberta")
110
+ intent_classifier = IntentClassifier(model_path=intent_model)
109
111
  if intent_classifier.load_model():
110
112
  console.print(f"[green]Intent classifier ready (device: {intent_classifier.device})[/green]")
111
113
  else:
@@ -5,6 +5,7 @@ from rich.text import Text
5
5
  from rich.style import Style
6
6
 
7
7
  from ..styles import FG_DIM, PINK
8
+ from ...runtime import get_version
8
9
 
9
10
  # Simple stylized logo
10
11
  LOGO = r"""
@@ -43,9 +44,9 @@ class LoreguardBanner(Static):
43
44
  }
44
45
  """
45
46
 
46
- def __init__(self, version: str = "0.11.0") -> None:
47
+ def __init__(self, version: str = None) -> None:
47
48
  super().__init__()
48
- self._version = version
49
+ self._version = version or get_version()
49
50
 
50
51
  def render(self) -> Text:
51
52
  """Render minimal banner."""
@@ -468,6 +468,7 @@ class BackendTunnel:
468
468
  elif msg_type == "pass_update":
469
469
  # Pipeline pass update (verbose mode)
470
470
  payload = data.get("payload", {})
471
+ self._log(f"[pass_update] received pass={payload.get('pass','?')} name={payload.get('name','?')}", "info")
471
472
  if self.on_pass_update:
472
473
  self.on_pass_update(payload)
473
474
  # Also route to per-request queue for HTTP/SSE clients
@@ -1388,6 +1389,8 @@ class BackendTunnel:
1388
1389
  verbose: bool = False,
1389
1390
  api_token: str = "",
1390
1391
  max_speech_tokens: int = 0,
1392
+ chunk_mode: str = "",
1393
+ manage_history: bool = False,
1391
1394
  ) -> asyncio.Queue[dict[str, Any]]:
1392
1395
  """Send a chat request to the backend and return a queue for responses.
1393
1396
 
@@ -1434,6 +1437,10 @@ class BackendTunnel:
1434
1437
  # Only include maxSpeechTokens if explicitly set (non-zero)
1435
1438
  if max_speech_tokens > 0:
1436
1439
  payload["maxSpeechTokens"] = max_speech_tokens
1440
+ if chunk_mode:
1441
+ payload["chunkMode"] = chunk_mode
1442
+ if manage_history:
1443
+ payload["manageHistory"] = True
1437
1444
 
1438
1445
  await self._send({
1439
1446
  "id": self._generate_message_id(),
@@ -1498,6 +1505,7 @@ class BackendTunnel:
1498
1505
  "type": "done",
1499
1506
  "data": {
1500
1507
  "speech": payload.get("speech", ""),
1508
+ "chunks": payload.get("chunks"),
1501
1509
  "thoughts": payload.get("thoughts", ""),
1502
1510
  "citations": payload.get("citations", []),
1503
1511
  "verified": payload.get("verified", False),