lemonade-sdk 7.0.0-py3-none-any.whl → 7.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk has been flagged as potentially problematic by the registry.
- lemonade/tools/server/instructions.py +294 -0
- lemonade/tools/server/llamacpp.py +289 -0
- lemonade/tools/server/pydantic_models.py +83 -0
- lemonade/tools/server/serve.py +152 -146
- lemonade/tools/server/static/styles.css +313 -0
- lemonade/tools/server/tool_calls.py +50 -43
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/METADATA +4 -7
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/RECORD +17 -13
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/WHEEL +1 -1
- lemonade_server/cli.py +4 -2
- lemonade_server/model_manager.py +34 -17
- lemonade_server/server_models.json +42 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/top_level.txt +0 -0
lemonade/tools/server/instructions.py (new file)
@@ -0,0 +1,294 @@
+from pathlib import Path
+import json
+from fastapi.responses import HTMLResponse
+from lemonade_server.model_manager import ModelManager
+
+
+def get_instructions_html(port=8000):
+    """
+    Show instructions on how to use the server.
+    """
+    # Load server models from JSON
+    server_models_path = (
+        Path(__file__).parent.parent.parent.parent
+        / "lemonade_server"
+        / "server_models.json"
+    )
+    with open(server_models_path, "r", encoding="utf-8") as f:
+        server_models = json.load(f)
+
+    # Use shared filter function from model_manager.py
+    filtered_models = ModelManager().filter_models_by_backend(server_models)
+
+    # Pass filtered server_models to JS
+    server_models_js = (
+        f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
+    )
+
+    # New lemon-themed HTML structure
+    # pylint: disable=W1401
+    styled_html = f"""
+    <!DOCTYPE html>
+    <html lang=\"en\">
+    <head>
+        <meta charset=\"UTF-8\">
+        <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">
+        <title>Lemonade Server</title>
+        <link rel="icon" href="data:,">
+        <link rel=\"stylesheet\" href=\"/static/styles.css\">
+        <script>
+            window.SERVER_PORT = {port};
+        </script>
+        {server_models_js}
+    </head>
+    <body>
+        <nav class=\"navbar\">
+            <a href=\"https://github.com/lemonade-sdk/lemonade\">GitHub</a>
+            <a href=\"https://lemonade-server.ai/docs/\">Docs</a>
+            <a href=\"https://lemonade-server.ai/docs/server/server_models/\">Models</a>
+            <a href=\"https://lemonade-server.ai/docs/server/apps/\">Featured Apps</a>
+        </nav>
+        <main class=\"main\">
+            <div class=\"title\">🍋 Lemonade Server</div>
+            <div class=\"tab-container\">
+                <div class=\"tabs\">
+                    <button class=\"tab active\" id=\"tab-chat\" onclick=\"showTab('chat')\">LLM Chat</button>
+                    <button class=\"tab\" id=\"tab-models\" onclick=\"showTab('models')\">Model Management</button>
+                </div>
+                <div class=\"tab-content active\" id=\"content-chat\">
+                    <div class=\"chat-container\">
+                        <div class=\"chat-history\" id=\"chat-history\"></div>
+                        <div class=\"chat-input-row\">
+                            <select id=\"model-select\"></select>
+                            <input type=\"text\" id=\"chat-input\" placeholder=\"Type your message...\" />
+                            <button id=\"send-btn\">Send</button>
+                        </div>
+                    </div>
+                </div>
+                <div class=\"tab-content\" id=\"content-models\">
+                    <div class=\"model-mgmt-container\">
+                        <div class=\"model-mgmt-pane\">
+                            <h3>Installed Models</h3>
+                            <table class=\"model-table\" id=\"installed-models-table\">
+                                <colgroup><col style=\"width:100%\"></colgroup>
+                                <tbody id=\"installed-models-tbody\"></tbody>
+                            </table>
+                        </div>
+                        <div class=\"model-mgmt-pane\">
+                            <h3>Suggested Models</h3>
+                            <table class=\"model-table\" id=\"suggested-models-table\">
+                                <tbody id=\"suggested-models-tbody\"></tbody>
+                            </table>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </main>
+        <footer class=\"site-footer\">
+            <div class=\"dad-joke\">When life gives you LLMs, make an LLM aide.</div>
+            <div class=\"copyright\">Copyright 2025 AMD</div>
+        </footer>
+        <script src=\"https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js\"></script>
+        <script>
+            // Tab switching logic
+            function showTab(tab) {{
+                document.getElementById('tab-chat').classList.remove('active');
+                document.getElementById('tab-models').classList.remove('active');
+                document.getElementById('content-chat').classList.remove('active');
+                document.getElementById('content-models').classList.remove('active');
+                if (tab === 'chat') {{
+                    document.getElementById('tab-chat').classList.add('active');
+                    document.getElementById('content-chat').classList.add('active');
+                }} else {{
+                    document.getElementById('tab-models').classList.add('active');
+                    document.getElementById('content-models').classList.add('active');
+                }}
+            }}
+
+            // Helper to get server base URL
+            function getServerBaseUrl() {{
+                const port = window.SERVER_PORT || 8000;
+                return `http://localhost:{port}`;
+            }}
+
+            // Populate model dropdown from /api/v1/models endpoint
+            async function loadModels() {{
+                try {{
+                    const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+                    const data = await resp.json();
+                    const select = document.getElementById('model-select');
+                    select.innerHTML = '';
+                    if (!data.data || !Array.isArray(data.data)) {{
+                        select.innerHTML = '<option>No models found (malformed response)</option>';
+                        return;
+                    }}
+                    if (data.data.length === 0) {{
+                        select.innerHTML = '<option>No models available</option>';
+                        return;
+                    }}
+                    let defaultIndex = 0;
+                    data.data.forEach(function(model, index) {{
+                        const modelId = model.id || model.name || model;
+                        const opt = document.createElement('option');
+                        opt.value = modelId;
+                        opt.textContent = modelId;
+                        if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {{
+                            defaultIndex = index;
+                        }}
+                        select.appendChild(opt);
+                    }});
+                    select.selectedIndex = defaultIndex;
+                }} catch (e) {{
+                    const select = document.getElementById('model-select');
+                    select.innerHTML = `<option>Error loading models: ${{e.message}}</option>`;
+                    console.error('Error loading models:', e);
+                }}
+            }}
+            loadModels();
+
+            // Model Management Tab Logic
+            async function refreshModelMgmtUI() {{
+                // Get installed models from /api/v1/models
+                let installed = [];
+                try {{
+                    const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+                    const data = await resp.json();
+                    if (data.data && Array.isArray(data.data)) {{
+                        installed = data.data.map(m => m.id || m.name || m);
+                    }}
+                }} catch (e) {{}}
+                // All models from server_models.json (window.SERVER_MODELS)
+                const allModels = window.SERVER_MODELS || {{}};
+                // Filter suggested models not installed
+                const suggested = Object.keys(allModels).filter(
+                    k => allModels[k].suggested && !installed.includes(k)
+                );
+                // Render installed models as a table (two columns, second is invisible)
+                const installedTbody = document.getElementById('installed-models-tbody');
+                installedTbody.innerHTML = '';
+                installed.forEach(function(mid) {{
+                    var tr = document.createElement('tr');
+                    var tdName = document.createElement('td');
+                    tdName.textContent = mid;
+                    var tdEmpty = document.createElement('td');
+                    tdEmpty.style.width = '0';
+                    tdEmpty.style.padding = '0';
+                    tdEmpty.style.border = 'none';
+                    tr.appendChild(tdName);
+                    tr.appendChild(tdEmpty);
+                    installedTbody.appendChild(tr);
+                }});
+                // Render suggested models as a table
+                const suggestedTbody = document.getElementById('suggested-models-tbody');
+                suggestedTbody.innerHTML = '';
+                suggested.forEach(mid => {{
+                    const tr = document.createElement('tr');
+                    const tdName = document.createElement('td');
+                    tdName.textContent = mid;
+                    tdName.style.paddingRight = '1em';
+                    tdName.style.verticalAlign = 'middle';
+                    const tdBtn = document.createElement('td');
+                    tdBtn.style.width = '1%';
+                    tdBtn.style.verticalAlign = 'middle';
+                    const btn = document.createElement('button');
+                    btn.textContent = '+';
+                    btn.title = 'Install model';
+                    btn.onclick = async function() {{
+                        btn.disabled = true;
+                        btn.textContent = 'Installing...';
+                        btn.classList.add('installing-btn');
+                        try {{
+                            await fetch(getServerBaseUrl() + '/api/v1/pull', {{
+                                method: 'POST',
+                                headers: {{ 'Content-Type': 'application/json' }},
+                                body: JSON.stringify({{ model_name: mid }})
+                            }});
+                            await refreshModelMgmtUI();
+                            await loadModels(); // update chat dropdown too
+                        }} catch (e) {{
+                            btn.textContent = 'Error';
+                        }}
+                    }};
+                    tdBtn.appendChild(btn);
+                    tr.appendChild(tdName);
+                    tr.appendChild(tdBtn);
+                    suggestedTbody.appendChild(tr);
+                }});
+            }}
+            // Initial load
+            refreshModelMgmtUI();
+            // Optionally, refresh when switching to the tab
+            document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
+
+            // Chat logic (streaming with OpenAI JS client placeholder)
+            const chatHistory = document.getElementById('chat-history');
+            const chatInput = document.getElementById('chat-input');
+            const sendBtn = document.getElementById('send-btn');
+            const modelSelect = document.getElementById('model-select');
+            let messages = [];
+
+            function appendMessage(role, text) {{
+                const div = document.createElement('div');
+                div.className = 'chat-message ' + role;
+                // Add a bubble for iMessage style
+                const bubble = document.createElement('div');
+                bubble.className = 'chat-bubble ' + role;
+                bubble.innerHTML = text;
+                div.appendChild(bubble);
+                chatHistory.appendChild(div);
+                chatHistory.scrollTop = chatHistory.scrollHeight;
+            }}
+
+            async function sendMessage() {{
+                const text = chatInput.value.trim();
+                if (!text) return;
+                appendMessage('user', text);
+                messages.push({{ role: 'user', content: text }});
+                chatInput.value = '';
+                sendBtn.disabled = true;
+                // Streaming OpenAI completions (placeholder, adapt as needed)
+                let llmText = '';
+                appendMessage('llm', '...');
+                const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
+                try {{
+                    // Use the correct endpoint for chat completions
+                    const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {{
+                        method: 'POST',
+                        headers: {{ 'Content-Type': 'application/json' }},
+                        body: JSON.stringify({{
+                            model: modelSelect.value,
+                            messages: messages,
+                            stream: true
+                        }})
+                    }});
+                    if (!resp.body) throw new Error('No stream');
+                    const reader = resp.body.getReader();
+                    let decoder = new TextDecoder();
+                    llmDiv.textContent = '';
+                    while (true) {{
+                        const {{ done, value }} = await reader.read();
+                        if (done) break;
+                        const chunk = decoder.decode(value);
+                        if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
+                        // Try to extract the content from the OpenAI chunk
+                        const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
+                        if (match && match[1]) {{
+                            llmText += match[1];
+                            llmDiv.textContent = llmText;
+                        }}
+                    }}
+                    messages.push({{ role: 'assistant', content: llmText }});
+                }} catch (e) {{
+                    llmDiv.textContent = '[Error: ' + e.message + ']';
+                }}
+                sendBtn.disabled = false;
+            }}
+            sendBtn.onclick = sendMessage;
+            chatInput.addEventListener('keydown', function(e) {{
+                if (e.key === 'Enter') sendMessage();
+            }});
+        </script>
+    </body>
+    </html>
+    """
+    return HTMLResponse(content=styled_html)
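For context, here is a minimal sketch, not part of the diff, of how get_instructions_html could be wired into a FastAPI app so that the page and the styles.css it references are served together. The route path, mount point, and static directory are assumptions rather than code from this release.

# Hypothetical wiring sketch (assumed names/paths, not taken from this diff)
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles

from lemonade.tools.server.instructions import get_instructions_html

app = FastAPI()
# Assumed static directory; the real serve.py may mount a different path.
app.mount(
    "/static",
    StaticFiles(directory="lemonade/tools/server/static"),
    name="static",
)


@app.get("/")
def home():
    # get_instructions_html already returns an HTMLResponse
    return get_instructions_html(port=8000)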
lemonade/tools/server/llamacpp.py (new file)
@@ -0,0 +1,289 @@
+import sys
+import os
+import logging
+import time
+import subprocess
+import zipfile
+import re
+import threading
+
+import requests
+from tabulate import tabulate
+from fastapi import HTTPException, status
+from fastapi.responses import StreamingResponse
+
+from openai import OpenAI
+
+from lemonade_server.model_manager import ModelManager
+from lemonade.tools.server.pydantic_models import ChatCompletionRequest
+
+LLAMA_VERSION = "b5543"
+LLAMA_SERVER_PORT = "8081"
+
+LLAMA_SERVER_EXE_DIR = os.path.join(
+    os.path.dirname(sys.executable),
+    "llama_server",
+)
+
+LLAMA_SERVER_EXE_PATH = os.path.join(
+    LLAMA_SERVER_EXE_DIR,
+    "llama-server.exe",
+)
+
+
+class LlamaTelemetry:
+    """
+    Manages telemetry data collection and display for llama server.
+    """
+
+    def __init__(self):
+        self.input_tokens = None
+        self.output_tokens = None
+        self.time_to_first_token = None
+        self.tokens_per_second = None
+        self.prompt_eval_time = None
+        self.eval_time = None
+
+    def parse_telemetry_line(self, line: str):
+        """
+        Parse telemetry data from llama server output lines.
+        """
+
+        # Parse prompt evaluation line
+        prompt_match = re.search(
+            # pylint: disable=C0301
+            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+            line,
+        )
+        if prompt_match:
+            prompt_time_ms = float(prompt_match.group(1))
+            input_tokens = int(prompt_match.group(2))
+
+            self.prompt_eval_time = prompt_time_ms / 1000.0
+            self.input_tokens = input_tokens
+            self.time_to_first_token = prompt_time_ms / 1000.0
+            return
+
+        # Parse generation evaluation line
+        eval_match = re.search(
+            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+            line,
+        )
+        if eval_match:
+            eval_time_ms = float(eval_match.group(1))
+            output_tokens = int(eval_match.group(2))
+            tokens_per_second = float(eval_match.group(3))
+
+            self.eval_time = eval_time_ms / 1000.0
+            self.output_tokens = output_tokens
+            self.tokens_per_second = tokens_per_second
+            return
+
+    def get_telemetry_data(self):
+        return {
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "time_to_first_token": self.time_to_first_token,
+            "tokens_per_second": self.tokens_per_second,
+            "decode_token_times": None,
+        }
+
+    def show_telemetry(self):
+        # Check if debug logging is enabled
+        if not logging.getLogger().isEnabledFor(logging.DEBUG):
+            return
+
+        # Prepare telemetry data (transposed format)
+        telemetry = [
+            ["Input tokens", self.input_tokens],
+            ["Output tokens", self.output_tokens],
+            ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
+            ["TPS", f"{self.tokens_per_second:.2f}"],
+        ]
+
+        table = tabulate(
+            telemetry, headers=["Metric", "Value"], tablefmt="fancy_grid"
+        ).split("\n")
+
+        # Show telemetry in debug while complying with uvicorn's log indentation
+        logging.debug("\n ".join(table))
+
+
+def _log_subprocess_output(
+    process: subprocess.Popen, prefix: str, telemetry: LlamaTelemetry
+):
+    """
+    Read subprocess output line by line, log to debug, and parse telemetry
+    """
+
+    if process.stdout:
+        for line in iter(process.stdout.readline, ""):
+            if line:
+                line_stripped = line.strip()
+                logging.debug("%s: %s", prefix, line_stripped)
+
+                telemetry.parse_telemetry_line(line_stripped)
+
+            if process.poll() is not None:
+                break
+
+
+def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+    status_code = None
+    while not llama_server_process.poll() and status_code != 200:
+        health_url = f"http://localhost:{LLAMA_SERVER_PORT}/health"
+        try:
+            health_response = requests.get(health_url)
+        except requests.exceptions.ConnectionError:
+            logging.warning(fail_message)
+        else:
+            status_code = health_response.status_code
+            logging.debug(
+                "Testing llama-server readiness (will retry until ready), "
+                f"result: {health_response.json()}"
+            )
+        time.sleep(1)
+
+
+def _launch_llama_subprocess(
+    model_path: str, use_gpu: bool, telemetry: LlamaTelemetry
+) -> subprocess.Popen:
+    """
+    Launch llama server subprocess with GPU or CPU configuration
+    """
+
+    base_command = [
+        LLAMA_SERVER_EXE_PATH,
+        "-m",
+        model_path,
+        "--port",
+        LLAMA_SERVER_PORT,
+    ]
+
+    # Configure GPU layers: 99 for GPU, 0 for CPU-only
+    ngl_value = "99" if use_gpu else "0"
+    command = base_command + ["-ngl", ngl_value]
+
+    # Start subprocess with output capture
+    process = subprocess.Popen(
+        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1
+    )
+
+    # Start background thread to log subprocess output
+    device_type = "GPU" if use_gpu else "CPU"
+    threading.Thread(
+        target=_log_subprocess_output,
+        args=(process, f"LLAMA SERVER {device_type}", telemetry),
+        daemon=True,
+    ).start()
+
+    return process
+
+
+def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
+    # Download llama.cpp server if it isn't already available
+    if not os.path.exists(LLAMA_SERVER_EXE_DIR):
+        # Download llama.cpp server zip
+        # pylint: disable=C0301
+        llama_zip_url = f"https://github.com/ggml-org/llama.cpp/releases/download/{LLAMA_VERSION}/llama-{LLAMA_VERSION}-bin-win-vulkan-x64.zip"
+        llama_zip_path = os.path.join(
+            os.path.dirname(sys.executable), "llama-server.zip"
+        )
+        logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
+
+        with requests.get(llama_zip_url, stream=True) as r:
+            r.raise_for_status()
+            with open(llama_zip_path, "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+        # Extract zip
+        logging.info(f"Extracting {llama_zip_path} to {LLAMA_SERVER_EXE_DIR}")
+        with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
+            zip_ref.extractall(LLAMA_SERVER_EXE_DIR)
+
+        # Save version.txt
+        version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
+        with open(version_txt_path, "w", encoding="utf-8") as vf:
+            vf.write(LLAMA_VERSION)
+
+        # Delete zip file
+        os.remove(llama_zip_path)
+        logging.info("Cleaned up zip file")
+
+    # Download the gguf to the hugging face cache
+    snapshot_path = ModelManager().download_gguf(checkpoint)
+    model_path = os.path.join(snapshot_path, os.listdir(snapshot_path)[0])
+    logging.debug(f"GGUF file path: {model_path}")
+
+    # Start the llama-serve.exe process
+    logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")
+
+    # Attempt loading on GPU first
+    llama_server_process = _launch_llama_subprocess(
+        model_path, use_gpu=True, telemetry=telemetry
+    )
+
+    # Check the /health endpoint until GPU server is ready
+    _wait_for_load(
+        llama_server_process,
+        f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
+    )
+
+    # If loading on GPU failed, try loading on CPU
+    if llama_server_process.poll():
+        llama_server_process = _launch_llama_subprocess(
+            model_path, use_gpu=False, telemetry=telemetry
+        )
+
+        # Check the /health endpoint until CPU server is ready
+        _wait_for_load(
+            llama_server_process,
+            f"Loading {model_reference} on CPU didn't work",
+        )
+
+    if llama_server_process.poll():
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=f"Failed to load {model_reference} with llama.cpp",
+        )
+
+    return llama_server_process
+
+
+def chat_completion(
+    chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
+):
+    base_url = f"http://127.0.0.1:{LLAMA_SERVER_PORT}/v1"
+    client = OpenAI(
+        base_url=base_url,
+        api_key="lemonade",
+    )
+
+    # Convert Pydantic model to dict and remove unset/null values
+    request_dict = chat_completion_request.model_dump(
+        exclude_unset=True, exclude_none=True
+    )
+
+    def event_stream():
+        try:
+            # Enable streaming
+            request_dict["stream"] = True
+            for chunk in client.chat.completions.create(**request_dict):
+                yield f"data: {chunk.model_dump_json()}\n\n"
+            yield "data: [DONE]\n\n"
+
+            # Show telemetry after completion
+            telemetry.show_telemetry()
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+    return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+        },
+    )
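The module above downloads a prebuilt llama-server.exe (Windows/Vulkan build), loads a GGUF model with a GPU-first/CPU-fallback strategy, and proxies chat completions from the local llama-server back out as a server-sent-event stream. The following is a hypothetical usage sketch, not code from this release; the checkpoint string and model reference are placeholders.

# Hypothetical usage sketch (placeholder checkpoint and model names)
import logging

from lemonade.tools.server.llamacpp import LlamaTelemetry, server_load, chat_completion
from lemonade.tools.server.pydantic_models import ChatCompletionRequest

logging.basicConfig(level=logging.DEBUG)

telemetry = LlamaTelemetry()

# On first use this downloads llama-server.exe, fetches the GGUF to the
# Hugging Face cache, tries the GPU (-ngl 99) and falls back to CPU (-ngl 0)
# if the GPU process exits before /health reports ready.
process = server_load(
    checkpoint="some-org/some-model-GGUF",  # placeholder checkpoint
    model_reference="example-gguf-model",   # placeholder reference
    telemetry=telemetry,
)

# Forwards the request to the local llama-server and re-emits the chunks
# as an SSE StreamingResponse.
request = ChatCompletionRequest(
    model="example-gguf-model",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
response = chat_completion(request, telemetry)

process.terminate()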
lemonade/tools/server/pydantic_models.py (new file)
@@ -0,0 +1,83 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+# Set to a high number to allow for interesting experiences in real apps
+# Tests should use the max_new_tokens argument to set a lower value
+DEFAULT_MAX_NEW_TOKENS = 1500
+
+
+class LoadConfig(BaseModel):
+    """
+    Configuration for loading a language model.
+
+    Specifies the model checkpoint, generation parameters,
+    and hardware/framework configuration (recipe) for model loading.
+    """
+
+    model_name: Optional[str] = None
+    checkpoint: Optional[str] = None
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
+    recipe: Optional[str] = None
+    # Indicates the maximum prompt length allowed for that specific
+    # checkpoint + recipe combination
+    max_prompt_length: Optional[int] = None
+    # Indicates whether the model is a reasoning model, like DeepSeek
+    reasoning: Optional[bool] = False
+
+
+class CompletionRequest(BaseModel):
+    """
+    Request model for text completion API endpoint.
+
+    Contains a prompt, a model identifier, and a streaming
+    flag to control response delivery.
+    """
+
+    prompt: str
+    model: str
+    echo: bool = False
+    stream: bool = False
+    logprobs: int | None = False
+    stop: list[str] | str | None = None
+    temperature: float | None = None
+    max_tokens: int | None = None
+
+
+class ChatCompletionRequest(BaseModel):
+    """
+    Request model for chat completion API endpoint.
+
+    Contains a list of chat messages, a model identifier,
+    and a streaming flag to control response delivery.
+    """
+
+    messages: list[dict]
+    model: str
+    stream: bool = False
+    logprobs: int | None = False
+    stop: list[str] | str | None = None
+    temperature: float | None = None
+    tools: list[dict] | None = None
+    max_tokens: int | None = None
+    max_completion_tokens: int | None = None
+
+
+class ResponsesRequest(BaseModel):
+    """
+    Request model for responses API endpoint.
+    """
+
+    input: list[dict] | str
+    model: str
+    max_output_tokens: int | None = None
+    temperature: float | None = None
+    stream: bool = False
+
+
+class PullConfig(BaseModel):
+    """
+    Configuration for installing a supported LLM.
+    """
+
+    model_name: str
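These pydantic models mirror OpenAI-style request bodies. Below is an illustrative sketch, not part of the diff, showing how ChatCompletionRequest validates a payload and is dumped back to a dict the way chat_completion() does; the model name reuses the default that the web UI above selects, and the PullConfig usage is an inference from the /api/v1/pull call in that UI.

# Illustrative only: validate an OpenAI-style request body and convert it back
# to a dict, dropping unset/None fields as chat_completion() does.
from lemonade.tools.server.pydantic_models import ChatCompletionRequest, PullConfig

payload = {
    "model": "Llama-3.2-1B-Instruct-Hybrid",
    "messages": [{"role": "user", "content": "Hi"}],
    "stream": True,
}
req = ChatCompletionRequest(**payload)
print(req.model_dump(exclude_unset=True, exclude_none=True))
# {'model': 'Llama-3.2-1B-Instruct-Hybrid', 'messages': [{'role': 'user', 'content': 'Hi'}], 'stream': True}

# The /api/v1/pull endpoint used by the web UI appears to take a PullConfig-shaped body:
pull = PullConfig(model_name="Llama-3.2-1B-Instruct-Hybrid")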