lemonade-sdk 7.0.1__py3-none-any.whl → 7.0.3__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- lemonade/cli.py +2 -0
- lemonade/tools/accuracy.py +335 -0
- lemonade/tools/huggingface_load.py +6 -0
- lemonade/tools/ort_genai/oga.py +6 -4
- lemonade/tools/prompt.py +28 -1
- lemonade/tools/server/instructions.py +8 -265
- lemonade/tools/server/llamacpp.py +45 -19
- lemonade/tools/server/port_utils.py +57 -0
- lemonade/tools/server/serve.py +96 -44
- lemonade/tools/server/static/instructions.html +262 -0
- lemonade/tools/server/thread_utils.py +87 -0
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/METADATA +1 -1
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/RECORD +22 -18
- lemonade_server/model_manager.py +45 -12
- {lemonade/tools/server → lemonade_server}/pydantic_models.py +2 -0
- lemonade_server/server_models.json +25 -4
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/top_level.txt +0 -0
lemonade/tools/server/instructions.py

@@ -25,270 +25,13 @@ def get_instructions_html(port=8000):
         f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
     )

-    #
-
-
-
-    <html lang=\"en\">
-    <head>
-    <meta charset=\"UTF-8\">
-    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">
-    <title>Lemonade Server</title>
-    <link rel="icon" href="data:,">
-    <link rel=\"stylesheet\" href=\"/static/styles.css\">
-    <script>
-    window.SERVER_PORT = {port};
-    </script>
-    {server_models_js}
-    </head>
-    <body>
-    <nav class=\"navbar\">
-    <a href=\"https://github.com/lemonade-sdk/lemonade\">GitHub</a>
-    <a href=\"https://lemonade-server.ai/docs/\">Docs</a>
-    <a href=\"https://lemonade-server.ai/docs/server/server_models/\">Models</a>
-    <a href=\"https://lemonade-server.ai/docs/server/apps/\">Featured Apps</a>
-    </nav>
-    <main class=\"main\">
-    <div class=\"title\">🍋 Lemonade Server</div>
-    <div class=\"tab-container\">
-    <div class=\"tabs\">
-    <button class=\"tab active\" id=\"tab-chat\" onclick=\"showTab('chat')\">LLM Chat</button>
-    <button class=\"tab\" id=\"tab-models\" onclick=\"showTab('models')\">Model Management</button>
-    </div>
-    <div class=\"tab-content active\" id=\"content-chat\">
-    <div class=\"chat-container\">
-    <div class=\"chat-history\" id=\"chat-history\"></div>
-    <div class=\"chat-input-row\">
-    <select id=\"model-select\"></select>
-    <input type=\"text\" id=\"chat-input\" placeholder=\"Type your message...\" />
-    <button id=\"send-btn\">Send</button>
-    </div>
-    </div>
-    </div>
-    <div class=\"tab-content\" id=\"content-models\">
-    <div class=\"model-mgmt-container\">
-    <div class=\"model-mgmt-pane\">
-    <h3>Installed Models</h3>
-    <table class=\"model-table\" id=\"installed-models-table\">
-    <colgroup><col style=\"width:100%\"></colgroup>
-    <tbody id=\"installed-models-tbody\"></tbody>
-    </table>
-    </div>
-    <div class=\"model-mgmt-pane\">
-    <h3>Suggested Models</h3>
-    <table class=\"model-table\" id=\"suggested-models-table\">
-    <tbody id=\"suggested-models-tbody\"></tbody>
-    </table>
-    </div>
-    </div>
-    </div>
-    </div>
-    </main>
-    <footer class=\"site-footer\">
-    <div class=\"dad-joke\">When life gives you LLMs, make an LLM aide.</div>
-    <div class=\"copyright\">Copyright 2025 AMD</div>
-    </footer>
-    <script src=\"https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js\"></script>
-    <script>
-    // Tab switching logic
-    function showTab(tab) {{
-    document.getElementById('tab-chat').classList.remove('active');
-    document.getElementById('tab-models').classList.remove('active');
-    document.getElementById('content-chat').classList.remove('active');
-    document.getElementById('content-models').classList.remove('active');
-    if (tab === 'chat') {{
-    document.getElementById('tab-chat').classList.add('active');
-    document.getElementById('content-chat').classList.add('active');
-    }} else {{
-    document.getElementById('tab-models').classList.add('active');
-    document.getElementById('content-models').classList.add('active');
-    }}
-    }}
+    # Load HTML template
+    template_path = Path(__file__).parent / "static" / "instructions.html"
+    with open(template_path, "r", encoding="utf-8") as f:
+        html_template = f.read()

-
-
-
-    return `http://localhost:{port}`;
-    }}
+    # Replace template variables
+    html_content = html_template.replace("{{SERVER_PORT}}", str(port))
+    html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)

-
-    async function loadModels() {{
-    try {{
-    const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
-    const data = await resp.json();
-    const select = document.getElementById('model-select');
-    select.innerHTML = '';
-    if (!data.data || !Array.isArray(data.data)) {{
-    select.innerHTML = '<option>No models found (malformed response)</option>';
-    return;
-    }}
-    if (data.data.length === 0) {{
-    select.innerHTML = '<option>No models available</option>';
-    return;
-    }}
-    let defaultIndex = 0;
-    data.data.forEach(function(model, index) {{
-    const modelId = model.id || model.name || model;
-    const opt = document.createElement('option');
-    opt.value = modelId;
-    opt.textContent = modelId;
-    if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {{
-    defaultIndex = index;
-    }}
-    select.appendChild(opt);
-    }});
-    select.selectedIndex = defaultIndex;
-    }} catch (e) {{
-    const select = document.getElementById('model-select');
-    select.innerHTML = `<option>Error loading models: ${{e.message}}</option>`;
-    console.error('Error loading models:', e);
-    }}
-    }}
-    loadModels();
-
-    // Model Management Tab Logic
-    async function refreshModelMgmtUI() {{
-    // Get installed models from /api/v1/models
-    let installed = [];
-    try {{
-    const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
-    const data = await resp.json();
-    if (data.data && Array.isArray(data.data)) {{
-    installed = data.data.map(m => m.id || m.name || m);
-    }}
-    }} catch (e) {{}}
-    // All models from server_models.json (window.SERVER_MODELS)
-    const allModels = window.SERVER_MODELS || {{}};
-    // Filter suggested models not installed
-    const suggested = Object.keys(allModels).filter(
-    k => allModels[k].suggested && !installed.includes(k)
-    );
-    // Render installed models as a table (two columns, second is invisible)
-    const installedTbody = document.getElementById('installed-models-tbody');
-    installedTbody.innerHTML = '';
-    installed.forEach(function(mid) {{
-    var tr = document.createElement('tr');
-    var tdName = document.createElement('td');
-    tdName.textContent = mid;
-    var tdEmpty = document.createElement('td');
-    tdEmpty.style.width = '0';
-    tdEmpty.style.padding = '0';
-    tdEmpty.style.border = 'none';
-    tr.appendChild(tdName);
-    tr.appendChild(tdEmpty);
-    installedTbody.appendChild(tr);
-    }});
-    // Render suggested models as a table
-    const suggestedTbody = document.getElementById('suggested-models-tbody');
-    suggestedTbody.innerHTML = '';
-    suggested.forEach(mid => {{
-    const tr = document.createElement('tr');
-    const tdName = document.createElement('td');
-    tdName.textContent = mid;
-    tdName.style.paddingRight = '1em';
-    tdName.style.verticalAlign = 'middle';
-    const tdBtn = document.createElement('td');
-    tdBtn.style.width = '1%';
-    tdBtn.style.verticalAlign = 'middle';
-    const btn = document.createElement('button');
-    btn.textContent = '+';
-    btn.title = 'Install model';
-    btn.onclick = async function() {{
-    btn.disabled = true;
-    btn.textContent = 'Installing...';
-    btn.classList.add('installing-btn');
-    try {{
-    await fetch(getServerBaseUrl() + '/api/v1/pull', {{
-    method: 'POST',
-    headers: {{ 'Content-Type': 'application/json' }},
-    body: JSON.stringify({{ model_name: mid }})
-    }});
-    await refreshModelMgmtUI();
-    await loadModels(); // update chat dropdown too
-    }} catch (e) {{
-    btn.textContent = 'Error';
-    }}
-    }};
-    tdBtn.appendChild(btn);
-    tr.appendChild(tdName);
-    tr.appendChild(tdBtn);
-    suggestedTbody.appendChild(tr);
-    }});
-    }}
-    // Initial load
-    refreshModelMgmtUI();
-    // Optionally, refresh when switching to the tab
-    document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
-
-    // Chat logic (streaming with OpenAI JS client placeholder)
-    const chatHistory = document.getElementById('chat-history');
-    const chatInput = document.getElementById('chat-input');
-    const sendBtn = document.getElementById('send-btn');
-    const modelSelect = document.getElementById('model-select');
-    let messages = [];
-
-    function appendMessage(role, text) {{
-    const div = document.createElement('div');
-    div.className = 'chat-message ' + role;
-    // Add a bubble for iMessage style
-    const bubble = document.createElement('div');
-    bubble.className = 'chat-bubble ' + role;
-    bubble.innerHTML = text;
-    div.appendChild(bubble);
-    chatHistory.appendChild(div);
-    chatHistory.scrollTop = chatHistory.scrollHeight;
-    }}
-
-    async function sendMessage() {{
-    const text = chatInput.value.trim();
-    if (!text) return;
-    appendMessage('user', text);
-    messages.push({{ role: 'user', content: text }});
-    chatInput.value = '';
-    sendBtn.disabled = true;
-    // Streaming OpenAI completions (placeholder, adapt as needed)
-    let llmText = '';
-    appendMessage('llm', '...');
-    const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
-    try {{
-    // Use the correct endpoint for chat completions
-    const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {{
-    method: 'POST',
-    headers: {{ 'Content-Type': 'application/json' }},
-    body: JSON.stringify({{
-    model: modelSelect.value,
-    messages: messages,
-    stream: true
-    }})
-    }});
-    if (!resp.body) throw new Error('No stream');
-    const reader = resp.body.getReader();
-    let decoder = new TextDecoder();
-    llmDiv.textContent = '';
-    while (true) {{
-    const {{ done, value }} = await reader.read();
-    if (done) break;
-    const chunk = decoder.decode(value);
-    if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
-    // Try to extract the content from the OpenAI chunk
-    const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
-    if (match && match[1]) {{
-    llmText += match[1];
-    llmDiv.textContent = llmText;
-    }}
-    }}
-    messages.push({{ role: 'assistant', content: llmText }});
-    }} catch (e) {{
-    llmDiv.textContent = '[Error: ' + e.message + ']';
-    }}
-    sendBtn.disabled = false;
-    }}
-    sendBtn.onclick = sendMessage;
-    chatInput.addEventListener('keydown', function(e) {{
-    if (e.key === 'Enter') sendMessage();
-    }});
-    </script>
-    </body>
-    </html>
-    """
-    return HTMLResponse(content=styled_html)
+    return HTMLResponse(content=html_content)
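The inline page above now ships as lemonade/tools/server/static/instructions.html, and get_instructions_html() only loads it and fills in two placeholders. A minimal sketch of that pattern, assuming nothing beyond the added lines (the function name render_instructions is illustrative; the placeholder names come from the diff):

    from pathlib import Path

    def render_instructions(port: int, server_models_js: str) -> str:
        # Read the static template that now lives next to the server code
        template_path = Path(__file__).parent / "static" / "instructions.html"
        html_template = template_path.read_text(encoding="utf-8")

        # Substitute the two placeholders the template exposes
        html_content = html_template.replace("{{SERVER_PORT}}", str(port))
        return html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)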
lemonade/tools/server/llamacpp.py

@@ -14,11 +14,11 @@ from fastapi.responses import StreamingResponse

 from openai import OpenAI

+from lemonade_server.pydantic_models import ChatCompletionRequest
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.
+from lemonade.tools.server.port_utils import find_free_port

 LLAMA_VERSION = "b5543"
-LLAMA_SERVER_PORT = "8081"

 LLAMA_SERVER_EXE_DIR = os.path.join(
     os.path.dirname(sys.executable),
@@ -43,6 +43,23 @@ class LlamaTelemetry:
         self.tokens_per_second = None
         self.prompt_eval_time = None
         self.eval_time = None
+        self.port = None
+
+    def choose_port(self):
+        """
+        Users probably don't care what port we start llama-server on, so let's
+        search for an empty port
+        """
+
+        self.port = find_free_port()
+
+        if self.port is None:
+            msg = "Failed to find an empty port to start llama-server on"
+            logging.error(msg)
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=msg,
+            )

     def parse_telemetry_line(self, line: str):
         """
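For context, a hedged sketch of how a caller might combine find_free_port() with a llama-server launch instead of the old hard-coded LLAMA_SERVER_PORT; the helper name start_on_free_port is illustrative and not part of the package:

    import logging
    import subprocess

    from lemonade.tools.server.port_utils import find_free_port

    def start_on_free_port(exe_path: str) -> subprocess.Popen:
        # Ask the OS for an unused TCP port rather than assuming 8081 is free
        port = find_free_port()
        if port is None:
            raise RuntimeError("Failed to find an empty port to start llama-server on")
        logging.info("Starting %s on port %d", exe_path, port)
        return subprocess.Popen([exe_path, "--port", str(port)])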
@@ -128,10 +145,12 @@ def _log_subprocess_output(
             break


-def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+def _wait_for_load(
+    llama_server_process: subprocess.Popen, port: int, fail_message: str
+):
     status_code = None
     while not llama_server_process.poll() and status_code != 200:
-        health_url = f"http://localhost:{
+        health_url = f"http://localhost:{port}/health"
         try:
             health_response = requests.get(health_url)
         except requests.exceptions.ConnectionError:
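The reworked _wait_for_load() polls the dynamically chosen port. A standalone sketch of the same health-polling idea, with an added timeout that the shipped function does not appear to have (the names here are illustrative):

    import subprocess
    import time

    import requests

    def wait_for_health(process: subprocess.Popen, port: int, timeout_s: float = 60.0) -> bool:
        # Poll the llama-server /health endpoint until it answers 200,
        # the process exits, or the timeout expires
        deadline = time.monotonic() + timeout_s
        url = f"http://localhost:{port}/health"
        while process.poll() is None and time.monotonic() < deadline:
            try:
                if requests.get(url, timeout=1).status_code == 200:
                    return True
            except requests.exceptions.ConnectionError:
                pass
            time.sleep(0.5)
        return False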
@@ -146,19 +165,25 @@ def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):


 def _launch_llama_subprocess(
-
+    snapshot_files: dict, use_gpu: bool, telemetry: LlamaTelemetry
 ) -> subprocess.Popen:
     """
     Launch llama server subprocess with GPU or CPU configuration
     """

-
-
-
-
-
-
-
+    # Build the base command
+    base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
+    if "mmproj" in snapshot_files:
+        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+        if not use_gpu:
+            base_command.extend(["--no-mmproj-offload"])
+
+    # Find a port, and save it in the telemetry object for future reference
+    # by other functions
+    telemetry.choose_port()
+
+    # Add port and jinja to enable tool use
+    base_command.extend(["--port", str(telemetry.port), "--jinja"])

     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
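A sketch of the command construction above as a pure function. Everything mirrors the added lines except the -ngl flag spelling, which is an assumption, since the hunk only shows ngl_value being computed:

    def build_llama_command(exe_path: str, snapshot_files: dict, use_gpu: bool, port: int) -> list:
        # Model file, optional mmproj projector, dynamic port, --jinja for tool
        # use, and GPU layer offload (99 layers on GPU, 0 for CPU-only)
        command = [exe_path, "-m", snapshot_files["variant"]]
        if "mmproj" in snapshot_files:
            command.extend(["--mmproj", snapshot_files["mmproj"]])
            if not use_gpu:
                command.append("--no-mmproj-offload")
        command.extend(["--port", str(port), "--jinja"])
        command.extend(["-ngl", "99" if use_gpu else "0"])  # flag name assumed
        return command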
@@ -180,7 +205,7 @@ def _launch_llama_subprocess(
     return process


-def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
+def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
     # Download llama.cpp server if it isn't already available
     if not os.path.exists(LLAMA_SERVER_EXE_DIR):
         # Download llama.cpp server zip
@@ -212,33 +237,34 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
         logging.info("Cleaned up zip file")

     # Download the gguf to the hugging face cache
-
-
-    logging.debug(f"GGUF file path: {model_path}")
+    snapshot_files = ModelManager().download_gguf(model_config)
+    logging.debug(f"GGUF file paths: {snapshot_files}")

     # Start the llama-serve.exe process
     logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")

     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
-
+        snapshot_files, use_gpu=True, telemetry=telemetry
     )

     # Check the /health endpoint until GPU server is ready
     _wait_for_load(
         llama_server_process,
+        telemetry.port,
         f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
     )

     # If loading on GPU failed, try loading on CPU
     if llama_server_process.poll():
         llama_server_process = _launch_llama_subprocess(
-
+            snapshot_files, use_gpu=False, telemetry=telemetry
         )

         # Check the /health endpoint until CPU server is ready
         _wait_for_load(
             llama_server_process,
+            telemetry.port,
             f"Loading {model_reference} on CPU didn't work",
         )

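The fallback above keys off subprocess.Popen.poll(), which returns None while the child is still running and its exit code once it has exited, so a truthy value means the GPU attempt died before /health ever returned 200. A tiny runnable illustration of that behavior:

    import subprocess
    import sys
    import time

    # Start a child that exits immediately with code 1, then inspect poll()
    proc = subprocess.Popen([sys.executable, "-c", "import sys; sys.exit(1)"])
    time.sleep(1)
    print(proc.poll())  # prints 1; None would mean the child is still alive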
@@ -254,7 +280,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
 def chat_completion(
     chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
 ):
-    base_url = f"http://127.0.0.1:{
+    base_url = f"http://127.0.0.1:{telemetry.port}/v1"
     client = OpenAI(
         base_url=base_url,
         api_key="lemonade",
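With the port now carried on the telemetry object, the proxying client is built against it. A minimal sketch of that wiring (make_local_client is an illustrative name; the /v1 path and placeholder API key come from the diff):

    from openai import OpenAI

    def make_local_client(port: int) -> OpenAI:
        # llama-server exposes an OpenAI-compatible API under /v1; the key is
        # unused locally but the client requires some value
        return OpenAI(base_url=f"http://127.0.0.1:{port}/v1", api_key="lemonade")

    # e.g. client = make_local_client(telemetry.port)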
lemonade/tools/server/port_utils.py

@@ -0,0 +1,57 @@
+import socketserver
+import sys
+import logging
+from contextlib import asynccontextmanager
+from fastapi import FastAPI
+
+
+def find_free_port():
+    """
+    Scans for an unoccupied TCP port
+
+    Returns the port number as an int on success
+    Returns None if no port can be found
+    """
+
+    try:
+        with socketserver.TCPServer(("localhost", 0), None) as s:
+            return s.server_address[1]
+    # pylint: disable=broad-exception-caught
+    except Exception:
+        return None
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Code here will run when the application starts up
+    # Check if console can handle Unicode by testing emoji encoding
+
+    try:
+        if sys.stdout.encoding:
+            "🍋".encode(sys.stdout.encoding)
+        use_emojis = True
+    except (UnicodeEncodeError, AttributeError):
+        use_emojis = False
+
+    if use_emojis:
+        logging.info(
+            "\n"
+            "\n"
+            "🍋 Lemonade Server Ready!\n"
+            f"🍋 Open http://localhost:{app.port} in your browser for:\n"
+            "🍋 💬 chat\n"
+            "🍋 💻 model management\n"
+            "🍋 📄 docs\n"
+        )
+    else:
+        logging.info(
+            "\n"
+            "\n"
+            "[Lemonade] Lemonade Server Ready!\n"
+            f"[Lemonade] Open http://localhost:{app.port} in your browser for:\n"
+            "[Lemonade] chat\n"
+            "[Lemonade] model management\n"
+            "[Lemonade] docs\n"
+        )
+
+    yield