lemonade-sdk 7.0.1-py3-none-any.whl → 7.0.3-py3-none-any.whl

This diff shows the contents of publicly released package versions as published to their public registries. It is provided for informational purposes only.

This release of lemonade-sdk has been flagged as potentially problematic.

@@ -25,270 +25,13 @@ def get_instructions_html(port=8000):
 f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
 )

- # New lemon-themed HTML structure
- # pylint: disable=W1401
- styled_html = f"""
- <!DOCTYPE html>
- <html lang=\"en\">
- <head>
- <meta charset=\"UTF-8\">
- <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">
- <title>Lemonade Server</title>
- <link rel="icon" href="data:,">
- <link rel=\"stylesheet\" href=\"/static/styles.css\">
- <script>
- window.SERVER_PORT = {port};
- </script>
- {server_models_js}
- </head>
- <body>
- <nav class=\"navbar\">
- <a href=\"https://github.com/lemonade-sdk/lemonade\">GitHub</a>
- <a href=\"https://lemonade-server.ai/docs/\">Docs</a>
- <a href=\"https://lemonade-server.ai/docs/server/server_models/\">Models</a>
- <a href=\"https://lemonade-server.ai/docs/server/apps/\">Featured Apps</a>
- </nav>
- <main class=\"main\">
- <div class=\"title\">🍋 Lemonade Server</div>
- <div class=\"tab-container\">
- <div class=\"tabs\">
- <button class=\"tab active\" id=\"tab-chat\" onclick=\"showTab('chat')\">LLM Chat</button>
- <button class=\"tab\" id=\"tab-models\" onclick=\"showTab('models')\">Model Management</button>
- </div>
- <div class=\"tab-content active\" id=\"content-chat\">
- <div class=\"chat-container\">
- <div class=\"chat-history\" id=\"chat-history\"></div>
- <div class=\"chat-input-row\">
- <select id=\"model-select\"></select>
- <input type=\"text\" id=\"chat-input\" placeholder=\"Type your message...\" />
- <button id=\"send-btn\">Send</button>
- </div>
- </div>
- </div>
- <div class=\"tab-content\" id=\"content-models\">
- <div class=\"model-mgmt-container\">
- <div class=\"model-mgmt-pane\">
- <h3>Installed Models</h3>
- <table class=\"model-table\" id=\"installed-models-table\">
- <colgroup><col style=\"width:100%\"></colgroup>
- <tbody id=\"installed-models-tbody\"></tbody>
- </table>
- </div>
- <div class=\"model-mgmt-pane\">
- <h3>Suggested Models</h3>
- <table class=\"model-table\" id=\"suggested-models-table\">
- <tbody id=\"suggested-models-tbody\"></tbody>
- </table>
- </div>
- </div>
- </div>
- </div>
- </main>
- <footer class=\"site-footer\">
- <div class=\"dad-joke\">When life gives you LLMs, make an LLM aide.</div>
- <div class=\"copyright\">Copyright 2025 AMD</div>
- </footer>
- <script src=\"https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js\"></script>
- <script>
- // Tab switching logic
- function showTab(tab) {{
- document.getElementById('tab-chat').classList.remove('active');
- document.getElementById('tab-models').classList.remove('active');
- document.getElementById('content-chat').classList.remove('active');
- document.getElementById('content-models').classList.remove('active');
- if (tab === 'chat') {{
- document.getElementById('tab-chat').classList.add('active');
- document.getElementById('content-chat').classList.add('active');
- }} else {{
- document.getElementById('tab-models').classList.add('active');
- document.getElementById('content-models').classList.add('active');
- }}
- }}
+ # Load HTML template
+ template_path = Path(__file__).parent / "static" / "instructions.html"
+ with open(template_path, "r", encoding="utf-8") as f:
+ html_template = f.read()

- // Helper to get server base URL
- function getServerBaseUrl() {{
- const port = window.SERVER_PORT || 8000;
- return `http://localhost:{port}`;
- }}
+ # Replace template variables
+ html_content = html_template.replace("{{SERVER_PORT}}", str(port))
+ html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)

- // Populate model dropdown from /api/v1/models endpoint
- async function loadModels() {{
- try {{
- const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
- const data = await resp.json();
- const select = document.getElementById('model-select');
- select.innerHTML = '';
- if (!data.data || !Array.isArray(data.data)) {{
- select.innerHTML = '<option>No models found (malformed response)</option>';
- return;
- }}
- if (data.data.length === 0) {{
- select.innerHTML = '<option>No models available</option>';
- return;
- }}
- let defaultIndex = 0;
- data.data.forEach(function(model, index) {{
- const modelId = model.id || model.name || model;
- const opt = document.createElement('option');
- opt.value = modelId;
- opt.textContent = modelId;
- if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {{
- defaultIndex = index;
- }}
- select.appendChild(opt);
- }});
- select.selectedIndex = defaultIndex;
- }} catch (e) {{
- const select = document.getElementById('model-select');
- select.innerHTML = `<option>Error loading models: ${{e.message}}</option>`;
- console.error('Error loading models:', e);
- }}
- }}
- loadModels();
-
- // Model Management Tab Logic
- async function refreshModelMgmtUI() {{
- // Get installed models from /api/v1/models
- let installed = [];
- try {{
- const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
- const data = await resp.json();
- if (data.data && Array.isArray(data.data)) {{
- installed = data.data.map(m => m.id || m.name || m);
- }}
- }} catch (e) {{}}
- // All models from server_models.json (window.SERVER_MODELS)
- const allModels = window.SERVER_MODELS || {{}};
- // Filter suggested models not installed
- const suggested = Object.keys(allModels).filter(
- k => allModels[k].suggested && !installed.includes(k)
- );
- // Render installed models as a table (two columns, second is invisible)
- const installedTbody = document.getElementById('installed-models-tbody');
- installedTbody.innerHTML = '';
- installed.forEach(function(mid) {{
- var tr = document.createElement('tr');
- var tdName = document.createElement('td');
- tdName.textContent = mid;
- var tdEmpty = document.createElement('td');
- tdEmpty.style.width = '0';
- tdEmpty.style.padding = '0';
- tdEmpty.style.border = 'none';
- tr.appendChild(tdName);
- tr.appendChild(tdEmpty);
- installedTbody.appendChild(tr);
- }});
- // Render suggested models as a table
- const suggestedTbody = document.getElementById('suggested-models-tbody');
- suggestedTbody.innerHTML = '';
- suggested.forEach(mid => {{
- const tr = document.createElement('tr');
- const tdName = document.createElement('td');
- tdName.textContent = mid;
- tdName.style.paddingRight = '1em';
- tdName.style.verticalAlign = 'middle';
- const tdBtn = document.createElement('td');
- tdBtn.style.width = '1%';
- tdBtn.style.verticalAlign = 'middle';
- const btn = document.createElement('button');
- btn.textContent = '+';
- btn.title = 'Install model';
- btn.onclick = async function() {{
- btn.disabled = true;
- btn.textContent = 'Installing...';
- btn.classList.add('installing-btn');
- try {{
- await fetch(getServerBaseUrl() + '/api/v1/pull', {{
- method: 'POST',
- headers: {{ 'Content-Type': 'application/json' }},
- body: JSON.stringify({{ model_name: mid }})
- }});
- await refreshModelMgmtUI();
- await loadModels(); // update chat dropdown too
- }} catch (e) {{
- btn.textContent = 'Error';
- }}
- }};
- tdBtn.appendChild(btn);
- tr.appendChild(tdName);
- tr.appendChild(tdBtn);
- suggestedTbody.appendChild(tr);
- }});
- }}
- // Initial load
- refreshModelMgmtUI();
- // Optionally, refresh when switching to the tab
- document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
-
- // Chat logic (streaming with OpenAI JS client placeholder)
- const chatHistory = document.getElementById('chat-history');
- const chatInput = document.getElementById('chat-input');
- const sendBtn = document.getElementById('send-btn');
- const modelSelect = document.getElementById('model-select');
- let messages = [];
-
- function appendMessage(role, text) {{
- const div = document.createElement('div');
- div.className = 'chat-message ' + role;
- // Add a bubble for iMessage style
- const bubble = document.createElement('div');
- bubble.className = 'chat-bubble ' + role;
- bubble.innerHTML = text;
- div.appendChild(bubble);
- chatHistory.appendChild(div);
- chatHistory.scrollTop = chatHistory.scrollHeight;
- }}
-
- async function sendMessage() {{
- const text = chatInput.value.trim();
- if (!text) return;
- appendMessage('user', text);
- messages.push({{ role: 'user', content: text }});
- chatInput.value = '';
- sendBtn.disabled = true;
- // Streaming OpenAI completions (placeholder, adapt as needed)
- let llmText = '';
- appendMessage('llm', '...');
- const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
- try {{
- // Use the correct endpoint for chat completions
- const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {{
- method: 'POST',
- headers: {{ 'Content-Type': 'application/json' }},
- body: JSON.stringify({{
- model: modelSelect.value,
- messages: messages,
- stream: true
- }})
- }});
- if (!resp.body) throw new Error('No stream');
- const reader = resp.body.getReader();
- let decoder = new TextDecoder();
- llmDiv.textContent = '';
- while (true) {{
- const {{ done, value }} = await reader.read();
- if (done) break;
- const chunk = decoder.decode(value);
- if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
- // Try to extract the content from the OpenAI chunk
- const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
- if (match && match[1]) {{
- llmText += match[1];
- llmDiv.textContent = llmText;
- }}
- }}
- messages.push({{ role: 'assistant', content: llmText }});
- }} catch (e) {{
- llmDiv.textContent = '[Error: ' + e.message + ']';
- }}
- sendBtn.disabled = false;
- }}
- sendBtn.onclick = sendMessage;
- chatInput.addEventListener('keydown', function(e) {{
- if (e.key === 'Enter') sendMessage();
- }});
- </script>
- </body>
- </html>
- """
- return HTMLResponse(content=styled_html)
+ return HTMLResponse(content=html_content)
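
In 7.0.3 the inline, f-string-built page is replaced by a static template with two string placeholders. A minimal, self-contained sketch of that flow is below; the FastAPI route, the example filtered_models data, and the route path are illustrative assumptions, not part of the package, and the sketch requires a static/instructions.html file next to the script:

    # Sketch of the 7.0.3 template flow (illustrative, not the package's actual module)
    import json
    from pathlib import Path

    from fastapi import FastAPI
    from fastapi.responses import HTMLResponse

    app = FastAPI()

    @app.get("/")
    def get_instructions_html(port: int = 8000):
        # Example data; the real server builds this from its model registry
        filtered_models = {"Llama-3.2-1B-Instruct-Hybrid": {"suggested": True}}
        server_models_js = (
            f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
        )

        # Load the static HTML template shipped with the package
        template_path = Path(__file__).parent / "static" / "instructions.html"
        with open(template_path, "r", encoding="utf-8") as f:
            html_template = f.read()

        # Substitute the two placeholders the template is expected to contain
        html_content = html_template.replace("{{SERVER_PORT}}", str(port))
        html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)
        return HTMLResponse(content=html_content)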
@@ -14,11 +14,11 @@ from fastapi.responses import StreamingResponse

 from openai import OpenAI

+ from lemonade_server.pydantic_models import ChatCompletionRequest
 from lemonade_server.model_manager import ModelManager
- from lemonade.tools.server.pydantic_models import ChatCompletionRequest
+ from lemonade.tools.server.port_utils import find_free_port

 LLAMA_VERSION = "b5543"
- LLAMA_SERVER_PORT = "8081"

 LLAMA_SERVER_EXE_DIR = os.path.join(
 os.path.dirname(sys.executable),
@@ -43,6 +43,23 @@ class LlamaTelemetry:
 self.tokens_per_second = None
 self.prompt_eval_time = None
 self.eval_time = None
+ self.port = None
+
+ def choose_port(self):
+ """
+ Users probably don't care what port we start llama-server on, so let's
+ search for an empty port
+ """
+
+ self.port = find_free_port()
+
+ if self.port is None:
+ msg = "Failed to find an empty port to start llama-server on"
+ logging.error(msg)
+ raise HTTPException(
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+ detail=msg,
+ )

 def parse_telemetry_line(self, line: str):
 """
@@ -128,10 +145,12 @@ def _log_subprocess_output(
 break


- def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+ def _wait_for_load(
+ llama_server_process: subprocess.Popen, port: int, fail_message: str
+ ):
 status_code = None
 while not llama_server_process.poll() and status_code != 200:
- health_url = f"http://localhost:{LLAMA_SERVER_PORT}/health"
+ health_url = f"http://localhost:{port}/health"
 try:
 health_response = requests.get(health_url)
 except requests.exceptions.ConnectionError:
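
The reworked _wait_for_load now takes the dynamically chosen port instead of the removed LLAMA_SERVER_PORT constant. A minimal sketch of the polling idea, under the assumption of a fixed sleep interval and a differently named helper (the real function also logs the fail message and handles more of the response):

    import subprocess
    import time

    import requests


    def wait_for_health(process: subprocess.Popen, port: int, poll_seconds: float = 1.0) -> bool:
        """Poll llama-server's /health endpoint until it returns 200 or the process exits."""
        while process.poll() is None:
            try:
                if requests.get(f"http://localhost:{port}/health").status_code == 200:
                    return True
            except requests.exceptions.ConnectionError:
                pass  # server not accepting connections yet
            time.sleep(poll_seconds)
        return False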
@@ -146,19 +165,25 @@ def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):


 def _launch_llama_subprocess(
- model_path: str, use_gpu: bool, telemetry: LlamaTelemetry
+ snapshot_files: dict, use_gpu: bool, telemetry: LlamaTelemetry
 ) -> subprocess.Popen:
 """
 Launch llama server subprocess with GPU or CPU configuration
 """

- base_command = [
- LLAMA_SERVER_EXE_PATH,
- "-m",
- model_path,
- "--port",
- LLAMA_SERVER_PORT,
- ]
+ # Build the base command
+ base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
+ if "mmproj" in snapshot_files:
+ base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+ if not use_gpu:
+ base_command.extend(["--no-mmproj-offload"])
+
+ # Find a port, and save it in the telemetry object for future reference
+ # by other functions
+ telemetry.choose_port()
+
+ # Add port and jinja to enable tool use
+ base_command.extend(["--port", str(telemetry.port), "--jinja"])

 # Configure GPU layers: 99 for GPU, 0 for CPU-only
 ngl_value = "99" if use_gpu else "0"
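
Put together, the command 7.0.3 assembles looks roughly like the sketch below. The file paths and port value are made-up examples, and the GPU-layer flag name is an assumption; the diff only shows the "99"/"0" value being computed:

    # Example of the assembled llama-server invocation (illustrative values only)
    snapshot_files = {
        "variant": r"C:\hf-cache\model-Q4_K_M.gguf",
        "mmproj": r"C:\hf-cache\mmproj-f16.gguf",  # only present for vision models
    }
    use_gpu = True
    port = 51515  # whatever find_free_port() returned via telemetry.choose_port()

    base_command = ["llama-server.exe", "-m", snapshot_files["variant"]]
    if "mmproj" in snapshot_files:
        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
    if not use_gpu:
        base_command.extend(["--no-mmproj-offload"])
    base_command.extend(["--port", str(port), "--jinja"])
    # GPU layers: flag name assumed here; the diff only shows the value selection
    base_command.extend(["-ngl", "99" if use_gpu else "0"])

    print(" ".join(base_command))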
@@ -180,7 +205,7 @@ def _launch_llama_subprocess(
 return process


- def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
+ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
 # Download llama.cpp server if it isn't already available
 if not os.path.exists(LLAMA_SERVER_EXE_DIR):
 # Download llama.cpp server zip
@@ -212,33 +237,34 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
 logging.info("Cleaned up zip file")

 # Download the gguf to the hugging face cache
- snapshot_path = ModelManager().download_gguf(checkpoint)
- model_path = os.path.join(snapshot_path, os.listdir(snapshot_path)[0])
- logging.debug(f"GGUF file path: {model_path}")
+ snapshot_files = ModelManager().download_gguf(model_config)
+ logging.debug(f"GGUF file paths: {snapshot_files}")

 # Start the llama-serve.exe process
 logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")

 # Attempt loading on GPU first
 llama_server_process = _launch_llama_subprocess(
- model_path, use_gpu=True, telemetry=telemetry
+ snapshot_files, use_gpu=True, telemetry=telemetry
 )

 # Check the /health endpoint until GPU server is ready
 _wait_for_load(
 llama_server_process,
+ telemetry.port,
 f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
 )

 # If loading on GPU failed, try loading on CPU
 if llama_server_process.poll():
 llama_server_process = _launch_llama_subprocess(
- model_path, use_gpu=False, telemetry=telemetry
+ snapshot_files, use_gpu=False, telemetry=telemetry
 )

 # Check the /health endpoint until CPU server is ready
 _wait_for_load(
 llama_server_process,
+ telemetry.port,
 f"Loading {model_reference} on CPU didn't work",
 )

@@ -254,7 +280,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
 def chat_completion(
 chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
 ):
- base_url = f"http://127.0.0.1:{LLAMA_SERVER_PORT}/v1"
+ base_url = f"http://127.0.0.1:{telemetry.port}/v1"
 client = OpenAI(
 base_url=base_url,
 api_key="lemonade",
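
With the fixed 8081 constant gone, the chat-completion path simply points the OpenAI client at whatever port the telemetry object recorded. A hedged usage sketch against such a locally running llama-server; the port value, model name, and messages are examples:

    from openai import OpenAI

    port = 51515  # telemetry.port in the real code
    client = OpenAI(
        base_url=f"http://127.0.0.1:{port}/v1",
        api_key="lemonade",  # dummy key, matching the value used in the diff
    )

    stream = client.chat.completions.create(
        model="Llama-3.2-1B-Instruct-Hybrid",
        messages=[{"role": "user", "content": "Hello!"}],
        stream=True,
    )
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)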
@@ -0,0 +1,57 @@
+ import socketserver
+ import sys
+ import logging
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI
+
+
+ def find_free_port():
+ """
+ Scans for an unoccupied TCP port
+
+ Returns the port number as an int on success
+ Returns None if no port can be found
+ """
+
+ try:
+ with socketserver.TCPServer(("localhost", 0), None) as s:
+ return s.server_address[1]
+ # pylint: disable=broad-exception-caught
+ except Exception:
+ return None
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+ # Code here will run when the application starts up
+ # Check if console can handle Unicode by testing emoji encoding
+
+ try:
+ if sys.stdout.encoding:
+ "🍋".encode(sys.stdout.encoding)
+ use_emojis = True
+ except (UnicodeEncodeError, AttributeError):
+ use_emojis = False
+
+ if use_emojis:
+ logging.info(
+ "\n"
+ "\n"
+ "🍋 Lemonade Server Ready!\n"
+ f"🍋 Open http://localhost:{app.port} in your browser for:\n"
+ "🍋 💬 chat\n"
+ "🍋 💻 model management\n"
+ "🍋 📄 docs\n"
+ )
+ else:
+ logging.info(
+ "\n"
+ "\n"
+ "[Lemonade] Lemonade Server Ready!\n"
+ f"[Lemonade] Open http://localhost:{app.port} in your browser for:\n"
+ "[Lemonade] chat\n"
+ "[Lemonade] model management\n"
+ "[Lemonade] docs\n"
+ )
+
+ yield
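
The new port_utils module exposes both the free-port scan and a FastAPI lifespan handler whose startup banner reads an app.port attribute. A sketch of how it could be wired up; setting port on the app object and the uvicorn entry point are assumptions based on the attribute the handler reads, not code shown in this diff:

    import logging

    import uvicorn
    from fastapi import FastAPI
    from lemonade.tools.server.port_utils import find_free_port, lifespan

    logging.basicConfig(level=logging.INFO)

    app = FastAPI(lifespan=lifespan)
    app.port = 8000  # the lifespan banner reads this attribute

    if __name__ == "__main__":
        # find_free_port() could be used here instead of a fixed port,
        # mirroring what llama-server itself now does
        uvicorn.run(app, host="localhost", port=app.port)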