lemonade-sdk 7.0.2__py3-none-any.whl → 7.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic. Click here for more details.
- lemonade/tools/huggingface_load.py +6 -0
- lemonade/tools/ort_genai/oga.py +6 -4
- lemonade/tools/prompt.py +28 -1
- lemonade/tools/server/instructions.py +8 -265
- lemonade/tools/server/llamacpp.py +16 -16
- lemonade/tools/server/serve.py +5 -5
- lemonade/tools/server/static/instructions.html +262 -0
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.2.dist-info → lemonade_sdk-7.0.3.dist-info}/METADATA +1 -1
- {lemonade_sdk-7.0.2.dist-info → lemonade_sdk-7.0.3.dist-info}/RECORD +18 -17
- lemonade_server/model_manager.py +45 -12
- {lemonade/tools/server → lemonade_server}/pydantic_models.py +2 -0
- lemonade_server/server_models.json +14 -0
- {lemonade_sdk-7.0.2.dist-info → lemonade_sdk-7.0.3.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.2.dist-info → lemonade_sdk-7.0.3.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.2.dist-info → lemonade_sdk-7.0.3.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.2.dist-info → lemonade_sdk-7.0.3.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.2.dist-info → lemonade_sdk-7.0.3.dist-info}/top_level.txt +0 -0
lemonade/tools/huggingface_load.py
CHANGED
|
@@ -326,6 +326,7 @@ class HuggingfaceAdapter(ModelAdapter):
|
|
|
326
326
|
def generate(
|
|
327
327
|
self,
|
|
328
328
|
input_ids,
|
|
329
|
+
random_seed=1,
|
|
329
330
|
**kwargs,
|
|
330
331
|
):
|
|
331
332
|
|
|
@@ -346,6 +347,11 @@ class HuggingfaceAdapter(ModelAdapter):
|
|
|
346
347
|
**kwargs,
|
|
347
348
|
}
|
|
348
349
|
|
|
350
|
+
if random_seed is None:
|
|
351
|
+
torch.random.seed()
|
|
352
|
+
else:
|
|
353
|
+
torch.random.manual_seed(random_seed)
|
|
354
|
+
|
|
349
355
|
with torch.no_grad(), torch.inference_mode():
|
|
350
356
|
outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
|
|
351
357
|
|
lemonade/tools/ort_genai/oga.py
CHANGED
|
@@ -139,6 +139,7 @@ class OrtGenaiModel(ModelAdapter):
|
|
|
139
139
|
pad_token_id=None,
|
|
140
140
|
stopping_criteria=None,
|
|
141
141
|
max_length=None,
|
|
142
|
+
random_seed=1,
|
|
142
143
|
):
|
|
143
144
|
params = og.GeneratorParams(self.model)
|
|
144
145
|
|
|
@@ -179,6 +180,9 @@ class OrtGenaiModel(ModelAdapter):
|
|
|
179
180
|
if use_oga_pre_6_api:
|
|
180
181
|
params.input_ids = input_ids
|
|
181
182
|
|
|
183
|
+
if random_seed is None:
|
|
184
|
+
random_seed = -1 # In og.Generator, -1 = seed with random device
|
|
185
|
+
|
|
182
186
|
if self.config and "search" in self.config:
|
|
183
187
|
search_config = self.config["search"]
|
|
184
188
|
params.set_search_options(
|
|
@@ -196,10 +200,7 @@ class OrtGenaiModel(ModelAdapter):
|
|
|
196
200
|
past_present_share_buffer=search_config.get(
|
|
197
201
|
"past_present_share_buffer", True
|
|
198
202
|
),
|
|
199
|
-
|
|
200
|
-
# by default, random_seed=-1 causes different laptops to give
|
|
201
|
-
# different results
|
|
202
|
-
random_seed=1,
|
|
203
|
+
random_seed=random_seed,
|
|
203
204
|
# Not currently supported by OGA
|
|
204
205
|
# diversity_penalty=search_config.get('diversity_penalty', 0.0),
|
|
205
206
|
# no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
|
|
@@ -212,6 +213,7 @@ class OrtGenaiModel(ModelAdapter):
|
|
|
212
213
|
temperature=temperature,
|
|
213
214
|
max_length=max_length_to_use,
|
|
214
215
|
min_length=min_length,
|
|
216
|
+
random_seed=random_seed,
|
|
215
217
|
)
|
|
216
218
|
params.try_graph_capture_with_max_batch_size(1)
|
|
217
219
|
|
lemonade/tools/prompt.py
CHANGED
|
@@ -15,6 +15,7 @@ DEFAULT_GENERATE_PARAMS = {
|
|
|
15
15
|
"temperature": 0.7,
|
|
16
16
|
}
|
|
17
17
|
|
|
18
|
+
DEFAULT_RANDOM_SEED = 1
|
|
18
19
|
DEFAULT_MAX_NEW_TOKENS = 512
|
|
19
20
|
DEFAULT_N_TRIALS = 1
|
|
20
21
|
|
|
@@ -108,6 +109,19 @@ class LLMPrompt(Tool):
|
|
|
108
109
|
f"(useful for testing, default is {DEFAULT_N_TRIALS})",
|
|
109
110
|
)
|
|
110
111
|
|
|
112
|
+
parser.add_argument(
|
|
113
|
+
"--random-seed",
|
|
114
|
+
"-r",
|
|
115
|
+
default=str(DEFAULT_RANDOM_SEED),
|
|
116
|
+
help="Positive integer seed for random number generator used in "
|
|
117
|
+
"sampling tokens "
|
|
118
|
+
f"(default is {DEFAULT_RANDOM_SEED}). If the number of trials is "
|
|
119
|
+
"greater than one, then the seed is incremented by one for each "
|
|
120
|
+
"trial. Set to `None` for random, non-repeatable results. This "
|
|
121
|
+
"random seed behavior only applies to models loaded with "
|
|
122
|
+
"`oga-load` or `huggingface-load`.",
|
|
123
|
+
)
|
|
124
|
+
|
|
111
125
|
return parser
|
|
112
126
|
|
|
113
127
|
def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
|
|
@@ -123,6 +137,11 @@ class LLMPrompt(Tool):
|
|
|
123
137
|
with open(parsed_args.prompt, "r", encoding="utf-8") as f:
|
|
124
138
|
parsed_args.prompt = f.read()
|
|
125
139
|
|
|
140
|
+
if parsed_args.random_seed == "None":
|
|
141
|
+
parsed_args.random_seed = None
|
|
142
|
+
else:
|
|
143
|
+
parsed_args.random_seed = int(parsed_args.random_seed)
|
|
144
|
+
|
|
126
145
|
return parsed_args
|
|
127
146
|
|
|
128
147
|
def run(
|
|
@@ -132,6 +151,7 @@ class LLMPrompt(Tool):
|
|
|
132
151
|
max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
|
|
133
152
|
n_trials: int = DEFAULT_N_TRIALS,
|
|
134
153
|
template: bool = False,
|
|
154
|
+
random_seed: int = DEFAULT_RANDOM_SEED,
|
|
135
155
|
) -> State:
|
|
136
156
|
|
|
137
157
|
model: ModelAdapter = state.model
|
|
@@ -170,9 +190,16 @@ class LLMPrompt(Tool):
|
|
|
170
190
|
|
|
171
191
|
# Get the response from the LLM, which may include the prompt in it
|
|
172
192
|
response = model.generate(
|
|
173
|
-
input_ids,
|
|
193
|
+
input_ids,
|
|
194
|
+
max_new_tokens=max_new_tokens,
|
|
195
|
+
random_seed=random_seed,
|
|
196
|
+
**DEFAULT_GENERATE_PARAMS,
|
|
174
197
|
)
|
|
175
198
|
|
|
199
|
+
# Increment random seed if not none
|
|
200
|
+
if random_seed is not None:
|
|
201
|
+
random_seed += 1
|
|
202
|
+
|
|
176
203
|
# Flatten the input and response
|
|
177
204
|
input_ids_array = (
|
|
178
205
|
input_ids if isinstance(input_ids, (list, str)) else input_ids[0]
|
lemonade/tools/server/instructions.py
CHANGED
|
@@ -25,270 +25,13 @@ def get_instructions_html(port=8000):
|
|
|
25
25
|
f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
|
|
26
26
|
)
|
|
27
27
|
|
|
28
|
-
#
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
<html lang=\"en\">
|
|
33
|
-
<head>
|
|
34
|
-
<meta charset=\"UTF-8\">
|
|
35
|
-
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">
|
|
36
|
-
<title>Lemonade Server</title>
|
|
37
|
-
<link rel="icon" href="data:,">
|
|
38
|
-
<link rel=\"stylesheet\" href=\"/static/styles.css\">
|
|
39
|
-
<script>
|
|
40
|
-
window.SERVER_PORT = {port};
|
|
41
|
-
</script>
|
|
42
|
-
{server_models_js}
|
|
43
|
-
</head>
|
|
44
|
-
<body>
|
|
45
|
-
<nav class=\"navbar\">
|
|
46
|
-
<a href=\"https://github.com/lemonade-sdk/lemonade\">GitHub</a>
|
|
47
|
-
<a href=\"https://lemonade-server.ai/docs/\">Docs</a>
|
|
48
|
-
<a href=\"https://lemonade-server.ai/docs/server/server_models/\">Models</a>
|
|
49
|
-
<a href=\"https://lemonade-server.ai/docs/server/apps/\">Featured Apps</a>
|
|
50
|
-
</nav>
|
|
51
|
-
<main class=\"main\">
|
|
52
|
-
<div class=\"title\">🍋 Lemonade Server</div>
|
|
53
|
-
<div class=\"tab-container\">
|
|
54
|
-
<div class=\"tabs\">
|
|
55
|
-
<button class=\"tab active\" id=\"tab-chat\" onclick=\"showTab('chat')\">LLM Chat</button>
|
|
56
|
-
<button class=\"tab\" id=\"tab-models\" onclick=\"showTab('models')\">Model Management</button>
|
|
57
|
-
</div>
|
|
58
|
-
<div class=\"tab-content active\" id=\"content-chat\">
|
|
59
|
-
<div class=\"chat-container\">
|
|
60
|
-
<div class=\"chat-history\" id=\"chat-history\"></div>
|
|
61
|
-
<div class=\"chat-input-row\">
|
|
62
|
-
<select id=\"model-select\"></select>
|
|
63
|
-
<input type=\"text\" id=\"chat-input\" placeholder=\"Type your message...\" />
|
|
64
|
-
<button id=\"send-btn\">Send</button>
|
|
65
|
-
</div>
|
|
66
|
-
</div>
|
|
67
|
-
</div>
|
|
68
|
-
<div class=\"tab-content\" id=\"content-models\">
|
|
69
|
-
<div class=\"model-mgmt-container\">
|
|
70
|
-
<div class=\"model-mgmt-pane\">
|
|
71
|
-
<h3>Installed Models</h3>
|
|
72
|
-
<table class=\"model-table\" id=\"installed-models-table\">
|
|
73
|
-
<colgroup><col style=\"width:100%\"></colgroup>
|
|
74
|
-
<tbody id=\"installed-models-tbody\"></tbody>
|
|
75
|
-
</table>
|
|
76
|
-
</div>
|
|
77
|
-
<div class=\"model-mgmt-pane\">
|
|
78
|
-
<h3>Suggested Models</h3>
|
|
79
|
-
<table class=\"model-table\" id=\"suggested-models-table\">
|
|
80
|
-
<tbody id=\"suggested-models-tbody\"></tbody>
|
|
81
|
-
</table>
|
|
82
|
-
</div>
|
|
83
|
-
</div>
|
|
84
|
-
</div>
|
|
85
|
-
</div>
|
|
86
|
-
</main>
|
|
87
|
-
<footer class=\"site-footer\">
|
|
88
|
-
<div class=\"dad-joke\">When life gives you LLMs, make an LLM aide.</div>
|
|
89
|
-
<div class=\"copyright\">Copyright 2025 AMD</div>
|
|
90
|
-
</footer>
|
|
91
|
-
<script src=\"https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js\"></script>
|
|
92
|
-
<script>
|
|
93
|
-
// Tab switching logic
|
|
94
|
-
function showTab(tab) {{
|
|
95
|
-
document.getElementById('tab-chat').classList.remove('active');
|
|
96
|
-
document.getElementById('tab-models').classList.remove('active');
|
|
97
|
-
document.getElementById('content-chat').classList.remove('active');
|
|
98
|
-
document.getElementById('content-models').classList.remove('active');
|
|
99
|
-
if (tab === 'chat') {{
|
|
100
|
-
document.getElementById('tab-chat').classList.add('active');
|
|
101
|
-
document.getElementById('content-chat').classList.add('active');
|
|
102
|
-
}} else {{
|
|
103
|
-
document.getElementById('tab-models').classList.add('active');
|
|
104
|
-
document.getElementById('content-models').classList.add('active');
|
|
105
|
-
}}
|
|
106
|
-
}}
|
|
28
|
+
# Load HTML template
|
|
29
|
+
template_path = Path(__file__).parent / "static" / "instructions.html"
|
|
30
|
+
with open(template_path, "r", encoding="utf-8") as f:
|
|
31
|
+
html_template = f.read()
|
|
107
32
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
return `http://localhost:{port}`;
|
|
112
|
-
}}
|
|
33
|
+
# Replace template variables
|
|
34
|
+
html_content = html_template.replace("{{SERVER_PORT}}", str(port))
|
|
35
|
+
html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)
|
|
113
36
|
|
|
114
|
-
|
|
115
|
-
async function loadModels() {{
|
|
116
|
-
try {{
|
|
117
|
-
const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
|
|
118
|
-
const data = await resp.json();
|
|
119
|
-
const select = document.getElementById('model-select');
|
|
120
|
-
select.innerHTML = '';
|
|
121
|
-
if (!data.data || !Array.isArray(data.data)) {{
|
|
122
|
-
select.innerHTML = '<option>No models found (malformed response)</option>';
|
|
123
|
-
return;
|
|
124
|
-
}}
|
|
125
|
-
if (data.data.length === 0) {{
|
|
126
|
-
select.innerHTML = '<option>No models available</option>';
|
|
127
|
-
return;
|
|
128
|
-
}}
|
|
129
|
-
let defaultIndex = 0;
|
|
130
|
-
data.data.forEach(function(model, index) {{
|
|
131
|
-
const modelId = model.id || model.name || model;
|
|
132
|
-
const opt = document.createElement('option');
|
|
133
|
-
opt.value = modelId;
|
|
134
|
-
opt.textContent = modelId;
|
|
135
|
-
if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {{
|
|
136
|
-
defaultIndex = index;
|
|
137
|
-
}}
|
|
138
|
-
select.appendChild(opt);
|
|
139
|
-
}});
|
|
140
|
-
select.selectedIndex = defaultIndex;
|
|
141
|
-
}} catch (e) {{
|
|
142
|
-
const select = document.getElementById('model-select');
|
|
143
|
-
select.innerHTML = `<option>Error loading models: ${{e.message}}</option>`;
|
|
144
|
-
console.error('Error loading models:', e);
|
|
145
|
-
}}
|
|
146
|
-
}}
|
|
147
|
-
loadModels();
|
|
148
|
-
|
|
149
|
-
// Model Management Tab Logic
|
|
150
|
-
async function refreshModelMgmtUI() {{
|
|
151
|
-
// Get installed models from /api/v1/models
|
|
152
|
-
let installed = [];
|
|
153
|
-
try {{
|
|
154
|
-
const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
|
|
155
|
-
const data = await resp.json();
|
|
156
|
-
if (data.data && Array.isArray(data.data)) {{
|
|
157
|
-
installed = data.data.map(m => m.id || m.name || m);
|
|
158
|
-
}}
|
|
159
|
-
}} catch (e) {{}}
|
|
160
|
-
// All models from server_models.json (window.SERVER_MODELS)
|
|
161
|
-
const allModels = window.SERVER_MODELS || {{}};
|
|
162
|
-
// Filter suggested models not installed
|
|
163
|
-
const suggested = Object.keys(allModels).filter(
|
|
164
|
-
k => allModels[k].suggested && !installed.includes(k)
|
|
165
|
-
);
|
|
166
|
-
// Render installed models as a table (two columns, second is invisible)
|
|
167
|
-
const installedTbody = document.getElementById('installed-models-tbody');
|
|
168
|
-
installedTbody.innerHTML = '';
|
|
169
|
-
installed.forEach(function(mid) {{
|
|
170
|
-
var tr = document.createElement('tr');
|
|
171
|
-
var tdName = document.createElement('td');
|
|
172
|
-
tdName.textContent = mid;
|
|
173
|
-
var tdEmpty = document.createElement('td');
|
|
174
|
-
tdEmpty.style.width = '0';
|
|
175
|
-
tdEmpty.style.padding = '0';
|
|
176
|
-
tdEmpty.style.border = 'none';
|
|
177
|
-
tr.appendChild(tdName);
|
|
178
|
-
tr.appendChild(tdEmpty);
|
|
179
|
-
installedTbody.appendChild(tr);
|
|
180
|
-
}});
|
|
181
|
-
// Render suggested models as a table
|
|
182
|
-
const suggestedTbody = document.getElementById('suggested-models-tbody');
|
|
183
|
-
suggestedTbody.innerHTML = '';
|
|
184
|
-
suggested.forEach(mid => {{
|
|
185
|
-
const tr = document.createElement('tr');
|
|
186
|
-
const tdName = document.createElement('td');
|
|
187
|
-
tdName.textContent = mid;
|
|
188
|
-
tdName.style.paddingRight = '1em';
|
|
189
|
-
tdName.style.verticalAlign = 'middle';
|
|
190
|
-
const tdBtn = document.createElement('td');
|
|
191
|
-
tdBtn.style.width = '1%';
|
|
192
|
-
tdBtn.style.verticalAlign = 'middle';
|
|
193
|
-
const btn = document.createElement('button');
|
|
194
|
-
btn.textContent = '+';
|
|
195
|
-
btn.title = 'Install model';
|
|
196
|
-
btn.onclick = async function() {{
|
|
197
|
-
btn.disabled = true;
|
|
198
|
-
btn.textContent = 'Installing...';
|
|
199
|
-
btn.classList.add('installing-btn');
|
|
200
|
-
try {{
|
|
201
|
-
await fetch(getServerBaseUrl() + '/api/v1/pull', {{
|
|
202
|
-
method: 'POST',
|
|
203
|
-
headers: {{ 'Content-Type': 'application/json' }},
|
|
204
|
-
body: JSON.stringify({{ model_name: mid }})
|
|
205
|
-
}});
|
|
206
|
-
await refreshModelMgmtUI();
|
|
207
|
-
await loadModels(); // update chat dropdown too
|
|
208
|
-
}} catch (e) {{
|
|
209
|
-
btn.textContent = 'Error';
|
|
210
|
-
}}
|
|
211
|
-
}};
|
|
212
|
-
tdBtn.appendChild(btn);
|
|
213
|
-
tr.appendChild(tdName);
|
|
214
|
-
tr.appendChild(tdBtn);
|
|
215
|
-
suggestedTbody.appendChild(tr);
|
|
216
|
-
}});
|
|
217
|
-
}}
|
|
218
|
-
// Initial load
|
|
219
|
-
refreshModelMgmtUI();
|
|
220
|
-
// Optionally, refresh when switching to the tab
|
|
221
|
-
document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
|
|
222
|
-
|
|
223
|
-
// Chat logic (streaming with OpenAI JS client placeholder)
|
|
224
|
-
const chatHistory = document.getElementById('chat-history');
|
|
225
|
-
const chatInput = document.getElementById('chat-input');
|
|
226
|
-
const sendBtn = document.getElementById('send-btn');
|
|
227
|
-
const modelSelect = document.getElementById('model-select');
|
|
228
|
-
let messages = [];
|
|
229
|
-
|
|
230
|
-
function appendMessage(role, text) {{
|
|
231
|
-
const div = document.createElement('div');
|
|
232
|
-
div.className = 'chat-message ' + role;
|
|
233
|
-
// Add a bubble for iMessage style
|
|
234
|
-
const bubble = document.createElement('div');
|
|
235
|
-
bubble.className = 'chat-bubble ' + role;
|
|
236
|
-
bubble.innerHTML = text;
|
|
237
|
-
div.appendChild(bubble);
|
|
238
|
-
chatHistory.appendChild(div);
|
|
239
|
-
chatHistory.scrollTop = chatHistory.scrollHeight;
|
|
240
|
-
}}
|
|
241
|
-
|
|
242
|
-
async function sendMessage() {{
|
|
243
|
-
const text = chatInput.value.trim();
|
|
244
|
-
if (!text) return;
|
|
245
|
-
appendMessage('user', text);
|
|
246
|
-
messages.push({{ role: 'user', content: text }});
|
|
247
|
-
chatInput.value = '';
|
|
248
|
-
sendBtn.disabled = true;
|
|
249
|
-
// Streaming OpenAI completions (placeholder, adapt as needed)
|
|
250
|
-
let llmText = '';
|
|
251
|
-
appendMessage('llm', '...');
|
|
252
|
-
const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
|
|
253
|
-
try {{
|
|
254
|
-
// Use the correct endpoint for chat completions
|
|
255
|
-
const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {{
|
|
256
|
-
method: 'POST',
|
|
257
|
-
headers: {{ 'Content-Type': 'application/json' }},
|
|
258
|
-
body: JSON.stringify({{
|
|
259
|
-
model: modelSelect.value,
|
|
260
|
-
messages: messages,
|
|
261
|
-
stream: true
|
|
262
|
-
}})
|
|
263
|
-
}});
|
|
264
|
-
if (!resp.body) throw new Error('No stream');
|
|
265
|
-
const reader = resp.body.getReader();
|
|
266
|
-
let decoder = new TextDecoder();
|
|
267
|
-
llmDiv.textContent = '';
|
|
268
|
-
while (true) {{
|
|
269
|
-
const {{ done, value }} = await reader.read();
|
|
270
|
-
if (done) break;
|
|
271
|
-
const chunk = decoder.decode(value);
|
|
272
|
-
if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
|
|
273
|
-
// Try to extract the content from the OpenAI chunk
|
|
274
|
-
const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
|
|
275
|
-
if (match && match[1]) {{
|
|
276
|
-
llmText += match[1];
|
|
277
|
-
llmDiv.textContent = llmText;
|
|
278
|
-
}}
|
|
279
|
-
}}
|
|
280
|
-
messages.push({{ role: 'assistant', content: llmText }});
|
|
281
|
-
}} catch (e) {{
|
|
282
|
-
llmDiv.textContent = '[Error: ' + e.message + ']';
|
|
283
|
-
}}
|
|
284
|
-
sendBtn.disabled = false;
|
|
285
|
-
}}
|
|
286
|
-
sendBtn.onclick = sendMessage;
|
|
287
|
-
chatInput.addEventListener('keydown', function(e) {{
|
|
288
|
-
if (e.key === 'Enter') sendMessage();
|
|
289
|
-
}});
|
|
290
|
-
</script>
|
|
291
|
-
</body>
|
|
292
|
-
</html>
|
|
293
|
-
"""
|
|
294
|
-
return HTMLResponse(content=styled_html)
|
|
37
|
+
return HTMLResponse(content=html_content)
|
lemonade/tools/server/llamacpp.py
CHANGED
|
@@ -14,8 +14,8 @@ from fastapi.responses import StreamingResponse
|
|
|
14
14
|
|
|
15
15
|
from openai import OpenAI
|
|
16
16
|
|
|
17
|
+
from lemonade_server.pydantic_models import ChatCompletionRequest
|
|
17
18
|
from lemonade_server.model_manager import ModelManager
|
|
18
|
-
from lemonade.tools.server.pydantic_models import ChatCompletionRequest
|
|
19
19
|
from lemonade.tools.server.port_utils import find_free_port
|
|
20
20
|
|
|
21
21
|
LLAMA_VERSION = "b5543"
|
|
@@ -165,24 +165,25 @@ def _wait_for_load(
|
|
|
165
165
|
|
|
166
166
|
|
|
167
167
|
def _launch_llama_subprocess(
|
|
168
|
-
|
|
168
|
+
snapshot_files: dict, use_gpu: bool, telemetry: LlamaTelemetry
|
|
169
169
|
) -> subprocess.Popen:
|
|
170
170
|
"""
|
|
171
171
|
Launch llama server subprocess with GPU or CPU configuration
|
|
172
172
|
"""
|
|
173
173
|
|
|
174
|
+
# Build the base command
|
|
175
|
+
base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
|
|
176
|
+
if "mmproj" in snapshot_files:
|
|
177
|
+
base_command.extend(["--mmproj", snapshot_files["mmproj"]])
|
|
178
|
+
if not use_gpu:
|
|
179
|
+
base_command.extend(["--no-mmproj-offload"])
|
|
180
|
+
|
|
174
181
|
# Find a port, and save it in the telemetry object for future reference
|
|
175
182
|
# by other functions
|
|
176
183
|
telemetry.choose_port()
|
|
177
184
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
"-m",
|
|
181
|
-
model_path,
|
|
182
|
-
"--port",
|
|
183
|
-
str(telemetry.port),
|
|
184
|
-
"--jinja",
|
|
185
|
-
]
|
|
185
|
+
# Add port and jinja to enable tool use
|
|
186
|
+
base_command.extend(["--port", str(telemetry.port), "--jinja"])
|
|
186
187
|
|
|
187
188
|
# Configure GPU layers: 99 for GPU, 0 for CPU-only
|
|
188
189
|
ngl_value = "99" if use_gpu else "0"
|
|
@@ -204,7 +205,7 @@ def _launch_llama_subprocess(
|
|
|
204
205
|
return process
|
|
205
206
|
|
|
206
207
|
|
|
207
|
-
def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
|
|
208
|
+
def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
|
|
208
209
|
# Download llama.cpp server if it isn't already available
|
|
209
210
|
if not os.path.exists(LLAMA_SERVER_EXE_DIR):
|
|
210
211
|
# Download llama.cpp server zip
|
|
@@ -236,16 +237,15 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
|
|
|
236
237
|
logging.info("Cleaned up zip file")
|
|
237
238
|
|
|
238
239
|
# Download the gguf to the hugging face cache
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
logging.debug(f"GGUF file path: {model_path}")
|
|
240
|
+
snapshot_files = ModelManager().download_gguf(model_config)
|
|
241
|
+
logging.debug(f"GGUF file paths: {snapshot_files}")
|
|
242
242
|
|
|
243
243
|
# Start the llama-serve.exe process
|
|
244
244
|
logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")
|
|
245
245
|
|
|
246
246
|
# Attempt loading on GPU first
|
|
247
247
|
llama_server_process = _launch_llama_subprocess(
|
|
248
|
-
|
|
248
|
+
snapshot_files, use_gpu=True, telemetry=telemetry
|
|
249
249
|
)
|
|
250
250
|
|
|
251
251
|
# Check the /health endpoint until GPU server is ready
|
|
@@ -258,7 +258,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
|
|
|
258
258
|
# If loading on GPU failed, try loading on CPU
|
|
259
259
|
if llama_server_process.poll():
|
|
260
260
|
llama_server_process = _launch_llama_subprocess(
|
|
261
|
-
|
|
261
|
+
snapshot_files, use_gpu=False, telemetry=telemetry
|
|
262
262
|
)
|
|
263
263
|
|
|
264
264
|
# Check the /health endpoint until CPU server is ready
|
lemonade/tools/server/serve.py
CHANGED
|
@@ -46,9 +46,7 @@ from openai.types.responses import (
|
|
|
46
46
|
|
|
47
47
|
import lemonade.api as lemonade_api
|
|
48
48
|
from lemonade_server.model_manager import ModelManager
|
|
49
|
-
from lemonade.tools.management_tools import ManagementTool
|
|
50
|
-
import lemonade.tools.server.llamacpp as llamacpp
|
|
51
|
-
from lemonade.tools.server.pydantic_models import (
|
|
49
|
+
from lemonade_server.pydantic_models import (
|
|
52
50
|
DEFAULT_MAX_NEW_TOKENS,
|
|
53
51
|
LoadConfig,
|
|
54
52
|
CompletionRequest,
|
|
@@ -56,6 +54,8 @@ from lemonade.tools.server.pydantic_models import (
|
|
|
56
54
|
ResponsesRequest,
|
|
57
55
|
PullConfig,
|
|
58
56
|
)
|
|
57
|
+
from lemonade.tools.management_tools import ManagementTool
|
|
58
|
+
import lemonade.tools.server.llamacpp as llamacpp
|
|
59
59
|
from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
|
|
60
60
|
from lemonade.tools.server.instructions import get_instructions_html
|
|
61
61
|
from lemonade.tools.server.port_utils import lifespan
|
|
@@ -1200,7 +1200,7 @@ class Server(ManagementTool):
|
|
|
1200
1200
|
# We will populate a LoadConfig that has all of the required fields
|
|
1201
1201
|
config_to_use: LoadConfig
|
|
1202
1202
|
|
|
1203
|
-
# First,
|
|
1203
|
+
# First, ensure that the arguments are valid
|
|
1204
1204
|
if config.model_name:
|
|
1205
1205
|
# Get the dictionary of supported model from disk
|
|
1206
1206
|
supported_models = ModelManager().supported_models
|
|
@@ -1293,7 +1293,7 @@ class Server(ManagementTool):
|
|
|
1293
1293
|
try:
|
|
1294
1294
|
if config_to_use.recipe == "llamacpp":
|
|
1295
1295
|
self.llama_server_process = llamacpp.server_load(
|
|
1296
|
-
|
|
1296
|
+
model_config=config_to_use,
|
|
1297
1297
|
model_reference=model_reference,
|
|
1298
1298
|
telemetry=self.llama_telemetry,
|
|
1299
1299
|
)
|
lemonade/tools/server/static/instructions.html
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>Lemonade Server</title>
|
|
7
|
+
<link rel="icon" href="data:,">
|
|
8
|
+
<link rel="stylesheet" href="/static/styles.css">
|
|
9
|
+
<script>
|
|
10
|
+
window.SERVER_PORT = {{SERVER_PORT}};
|
|
11
|
+
</script>
|
|
12
|
+
{{SERVER_MODELS_JS}}
|
|
13
|
+
</head>
|
|
14
|
+
<body>
|
|
15
|
+
<nav class="navbar">
|
|
16
|
+
<a href="https://github.com/lemonade-sdk/lemonade">GitHub</a>
|
|
17
|
+
<a href="https://lemonade-server.ai/docs/">Docs</a>
|
|
18
|
+
<a href="https://lemonade-server.ai/docs/server/server_models/">Models</a>
|
|
19
|
+
<a href="https://lemonade-server.ai/docs/server/apps/">Featured Apps</a>
|
|
20
|
+
</nav>
|
|
21
|
+
<main class="main">
|
|
22
|
+
<div class="title">🍋 Lemonade Server</div>
|
|
23
|
+
<div class="tab-container">
|
|
24
|
+
<div class="tabs">
|
|
25
|
+
<button class="tab active" id="tab-chat" onclick="showTab('chat')">LLM Chat</button>
|
|
26
|
+
<button class="tab" id="tab-models" onclick="showTab('models')">Model Management</button>
|
|
27
|
+
</div>
|
|
28
|
+
<div class="tab-content active" id="content-chat">
|
|
29
|
+
<div class="chat-container">
|
|
30
|
+
<div class="chat-history" id="chat-history"></div>
|
|
31
|
+
<div class="chat-input-row">
|
|
32
|
+
<select id="model-select"></select>
|
|
33
|
+
<input type="text" id="chat-input" placeholder="Type your message..." />
|
|
34
|
+
<button id="send-btn">Send</button>
|
|
35
|
+
</div>
|
|
36
|
+
</div>
|
|
37
|
+
</div>
|
|
38
|
+
<div class="tab-content" id="content-models">
|
|
39
|
+
<div class="model-mgmt-container">
|
|
40
|
+
<div class="model-mgmt-pane">
|
|
41
|
+
<h3>Installed Models</h3>
|
|
42
|
+
<table class="model-table" id="installed-models-table">
|
|
43
|
+
<colgroup><col style="width:100%"></colgroup>
|
|
44
|
+
<tbody id="installed-models-tbody"></tbody>
|
|
45
|
+
</table>
|
|
46
|
+
</div>
|
|
47
|
+
<div class="model-mgmt-pane">
|
|
48
|
+
<h3>Suggested Models</h3>
|
|
49
|
+
<table class="model-table" id="suggested-models-table">
|
|
50
|
+
<tbody id="suggested-models-tbody"></tbody>
|
|
51
|
+
</table>
|
|
52
|
+
</div>
|
|
53
|
+
</div>
|
|
54
|
+
</div>
|
|
55
|
+
</div>
|
|
56
|
+
</main>
|
|
57
|
+
<footer class="site-footer">
|
|
58
|
+
<div class="dad-joke">When life gives you LLMs, make an LLM aide.</div>
|
|
59
|
+
<div class="copyright">Copyright 2025 AMD</div>
|
|
60
|
+
</footer>
|
|
61
|
+
<script src="https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js"></script>
|
|
62
|
+
<script>
|
|
63
|
+
// Tab switching logic
|
|
64
|
+
function showTab(tab) {
|
|
65
|
+
document.getElementById('tab-chat').classList.remove('active');
|
|
66
|
+
document.getElementById('tab-models').classList.remove('active');
|
|
67
|
+
document.getElementById('content-chat').classList.remove('active');
|
|
68
|
+
document.getElementById('content-models').classList.remove('active');
|
|
69
|
+
if (tab === 'chat') {
|
|
70
|
+
document.getElementById('tab-chat').classList.add('active');
|
|
71
|
+
document.getElementById('content-chat').classList.add('active');
|
|
72
|
+
} else {
|
|
73
|
+
document.getElementById('tab-models').classList.add('active');
|
|
74
|
+
document.getElementById('content-models').classList.add('active');
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// Helper to get server base URL
|
|
79
|
+
function getServerBaseUrl() {
|
|
80
|
+
const port = window.SERVER_PORT || 8000;
|
|
81
|
+
return `http://localhost:${port}`;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Populate model dropdown from /api/v1/models endpoint
|
|
85
|
+
async function loadModels() {
|
|
86
|
+
try {
|
|
87
|
+
const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
|
|
88
|
+
const data = await resp.json();
|
|
89
|
+
const select = document.getElementById('model-select');
|
|
90
|
+
select.innerHTML = '';
|
|
91
|
+
if (!data.data || !Array.isArray(data.data)) {
|
|
92
|
+
select.innerHTML = '<option>No models found (malformed response)</option>';
|
|
93
|
+
return;
|
|
94
|
+
}
|
|
95
|
+
if (data.data.length === 0) {
|
|
96
|
+
select.innerHTML = '<option>No models available</option>';
|
|
97
|
+
return;
|
|
98
|
+
}
|
|
99
|
+
let defaultIndex = 0;
|
|
100
|
+
data.data.forEach(function(model, index) {
|
|
101
|
+
const modelId = model.id || model.name || model;
|
|
102
|
+
const opt = document.createElement('option');
|
|
103
|
+
opt.value = modelId;
|
|
104
|
+
opt.textContent = modelId;
|
|
105
|
+
if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {
|
|
106
|
+
defaultIndex = index;
|
|
107
|
+
}
|
|
108
|
+
select.appendChild(opt);
|
|
109
|
+
});
|
|
110
|
+
select.selectedIndex = defaultIndex;
|
|
111
|
+
} catch (e) {
|
|
112
|
+
const select = document.getElementById('model-select');
|
|
113
|
+
select.innerHTML = `<option>Error loading models: ${e.message}</option>`;
|
|
114
|
+
console.error('Error loading models:', e);
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
loadModels();
|
|
118
|
+
|
|
119
|
+
// Model Management Tab Logic
|
|
120
|
+
async function refreshModelMgmtUI() {
|
|
121
|
+
// Get installed models from /api/v1/models
|
|
122
|
+
let installed = [];
|
|
123
|
+
try {
|
|
124
|
+
const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
|
|
125
|
+
const data = await resp.json();
|
|
126
|
+
if (data.data && Array.isArray(data.data)) {
|
|
127
|
+
installed = data.data.map(m => m.id || m.name || m);
|
|
128
|
+
}
|
|
129
|
+
} catch (e) {}
|
|
130
|
+
// All models from server_models.json (window.SERVER_MODELS)
|
|
131
|
+
const allModels = window.SERVER_MODELS || {};
|
|
132
|
+
// Filter suggested models not installed
|
|
133
|
+
const suggested = Object.keys(allModels).filter(
|
|
134
|
+
k => allModels[k].suggested && !installed.includes(k)
|
|
135
|
+
);
|
|
136
|
+
// Render installed models as a table (two columns, second is invisible)
|
|
137
|
+
const installedTbody = document.getElementById('installed-models-tbody');
|
|
138
|
+
installedTbody.innerHTML = '';
|
|
139
|
+
installed.forEach(function(mid) {
|
|
140
|
+
var tr = document.createElement('tr');
|
|
141
|
+
var tdName = document.createElement('td');
|
|
142
|
+
tdName.textContent = mid;
|
|
143
|
+
var tdEmpty = document.createElement('td');
|
|
144
|
+
tdEmpty.style.width = '0';
|
|
145
|
+
tdEmpty.style.padding = '0';
|
|
146
|
+
tdEmpty.style.border = 'none';
|
|
147
|
+
tr.appendChild(tdName);
|
|
148
|
+
tr.appendChild(tdEmpty);
|
|
149
|
+
installedTbody.appendChild(tr);
|
|
150
|
+
});
|
|
151
|
+
// Render suggested models as a table
|
|
152
|
+
const suggestedTbody = document.getElementById('suggested-models-tbody');
|
|
153
|
+
suggestedTbody.innerHTML = '';
|
|
154
|
+
suggested.forEach(mid => {
|
|
155
|
+
const tr = document.createElement('tr');
|
|
156
|
+
const tdName = document.createElement('td');
|
|
157
|
+
tdName.textContent = mid;
|
|
158
|
+
tdName.style.paddingRight = '1em';
|
|
159
|
+
tdName.style.verticalAlign = 'middle';
|
|
160
|
+
const tdBtn = document.createElement('td');
|
|
161
|
+
tdBtn.style.width = '1%';
|
|
162
|
+
tdBtn.style.verticalAlign = 'middle';
|
|
163
|
+
const btn = document.createElement('button');
|
|
164
|
+
btn.textContent = '+';
|
|
165
|
+
btn.title = 'Install model';
|
|
166
|
+
btn.onclick = async function() {
|
|
167
|
+
btn.disabled = true;
|
|
168
|
+
btn.textContent = 'Installing...';
|
|
169
|
+
btn.classList.add('installing-btn');
|
|
170
|
+
try {
|
|
171
|
+
await fetch(getServerBaseUrl() + '/api/v1/pull', {
|
|
172
|
+
method: 'POST',
|
|
173
|
+
headers: { 'Content-Type': 'application/json' },
|
|
174
|
+
body: JSON.stringify({ model_name: mid })
|
|
175
|
+
});
|
|
176
|
+
await refreshModelMgmtUI();
|
|
177
|
+
await loadModels(); // update chat dropdown too
|
|
178
|
+
} catch (e) {
|
|
179
|
+
btn.textContent = 'Error';
|
|
180
|
+
}
|
|
181
|
+
};
|
|
182
|
+
tdBtn.appendChild(btn);
|
|
183
|
+
tr.appendChild(tdName);
|
|
184
|
+
tr.appendChild(tdBtn);
|
|
185
|
+
suggestedTbody.appendChild(tr);
|
|
186
|
+
});
|
|
187
|
+
}
|
|
188
|
+
// Initial load
|
|
189
|
+
refreshModelMgmtUI();
|
|
190
|
+
// Optionally, refresh when switching to the tab
|
|
191
|
+
document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
|
|
192
|
+
|
|
193
|
+
// Chat logic (streaming with OpenAI JS client placeholder)
|
|
194
|
+
const chatHistory = document.getElementById('chat-history');
|
|
195
|
+
const chatInput = document.getElementById('chat-input');
|
|
196
|
+
const sendBtn = document.getElementById('send-btn');
|
|
197
|
+
const modelSelect = document.getElementById('model-select');
|
|
198
|
+
let messages = [];
|
|
199
|
+
|
|
200
|
+
/**
 * Append one chat message to the chat history panel and scroll to the bottom.
 *
 * @param {string} role - Suffix used for the CSS classes of the row and its
 *     bubble ('user' and 'llm' are the values sendMessage passes).
 * @param {string} text - Message text. It is rendered as plain text, never
 *     parsed as HTML.
 */
function appendMessage(role, text) {
    const div = document.createElement('div');
    div.className = 'chat-message ' + role;
    // Add a bubble for iMessage style
    const bubble = document.createElement('div');
    bubble.className = 'chat-bubble ' + role;
    // textContent rather than innerHTML: `text` can be whatever the user typed
    // into the input box, and assigning it as markup would let typed
    // HTML/script execute in the page.
    bubble.textContent = text;
    div.appendChild(bubble);
    chatHistory.appendChild(div);
    // Keep the newest message visible.
    chatHistory.scrollTop = chatHistory.scrollHeight;
}
|
|
211
|
+
|
|
212
|
+
/**
 * Send the current chat input to the server and stream the reply into a new
 * 'llm' bubble.
 *
 * The user message is appended to the UI and to `messages` before the request
 * is made. The assistant reply is appended to `messages` only when the stream
 * finishes without an error. The send button is disabled for the duration of
 * the request and always re-enabled afterwards.
 *
 * @returns {Promise<void>} Resolves when the reply has finished streaming or
 *     an error has been shown in the bubble.
 */
async function sendMessage() {
    const text = chatInput.value.trim();
    if (!text) return;
    appendMessage('user', text);
    messages.push({ role: 'user', content: text });
    chatInput.value = '';
    sendBtn.disabled = true;

    let llmText = '';
    appendMessage('llm', '...');
    const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');

    // Handle one line of the streamed response. Lines are expected to look like
    // `data: {json}` (OpenAI-style chunks with the text in
    // choices[0].delta.content -- NOTE(review): confirm against the server).
    // Anything that is not parseable JSON (blank lines, `[DONE]`, comments) is
    // ignored.
    function consumeLine(line) {
        const trimmed = line.trim();
        const payload = trimmed.startsWith('data:') ? trimmed.slice(5).trim() : trimmed;
        if (!payload || payload === '[DONE]') return;
        let event;
        try {
            // JSON.parse decodes escapes (\n, \", \uXXXX) that a regex capture
            // would leave in the text verbatim or truncate at.
            event = JSON.parse(payload);
        } catch (parseError) {
            return;
        }
        const choice = event && event.choices && event.choices[0];
        const piece = choice && choice.delta && choice.delta.content;
        if (typeof piece === 'string' && piece) {
            llmText += piece;
            llmDiv.textContent = llmText;
        }
    }

    try {
        // Use the correct endpoint for chat completions
        const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                model: modelSelect.value,
                messages: messages,
                stream: true
            })
        });
        // fetch() only rejects on network failure; surface HTTP errors too.
        if (!resp.ok) throw new Error('HTTP ' + resp.status);
        if (!resp.body) throw new Error('No stream');

        const reader = resp.body.getReader();
        const decoder = new TextDecoder();
        llmDiv.textContent = '';

        // A network chunk can hold several events or end in the middle of one,
        // so keep the unfinished tail in `pending` until its newline arrives.
        let pending = '';
        while (true) {
            const { done, value } = await reader.read();
            if (done) break;
            // stream: true keeps multi-byte characters that straddle two
            // chunks from being decoded as replacement characters.
            pending += decoder.decode(value, { stream: true });
            const lines = pending.split('\n');
            pending = lines.pop();
            lines.forEach(consumeLine);
        }
        // Flush the decoder and process a final line with no trailing newline.
        pending += decoder.decode();
        if (pending) consumeLine(pending);

        messages.push({ role: 'assistant', content: llmText });
    } catch (e) {
        llmDiv.textContent = '[Error: ' + e.message + ']';
    } finally {
        sendBtn.disabled = false;
    }
}
|
|
256
|
+
// Send on button click, or on Enter inside the input box.
sendBtn.onclick = sendMessage;
chatInput.addEventListener('keydown', function(e) {
    // The button's disabled state does not block key events, so check it here:
    // sendMessage disables the button while a reply is in flight, and a second
    // Enter would otherwise start an overlapping request. Also skip the Enter
    // that confirms an IME composition.
    if (e.key === 'Enter' && !sendBtn.disabled && !e.isComposing) sendMessage();
});
|
|
260
|
+
</script>
|
|
261
|
+
</body>
|
|
262
|
+
</html>
|
lemonade/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "7.0.2"
|
|
1
|
+
__version__ = "7.0.3"
|
|
@@ -4,7 +4,7 @@ lemonade/cache.py,sha256=djr2qgyUUAWlQv8FehU9qlNtCwK0IZqo82hcBDyZ3-A,2850
|
|
|
4
4
|
lemonade/cli.py,sha256=ddN2QqsGMsVwydfcR7MSZu1z8_-bUgUP7dhw9lzbHa8,4424
|
|
5
5
|
lemonade/sequence.py,sha256=KSH7BPsiyDKsOsg_ziQKEGsDwMmuO_YbgPRBxkZd0pw,13267
|
|
6
6
|
lemonade/state.py,sha256=sdSezla7Cd7KYL90xY3p9kcNV4ndSyN6UvNLOr3vBMA,5261
|
|
7
|
-
lemonade/version.py,sha256=
|
|
7
|
+
lemonade/version.py,sha256=Ur-fY8dgd79WuOM208uDSw5amQiSzM7VmTbWPLQBZvw,22
|
|
8
8
|
lemonade/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
lemonade/common/analyze_model.py,sha256=sYWDznEUEWjx_Qekg7f1hHY4Pfe87IQ77lmsWqePgE0,803
|
|
10
10
|
lemonade/common/build.py,sha256=Pk86mCr6fyBIx2zXDpq0BkdahlCmWRnwSTpShA_gwZw,7849
|
|
@@ -27,17 +27,17 @@ lemonade/tools/accuracy.py,sha256=QndammQ1bmlTaF_6YDaaiJp6fpkKZDYGySdQpAgZIp8,11
|
|
|
27
27
|
lemonade/tools/adapter.py,sha256=4H6gfbjvqyU6qm1_-b2FE-c3a7N9OzEBeDVnIwqRDvg,3014
|
|
28
28
|
lemonade/tools/bench.py,sha256=aN5LMA_EH6-ZhAH3Gf26JYL7s0eKpUd3j-bReRhzvEY,10016
|
|
29
29
|
lemonade/tools/huggingface_bench.py,sha256=POE5JYzArK2FBktazOkluLNFzlLctM39B19fK5sMx-0,10017
|
|
30
|
-
lemonade/tools/huggingface_load.py,sha256=
|
|
30
|
+
lemonade/tools/huggingface_load.py,sha256=857GxaQcqmSv2DSsMh503aSicwQDQg5wGGlpwehHHrg,18868
|
|
31
31
|
lemonade/tools/humaneval.py,sha256=RCkVR-yOL56T4EyURaU3MR3yhU4NCbeuWHDyhVWZtxw,9502
|
|
32
32
|
lemonade/tools/llamacpp.py,sha256=uv-xv5KfHm0eU1I6vEKuaRC-QpilE1FffVA-zoCvHt4,8659
|
|
33
33
|
lemonade/tools/llamacpp_bench.py,sha256=tZamG-1Z5pG_bD4O4yz2mUo2AWwEgOw9RSdEDllW4HY,5941
|
|
34
34
|
lemonade/tools/management_tools.py,sha256=RO-lU-hjZhrP9KD9qcLI7MrLu-Rxnkrxzn45qqwKInE,8554
|
|
35
35
|
lemonade/tools/mmlu.py,sha256=hNa7A8dhpjOtgfd5MGcagpwpw4_AZRZvVj5Duz9LJ88,11381
|
|
36
36
|
lemonade/tools/perplexity.py,sha256=Z6ha7LS5DhdZWHZxhDz8mDnfESbTGc6TGo8KnPjRmiE,5606
|
|
37
|
-
lemonade/tools/prompt.py,sha256=
|
|
37
|
+
lemonade/tools/prompt.py,sha256=AhRdWpx5BVnuJTmCsxSCw_oKHRlTiRLmOkriXon_mLE,8629
|
|
38
38
|
lemonade/tools/tool.py,sha256=UsxVYukfm_iM3BfeGYPZxQlTK5UfDfDOl3RIyLr8A1Y,13256
|
|
39
39
|
lemonade/tools/ort_genai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
-
lemonade/tools/ort_genai/oga.py,sha256
|
|
40
|
+
lemonade/tools/ort_genai/oga.py,sha256=dZ6kbwHBVfzTujAG0ojYDhjS8uH6kwW5xZTcu20hFIc,43886
|
|
41
41
|
lemonade/tools/ort_genai/oga_bench.py,sha256=T3c40NevM3NA7CT98B6vBj1nXfdITDqpfMHYSjhjwpA,5061
|
|
42
42
|
lemonade/tools/quark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
43
|
lemonade/tools/quark/quark_load.py,sha256=QWzhXP8MehgD_KjnsmN5a-3D5kdI2XZtKTH4HoDoFoo,5572
|
|
@@ -46,23 +46,24 @@ lemonade/tools/report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
|
46
46
|
lemonade/tools/report/llm_report.py,sha256=bVHhwCINA-Ok2EdSwAsLubsc83N3KWOVuwTguw7jDcE,6676
|
|
47
47
|
lemonade/tools/report/table.py,sha256=a0TXo1X84RxCSu0un_XM3ANOlhLtPDuqtGwR7eomf2s,24853
|
|
48
48
|
lemonade/tools/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
49
|
-
lemonade/tools/server/instructions.py,sha256=
|
|
50
|
-
lemonade/tools/server/llamacpp.py,sha256=
|
|
49
|
+
lemonade/tools/server/instructions.py,sha256=PbQ8HItagIWbJLYf2IVPhthYVi1E878vNdS42qmTc3E,1230
|
|
50
|
+
lemonade/tools/server/llamacpp.py,sha256=YqUzx-TmyvWMrZfue7xURFfgTRLPGGSzNJtF9GERC_8,10184
|
|
51
51
|
lemonade/tools/server/port_utils.py,sha256=24Ryz5cNU0R9L1kuVSapZoyXTZHzhF4y0Yje9MVOrE0,1535
|
|
52
|
-
lemonade/tools/server/
|
|
53
|
-
lemonade/tools/server/serve.py,sha256=3JQa42WZdllKAf_DY-cal0Pc8vdBZd4vwsfhZmpheS8,52500
|
|
52
|
+
lemonade/tools/server/serve.py,sha256=O2ZcM1xogIRAqBE49tQ-gTFpEXExlwHOT3bYL1rZgmc,52483
|
|
54
53
|
lemonade/tools/server/thread_utils.py,sha256=pK9K_6DNWoQ78NArkAX3Ym2WsxLnCs9sKTk6TitlYnI,2804
|
|
55
54
|
lemonade/tools/server/tool_calls.py,sha256=xrAlQwKG-nv2xLlf8f9CDSaUbyMn8ZtHkds9iZLG9K8,5230
|
|
55
|
+
lemonade/tools/server/static/instructions.html,sha256=tCkc55LrI4oWQM2VYuK3_m02MvG5XxIcTbCSgxyTAIU,11257
|
|
56
56
|
lemonade/tools/server/static/styles.css,sha256=8U1EejQaqRLQ6QTCF5UG_dLPtLjRwT1menUHMDhaq2M,5045
|
|
57
57
|
lemonade_install/__init__.py,sha256=26zohKg2jgr_5y7tObduWMYQg8zCTWMZHL8lfi2zZVQ,40
|
|
58
58
|
lemonade_install/install.py,sha256=61qUO7kWCLcdjK0_IQZ46-rKP_AWkyznh4YpDclPKyM,28036
|
|
59
|
-
lemonade_sdk-7.0.
|
|
60
|
-
lemonade_sdk-7.0.
|
|
59
|
+
lemonade_sdk-7.0.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
60
|
+
lemonade_sdk-7.0.3.dist-info/licenses/NOTICE.md,sha256=B8lEqi4QE41J9ljz4Riv2JgHD1v8GCZE6nNBHO3KIA0,2135
|
|
61
61
|
lemonade_server/cli.py,sha256=DR6sIt66K1sZZG3ascEw_6HUgz3UhU9KGUyzxf4nO_A,7351
|
|
62
|
-
lemonade_server/model_manager.py,sha256
|
|
63
|
-
lemonade_server/
|
|
64
|
-
|
|
65
|
-
lemonade_sdk-7.0.
|
|
66
|
-
lemonade_sdk-7.0.
|
|
67
|
-
lemonade_sdk-7.0.
|
|
68
|
-
lemonade_sdk-7.0.
|
|
62
|
+
lemonade_server/model_manager.py,sha256=-r9JS_fPcoLCQCFKZfkInBIIgT4F1tQ_EIKqMqNYpqM,5546
|
|
63
|
+
lemonade_server/pydantic_models.py,sha256=pdOZW6nAYKWKllMLR7y5wdbIofIznxe5Vehac0Hgqto,2276
|
|
64
|
+
lemonade_server/server_models.json,sha256=3C-lJ2lsNwdy0AKT_US_lcVOoiF3xmadbiOUeOQuJXA,6927
|
|
65
|
+
lemonade_sdk-7.0.3.dist-info/METADATA,sha256=pSSPTu7kUyAh4W8lCVvxS-WAnjMT9Dsyw0r0WHcrxgA,5443
|
|
66
|
+
lemonade_sdk-7.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
67
|
+
lemonade_sdk-7.0.3.dist-info/entry_points.txt,sha256=gJppn0ETtXXR6ceKWEIRdk42kMC7ps59EmU3NCPyPUk,144
|
|
68
|
+
lemonade_sdk-7.0.3.dist-info/top_level.txt,sha256=10ap5GNiPhalO4V50LRoxA1FqRT9g3Xkia6BITu880k,42
|
|
69
|
+
lemonade_sdk-7.0.3.dist-info/RECORD,,
|
lemonade_server/model_manager.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import os
|
|
3
3
|
import huggingface_hub
|
|
4
|
-
import
|
|
4
|
+
from importlib.metadata import distributions
|
|
5
|
+
from lemonade_server.pydantic_models import LoadConfig
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class ModelManager:
|
|
@@ -64,16 +65,45 @@ class ModelManager:
|
|
|
64
65
|
"""
|
|
65
66
|
return self.filter_models_by_backend(self.downloaded_models)
|
|
66
67
|
|
|
67
|
-
def download_gguf(self,
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
68
|
+
def download_gguf(self, model_config: LoadConfig) -> dict:
    """
    Downloads the GGUF file(s) for the given model configuration.

    Args:
        model_config: Configuration whose ``checkpoint`` has the form
            ``"<repo_id>:<variant>"`` and whose optional ``mmproj`` names
            a multimodal projector file in the same repository.

    Returns:
        A dict mapping ``"variant"`` (and ``"mmproj"`` when configured) to
        the path of the corresponding file inside the downloaded snapshot
        folder.

    Raises:
        ValueError: If the checkpoint is not of the form
            ``"<repo_id>:<variant>"``, or if an expected file is not
            present in the snapshot folder after the download.
    """

    # Validate up front so a malformed checkpoint produces an actionable
    # message instead of a bare tuple-unpacking error.
    parts = model_config.checkpoint.split(":")
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"GGUF checkpoint {model_config.checkpoint!r} must have the form "
            "'<repo_id>:<variant>'"
        )
    checkpoint, variant = parts

    # The variant parameter can be either:
    # 1. A full GGUF filename (e.g. "model-Q4_0.gguf")
    # 2. A quantization variant (e.g. "Q4_0")
    # This code handles both cases by constructing the appropriate filename
    hf_base_name = checkpoint.split("/")[-1].replace("-GGUF", "")
    variant_name = (
        variant if variant.endswith(".gguf") else f"{hf_base_name}-{variant}.gguf"
    )

    # If there is a mmproj file, add it to the patterns
    expected_files = {"variant": variant_name}
    if model_config.mmproj:
        expected_files["mmproj"] = model_config.mmproj

    # Download the files
    snapshot_folder = huggingface_hub.snapshot_download(
        repo_id=checkpoint,
        allow_patterns=list(expected_files.values()),
    )

    # Ensure we downloaded all expected files while creating a dict of the
    # downloaded files. Checking the joined path (rather than the top-level
    # directory listing) also works for files that live in a subfolder and
    # avoids re-listing the directory for every file.
    snapshot_files = {}
    for key, filename in expected_files.items():
        file_path = os.path.join(snapshot_folder, filename)
        if not os.path.exists(file_path):
            raise ValueError(
                f"Hugging Face snapshot download for {model_config.checkpoint} "
                f"expected file {filename} not found in {snapshot_folder}"
            )
        snapshot_files[key] = file_path

    # Return a dict that points to the snapshot path of the downloaded GGUF files
    return snapshot_files
|
|
106
|
+
|
|
77
107
|
def download_models(self, models: list[str]):
|
|
78
108
|
"""
|
|
79
109
|
Downloads the specified models from Hugging Face.
|
|
@@ -88,7 +118,8 @@ class ModelManager:
|
|
|
88
118
|
print(f"Downloading {model} ({checkpoint})")
|
|
89
119
|
|
|
90
120
|
if "gguf" in checkpoint.lower():
|
|
91
|
-
self.
|
|
121
|
+
model_config = LoadConfig(**self.supported_models[model])
|
|
122
|
+
self.download_gguf(model_config)
|
|
92
123
|
else:
|
|
93
124
|
huggingface_hub.snapshot_download(repo_id=checkpoint)
|
|
94
125
|
|
|
@@ -97,9 +128,11 @@ class ModelManager:
|
|
|
97
128
|
Returns a filtered dict of models that are enabled by the
|
|
98
129
|
current environment.
|
|
99
130
|
"""
|
|
131
|
+
installed_packages = {dist.metadata["Name"].lower() for dist in distributions()}
|
|
132
|
+
|
|
100
133
|
hybrid_installed = (
|
|
101
|
-
"onnxruntime-vitisai" in
|
|
102
|
-
and "onnxruntime-genai-directml-ryzenai" in
|
|
134
|
+
"onnxruntime-vitisai" in installed_packages
|
|
135
|
+
and "onnxruntime-genai-directml-ryzenai" in installed_packages
|
|
103
136
|
)
|
|
104
137
|
filtered = {}
|
|
105
138
|
for model, value in models.items():
|
|
@@ -24,6 +24,8 @@ class LoadConfig(BaseModel):
|
|
|
24
24
|
max_prompt_length: Optional[int] = None
|
|
25
25
|
# Indicates whether the model is a reasoning model, like DeepSeek
|
|
26
26
|
reasoning: Optional[bool] = False
|
|
27
|
+
# Indicates which Multimodal Projector (mmproj) file to use
|
|
28
|
+
mmproj: Optional[str] = None
|
|
27
29
|
|
|
28
30
|
|
|
29
31
|
class CompletionRequest(BaseModel):
|
|
@@ -187,5 +187,19 @@
|
|
|
187
187
|
"recipe": "llamacpp",
|
|
188
188
|
"reasoning": true,
|
|
189
189
|
"suggested": true
|
|
190
|
+
},
|
|
191
|
+
"Gemma-3-4b-it-GGUF": {
|
|
192
|
+
"checkpoint": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
|
|
193
|
+
"mmproj": "mmproj-model-f16.gguf",
|
|
194
|
+
"recipe": "llamacpp",
|
|
195
|
+
"reasoning": false,
|
|
196
|
+
"suggested": true
|
|
197
|
+
},
|
|
198
|
+
"Qwen2.5-VL-7B-Instruct": {
|
|
199
|
+
"checkpoint": "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M",
|
|
200
|
+
"mmproj": "mmproj-Qwen2.5-VL-7B-Instruct-f16.gguf",
|
|
201
|
+
"recipe": "llamacpp",
|
|
202
|
+
"reasoning": false,
|
|
203
|
+
"suggested": true
|
|
190
204
|
}
|
|
191
205
|
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|