lemonade_sdk-8.0.4-py3-none-any.whl → lemonade_sdk-8.0.6-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- lemonade/api.py +50 -0
- lemonade/cache.py +3 -1
- lemonade/common/inference_engines.py +415 -0
- lemonade/common/system_info.py +493 -47
- lemonade/tools/adapter.py +6 -0
- lemonade/tools/huggingface/utils.py +6 -5
- lemonade/tools/llamacpp/bench.py +26 -46
- lemonade/tools/llamacpp/load.py +104 -196
- lemonade/tools/llamacpp/utils.py +612 -0
- lemonade/tools/management_tools.py +53 -7
- lemonade/tools/oga/bench.py +5 -6
- lemonade/tools/oga/utils.py +8 -2
- lemonade/tools/prompt.py +17 -25
- lemonade/tools/report/table.py +12 -9
- lemonade/tools/server/llamacpp.py +80 -92
- lemonade/tools/server/serve.py +32 -0
- lemonade/tools/server/static/styles.css +137 -58
- lemonade/tools/server/static/webapp.html +34 -8
- lemonade/tools/server/tray.py +7 -0
- lemonade/version.py +1 -1
- lemonade_sdk-8.0.6.dist-info/METADATA +295 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/RECORD +30 -28
- lemonade_server/cli.py +168 -22
- lemonade_server/model_manager.py +4 -148
- lemonade_server/server_models.json +11 -0
- lemonade_sdk-8.0.4.dist-info/METADATA +0 -176
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/top_level.txt +0 -0
lemonade/tools/management_tools.py CHANGED

```diff
@@ -1,12 +1,17 @@
 import argparse
 import abc
+import json
 from typing import List
 import lemonade.common.filesystem as fs
 import lemonade.common.exceptions as exp
 import lemonade.common.printing as printing
 from lemonade.tools.tool import ToolParser
 from lemonade.version import __version__ as lemonade_version
-from lemonade.common.system_info import get_system_info_dict
+from lemonade.common.system_info import (
+    get_system_info_dict,
+    get_device_info_dict,
+    get_system_info,
+)
 from lemonade.common.build import output_dir
 import lemonade.cache as lemonade_cache
 
@@ -245,28 +250,69 @@ class SystemInfo(ManagementTool):
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Print system information",
+            short_description="Print system and device information",
             add_help=add_help,
         )
 
+        parser.add_argument(
+            "--format", choices=["table", "json"], default="table", help="Output format"
+        )
+
+        parser.add_argument(
+            "--verbose",
+            action="store_true",
+            help="Show detailed system information",
+        )
+
         return parser
 
     @staticmethod
     def pretty_print(my_dict: dict, level=0):
         for k, v in my_dict.items():
+            if k == "available" and v is True:
+                continue
+
             if isinstance(v, dict):
-                print(" " * level + f"{k}:")
-                SystemInfo.pretty_print(v, level + 1)
+                # Special handling for device availability
+                if v.get("available") is False:
+                    error_msg = v.get("error", "Not available")
+                    print(" " * level + f"{k}: {error_msg}")
+                else:
+                    print(" " * level + f"{k}:")
+                    SystemInfo.pretty_print(v, level + 1)
            elif isinstance(v, list):
                 print(" " * level + f"{k}:")
                 for item in v:
-                    print(" " * (level + 1) + f"{item}")
+                    if isinstance(item, dict):
+                        SystemInfo.pretty_print(item, level + 1)
+                        print()
+                    else:
+                        print(" " * (level + 1) + f"{item}")
             else:
                 print(" " * level + f"{k}: {v}")
 
-    def run(self, _):
+    def run(self, _, format="table", verbose=False):
+        # Get basic system info
         system_info_dict = get_system_info_dict()
-        self.pretty_print(system_info_dict)
+
+        # Always include devices
+        system_info_dict["Devices"] = get_device_info_dict()
+
+        # Filter out verbose-only information if not in verbose mode
+        if not verbose:
+            essential_keys = ["OS Version", "Processor", "Physical Memory", "Devices"]
+            system_info_dict = {
+                k: v for k, v in system_info_dict.items() if k in essential_keys
+            }
+        else:
+            # In verbose mode, add Python packages at the end
+            system_info = get_system_info()
+            system_info_dict["Python Packages"] = system_info.get_python_packages()
+
+        if format == "json":
+            print(json.dumps(system_info_dict, indent=2))
+        else:
+            self.pretty_print(system_info_dict)
 
 
 # This file was originally licensed under Apache 2.0. It has been modified.
```
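To make the availability handling concrete, here is a small sketch of the new pretty_print on a hypothetical device dict (keys and device names are illustrative, not lemonade's actual schema): entries with `available: True` print normally with the flag suppressed, while unavailable devices collapse to their error string.

```python
from lemonade.tools.management_tools import SystemInfo

sample = {
    "Devices": {
        "cpu": {"available": True, "name": "Ryzen AI 9 HX 370"},
        "npu": {"available": False, "error": "Driver not installed"},
    }
}
SystemInfo.pretty_print(sample)
# Devices:
#  cpu:
#   name: Ryzen AI 9 HX 370
#  npu: Driver not installed
```

With the new flags, the same dictionary should also be obtainable as machine-readable output, e.g. `lemonade system-info --format json --verbose` (assuming the tool keeps its usual CLI registration).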
lemonade/tools/oga/bench.py CHANGED

```diff
@@ -74,12 +74,12 @@ class OgaBench(Bench):
 
         # Don't capture time for warmup
         for count in range(warmup_iterations):
-            outputs = model.generate(input_ids, max_new_tokens=output_tokens)
-            self.tokens_out_len_list.append(len(outputs[0]) - len(input_ids))
+            _ = model.generate(input_ids, max_new_tokens=output_tokens)
+            self.tokens_out_len_list.append(model.response_tokens)
             report_progress_fn((count + 1) / (warmup_iterations + iterations))
 
         for count in range(iterations):
-            outputs = model.generate(
+            _ = model.generate(
                 input_ids,
                 max_new_tokens=output_tokens,
                 min_new_tokens=output_tokens,
@@ -88,11 +88,10 @@ class OgaBench(Bench):
                 (warmup_iterations + count + 1) / (warmup_iterations + iterations)
             )
 
-            token_len = len(outputs[0]) - len(input_ids)
-            self.tokens_out_len_list.append(token_len)
+            self.tokens_out_len_list.append(model.response_tokens)
 
             # Only count an iteration if it produced enough tokens
-            if token_len >= output_tokens:
+            if model.response_tokens >= output_tokens:
                 per_iteration_time_to_first_token.append(model.time_to_first_token)
                 per_iteration_tokens_per_second.append(model.tokens_per_second)
 
```
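The bench loop now trusts the adapter's own token accounting instead of re-deriving output lengths from the returned sequences. A minimal sketch of the consumer side of that contract; the adapter stub below is hypothetical (real runs use OrtGenaiModel), but it shows the side-effect pattern the loop relies on.

```python
class FakeAdapter:
    """Stands in for OrtGenaiModel: generate() sets token counters as a side effect."""

    def generate(self, input_ids, max_new_tokens=32, min_new_tokens=0):
        self.response_tokens = max_new_tokens  # the real adapter counts actual tokens
        self.tokens_per_second = 42.0
        return [list(input_ids) + [0] * max_new_tokens]

model = FakeAdapter()
output_tokens = 32
per_iteration_tokens_per_second = []

_ = model.generate([1, 2, 3], max_new_tokens=output_tokens)
# Only count an iteration if it produced enough tokens
if model.response_tokens >= output_tokens:
    per_iteration_tokens_per_second.append(model.tokens_per_second)

print(per_iteration_tokens_per_second)  # [42.0]
```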
lemonade/tools/oga/utils.py CHANGED

```diff
@@ -99,13 +99,16 @@ class OrtGenaiModel(ModelAdapter):
     ):
         params = og.GeneratorParams(self.model)
 
+        # OGA models return a list of tokens (older versions) or 1d numpy array (newer versions)
         prompt_length = len(input_ids)
+
         max_prompt_length = self.config.get("max_prompt_length")
         if max_prompt_length and prompt_length > max_prompt_length:
             raise ValueError(
                 f"This prompt (length {prompt_length}) exceeds the model's "
                 f"maximum allowed prompt length ({max_prompt_length})."
             )
+        self.prompt_tokens = prompt_length
 
         # There is a breaking API change in OGA 0.6.0
         # Determine whether we should use the old or new APIs
@@ -206,18 +209,21 @@ class OrtGenaiModel(ModelAdapter):
             )
             self.tokens_per_second = 1 / avg_token_gen_latency_s
 
-            return [generator.get_sequence(0)]
+            response = generator.get_sequence(0)
+            self.response_tokens = len(response) - self.prompt_tokens
+            return [response]
         else:
             if use_oga_post_6_api:
                 generator.append_tokens(input_ids)
             tokenizer_stream = streamer.tokenizer.tokenizer.create_stream()
-
+            self.response_tokens = 0
             stop_early = False
 
             while not generator.is_done() and not stop_early:
                 if use_oga_pre_6_api:
                     generator.compute_logits()
                 generator.generate_next_token()
+                self.response_tokens += 1
 
                 new_token = generator.get_next_tokens()[0]
                 new_text = tokenizer_stream.decode(new_token)
```
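Both code paths above reduce to the same bookkeeping: prompt_tokens is fixed on entry to generate(), and response_tokens is either derived from the final sequence length or incremented once per streamed token. A toy illustration of the arithmetic (the token values are made up):

```python
prompt_tokens = 4                      # set from len(input_ids) at entry
full_sequence = [1, 15, 7, 9, 42, 43]  # generator.get_sequence(0): prompt followed
                                       # by the generated tokens

# Non-streaming path: subtract the prompt from the full sequence length
response_tokens = len(full_sequence) - prompt_tokens
print(response_tokens)  # 2

# Streaming path: the counter advances once per generate_next_token() call
response_tokens = 0
for _ in full_sequence[prompt_tokens:]:
    response_tokens += 1
print(response_tokens)  # 2
```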
lemonade/tools/prompt.py CHANGED

```diff
@@ -161,7 +161,11 @@ class LLMPrompt(Tool):
         # If template flag is set, then wrap prompt in template
         if template:
             # Embed prompt in model's chat template
-            if tokenizer.chat_template:
+            if not hasattr(tokenizer, "prompt_template"):
+                printing.log_warning(
+                    "Templates for this model type are not yet implemented."
+                )
+            elif tokenizer.chat_template:
                 # Use the model's built-in chat template if available
                 messages_dict = [{"role": "user", "content": prompt}]
                 prompt = tokenizer.apply_chat_template(
@@ -175,25 +179,10 @@ class LLMPrompt(Tool):
             state.save_stat(Keys.PROMPT_TEMPLATE, "Default")
 
         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-        if isinstance(input_ids, (list, str)):
-            # OGA models return a list of tokens (older versions)
-            # Our llama.cpp adapter returns a string
-            len_tokens_in = len(input_ids)
-        elif hasattr(input_ids, "shape"):
-            # HF models return a 2-D tensor
-            # OGA models with newer versions may return numpy arrays
-            if len(input_ids.shape) == 1:
-                # 1-D array from newer OGA versions
-                len_tokens_in = len(input_ids)
-            else:
-                # 2-D tensor from HF models
-                len_tokens_in = input_ids.shape[1]
-        else:
-            # Fallback: try to get length directly
-            len_tokens_in = len(input_ids)
 
         len_tokens_out = []
         response_texts = []
+        prompt_tokens = None  # will be determined in generate function
         for trial in range(n_trials):
             if n_trials > 1:
                 self.set_percent_progress(100.0 * trial / n_trials)
@@ -222,19 +211,22 @@ class LLMPrompt(Tool):
 
             response_array = response if isinstance(response, str) else response[0]
 
-            token_len = len(response_array) - len_tokens_in
-            len_tokens_out.append(token_len)
+            prompt_tokens = model.prompt_tokens
+            len_tokens_out.append(model.response_tokens)
 
-            counter = 0
+            # Remove the input from the response
+            # (up to the point they diverge, which they should not)
+            counter = 0
+            len_input_ids = len(input_ids_array)
             while (
-                counter < len_tokens_in
-                and input_ids_array[counter] == response_array[counter]
+                counter < len_input_ids
+                and input_ids_array[counter] == response_array[counter]
             ):
-                counter += 1
+                counter += 1
 
             # Only decode the actual response (not the prompt)
             response_text = tokenizer.decode(
-                response_array[len_tokens_in:], skip_special_tokens=True
+                response_array[counter:], skip_special_tokens=True
             ).strip()
             response_texts.append(response_text)
 
@@ -259,7 +251,7 @@ class LLMPrompt(Tool):
             plt.savefig(figure_path)
             state.save_stat(Keys.RESPONSE_LENGTHS_HISTOGRAM, figure_path)
 
-        state.save_stat(Keys.PROMPT_TOKENS, len_tokens_in)
+        state.save_stat(Keys.PROMPT_TOKENS, prompt_tokens)
         state.save_stat(Keys.PROMPT, prompt)
         state.save_stat(Keys.RESPONSE_TOKENS, len_tokens_out)
         state.save_stat(Keys.RESPONSE, sanitize_text(response_texts))
```
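A toy run of the divergence loop above: both arrays are walked in lockstep until they differ, and only the tail is decoded. The token values here are arbitrary.

```python
input_ids_array = [1, 15, 7, 9]
response_array = [1, 15, 7, 9, 42, 43, 44]  # prompt echoed back, then new tokens

counter = 0
len_input_ids = len(input_ids_array)
while (
    counter < len_input_ids
    and input_ids_array[counter] == response_array[counter]
):
    counter += 1

print(response_array[counter:])  # [42, 43, 44] -- only the generated tokens
```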
lemonade/tools/report/table.py CHANGED

```diff
@@ -758,15 +758,18 @@ class LemonadePerfTable(Table):
             data[key] = model_stats.get(key, "")
 
         # Create a new entry with Driver Versions and relevant Python Packages
-        sw_versions = [
-            key + ": " + value
-            for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
-        ]
-        sw_versions += [
-            pkg
-            for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
-            if any(name in pkg for name in PYTHON_PACKAGES)
-        ]
+        sw_versions = []
+        if "Driver Versions" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                key + ": " + value
+                for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
+            ]
+        if "Python Packages" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                pkg
+                for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
+                if any(name in pkg for name in PYTHON_PACKAGES)
+            ]
         if isinstance(data[Keys.RYZEN_AI_VERSION_INFO], dict):
             sw_versions += [
                 "Ryzen AI: " + value
```
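The point of the new guards is that older report files may lack "Driver Versions" or "Python Packages" entirely, where the old unconditional comprehensions would raise KeyError. A self-contained sketch (PYTHON_PACKAGES stands in for the module-level filter list, and the sample data is illustrative):

```python
PYTHON_PACKAGES = ["onnxruntime", "transformers"]  # stand-in for the real list
system_info = {"Python Packages": ["onnxruntime 1.18.0", "numpy 1.26.4"]}
# Note: no "Driver Versions" key at all

sw_versions = []
if "Driver Versions" in system_info:  # safely skipped here
    sw_versions += [f"{k}: {v}" for k, v in system_info["Driver Versions"].items()]
if "Python Packages" in system_info:
    sw_versions += [
        pkg for pkg in system_info["Python Packages"]
        if any(name in pkg for name in PYTHON_PACKAGES)
    ]

print(sw_versions)  # ['onnxruntime 1.18.0']
```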
lemonade/tools/server/llamacpp.py CHANGED

```diff
@@ -1,13 +1,11 @@
-import sys
 import os
+import sys
 import logging
 import time
 import subprocess
-import zipfile
 import re
 import threading
 import platform
-import shutil
 
 import requests
 from tabulate import tabulate
@@ -18,12 +16,18 @@ from openai import OpenAI
 
 from lemonade_server.pydantic_models import (
     ChatCompletionRequest,
+    CompletionRequest,
     PullConfig,
     EmbeddingsRequest,
     RerankingRequest,
 )
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.server.utils.port import find_free_port
+from lemonade.tools.llamacpp.utils import (
+    get_llama_server_exe_path,
+    install_llamacpp,
+    download_gguf,
+)
 
 LLAMA_VERSION = "b5787"
 
@@ -80,39 +84,6 @@ def get_binary_url_and_filename(version):
     return url, filename
 
 
-def validate_platform_support():
-    """
-    Validate platform support before attempting download
-    """
-    system = platform.system().lower()
-
-    if system not in ["windows", "linux"]:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=(
-                f"Platform {system} not supported for llamacpp. "
-                "Supported: Windows, Ubuntu Linux"
-            ),
-        )
-
-    if system == "linux":
-        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
-        try:
-            with open("/etc/os-release", "r", encoding="utf-8") as f:
-                os_info = f.read().lower()
-            if "ubuntu" not in os_info and "debian" not in os_info:
-                logging.warning(
-                    "llamacpp binaries are built for Ubuntu. "
-                    "Compatibility with other Linux distributions is not guaranteed."
-                )
-        except (FileNotFoundError, PermissionError, OSError) as e:
-            logging.warning(
-                "Could not determine Linux distribution (%s). "
-                "llamacpp binaries are built for Ubuntu.",
-                str(e),
-            )
-
-
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
@@ -283,7 +254,7 @@ def _launch_llama_subprocess(
     """
 
     # Get the current executable path (handles both Windows and Ubuntu structures)
-    _, exe_path = get_llama_server_paths()
+    exe_path = get_llama_server_exe_path()
 
     # Build the base command
     base_command = [exe_path, "-m", snapshot_files["variant"]]
@@ -350,68 +321,23 @@ def _launch_llama_subprocess(
 
 
 def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
-    # Validate platform support before attempting download
-    validate_platform_support()
+    # Install and/or update llama.cpp if needed
+    try:
+        install_llamacpp()
+    except NotImplementedError as e:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
+        )
 
     # Get platform-specific paths at runtime
-    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
-
-    # Check whether the llamacpp install needs an upgrade
-    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
-    if os.path.exists(version_txt_path):
-        with open(version_txt_path, "r", encoding="utf-8") as f:
-            llamacpp_installed_version = f.read()
-
-        if llamacpp_installed_version != LLAMA_VERSION:
-            # Remove the existing install, which will trigger a new install
-            # in the next code block
-            shutil.rmtree(llama_server_exe_dir)
-
-    # Download llama.cpp server if it isn't already available
-    if not os.path.exists(llama_server_exe_dir):
-        # Download llama.cpp server zip
-        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
-        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
-        logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
-
-        with requests.get(llama_zip_url, stream=True) as r:
-            r.raise_for_status()
-            with open(llama_zip_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-
-        # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
-        with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(llama_server_exe_dir)
-
-        # Make executable on Linux - need to update paths after extraction
-        if platform.system().lower() == "linux":
-            # Re-get the paths since extraction might have changed the directory structure
-            _, updated_exe_path = get_llama_server_paths()
-            if os.path.exists(updated_exe_path):
-                os.chmod(updated_exe_path, 0o755)
-                logging.info(f"Set executable permissions for {updated_exe_path}")
-            else:
-                logging.warning(
-                    f"Could not find llama-server executable at {updated_exe_path}"
-                )
-
-        # Save version.txt
-        with open(version_txt_path, "w", encoding="utf-8") as vf:
-            vf.write(LLAMA_VERSION)
-
-        # Delete zip file
-        os.remove(llama_zip_path)
-        logging.info("Cleaned up zip file")
+    llama_server_exe_path = get_llama_server_exe_path()
 
     # Download the gguf to the hugging face cache
-    model_manager = ModelManager()
-    snapshot_files = model_manager.download_gguf(model_config)
+    snapshot_files = download_gguf(model_config.checkpoint, model_config.mmproj)
     logging.debug(f"GGUF file paths: {snapshot_files}")
 
     # Check if model supports embeddings
-    supported_models = model_manager.supported_models
+    supported_models = ModelManager().supported_models
     model_info = supported_models.get(model_config.model_name, {})
     supports_embeddings = "embeddings" in model_info.get("labels", [])
     supports_reranking = "reranking" in model_info.get("labels", [])
```
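server_load now delegates binary management and GGUF fetching to the new lemonade.tools.llamacpp.utils module (added in this release with +612 lines). A sketch of that flow in isolation, under stated assumptions: the download_gguf signature is taken from the call site above, the checkpoint string is illustrative, and install_llamacpp() is assumed to be idempotent when the pinned build is already present.

```python
from lemonade.tools.llamacpp.utils import (
    install_llamacpp,
    get_llama_server_exe_path,
    download_gguf,
)

install_llamacpp()  # assumed no-op if the pinned llama.cpp build is installed
exe_path = get_llama_server_exe_path()

# Signature taken from the call site above: (checkpoint, mmproj)
snapshot_files = download_gguf("unsloth/Qwen3-0.6B-GGUF:Q4_0", None)

print(exe_path)
print(snapshot_files["variant"])  # the path passed to llama-server via -m
```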
```diff
@@ -523,6 +449,68 @@ def chat_completion(
     )
 
 
+def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry):
+    """
+    Handle text completions using the llamacpp server.
+
+    Args:
+        completion_request: The completion request containing prompt and parameters
+        telemetry: Telemetry object containing the server port
+
+    Returns:
+        Completion response from the llamacpp server
+    """
+    base_url = llamacpp_address(telemetry.port)
+    client = OpenAI(
+        base_url=base_url,
+        api_key="lemonade",
+    )
+
+    # Convert Pydantic model to dict and remove unset/null values
+    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
+
+    # Check if streaming is requested
+    if completion_request.stream:
+
+        def event_stream():
+            try:
+                # Enable streaming
+                for chunk in client.completions.create(**request_dict):
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                yield "data: [DONE]\n\n"
+
+                # Show telemetry after completion
+                telemetry.show_telemetry()
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+        return StreamingResponse(
+            event_stream(),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+    else:
+        # Non-streaming response
+        try:
+            # Disable streaming for non-streaming requests
+            response = client.completions.create(**request_dict)
+
+            # Show telemetry after completion
+            telemetry.show_telemetry()
+
+            return response
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Completion error: {str(e)}",
+            )
+
+
 def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
     """
     Generate embeddings using the llamacpp server.
```
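How a client might exercise the new completions path end-to-end once a llamacpp model is loaded. The port, API prefix, and model name below are illustrative; adjust them to your server's configuration.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")

stream = client.completions.create(
    model="Qwen2.5-0.5B-Instruct-GGUF",  # hypothetical llamacpp model name
    prompt="The capital of France is",
    max_tokens=16,
    stream=True,  # exercises the SSE event_stream() branch above
)
for chunk in stream:
    print(chunk.choices[0].text, end="", flush=True)
```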
lemonade/tools/server/serve.py CHANGED

```diff
@@ -228,6 +228,7 @@ class Server(ManagementTool):
         self.app.get(f"{prefix}/health")(self.health)
         self.app.get(f"{prefix}/halt")(self.halt_generation)
         self.app.get(f"{prefix}/stats")(self.send_stats)
+        self.app.get(f"{prefix}/system-info")(self.get_system_info)
         self.app.post(f"{prefix}/completions")(self.completions)
         self.app.post(f"{prefix}/responses")(self.responses)
 
@@ -486,6 +487,9 @@
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)
 
+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.completion(completion_request, self.llama_telemetry)
+
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
 
@@ -1276,6 +1280,34 @@
             ),
         }
 
+    async def get_system_info(self, request: Request):
+        """
+        Return system and device enumeration information.
+        Supports optional 'verbose' query parameter.
+        """
+        from lemonade.common.system_info import (
+            get_system_info_dict,
+            get_device_info_dict,
+            get_system_info as get_system_info_obj,
+        )
+
+        # Get verbose parameter from query string (default to False)
+        verbose = request.query_params.get("verbose", "false").lower() in ["true", "1"]
+
+        info = get_system_info_dict()
+        info["devices"] = get_device_info_dict()
+
+        # Filter out verbose-only information if not in verbose mode
+        if not verbose:
+            essential_keys = ["OS Version", "Processor", "Physical Memory", "devices"]
+            info = {k: v for k, v in info.items() if k in essential_keys}
+        else:
+            # In verbose mode, add Python packages at the end
+            system_info_obj = get_system_info_obj()
+            info["Python Packages"] = system_info_obj.get_python_packages()
+
+        return info
+
     def model_load_failure(self, model_reference: str, message: Optional[str] = None):
         """
         Clean up after a model load failure, then log it and raise
```