lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/cache.py +6 -1
- lemonade/cli.py +47 -5
- lemonade/common/inference_engines.py +13 -4
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +544 -1
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +303 -0
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +393 -33
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +60 -121
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +220 -553
- lemonade/tools/server/serve.py +684 -168
- lemonade/tools/server/static/js/chat.js +666 -342
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +597 -73
- lemonade/tools/server/static/js/shared.js +79 -14
- lemonade/tools/server/static/logs.html +191 -0
- lemonade/tools/server/static/styles.css +491 -66
- lemonade/tools/server/static/webapp.html +83 -31
- lemonade/tools/server/tray.py +158 -38
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
- lemonade/tools/server/webapp.py +4 -1
- lemonade/tools/server/wrapped_server.py +559 -0
- lemonade/version.py +1 -1
- lemonade_install/install.py +54 -611
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
- lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
- lemonade_server/cli.py +145 -37
- lemonade_server/model_manager.py +521 -37
- lemonade_server/pydantic_models.py +28 -1
- lemonade_server/server_models.json +246 -92
- lemonade_server/settings.py +39 -39
- lemonade/tools/quark/__init__.py +0 -0
- lemonade/tools/quark/quark_load.py +0 -173
- lemonade/tools/quark/quark_quantize.py +0 -439
- lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
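The headline change in this release is a rework of the llama.cpp server integration. As the lemonade/tools/server/llamacpp.py diff below shows, the 8.1.4 module-level helpers (_separate_openai_params, _launch_llama_subprocess, server_load, chat_completion, embeddings, reranking) are removed, and the module now defines a LlamaServer class built on the new lemonade/tools/server/wrapped_server.py, with LlamaTelemetry subclassing WrappedServerTelemetry. A minimal usage sketch, assuming only the interface visible in the diff; the real call sites live in serve.py and wrapped_server.py (not shown here), and the checkpoint string is made up:

# Hypothetical driver code, for illustration only: the class and method names come
# from the diff below, but this call sequence and the checkpoint string are assumptions.
from lemonade.tools.server.llamacpp import LlamaServer

server = LlamaServer(backend="vulkan")   # backend string, e.g. "vulkan" or "rocm"
server.install_server()                  # wraps install_llamacpp(self.backend)
snapshot_files = server.download_model(
    "unsloth/Qwen3-0.6B-GGUF:Q4_0"       # checks the local GGUF cache first, then download_gguf()
)
# serve.py would then call server._launch_server_subprocess(model_config, snapshot_files,
# ctx_size, ...), which attempts a GPU launch and falls back to CPU if the process exits.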
lemonade/tools/server/llamacpp.py

@@ -1,148 +1,54 @@
 import os
 import logging
-import time
 import subprocess
 import re
 import threading
 import platform

-import requests
-from tabulate import tabulate
 from dotenv import load_dotenv
 from fastapi import HTTPException, status
-from fastapi.responses import StreamingResponse
-
-from openai import OpenAI

 from lemonade_server.pydantic_models import (
-    ChatCompletionRequest,
-    CompletionRequest,
     PullConfig,
-    EmbeddingsRequest,
-    RerankingRequest,
 )
-from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.utils.port import find_free_port
 from lemonade.tools.llamacpp.utils import (
     get_llama_server_exe_path,
     install_llamacpp,
     download_gguf,
+    resolve_local_gguf_model,
+    parse_checkpoint,
 )
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer

-
-
-
-
-
-    Args:
-        port: The port number the llamacpp server is running on
-
-    Returns:
-        The base URL for the llamacpp server
-    """
-    return f"http://127.0.0.1:{port}/v1"
+# Embedding model batch configuration set to 8192 as default
+EMBEDDING_CTX_SIZE = 8192
+EMBEDDING_BATCH_SIZE = 8192
+EMBEDDING_UBATCH_SIZE = 8192


-
-    """
-    Separate standard OpenAI parameters from custom llama.cpp parameters.
-
-    Args:
-        request_dict: Dictionary of all request parameters
-        endpoint_type: Type of endpoint ("chat" or "completion")
-
-    Returns:
-        Dictionary with parameters properly separated for OpenAI client
-    """
-    openai_client_params = {}
-    extra_params = {}
-
-    # Common OpenAI parameters for both endpoint types
-    common_params = {
-        "model",
-        "frequency_penalty",
-        "logit_bias",
-        "logprobs",
-        "max_tokens",
-        "n",
-        "presence_penalty",
-        "seed",
-        "stop",
-        "stream",
-        "temperature",
-        "top_p",
-        "user",
-    }
-
-    # Standard OpenAI parameters by endpoint type
-    if endpoint_type == "chat":
-        chat_specific_params = {
-            "messages",
-            "top_logprobs",
-            "response_format",
-            "service_tier",
-            "stream_options",
-            "tools",
-            "tool_choice",
-            "parallel_tool_calls",
-        }
-        openai_params = common_params | chat_specific_params
-    else: # completion
-        completion_specific_params = {
-            "prompt",
-            "best_of",
-            "echo",
-            "suffix",
-        }
-        openai_params = common_params | completion_specific_params
-
-    for key, value in request_dict.items():
-        if key in openai_params:
-            openai_client_params[key] = value
-        else:
-            extra_params[key] = value
-
-    # If there are custom parameters, use extra_body to pass them through
-    if extra_params:
-        openai_client_params["extra_body"] = extra_params
-
-    return openai_client_params
-
-
-class LlamaTelemetry:
+class LlamaTelemetry(WrappedServerTelemetry):
     """
     Manages telemetry data collection and display for llama server.
     """

-    def __init__(self):
-        self.input_tokens = None
-        self.output_tokens = None
-        self.time_to_first_token = None
-        self.tokens_per_second = None
-        self.prompt_eval_time = None
-        self.eval_time = None
-        self.port = None
-
-    def choose_port(self):
+    def parse_telemetry_line(self, line: str):
         """
-
-        search for an empty port
+        Parse telemetry data from llama server output lines.
         """

-
-
-
-
+        if "vk::PhysicalDevice::createDevice: ErrorExtensionNotPresent" in line:
+            msg = (
+                "Your AMD GPU driver version is not compatible with this software.\n"
+                "Please update and try again: "
+                "https://www.amd.com/en/support/download/drivers.html"
+            )
             logging.error(msg)
             raise HTTPException(
                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                 detail=msg,
             )
-
-
-        """
-        Parse telemetry data from llama server output lines.
-        """
+        elif "error" in line.lower():
+            logging.error(line)

         # Parse Vulkan device detection
         vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
@@ -186,468 +92,229 @@ class LlamaTelemetry:
             self.tokens_per_second = tokens_per_second
             return

-    def get_telemetry_data(self):
-        return {
-            "input_tokens": self.input_tokens,
-            "output_tokens": self.output_tokens,
-            "time_to_first_token": self.time_to_first_token,
-            "tokens_per_second": self.tokens_per_second,
-            "decode_token_times": None,
-        }
-
-    def show_telemetry(self):
-        # Check if debug logging is enabled
-        if not logging.getLogger().isEnabledFor(logging.DEBUG):
-            return

-
-
-
-
-            ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
-            ["TPS", f"{self.tokens_per_second:.2f}"],
-        ]
+class LlamaServer(WrappedServer):
+    def __init__(self, backend: str):
+        self.backend = backend
+        super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())

-
-
-
+    def install_server(self, backend=None):
+        """
+        Install the wrapped server
+        """
+        install_llamacpp(self.backend)

-
-
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        """
+        Download a model for the wrapper server.
+        First checks local cache, then downloads from internet if needed.
+        """
+        # If it's a direct file path, just return it
+
+        if os.path.exists(config_checkpoint):
+            result = {"variant": config_checkpoint}
+            if config_mmproj:
+                result["mmproj"] = config_mmproj
+            return result
+
+        # Try to resolve from local cache first to avoid unnecessary downloads
+        checkpoint, variant = parse_checkpoint(config_checkpoint)
+        local_result = resolve_local_gguf_model(checkpoint, variant, config_mmproj)
+
+        if local_result:
+            return local_result
+
+        # Not found locally - download from internet
+        return download_gguf(
+            config_checkpoint=config_checkpoint,
+            config_mmproj=config_mmproj,
+            do_not_upgrade=do_not_upgrade,
+        )

+    def _launch_device_backend_subprocess(
+        self,
+        snapshot_files: dict,
+        use_gpu: bool,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ) -> subprocess.Popen:
+        """
+        Launch llama server subprocess with appropriate configuration.
+
+        Args:
+            snapshot_files: Dictionary of model files to load
+            use_gpu: Whether to use GPU acceleration
+            telemetry: Telemetry object for tracking performance metrics
+            backend: Backend to use (e.g., 'vulkan', 'rocm')
+            supports_embeddings: Whether the model supports embeddings
+            supports_reranking: Whether the model supports reranking
+
+        Returns:
+            Subprocess handle for the llama server
+        """

-
-
-
-
-
-
+        # Get the current executable path (handles both Windows and Ubuntu structures)
+        exe_path = get_llama_server_exe_path(self.backend)
+
+        # For embedding models, use a larger context size to support longer individual
+        # strings. Embedding requests can include multiple strings in a batch, and each
+        # string needs to fit within the context window.
+        if supports_embeddings and ctx_size < EMBEDDING_CTX_SIZE:
+            ctx_size = EMBEDDING_CTX_SIZE
+
+        # Build the base command
+        base_command = [
+            exe_path,
+            "-m",
+            snapshot_files["variant"],
+            "--ctx-size",
+            str(ctx_size),
+        ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        except requests.exceptions.ConnectionError:
-            logging.debug("Not able to connect to llama-server yet, will retry")
+        # Lock random seed for deterministic behavior in CI
+        if os.environ.get("LEMONADE_CI_MODE"):
+            base_command.extend(["--seed", "42"])
+            logging.info(f"Seed applied to base command: {base_command}")
+
+        if "mmproj" in snapshot_files:
+            base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+            if not use_gpu:
+                base_command.extend(["--no-mmproj-offload"])
+
+        # Find a port, and save it in the telemetry object for future reference
+        # by other functions
+        self._choose_port()
+
+        # Add port and jinja to enable tool use
+        base_command.extend(["--port", str(self.port), "--jinja"])
+
+        # Enable context shift and avoid attention sink issues by preserving the initial tokens
+        # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+        # Only add context-shift for backends that support it
+        context_shift_supported_backends = ["vulkan", "rocm"]
+        if self.backend in context_shift_supported_backends:
+            base_command.extend(["--context-shift", "--keep", "16"])
         else:
-
+            # For backends that don't support context-shift (e.g., Metal), just use keep
+            base_command.extend(["--keep", "16"])
             logging.debug(
-                "
-                f"result: {health_response.json()}"
+                f"Skipped --context-shift for backend: {self.backend} (not supported)"
             )
-        time.sleep(1)
-
-
-def _launch_llama_subprocess(
-    snapshot_files: dict,
-    use_gpu: bool,
-    telemetry: LlamaTelemetry,
-    backend: str,
-    ctx_size: int,
-    supports_embeddings: bool = False,
-    supports_reranking: bool = False,
-) -> subprocess.Popen:
-    """
-    Launch llama server subprocess with appropriate configuration.
-
-    Args:
-        snapshot_files: Dictionary of model files to load
-        use_gpu: Whether to use GPU acceleration
-        telemetry: Telemetry object for tracking performance metrics
-        backend: Backend to use (e.g., 'vulkan', 'rocm')
-        supports_embeddings: Whether the model supports embeddings
-        supports_reranking: Whether the model supports reranking
-
-    Returns:
-        Subprocess handle for the llama server
-    """
-
-    # Get the current executable path (handles both Windows and Ubuntu structures)
-    exe_path = get_llama_server_exe_path(backend)
-
-    # Build the base command
-    base_command = [
-        exe_path,
-        "-m",
-        snapshot_files["variant"],
-        "--ctx-size",
-        str(ctx_size),
-    ]
-
-    # Lock random seed for deterministic behavior in CI
-    if os.environ.get("LEMONADE_CI_MODE"):
-        base_command.extend(["--seed", "42"])
-
-    if "mmproj" in snapshot_files:
-        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
-        if not use_gpu:
-            base_command.extend(["--no-mmproj-offload"])
-
-    # Find a port, and save it in the telemetry object for future reference
-    # by other functions
-    telemetry.choose_port()
-
-    # Add port and jinja to enable tool use
-    base_command.extend(["--port", str(telemetry.port), "--jinja"])
-
-    # Disable jinja for gpt-oss-120b on Vulkan
-    if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
-        base_command.remove("--jinja")
-        logging.warning(
-            "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
-            "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
-            "The model cannot use tools. If needed, use the ROCm backend instead."
-        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Load environment variables from .env file in the executable directory
-    exe_dir = os.path.dirname(exe_path)
-    env_file_path = os.path.join(exe_dir, ".env")
-    if os.path.exists(env_file_path):
-        load_dotenv(env_file_path, override=True)
-        env.update(os.environ)
-        logging.debug(f"Loaded environment variables from {env_file_path}")
-
-    if platform.system().lower() == "linux":
-        lib_dir = os.path.dirname(exe_path) # Same directory as the executable
-        current_ld_path = env.get("LD_LIBRARY_PATH", "")
-        if current_ld_path:
-            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
-        else:
-            env["LD_LIBRARY_PATH"] = lib_dir
-        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
-
-    # Start subprocess with output capture
-    process = subprocess.Popen(
-        command,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        encoding="utf-8",
-        errors="replace",
-        bufsize=1,
-        env=env,
-    )
-
-    # Start background thread to log subprocess output
-    device_type = "GPU" if use_gpu else "CPU"
-    threading.Thread(
-        target=_log_subprocess_output,
-        args=(process, f"LLAMA SERVER {device_type}", telemetry),
-        daemon=True,
-    ).start()
-
-    return process
-
-
-def server_load(
-    model_config: PullConfig,
-    telemetry: LlamaTelemetry,
-    backend: str,
-    ctx_size: int,
-    do_not_upgrade: bool = False,
-):
-    # Install and/or update llama.cpp if needed
-    try:
-        install_llamacpp(backend)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
-        )
+        # Use legacy reasoning formatting, since not all apps support the new
+        # reasoning_content field
+        base_command.extend(["--reasoning-format", "auto"])
+
+        # Add embeddings support if the model supports it
+        if supports_embeddings:
+            # For embedding models, set batch sizes to handle multiple documents in a single request
+            # batch-size: logical batch size (total tokens across all sequences)
+            # ubatch-size: physical batch size (tokens processed in a single forward pass)
+            base_command.extend(
+                [
+                    "--embeddings",
+                    "--batch-size",
+                    str(EMBEDDING_BATCH_SIZE),
+                    "--ubatch-size",
+                    str(EMBEDDING_UBATCH_SIZE),
                ]
+            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Add reranking support if the model supports it
+        if supports_reranking:
+            base_command.append("--reranking")
+
+        # Configure GPU layers: 99 for GPU, 0 for CPU-only
+        ngl_value = "99" if use_gpu else "0"
+        command = base_command + ["-ngl", ngl_value]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        # Load environment variables from .env file in the executable directory
+        exe_dir = os.path.dirname(exe_path)
+        env_file_path = os.path.join(exe_dir, ".env")
+        if os.path.exists(env_file_path):
+            load_dotenv(env_file_path, override=False)
+            env.update(os.environ)
+            logging.debug(f"Loaded environment variables from {env_file_path}")
+
+        system = platform.system().lower()
+        if system == "linux":
+            lib_dir = os.path.dirname(exe_path) # Same directory as the executable
+            current_ld_path = env.get("LD_LIBRARY_PATH", "")
+            if current_ld_path:
+                env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+            else:
+                env["LD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+        elif system == "darwin":
+            lib_dir = os.path.dirname(exe_path)
+            current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+            if current_dyld_path:
+                env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+            else:
+                env["DYLD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
+
+        # Start subprocess with output capture
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
         )

-
-
-
-
-
+        # Start background thread to log subprocess output
+        device_type = "GPU" if use_gpu else "CPU"
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=(f"LLAMA SERVER {device_type}",),
+            daemon=True,
+        ).start()
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        # Attempt loading on GPU first
+        self._launch_device_backend_subprocess(
             snapshot_files,
-            use_gpu=
-            telemetry=telemetry,
-            backend=backend,
+            use_gpu=True,
             ctx_size=ctx_size,
             supports_embeddings=supports_embeddings,
             supports_reranking=supports_reranking,
         )

-    # Check the /health endpoint until
-    _wait_for_load(
-        llama_server_process,
-        telemetry.port,
-    )
-
-    if llama_server_process.poll():
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"Failed to load {model_config.model_name} with llama.cpp",
-        )
-
-    return llama_server_process
-
-
-def chat_completion(
-    chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
-):
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = chat_completion_request.model_dump(
-        exclude_unset=True, exclude_none=True
-    )
-
-    # Separate standard OpenAI parameters from custom llama.cpp parameters
-    openai_client_params = _separate_openai_params(request_dict, "chat")
-
-    # Check if streaming is requested
-    if chat_completion_request.stream:
-
-        def event_stream():
-            try:
-                # Enable streaming
-                # pylint: disable=missing-kwoa
-                for chunk in client.chat.completions.create(**openai_client_params):
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                yield "data: [DONE]\n\n"
-
-                # Show telemetry after completion
-                telemetry.show_telemetry()
-
-            except Exception as e: # pylint: disable=broad-exception-caught
-                yield f'data: {{"error": "{str(e)}"}}\n\n'
-
-        return StreamingResponse(
-            event_stream(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            },
-        )
-    else:
-        # Non-streaming response
-        try:
-            # Disable streaming for non-streaming requests
-            # pylint: disable=missing-kwoa
-            response = client.chat.completions.create(**openai_client_params)
-
-            # Show telemetry after completion
-            telemetry.show_telemetry()
+        # Check the /health endpoint until GPU server is ready
+        self._wait_for_load()

-
-
-
-
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Chat completion error: {str(e)}",
+        # If loading on GPU failed, try loading on CPU
+        if self.process.poll():
+            logging.warning(
+                f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
            )

+            if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
+                # Used for testing, when the test should fail if GPU didn't work
+                raise Exception("llamacpp GPU loading failed")

-
-
-
-
-
-
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Completion response from the llamacpp server
-    """
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
-
-    # Separate standard OpenAI parameters from custom llama.cpp parameters
-    openai_client_params = _separate_openai_params(request_dict, "completion")
-
-    # Check if streaming is requested
-    if completion_request.stream:
-
-        def event_stream():
-            try:
-                # Enable streaming
-                # pylint: disable=missing-kwoa
-                for chunk in client.completions.create(**openai_client_params):
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                yield "data: [DONE]\n\n"
-
-                # Show telemetry after completion
-                telemetry.show_telemetry()
-
-            except Exception as e: # pylint: disable=broad-exception-caught
-                yield f'data: {{"error": "{str(e)}"}}\n\n'
-
-        return StreamingResponse(
-            event_stream(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            },
-        )
-    else:
-        # Non-streaming response
-        try:
-            # Disable streaming for non-streaming requests
-            # pylint: disable=missing-kwoa
-            response = client.completions.create(**openai_client_params)
-
-            # Show telemetry after completion
-            telemetry.show_telemetry()
-
-            return response
-
-        except Exception as e: # pylint: disable=broad-exception-caught
-            logging.error("Error during completion: %s", str(e))
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Completion error: {str(e)}",
+            self._launch_device_backend_subprocess(
+                snapshot_files,
+                use_gpu=False,
+                ctx_size=ctx_size,
+                supports_embeddings=supports_embeddings,
+                supports_reranking=supports_reranking,
             )
-
-
-def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
-    """
-    Generate embeddings using the llamacpp server.
-
-    Args:
-        embeddings_request: The embeddings request containing input text/tokens
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Embeddings response from the llamacpp server
-    """
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = embeddings_request.model_dump(exclude_unset=True, exclude_none=True)
-
-    try:
-        # Call the embeddings endpoint
-        response = client.embeddings.create(**request_dict)
-        return response
-
-    except Exception as e: # pylint: disable=broad-exception-caught
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Embeddings error: {str(e)}",
-        )
-
-
-def reranking(reranking_request: RerankingRequest, telemetry: LlamaTelemetry):
-    """
-    Rerank documents based on their relevance to a query using the llamacpp server.
-
-    Args:
-        reranking_request: The reranking request containing query and documents
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Reranking response from the llamacpp server containing ranked documents and scores
-    """
-    base_url = llamacpp_address(telemetry.port)
-
-    try:
-        # Convert Pydantic model to dict and exclude unset/null values
-        request_dict = reranking_request.model_dump(
-            exclude_unset=True, exclude_none=True
-        )
-
-        # Call the reranking endpoint directly since it's not supported by the OpenAI API
-        response = requests.post(
-            f"{base_url}/rerank",
-            json=request_dict,
-        )
-        response.raise_for_status()
-        return response.json()
-
-    except Exception as e:
-        logging.error("Error during reranking: %s", str(e))
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Reranking error: {str(e)}",
-        ) from e