lemonade-sdk 8.1.5__py3-none-any.whl → 8.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk has been flagged as possibly problematic.
- lemonade/tools/llamacpp/utils.py +5 -1
- lemonade/tools/server/llamacpp.py +164 -562
- lemonade/tools/server/serve.py +15 -22
- lemonade/tools/server/wrapped_server.py +485 -0
- lemonade/version.py +1 -1
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/METADATA +1 -1
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/RECORD +14 -13
- lemonade_server/cli.py +18 -9
- lemonade_server/model_manager.py +201 -20
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/top_level.txt +0 -0
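The substantive change in this release is a refactor of the llama.cpp server integration: lemonade/tools/server/llamacpp.py loses 562 lines and gains 164, while the generic process-management logic moves into the new lemonade/tools/server/wrapped_server.py (+485 lines). That new module is not expanded in this diff, so the sketch below only infers the interface it appears to expose, based on the attributes and calls visible in the hunks that follow (the server_name/telemetry constructor arguments, self.port, self.process, choose_port, _wait_for_load, _log_subprocess_output, and the install_server/download_model overrides). Every name and signature here is an assumption, not the actual wrapped_server.py implementation.

# Hypothetical sketch (not taken from this package) of the WrappedServer /
# WrappedServerTelemetry interface implied by the llamacpp.py diff below.
import subprocess
from abc import ABC, abstractmethod


class WrappedServerTelemetry(ABC):
    """Collects performance telemetry parsed from the wrapped server's log output."""

    @abstractmethod
    def parse_telemetry_line(self, line: str):
        """Extract metrics (input/output tokens, TTFT, TPS) from one log line."""


class WrappedServer(ABC):
    """Base class for model servers that Lemonade runs as a wrapped subprocess."""

    def __init__(self, server_name: str, telemetry: WrappedServerTelemetry):
        self.server_name = server_name
        self.telemetry = telemetry
        self.port = None     # assumed to be set by choose_port()
        self.process = None  # assumed to hold the subprocess.Popen handle

    def choose_port(self):
        """Find a free localhost port and store it in self.port."""

    def _wait_for_load(self):
        """Poll the subprocess's /health endpoint until it responds or the process exits."""

    def _log_subprocess_output(self, prefix: str):
        """Stream subprocess stdout into logging and feed each line to the telemetry parser."""

    @abstractmethod
    def install_server(self, backend=None):
        """Install or update the wrapped server binaries."""

    @abstractmethod
    def download_model(self, config_checkpoint, config_mmproj=None, do_not_upgrade=False) -> dict:
        """Download model files and return a dict of their paths."""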
lemonade/tools/server/llamacpp.py
@@ -1,144 +1,28 @@
 import os
 import logging
-import time
 import subprocess
 import re
 import threading
 import platform

-import requests
-from tabulate import tabulate
 from dotenv import load_dotenv
-from fastapi import HTTPException, status
-from fastapi.responses import StreamingResponse
-
-from openai import OpenAI

 from lemonade_server.pydantic_models import (
-    ChatCompletionRequest,
-    CompletionRequest,
     PullConfig,
-    EmbeddingsRequest,
-    RerankingRequest,
 )
-from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.utils.port import find_free_port
 from lemonade.tools.llamacpp.utils import (
     get_llama_server_exe_path,
     install_llamacpp,
     download_gguf,
 )
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer


-
-    """
-    Generate the base URL for the llamacpp server.
-
-    Args:
-        port: The port number the llamacpp server is running on
-
-    Returns:
-        The base URL for the llamacpp server
-    """
-    return f"http://127.0.0.1:{port}/v1"
-
-
-def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
-    """
-    Separate standard OpenAI parameters from custom llama.cpp parameters.
-
-    Args:
-        request_dict: Dictionary of all request parameters
-        endpoint_type: Type of endpoint ("chat" or "completion")
-
-    Returns:
-        Dictionary with parameters properly separated for OpenAI client
-    """
-    openai_client_params = {}
-    extra_params = {}
-
-    # Common OpenAI parameters for both endpoint types
-    common_params = {
-        "model",
-        "frequency_penalty",
-        "logit_bias",
-        "logprobs",
-        "max_tokens",
-        "n",
-        "presence_penalty",
-        "seed",
-        "stop",
-        "stream",
-        "temperature",
-        "top_p",
-        "user",
-    }
-
-    # Standard OpenAI parameters by endpoint type
-    if endpoint_type == "chat":
-        chat_specific_params = {
-            "messages",
-            "top_logprobs",
-            "response_format",
-            "service_tier",
-            "stream_options",
-            "tools",
-            "tool_choice",
-            "parallel_tool_calls",
-        }
-        openai_params = common_params | chat_specific_params
-    else:  # completion
-        completion_specific_params = {
-            "prompt",
-            "best_of",
-            "echo",
-            "suffix",
-        }
-        openai_params = common_params | completion_specific_params
-
-    for key, value in request_dict.items():
-        if key in openai_params:
-            openai_client_params[key] = value
-        else:
-            extra_params[key] = value
-
-    # If there are custom parameters, use extra_body to pass them through
-    if extra_params:
-        openai_client_params["extra_body"] = extra_params
-
-    return openai_client_params
-
-
-class LlamaTelemetry:
+class LlamaTelemetry(WrappedServerTelemetry):
     """
     Manages telemetry data collection and display for llama server.
     """

-    def __init__(self):
-        self.input_tokens = None
-        self.output_tokens = None
-        self.time_to_first_token = None
-        self.tokens_per_second = None
-        self.prompt_eval_time = None
-        self.eval_time = None
-        self.port = None
-
-    def choose_port(self):
-        """
-        Users probably don't care what port we start llama-server on, so let's
-        search for an empty port
-        """
-
-        self.port = find_free_port()
-
-        if self.port is None:
-            msg = "Failed to find an empty port to start llama-server on"
-            logging.error(msg)
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=msg,
-            )
-
     def parse_telemetry_line(self, line: str):
         """
         Parse telemetry data from llama server output lines.
@@ -186,468 +70,186 @@ class LlamaTelemetry:
         self.tokens_per_second = tokens_per_second
         return

-    def get_telemetry_data(self):
-        return {
-            "input_tokens": self.input_tokens,
-            "output_tokens": self.output_tokens,
-            "time_to_first_token": self.time_to_first_token,
-            "tokens_per_second": self.tokens_per_second,
-            "decode_token_times": None,
-        }
-
-    def show_telemetry(self):
-        # Check if debug logging is enabled
-        if not logging.getLogger().isEnabledFor(logging.DEBUG):
-            return
-
-        # Prepare telemetry data (transposed format)
-        telemetry = [
-            ["Input tokens", self.input_tokens],
-            ["Output tokens", self.output_tokens],
-            ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
-            ["TPS", f"{self.tokens_per_second:.2f}"],
-        ]

-
-
-
+class LlamaServer(WrappedServer):
+    def __init__(self, backend: str):
+        self.telemetry = LlamaTelemetry()
+        self.backend = backend
+        super().__init__(server_name="llama-server", telemetry=self.telemetry)

-
-
+    def install_server(self, backend=None):
+        """
+        Install the wrapped server
+        """
+        install_llamacpp(self.backend)

+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        """
+        Download a model for the wrapper server
+        """
+        return download_gguf(
+            config_checkpoint=config_checkpoint,
+            config_mmproj=config_mmproj,
+            do_not_upgrade=do_not_upgrade,
+        )

-    def
-
-
-
-
-
+    def _launch_device_backend_subprocess(
+        self,
+        snapshot_files: dict,
+        use_gpu: bool,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ) -> subprocess.Popen:
+        """
+        Launch llama server subprocess with appropriate configuration.
+
+        Args:
+            snapshot_files: Dictionary of model files to load
+            use_gpu: Whether to use GPU acceleration
+            telemetry: Telemetry object for tracking performance metrics
+            backend: Backend to use (e.g., 'vulkan', 'rocm')
+            supports_embeddings: Whether the model supports embeddings
+            supports_reranking: Whether the model supports reranking
+
+        Returns:
+            Subprocess handle for the llama server
+        """

-
-
-        for line in iter(process.stdout.readline, ""):
-            if line:
-                line_stripped = line.strip()
-                logging.debug("%s: %s", prefix, line_stripped)
-
-                telemetry.parse_telemetry_line(line_stripped)
-
-            if process.poll() is not None:
-                break
-    except UnicodeDecodeError as e:
-        logging.debug("Unicode decode error reading subprocess output: %s", str(e))
-    except Exception as e:  # pylint: disable=broad-exception-caught
-        logging.error("Unexpected error reading subprocess output: %s", str(e))
-
-
-def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
-    status_code = None
-    while not llama_server_process.poll() and status_code != 200:
-        health_url = f"http://localhost:{port}/health"
-        try:
-            health_response = requests.get(health_url)
-        except requests.exceptions.ConnectionError:
-            logging.debug("Not able to connect to llama-server yet, will retry")
-        else:
-            status_code = health_response.status_code
-            logging.debug(
-                "Testing llama-server readiness (will retry until ready), "
-                f"result: {health_response.json()}"
-            )
-        time.sleep(1)
-
-
-def _launch_llama_subprocess(
-    snapshot_files: dict,
-    use_gpu: bool,
-    telemetry: LlamaTelemetry,
-    backend: str,
-    ctx_size: int,
-    supports_embeddings: bool = False,
-    supports_reranking: bool = False,
-) -> subprocess.Popen:
-    """
-    Launch llama server subprocess with appropriate configuration.
-
-    Args:
-        snapshot_files: Dictionary of model files to load
-        use_gpu: Whether to use GPU acceleration
-        telemetry: Telemetry object for tracking performance metrics
-        backend: Backend to use (e.g., 'vulkan', 'rocm')
-        supports_embeddings: Whether the model supports embeddings
-        supports_reranking: Whether the model supports reranking
-
-    Returns:
-        Subprocess handle for the llama server
-    """
+        # Get the current executable path (handles both Windows and Ubuntu structures)
+        exe_path = get_llama_server_exe_path(self.backend)

-
-
-
-
-
-
-
-
-        "--ctx-size",
-        str(ctx_size),
-    ]
-
-    # Lock random seed for deterministic behavior in CI
-    if os.environ.get("LEMONADE_CI_MODE"):
-        base_command.extend(["--seed", "42"])
-
-    if "mmproj" in snapshot_files:
-        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
-        if not use_gpu:
-            base_command.extend(["--no-mmproj-offload"])
-
-    # Find a port, and save it in the telemetry object for future reference
-    # by other functions
-    telemetry.choose_port()
-
-    # Add port and jinja to enable tool use
-    base_command.extend(["--port", str(telemetry.port), "--jinja"])
-
-    # Disable jinja for gpt-oss-120b on Vulkan
-    if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
-        base_command.remove("--jinja")
-        logging.warning(
-            "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
-            "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
-            "The model cannot use tools. If needed, use the ROCm backend instead."
-        )
+        # Build the base command
+        base_command = [
+            exe_path,
+            "-m",
+            snapshot_files["variant"],
+            "--ctx-size",
+            str(ctx_size),
+        ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
-        current_ld_path = env.get("LD_LIBRARY_PATH", "")
-        if current_ld_path:
-            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
-        else:
-            env["LD_LIBRARY_PATH"] = lib_dir
-        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
-
-    # Start subprocess with output capture
-    process = subprocess.Popen(
-        command,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        encoding="utf-8",
-        errors="replace",
-        bufsize=1,
-        env=env,
-    )
-
-    # Start background thread to log subprocess output
-    device_type = "GPU" if use_gpu else "CPU"
-    threading.Thread(
-        target=_log_subprocess_output,
-        args=(process, f"LLAMA SERVER {device_type}", telemetry),
-        daemon=True,
-    ).start()
-
-    return process
-
-
-def server_load(
-    model_config: PullConfig,
-    telemetry: LlamaTelemetry,
-    backend: str,
-    ctx_size: int,
-    do_not_upgrade: bool = False,
-):
-    # Install and/or update llama.cpp if needed
-    try:
-        install_llamacpp(backend)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
-        )
+        # Lock random seed for deterministic behavior in CI
+        if os.environ.get("LEMONADE_CI_MODE"):
+            base_command.extend(["--seed", "42"])
+            logging.info(f"Seed applied to base command: {base_command}")
+
+        if "mmproj" in snapshot_files:
+            base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+            if not use_gpu:
+                base_command.extend(["--no-mmproj-offload"])
+
+        # Find a port, and save it in the telemetry object for future reference
+        # by other functions
+        self.choose_port()
+
+        # Add port and jinja to enable tool use
+        base_command.extend(["--port", str(self.port), "--jinja"])
+
+        # Disable jinja for gpt-oss-120b on Vulkan
+        if (
+            self.backend == "vulkan"
+            and "gpt-oss-120b" in snapshot_files["variant"].lower()
+        ):
+            base_command.remove("--jinja")
+            logging.warning(
+                "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+                "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+                "The model cannot use tools. If needed, use the ROCm backend instead."
+            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Use legacy reasoning formatting, since not all apps support the new
+        # reasoning_content field
+        base_command.extend(["--reasoning-format", "none"])
+
+        # Add embeddings support if the model supports it
+        if supports_embeddings:
+            base_command.append("--embeddings")
+
+        # Add reranking support if the model supports it
+        if supports_reranking:
+            base_command.append("--reranking")
+
+        # Configure GPU layers: 99 for GPU, 0 for CPU-only
+        ngl_value = "99" if use_gpu else "0"
+        command = base_command + ["-ngl", ngl_value]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        # Load environment variables from .env file in the executable directory
+        exe_dir = os.path.dirname(exe_path)
+        env_file_path = os.path.join(exe_dir, ".env")
+        if os.path.exists(env_file_path):
+            load_dotenv(env_file_path, override=True)
+            env.update(os.environ)
+            logging.debug(f"Loaded environment variables from {env_file_path}")
+
+        if platform.system().lower() == "linux":
+            lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+            current_ld_path = env.get("LD_LIBRARY_PATH", "")
+            if current_ld_path:
+                env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+            else:
+                env["LD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+
+        # Start subprocess with output capture
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
         )

-
-
-
-
-
+        # Start background thread to log subprocess output
+        device_type = "GPU" if use_gpu else "CPU"
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=(f"LLAMA SERVER {device_type}",),
+            daemon=True,
+        ).start()
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        # Attempt loading on GPU first
+        self._launch_device_backend_subprocess(
             snapshot_files,
-            use_gpu=
-            telemetry=telemetry,
-            backend=backend,
+            use_gpu=True,
             ctx_size=ctx_size,
             supports_embeddings=supports_embeddings,
             supports_reranking=supports_reranking,
         )

-    # Check the /health endpoint until
-    _wait_for_load(
-        llama_server_process,
-        telemetry.port,
-    )
-
-    if llama_server_process.poll():
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"Failed to load {model_config.model_name} with llama.cpp",
-        )
+        # Check the /health endpoint until GPU server is ready
+        self._wait_for_load()

-
-
-
-
-    chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
-):
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = chat_completion_request.model_dump(
-        exclude_unset=True, exclude_none=True
-    )
-
-    # Separate standard OpenAI parameters from custom llama.cpp parameters
-    openai_client_params = _separate_openai_params(request_dict, "chat")
-
-    # Check if streaming is requested
-    if chat_completion_request.stream:
-
-        def event_stream():
-            try:
-                # Enable streaming
-                # pylint: disable=missing-kwoa
-                for chunk in client.chat.completions.create(**openai_client_params):
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                yield "data: [DONE]\n\n"
-
-                # Show telemetry after completion
-                telemetry.show_telemetry()
-
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                yield f'data: {{"error": "{str(e)}"}}\n\n'
-
-        return StreamingResponse(
-            event_stream(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            },
-        )
-    else:
-        # Non-streaming response
-        try:
-            # Disable streaming for non-streaming requests
-            # pylint: disable=missing-kwoa
-            response = client.chat.completions.create(**openai_client_params)
-
-            # Show telemetry after completion
-            telemetry.show_telemetry()
-
-            return response
-
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logging.error("Error during chat completion: %s", str(e))
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Chat completion error: {str(e)}",
+        # If loading on GPU failed, try loading on CPU
+        if self.process.poll():
+            logging.warning(
+                f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
             )

+            if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
+                # Used for testing, when the test should fail if GPU didn't work
+                raise Exception("llamacpp GPU loading failed")

-
-
-
-
-
-
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Completion response from the llamacpp server
-    """
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
-
-    # Separate standard OpenAI parameters from custom llama.cpp parameters
-    openai_client_params = _separate_openai_params(request_dict, "completion")
-
-    # Check if streaming is requested
-    if completion_request.stream:
-
-        def event_stream():
-            try:
-                # Enable streaming
-                # pylint: disable=missing-kwoa
-                for chunk in client.completions.create(**openai_client_params):
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                yield "data: [DONE]\n\n"
-
-                # Show telemetry after completion
-                telemetry.show_telemetry()
-
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                yield f'data: {{"error": "{str(e)}"}}\n\n'
-
-        return StreamingResponse(
-            event_stream(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            },
-        )
-    else:
-        # Non-streaming response
-        try:
-            # Disable streaming for non-streaming requests
-            # pylint: disable=missing-kwoa
-            response = client.completions.create(**openai_client_params)
-
-            # Show telemetry after completion
-            telemetry.show_telemetry()
-
-            return response
-
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logging.error("Error during completion: %s", str(e))
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Completion error: {str(e)}",
+            self._launch_device_backend_subprocess(
+                snapshot_files,
+                use_gpu=False,
+                ctx_size=ctx_size,
+                supports_embeddings=supports_embeddings,
+                supports_reranking=supports_reranking,
             )
-
-
-def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
-    """
-    Generate embeddings using the llamacpp server.
-
-    Args:
-        embeddings_request: The embeddings request containing input text/tokens
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Embeddings response from the llamacpp server
-    """
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = embeddings_request.model_dump(exclude_unset=True, exclude_none=True)
-
-    try:
-        # Call the embeddings endpoint
-        response = client.embeddings.create(**request_dict)
-        return response
-
-    except Exception as e:  # pylint: disable=broad-exception-caught
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Embeddings error: {str(e)}",
-        )
-
-
-def reranking(reranking_request: RerankingRequest, telemetry: LlamaTelemetry):
-    """
-    Rerank documents based on their relevance to a query using the llamacpp server.
-
-    Args:
-        reranking_request: The reranking request containing query and documents
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Reranking response from the llamacpp server containing ranked documents and scores
-    """
-    base_url = llamacpp_address(telemetry.port)
-
-    try:
-        # Convert Pydantic model to dict and exclude unset/null values
-        request_dict = reranking_request.model_dump(
-            exclude_unset=True, exclude_none=True
-        )
-
-        # Call the reranking endpoint directly since it's not supported by the OpenAI API
-        response = requests.post(
-            f"{base_url}/rerank",
-            json=request_dict,
-        )
-        response.raise_for_status()
-        return response.json()
-
-    except Exception as e:
-        logging.error("Error during reranking: %s", str(e))
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Reranking error: {str(e)}",
-        ) from e