lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic.

Files changed (53)
  1. lemonade/cache.py +6 -1
  2. lemonade/cli.py +47 -5
  3. lemonade/common/inference_engines.py +13 -4
  4. lemonade/common/status.py +4 -4
  5. lemonade/common/system_info.py +544 -1
  6. lemonade/profilers/agt_power.py +437 -0
  7. lemonade/profilers/hwinfo_power.py +429 -0
  8. lemonade/tools/accuracy.py +143 -48
  9. lemonade/tools/adapter.py +6 -1
  10. lemonade/tools/bench.py +26 -8
  11. lemonade/tools/flm/__init__.py +1 -0
  12. lemonade/tools/flm/utils.py +303 -0
  13. lemonade/tools/huggingface/bench.py +6 -1
  14. lemonade/tools/llamacpp/bench.py +146 -27
  15. lemonade/tools/llamacpp/load.py +30 -2
  16. lemonade/tools/llamacpp/utils.py +393 -33
  17. lemonade/tools/oga/bench.py +5 -26
  18. lemonade/tools/oga/load.py +60 -121
  19. lemonade/tools/oga/migration.py +403 -0
  20. lemonade/tools/report/table.py +76 -8
  21. lemonade/tools/server/flm.py +133 -0
  22. lemonade/tools/server/llamacpp.py +220 -553
  23. lemonade/tools/server/serve.py +684 -168
  24. lemonade/tools/server/static/js/chat.js +666 -342
  25. lemonade/tools/server/static/js/model-settings.js +24 -3
  26. lemonade/tools/server/static/js/models.js +597 -73
  27. lemonade/tools/server/static/js/shared.js +79 -14
  28. lemonade/tools/server/static/logs.html +191 -0
  29. lemonade/tools/server/static/styles.css +491 -66
  30. lemonade/tools/server/static/webapp.html +83 -31
  31. lemonade/tools/server/tray.py +158 -38
  32. lemonade/tools/server/utils/macos_tray.py +226 -0
  33. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  34. lemonade/tools/server/webapp.py +4 -1
  35. lemonade/tools/server/wrapped_server.py +559 -0
  36. lemonade/version.py +1 -1
  37. lemonade_install/install.py +54 -611
  38. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
  39. lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
  40. lemonade_server/cli.py +145 -37
  41. lemonade_server/model_manager.py +521 -37
  42. lemonade_server/pydantic_models.py +28 -1
  43. lemonade_server/server_models.json +246 -92
  44. lemonade_server/settings.py +39 -39
  45. lemonade/tools/quark/__init__.py +0 -0
  46. lemonade/tools/quark/quark_load.py +0 -173
  47. lemonade/tools/quark/quark_quantize.py +0 -439
  48. lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
  49. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  50. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  51. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  52. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  53. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/tools/server/llamacpp.py

@@ -1,148 +1,54 @@
 import os
 import logging
-import time
 import subprocess
 import re
 import threading
 import platform

-import requests
-from tabulate import tabulate
 from dotenv import load_dotenv
 from fastapi import HTTPException, status
-from fastapi.responses import StreamingResponse
-
-from openai import OpenAI

 from lemonade_server.pydantic_models import (
-    ChatCompletionRequest,
-    CompletionRequest,
     PullConfig,
-    EmbeddingsRequest,
-    RerankingRequest,
 )
-from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.utils.port import find_free_port
 from lemonade.tools.llamacpp.utils import (
     get_llama_server_exe_path,
     install_llamacpp,
     download_gguf,
+    resolve_local_gguf_model,
+    parse_checkpoint,
 )
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer

-
-def llamacpp_address(port: int) -> str:
-    """
-    Generate the base URL for the llamacpp server.
-
-    Args:
-        port: The port number the llamacpp server is running on
-
-    Returns:
-        The base URL for the llamacpp server
-    """
-    return f"http://127.0.0.1:{port}/v1"
+# Embedding model batch configuration set to 8192 as default
+EMBEDDING_CTX_SIZE = 8192
+EMBEDDING_BATCH_SIZE = 8192
+EMBEDDING_UBATCH_SIZE = 8192


-def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
-    """
-    Separate standard OpenAI parameters from custom llama.cpp parameters.
-
-    Args:
-        request_dict: Dictionary of all request parameters
-        endpoint_type: Type of endpoint ("chat" or "completion")
-
-    Returns:
-        Dictionary with parameters properly separated for OpenAI client
-    """
-    openai_client_params = {}
-    extra_params = {}
-
-    # Common OpenAI parameters for both endpoint types
-    common_params = {
-        "model",
-        "frequency_penalty",
-        "logit_bias",
-        "logprobs",
-        "max_tokens",
-        "n",
-        "presence_penalty",
-        "seed",
-        "stop",
-        "stream",
-        "temperature",
-        "top_p",
-        "user",
-    }
-
-    # Standard OpenAI parameters by endpoint type
-    if endpoint_type == "chat":
-        chat_specific_params = {
-            "messages",
-            "top_logprobs",
-            "response_format",
-            "service_tier",
-            "stream_options",
-            "tools",
-            "tool_choice",
-            "parallel_tool_calls",
-        }
-        openai_params = common_params | chat_specific_params
-    else:  # completion
-        completion_specific_params = {
-            "prompt",
-            "best_of",
-            "echo",
-            "suffix",
-        }
-        openai_params = common_params | completion_specific_params
-
-    for key, value in request_dict.items():
-        if key in openai_params:
-            openai_client_params[key] = value
-        else:
-            extra_params[key] = value
-
-    # If there are custom parameters, use extra_body to pass them through
-    if extra_params:
-        openai_client_params["extra_body"] = extra_params
-
-    return openai_client_params
-
-
-class LlamaTelemetry:
+class LlamaTelemetry(WrappedServerTelemetry):
     """
     Manages telemetry data collection and display for llama server.
     """

-    def __init__(self):
-        self.input_tokens = None
-        self.output_tokens = None
-        self.time_to_first_token = None
-        self.tokens_per_second = None
-        self.prompt_eval_time = None
-        self.eval_time = None
-        self.port = None
-
-    def choose_port(self):
+    def parse_telemetry_line(self, line: str):
         """
-        Users probably don't care what port we start llama-server on, so let's
-        search for an empty port
+        Parse telemetry data from llama server output lines.
         """

-        self.port = find_free_port()
-
-        if self.port is None:
-            msg = "Failed to find an empty port to start llama-server on"
+        if "vk::PhysicalDevice::createDevice: ErrorExtensionNotPresent" in line:
+            msg = (
+                "Your AMD GPU driver version is not compatible with this software.\n"
+                "Please update and try again: "
+                "https://www.amd.com/en/support/download/drivers.html"
+            )
             logging.error(msg)
             raise HTTPException(
                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                 detail=msg,
             )
-
-    def parse_telemetry_line(self, line: str):
-        """
-        Parse telemetry data from llama server output lines.
-        """
+        elif "error" in line.lower():
+            logging.error(line)

         # Parse Vulkan device detection
         vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
@@ -186,468 +92,229 @@ class LlamaTelemetry:
             self.tokens_per_second = tokens_per_second
             return

-    def get_telemetry_data(self):
-        return {
-            "input_tokens": self.input_tokens,
-            "output_tokens": self.output_tokens,
-            "time_to_first_token": self.time_to_first_token,
-            "tokens_per_second": self.tokens_per_second,
-            "decode_token_times": None,
-        }
-
-    def show_telemetry(self):
-        # Check if debug logging is enabled
-        if not logging.getLogger().isEnabledFor(logging.DEBUG):
-            return

-        # Prepare telemetry data (transposed format)
-        telemetry = [
-            ["Input tokens", self.input_tokens],
-            ["Output tokens", self.output_tokens],
-            ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
-            ["TPS", f"{self.tokens_per_second:.2f}"],
-        ]
+class LlamaServer(WrappedServer):
+    def __init__(self, backend: str):
+        self.backend = backend
+        super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())

-        table = tabulate(
-            telemetry, headers=["Metric", "Value"], tablefmt="fancy_grid"
-        ).split("\n")
+    def install_server(self, backend=None):
+        """
+        Install the wrapped server
+        """
+        install_llamacpp(self.backend)

-        # Show telemetry in debug while complying with uvicorn's log indentation
-        logging.debug("\n ".join(table))
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        """
+        Download a model for the wrapper server.
+        First checks local cache, then downloads from internet if needed.
+        """
+        # If it's a direct file path, just return it
+
+        if os.path.exists(config_checkpoint):
+            result = {"variant": config_checkpoint}
+            if config_mmproj:
+                result["mmproj"] = config_mmproj
+            return result
+
+        # Try to resolve from local cache first to avoid unnecessary downloads
+        checkpoint, variant = parse_checkpoint(config_checkpoint)
+        local_result = resolve_local_gguf_model(checkpoint, variant, config_mmproj)
+
+        if local_result:
+            return local_result
+
+        # Not found locally - download from internet
+        return download_gguf(
+            config_checkpoint=config_checkpoint,
+            config_mmproj=config_mmproj,
+            do_not_upgrade=do_not_upgrade,
+        )

+    def _launch_device_backend_subprocess(
+        self,
+        snapshot_files: dict,
+        use_gpu: bool,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ) -> subprocess.Popen:
+        """
+        Launch llama server subprocess with appropriate configuration.
+
+        Args:
+            snapshot_files: Dictionary of model files to load
+            use_gpu: Whether to use GPU acceleration
+            telemetry: Telemetry object for tracking performance metrics
+            backend: Backend to use (e.g., 'vulkan', 'rocm')
+            supports_embeddings: Whether the model supports embeddings
+            supports_reranking: Whether the model supports reranking
+
+        Returns:
+            Subprocess handle for the llama server
+        """

-def _log_subprocess_output(
-    process: subprocess.Popen, prefix: str, telemetry: LlamaTelemetry
-):
-    """
-    Read subprocess output line by line, log to debug, and parse telemetry
-    """
+        # Get the current executable path (handles both Windows and Ubuntu structures)
+        exe_path = get_llama_server_exe_path(self.backend)
+
+        # For embedding models, use a larger context size to support longer individual
+        # strings. Embedding requests can include multiple strings in a batch, and each
+        # string needs to fit within the context window.
+        if supports_embeddings and ctx_size < EMBEDDING_CTX_SIZE:
+            ctx_size = EMBEDDING_CTX_SIZE
+
+        # Build the base command
+        base_command = [
+            exe_path,
+            "-m",
+            snapshot_files["variant"],
+            "--ctx-size",
+            str(ctx_size),
+        ]

-    if process.stdout:
-        try:
-            for line in iter(process.stdout.readline, ""):
-                if line:
-                    line_stripped = line.strip()
-                    logging.debug("%s: %s", prefix, line_stripped)
-
-                    telemetry.parse_telemetry_line(line_stripped)
-
-                if process.poll() is not None:
-                    break
-        except UnicodeDecodeError as e:
-            logging.debug("Unicode decode error reading subprocess output: %s", str(e))
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logging.error("Unexpected error reading subprocess output: %s", str(e))
-
-
-def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
-    status_code = None
-    while not llama_server_process.poll() and status_code != 200:
-        health_url = f"http://localhost:{port}/health"
-        try:
-            health_response = requests.get(health_url)
-        except requests.exceptions.ConnectionError:
-            logging.debug("Not able to connect to llama-server yet, will retry")
+        # Lock random seed for deterministic behavior in CI
+        if os.environ.get("LEMONADE_CI_MODE"):
+            base_command.extend(["--seed", "42"])
+            logging.info(f"Seed applied to base command: {base_command}")
+
+        if "mmproj" in snapshot_files:
+            base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+            if not use_gpu:
+                base_command.extend(["--no-mmproj-offload"])
+
+        # Find a port, and save it in the telemetry object for future reference
+        # by other functions
+        self._choose_port()
+
+        # Add port and jinja to enable tool use
+        base_command.extend(["--port", str(self.port), "--jinja"])
+
+        # Enable context shift and avoid attention sink issues by preserving the initial tokens
+        # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+        # Only add context-shift for backends that support it
+        context_shift_supported_backends = ["vulkan", "rocm"]
+        if self.backend in context_shift_supported_backends:
+            base_command.extend(["--context-shift", "--keep", "16"])
         else:
-            status_code = health_response.status_code
+            # For backends that don't support context-shift (e.g., Metal), just use keep
+            base_command.extend(["--keep", "16"])
             logging.debug(
-                "Testing llama-server readiness (will retry until ready), "
-                f"result: {health_response.json()}"
+                f"Skipped --context-shift for backend: {self.backend} (not supported)"
             )
-        time.sleep(1)
-
-
-def _launch_llama_subprocess(
-    snapshot_files: dict,
-    use_gpu: bool,
-    telemetry: LlamaTelemetry,
-    backend: str,
-    ctx_size: int,
-    supports_embeddings: bool = False,
-    supports_reranking: bool = False,
-) -> subprocess.Popen:
-    """
-    Launch llama server subprocess with appropriate configuration.
-
-    Args:
-        snapshot_files: Dictionary of model files to load
-        use_gpu: Whether to use GPU acceleration
-        telemetry: Telemetry object for tracking performance metrics
-        backend: Backend to use (e.g., 'vulkan', 'rocm')
-        supports_embeddings: Whether the model supports embeddings
-        supports_reranking: Whether the model supports reranking
-
-    Returns:
-        Subprocess handle for the llama server
-    """
-
-    # Get the current executable path (handles both Windows and Ubuntu structures)
-    exe_path = get_llama_server_exe_path(backend)
-
-    # Build the base command
-    base_command = [
-        exe_path,
-        "-m",
-        snapshot_files["variant"],
-        "--ctx-size",
-        str(ctx_size),
-    ]
-
-    # Lock random seed for deterministic behavior in CI
-    if os.environ.get("LEMONADE_CI_MODE"):
-        base_command.extend(["--seed", "42"])
-
-    if "mmproj" in snapshot_files:
-        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
-        if not use_gpu:
-            base_command.extend(["--no-mmproj-offload"])
-
-    # Find a port, and save it in the telemetry object for future reference
-    # by other functions
-    telemetry.choose_port()
-
-    # Add port and jinja to enable tool use
-    base_command.extend(["--port", str(telemetry.port), "--jinja"])
-
-    # Disable jinja for gpt-oss-120b on Vulkan
-    if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
-        base_command.remove("--jinja")
-        logging.warning(
-            "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
-            "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
-            "The model cannot use tools. If needed, use the ROCm backend instead."
-        )

-    # Use legacy reasoning formatting, since not all apps support the new
-    # reasoning_content field
-    base_command.extend(["--reasoning-format", "none"])
-
-    # Add embeddings support if the model supports it
-    if supports_embeddings:
-        base_command.append("--embeddings")
-
-    # Add reranking support if the model supports it
-    if supports_reranking:
-        base_command.append("--reranking")
-
-    # Configure GPU layers: 99 for GPU, 0 for CPU-only
-    ngl_value = "99" if use_gpu else "0"
-    command = base_command + ["-ngl", ngl_value]
-
-    # Set up environment with library path for Linux
-    env = os.environ.copy()
-
-    # Load environment variables from .env file in the executable directory
-    exe_dir = os.path.dirname(exe_path)
-    env_file_path = os.path.join(exe_dir, ".env")
-    if os.path.exists(env_file_path):
-        load_dotenv(env_file_path, override=True)
-        env.update(os.environ)
-        logging.debug(f"Loaded environment variables from {env_file_path}")
-
-    if platform.system().lower() == "linux":
-        lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
-        current_ld_path = env.get("LD_LIBRARY_PATH", "")
-        if current_ld_path:
-            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
-        else:
-            env["LD_LIBRARY_PATH"] = lib_dir
-        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
-
-    # Start subprocess with output capture
-    process = subprocess.Popen(
-        command,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        encoding="utf-8",
-        errors="replace",
-        bufsize=1,
-        env=env,
-    )
-
-    # Start background thread to log subprocess output
-    device_type = "GPU" if use_gpu else "CPU"
-    threading.Thread(
-        target=_log_subprocess_output,
-        args=(process, f"LLAMA SERVER {device_type}", telemetry),
-        daemon=True,
-    ).start()
-
-    return process
-
-
-def server_load(
-    model_config: PullConfig,
-    telemetry: LlamaTelemetry,
-    backend: str,
-    ctx_size: int,
-    do_not_upgrade: bool = False,
-):
-    # Install and/or update llama.cpp if needed
-    try:
-        install_llamacpp(backend)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
-        )
+        # Use legacy reasoning formatting, since not all apps support the new
+        # reasoning_content field
+        base_command.extend(["--reasoning-format", "auto"])
+
+        # Add embeddings support if the model supports it
+        if supports_embeddings:
+            # For embedding models, set batch sizes to handle multiple documents in a single request
+            # batch-size: logical batch size (total tokens across all sequences)
+            # ubatch-size: physical batch size (tokens processed in a single forward pass)
+            base_command.extend(
+                [
+                    "--embeddings",
+                    "--batch-size",
+                    str(EMBEDDING_BATCH_SIZE),
+                    "--ubatch-size",
+                    str(EMBEDDING_UBATCH_SIZE),
+                ]
+            )

-    # Download the gguf to the hugging face cache
-    snapshot_files = download_gguf(
-        model_config.checkpoint, model_config.mmproj, do_not_upgrade=do_not_upgrade
-    )
-    logging.debug(f"GGUF file paths: {snapshot_files}")
-
-    # Check if model supports embeddings
-    supported_models = ModelManager().supported_models
-    model_info = supported_models.get(model_config.model_name, {})
-    supports_embeddings = "embeddings" in model_info.get("labels", [])
-    supports_reranking = "reranking" in model_info.get("labels", [])
-
-    # Attempt loading on GPU first
-    llama_server_process = _launch_llama_subprocess(
-        snapshot_files,
-        use_gpu=True,
-        telemetry=telemetry,
-        backend=backend,
-        ctx_size=ctx_size,
-        supports_embeddings=supports_embeddings,
-        supports_reranking=supports_reranking,
-    )
-
-    # Check the /health endpoint until GPU server is ready
-    _wait_for_load(
-        llama_server_process,
-        telemetry.port,
-    )
-
-    # If loading on GPU failed, try loading on CPU
-    if llama_server_process.poll():
-        logging.warning(
-            f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
+        # Add reranking support if the model supports it
+        if supports_reranking:
+            base_command.append("--reranking")
+
+        # Configure GPU layers: 99 for GPU, 0 for CPU-only
+        ngl_value = "99" if use_gpu else "0"
+        command = base_command + ["-ngl", ngl_value]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        # Load environment variables from .env file in the executable directory
+        exe_dir = os.path.dirname(exe_path)
+        env_file_path = os.path.join(exe_dir, ".env")
+        if os.path.exists(env_file_path):
+            load_dotenv(env_file_path, override=False)
+            env.update(os.environ)
+            logging.debug(f"Loaded environment variables from {env_file_path}")
+
+        system = platform.system().lower()
+        if system == "linux":
+            lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+            current_ld_path = env.get("LD_LIBRARY_PATH", "")
+            if current_ld_path:
+                env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+            else:
+                env["LD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+        elif system == "darwin":
+            lib_dir = os.path.dirname(exe_path)
+            current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+            if current_dyld_path:
+                env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+            else:
+                env["DYLD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
+
+        # Start subprocess with output capture
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
         )

-        if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
-            # Used for testing, when the test should fail if GPU didn't work
-            raise Exception("llamacpp GPU loading failed")
-
-        llama_server_process = _launch_llama_subprocess(
+        # Start background thread to log subprocess output
+        device_type = "GPU" if use_gpu else "CPU"
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=(f"LLAMA SERVER {device_type}",),
+            daemon=True,
+        ).start()
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        # Attempt loading on GPU first
+        self._launch_device_backend_subprocess(
             snapshot_files,
-            use_gpu=False,
-            telemetry=telemetry,
-            backend=backend,
+            use_gpu=True,
             ctx_size=ctx_size,
             supports_embeddings=supports_embeddings,
             supports_reranking=supports_reranking,
         )

-        # Check the /health endpoint until CPU server is ready
-        _wait_for_load(
-            llama_server_process,
-            telemetry.port,
-        )
-
-        if llama_server_process.poll():
-            raise HTTPException(
-                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-                detail=f"Failed to load {model_config.model_name} with llama.cpp",
-            )
-
-    return llama_server_process
-
-
-def chat_completion(
-    chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
-):
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = chat_completion_request.model_dump(
-        exclude_unset=True, exclude_none=True
-    )
-
-    # Separate standard OpenAI parameters from custom llama.cpp parameters
-    openai_client_params = _separate_openai_params(request_dict, "chat")
-
-    # Check if streaming is requested
-    if chat_completion_request.stream:
-
-        def event_stream():
-            try:
-                # Enable streaming
-                # pylint: disable=missing-kwoa
-                for chunk in client.chat.completions.create(**openai_client_params):
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                yield "data: [DONE]\n\n"
-
-                # Show telemetry after completion
-                telemetry.show_telemetry()
-
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                yield f'data: {{"error": "{str(e)}"}}\n\n'
-
-        return StreamingResponse(
-            event_stream(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            },
-        )
-    else:
-        # Non-streaming response
-        try:
-            # Disable streaming for non-streaming requests
-            # pylint: disable=missing-kwoa
-            response = client.chat.completions.create(**openai_client_params)
-
-            # Show telemetry after completion
-            telemetry.show_telemetry()
+        # Check the /health endpoint until GPU server is ready
+        self._wait_for_load()

-            return response
-
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logging.error("Error during chat completion: %s", str(e))
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Chat completion error: {str(e)}",
+        # If loading on GPU failed, try loading on CPU
+        if self.process.poll():
+            logging.warning(
+                f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
             )

+            if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
+                # Used for testing, when the test should fail if GPU didn't work
+                raise Exception("llamacpp GPU loading failed")

-def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry):
-    """
-    Handle text completions using the llamacpp server.
-
-    Args:
-        completion_request: The completion request containing prompt and parameters
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Completion response from the llamacpp server
-    """
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
-
-    # Separate standard OpenAI parameters from custom llama.cpp parameters
-    openai_client_params = _separate_openai_params(request_dict, "completion")
-
-    # Check if streaming is requested
-    if completion_request.stream:
-
-        def event_stream():
-            try:
-                # Enable streaming
-                # pylint: disable=missing-kwoa
-                for chunk in client.completions.create(**openai_client_params):
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                yield "data: [DONE]\n\n"
-
-                # Show telemetry after completion
-                telemetry.show_telemetry()
-
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                yield f'data: {{"error": "{str(e)}"}}\n\n'
-
-        return StreamingResponse(
-            event_stream(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            },
-        )
-    else:
-        # Non-streaming response
-        try:
-            # Disable streaming for non-streaming requests
-            # pylint: disable=missing-kwoa
-            response = client.completions.create(**openai_client_params)
-
-            # Show telemetry after completion
-            telemetry.show_telemetry()
-
-            return response
-
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logging.error("Error during completion: %s", str(e))
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Completion error: {str(e)}",
+            self._launch_device_backend_subprocess(
+                snapshot_files,
+                use_gpu=False,
+                ctx_size=ctx_size,
+                supports_embeddings=supports_embeddings,
+                supports_reranking=supports_reranking,
             )
-
-
-def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
-    """
-    Generate embeddings using the llamacpp server.
-
-    Args:
-        embeddings_request: The embeddings request containing input text/tokens
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Embeddings response from the llamacpp server
-    """
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = embeddings_request.model_dump(exclude_unset=True, exclude_none=True)
-
-    try:
-        # Call the embeddings endpoint
-        response = client.embeddings.create(**request_dict)
-        return response
-
-    except Exception as e:  # pylint: disable=broad-exception-caught
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Embeddings error: {str(e)}",
-        )
-
-
-def reranking(reranking_request: RerankingRequest, telemetry: LlamaTelemetry):
-    """
-    Rerank documents based on their relevance to a query using the llamacpp server.
-
-    Args:
-        reranking_request: The reranking request containing query and documents
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Reranking response from the llamacpp server containing ranked documents and scores
-    """
-    base_url = llamacpp_address(telemetry.port)
-
-    try:
-        # Convert Pydantic model to dict and exclude unset/null values
-        request_dict = reranking_request.model_dump(
-            exclude_unset=True, exclude_none=True
-        )
-
-        # Call the reranking endpoint directly since it's not supported by the OpenAI API
-        response = requests.post(
-            f"{base_url}/rerank",
-            json=request_dict,
-        )
-        response.raise_for_status()
-        return response.json()
-
-    except Exception as e:
-        logging.error("Error during reranking: %s", str(e))
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Reranking error: {str(e)}",
-        ) from e
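
Taken together, the llamacpp.py changes replace the old module-level helpers (server_load, chat_completion, completion, embeddings, reranking) with a LlamaServer class built on the new WrappedServer base class (see lemonade/tools/server/wrapped_server.py in the file list above). Below is a minimal wiring sketch inferred only from the signatures visible in this diff; the WrappedServer internals (_choose_port, _wait_for_load, _log_subprocess_output), the PullConfig fields, and the default values used here are assumptions taken from their call sites, not verified against the released package:

# Hypothetical usage sketch based on this diff; the helper name
# load_llamacpp_model and the defaults (backend="vulkan", ctx_size=4096)
# are illustrative assumptions, not part of the package API.
from lemonade_server.pydantic_models import PullConfig
from lemonade.tools.server.llamacpp import LlamaServer


def load_llamacpp_model(
    model_config: PullConfig, backend: str = "vulkan", ctx_size: int = 4096
) -> LlamaServer:
    # The wrapper registers a LlamaTelemetry instance via the WrappedServer base class
    server = LlamaServer(backend=backend)

    # Install or update the llama-server binary for the chosen backend
    server.install_server()

    # Resolve the GGUF from the local cache first, otherwise download it
    snapshot_files = server.download_model(
        config_checkpoint=model_config.checkpoint,
        config_mmproj=model_config.mmproj,
    )

    # Launch llama-server, attempting GPU first and retrying on CPU if the
    # process exits (mirrors the fallback logic shown in the diff)
    server._launch_server_subprocess(
        model_config=model_config,
        snapshot_files=snapshot_files,
        ctx_size=ctx_size,
    )
    return server

Note that the OpenAI-proxy endpoints removed from this file (chat/completions, embeddings, reranking) do not disappear from the package; judging by the file list, that request handling appears to move into the shared serve.py and wrapped_server.py layer.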