lemonade-sdk 8.1.4__tar.gz → 8.1.6__tar.gz

This diff represents the content of publicly available package versions as released to their public registries and is provided for informational purposes only.

Potentially problematic release: this version of lemonade-sdk might be problematic.

Files changed (87)
  1. {lemonade_sdk-8.1.4/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.6}/PKG-INFO +1 -1
  2. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/utils.py +5 -1
  3. lemonade_sdk-8.1.6/src/lemonade/tools/server/llamacpp.py +255 -0
  4. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/serve.py +15 -22
  5. lemonade_sdk-8.1.6/src/lemonade/tools/server/wrapped_server.py +485 -0
  6. lemonade_sdk-8.1.6/src/lemonade/version.py +1 -0
  7. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6/src/lemonade_sdk.egg-info}/PKG-INFO +1 -1
  8. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
  9. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_server/cli.py +19 -15
  10. lemonade_sdk-8.1.6/src/lemonade_server/model_manager.py +455 -0
  11. lemonade_sdk-8.1.4/src/lemonade/tools/server/llamacpp.py +0 -653
  12. lemonade_sdk-8.1.4/src/lemonade/version.py +0 -1
  13. lemonade_sdk-8.1.4/src/lemonade_server/model_manager.py +0 -274
  14. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/LICENSE +0 -0
  15. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/NOTICE.md +0 -0
  16. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/README.md +0 -0
  17. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/pyproject.toml +0 -0
  18. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/setup.cfg +0 -0
  19. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/setup.py +0 -0
  20. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/__init__.py +0 -0
  21. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/api.py +0 -0
  22. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/cache.py +0 -0
  23. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/cli.py +0 -0
  24. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/__init__.py +0 -0
  25. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/build.py +0 -0
  26. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/cli_helpers.py +0 -0
  27. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/exceptions.py +0 -0
  28. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/filesystem.py +0 -0
  29. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/inference_engines.py +0 -0
  30. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/network.py +0 -0
  31. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/printing.py +0 -0
  32. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/status.py +0 -0
  33. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/system_info.py +0 -0
  34. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/common/test_helpers.py +0 -0
  35. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/profilers/__init__.py +0 -0
  36. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/profilers/memory_tracker.py +0 -0
  37. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/profilers/profiler.py +0 -0
  38. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/sequence.py +0 -0
  39. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/state.py +0 -0
  40. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/__init__.py +0 -0
  41. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/accuracy.py +0 -0
  42. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/adapter.py +0 -0
  43. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/bench.py +0 -0
  44. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/bench.py +0 -0
  45. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/load.py +0 -0
  46. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/utils.py +0 -0
  47. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/humaneval.py +0 -0
  48. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/bench.py +0 -0
  49. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/load.py +0 -0
  50. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/management_tools.py +0 -0
  51. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/mmlu.py +0 -0
  52. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/__init__.py +0 -0
  53. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/bench.py +0 -0
  54. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/load.py +0 -0
  55. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/utils.py +0 -0
  56. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/perplexity.py +0 -0
  57. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/prompt.py +0 -0
  58. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/__init__.py +0 -0
  59. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/quark_load.py +0 -0
  60. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/quark_quantize.py +0 -0
  61. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/__init__.py +0 -0
  62. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/llm_report.py +0 -0
  63. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/table.py +0 -0
  64. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/__init__.py +0 -0
  65. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/favicon.ico +0 -0
  66. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/chat.js +0 -0
  67. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/model-settings.js +0 -0
  68. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/models.js +0 -0
  69. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/shared.js +0 -0
  70. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/styles.css +0 -0
  71. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/webapp.html +0 -0
  72. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/tool_calls.py +0 -0
  73. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/tray.py +0 -0
  74. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/port.py +0 -0
  75. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/system_tray.py +0 -0
  76. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/thread.py +0 -0
  77. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/webapp.py +0 -0
  78. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade/tools/tool.py +0 -0
  79. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_install/__init__.py +0 -0
  80. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_install/install.py +0 -0
  81. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  82. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  83. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/requires.txt +0 -0
  84. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
  85. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_server/pydantic_models.py +0 -0
  86. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_server/server_models.json +0 -0
  87. {lemonade_sdk-8.1.4 → lemonade_sdk-8.1.6}/src/lemonade_server/settings.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.1.4
+Version: 8.1.6
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.14
@@ -346,7 +346,11 @@ def install_llamacpp(backend):
 
     # Identify and set HIP ID
     if backend == "rocm":
-        hip_id = identify_hip_id()
+        try:
+            hip_id = identify_hip_id()
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            hip_id = 0
+            logging.warning(f"Error identifying HIP ID: {e}. Falling back to 0.")
         env_file_path = os.path.join(llama_server_exe_dir, ".env")
         set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))
 
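Note on the hunk above: if identify_hip_id() raises on a ROCm install, the installer now falls back to device 0 instead of aborting, and still records the choice in the .env file next to llama-server. The new server wrapper (next file) reloads that .env before spawning the subprocess. A minimal sketch of that handoff, using python-dotenv as the diff does; the path below is a throwaway placeholder rather than the real install directory:

    import os
    import tempfile
    from dotenv import load_dotenv, set_key

    # Placeholder path; the real file sits next to the llama-server executable
    env_file_path = os.path.join(tempfile.gettempdir(), ".env")
    open(env_file_path, "a", encoding="utf-8").close()  # ensure the file exists

    # Install time: persist the chosen device (0 when detection falls back)
    set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(0))

    # Launch time: reload the .env so the spawned llama-server inherits the selection
    load_dotenv(env_file_path, override=True)
    print(os.environ["HIP_VISIBLE_DEVICES"])  # -> "0"

Because load_dotenv is called with override=True at launch, any stale HIP_VISIBLE_DEVICES in the parent environment is replaced by the value recorded at install time.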
@@ -0,0 +1,255 @@
+import os
+import logging
+import subprocess
+import re
+import threading
+import platform
+
+from dotenv import load_dotenv
+
+from lemonade_server.pydantic_models import (
+    PullConfig,
+)
+from lemonade.tools.llamacpp.utils import (
+    get_llama_server_exe_path,
+    install_llamacpp,
+    download_gguf,
+)
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+
+
+class LlamaTelemetry(WrappedServerTelemetry):
+    """
+    Manages telemetry data collection and display for llama server.
+    """
+
+    def parse_telemetry_line(self, line: str):
+        """
+        Parse telemetry data from llama server output lines.
+        """
+
+        # Parse Vulkan device detection
+        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
+        if vulkan_match:
+            device_count = int(vulkan_match.group(1))
+            if device_count > 0:
+                logging.info(
+                    f"GPU acceleration active: {device_count} device(s) "
+                    "detected by llama-server"
+                )
+            return
+
+        # Parse prompt evaluation line
+        prompt_match = re.search(
+            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
+            line,
+        )
+        if prompt_match:
+            prompt_time_ms = float(prompt_match.group(1))
+            input_tokens = int(prompt_match.group(2))
+
+            self.prompt_eval_time = prompt_time_ms / 1000.0
+            self.input_tokens = input_tokens
+            self.time_to_first_token = prompt_time_ms / 1000.0
+            return
+
+        # Parse generation evaluation line
+        eval_match = re.search(
+            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
+            line,
+        )
+        if eval_match:
+            eval_time_ms = float(eval_match.group(1))
+            output_tokens = int(eval_match.group(2))
+            tokens_per_second = float(eval_match.group(3))
+
+            self.eval_time = eval_time_ms / 1000.0
+            self.output_tokens = output_tokens
+            self.tokens_per_second = tokens_per_second
+            return
+
+
+class LlamaServer(WrappedServer):
+    def __init__(self, backend: str):
+        self.telemetry = LlamaTelemetry()
+        self.backend = backend
+        super().__init__(server_name="llama-server", telemetry=self.telemetry)
+
+    def install_server(self, backend=None):
+        """
+        Install the wrapped server
+        """
+        install_llamacpp(self.backend)
+
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        """
+        Download a model for the wrapper server
+        """
+        return download_gguf(
+            config_checkpoint=config_checkpoint,
+            config_mmproj=config_mmproj,
+            do_not_upgrade=do_not_upgrade,
+        )
+
+    def _launch_device_backend_subprocess(
+        self,
+        snapshot_files: dict,
+        use_gpu: bool,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ) -> subprocess.Popen:
+        """
+        Launch llama server subprocess with appropriate configuration.
+
+        Args:
+            snapshot_files: Dictionary of model files to load
+            use_gpu: Whether to use GPU acceleration
+            telemetry: Telemetry object for tracking performance metrics
+            backend: Backend to use (e.g., 'vulkan', 'rocm')
+            supports_embeddings: Whether the model supports embeddings
+            supports_reranking: Whether the model supports reranking
+
+        Returns:
+            Subprocess handle for the llama server
+        """
+
+        # Get the current executable path (handles both Windows and Ubuntu structures)
+        exe_path = get_llama_server_exe_path(self.backend)
+
+        # Build the base command
+        base_command = [
+            exe_path,
+            "-m",
+            snapshot_files["variant"],
+            "--ctx-size",
+            str(ctx_size),
+        ]
+
+        # Lock random seed for deterministic behavior in CI
+        if os.environ.get("LEMONADE_CI_MODE"):
+            base_command.extend(["--seed", "42"])
+            logging.info(f"Seed applied to base command: {base_command}")
+
+        if "mmproj" in snapshot_files:
+            base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+            if not use_gpu:
+                base_command.extend(["--no-mmproj-offload"])
+
+        # Find a port, and save it in the telemetry object for future reference
+        # by other functions
+        self.choose_port()
+
+        # Add port and jinja to enable tool use
+        base_command.extend(["--port", str(self.port), "--jinja"])
+
+        # Disable jinja for gpt-oss-120b on Vulkan
+        if (
+            self.backend == "vulkan"
+            and "gpt-oss-120b" in snapshot_files["variant"].lower()
+        ):
+            base_command.remove("--jinja")
+            logging.warning(
+                "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+                "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+                "The model cannot use tools. If needed, use the ROCm backend instead."
+            )
+
+        # Use legacy reasoning formatting, since not all apps support the new
+        # reasoning_content field
+        base_command.extend(["--reasoning-format", "none"])
+
+        # Add embeddings support if the model supports it
+        if supports_embeddings:
+            base_command.append("--embeddings")
+
+        # Add reranking support if the model supports it
+        if supports_reranking:
+            base_command.append("--reranking")
+
+        # Configure GPU layers: 99 for GPU, 0 for CPU-only
+        ngl_value = "99" if use_gpu else "0"
+        command = base_command + ["-ngl", ngl_value]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        # Load environment variables from .env file in the executable directory
+        exe_dir = os.path.dirname(exe_path)
+        env_file_path = os.path.join(exe_dir, ".env")
+        if os.path.exists(env_file_path):
+            load_dotenv(env_file_path, override=True)
+            env.update(os.environ)
+            logging.debug(f"Loaded environment variables from {env_file_path}")
+
+        if platform.system().lower() == "linux":
+            lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+            current_ld_path = env.get("LD_LIBRARY_PATH", "")
+            if current_ld_path:
+                env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+            else:
+                env["LD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+
+        # Start subprocess with output capture
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
+        )
+
+        # Start background thread to log subprocess output
+        device_type = "GPU" if use_gpu else "CPU"
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=(f"LLAMA SERVER {device_type}",),
+            daemon=True,
+        ).start()
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        # Attempt loading on GPU first
+        self._launch_device_backend_subprocess(
+            snapshot_files,
+            use_gpu=True,
+            ctx_size=ctx_size,
+            supports_embeddings=supports_embeddings,
+            supports_reranking=supports_reranking,
+        )
+
+        # Check the /health endpoint until GPU server is ready
+        self._wait_for_load()
+
+        # If loading on GPU failed, try loading on CPU
+        if self.process.poll():
+            logging.warning(
+                f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
+            )
+
+            if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
+                # Used for testing, when the test should fail if GPU didn't work
+                raise Exception("llamacpp GPU loading failed")
+
+            self._launch_device_backend_subprocess(
+                snapshot_files,
+                use_gpu=False,
+                ctx_size=ctx_size,
+                supports_embeddings=supports_embeddings,
+                supports_reranking=supports_reranking,
+            )
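The new module above replaces the 653-line procedural src/lemonade/tools/server/llamacpp.py from 8.1.4 (entry 11 in the file list): the old module-level helpers become methods on LlamaServer, while process management, port selection, health checks, and request proxying move into the shared WrappedServer base class in the new wrapped_server.py. A rough usage sketch mirroring how serve.py drives it in the hunks below; the backend string and ctx_size are illustrative values, and the config/request objects stand in for the ones serve.py already holds:

    from lemonade.tools.server.llamacpp import LlamaServer
    from lemonade_server.pydantic_models import PullConfig


    def serve_one_chat(model_config: PullConfig, chat_request) -> dict:
        # Illustrative lifecycle only; serve.py keeps the server alive across requests
        server = LlamaServer(backend="vulkan")  # backend string is a placeholder
        server.load(model_config=model_config, ctx_size=4096, do_not_upgrade=True)
        try:
            # OpenAI-style requests are proxied to the wrapped llama-server subprocess
            response = server.chat_completion(chat_request)
            telemetry = server.telemetry.get_telemetry_data()
            return {"response": response, "telemetry": telemetry}
        finally:
            server.process.terminate()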
@@ -9,7 +9,6 @@ import tempfile
 import traceback
 from typing import Optional, Union
 import json
-import subprocess
 from pathlib import Path
 
 from fastapi import FastAPI, HTTPException, status, Request
@@ -47,7 +46,8 @@ from openai.types.responses import (
 )
 
 import lemonade.api as lemonade_api
-import lemonade.tools.server.llamacpp as llamacpp
+from lemonade.tools.server.wrapped_server import WrappedServer
+from lemonade.tools.server.llamacpp import LlamaServer
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.webapp import get_webapp_html
 from lemonade.tools.server.utils.port import lifespan
@@ -232,11 +232,8 @@ class Server:
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()
 
-        # Subprocess handle for llama_server.exe
-        self.llama_server_process: subprocess.Popen = None
-
-        # Telemetry instance for llama server
-        self.llama_telemetry = llamacpp.LlamaTelemetry()
+        # Subprocess handle for wrapped instance of llama_server.exe, etc.
+        self.wrapped_server: WrappedServer = None
 
     def setup_routes(self, api_prefixes: list[str]):
        for prefix in api_prefixes:
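wrapped_server.py itself (+485 lines, entry 5 in the file list) is not included in this excerpt, so the base-class surface can only be inferred from the calls made in the new llamacpp.py above and in the serve.py hunks here. A hedged stub of that inferred interface, for orientation only; it is not the actual 8.1.6 source and the real class certainly defines more:

    import subprocess
    from typing import Optional


    class WrappedServerTelemetry:
        # Inferred surface: subclasses override parse_telemetry_line();
        # serve.py reads get_telemetry_data()
        def parse_telemetry_line(self, line: str) -> None: ...
        def get_telemetry_data(self) -> dict: ...


    class WrappedServer:
        # Inferred from call sites only; not the actual 8.1.6 source
        process: Optional[subprocess.Popen]
        port: int
        telemetry: WrappedServerTelemetry

        def __init__(self, server_name: str, telemetry: WrappedServerTelemetry): ...

        # Hooks overridden by LlamaServer
        def install_server(self, backend=None): ...
        def download_model(self, config_checkpoint, config_mmproj=None, do_not_upgrade=False) -> dict: ...

        # Lifecycle and proxying used by serve.py
        def load(self, model_config, ctx_size: int, do_not_upgrade: bool = False): ...
        def completion(self, request): ...
        def chat_completion(self, request): ...
        def embeddings(self, request): ...
        def reranking(self, request): ...

        # Helpers used by LlamaServer
        def choose_port(self) -> None: ...
        def _wait_for_load(self) -> None: ...
        def _log_subprocess_output(self, prefix: str) -> None: ...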
@@ -521,7 +518,7 @@ class Server:
             await self.load_llm(lc)
 
         if self.llm_loaded.recipe == "llamacpp":
-            return llamacpp.completion(completion_request, self.llama_telemetry)
+            return self.wrapped_server.completion(completion_request)
 
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
@@ -656,9 +653,7 @@ class Server:
             await self.load_llm(lc)
 
         if self.llm_loaded.recipe == "llamacpp":
-            return llamacpp.chat_completion(
-                chat_completion_request, self.llama_telemetry
-            )
+            return self.wrapped_server.chat_completion(chat_completion_request)
 
         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
@@ -861,7 +856,7 @@ class Server:
 
         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return llamacpp.embeddings(embeddings_request, self.llama_telemetry)
+                return self.wrapped_server.embeddings(embeddings_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has embeddings label
                 model_info = ModelManager().supported_models.get(
@@ -884,7 +879,7 @@ class Server:
 
     async def reranking(self, reranking_request: RerankingRequest):
         """
-        Rerank documents based on their relevance to a query using the llamacpp server.
+        Rerank documents based on their relevance to a query.
         """
         # Initialize load config from reranking request
         lc = LoadConfig(model_name=reranking_request.model)
@@ -894,7 +889,7 @@ class Server:
 
         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return llamacpp.reranking(reranking_request, self.llama_telemetry)
+                return self.wrapped_server.reranking(reranking_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has reranking label
                 model_info = ModelManager().supported_models.get(
@@ -1287,7 +1282,7 @@ class Server:
         """
         # If using llama server, get telemetry from the telemetry instance
         if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
-            return self.llama_telemetry.get_telemetry_data()
+            return self.wrapped_server.telemetry.get_telemetry_data()
 
         # For built-in server, use the existing telemetry
         return {
@@ -1466,9 +1461,9 @@ class Server:
         ):
             if (
                 self.llm_loaded.recipe == "llamacpp"
-                and self.llama_server_process.poll()
+                and self.wrapped_server.process.poll()
             ):
-                # llama-server process has gone away for some reason, so we should
+                # wrapped server process has gone away for some reason, so we should
                 # proceed with loading to get it back
                 pass
             else:
@@ -1484,12 +1479,10 @@ class Server:
         logging.info(f"Loading llm: {config.model_name}")
         try:
             if config_to_use.recipe == "llamacpp":
-                self.llama_server_process = llamacpp.server_load(
+                self.wrapped_server = LlamaServer(self.llamacpp_backend)
+                self.wrapped_server.load(
                     model_config=config_to_use,
-                    telemetry=self.llama_telemetry,
-                    backend=self.llamacpp_backend,
                     ctx_size=self.ctx_size,
-                    # Models should only upgrade when using the pull endpoint
                     do_not_upgrade=True,
                 )
 
@@ -1530,7 +1523,7 @@ class Server:
             await self._generate_semaphore.acquire()
 
         if self.llm_loaded.recipe == "llamacpp":
-            self.llama_server_process.terminate()
+            self.wrapped_server.process.terminate()
 
         self.llm_loaded = None
         self.tokenizer = None
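The telemetry that serve.py now reads through wrapped_server.telemetry.get_telemetry_data() is scraped from llama-server stdout by LlamaTelemetry.parse_telemetry_line() in the new llamacpp.py above. A small sketch of that parsing; the sample lines only approximate llama-server's timing output and are not verbatim logs:

    from lemonade.tools.server.llamacpp import LlamaTelemetry

    telemetry = LlamaTelemetry()

    # Sample lines approximating llama-server timing output (not verbatim logs)
    telemetry.parse_telemetry_line(
        "prompt eval time =   120.00 ms /    24 tokens (5.00 ms per token, 200.00 tokens per second)"
    )
    telemetry.parse_telemetry_line(
        "eval time =  1500.00 ms /    75 tokens (20.00 ms per token, 50.00 tokens per second)"
    )

    print(telemetry.time_to_first_token)  # 0.12 (seconds)
    print(telemetry.tokens_per_second)    # 50.0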