lemonade-sdk 8.1.5__tar.gz → 8.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of lemonade-sdk has been flagged as potentially problematic.
- {lemonade_sdk-8.1.5/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.6}/PKG-INFO +1 -1
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/utils.py +5 -1
- lemonade_sdk-8.1.6/src/lemonade/tools/server/llamacpp.py +255 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/serve.py +15 -22
- lemonade_sdk-8.1.6/src/lemonade/tools/server/wrapped_server.py +485 -0
- lemonade_sdk-8.1.6/src/lemonade/version.py +1 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6/src/lemonade_sdk.egg-info}/PKG-INFO +1 -1
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_server/cli.py +18 -9
- lemonade_sdk-8.1.6/src/lemonade_server/model_manager.py +455 -0
- lemonade_sdk-8.1.5/src/lemonade/tools/server/llamacpp.py +0 -653
- lemonade_sdk-8.1.5/src/lemonade/version.py +0 -1
- lemonade_sdk-8.1.5/src/lemonade_server/model_manager.py +0 -274
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/LICENSE +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/NOTICE.md +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/README.md +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/pyproject.toml +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/setup.cfg +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/setup.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/api.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/cache.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/cli.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/inference_engines.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/network.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/status.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/system_info.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/state.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/bench.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/bench.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/load.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/utils.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/bench.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/load.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/bench.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/load.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/utils.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/prompt.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/quark_load.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/quark_quantize.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/table.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/favicon.ico +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/chat.js +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/model-settings.js +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/models.js +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/shared.js +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/styles.css +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/webapp.html +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/tray.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/port.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/system_tray.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/thread.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/webapp.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_install/install.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/requires.txt +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_server/pydantic_models.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_server/server_models.json +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_server/settings.py +0 -0
src/lemonade/tools/llamacpp/utils.py (+5 -1):

```diff
@@ -346,7 +346,11 @@ def install_llamacpp(backend):
 
     # Identify and set HIP ID
     if backend == "rocm":
-        hip_id = identify_hip_id()
+        try:
+            hip_id = identify_hip_id()
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            hip_id = 0
+            logging.warning(f"Error identifying HIP ID: {e}. Falling back to 0.")
         env_file_path = os.path.join(llama_server_exe_dir, ".env")
         set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))
 
```
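The `HIP_VISIBLE_DEVICES` value written to the `.env` file here is read back at launch time by the new `LlamaServer` class in the next file, via `load_dotenv`. Below is a minimal sketch of that round trip using python-dotenv; the install directory is a hypothetical stand-in, not a path from the package.

```python
# Minimal sketch of the .env round trip; paths are illustrative, not from the package.
import os
import tempfile

from dotenv import load_dotenv, set_key

# Hypothetical stand-in for the llama-server install directory.
exe_dir = tempfile.mkdtemp()
env_file_path = os.path.join(exe_dir, ".env")
open(env_file_path, "a", encoding="utf-8").close()  # ensure the file exists

# Install time (utils.py side): pin llama-server to one ROCm device, with 0 as the fallback.
set_key(env_file_path, "HIP_VISIBLE_DEVICES", "0")

# Launch time (llamacpp.py side): merge the .env values into the subprocess environment.
load_dotenv(env_file_path, override=True)
env = os.environ.copy()
assert env["HIP_VISIBLE_DEVICES"] == "0"
```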
src/lemonade/tools/server/llamacpp.py (new file, +255 lines):

```python
import os
import logging
import subprocess
import re
import threading
import platform

from dotenv import load_dotenv

from lemonade_server.pydantic_models import (
    PullConfig,
)
from lemonade.tools.llamacpp.utils import (
    get_llama_server_exe_path,
    install_llamacpp,
    download_gguf,
)
from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer


class LlamaTelemetry(WrappedServerTelemetry):
    """
    Manages telemetry data collection and display for llama server.
    """

    def parse_telemetry_line(self, line: str):
        """
        Parse telemetry data from llama server output lines.
        """

        # Parse Vulkan device detection
        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
        if vulkan_match:
            device_count = int(vulkan_match.group(1))
            if device_count > 0:
                logging.info(
                    f"GPU acceleration active: {device_count} device(s) "
                    "detected by llama-server"
                )
            return

        # Parse prompt evaluation line
        prompt_match = re.search(
            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
            r"([\d.]+)\s*tokens per second",
            line,
        )
        if prompt_match:
            prompt_time_ms = float(prompt_match.group(1))
            input_tokens = int(prompt_match.group(2))

            self.prompt_eval_time = prompt_time_ms / 1000.0
            self.input_tokens = input_tokens
            self.time_to_first_token = prompt_time_ms / 1000.0
            return

        # Parse generation evaluation line
        eval_match = re.search(
            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
            r"([\d.]+)\s*tokens per second",
            line,
        )
        if eval_match:
            eval_time_ms = float(eval_match.group(1))
            output_tokens = int(eval_match.group(2))
            tokens_per_second = float(eval_match.group(3))

            self.eval_time = eval_time_ms / 1000.0
            self.output_tokens = output_tokens
            self.tokens_per_second = tokens_per_second
            return


class LlamaServer(WrappedServer):
    def __init__(self, backend: str):
        self.telemetry = LlamaTelemetry()
        self.backend = backend
        super().__init__(server_name="llama-server", telemetry=self.telemetry)

    def install_server(self, backend=None):
        """
        Install the wrapped server
        """
        install_llamacpp(self.backend)

    def download_model(
        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
    ) -> dict:
        """
        Download a model for the wrapper server
        """
        return download_gguf(
            config_checkpoint=config_checkpoint,
            config_mmproj=config_mmproj,
            do_not_upgrade=do_not_upgrade,
        )

    def _launch_device_backend_subprocess(
        self,
        snapshot_files: dict,
        use_gpu: bool,
        ctx_size: int,
        supports_embeddings: bool = False,
        supports_reranking: bool = False,
    ) -> subprocess.Popen:
        """
        Launch llama server subprocess with appropriate configuration.

        Args:
            snapshot_files: Dictionary of model files to load
            use_gpu: Whether to use GPU acceleration
            telemetry: Telemetry object for tracking performance metrics
            backend: Backend to use (e.g., 'vulkan', 'rocm')
            supports_embeddings: Whether the model supports embeddings
            supports_reranking: Whether the model supports reranking

        Returns:
            Subprocess handle for the llama server
        """

        # Get the current executable path (handles both Windows and Ubuntu structures)
        exe_path = get_llama_server_exe_path(self.backend)

        # Build the base command
        base_command = [
            exe_path,
            "-m",
            snapshot_files["variant"],
            "--ctx-size",
            str(ctx_size),
        ]

        # Lock random seed for deterministic behavior in CI
        if os.environ.get("LEMONADE_CI_MODE"):
            base_command.extend(["--seed", "42"])
            logging.info(f"Seed applied to base command: {base_command}")

        if "mmproj" in snapshot_files:
            base_command.extend(["--mmproj", snapshot_files["mmproj"]])
            if not use_gpu:
                base_command.extend(["--no-mmproj-offload"])

        # Find a port, and save it in the telemetry object for future reference
        # by other functions
        self.choose_port()

        # Add port and jinja to enable tool use
        base_command.extend(["--port", str(self.port), "--jinja"])

        # Disable jinja for gpt-oss-120b on Vulkan
        if (
            self.backend == "vulkan"
            and "gpt-oss-120b" in snapshot_files["variant"].lower()
        ):
            base_command.remove("--jinja")
            logging.warning(
                "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
                "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
                "The model cannot use tools. If needed, use the ROCm backend instead."
            )

        # Use legacy reasoning formatting, since not all apps support the new
        # reasoning_content field
        base_command.extend(["--reasoning-format", "none"])

        # Add embeddings support if the model supports it
        if supports_embeddings:
            base_command.append("--embeddings")

        # Add reranking support if the model supports it
        if supports_reranking:
            base_command.append("--reranking")

        # Configure GPU layers: 99 for GPU, 0 for CPU-only
        ngl_value = "99" if use_gpu else "0"
        command = base_command + ["-ngl", ngl_value]

        # Set up environment with library path for Linux
        env = os.environ.copy()

        # Load environment variables from .env file in the executable directory
        exe_dir = os.path.dirname(exe_path)
        env_file_path = os.path.join(exe_dir, ".env")
        if os.path.exists(env_file_path):
            load_dotenv(env_file_path, override=True)
            env.update(os.environ)
            logging.debug(f"Loaded environment variables from {env_file_path}")

        if platform.system().lower() == "linux":
            lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
            current_ld_path = env.get("LD_LIBRARY_PATH", "")
            if current_ld_path:
                env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
            else:
                env["LD_LIBRARY_PATH"] = lib_dir
            logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")

        # Start subprocess with output capture
        self.process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding="utf-8",
            errors="replace",
            bufsize=1,
            env=env,
        )

        # Start background thread to log subprocess output
        device_type = "GPU" if use_gpu else "CPU"
        threading.Thread(
            target=self._log_subprocess_output,
            args=(f"LLAMA SERVER {device_type}",),
            daemon=True,
        ).start()

    def _launch_server_subprocess(
        self,
        model_config: PullConfig,
        snapshot_files: dict,
        ctx_size: int,
        supports_embeddings: bool = False,
        supports_reranking: bool = False,
    ):

        # Attempt loading on GPU first
        self._launch_device_backend_subprocess(
            snapshot_files,
            use_gpu=True,
            ctx_size=ctx_size,
            supports_embeddings=supports_embeddings,
            supports_reranking=supports_reranking,
        )

        # Check the /health endpoint until GPU server is ready
        self._wait_for_load()

        # If loading on GPU failed, try loading on CPU
        if self.process.poll():
            logging.warning(
                f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
            )

            if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
                # Used for testing, when the test should fail if GPU didn't work
                raise Exception("llamacpp GPU loading failed")

            self._launch_device_backend_subprocess(
                snapshot_files,
                use_gpu=False,
                ctx_size=ctx_size,
                supports_embeddings=supports_embeddings,
                supports_reranking=supports_reranking,
            )
```
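To illustrate the telemetry parsing in `LlamaTelemetry.parse_telemetry_line`, here is the prompt-eval regex from the file above applied to a llama-server-style timing line. The sample log line is an assumed example for demonstration; only the regex comes from the diff.

```python
# Assumed sample of llama-server timing output; only the regex below comes from the diff.
import re

line = (
    "prompt eval time =      94.33 ms /    10 tokens "
    "(    9.43 ms per token,   106.01 tokens per second)"
)

prompt_match = re.search(
    r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
    r"([\d.]+)\s*tokens per second",
    line,
)
if prompt_match:
    time_to_first_token = float(prompt_match.group(1)) / 1000.0  # seconds
    input_tokens = int(prompt_match.group(2))
    print(time_to_first_token, input_tokens)  # 0.09433 10
```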
src/lemonade/tools/server/serve.py (+15 -22):

```diff
@@ -9,7 +9,6 @@ import tempfile
 import traceback
 from typing import Optional, Union
 import json
-import subprocess
 from pathlib import Path
 
 from fastapi import FastAPI, HTTPException, status, Request
@@ -47,7 +46,8 @@ from openai.types.responses import (
 )
 
 import lemonade.api as lemonade_api
-
+from lemonade.tools.server.wrapped_server import WrappedServer
+from lemonade.tools.server.llamacpp import LlamaServer
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.webapp import get_webapp_html
 from lemonade.tools.server.utils.port import lifespan
@@ -232,11 +232,8 @@ class Server:
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()
 
-        # Subprocess handle for llama_server.exe
-        self.
-
-        # Telemetry instance for llama server
-        self.llama_telemetry = llamacpp.LlamaTelemetry()
+        # Subprocess handle for wrapped instance of llama_server.exe, etc.
+        self.wrapped_server: WrappedServer = None
 
     def setup_routes(self, api_prefixes: list[str]):
         for prefix in api_prefixes:
@@ -521,7 +518,7 @@ class Server:
         await self.load_llm(lc)
 
         if self.llm_loaded.recipe == "llamacpp":
-            return
+            return self.wrapped_server.completion(completion_request)
 
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
@@ -656,9 +653,7 @@ class Server:
         await self.load_llm(lc)
 
         if self.llm_loaded.recipe == "llamacpp":
-            return
-                chat_completion_request, self.llama_telemetry
-            )
+            return self.wrapped_server.chat_completion(chat_completion_request)
 
         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
@@ -861,7 +856,7 @@ class Server:
 
         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return
+                return self.wrapped_server.embeddings(embeddings_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has embeddings label
                 model_info = ModelManager().supported_models.get(
@@ -884,7 +879,7 @@ class Server:
 
     async def reranking(self, reranking_request: RerankingRequest):
         """
-        Rerank documents based on their relevance to a query
+        Rerank documents based on their relevance to a query.
         """
         # Initialize load config from reranking request
         lc = LoadConfig(model_name=reranking_request.model)
@@ -894,7 +889,7 @@ class Server:
 
         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return
+                return self.wrapped_server.reranking(reranking_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has reranking label
                 model_info = ModelManager().supported_models.get(
@@ -1287,7 +1282,7 @@ class Server:
         """
         # If using llama server, get telemetry from the telemetry instance
        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
-            return self.
+            return self.wrapped_server.telemetry.get_telemetry_data()
 
         # For built-in server, use the existing telemetry
         return {
@@ -1466,9 +1461,9 @@ class Server:
         ):
             if (
                 self.llm_loaded.recipe == "llamacpp"
-                and self.
+                and self.wrapped_server.process.poll()
             ):
-                #
+                # wrapped server process has gone away for some reason, so we should
                 # proceed with loading to get it back
                 pass
             else:
@@ -1484,12 +1479,10 @@ class Server:
         logging.info(f"Loading llm: {config.model_name}")
         try:
             if config_to_use.recipe == "llamacpp":
-                self.
+                self.wrapped_server = LlamaServer(self.llamacpp_backend)
+                self.wrapped_server.load(
                     model_config=config_to_use,
-                    telemetry=self.llama_telemetry,
-                    backend=self.llamacpp_backend,
                     ctx_size=self.ctx_size,
-                    # Models should only upgrade when using the pull endpoint
                     do_not_upgrade=True,
                 )
 
@@ -1530,7 +1523,7 @@ class Server:
         await self._generate_semaphore.acquire()
 
         if self.llm_loaded.recipe == "llamacpp":
-            self.
+            self.wrapped_server.process.terminate()
 
         self.llm_loaded = None
         self.tokenizer = None
```

(Several removed lines above appear truncated, e.g. `self.`, `return`, because their content was cut off in the published diff view; they are reproduced as shown.)
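The new `wrapped_server.py` (+485 lines) is not included in the hunks above, so the exact `WrappedServer` API is not visible here. Based purely on the call sites in `serve.py` and the overrides in `LlamaServer`, its surface looks roughly like the sketch below; treat it as an inference, not the actual implementation.

```python
# Inferred interface sketch only: reconstructed from call sites in serve.py and
# llamacpp.py above, not from the actual wrapped_server.py added in 8.1.6.
import subprocess
from abc import ABC, abstractmethod
from typing import Optional


class WrappedServerTelemetry(ABC):
    """Collects telemetry by parsing the wrapped server's log output."""

    @abstractmethod
    def parse_telemetry_line(self, line: str) -> None: ...

    def get_telemetry_data(self) -> dict:
        # Consumed by Server.get_telemetry via wrapped_server.telemetry
        return {}


class WrappedServer(ABC):
    """Manages a subprocess that serves an OpenAI-compatible HTTP API."""

    def __init__(self, server_name: str, telemetry: WrappedServerTelemetry):
        self.server_name = server_name
        self.telemetry = telemetry
        self.process: Optional[subprocess.Popen] = None  # polled/terminated by serve.py
        self.port: Optional[int] = None  # set by choose_port()

    # Implemented by subclasses such as LlamaServer:
    @abstractmethod
    def install_server(self, backend=None) -> None: ...

    @abstractmethod
    def download_model(
        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
    ) -> dict: ...

    @abstractmethod
    def _launch_server_subprocess(
        self, model_config, snapshot_files, ctx_size, **kwargs
    ) -> None: ...

    # Called by serve.py: install, download, launch, then proxy requests.
    def load(self, model_config, ctx_size, do_not_upgrade=True) -> None: ...
    def completion(self, completion_request): ...
    def chat_completion(self, chat_completion_request): ...
    def embeddings(self, embeddings_request): ...
    def reranking(self, reranking_request): ...
    def choose_port(self) -> None: ...
    def _wait_for_load(self) -> None: ...
    def _log_subprocess_output(self, prefix: str) -> None: ...
```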