lemonade_sdk-9.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/__init__.py +5 -0
- lemonade/api.py +180 -0
- lemonade/cache.py +92 -0
- lemonade/cli.py +173 -0
- lemonade/common/__init__.py +0 -0
- lemonade/common/build.py +176 -0
- lemonade/common/cli_helpers.py +139 -0
- lemonade/common/exceptions.py +98 -0
- lemonade/common/filesystem.py +368 -0
- lemonade/common/inference_engines.py +408 -0
- lemonade/common/network.py +93 -0
- lemonade/common/printing.py +110 -0
- lemonade/common/status.py +471 -0
- lemonade/common/system_info.py +1411 -0
- lemonade/common/test_helpers.py +28 -0
- lemonade/profilers/__init__.py +1 -0
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/profilers/memory_tracker.py +259 -0
- lemonade/profilers/profiler.py +58 -0
- lemonade/sequence.py +363 -0
- lemonade/state.py +159 -0
- lemonade/tools/__init__.py +1 -0
- lemonade/tools/accuracy.py +432 -0
- lemonade/tools/adapter.py +114 -0
- lemonade/tools/bench.py +302 -0
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +305 -0
- lemonade/tools/huggingface/bench.py +187 -0
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/huggingface/utils.py +359 -0
- lemonade/tools/humaneval.py +264 -0
- lemonade/tools/llamacpp/bench.py +255 -0
- lemonade/tools/llamacpp/load.py +222 -0
- lemonade/tools/llamacpp/utils.py +1260 -0
- lemonade/tools/management_tools.py +319 -0
- lemonade/tools/mmlu.py +319 -0
- lemonade/tools/oga/__init__.py +0 -0
- lemonade/tools/oga/bench.py +120 -0
- lemonade/tools/oga/load.py +804 -0
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/oga/utils.py +462 -0
- lemonade/tools/perplexity.py +147 -0
- lemonade/tools/prompt.py +263 -0
- lemonade/tools/report/__init__.py +0 -0
- lemonade/tools/report/llm_report.py +203 -0
- lemonade/tools/report/table.py +899 -0
- lemonade/tools/server/__init__.py +0 -0
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +320 -0
- lemonade/tools/server/serve.py +2123 -0
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/index.html +279 -0
- lemonade/tools/server/static/js/chat.js +1059 -0
- lemonade/tools/server/static/js/model-settings.js +183 -0
- lemonade/tools/server/static/js/models.js +1395 -0
- lemonade/tools/server/static/js/shared.js +556 -0
- lemonade/tools/server/static/logs.html +191 -0
- lemonade/tools/server/static/styles.css +2654 -0
- lemonade/tools/server/static/webapp.html +321 -0
- lemonade/tools/server/tool_calls.py +153 -0
- lemonade/tools/server/tray.py +664 -0
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/port.py +77 -0
- lemonade/tools/server/utils/thread.py +85 -0
- lemonade/tools/server/utils/windows_tray.py +408 -0
- lemonade/tools/server/webapp.py +34 -0
- lemonade/tools/server/wrapped_server.py +559 -0
- lemonade/tools/tool.py +374 -0
- lemonade/version.py +1 -0
- lemonade_install/__init__.py +1 -0
- lemonade_install/install.py +239 -0
- lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
- lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
- lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
- lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
- lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
- lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
- lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
- lemonade_server/cli.py +805 -0
- lemonade_server/model_manager.py +758 -0
- lemonade_server/pydantic_models.py +159 -0
- lemonade_server/server_models.json +643 -0
- lemonade_server/settings.py +39 -0
lemonade/tools/server/flm.py
@@ -0,0 +1,133 @@
+import os
+import logging
+import subprocess
+import time
+import threading
+
+import requests
+
+from lemonade_server.pydantic_models import (
+    PullConfig,
+    ChatCompletionRequest,
+)
+
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+from lemonade.tools.flm.utils import install_flm, download_flm_model
+
+
+class FlmTelemetry(WrappedServerTelemetry):
+    """
+    Manages telemetry data collection and display for FLM server.
+    """
+
+    def parse_telemetry_line(self, line: str):
+        """
+        Parse telemetry data from FLM server output lines.
+
+        Note: as of FLM 0.9.10, no telemetry data is provided by the server CLI.
+        This function is required to be implemented, so we leave it empty
+        as a placeholder for now.
+        """
+
+        return
+
+
+class FlmServer(WrappedServer):
+    """
+    Routes OpenAI API requests to an FLM server instance and returns the result
+    back to Lemonade Server.
+    """
+
+    def __init__(self):
+        self.flm_model_name = None
+        super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
+
+    def address(self):
+        return f"http://localhost:{self.port}/v1"
+
+    def install_server(self):
+        """
+        Check if FLM is installed and at minimum version.
+        If not, download and run the GUI installer, then wait for completion.
+        """
+        install_flm()
+
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        download_flm_model(config_checkpoint, config_mmproj, do_not_upgrade)
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        self._choose_port()
+
+        # Keep track of the FLM model name so that we can use it later
+        self.flm_model_name = model_config.checkpoint
+
+        command = [
+            "flm",
+            "serve",
+            f"{self.flm_model_name}",
+            "--ctx-len",
+            str(ctx_size),
+            "--port",
+            str(self.port),
+        ]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
+        )
+
+        # Start background thread to log subprocess output
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=("FLM SERVER",),
+            daemon=True,
+        ).start()
+
+    def _wait_for_load(self):
+        """
+        FLM doesn't seem to have a health API, so we'll use the "list local models"
+        API to check if the server is up.
+        """
+        status_code = None
+        while not self.process.poll() and status_code != 200:
+            health_url = f"http://localhost:{self.port}/api/tags"
+            try:
+                health_response = requests.get(health_url)
+            except requests.exceptions.ConnectionError:
+                logging.debug(
+                    "Not able to connect to %s yet, will retry", self.server_name
+                )
+            else:
+                status_code = health_response.status_code
+                logging.debug(
+                    "Testing %s readiness (will retry until ready), result: %s",
+                    self.server_name,
+                    health_response.json(),
+                )
+            time.sleep(1)
+
+    def chat_completion(self, chat_completion_request: ChatCompletionRequest):
+        # FLM requires the correct model name to be in the request
+        # (whereas llama-server ignores the model name field in the request)
+        chat_completion_request.model = self.flm_model_name
+
+        return super().chat_completion(chat_completion_request)
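For context, `_wait_for_load` above has no dedicated health endpoint to query, so it polls FLM's `/api/tags` ("list local models") route until it answers with HTTP 200. Below is a minimal standalone sketch of that polling pattern; it is not part of the package diff, the port value is a hypothetical example, and the timeout is an added assumption (the real loop instead stops when the FLM subprocess exits).

import time
import requests

def wait_until_flm_ready(port: int, timeout_s: float = 60.0) -> bool:
    """Poll FLM's list-local-models route until it returns HTTP 200 (sketch only)."""
    url = f"http://localhost:{port}/api/tags"  # same route FlmServer._wait_for_load polls
    deadline = time.monotonic() + timeout_s    # hypothetical timeout, not in the original loop
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.exceptions.ConnectionError:
            pass  # server not accepting connections yet; retry
        time.sleep(1)
    return False

# Example (hypothetical port): wait_until_flm_ready(8000)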
lemonade/tools/server/llamacpp.py
@@ -0,0 +1,320 @@
+import os
+import logging
+import subprocess
+import re
+import threading
+import platform
+
+from dotenv import load_dotenv
+from fastapi import HTTPException, status
+
+from lemonade_server.pydantic_models import (
+    PullConfig,
+)
+from lemonade.tools.llamacpp.utils import (
+    get_llama_server_exe_path,
+    install_llamacpp,
+    download_gguf,
+    resolve_local_gguf_model,
+    parse_checkpoint,
+)
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+
+# Embedding model batch configuration set to 8192 as default
+EMBEDDING_CTX_SIZE = 8192
+EMBEDDING_BATCH_SIZE = 8192
+EMBEDDING_UBATCH_SIZE = 8192
+
+
+class LlamaTelemetry(WrappedServerTelemetry):
+    """
+    Manages telemetry data collection and display for llama server.
+    """
+
+    def parse_telemetry_line(self, line: str):
+        """
+        Parse telemetry data from llama server output lines.
+        """
+
+        if "vk::PhysicalDevice::createDevice: ErrorExtensionNotPresent" in line:
+            msg = (
+                "Your AMD GPU driver version is not compatible with this software.\n"
+                "Please update and try again: "
+                "https://www.amd.com/en/support/download/drivers.html"
+            )
+            logging.error(msg)
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=msg,
+            )
+        elif "error" in line.lower():
+            logging.error(line)
+
+        # Parse Vulkan device detection
+        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
+        if vulkan_match:
+            device_count = int(vulkan_match.group(1))
+            if device_count > 0:
+                logging.info(
+                    f"GPU acceleration active: {device_count} device(s) "
+                    "detected by llama-server"
+                )
+            return
+
+        # Parse prompt evaluation line
+        prompt_match = re.search(
+            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
+            line,
+        )
+        if prompt_match:
+            prompt_time_ms = float(prompt_match.group(1))
+            input_tokens = int(prompt_match.group(2))
+
+            self.prompt_eval_time = prompt_time_ms / 1000.0
+            self.input_tokens = input_tokens
+            self.time_to_first_token = prompt_time_ms / 1000.0
+            return
+
+        # Parse generation evaluation line
+        eval_match = re.search(
+            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
+            line,
+        )
+        if eval_match:
+            eval_time_ms = float(eval_match.group(1))
+            output_tokens = int(eval_match.group(2))
+            tokens_per_second = float(eval_match.group(3))
+
+            self.eval_time = eval_time_ms / 1000.0
+            self.output_tokens = output_tokens
+            self.tokens_per_second = tokens_per_second
+            return
+
+
+class LlamaServer(WrappedServer):
+    def __init__(self, backend: str):
+        self.backend = backend
+        super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())
+
+    def install_server(self, backend=None):
+        """
+        Install the wrapped server
+        """
+        install_llamacpp(self.backend)
+
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        """
+        Download a model for the wrapper server.
+        First checks local cache, then downloads from internet if needed.
+        """
+        # If it's a direct file path, just return it
+
+        if os.path.exists(config_checkpoint):
+            result = {"variant": config_checkpoint}
+            if config_mmproj:
+                result["mmproj"] = config_mmproj
+            return result
+
+        # Try to resolve from local cache first to avoid unnecessary downloads
+        checkpoint, variant = parse_checkpoint(config_checkpoint)
+        local_result = resolve_local_gguf_model(checkpoint, variant, config_mmproj)
+
+        if local_result:
+            return local_result
+
+        # Not found locally - download from internet
+        return download_gguf(
+            config_checkpoint=config_checkpoint,
+            config_mmproj=config_mmproj,
+            do_not_upgrade=do_not_upgrade,
+        )
+
+    def _launch_device_backend_subprocess(
+        self,
+        snapshot_files: dict,
+        use_gpu: bool,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ) -> subprocess.Popen:
+        """
+        Launch llama server subprocess with appropriate configuration.
+
+        Args:
+            snapshot_files: Dictionary of model files to load
+            use_gpu: Whether to use GPU acceleration
+            telemetry: Telemetry object for tracking performance metrics
+            backend: Backend to use (e.g., 'vulkan', 'rocm', 'cpu')
+            supports_embeddings: Whether the model supports embeddings
+            supports_reranking: Whether the model supports reranking
+
+        Returns:
+            Subprocess handle for the llama server
+        """
+
+        # Get the current executable path (handles both Windows and Ubuntu structures)
+        exe_path = get_llama_server_exe_path(self.backend)
+
+        # For embedding models, use a larger context size to support longer individual
+        # strings. Embedding requests can include multiple strings in a batch, and each
+        # string needs to fit within the context window.
+        if supports_embeddings and ctx_size < EMBEDDING_CTX_SIZE:
+            ctx_size = EMBEDDING_CTX_SIZE
+
+        # Build the base command
+        base_command = [
+            exe_path,
+            "-m",
+            snapshot_files["variant"],
+            "--ctx-size",
+            str(ctx_size),
+        ]
+
+        # Lock random seed for deterministic behavior in CI
+        if os.environ.get("LEMONADE_CI_MODE"):
+            base_command.extend(["--seed", "42"])
+            logging.info(f"Seed applied to base command: {base_command}")
+
+        if "mmproj" in snapshot_files:
+            base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+            if not use_gpu:
+                base_command.extend(["--no-mmproj-offload"])
+
+        # Find a port, and save it in the telemetry object for future reference
+        # by other functions
+        self._choose_port()
+
+        # Add port and jinja to enable tool use
+        base_command.extend(["--port", str(self.port), "--jinja"])
+
+        # Enable context shift and avoid attention sink issues by preserving the initial tokens
+        # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+        # Only add context-shift for backends that support it
+        context_shift_supported_backends = ["vulkan", "rocm"]
+        if self.backend in context_shift_supported_backends:
+            base_command.extend(["--context-shift", "--keep", "16"])
+        else:
+            # For backends that don't support context-shift (e.g., Metal), just use keep
+            base_command.extend(["--keep", "16"])
+            logging.debug(
+                f"Skipped --context-shift for backend: {self.backend} (not supported)"
+            )
+
+        # Use legacy reasoning formatting, since not all apps support the new
+        # reasoning_content field
+        base_command.extend(["--reasoning-format", "auto"])
+
+        # Add embeddings support if the model supports it
+        if supports_embeddings:
+            # For embedding models, set batch sizes to handle multiple documents in a single request
+            # batch-size: logical batch size (total tokens across all sequences)
+            # ubatch-size: physical batch size (tokens processed in a single forward pass)
+            base_command.extend(
+                [
+                    "--embeddings",
+                    "--batch-size",
+                    str(EMBEDDING_BATCH_SIZE),
+                    "--ubatch-size",
+                    str(EMBEDDING_UBATCH_SIZE),
+                ]
+            )
+
+        # Add reranking support if the model supports it
+        if supports_reranking:
+            base_command.append("--reranking")
+
+        # Configure GPU layers: 99 for GPU, 0 for CPU-only
+        ngl_value = "99" if use_gpu else "0"
+        command = base_command + ["-ngl", ngl_value]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        # Load environment variables from .env file in the executable directory
+        exe_dir = os.path.dirname(exe_path)
+        env_file_path = os.path.join(exe_dir, ".env")
+        if os.path.exists(env_file_path):
+            load_dotenv(env_file_path, override=False)
+            env.update(os.environ)
+            logging.debug(f"Loaded environment variables from {env_file_path}")
+
+        system = platform.system().lower()
+        if system == "linux":
+            lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+            current_ld_path = env.get("LD_LIBRARY_PATH", "")
+            if current_ld_path:
+                env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+            else:
+                env["LD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+        elif system == "darwin":
+            lib_dir = os.path.dirname(exe_path)
+            current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+            if current_dyld_path:
+                env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+            else:
+                env["DYLD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
+
+        # Start subprocess with output capture
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
+        )
+
+        # Start background thread to log subprocess output
+        device_type = "GPU" if use_gpu else "CPU"
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=(f"LLAMA SERVER {device_type}",),
+            daemon=True,
+        ).start()
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        # Attempt loading on GPU first
+        self._launch_device_backend_subprocess(
+            snapshot_files,
+            use_gpu=True,
+            ctx_size=ctx_size,
+            supports_embeddings=supports_embeddings,
+            supports_reranking=supports_reranking,
+        )
+
+        # Check the /health endpoint until GPU server is ready
+        self._wait_for_load()
+
+        # If loading on GPU failed, try loading on CPU
+        if self.process.poll():
+            logging.warning(
+                f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
+            )
+
+            if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
+                # Used for testing, when the test should fail if GPU didn't work
+                raise Exception("llamacpp GPU loading failed")
+
+            self._launch_device_backend_subprocess(
+                snapshot_files,
+                use_gpu=False,
+                ctx_size=ctx_size,
+                supports_embeddings=supports_embeddings,
+                supports_reranking=supports_reranking,
+            )
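For reference, the timing regexes in `LlamaTelemetry.parse_telemetry_line` can be exercised in isolation. The sketch below is not part of the package diff: it reuses the prompt-eval pattern from the class above against a synthetic llama-server-style timing line, and the sample text and numbers are made up for illustration.

import re

# Same pattern as the prompt-eval branch of LlamaTelemetry.parse_telemetry_line above
PROMPT_EVAL_RE = re.compile(
    r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
    r"([\d.]+)\s*tokens per second"
)

# Synthetic log line in the format the regex expects (values are invented)
sample = "prompt eval time =     123.45 ms /    37 tokens (3.34 ms per token, 299.72 tokens per second)"

match = PROMPT_EVAL_RE.search(sample)
if match:
    prompt_eval_time = float(match.group(1)) / 1000.0   # seconds, mirroring the class above
    input_tokens = int(match.group(2))
    prompt_tokens_per_second = float(match.group(3))
    print(prompt_eval_time, input_tokens, prompt_tokens_per_second)  # ~0.123 s, 37 tokens, ~299.7 tok/s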