lemonade-sdk 8.1.0__py3-none-any.whl → 8.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/common/inference_engines.py +62 -77
- lemonade/common/system_info.py +61 -44
- lemonade/tools/llamacpp/load.py +13 -4
- lemonade/tools/llamacpp/utils.py +222 -54
- lemonade/tools/oga/load.py +3 -3
- lemonade/tools/server/llamacpp.py +30 -53
- lemonade/tools/server/serve.py +54 -104
- lemonade/tools/server/static/styles.css +203 -0
- lemonade/tools/server/static/webapp.html +507 -71
- lemonade/tools/server/tray.py +4 -2
- lemonade/tools/server/utils/thread.py +2 -4
- lemonade/version.py +1 -1
- lemonade_install/install.py +25 -2
- {lemonade_sdk-8.1.0.dist-info → lemonade_sdk-8.1.1.dist-info}/METADATA +45 -6
- {lemonade_sdk-8.1.0.dist-info → lemonade_sdk-8.1.1.dist-info}/RECORD +22 -22
- lemonade_server/cli.py +79 -26
- lemonade_server/server_models.json +26 -1
- {lemonade_sdk-8.1.0.dist-info → lemonade_sdk-8.1.1.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.0.dist-info → lemonade_sdk-8.1.1.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.0.dist-info → lemonade_sdk-8.1.1.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.0.dist-info → lemonade_sdk-8.1.1.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.0.dist-info → lemonade_sdk-8.1.1.dist-info}/top_level.txt +0 -0
lemonade/tools/llamacpp/utils.py
CHANGED
@@ -10,21 +10,105 @@ import requests
 import lemonade.common.printing as printing
 from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
 
-
+from lemonade.common.system_info import get_system_info
 
+from dotenv import set_key, load_dotenv
 
-
+LLAMA_VERSION_VULKAN = "b6097"
+LLAMA_VERSION_ROCM = "b1021"
+
+
+def identify_rocm_arch_from_name(device_name: str) -> str | None:
+    """
+    Identify the appropriate ROCm target architecture based on the device name
+    """
+    device_name_lower = device_name.lower()
+    if "radeon" not in device_name_lower:
+        return None
+
+    # Check iGPUs
+    # STX Halo iGPUs (gfx1151 architecture)
+    # Radeon 8050S Graphics / Radeon 8060S Graphics
+    target_arch = None
+    if any(halo_igpu in device_name_lower.lower() for halo_igpu in ["8050s", "8060s"]):
+        return "gfx1151"
+
+    # Check dGPUs
+    # RDNA4 GPUs (gfx120X architecture)
+    # AMD Radeon AI PRO R9700, AMD Radeon RX 9070 XT, AMD Radeon RX 9070 GRE,
+    # AMD Radeon RX 9070, AMD Radeon RX 9060 XT
+    if any(
+        rdna4_gpu in device_name_lower.lower()
+        for rdna4_gpu in ["r9700", "9060", "9070"]
+    ):
+        return "gfx120X"
+
+    # RDNA3 GPUs (gfx110X architecture)
+    # AMD Radeon PRO V710, AMD Radeon PRO W7900 Dual Slot, AMD Radeon PRO W7900,
+    # AMD Radeon PRO W7800 48GB, AMD Radeon PRO W7800, AMD Radeon PRO W7700,
+    # AMD Radeon RX 7900 XTX, AMD Radeon RX 7900 XT, AMD Radeon RX 7900 GRE,
+    # AMD Radeon RX 7800 XT, AMD Radeon RX 7700 XT
+    elif any(
+        rdna3_gpu in device_name_lower.lower()
+        for rdna3_gpu in ["7700", "7800", "7900", "v710"]
+    ):
+        return "gfx110X"
+
+    return None
+
+
+def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
+    """
+    Identify the appropriate ROCm target architecture based on the device info
+    Returns tuple of (architecture, gpu_type) where gpu_type is 'igpu' or 'dgpu'
+    """
+
+    # Check for integrated and discrete AMD GPUs
+    system_info = get_system_info()
+    amd_igpu = system_info.get_amd_igpu_device()
+    amd_dgpu = system_info.get_amd_dgpu_devices()
+    target_arch = None
+    gpu_count = 0
+    for gpu in [amd_igpu] + amd_dgpu:
+        if gpu.get("available") and gpu.get("name"):
+            gpu_count += 1
+            target_arch = identify_rocm_arch_from_name(gpu["name"].lower())
+            if target_arch:
+                break
+
+    # Get HIP ID based on the number of GPUs available
+    # Here, we assume that the iGPU will always show up before the dGPUs (if available)
+    # We also assume that selecting the dGPU is preferred over the iGPU
+    # Multiple GPUs are not supported at the moment
+    hip_id = str(gpu_count - 1)
+
+    return target_arch, hip_id
+
+
+def get_llama_version(backend: str) -> str:
+    """
+    Select the appropriate llama.cpp version based on the backend
+    """
+    if backend == "rocm":
+        return LLAMA_VERSION_ROCM
+    elif backend == "vulkan":
+        return LLAMA_VERSION_VULKAN
+    else:
+        raise ValueError(f"Unsupported backend: {backend}")
+
+
+def get_llama_folder_path(backend: str):
     """
     Get path for llama.cpp platform-specific executables folder
     """
-    return os.path.join(os.path.dirname(sys.executable), "
+    return os.path.join(os.path.dirname(sys.executable), backend, "llama_server")
 
 
-def get_llama_exe_path(exe_name):
+def get_llama_exe_path(exe_name: str, backend: str):
     """
     Get path to platform-specific llama-server executable
     """
-    base_dir = get_llama_folder_path()
+    base_dir = get_llama_folder_path(backend)
     if platform.system().lower() == "windows":
         return os.path.join(base_dir, f"{exe_name}.exe")
     else: # Linux/Ubuntu
@@ -37,33 +121,33 @@ def get_llama_exe_path(exe_name):
         return os.path.join(base_dir, exe_name)
 
 
-def get_llama_server_exe_path():
+def get_llama_server_exe_path(backend: str):
     """
     Get path to platform-specific llama-server executable
     """
-    return get_llama_exe_path("llama-server")
+    return get_llama_exe_path("llama-server", backend)
 
 
-def get_llama_cli_exe_path():
+def get_llama_cli_exe_path(backend: str):
     """
     Get path to platform-specific llama-cli executable
     """
-    return get_llama_exe_path("llama-cli")
+    return get_llama_exe_path("llama-cli", backend)
 
 
-def get_version_txt_path():
+def get_version_txt_path(backend: str):
     """
     Get path to text file that contains version information
     """
-    return os.path.join(get_llama_folder_path(), "version.txt")
+    return os.path.join(get_llama_folder_path(backend), "version.txt")
 
 
-def get_llama_installed_version():
+def get_llama_installed_version(backend: str):
     """
     Gets version of installed llama.cpp
     Returns None if llama.cpp is not installed
     """
-    version_txt_path = get_version_txt_path()
+    version_txt_path = get_version_txt_path(backend)
     if os.path.exists(version_txt_path):
         with open(version_txt_path, "r", encoding="utf-8") as f:
             llama_installed_version = f.read()
@@ -71,24 +155,48 @@ def get_llama_installed_version():
     return None
 
 
-def get_binary_url_and_filename(
+def get_binary_url_and_filename(backend: str, target_arch: str = None):
     """
-    Get the appropriate
+    Get the appropriate binary URL and filename based on platform and backend
+
+    Args:
+        backend: Backend to use
     """
     system = platform.system().lower()
 
-    if
-
-
-
+    if backend == "rocm":
+
+        # ROCm support from lemonade-sdk/llamacpp-rocm
+        repo = "lemonade-sdk/llamacpp-rocm"
+        version = LLAMA_VERSION_ROCM
+        if system == "windows":
+            filename = f"llama-{version}-windows-rocm-{target_arch}-x64.zip"
+        elif system == "linux":
+            filename = f"llama-{version}-ubuntu-rocm-{target_arch}-x64.zip"
+        else:
+            raise NotImplementedError(
+                f"Platform {system} not supported for ROCm llamacpp. Supported: Windows, Ubuntu Linux"
+            )
+
+    elif backend == "vulkan":
+        # Original Vulkan support from ggml-org/llama.cpp
+        repo = "ggml-org/llama.cpp"
+        version = LLAMA_VERSION_VULKAN
+        if system == "windows":
+            filename = f"llama-{version}-bin-win-vulkan-x64.zip"
+        elif system == "linux":
+            filename = f"llama-{version}-bin-ubuntu-vulkan-x64.zip"
+        else:
+            raise NotImplementedError(
+                f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
+            )
     else:
+        supported_backends = ["vulkan", "rocm"]
         raise NotImplementedError(
-            f"
+            f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
         )
 
-    url =
-        f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
-    )
+    url = f"https://github.com/{repo}/releases/download/{version}/{filename}"
     return url, filename
 
 
@@ -122,7 +230,7 @@ def validate_platform_support():
         )
 
 
-def install_llamacpp():
+def install_llamacpp(backend):
     """
     Installs or upgrades llama.cpp binaries if needed
     """
@@ -130,56 +238,108 @@ def install_llamacpp():
     # Exception will be thrown if platform is not supported
     validate_platform_support()
 
-
-
+    version = get_llama_version(backend)
+
+    # Get platform-specific paths at runtime
+    llama_server_exe_dir = get_llama_folder_path(backend)
+    llama_server_exe_path = get_llama_server_exe_path(backend)
 
     # Check whether the llamacpp install needs an upgrade
-
-
+    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
+    backend_txt_path = os.path.join(llama_server_exe_dir, "backend.txt")
+
+    logging.info(f"Using backend: {backend}")
+
+    if os.path.exists(version_txt_path) and os.path.exists(backend_txt_path):
+        with open(version_txt_path, "r", encoding="utf-8") as f:
+            llamacpp_installed_version = f.read().strip()
+        with open(backend_txt_path, "r", encoding="utf-8") as f:
+            llamacpp_installed_backend = f.read().strip()
+
+        if (
+            llamacpp_installed_version != version
+            or llamacpp_installed_backend != backend
+        ):
             # Remove the existing install, which will trigger a new install
             # in the next code block
-            shutil.rmtree(
+            shutil.rmtree(llama_server_exe_dir)
+    elif os.path.exists(version_txt_path):
+        # Old installation without backend tracking - remove to upgrade
+        shutil.rmtree(llama_server_exe_dir)
 
     # Download llama.cpp server if it isn't already available
-    if not os.path.exists(
-
-
-
-
+    if not os.path.exists(llama_server_exe_path):
+
+        # Create the directory
+        os.makedirs(llama_server_exe_dir, exist_ok=True)
+
+        # Identify the target architecture (only needed for ROCm)
+        target_arch = None
+        if backend == "rocm":
+            # Identify the target architecture
+            target_arch, hip_id = identify_rocm_arch_and_hip_id()
+            if not target_arch:
+                system = platform.system().lower()
+                if system == "linux":
+                    hint = (
+                        "Hint: If you think your device is supported, "
+                        "running `sudo update-pciids` may help identify your hardware."
+                    )
+                else:
+                    hint = ""
+                raise ValueError(
+                    "ROCm backend selected but no compatible ROCm target architecture found. "
+                    "See https://github.com/lemonade-sdk/lemonade?tab=readme-ov-file#supported-configurations "
+                    f"for supported configurations. {hint}"
+                )
+
+            # Set HIP_VISIBLE_DEVICES=0 for igpu, =1 for dgpu
+            env_file_path = os.path.join(llama_server_exe_dir, ".env")
+            set_key(env_file_path, "HIP_VISIBLE_DEVICES", hip_id)
+
+        # Direct download for Vulkan/ROCm
+        llama_archive_url, filename = get_binary_url_and_filename(backend, target_arch)
+        llama_archive_path = os.path.join(llama_server_exe_dir, filename)
+        logging.info(f"Downloading llama.cpp server from {llama_archive_url}")
 
-        with requests.get(
+        with requests.get(llama_archive_url, stream=True) as r:
             r.raise_for_status()
-            with open(
+            with open(llama_archive_path, "wb") as f:
                 for chunk in r.iter_content(chunk_size=8192):
                     f.write(chunk)
 
-
-
-
-
+        logging.info(f"Extracting {filename} to {llama_server_exe_dir}")
+        if filename.endswith(".zip"):
+            with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
+                zip_ref.extractall(llama_server_exe_dir)
+        else:
+            raise NotImplementedError(f"Unsupported archive format: {filename}")
 
         # Make executable on Linux - need to update paths after extraction
         if platform.system().lower() == "linux":
             # Re-get the paths since extraction might have changed the directory structure
-
-                get_llama_server_exe_path(),
-                get_llama_cli_exe_path(),
-            ]
-
-
-
+            exe_paths = [
+                (get_llama_server_exe_path(backend), "llama-server"),
+                (get_llama_cli_exe_path(backend), "llama-cli"),
+            ]
+
+            for exe_path, exe_name in exe_paths:
+                if os.path.exists(exe_path):
+                    os.chmod(exe_path, 0o755)
+                    logging.info(f"Set executable permissions for {exe_path}")
                 else:
                     logging.warning(
-                        f"Could not find
+                        f"Could not find {exe_name} executable at {exe_path}"
                     )
 
-        # Save version
-        with open(
-            vf.write(
+        # Save version and backend info
+        with open(version_txt_path, "w", encoding="utf-8") as vf:
+            vf.write(version)
+        with open(backend_txt_path, "w", encoding="utf-8") as bf:
+            bf.write(backend)
 
-        # Delete
-        os.remove(
-        logging.info("Cleaned up zip file")
+        # Delete the archive file
+        os.remove(llama_archive_path)
 
 
 def parse_checkpoint(checkpoint: str) -> tuple[str, str | None]:
@@ -525,6 +685,14 @@ class LlamaCppAdapter(ModelAdapter):
         try:
             # Set up environment with library path for Linux
            env = os.environ.copy()
+
+            # Load environment variables from .env file in the executable directory
+            exe_dir = os.path.dirname(self.executable)
+            env_file_path = os.path.join(exe_dir, ".env")
+            if os.path.exists(env_file_path):
+                load_dotenv(env_file_path, override=True)
+                env.update(os.environ)
+
             if self.lib_dir and os.name != "nt": # Not Windows
                 current_ld_path = env.get("LD_LIBRARY_PATH", "")
                 if current_ld_path:
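For orientation, the sketch below is not part of the package; it only shows how the backend-aware helpers added above compose. The import path follows this file, and the Radeon device string is an arbitrary example:

from lemonade.tools.llamacpp.utils import (
    get_binary_url_and_filename,
    get_llama_version,
    identify_rocm_arch_from_name,
)

# A Strix Halo iGPU name maps to gfx1151, so the ROCm build gets selected.
device_name = "AMD Radeon 8060S Graphics"  # arbitrary example device
target_arch = identify_rocm_arch_from_name(device_name)  # -> "gfx1151"
backend = "rocm" if target_arch else "vulkan"

version = get_llama_version(backend)  # "b1021" for rocm, "b6097" for vulkan
url, filename = get_binary_url_and_filename(backend, target_arch)
print(version, filename, url)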
lemonade/tools/oga/load.py
CHANGED
@@ -633,9 +633,9 @@ class OgaLoad(FirstTool):
             model_generate.generate_hybrid_model(
                 input_model=input_model_path,
                 output_dir=output_model_path,
-
-
-
+                script_option="jit_npu",
+                mode="bf16",
+                dml_only=False,
             )
         except Exception as e:
             raise RuntimeError(
lemonade/tools/server/llamacpp.py
CHANGED
@@ -1,5 +1,4 @@
 import os
-import sys
 import logging
 import time
 import subprocess
@@ -9,6 +8,7 @@ import platform
 
 import requests
 from tabulate import tabulate
+from dotenv import load_dotenv
 from fastapi import HTTPException, status
 from fastapi.responses import StreamingResponse
 
@@ -29,8 +29,6 @@ from lemonade.tools.llamacpp.utils import (
     download_gguf,
 )
 
-LLAMA_VERSION = "b5787"
-
 
 def llamacpp_address(port: int) -> str:
     """
@@ -45,45 +43,6 @@ def llamacpp_address(port: int) -> str:
     return f"http://127.0.0.1:{port}/v1"
 
 
-def get_llama_server_paths():
-    """
-    Get platform-specific paths for llama server directory and executable
-    """
-    base_dir = os.path.join(os.path.dirname(sys.executable), "llama_server")
-
-    if platform.system().lower() == "windows":
-        return base_dir, os.path.join(base_dir, "llama-server.exe")
-    else: # Linux/Ubuntu
-        # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
-        build_bin_path = os.path.join(base_dir, "build", "bin", "llama-server")
-        if os.path.exists(build_bin_path):
-            return base_dir, build_bin_path
-        else:
-            # Fallback to root directory
-            return base_dir, os.path.join(base_dir, "llama-server")
-
-
-def get_binary_url_and_filename(version):
-    """
-    Get the appropriate binary URL and filename based on platform
-    """
-    system = platform.system().lower()
-
-    if system == "windows":
-        filename = f"llama-{version}-bin-win-vulkan-x64.zip"
-    elif system == "linux":
-        filename = f"llama-{version}-bin-ubuntu-vulkan-x64.zip"
-    else:
-        raise NotImplementedError(
-            f"Platform {system} not supported for llamacpp. Supported: Windows, Ubuntu Linux"
-        )
-
-    url = (
-        f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
-    )
-    return url, filename
-
-
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
@@ -125,7 +84,7 @@ class LlamaTelemetry:
            device_count = int(vulkan_match.group(1))
            if device_count > 0:
                logging.info(
-                    f"GPU acceleration active: {device_count}
+                    f"GPU acceleration active: {device_count} device(s) "
                    "detected by llama-server"
                )
                return
@@ -236,6 +195,8 @@ def _launch_llama_subprocess(
     snapshot_files: dict,
     use_gpu: bool,
     telemetry: LlamaTelemetry,
+    backend: str,
+    ctx_size: int,
     supports_embeddings: bool = False,
     supports_reranking: bool = False,
 ) -> subprocess.Popen:
@@ -246,6 +207,7 @@ def _launch_llama_subprocess(
         snapshot_files: Dictionary of model files to load
         use_gpu: Whether to use GPU acceleration
         telemetry: Telemetry object for tracking performance metrics
+        backend: Backend to use (e.g., 'vulkan', 'rocm')
         supports_embeddings: Whether the model supports embeddings
         supports_reranking: Whether the model supports reranking
 
@@ -254,10 +216,16 @@ def _launch_llama_subprocess(
     """
 
     # Get the current executable path (handles both Windows and Ubuntu structures)
-    exe_path = get_llama_server_exe_path()
+    exe_path = get_llama_server_exe_path(backend)
 
     # Build the base command
-    base_command = [
+    base_command = [
+        exe_path,
+        "-m",
+        snapshot_files["variant"],
+        "--ctx-size",
+        str(ctx_size),
+    ]
     if "mmproj" in snapshot_files:
         base_command.extend(["--mmproj", snapshot_files["mmproj"]])
     if not use_gpu:
@@ -288,6 +256,15 @@ def _launch_llama_subprocess(
 
     # Set up environment with library path for Linux
     env = os.environ.copy()
+
+    # Load environment variables from .env file in the executable directory
+    exe_dir = os.path.dirname(exe_path)
+    env_file_path = os.path.join(exe_dir, ".env")
+    if os.path.exists(env_file_path):
+        load_dotenv(env_file_path, override=True)
+        env.update(os.environ)
+        logging.debug(f"Loaded environment variables from {env_file_path}")
+
     if platform.system().lower() == "linux":
         lib_dir = os.path.dirname(exe_path) # Same directory as the executable
         current_ld_path = env.get("LD_LIBRARY_PATH", "")
@@ -320,18 +297,17 @@ def _launch_llama_subprocess(
     return process
 
 
-def server_load(
+def server_load(
+    model_config: PullConfig, telemetry: LlamaTelemetry, backend: str, ctx_size: int
+):
     # Install and/or update llama.cpp if needed
     try:
-        install_llamacpp()
+        install_llamacpp(backend)
     except NotImplementedError as e:
         raise HTTPException(
             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
         )
 
-    # Get platform-specific paths at runtime
-    llama_server_exe_path = get_llama_server_exe_path()
-
     # Download the gguf to the hugging face cache
     snapshot_files = download_gguf(model_config.checkpoint, model_config.mmproj)
     logging.debug(f"GGUF file paths: {snapshot_files}")
@@ -342,14 +318,13 @@ def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
     supports_embeddings = "embeddings" in model_info.get("labels", [])
     supports_reranking = "reranking" in model_info.get("labels", [])
 
-    # Start the llama-serve.exe process
-    logging.debug(f"Using llama_server for GGUF model: {llama_server_exe_path}")
-
     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
         snapshot_files,
         use_gpu=True,
         telemetry=telemetry,
+        backend=backend,
+        ctx_size=ctx_size,
         supports_embeddings=supports_embeddings,
         supports_reranking=supports_reranking,
     )
@@ -374,6 +349,8 @@ def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
             snapshot_files,
             use_gpu=False,
             telemetry=telemetry,
+            backend=backend,
+            ctx_size=ctx_size,
             supports_embeddings=supports_embeddings,
             supports_reranking=supports_reranking,
         )
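The .env round-trip introduced in this release (install_llamacpp persists HIP_VISIBLE_DEVICES with set_key, and LlamaCppAdapter and _launch_llama_subprocess re-read it with load_dotenv before spawning llama-server) can be sketched in isolation with python-dotenv; the temporary directory below only stands in for the per-backend llama_server folder:

import os
import tempfile

from dotenv import load_dotenv, set_key

# Install time: persist the chosen HIP device next to the binaries.
exe_dir = tempfile.mkdtemp()  # stand-in for the <backend>/llama_server folder
env_file_path = os.path.join(exe_dir, ".env")
set_key(env_file_path, "HIP_VISIBLE_DEVICES", "1")  # "1" selects the dGPU, per the comments above

# Launch time: re-read the file into the environment handed to the subprocess.
env = os.environ.copy()
if os.path.exists(env_file_path):
    load_dotenv(env_file_path, override=True)
    env.update(os.environ)

assert env["HIP_VISIBLE_DEVICES"] == "1"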