lemonade-sdk 8.1.10__py3-none-any.whl → 8.1.11__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Note: this version of lemonade-sdk has been flagged as a potentially problematic release.
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +255 -0
- lemonade/tools/llamacpp/utils.py +58 -10
- lemonade/tools/server/flm.py +137 -0
- lemonade/tools/server/llamacpp.py +23 -5
- lemonade/tools/server/serve.py +260 -135
- lemonade/tools/server/static/js/chat.js +165 -82
- lemonade/tools/server/static/js/models.js +87 -54
- lemonade/tools/server/static/js/shared.js +5 -3
- lemonade/tools/server/static/logs.html +47 -0
- lemonade/tools/server/static/styles.css +159 -8
- lemonade/tools/server/static/webapp.html +28 -10
- lemonade/tools/server/tray.py +94 -38
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
- lemonade/tools/server/webapp.py +4 -1
- lemonade/tools/server/wrapped_server.py +91 -25
- lemonade/version.py +1 -1
- lemonade_install/install.py +25 -2
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/METADATA +9 -6
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/RECORD +30 -25
- lemonade_server/cli.py +103 -14
- lemonade_server/model_manager.py +186 -45
- lemonade_server/pydantic_models.py +25 -1
- lemonade_server/server_models.json +162 -62
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/top_level.txt +0 -0
lemonade/tools/flm/__init__.py
ADDED
@@ -0,0 +1 @@
+# FLM (FastFlowLM) utilities for Lemonade SDK
lemonade/tools/flm/utils.py
ADDED
@@ -0,0 +1,255 @@
+"""
+FLM (FastFlowLM) utilities for installation, version checking, and model management.
+"""
+
+import os
+import logging
+import subprocess
+import tempfile
+import time
+from typing import List, Optional
+
+import requests
+from packaging.version import Version
+
+
+FLM_MINIMUM_VERSION = "0.9.10"
+
+
+def check_flm_version() -> Optional[str]:
+    """
+    Check if FLM is installed and return version, or None if not available.
+    """
+    try:
+        result = subprocess.run(
+            ["flm", "version"],
+            capture_output=True,
+            text=True,
+            check=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+
+        # Parse version from output like "FLM v0.9.4"
+        output = result.stdout.strip()
+        if output.startswith("FLM v"):
+            version_str = output[5:]  # Remove "FLM v" prefix
+            return version_str
+        return None
+
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return None
+
+
+def refresh_environment():
+    """
+    Refresh PATH to pick up newly installed executables.
+    """
+    if os.name == "nt":  # Windows
+        # On Windows, we need to refresh the PATH from registry
+        import winreg
+
+        try:
+            with winreg.OpenKey(
+                winreg.HKEY_LOCAL_MACHINE,
+                r"SYSTEM\CurrentControlSet\Control\Session Manager\Environment",
+            ) as key:
+                path_value, _ = winreg.QueryValueEx(key, "PATH")
+                os.environ["PATH"] = path_value + ";" + os.environ.get("PATH", "")
+        except Exception as e:  # pylint: disable=broad-except
+            logging.debug("Could not refresh PATH from registry: %s", e)
+
+        # Also try to add common installation paths
+        common_paths = [
+            r"C:\Program Files\FLM",
+            r"C:\Program Files (x86)\FLM",
+            os.path.expanduser(r"~\AppData\Local\FLM"),
+        ]
+        for path in common_paths:
+            if os.path.exists(path) and path not in os.environ.get("PATH", ""):
+                os.environ["PATH"] = path + ";" + os.environ.get("PATH", "")
+
+
+def install_flm():
+    """
+    Check if FLM is installed and at minimum version.
+    If not, download and run the GUI installer, then wait for completion.
+    """
+    # Check current FLM installation
+    current_version = check_flm_version()
+
+    if current_version and Version(current_version) >= Version(FLM_MINIMUM_VERSION):
+        logging.info(
+            "FLM v%s is already installed and meets minimum version requirement (v%s)",
+            current_version,
+            FLM_MINIMUM_VERSION,
+        )
+        return
+
+    if current_version:
+        logging.info(
+            "FLM v%s is installed but below minimum version v%s. Upgrading...",
+            current_version,
+            FLM_MINIMUM_VERSION,
+        )
+    else:
+        logging.info(
+            "FLM not found. Installing FLM v%s or later...", FLM_MINIMUM_VERSION
+        )
+
+    # Download the installer
+    # pylint: disable=line-too-long
+    installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
+    installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+
+    try:
+        # Remove existing installer if present
+        if os.path.exists(installer_path):
+            os.remove(installer_path)
+
+        logging.info("Downloading FLM installer...")
+        response = requests.get(installer_url, stream=True, timeout=30)
+        response.raise_for_status()
+
+        # Save installer to disk
+        with open(installer_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+            f.flush()
+            os.fsync(f.fileno())
+
+        logging.info("Downloaded FLM installer to %s", installer_path)
+
+        # Launch the installer GUI
+        logging.warning(
+            "Launching FLM installer GUI. Please complete the installation..."
+        )
+
+        # Launch installer and wait for it to complete
+        if os.name == "nt":  # Windows
+            process = subprocess.Popen([installer_path], shell=True)
+        else:
+            process = subprocess.Popen([installer_path])
+
+        # Wait for installer to complete
+        process.wait()
+
+        if process.returncode != 0:
+            raise RuntimeError(
+                f"FLM installer failed with exit code {process.returncode}"
+            )
+
+        logging.info("FLM installer completed successfully")
+
+        # Refresh environment to pick up new PATH entries
+        refresh_environment()
+
+        # Wait a moment for system to update
+        time.sleep(2)
+
+        # Verify installation
+        max_retries = 10
+        for attempt in range(max_retries):
+            new_version = check_flm_version()
+            if new_version and Version(new_version) >= Version(FLM_MINIMUM_VERSION):
+                logging.info("FLM v%s successfully installed and verified", new_version)
+                return
+
+            if attempt < max_retries - 1:
+                logging.debug(
+                    "FLM not yet available in PATH, retrying... (attempt %d/%d)",
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(3)
+                refresh_environment()
+
+        # Final check failed
+        raise RuntimeError(
+            "FLM installation completed but 'flm' command is not available in PATH. "
+            "Please ensure FLM is properly installed and available in your system PATH."
+        )
+
+    except requests.RequestException as e:
+        raise RuntimeError(f"Failed to download FLM installer: {e}") from e
+    except Exception as e:
+        raise RuntimeError(f"FLM installation failed: {e}") from e
+    finally:
+        # Clean up installer file
+        if os.path.exists(installer_path):
+            try:
+                os.remove(installer_path)
+            except OSError:
+                pass  # Ignore cleanup errors
+
+
+def download_flm_model(config_checkpoint, _=None, do_not_upgrade=False) -> dict:
+    """
+    Downloads the FLM model for the given configuration.
+
+    Args:
+        config_checkpoint: name of the FLM model to install.
+        _: placeholder for `config_mmproj`, which is standard
+            for WrappedServer (see llamacpp/utils.py).
+        do_not_upgrade: whether to re-download the model if it is already
+            available.
+    """
+
+    if do_not_upgrade:
+        command = ["flm", "pull", f"{config_checkpoint}"]
+    else:
+        command = ["flm", "pull", f"{config_checkpoint}", "--force"]
+
+    subprocess.run(command, check=True)
+
+
+def get_flm_installed_models() -> List[str]:
+    """
+    Parse FLM model list and return installed model checkpoints.
+
+    Returns:
+        List of installed FLM model checkpoints (e.g., ["llama3.2:1b", "gemma3:4b"])
+    """
+    try:
+        result = subprocess.run(
+            ["flm", "list"],
+            capture_output=True,
+            text=True,
+            check=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+
+        # Check if we got valid output
+        if not result.stdout:
+            return []
+
+        installed_checkpoints = []
+
+        lines = result.stdout.strip().split("\n")
+        for line in lines:
+            line = line.strip()
+            if line.startswith("- "):
+                # Remove the leading "- " and parse the model info
+                model_info = line[2:].strip()
+
+                # Check if model is installed (✅)
+                if model_info.endswith(" ✅"):
+                    checkpoint = model_info[:-2].strip()
+                    installed_checkpoints.append(checkpoint)
+
+        return installed_checkpoints
+
+    except (subprocess.CalledProcessError, FileNotFoundError, AttributeError):
+        # FLM not installed, not available, or output parsing failed
+        return []
+
+
+def is_flm_available() -> bool:
+    """
+    Check if FLM is available and meets minimum version requirements.
+    """
+    current_version = check_flm_version()
+    return current_version is not None and Version(current_version) >= Version(
+        FLM_MINIMUM_VERSION
+    )
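For orientation, here is a minimal sketch of how the utilities above could be combined by a caller. This sketch is not part of the diff: ensure_flm_model is a hypothetical helper name, and the checkpoint string is borrowed from the docstring example.

# Hypothetical bootstrap sketch using the utilities added above; error handling
# and logging configuration are assumed to be handled by the caller.
from lemonade.tools.flm.utils import (
    install_flm,
    is_flm_available,
    download_flm_model,
    get_flm_installed_models,
)


def ensure_flm_model(checkpoint: str) -> None:
    # Install or upgrade the flm CLI if it is missing or below FLM_MINIMUM_VERSION
    if not is_flm_available():
        install_flm()

    # Pull the model only if it is not already installed locally
    if checkpoint not in get_flm_installed_models():
        download_flm_model(checkpoint, do_not_upgrade=True)


ensure_flm_model("llama3.2:1b")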
lemonade/tools/llamacpp/utils.py
CHANGED
@@ -14,8 +14,9 @@ from lemonade.common.system_info import get_system_info
 
 from dotenv import set_key, load_dotenv
 
-LLAMA_VERSION_VULKAN = "
-LLAMA_VERSION_ROCM = "
+LLAMA_VERSION_VULKAN = "b6510"
+LLAMA_VERSION_ROCM = "b1066"
+LLAMA_VERSION_METAL = "b6510"
 
 
 def identify_rocm_arch_from_name(device_name: str) -> str | None:
@@ -126,8 +127,12 @@ def get_llama_version(backend: str) -> str:
         return LLAMA_VERSION_ROCM
     elif backend == "vulkan":
         return LLAMA_VERSION_VULKAN
+    elif backend == "metal":
+        return LLAMA_VERSION_METAL
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Unsupported backend: {backend}. Supported: vulkan, rocm, metal"
+        )
 
 
 def get_llama_folder_path(backend: str):
@@ -142,10 +147,12 @@ def get_llama_exe_path(exe_name: str, backend: str):
     Get path to platform-specific llama-server executable
     """
     base_dir = get_llama_folder_path(backend)
-
+    system = platform.system().lower()
+
+    if system == "windows":
         return os.path.join(base_dir, f"{exe_name}.exe")
-    else:  # Linux/Ubuntu
-        # Check if executable exists in build/bin subdirectory
+    else:  # Darwin/Linux/Ubuntu
+        # Check if executable exists in build/bin subdirectory
         build_bin_path = os.path.join(base_dir, "build", "bin", exe_name)
         if os.path.exists(build_bin_path):
             return build_bin_path
@@ -223,8 +230,24 @@ def get_binary_url_and_filename(backend: str, target_arch: str = None):
         raise NotImplementedError(
             f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
         )
+
+    elif backend == "metal":
+        # Metal support for macOS Apple Silicon from ggml-org/llama.cpp
+        repo = "ggml-org/llama.cpp"
+        version = LLAMA_VERSION_METAL
+        if system == "darwin":
+            if platform.machine().lower() in ["arm64", "aarch64"]:
+                filename = f"llama-{version}-bin-macos-arm64.zip"
+            else:
+                raise NotImplementedError(
+                    "Metal backend only supports Apple Silicon (ARM64) processors"
+                )
+        else:
+            raise NotImplementedError(
+                f"Platform {system} not supported for Metal llamacpp. Metal is only supported on macOS"
+            )
     else:
-        supported_backends = ["vulkan", "rocm"]
+        supported_backends = ["vulkan", "rocm", "metal"]
         raise NotImplementedError(
             f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
         )
@@ -239,10 +262,10 @@ def validate_platform_support():
     """
     system = platform.system().lower()
 
-    if system not in ["windows", "linux"]:
+    if system not in ["windows", "linux", "darwin"]:
        raise NotImplementedError(
            f"Platform {system} not supported for llamacpp. "
-            "Supported: Windows, Ubuntu Linux"
+            "Supported: Windows, Ubuntu Linux, macOS"
        )
 
     if system == "linux":
@@ -341,6 +364,29 @@ def install_llamacpp(backend):
     if filename.endswith(".zip"):
         with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
             zip_ref.extractall(llama_server_exe_dir)
+
+        # On Unix-like systems (macOS/Linux), make executables executable
+        if platform.system().lower() in ["darwin", "linux"]:
+            import stat
+
+            # Find and make executable files executable
+            for root, dirs, files in os.walk(llama_server_exe_dir):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    # Make files in bin/ directories executable
+                    if "bin" in root.split(os.sep) or file in [
+                        "llama-server",
+                        "llama-simple",
+                    ]:
+                        try:
+                            current_permissions = os.stat(file_path).st_mode
+                            os.chmod(file_path, current_permissions | stat.S_IEXEC)
+                            logging.debug(f"Made {file_path} executable")
+                        except Exception as e:
+                            raise RuntimeError(
+                                f"Failed to make {file_path} executable. This will prevent "
+                                f"llama-server from starting. Error: {e}"
+                            )
     else:
         raise NotImplementedError(f"Unsupported archive format: {filename}")
 
@@ -857,7 +903,9 @@ def get_hip_devices():
     try:
         libhip = ctypes.CDLL(matching_files[0])
     except OSError:
-        raise RuntimeError(
+        raise RuntimeError(
+            f"Could not load HIP runtime library from {matching_files[0]}"
+        )
 
     # Setup function signatures
     hipError_t = c_int
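Taken together, these hunks let the existing llama.cpp helpers be driven with a third backend value. A rough, illustrative sketch of the new Metal path on an Apple Silicon Mac follows; the call sequence is an assumption for illustration, not code from the diff.

# Illustrative only: exercising the new "metal" backend on Apple Silicon.
from lemonade.tools.llamacpp.utils import (
    get_llama_version,
    install_llamacpp,
    validate_platform_support,
)

validate_platform_support()        # "darwin" is now accepted alongside windows/linux
print(get_llama_version("metal"))  # -> "b6510" (LLAMA_VERSION_METAL)
install_llamacpp("metal")          # fetches llama-b6510-bin-macos-arm64.zip and marks
                                   # the extracted llama-server binaries as executable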
lemonade/tools/server/flm.py
ADDED
@@ -0,0 +1,137 @@
+import os
+import logging
+import subprocess
+import time
+import threading
+
+import requests
+
+from lemonade_server.pydantic_models import (
+    PullConfig,
+    ChatCompletionRequest,
+)
+
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+from lemonade.tools.flm.utils import install_flm, download_flm_model
+
+
+class FlmTelemetry(WrappedServerTelemetry):
+    """
+    Manages telemetry data collection and display for FLM server.
+    """
+
+    def parse_telemetry_line(self, line: str):
+        """
+        Parse telemetry data from FLM server output lines.
+
+        Note: as of FLM 0.9.10, no telemetry data is provided by the server CLI.
+        This function is required to be implemented, so we leave it empty
+        as a placeholder for now.
+        """
+
+        return
+
+
+class FlmServer(WrappedServer):
+    """
+    Routes OpenAI API requests to an FLM server instance and returns the result
+    back to Lemonade Server.
+    """
+
+    def __init__(self):
+        self.flm_model_name = None
+        super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
+
+    def _choose_port(self):
+        """
+        `flm serve` doesn't support port selection as of v0.9.10
+        """
+        self.port = 11434
+
+    def address(self):
+        return f"http://localhost:{self.port}/v1"
+
+    def install_server(self):
+        """
+        Check if FLM is installed and at minimum version.
+        If not, download and run the GUI installer, then wait for completion.
+        """
+        install_flm()
+
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        download_flm_model(config_checkpoint, config_mmproj, do_not_upgrade)
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        self._choose_port()
+
+        # Keep track of the FLM model name so that we can use it later
+        self.flm_model_name = model_config.checkpoint
+
+        command = [
+            "flm",
+            "serve",
+            f"{self.flm_model_name}",
+            "--ctx-len",
+            str(ctx_size),
+        ]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
+        )
+
+        # Start background thread to log subprocess output
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=("FLM SERVER",),
+            daemon=True,
+        ).start()
+
+    def _wait_for_load(self):
+        """
+        FLM doesn't seem to have a health API, so we'll use the "list local models"
+        API to check if the server is up.
+        """
+        status_code = None
+        while not self.process.poll() and status_code != 200:
+            health_url = f"http://localhost:{self.port}/api/tags"
+            try:
+                health_response = requests.get(health_url)
+            except requests.exceptions.ConnectionError:
+                logging.debug(
+                    "Not able to connect to %s yet, will retry", self.server_name
+                )
+            else:
+                status_code = health_response.status_code
+                logging.debug(
+                    "Testing %s readiness (will retry until ready), result: %s",
+                    self.server_name,
+                    health_response.json(),
+                )
+            time.sleep(1)
+
+    def chat_completion(self, chat_completion_request: ChatCompletionRequest):
+        # FLM requires the correct model name to be in the request
+        # (whereas llama-server ignores the model name field in the request)
+        chat_completion_request.model = self.flm_model_name
+
+        return super().chat_completion(chat_completion_request)
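A rough sketch of how FlmServer is exercised. In the real package the WrappedServer base class and serve.py drive these calls with a PullConfig built by Lemonade Server; the sequence below is illustrative only.

# Illustrative lifecycle only; in practice WrappedServer / serve.py drive
# these calls, and model_config is a PullConfig built by Lemonade Server.
from lemonade.tools.server.flm import FlmServer

server = FlmServer()
server.install_server()                # ensure the flm CLI meets FLM_MINIMUM_VERSION
server.download_model("llama3.2:1b")   # runs: flm pull llama3.2:1b --force
# WrappedServer then launches and waits for readiness, roughly:
#   server._launch_server_subprocess(model_config, snapshot_files={}, ctx_size=4096)
#   server._wait_for_load()            # polls http://localhost:11434/api/tags
#   server.address()                   # -> http://localhost:11434/v1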
lemonade/tools/server/llamacpp.py
CHANGED
@@ -88,9 +88,8 @@ class LlamaTelemetry(WrappedServerTelemetry):
 
 class LlamaServer(WrappedServer):
     def __init__(self, backend: str):
-        self.telemetry = LlamaTelemetry()
         self.backend = backend
-        super().__init__(server_name="llama-server", telemetry=
+        super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())
 
     def install_server(self, backend=None):
         """
@@ -157,13 +156,23 @@ class LlamaServer(WrappedServer):
 
         # Find a port, and save it in the telemetry object for future reference
         # by other functions
-        self.
+        self._choose_port()
 
         # Add port and jinja to enable tool use
         base_command.extend(["--port", str(self.port), "--jinja"])
 
         # Enable context shift and avoid attention sink issues by preserving the initial tokens
-
+        # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+        # Only add context-shift for backends that support it
+        context_shift_supported_backends = ["vulkan", "rocm"]
+        if self.backend in context_shift_supported_backends:
+            base_command.extend(["--context-shift", "--keep", "16"])
+        else:
+            # For backends that don't support context-shift (e.g., Metal), just use keep
+            base_command.extend(["--keep", "16"])
+            logging.debug(
+                f"Skipped --context-shift for backend: {self.backend} (not supported)"
+            )
 
         # Use legacy reasoning formatting, since not all apps support the new
         # reasoning_content field
@@ -192,7 +201,8 @@ class LlamaServer(WrappedServer):
             env.update(os.environ)
             logging.debug(f"Loaded environment variables from {env_file_path}")
 
-
+        system = platform.system().lower()
+        if system == "linux":
             lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
             current_ld_path = env.get("LD_LIBRARY_PATH", "")
             if current_ld_path:
@@ -200,6 +210,14 @@ class LlamaServer(WrappedServer):
             else:
                 env["LD_LIBRARY_PATH"] = lib_dir
             logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+        elif system == "darwin":
+            lib_dir = os.path.dirname(exe_path)
+            current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+            if current_dyld_path:
+                env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+            else:
+                env["DYLD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
 
         # Start subprocess with output capture
         self.process = subprocess.Popen(