lemonade-sdk 8.1.9__py3-none-any.whl → 8.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk has been flagged as potentially problematic.

Files changed (33)
  1. lemonade/common/inference_engines.py +13 -4
  2. lemonade/common/system_info.py +570 -1
  3. lemonade/tools/flm/__init__.py +1 -0
  4. lemonade/tools/flm/utils.py +255 -0
  5. lemonade/tools/llamacpp/utils.py +62 -13
  6. lemonade/tools/server/flm.py +137 -0
  7. lemonade/tools/server/llamacpp.py +23 -5
  8. lemonade/tools/server/serve.py +292 -135
  9. lemonade/tools/server/static/js/chat.js +165 -82
  10. lemonade/tools/server/static/js/models.js +87 -54
  11. lemonade/tools/server/static/js/shared.js +5 -3
  12. lemonade/tools/server/static/logs.html +47 -0
  13. lemonade/tools/server/static/styles.css +159 -8
  14. lemonade/tools/server/static/webapp.html +28 -10
  15. lemonade/tools/server/tray.py +158 -38
  16. lemonade/tools/server/utils/macos_tray.py +226 -0
  17. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  18. lemonade/tools/server/webapp.py +4 -1
  19. lemonade/tools/server/wrapped_server.py +91 -25
  20. lemonade/version.py +1 -1
  21. lemonade_install/install.py +25 -2
  22. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/METADATA +9 -6
  23. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/RECORD +33 -28
  24. lemonade_server/cli.py +105 -14
  25. lemonade_server/model_manager.py +186 -45
  26. lemonade_server/pydantic_models.py +25 -1
  27. lemonade_server/server_models.json +162 -62
  28. lemonade_server/settings.py +39 -39
  29. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/WHEEL +0 -0
  30. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/entry_points.txt +0 -0
  31. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/LICENSE +0 -0
  32. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/NOTICE.md +0 -0
  33. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/top_level.txt +0 -0
lemonade/tools/flm/__init__.py
@@ -0,0 +1 @@
+ # FLM (FastFlowLM) utilities for Lemonade SDK
lemonade/tools/flm/utils.py
@@ -0,0 +1,255 @@
+ """
+ FLM (FastFlowLM) utilities for installation, version checking, and model management.
+ """
+
+ import os
+ import logging
+ import subprocess
+ import tempfile
+ import time
+ from typing import List, Optional
+
+ import requests
+ from packaging.version import Version
+
+
+ FLM_MINIMUM_VERSION = "0.9.10"
+
+
+ def check_flm_version() -> Optional[str]:
+     """
+     Check if FLM is installed and return version, or None if not available.
+     """
+     try:
+         result = subprocess.run(
+             ["flm", "version"],
+             capture_output=True,
+             text=True,
+             check=True,
+             encoding="utf-8",
+             errors="replace",
+         )
+
+         # Parse version from output like "FLM v0.9.4"
+         output = result.stdout.strip()
+         if output.startswith("FLM v"):
+             version_str = output[5:]  # Remove "FLM v" prefix
+             return version_str
+         return None
+
+     except (subprocess.CalledProcessError, FileNotFoundError):
+         return None
+
+
+ def refresh_environment():
+     """
+     Refresh PATH to pick up newly installed executables.
+     """
+     if os.name == "nt":  # Windows
+         # On Windows, we need to refresh the PATH from registry
+         import winreg
+
+         try:
+             with winreg.OpenKey(
+                 winreg.HKEY_LOCAL_MACHINE,
+                 r"SYSTEM\CurrentControlSet\Control\Session Manager\Environment",
+             ) as key:
+                 path_value, _ = winreg.QueryValueEx(key, "PATH")
+                 os.environ["PATH"] = path_value + ";" + os.environ.get("PATH", "")
+         except Exception as e:  # pylint: disable=broad-except
+             logging.debug("Could not refresh PATH from registry: %s", e)
+
+         # Also try to add common installation paths
+         common_paths = [
+             r"C:\Program Files\FLM",
+             r"C:\Program Files (x86)\FLM",
+             os.path.expanduser(r"~\AppData\Local\FLM"),
+         ]
+         for path in common_paths:
+             if os.path.exists(path) and path not in os.environ.get("PATH", ""):
+                 os.environ["PATH"] = path + ";" + os.environ.get("PATH", "")
+
+
+ def install_flm():
+     """
+     Check if FLM is installed and at minimum version.
+     If not, download and run the GUI installer, then wait for completion.
+     """
+     # Check current FLM installation
+     current_version = check_flm_version()
+
+     if current_version and Version(current_version) >= Version(FLM_MINIMUM_VERSION):
+         logging.info(
+             "FLM v%s is already installed and meets minimum version requirement (v%s)",
+             current_version,
+             FLM_MINIMUM_VERSION,
+         )
+         return
+
+     if current_version:
+         logging.info(
+             "FLM v%s is installed but below minimum version v%s. Upgrading...",
+             current_version,
+             FLM_MINIMUM_VERSION,
+         )
+     else:
+         logging.info(
+             "FLM not found. Installing FLM v%s or later...", FLM_MINIMUM_VERSION
+         )
+
+     # Download the installer
+     # pylint: disable=line-too-long
+     installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
+     installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+
+     try:
+         # Remove existing installer if present
+         if os.path.exists(installer_path):
+             os.remove(installer_path)
+
+         logging.info("Downloading FLM installer...")
+         response = requests.get(installer_url, stream=True, timeout=30)
+         response.raise_for_status()
+
+         # Save installer to disk
+         with open(installer_path, "wb") as f:
+             for chunk in response.iter_content(chunk_size=8192):
+                 f.write(chunk)
+             f.flush()
+             os.fsync(f.fileno())
+
+         logging.info("Downloaded FLM installer to %s", installer_path)
+
+         # Launch the installer GUI
+         logging.warning(
+             "Launching FLM installer GUI. Please complete the installation..."
+         )
+
+         # Launch installer and wait for it to complete
+         if os.name == "nt":  # Windows
+             process = subprocess.Popen([installer_path], shell=True)
+         else:
+             process = subprocess.Popen([installer_path])
+
+         # Wait for installer to complete
+         process.wait()
+
+         if process.returncode != 0:
+             raise RuntimeError(
+                 f"FLM installer failed with exit code {process.returncode}"
+             )
+
+         logging.info("FLM installer completed successfully")
+
+         # Refresh environment to pick up new PATH entries
+         refresh_environment()
+
+         # Wait a moment for system to update
+         time.sleep(2)
+
+         # Verify installation
+         max_retries = 10
+         for attempt in range(max_retries):
+             new_version = check_flm_version()
+             if new_version and Version(new_version) >= Version(FLM_MINIMUM_VERSION):
+                 logging.info("FLM v%s successfully installed and verified", new_version)
+                 return
+
+             if attempt < max_retries - 1:
+                 logging.debug(
+                     "FLM not yet available in PATH, retrying... (attempt %d/%d)",
+                     attempt + 1,
+                     max_retries,
+                 )
+                 time.sleep(3)
+                 refresh_environment()
+
+         # Final check failed
+         raise RuntimeError(
+             "FLM installation completed but 'flm' command is not available in PATH. "
+             "Please ensure FLM is properly installed and available in your system PATH."
+         )
+
+     except requests.RequestException as e:
+         raise RuntimeError(f"Failed to download FLM installer: {e}") from e
+     except Exception as e:
+         raise RuntimeError(f"FLM installation failed: {e}") from e
+     finally:
+         # Clean up installer file
+         if os.path.exists(installer_path):
+             try:
+                 os.remove(installer_path)
+             except OSError:
+                 pass  # Ignore cleanup errors
+
+
+ def download_flm_model(config_checkpoint, _=None, do_not_upgrade=False) -> dict:
+     """
+     Downloads the FLM model for the given configuration.
+
+     Args:
+         config_checkpoint: name of the FLM model to install.
+         _: placeholder for `config_mmproj`, which is standard
+             for WrappedServer (see llamacpp/utils.py).
+         do_not_upgrade: whether to re-download the model if it is already
+             available.
+     """
+
+     if do_not_upgrade:
+         command = ["flm", "pull", f"{config_checkpoint}"]
+     else:
+         command = ["flm", "pull", f"{config_checkpoint}", "--force"]
+
+     subprocess.run(command, check=True)
+
+
+ def get_flm_installed_models() -> List[str]:
+     """
+     Parse FLM model list and return installed model checkpoints.
+
+     Returns:
+         List of installed FLM model checkpoints (e.g., ["llama3.2:1b", "gemma3:4b"])
+     """
+     try:
+         result = subprocess.run(
+             ["flm", "list"],
+             capture_output=True,
+             text=True,
+             check=True,
+             encoding="utf-8",
+             errors="replace",
+         )
+
+         # Check if we got valid output
+         if not result.stdout:
+             return []
+
+         installed_checkpoints = []
+
+         lines = result.stdout.strip().split("\n")
+         for line in lines:
+             line = line.strip()
+             if line.startswith("- "):
+                 # Remove the leading "- " and parse the model info
+                 model_info = line[2:].strip()
+
+                 # Check if model is installed (✅)
+                 if model_info.endswith(" ✅"):
+                     checkpoint = model_info[:-2].strip()
+                     installed_checkpoints.append(checkpoint)
+
+         return installed_checkpoints
+
+     except (subprocess.CalledProcessError, FileNotFoundError, AttributeError):
+         # FLM not installed, not available, or output parsing failed
+         return []
+
+
+ def is_flm_available() -> bool:
+     """
+     Check if FLM is available and meets minimum version requirements.
+     """
+     current_version = check_flm_version()
+     return current_version is not None and Version(current_version) >= Version(
+         FLM_MINIMUM_VERSION
+     )
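
For orientation, here is a minimal usage sketch showing how these new FLM helpers compose. The `ensure_flm_ready` wrapper and the "llama3.2:1b" model name are illustrative only and not part of the release.

# Hypothetical usage sketch (not part of the package): compose the new helpers
# to make sure the FLM CLI and at least one model are available locally.
from lemonade.tools.flm.utils import (
    check_flm_version,
    download_flm_model,
    get_flm_installed_models,
    install_flm,
    is_flm_available,
)


def ensure_flm_ready(checkpoint: str = "llama3.2:1b") -> None:
    """Install FLM if needed, then pull `checkpoint` if it is missing."""
    if not is_flm_available():
        install_flm()  # launches the GUI installer and blocks until `flm` works

    print("FLM version:", check_flm_version())

    if checkpoint not in get_flm_installed_models():
        # do_not_upgrade=True maps to `flm pull <model>` without --force
        download_flm_model(checkpoint, do_not_upgrade=True)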
lemonade/tools/llamacpp/utils.py
@@ -14,8 +14,9 @@ from lemonade.common.system_info import get_system_info
 
  from dotenv import set_key, load_dotenv
 
- LLAMA_VERSION_VULKAN = "b6431"
- LLAMA_VERSION_ROCM = "b1057"
+ LLAMA_VERSION_VULKAN = "b6510"
+ LLAMA_VERSION_ROCM = "b1066"
+ LLAMA_VERSION_METAL = "b6510"
 
 
  def identify_rocm_arch_from_name(device_name: str) -> str | None:
@@ -126,8 +127,12 @@ def get_llama_version(backend: str) -> str:
          return LLAMA_VERSION_ROCM
      elif backend == "vulkan":
          return LLAMA_VERSION_VULKAN
+     elif backend == "metal":
+         return LLAMA_VERSION_METAL
      else:
-         raise ValueError(f"Unsupported backend: {backend}")
+         raise ValueError(
+             f"Unsupported backend: {backend}. Supported: vulkan, rocm, metal"
+         )
 
 
  def get_llama_folder_path(backend: str):
@@ -142,10 +147,12 @@ def get_llama_exe_path(exe_name: str, backend: str):
      Get path to platform-specific llama-server executable
      """
      base_dir = get_llama_folder_path(backend)
-     if platform.system().lower() == "windows":
+     system = platform.system().lower()
+
+     if system == "windows":
          return os.path.join(base_dir, f"{exe_name}.exe")
-     else:  # Linux/Ubuntu
-         # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+     else:  # Darwin/Linux/Ubuntu
+         # Check if executable exists in build/bin subdirectory
          build_bin_path = os.path.join(base_dir, "build", "bin", exe_name)
          if os.path.exists(build_bin_path):
              return build_bin_path
@@ -223,8 +230,24 @@ def get_binary_url_and_filename(backend: str, target_arch: str = None):
              raise NotImplementedError(
                  f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
              )
+
+     elif backend == "metal":
+         # Metal support for macOS Apple Silicon from ggml-org/llama.cpp
+         repo = "ggml-org/llama.cpp"
+         version = LLAMA_VERSION_METAL
+         if system == "darwin":
+             if platform.machine().lower() in ["arm64", "aarch64"]:
+                 filename = f"llama-{version}-bin-macos-arm64.zip"
+             else:
+                 raise NotImplementedError(
+                     "Metal backend only supports Apple Silicon (ARM64) processors"
+                 )
+         else:
+             raise NotImplementedError(
+                 f"Platform {system} not supported for Metal llamacpp. Metal is only supported on macOS"
+             )
      else:
-         supported_backends = ["vulkan", "rocm"]
+         supported_backends = ["vulkan", "rocm", "metal"]
          raise NotImplementedError(
              f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
          )
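
As a worked example of the new Metal branch, this is the artifact an Apple Silicon Mac resolves to with the constants above. The GitHub release URL pattern shown here is an assumption about how get_binary_url_and_filename assembles its return value, not something visible in this diff.

# Illustration only: expected Metal artifact for macOS arm64 with LLAMA_VERSION_METAL = "b6510".
# The release URL pattern below is assumed, not taken from the diff.
version = "b6510"
repo = "ggml-org/llama.cpp"
filename = f"llama-{version}-bin-macos-arm64.zip"
url = f"https://github.com/{repo}/releases/download/{version}/{filename}"
print(filename)  # llama-b6510-bin-macos-arm64.zip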
@@ -239,10 +262,10 @@ def validate_platform_support():
      """
      system = platform.system().lower()
 
-     if system not in ["windows", "linux"]:
+     if system not in ["windows", "linux", "darwin"]:
          raise NotImplementedError(
              f"Platform {system} not supported for llamacpp. "
-             "Supported: Windows, Ubuntu Linux"
+             "Supported: Windows, Ubuntu Linux, macOS"
          )
 
      if system == "linux":
@@ -341,6 +364,29 @@ def install_llamacpp(backend):
      if filename.endswith(".zip"):
          with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
              zip_ref.extractall(llama_server_exe_dir)
+
+         # On Unix-like systems (macOS/Linux), make executables executable
+         if platform.system().lower() in ["darwin", "linux"]:
+             import stat
+
+             # Find and make executable files executable
+             for root, dirs, files in os.walk(llama_server_exe_dir):
+                 for file in files:
+                     file_path = os.path.join(root, file)
+                     # Make files in bin/ directories executable
+                     if "bin" in root.split(os.sep) or file in [
+                         "llama-server",
+                         "llama-simple",
+                     ]:
+                         try:
+                             current_permissions = os.stat(file_path).st_mode
+                             os.chmod(file_path, current_permissions | stat.S_IEXEC)
+                             logging.debug(f"Made {file_path} executable")
+                         except Exception as e:
+                             raise RuntimeError(
+                                 f"Failed to make {file_path} executable. This will prevent "
+                                 f"llama-server from starting. Error: {e}"
+                             )
      else:
          raise NotImplementedError(f"Unsupported archive format: {filename}")
 
@@ -510,14 +556,14 @@ def identify_gguf_models(
      The CHECKPOINT:VARIANT scheme is used to specify model files in Hugging Face repositories.
 
      The VARIANT format can be one of several types:
-     0. wildcard (*): download all files in the repo
+     0. wildcard (*): download all .gguf files in the repo
      1. Full filename: exact file to download
      2. None/empty: gets the first .gguf file in the repository (excludes mmproj files)
      3. Quantization variant: find a single file ending with the variant name (case insensitive)
      4. Folder name: downloads all .gguf files in the folder that matches the variant name (case insensitive)
 
      Examples:
-     - "ggml-org/gpt-oss-120b-GGUF:*" -> downloads all files in repo
+     - "ggml-org/gpt-oss-120b-GGUF:*" -> downloads all .gguf files in repo
      - "unsloth/Qwen3-8B-GGUF:qwen3.gguf" -> downloads "qwen3.gguf"
      - "unsloth/Qwen3-30B-A3B-GGUF" -> downloads "Qwen3-30B-A3B-GGUF.gguf"
      - "unsloth/Qwen3-8B-GGUF:Q4_1" -> downloads "Qwen3-8B-GGUF-Q4_1.gguf"
@@ -531,13 +577,14 @@ def identify_gguf_models(
 
      # (case 0) Wildcard, download everything
      if variant and variant == "*":
-         sharded_files = repo_files
+         sharded_files = [f for f in repo_files if f.endswith(".gguf")]
 
          # Sort to ensure consistent ordering
          sharded_files.sort()
 
          # Use first file as primary (this is how llamacpp handles it)
          variant_name = sharded_files[0]
+
      # (case 1) If variant ends in .gguf, use it directly
      elif variant and variant.endswith(".gguf"):
          variant_name = variant
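
A small sketch of what the tightened wildcard behavior does: with `:*`, only `.gguf` files are now considered, sorted, and the first one is treated as the primary file. The file list below is made up for illustration.

# Illustration of the new wildcard handling; repo_files is a made-up example.
repo_files = [
    "README.md",
    "model-00001-of-00002.gguf",
    "model-00002-of-00002.gguf",
    "config.json",
]
sharded_files = [f for f in repo_files if f.endswith(".gguf")]
sharded_files.sort()
variant_name = sharded_files[0]  # "model-00001-of-00002.gguf" becomes the primary file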
@@ -856,7 +903,9 @@ def get_hip_devices():
      try:
          libhip = ctypes.CDLL(matching_files[0])
      except OSError:
-         raise RuntimeError(f"Could not load HIP runtime library from {path}")
+         raise RuntimeError(
+             f"Could not load HIP runtime library from {matching_files[0]}"
+         )
 
      # Setup function signatures
      hipError_t = c_int
lemonade/tools/server/flm.py
@@ -0,0 +1,137 @@
+ import os
+ import logging
+ import subprocess
+ import time
+ import threading
+
+ import requests
+
+ from lemonade_server.pydantic_models import (
+     PullConfig,
+     ChatCompletionRequest,
+ )
+
+ from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+ from lemonade.tools.flm.utils import install_flm, download_flm_model
+
+
+ class FlmTelemetry(WrappedServerTelemetry):
+     """
+     Manages telemetry data collection and display for FLM server.
+     """
+
+     def parse_telemetry_line(self, line: str):
+         """
+         Parse telemetry data from FLM server output lines.
+
+         Note: as of FLM 0.9.10, no telemetry data is provided by the server CLI.
+         This function is required to be implemented, so we leave it empty
+         as a placeholder for now.
+         """
+
+         return
+
+
+ class FlmServer(WrappedServer):
+     """
+     Routes OpenAI API requests to an FLM server instance and returns the result
+     back to Lemonade Server.
+     """
+
+     def __init__(self):
+         self.flm_model_name = None
+         super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
+
+     def _choose_port(self):
+         """
+         `flm serve` doesn't support port selection as of v0.9.10
+         """
+         self.port = 11434
+
+     def address(self):
+         return f"http://localhost:{self.port}/v1"
+
+     def install_server(self):
+         """
+         Check if FLM is installed and at minimum version.
+         If not, download and run the GUI installer, then wait for completion.
+         """
+         install_flm()
+
+     def download_model(
+         self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+     ) -> dict:
+         download_flm_model(config_checkpoint, config_mmproj, do_not_upgrade)
+
+     def _launch_server_subprocess(
+         self,
+         model_config: PullConfig,
+         snapshot_files: dict,
+         ctx_size: int,
+         supports_embeddings: bool = False,
+         supports_reranking: bool = False,
+     ):
+
+         self._choose_port()
+
+         # Keep track of the FLM model name so that we can use it later
+         self.flm_model_name = model_config.checkpoint
+
+         command = [
+             "flm",
+             "serve",
+             f"{self.flm_model_name}",
+             "--ctx-len",
+             str(ctx_size),
+         ]
+
+         # Set up environment with library path for Linux
+         env = os.environ.copy()
+
+         self.process = subprocess.Popen(
+             command,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             encoding="utf-8",
+             errors="replace",
+             bufsize=1,
+             env=env,
+         )
+
+         # Start background thread to log subprocess output
+         threading.Thread(
+             target=self._log_subprocess_output,
+             args=("FLM SERVER",),
+             daemon=True,
+         ).start()
+
+     def _wait_for_load(self):
+         """
+         FLM doesn't seem to have a health API, so we'll use the "list local models"
+         API to check if the server is up.
+         """
+         status_code = None
+         while not self.process.poll() and status_code != 200:
+             health_url = f"http://localhost:{self.port}/api/tags"
+             try:
+                 health_response = requests.get(health_url)
+             except requests.exceptions.ConnectionError:
+                 logging.debug(
+                     "Not able to connect to %s yet, will retry", self.server_name
+                 )
+             else:
+                 status_code = health_response.status_code
+                 logging.debug(
+                     "Testing %s readiness (will retry until ready), result: %s",
+                     self.server_name,
+                     health_response.json(),
+                 )
+             time.sleep(1)
+
+     def chat_completion(self, chat_completion_request: ChatCompletionRequest):
+         # FLM requires the correct model name to be in the request
+         # (whereas llama-server ignores the model name field in the request)
+         chat_completion_request.model = self.flm_model_name
+
+         return super().chat_completion(chat_completion_request)
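
A rough sketch of how FlmServer's pieces fit together, based only on the methods visible in this hunk. The driver code, the SimpleNamespace stand-in for PullConfig, and the model name are assumptions for illustration, not the actual WrappedServer contract.

# Hypothetical driver (not part of the package) showing the FlmServer call sequence.
from types import SimpleNamespace

from lemonade.tools.server.flm import FlmServer

server = FlmServer()
server.install_server()                # ensures the FLM CLI meets FLM_MINIMUM_VERSION
server.download_model("llama3.2:1b")   # wraps `flm pull llama3.2:1b`

# Only .checkpoint is read by _launch_server_subprocess above, so a stand-in
# object is used here instead of constructing a real PullConfig.
config = SimpleNamespace(checkpoint="llama3.2:1b")
server._launch_server_subprocess(model_config=config, snapshot_files={}, ctx_size=4096)
server._wait_for_load()                # polls http://localhost:11434/api/tags
print(server.address())                # http://localhost:11434/v1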
lemonade/tools/server/llamacpp.py
@@ -88,9 +88,8 @@ class LlamaTelemetry(WrappedServerTelemetry):
 
  class LlamaServer(WrappedServer):
      def __init__(self, backend: str):
-         self.telemetry = LlamaTelemetry()
          self.backend = backend
-         super().__init__(server_name="llama-server", telemetry=self.telemetry)
+         super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())
 
      def install_server(self, backend=None):
          """
@@ -157,13 +156,23 @@ class LlamaServer(WrappedServer):
 
          # Find a port, and save it in the telemetry object for future reference
          # by other functions
-         self.choose_port()
+         self._choose_port()
 
          # Add port and jinja to enable tool use
          base_command.extend(["--port", str(self.port), "--jinja"])
 
          # Enable context shift and avoid attention sink issues by preserving the initial tokens
-         base_command.extend(["--context-shift", "--keep", "16"])
+         # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+         # Only add context-shift for backends that support it
+         context_shift_supported_backends = ["vulkan", "rocm"]
+         if self.backend in context_shift_supported_backends:
+             base_command.extend(["--context-shift", "--keep", "16"])
+         else:
+             # For backends that don't support context-shift (e.g., Metal), just use keep
+             base_command.extend(["--keep", "16"])
+             logging.debug(
+                 f"Skipped --context-shift for backend: {self.backend} (not supported)"
+             )
 
          # Use legacy reasoning formatting, since not all apps support the new
          # reasoning_content field
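
In effect, the flag set passed to llama-server now depends on the backend; here is a condensed restatement of the logic above as a standalone helper (illustrative only, not code from the package).

# Condensed restatement of the new flag selection (illustrative helper, not in the package).
def context_flags(backend: str) -> list:
    if backend in ("vulkan", "rocm"):
        return ["--context-shift", "--keep", "16"]
    # e.g. "metal": that llama-server build lacks --context-shift, so only keep initial tokens
    return ["--keep", "16"]


print(context_flags("vulkan"))  # ['--context-shift', '--keep', '16']
print(context_flags("metal"))   # ['--keep', '16']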
@@ -192,7 +201,8 @@ class LlamaServer(WrappedServer):
              env.update(os.environ)
              logging.debug(f"Loaded environment variables from {env_file_path}")
 
-         if platform.system().lower() == "linux":
+         system = platform.system().lower()
+         if system == "linux":
              lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
              current_ld_path = env.get("LD_LIBRARY_PATH", "")
              if current_ld_path:
@@ -200,6 +210,14 @@ class LlamaServer(WrappedServer):
              else:
                  env["LD_LIBRARY_PATH"] = lib_dir
              logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+         elif system == "darwin":
+             lib_dir = os.path.dirname(exe_path)
+             current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+             if current_dyld_path:
+                 env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+             else:
+                 env["DYLD_LIBRARY_PATH"] = lib_dir
+             logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
 
          # Start subprocess with output capture
          self.process = subprocess.Popen(