lemonade-sdk 8.1.10__py3-none-any.whl → 8.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk has been flagged as possibly problematic by the registry scanner.
Files changed (37)
  1. lemonade/cache.py +6 -1
  2. lemonade/common/status.py +4 -4
  3. lemonade/tools/bench.py +22 -1
  4. lemonade/tools/flm/__init__.py +1 -0
  5. lemonade/tools/flm/utils.py +255 -0
  6. lemonade/tools/llamacpp/bench.py +111 -23
  7. lemonade/tools/llamacpp/load.py +20 -1
  8. lemonade/tools/llamacpp/utils.py +210 -17
  9. lemonade/tools/oga/bench.py +0 -26
  10. lemonade/tools/report/table.py +6 -0
  11. lemonade/tools/server/flm.py +133 -0
  12. lemonade/tools/server/llamacpp.py +23 -5
  13. lemonade/tools/server/serve.py +260 -135
  14. lemonade/tools/server/static/js/chat.js +165 -82
  15. lemonade/tools/server/static/js/models.js +87 -54
  16. lemonade/tools/server/static/js/shared.js +9 -6
  17. lemonade/tools/server/static/logs.html +57 -0
  18. lemonade/tools/server/static/styles.css +159 -8
  19. lemonade/tools/server/static/webapp.html +28 -10
  20. lemonade/tools/server/tray.py +94 -38
  21. lemonade/tools/server/utils/macos_tray.py +226 -0
  22. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  23. lemonade/tools/server/webapp.py +4 -1
  24. lemonade/tools/server/wrapped_server.py +91 -25
  25. lemonade/version.py +1 -1
  26. lemonade_install/install.py +25 -2
  27. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/METADATA +10 -6
  28. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/RECORD +37 -32
  29. lemonade_server/cli.py +103 -14
  30. lemonade_server/model_manager.py +186 -45
  31. lemonade_server/pydantic_models.py +25 -1
  32. lemonade_server/server_models.json +175 -62
  33. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/WHEEL +0 -0
  34. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/entry_points.txt +0 -0
  35. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/licenses/LICENSE +0 -0
  36. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/licenses/NOTICE.md +0 -0
  37. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/top_level.txt +0 -0
lemonade/tools/llamacpp/utils.py

@@ -7,6 +7,7 @@ import zipfile
  from typing import Optional
  import subprocess
  import requests
+ import lemonade.common.build as build
  import lemonade.common.printing as printing
  from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
 
@@ -14,8 +15,9 @@ from lemonade.common.system_info import get_system_info
 
  from dotenv import set_key, load_dotenv
 
- LLAMA_VERSION_VULKAN = "b6431"
- LLAMA_VERSION_ROCM = "b1057"
+ LLAMA_VERSION_VULKAN = "b6510"
+ LLAMA_VERSION_ROCM = "b1066"
+ LLAMA_VERSION_METAL = "b6510"
 
 
  def identify_rocm_arch_from_name(device_name: str) -> str | None:
@@ -126,8 +128,12 @@ def get_llama_version(backend: str) -> str:
          return LLAMA_VERSION_ROCM
      elif backend == "vulkan":
          return LLAMA_VERSION_VULKAN
+     elif backend == "metal":
+         return LLAMA_VERSION_METAL
      else:
-         raise ValueError(f"Unsupported backend: {backend}")
+         raise ValueError(
+             f"Unsupported backend: {backend}. Supported: vulkan, rocm, metal"
+         )
 
 
  def get_llama_folder_path(backend: str):
@@ -142,10 +148,12 @@ def get_llama_exe_path(exe_name: str, backend: str):
      Get path to platform-specific llama-server executable
      """
      base_dir = get_llama_folder_path(backend)
-     if platform.system().lower() == "windows":
+     system = platform.system().lower()
+
+     if system == "windows":
          return os.path.join(base_dir, f"{exe_name}.exe")
-     else:  # Linux/Ubuntu
-         # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+     else:  # Darwin/Linux/Ubuntu
+         # Check if executable exists in build/bin subdirectory
          build_bin_path = os.path.join(base_dir, "build", "bin", exe_name)
          if os.path.exists(build_bin_path):
              return build_bin_path
@@ -168,6 +176,13 @@ def get_llama_cli_exe_path(backend: str):
      return get_llama_exe_path("llama-cli", backend)
 
 
+ def get_llama_bench_exe_path(backend: str):
+     """
+     Get path to platform-specific llama-bench executable
+     """
+     return get_llama_exe_path("llama-bench", backend)
+
+
  def get_version_txt_path(backend: str):
      """
      Get path to text file that contains version information
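
For orientation, the new llama-bench helper resolves through the same platform branch as get_llama_exe_path() above: llama-bench.exe on Windows, otherwise the build/bin layout. A minimal sketch (base_dir stands in for get_llama_folder_path(backend); the non-Windows fallback path is an assumption, since that branch is cut off in the hunk above):

    import os
    import platform

    def sketch_llama_bench_path(base_dir: str) -> str:
        # Windows release archives ship a top-level llama-bench.exe
        if platform.system().lower() == "windows":
            return os.path.join(base_dir, "llama-bench.exe")
        # Darwin/Linux release zips usually place binaries under build/bin
        build_bin = os.path.join(base_dir, "build", "bin", "llama-bench")
        # Fallback to a top-level binary if build/bin is absent (assumed)
        return build_bin if os.path.exists(build_bin) else os.path.join(base_dir, "llama-bench")
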
@@ -223,8 +238,24 @@ def get_binary_url_and_filename(backend: str, target_arch: str = None):
          raise NotImplementedError(
              f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
          )
+
+     elif backend == "metal":
+         # Metal support for macOS Apple Silicon from ggml-org/llama.cpp
+         repo = "ggml-org/llama.cpp"
+         version = LLAMA_VERSION_METAL
+         if system == "darwin":
+             if platform.machine().lower() in ["arm64", "aarch64"]:
+                 filename = f"llama-{version}-bin-macos-arm64.zip"
+             else:
+                 raise NotImplementedError(
+                     "Metal backend only supports Apple Silicon (ARM64) processors"
+                 )
+         else:
+             raise NotImplementedError(
+                 f"Platform {system} not supported for Metal llamacpp. Metal is only supported on macOS"
+             )
      else:
-         supported_backends = ["vulkan", "rocm"]
+         supported_backends = ["vulkan", "rocm", "metal"]
          raise NotImplementedError(
              f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
          )
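
As a quick sanity check on the new Metal branch: with LLAMA_VERSION_METAL = "b6510", the asset name resolves to llama-b6510-bin-macos-arm64.zip. The hunk does not show how repo, version, and filename are joined into a download URL; a hedged sketch assuming the standard GitHub release-asset layout (an assumption, not taken from this diff) would be:

    repo = "ggml-org/llama.cpp"
    version = "b6510"  # LLAMA_VERSION_METAL
    filename = f"llama-{version}-bin-macos-arm64.zip"

    # Assumed URL shape for a GitHub release asset; the real assembly lives in the
    # portion of get_binary_url_and_filename() not shown in this hunk.
    url = f"https://github.com/{repo}/releases/download/{version}/{filename}"
    # -> https://github.com/ggml-org/llama.cpp/releases/download/b6510/llama-b6510-bin-macos-arm64.zip
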
@@ -239,10 +270,10 @@ def validate_platform_support():
      """
      system = platform.system().lower()
 
-     if system not in ["windows", "linux"]:
+     if system not in ["windows", "linux", "darwin"]:
          raise NotImplementedError(
              f"Platform {system} not supported for llamacpp. "
-             "Supported: Windows, Ubuntu Linux"
+             "Supported: Windows, Ubuntu Linux, macOS"
          )
 
      if system == "linux":
@@ -341,6 +372,29 @@ def install_llamacpp(backend):
      if filename.endswith(".zip"):
          with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
              zip_ref.extractall(llama_server_exe_dir)
+
+         # On Unix-like systems (macOS/Linux), make executables executable
+         if platform.system().lower() in ["darwin", "linux"]:
+             import stat
+
+             # Find and make executable files executable
+             for root, dirs, files in os.walk(llama_server_exe_dir):
+                 for file in files:
+                     file_path = os.path.join(root, file)
+                     # Make files in bin/ directories executable
+                     if "bin" in root.split(os.sep) or file in [
+                         "llama-server",
+                         "llama-simple",
+                     ]:
+                         try:
+                             current_permissions = os.stat(file_path).st_mode
+                             os.chmod(file_path, current_permissions | stat.S_IEXEC)
+                             logging.debug(f"Made {file_path} executable")
+                         except Exception as e:
+                             raise RuntimeError(
+                                 f"Failed to make {file_path} executable. This will prevent "
+                                 f"llama-server from starting. Error: {e}"
+                             )
      else:
          raise NotImplementedError(f"Unsupported archive format: {filename}")
 
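
One detail worth noting about the permission fix above: stat.S_IEXEC sets only the owner's execute bit, which is enough here because the user who installs the binaries is the one who runs them. A variant that mirrors chmod +x by also granting group/other execute, shown purely as a sketch and not as code from this release:

    import os
    import stat

    def make_executable(path: str) -> None:
        """Add execute permission for user, group, and other, keeping existing bits."""
        mode = os.stat(path).st_mode
        os.chmod(path, mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
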
@@ -360,6 +414,7 @@ def install_llamacpp(backend):
      exe_paths = [
          (get_llama_server_exe_path(backend), "llama-server"),
          (get_llama_cli_exe_path(backend), "llama-cli"),
+         (get_llama_bench_exe_path(backend), "llama-bench"),
      ]
 
      for exe_path, exe_name in exe_paths:
@@ -653,8 +708,10 @@ class LlamaCppAdapter(ModelAdapter):
          context_size,
          threads,
          executable,
+         bench_executable,
          reasoning=False,
          lib_dir=None,
+         state=None,
      ):
          super().__init__()
 
@@ -666,8 +723,10 @@ class LlamaCppAdapter(ModelAdapter):
          self.context_size = context_size
          self.threads = threads
          self.executable = os.path.normpath(executable)
+         self.bench_executable = os.path.normpath(bench_executable)
          self.reasoning = reasoning
          self.lib_dir = lib_dir
+         self.state = state
 
      def generate(
          self,
@@ -708,32 +767,54 @@ class LlamaCppAdapter(ModelAdapter):
              self.executable,
              "-m",
              self.model,
-             "--ctx-size",
+             "--ctx-size",  # size of the prompt context, 0 = loaded from model
              str(self.context_size),
-             "-n",
+             "-n",  # number of tokens to predict, -1 = infinity, =2 - until context filled
              str(n_predict),
-             "-t",
+             "-t",  # number of threads to use during generation
              str(self.threads),
              "-p",
              prompt,
+             "-b",  # logical maximum batch size
+             "1",
+             "-ub",  # physical maximum batch size
+             "1",
              "--temp",
              str(temperature),
              "--top-p",
              str(top_p),
              "--top-k",
              str(top_k),
-             "-e",
-             "-no-cnv",
-             "--reasoning-format",
+             "-e",  # process escape sequences
+             "--no-conversation",  # disable conversation mode
+             "--reasoning-format",  # leaves thoughts unparsed in message content
              "none",
          ]
 
+         # If prompt exceeds 500 characters, then use a file
+         if len(prompt) < 500:
+             cmd += ["-p", prompt]
+         else:
+             # Create prompt file in cache directory
+             prompt_file = os.path.join(
+                 build.output_dir(self.state.cache_dir, self.state.build_name),
+                 "prompt.txt",
+             )
+             with open(prompt_file, "w", encoding="utf-8") as file:
+                 file.write(prompt)
+             cmd += ["-f", prompt_file]
+
          # Configure GPU layers: 99 for GPU, 0 for CPU-only
          ngl_value = "99" if self.device == "igpu" else "0"
          cmd = cmd + ["-ngl", ngl_value]
 
          cmd = [str(m) for m in cmd]
 
+         # save llama-cli command
+         self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+             " ".join(cmd)
+         ]
+
          try:
              # Set up environment with library path for Linux
              env = os.environ.copy()
@@ -763,6 +844,15 @@ class LlamaCppAdapter(ModelAdapter):
              )
 
              raw_output, stderr = process.communicate(timeout=600)
+
+             # save llama-cli command output with performance info to state
+             # (can be viewed in state.yaml file in cache)
+             self.state.llama_cli_stderr = getattr(
+                 self.state, "llama_cli_stderr", []
+             ) + [
+                 [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+             ]
+
              if process.returncode != 0:
                  error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                  error_msg += f"Command: {' '.join(cmd)}\n"
@@ -827,7 +917,108 @@ class LlamaCppAdapter(ModelAdapter):
              return [output_text]
 
          except Exception as e:
-             error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
+             error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
              error_msg += f"Command: {' '.join(cmd)}"
              raise Exception(error_msg)
+
+     def benchmark(self, prompts, iterations, output_tokens):
+         """
+         Runs the llama-bench.exe tool to measure TTFT and TPS
+         """
+         cmd = [
+             self.bench_executable,
+             "-m",
+             self.model,
+             "-r",
+             iterations,
+             "-p",
+             ",".join([str(p) for p in prompts]),
+             "-n",
+             output_tokens,
+             "-t",
+             self.threads if self.threads > 0 else 16,
+             "-b",
+             1,
+             "-ub",
+             1,
+         ]
+         cmd = [str(m) for m in cmd]
+
+         # save llama-bench command
+         self.state.llama_bench_cmd = " ".join(cmd)
+
+         try:
+             # Set up environment with library path for Linux
+             env = os.environ.copy()
+
+             # Load environment variables from .env file in the executable directory
+             exe_dir = os.path.dirname(self.executable)
+             env_file_path = os.path.join(exe_dir, ".env")
+             if os.path.exists(env_file_path):
+                 load_dotenv(env_file_path, override=True)
+                 env.update(os.environ)
+
+             if self.lib_dir and os.name != "nt":  # Not Windows
+                 current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                 if current_ld_path:
+                     env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                 else:
+                     env["LD_LIBRARY_PATH"] = self.lib_dir
+
+             process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 universal_newlines=True,
+                 encoding="utf-8",
+                 errors="replace",
+                 env=env,
+             )
+
+             raw_output, stderr = process.communicate(timeout=600)
+
+             # save llama-bench command output with performance info to state
+             # (can be viewed in state.yaml file in cache)
+             self.state.llama_bench_standard_output = raw_output.splitlines()
+
+             if process.returncode != 0:
+                 error_msg = (
+                     f"llama-bench.exe failed with return code {process.returncode}.\n"
+                 )
+                 error_msg += f"Command: {' '.join(cmd)}\n"
+                 error_msg += f"Error output:\n{stderr}\n"
+                 error_msg += f"Standard output:\n{raw_output}"
+                 raise Exception(error_msg)
+
+             if raw_output is None:
+                 raise Exception("No output received from llama-bench.exe process")
+
+             # Parse information from llama-bench.exe output
+             prompt_lengths = []
+             pp_tps = []
+             pp_tps_sd = []
+             tg_tps = None
+             tg_tps_sd = None
+
+             for line in self.state.llama_bench_standard_output:
+                 # Parse TPS information
+                 for p in prompts:
+                     if f"pp{p:d}" in line:
+                         parts = line.split("|")
+                         timings = parts[-2].strip().split(" ")
+                         prompt_lengths.append(p)
+                         pp_tps.append(float(timings[0]))
+                         pp_tps_sd.append(float(timings[-1]))
+                 if f"tg{output_tokens:d}" in line:
+                     parts = line.split("|")
+                     timings = parts[-2].strip().split(" ")
+                     tg_tps = float(timings[0])
+                     tg_tps_sd = float(timings[-1])
+
+             return prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd
+
+         except Exception as e:
+             error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
              error_msg += f"Command: {' '.join(cmd)}"
              raise Exception(error_msg)
 
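
The parser above relies on llama-bench emitting pipe-separated table rows in which the test column contains tokens such as pp256 or tg32 and the second-to-last column holds mean and standard-deviation tokens-per-second. The row below is fabricated to match what the parsing code expects (it is not captured llama-bench output); it only illustrates how the split logic pulls out the two numbers:

    # Made-up row shaped the way LlamaCppAdapter.benchmark() parses it; real
    # llama-bench columns and values differ by version and hardware.
    line = "| model.gguf | 4.1 GiB | 7B | Vulkan | 99 | pp256 | 123.45 ± 2.67 |"

    parts = line.split("|")
    timings = parts[-2].strip().split(" ")  # ["123.45", "±", "2.67"]
    mean_tps = float(timings[0])     # 123.45
    stddev_tps = float(timings[-1])  # 2.67
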
@@ -857,7 +1048,9 @@ def get_hip_devices():
      try:
          libhip = ctypes.CDLL(matching_files[0])
      except OSError:
-         raise RuntimeError(f"Could not load HIP runtime library from {path}")
+         raise RuntimeError(
+             f"Could not load HIP runtime library from {matching_files[0]}"
+         )
 
      # Setup function signatures
      hipError_t = c_int
lemonade/tools/oga/bench.py

@@ -2,7 +2,6 @@ import argparse
  import statistics
  from statistics import StatisticsError
  from lemonade.state import State
- from lemonade.cache import Keys
  from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
  from lemonade.tools.bench import Bench
 
@@ -20,16 +19,6 @@ class OgaBench(Bench):
 
      unique_name = "oga-bench"
 
-     def __init__(self):
-         super().__init__()
-
-         # Additional statistics generated by this bench tool
-         self.status_stats.insert(
-             self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
-             Keys.STD_DEV_TOKENS_PER_SECOND,
-         )
-         self.std_dev_token_generation_tokens_per_second_list = []
-
      @staticmethod
      def parser(add_help: bool = True) -> argparse.ArgumentParser:
          parser = __class__.helpful_parser(
@@ -121,21 +110,6 @@ class OgaBench(Bench):
              # Less than 2 measurements
              self.std_dev_token_generation_tokens_per_second_list.append(None)
 
-     def save_stats(self, state):
-         super().save_stats(state)
-
-         # Save additional statistics
-         if not all(
-             element is None
-             for element in self.std_dev_token_generation_tokens_per_second_list
-         ):
-             state.save_stat(
-                 Keys.STD_DEV_TOKENS_PER_SECOND,
-                 self.get_item_or_list(
-                     self.std_dev_token_generation_tokens_per_second_list
-                 ),
-             )
-
 
  # This file was originally licensed under Apache 2.0. It has been modified.
  # Modifications Copyright (c) 2025 AMD
lemonade/tools/report/table.py

@@ -581,6 +581,12 @@ class LemonadePerfTable(Table):
              Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
              ".2f",
          ),
+         StatWithSD(
+             _wrap("Prefill Tokens per Second", 8),
+             Keys.PREFILL_TOKENS_PER_SECOND,
+             Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+             ".2f",
+         ),
          StatWithSD(
              _wrap("Tokens per Second", 8),
              Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
lemonade/tools/server/flm.py (new file)

@@ -0,0 +1,133 @@
+ import os
+ import logging
+ import subprocess
+ import time
+ import threading
+
+ import requests
+
+ from lemonade_server.pydantic_models import (
+     PullConfig,
+     ChatCompletionRequest,
+ )
+
+ from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+ from lemonade.tools.flm.utils import install_flm, download_flm_model
+
+
+ class FlmTelemetry(WrappedServerTelemetry):
+     """
+     Manages telemetry data collection and display for FLM server.
+     """
+
+     def parse_telemetry_line(self, line: str):
+         """
+         Parse telemetry data from FLM server output lines.
+
+         Note: as of FLM 0.9.10, no telemetry data is provided by the server CLI.
+         This function is required to be implemented, so we leave it empty
+         as a placeholder for now.
+         """
+
+         return
+
+
+ class FlmServer(WrappedServer):
+     """
+     Routes OpenAI API requests to an FLM server instance and returns the result
+     back to Lemonade Server.
+     """
+
+     def __init__(self):
+         self.flm_model_name = None
+         super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
+
+     def address(self):
+         return f"http://localhost:{self.port}/v1"
+
+     def install_server(self):
+         """
+         Check if FLM is installed and at minimum version.
+         If not, download and run the GUI installer, then wait for completion.
+         """
+         install_flm()
+
+     def download_model(
+         self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+     ) -> dict:
+         download_flm_model(config_checkpoint, config_mmproj, do_not_upgrade)
+
+     def _launch_server_subprocess(
+         self,
+         model_config: PullConfig,
+         snapshot_files: dict,
+         ctx_size: int,
+         supports_embeddings: bool = False,
+         supports_reranking: bool = False,
+     ):
+
+         self._choose_port()
+
+         # Keep track of the FLM model name so that we can use it later
+         self.flm_model_name = model_config.checkpoint
+
+         command = [
+             "flm",
+             "serve",
+             f"{self.flm_model_name}",
+             "--ctx-len",
+             str(ctx_size),
+             "--port",
+             str(self.port),
+         ]
+
+         # Set up environment with library path for Linux
+         env = os.environ.copy()
+
+         self.process = subprocess.Popen(
+             command,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             encoding="utf-8",
+             errors="replace",
+             bufsize=1,
+             env=env,
+         )
+
+         # Start background thread to log subprocess output
+         threading.Thread(
+             target=self._log_subprocess_output,
+             args=("FLM SERVER",),
+             daemon=True,
+         ).start()
+
+     def _wait_for_load(self):
+         """
+         FLM doesn't seem to have a health API, so we'll use the "list local models"
+         API to check if the server is up.
+         """
+         status_code = None
+         while not self.process.poll() and status_code != 200:
+             health_url = f"http://localhost:{self.port}/api/tags"
+             try:
+                 health_response = requests.get(health_url)
+             except requests.exceptions.ConnectionError:
+                 logging.debug(
+                     "Not able to connect to %s yet, will retry", self.server_name
+                 )
+             else:
+                 status_code = health_response.status_code
+                 logging.debug(
+                     "Testing %s readiness (will retry until ready), result: %s",
+                     self.server_name,
+                     health_response.json(),
+                 )
+             time.sleep(1)
+
+     def chat_completion(self, chat_completion_request: ChatCompletionRequest):
+         # FLM requires the correct model name to be in the request
+         # (whereas llama-server ignores the model name field in the request)
+         chat_completion_request.model = self.flm_model_name
+
+         return super().chat_completion(chat_completion_request)
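
Because FlmServer exposes an OpenAI-style base URL through address(), the forwarding path can be pictured as a plain HTTP request against that endpoint. The sketch below is illustrative only; the port, model name, and the /chat/completions route are assumptions rather than values taken from this diff:

    import requests

    # Hypothetical values; in practice FlmServer picks the port itself and
    # overwrites the model field with the pulled FLM checkpoint name.
    base_url = "http://localhost:11534/v1"
    payload = {
        "model": "example-flm-checkpoint",
        "messages": [{"role": "user", "content": "Hello!"}],
    }

    response = requests.post(f"{base_url}/chat/completions", json=payload, timeout=60)
    print(response.json()["choices"][0]["message"]["content"])
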
lemonade/tools/server/llamacpp.py

@@ -88,9 +88,8 @@ class LlamaTelemetry(WrappedServerTelemetry):
 
  class LlamaServer(WrappedServer):
      def __init__(self, backend: str):
-         self.telemetry = LlamaTelemetry()
          self.backend = backend
-         super().__init__(server_name="llama-server", telemetry=self.telemetry)
+         super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())
 
      def install_server(self, backend=None):
          """
@@ -157,13 +156,23 @@ class LlamaServer(WrappedServer):
 
          # Find a port, and save it in the telemetry object for future reference
          # by other functions
-         self.choose_port()
+         self._choose_port()
 
          # Add port and jinja to enable tool use
          base_command.extend(["--port", str(self.port), "--jinja"])
 
          # Enable context shift and avoid attention sink issues by preserving the initial tokens
-         base_command.extend(["--context-shift", "--keep", "16"])
+         # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+         # Only add context-shift for backends that support it
+         context_shift_supported_backends = ["vulkan", "rocm"]
+         if self.backend in context_shift_supported_backends:
+             base_command.extend(["--context-shift", "--keep", "16"])
+         else:
+             # For backends that don't support context-shift (e.g., Metal), just use keep
+             base_command.extend(["--keep", "16"])
+             logging.debug(
+                 f"Skipped --context-shift for backend: {self.backend} (not supported)"
+             )
 
          # Use legacy reasoning formatting, since not all apps support the new
          # reasoning_content field
@@ -192,7 +201,8 @@ class LlamaServer(WrappedServer):
              env.update(os.environ)
              logging.debug(f"Loaded environment variables from {env_file_path}")
 
-         if platform.system().lower() == "linux":
+         system = platform.system().lower()
+         if system == "linux":
              lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
              current_ld_path = env.get("LD_LIBRARY_PATH", "")
              if current_ld_path:
@@ -200,6 +210,14 @@ class LlamaServer(WrappedServer):
              else:
                  env["LD_LIBRARY_PATH"] = lib_dir
              logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+         elif system == "darwin":
+             lib_dir = os.path.dirname(exe_path)
+             current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+             if current_dyld_path:
+                 env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+             else:
+                 env["DYLD_LIBRARY_PATH"] = lib_dir
+             logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
 
          # Start subprocess with output capture
          self.process = subprocess.Popen(
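
The Linux and macOS branches above differ only in which loader variable they prepend to (LD_LIBRARY_PATH versus DYLD_LIBRARY_PATH). A refactoring sketch of that shared pattern, offered as an illustration rather than code from this release:

    import os
    import platform

    def prepend_library_path(env: dict, lib_dir: str) -> None:
        """Prepend lib_dir to the dynamic-loader search path for the current OS."""
        var = (
            "DYLD_LIBRARY_PATH"
            if platform.system().lower() == "darwin"
            else "LD_LIBRARY_PATH"
        )
        current = env.get(var, "")
        env[var] = f"{lib_dir}:{current}" if current else lib_dir

    # Usage sketch with a hypothetical directory:
    env = os.environ.copy()
    prepend_library_path(env, "/path/to/llama/libs")
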