lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lemonade-sdk might be problematic.

Files changed (53)
  1. lemonade/cache.py +6 -1
  2. lemonade/cli.py +47 -5
  3. lemonade/common/inference_engines.py +13 -4
  4. lemonade/common/status.py +4 -4
  5. lemonade/common/system_info.py +544 -1
  6. lemonade/profilers/agt_power.py +437 -0
  7. lemonade/profilers/hwinfo_power.py +429 -0
  8. lemonade/tools/accuracy.py +143 -48
  9. lemonade/tools/adapter.py +6 -1
  10. lemonade/tools/bench.py +26 -8
  11. lemonade/tools/flm/__init__.py +1 -0
  12. lemonade/tools/flm/utils.py +303 -0
  13. lemonade/tools/huggingface/bench.py +6 -1
  14. lemonade/tools/llamacpp/bench.py +146 -27
  15. lemonade/tools/llamacpp/load.py +30 -2
  16. lemonade/tools/llamacpp/utils.py +393 -33
  17. lemonade/tools/oga/bench.py +5 -26
  18. lemonade/tools/oga/load.py +60 -121
  19. lemonade/tools/oga/migration.py +403 -0
  20. lemonade/tools/report/table.py +76 -8
  21. lemonade/tools/server/flm.py +133 -0
  22. lemonade/tools/server/llamacpp.py +220 -553
  23. lemonade/tools/server/serve.py +684 -168
  24. lemonade/tools/server/static/js/chat.js +666 -342
  25. lemonade/tools/server/static/js/model-settings.js +24 -3
  26. lemonade/tools/server/static/js/models.js +597 -73
  27. lemonade/tools/server/static/js/shared.js +79 -14
  28. lemonade/tools/server/static/logs.html +191 -0
  29. lemonade/tools/server/static/styles.css +491 -66
  30. lemonade/tools/server/static/webapp.html +83 -31
  31. lemonade/tools/server/tray.py +158 -38
  32. lemonade/tools/server/utils/macos_tray.py +226 -0
  33. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  34. lemonade/tools/server/webapp.py +4 -1
  35. lemonade/tools/server/wrapped_server.py +559 -0
  36. lemonade/version.py +1 -1
  37. lemonade_install/install.py +54 -611
  38. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
  39. lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
  40. lemonade_server/cli.py +145 -37
  41. lemonade_server/model_manager.py +521 -37
  42. lemonade_server/pydantic_models.py +28 -1
  43. lemonade_server/server_models.json +246 -92
  44. lemonade_server/settings.py +39 -39
  45. lemonade/tools/quark/__init__.py +0 -0
  46. lemonade/tools/quark/quark_load.py +0 -173
  47. lemonade/tools/quark/quark_quantize.py +0 -439
  48. lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
  49. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  50. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  51. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  52. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  53. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
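
The hunks below are from lemonade/tools/llamacpp/utils.py (file 16 in the list above).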
@@ -3,19 +3,22 @@ import os
  import platform
  import shutil
  import sys
+ import threading
+ import time
  import zipfile
  from typing import Optional
+ import psutil
  import subprocess
  import requests
+ import lemonade.common.build as build
  import lemonade.common.printing as printing
  from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
-
  from lemonade.common.system_info import get_system_info
-
  from dotenv import set_key, load_dotenv

- LLAMA_VERSION_VULKAN = "b6097"
- LLAMA_VERSION_ROCM = "b1021"
+ LLAMA_VERSION_VULKAN = "b6510"
+ LLAMA_VERSION_ROCM = "b1066"
+ LLAMA_VERSION_METAL = "b6510"


  def identify_rocm_arch_from_name(device_name: str) -> str | None:
@@ -126,8 +129,12 @@ def get_llama_version(backend: str) -> str:
          return LLAMA_VERSION_ROCM
      elif backend == "vulkan":
          return LLAMA_VERSION_VULKAN
+     elif backend == "metal":
+         return LLAMA_VERSION_METAL
      else:
-         raise ValueError(f"Unsupported backend: {backend}")
+         raise ValueError(
+             f"Unsupported backend: {backend}. Supported: vulkan, rocm, metal"
+         )


  def get_llama_folder_path(backend: str):
@@ -142,10 +149,12 @@ def get_llama_exe_path(exe_name: str, backend: str):
      Get path to platform-specific llama-server executable
      """
      base_dir = get_llama_folder_path(backend)
-     if platform.system().lower() == "windows":
+     system = platform.system().lower()
+
+     if system == "windows":
          return os.path.join(base_dir, f"{exe_name}.exe")
-     else:  # Linux/Ubuntu
-         # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+     else:  # Darwin/Linux/Ubuntu
+         # Check if executable exists in build/bin subdirectory
          build_bin_path = os.path.join(base_dir, "build", "bin", exe_name)
          if os.path.exists(build_bin_path):
              return build_bin_path
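
For orientation, a minimal sketch of the resolution logic in this hunk. base_dir stands in for get_llama_folder_path(backend); the final fallback when build/bin is missing sits outside this hunk, so that line is an assumption.

    import os
    import platform

    def sketch_llama_exe_path(base_dir: str, exe_name: str) -> str:
        # Windows releases ship flat .exe files; macOS/Linux archives use build/bin/.
        if platform.system().lower() == "windows":
            return os.path.join(base_dir, f"{exe_name}.exe")
        candidate = os.path.join(base_dir, "build", "bin", exe_name)
        # Assumed fallback; the real function continues past this hunk.
        return candidate if os.path.exists(candidate) else os.path.join(base_dir, exe_name)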
@@ -168,6 +177,13 @@ def get_llama_cli_exe_path(backend: str):
      return get_llama_exe_path("llama-cli", backend)


+ def get_llama_bench_exe_path(backend: str):
+     """
+     Get path to platform-specific llama-bench executable
+     """
+     return get_llama_exe_path("llama-bench", backend)
+
+
  def get_version_txt_path(backend: str):
      """
      Get path to text file that contains version information
@@ -223,8 +239,24 @@ def get_binary_url_and_filename(backend: str, target_arch: str = None):
              raise NotImplementedError(
                  f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
              )
+
+     elif backend == "metal":
+         # Metal support for macOS Apple Silicon from ggml-org/llama.cpp
+         repo = "ggml-org/llama.cpp"
+         version = LLAMA_VERSION_METAL
+         if system == "darwin":
+             if platform.machine().lower() in ["arm64", "aarch64"]:
+                 filename = f"llama-{version}-bin-macos-arm64.zip"
+             else:
+                 raise NotImplementedError(
+                     "Metal backend only supports Apple Silicon (ARM64) processors"
+                 )
+         else:
+             raise NotImplementedError(
+                 f"Platform {system} not supported for Metal llamacpp. Metal is only supported on macOS"
+             )
      else:
-         supported_backends = ["vulkan", "rocm"]
+         supported_backends = ["vulkan", "rocm", "metal"]
          raise NotImplementedError(
              f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
          )
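
As a worked example of the new metal branch, on an Apple Silicon Mac the asset name resolves as below. The GitHub release URL join happens outside this hunk, so that line is an assumption.

    # Illustrative only: how the metal branch composes its asset name.
    version = "b6510"  # LLAMA_VERSION_METAL
    filename = f"llama-{version}-bin-macos-arm64.zip"
    # Assumed URL layout for a ggml-org/llama.cpp release asset:
    url = f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
    print(filename)  # llama-b6510-bin-macos-arm64.zip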
@@ -239,10 +271,10 @@ def validate_platform_support():
      """
      system = platform.system().lower()

-     if system not in ["windows", "linux"]:
+     if system not in ["windows", "linux", "darwin"]:
          raise NotImplementedError(
              f"Platform {system} not supported for llamacpp. "
-             "Supported: Windows, Ubuntu Linux"
+             "Supported: Windows, Ubuntu Linux, macOS"
          )

      if system == "linux":
@@ -341,12 +373,39 @@ def install_llamacpp(backend):
      if filename.endswith(".zip"):
          with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
              zip_ref.extractall(llama_server_exe_dir)
+
+         # On Unix-like systems (macOS/Linux), make executables executable
+         if platform.system().lower() in ["darwin", "linux"]:
+             import stat
+
+             # Find and make executable files executable
+             for root, _, files in os.walk(llama_server_exe_dir):
+                 for file in files:
+                     file_path = os.path.join(root, file)
+                     # Make files in bin/ directories executable
+                     if "bin" in root.split(os.sep) or file in [
+                         "llama-server",
+                         "llama-simple",
+                     ]:
+                         try:
+                             current_permissions = os.stat(file_path).st_mode
+                             os.chmod(file_path, current_permissions | stat.S_IEXEC)
+                             logging.debug(f"Made {file_path} executable")
+                         except Exception as e:
+                             raise RuntimeError(
+                                 f"Failed to make {file_path} executable. This will prevent "
+                                 f"llama-server from starting. Error: {e}"
+                             )
      else:
          raise NotImplementedError(f"Unsupported archive format: {filename}")

      # Identify and set HIP ID
      if backend == "rocm":
-         hip_id = identify_hip_id()
+         try:
+             hip_id = identify_hip_id()
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             hip_id = 0
+             logging.warning(f"Error identifying HIP ID: {e}. Falling back to 0.")
          env_file_path = os.path.join(llama_server_exe_dir, ".env")
          set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))

@@ -356,6 +415,7 @@ def install_llamacpp(backend):
      exe_paths = [
          (get_llama_server_exe_path(backend), "llama-server"),
          (get_llama_cli_exe_path(backend), "llama-cli"),
+         (get_llama_bench_exe_path(backend), "llama-bench"),
      ]

      for exe_path, exe_name in exe_paths:
@@ -496,7 +556,7 @@ def get_local_checkpoint_path(base_checkpoint, variant):


  def identify_gguf_models(
-     checkpoint: str, variant: str, mmproj: str
+     checkpoint: str, variant: Optional[str], mmproj: str
  ) -> tuple[dict, list[str]]:
      """
      Identifies the GGUF model files in the repository that match the variant.
@@ -506,12 +566,14 @@ def identify_gguf_models(
      The CHECKPOINT:VARIANT scheme is used to specify model files in Hugging Face repositories.

      The VARIANT format can be one of several types:
+     0. wildcard (*): download all .gguf files in the repo
      1. Full filename: exact file to download
      2. None/empty: gets the first .gguf file in the repository (excludes mmproj files)
      3. Quantization variant: find a single file ending with the variant name (case insensitive)
      4. Folder name: downloads all .gguf files in the folder that matches the variant name (case insensitive)

      Examples:
+     - "ggml-org/gpt-oss-120b-GGUF:*" -> downloads all .gguf files in repo
      - "unsloth/Qwen3-8B-GGUF:qwen3.gguf" -> downloads "qwen3.gguf"
      - "unsloth/Qwen3-30B-A3B-GGUF" -> downloads "Qwen3-30B-A3B-GGUF.gguf"
      - "unsloth/Qwen3-8B-GGUF:Q4_1" -> downloads "Qwen3-8B-GGUF-Q4_1.gguf"
@@ -523,8 +585,18 @@ def identify_gguf_models(
      repo_files = list_repo_files(checkpoint)
      sharded_files = []

+     # (case 0) Wildcard, download everything
+     if variant and variant == "*":
+         sharded_files = [f for f in repo_files if f.endswith(".gguf")]
+
+         # Sort to ensure consistent ordering
+         sharded_files.sort()
+
+         # Use first file as primary (this is how llamacpp handles it)
+         variant_name = sharded_files[0]
+
      # (case 1) If variant ends in .gguf, use it directly
-     if variant and variant.endswith(".gguf"):
+     elif variant and variant.endswith(".gguf"):
          variant_name = variant
          if variant_name not in repo_files:
              raise ValueError(
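
A small sketch of the new wildcard case, using made-up shard names; repo_files stands in for the list returned by list_repo_files(checkpoint).

    # Hypothetical listing for a "some-org/some-model-GGUF:*" checkpoint.
    repo_files = [
        "model-Q4_K_M-00002-of-00002.gguf",
        "model-Q4_K_M-00001-of-00002.gguf",
        "README.md",
    ]
    sharded_files = sorted(f for f in repo_files if f.endswith(".gguf"))
    variant_name = sharded_files[0]  # first shard is handed to llama.cpp, which locates the rest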
@@ -585,15 +657,91 @@ def identify_gguf_models(
      return core_files, sharded_files


- def download_gguf(config_checkpoint, config_mmproj=None, do_not_upgrade=False) -> dict:
+ def resolve_local_gguf_model(
+     checkpoint: str, variant: str, config_mmproj: str = None
+ ) -> dict | None:
+     """
+     Attempts to resolve a GGUF model from the local HuggingFace cache.
      """
-     Downloads the GGUF file for the given model configuration.
+     from huggingface_hub.constants import HF_HUB_CACHE
+
+     # Convert checkpoint to cache directory format
+     if checkpoint.startswith("models--"):
+         model_cache_dir = os.path.join(HF_HUB_CACHE, checkpoint)
+     else:
+         # This is a HuggingFace repo - convert to cache directory format
+         repo_cache_name = checkpoint.replace("/", "--")
+         model_cache_dir = os.path.join(HF_HUB_CACHE, f"models--{repo_cache_name}")
+
+     # Check if the cache directory exists
+     if not os.path.exists(model_cache_dir):
+         return None

-     For sharded models, if the variant points to a folder (e.g. Q4_0), all files in that folder
-     will be downloaded but only the first file will be returned for loading.
+     gguf_file_found = None
+
+     # If variant is specified, look for that specific file
+     if variant:
+         search_term = variant if variant.endswith(".gguf") else f"{variant}.gguf"
+
+         for root, _, files in os.walk(model_cache_dir):
+             if search_term in files:
+                 gguf_file_found = os.path.join(root, search_term)
+                 break
+
+     # If no variant or variant not found, find any .gguf file (excluding mmproj)
+     if not gguf_file_found:
+         for root, _, files in os.walk(model_cache_dir):
+             gguf_files = [
+                 f for f in files if f.endswith(".gguf") and "mmproj" not in f.lower()
+             ]
+             if gguf_files:
+                 gguf_file_found = os.path.join(root, gguf_files[0])
+                 break
+
+     # If no GGUF file found, model is not in cache
+     if not gguf_file_found:
+         return None
+
+     # Build result dictionary
+     result = {"variant": gguf_file_found}
+
+     # Search for mmproj file if provided
+     if config_mmproj:
+         for root, _, files in os.walk(model_cache_dir):
+             if config_mmproj in files:
+                 result["mmproj"] = os.path.join(root, config_mmproj)
+                 break
+
+     logging.info(f"Resolved local GGUF model: {result}")
+     return result
+
+
+ def download_gguf(
+     config_checkpoint: str, config_mmproj=None, do_not_upgrade: bool = False
+ ) -> dict:
      """
+     Downloads the GGUF file for the given model configuration from HuggingFace.
+
+     This function downloads models from the internet. It does NOT check the local cache first.
+     Callers should use resolve_local_gguf_model() if they want to check for existing models first.
+
+     Args:
+         config_checkpoint: Checkpoint identifier (file path or HF repo with variant)
+         config_mmproj: Optional mmproj file to also download
+         do_not_upgrade: If True, use local cache only without attempting to download updates

-     # This code handles all cases by constructing the appropriate filename or pattern
+     Returns:
+         Dictionary with "variant" (and optionally "mmproj") file paths
+     """
+     # Handle direct file path case - if the checkpoint is an actual file on disk
+     if os.path.exists(config_checkpoint):
+         result = {"variant": config_checkpoint}
+         if config_mmproj:
+             result["mmproj"] = config_mmproj
+         return result
+
+     # Parse checkpoint to extract base and variant
+     # Checkpoint format: repo_name:variant (e.g., "unsloth/Qwen3-0.6B-GGUF:Q4_0")
      checkpoint, variant = parse_checkpoint(config_checkpoint)

      # Identify the GGUF model files in the repository that match the variant
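
For reference, a minimal sketch of the CHECKPOINT:VARIANT split that download_gguf() relies on. parse_checkpoint() lives elsewhere in the package, so this stand-in only covers the plain colon-separated case.

    # Stand-in for parse_checkpoint() on the common "repo:variant" form.
    config_checkpoint = "unsloth/Qwen3-0.6B-GGUF:Q4_0"  # example from the docstring above
    checkpoint, sep, variant = config_checkpoint.partition(":")
    if not sep:
        variant = None  # no variant given; identify_gguf_models() then picks the first .gguf
    # checkpoint == "unsloth/Qwen3-0.6B-GGUF", variant == "Q4_0"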
@@ -624,6 +772,37 @@ def download_gguf(config_checkpoint, config_mmproj=None, do_not_upgrade=False) -
      }


+ # Function to read a stream (stdout or stderr) into a list
+ def stream_reader(stream, output_list):
+     for line in iter(stream.readline, b""):
+         decoded_line = line.decode().rstrip()
+         output_list.append(decoded_line)
+     stream.close()
+
+
+ def monitor_process_memory(pid, memory_data, interval=0.5):
+     """Monitor memory usage of a process in a separate thread."""
+
+     try:
+         is_windows = platform.system() == "Windows"
+         if is_windows:
+             # We can only collect peak_wset in Windows
+             process = psutil.Process(pid)
+             while process.is_running():
+                 try:
+                     mem_info = process.memory_info()
+                     peak_wset = mem_info.peak_wset
+                     if peak_wset is not None:
+                         memory_data["peak_wset"] = peak_wset
+                 except psutil.NoSuchProcess:
+                     break
+                 time.sleep(interval)
+     except Exception as e:
+         print(f"Error monitoring process: {e}")
+
+     return memory_data
+
+
  class LlamaCppTokenizerAdapter(PassthroughTokenizer):
      pass

@@ -637,8 +816,10 @@ class LlamaCppAdapter(ModelAdapter):
          context_size,
          threads,
          executable,
+         bench_executable,
          reasoning=False,
          lib_dir=None,
+         state=None,
      ):
          super().__init__()

@@ -650,8 +831,10 @@ class LlamaCppAdapter(ModelAdapter):
          self.context_size = context_size
          self.threads = threads
          self.executable = os.path.normpath(executable)
+         self.bench_executable = os.path.normpath(bench_executable)
          self.reasoning = reasoning
          self.lib_dir = lib_dir
+         self.state = state

      def generate(
          self,
@@ -661,6 +844,7 @@ class LlamaCppAdapter(ModelAdapter):
          top_p: float = 0.95,
          top_k: int = 40,
          return_raw: bool = False,
+         save_max_memory_used: bool = False,
          **kwargs,  # pylint: disable=unused-argument
      ):
          """
@@ -692,32 +876,54 @@ class LlamaCppAdapter(ModelAdapter):
              self.executable,
              "-m",
              self.model,
-             "--ctx-size",
+             "--ctx-size",  # size of the prompt context, 0 = loaded from model
              str(self.context_size),
-             "-n",
+             "-n",  # number of tokens to predict, -1 = infinity, =2 - until context filled
              str(n_predict),
-             "-t",
+             "-t",  # number of threads to use during generation
              str(self.threads),
              "-p",
              prompt,
+             "-b",  # logical maximum batch size
+             "1",
+             "-ub",  # physical maximum batch size
+             "1",
              "--temp",
              str(temperature),
              "--top-p",
              str(top_p),
              "--top-k",
              str(top_k),
-             "-e",
-             "-no-cnv",
-             "--reasoning-format",
+             "-e",  # process escape sequences
+             "--no-conversation",  # disable conversation mode
+             "--reasoning-format",  # leaves thoughts unparsed in message content
              "none",
          ]

+         # If prompt exceeds 500 characters, then use a file
+         if len(prompt) < 500:
+             cmd += ["-p", prompt]
+         else:
+             # Create prompt file in cache directory
+             prompt_file = os.path.join(
+                 build.output_dir(self.state.cache_dir, self.state.build_name),
+                 "prompt.txt",
+             )
+             with open(prompt_file, "w", encoding="utf-8") as file:
+                 file.write(prompt)
+             cmd += ["-f", prompt_file]
+
          # Configure GPU layers: 99 for GPU, 0 for CPU-only
          ngl_value = "99" if self.device == "igpu" else "0"
          cmd = cmd + ["-ngl", ngl_value]

          cmd = [str(m) for m in cmd]

+         # save llama-cli command
+         self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+             " ".join(cmd)
+         ]
+
          try:
              # Set up environment with library path for Linux
              env = os.environ.copy()
@@ -746,15 +952,35 @@ class LlamaCppAdapter(ModelAdapter):
                  env=env,
              )

-             raw_output, stderr = process.communicate(timeout=600)
+             # Start memory monitoring in a separate thread
+             if save_max_memory_used:
+                 memory_data = {}
+                 monitor_thread = threading.Thread(
+                     target=monitor_process_memory,
+                     args=(process.pid, memory_data),
+                     daemon=True,
+                 )
+                 monitor_thread.start()
+
+             # Communicate with the subprocess
+             stdout, stderr = process.communicate(timeout=600)
+
+             # save llama-cli command output with performance info to state
+             # (can be viewed in state.yaml file in cache)
+             self.state.llama_cli_stderr = getattr(
+                 self.state, "llama_cli_stderr", []
+             ) + [
+                 [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+             ]
+
              if process.returncode != 0:
                  error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                  error_msg += f"Command: {' '.join(cmd)}\n"
                  error_msg += f"Error output:\n{stderr}\n"
-                 error_msg += f"Standard output:\n{raw_output}"
+                 error_msg += f"Standard output:\n{stdout}"
                  raise Exception(error_msg)

-             if raw_output is None:
+             if stdout is None:
                  raise Exception("No output received from llama.cpp process")

              # Parse information from llama.cpp output
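
A short sketch of the stderr filter above. The sample lines are illustrative, not captured llama.cpp output; the point is that only lines prefixed with llama_perf_ are kept in state.

    stderr = "\n".join([
        "build: 6510 (abcdef0) with clang",                        # dropped
        "llama_perf_sampler_print:    sampling time = 1.23 ms",    # kept
        "llama_perf_context_print:       total time = 456.78 ms",  # kept
    ])
    perf_lines = [line for line in stderr.splitlines() if line.startswith("llama_perf_")]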
@@ -785,14 +1011,19 @@ class LlamaCppAdapter(ModelAdapter):
                  else 0
              )

+             # Wait for monitor thread to finish and write peak_wset
+             if save_max_memory_used:
+                 monitor_thread.join(timeout=2)
+                 self.peak_wset = memory_data.get("peak_wset", None)
+
              if return_raw:
-                 return [raw_output, stderr]
+                 return [stdout, stderr]

              # Find where the prompt ends and the generated text begins
              prompt_found = False
              output_text = ""
              prompt_first_line = prompt.split("\n")[0]
-             for line in raw_output.splitlines():
+             for line in stdout.splitlines():
                  if prompt_first_line in line:
                      prompt_found = True
                  if prompt_found:
@@ -803,7 +1034,7 @@ class LlamaCppAdapter(ModelAdapter):
                  raise Exception(
                      f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
                      "This usually means the model failed to process the prompt correctly.\n"
-                     f"Raw output:\n{raw_output}\n"
+                     f"Raw output:\n{stdout}\n"
                      f"Stderr:\n{stderr}"
                  )

@@ -811,10 +1042,137 @@ class LlamaCppAdapter(ModelAdapter):
              return [output_text]

          except Exception as e:
-             error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
+             error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
              error_msg += f"Command: {' '.join(cmd)}"
              raise Exception(error_msg)

+     def benchmark(self, prompt, iterations, output_tokens):
+         """
+         Runs the llama-bench.exe tool to measure TTFT and TPS
+         """
+         cmd = [
+             self.bench_executable,
+             "-m",
+             self.model,
+             "-r",
+             iterations,
+             "-p",
+             str(prompt),
+             "-n",
+             output_tokens,
+             "-t",
+             self.threads if self.threads > 0 else 16,
+             "-b",
+             1,
+             "-ub",
+             1,
+         ]
+         ngl_value = "99" if self.device == "igpu" else "0"
+         cmd = cmd + ["-ngl", ngl_value]
+         cmd = [str(m) for m in cmd]
+
+         # save llama-bench command
+         self.state.llama_bench_cmd = " ".join(cmd)
+
+         try:
+             # Set up environment with library path for Linux
+             env = os.environ.copy()
+
+             # Load environment variables from .env file in the executable directory
+             exe_dir = os.path.dirname(self.executable)
+             env_file_path = os.path.join(exe_dir, ".env")
+             if os.path.exists(env_file_path):
+                 load_dotenv(env_file_path, override=True)
+                 env.update(os.environ)
+
+             if self.lib_dir and os.name != "nt":  # Not Windows
+                 current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                 if current_ld_path:
+                     env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                 else:
+                     env["LD_LIBRARY_PATH"] = self.lib_dir
+
+             process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 universal_newlines=True,
+                 encoding="utf-8",
+                 errors="replace",
+                 env=env,
+             )
+
+             # Start memory monitoring in a separate thread
+             save_max_memory_used = platform.system() == "Windows"
+             if save_max_memory_used:
+                 memory_data = {}
+                 monitor_thread = threading.Thread(
+                     target=monitor_process_memory,
+                     args=(process.pid, memory_data),
+                     daemon=True,
+                 )
+                 monitor_thread.start()
+
+             # Communicate with the subprocess
+             stdout, stderr = process.communicate(timeout=600)
+
+             # save llama-bench command output with performance info to state
+             # (can be viewed in state.yaml file in cache)
+             self.state.llama_bench_standard_output = stdout.splitlines()
+
+             if process.returncode != 0:
+                 error_msg = (
+                     f"llama-bench.exe failed with return code {process.returncode}.\n"
+                 )
+                 error_msg += f"Command: {' '.join(cmd)}\n"
+                 error_msg += f"Error output:\n{stderr}\n"
+                 error_msg += f"Standard output:\n{stdout}"
+                 raise Exception(error_msg)
+
+             if stdout is None:
+                 error_msg = "No output received from llama-bench.exe process\n"
+                 error_msg += f"Error output:\n{stderr}\n"
+                 error_msg += f"Standard output:\n{stdout}"
+                 raise Exception(error_msg)
+
+             # Parse information from llama-bench.exe output
+             prompt_length = None
+             pp_tps = None
+             pp_tps_sd = None
+             tg_tps = None
+             tg_tps_sd = None
+
+             for line in stdout.splitlines():
+                 # Parse TPS information
+                 if f"pp{prompt:d}" in line:
+                     parts = line.split("|")
+                     timings = parts[-2].strip().split(" ")
+                     prompt_length = prompt
+                     pp_tps = float(timings[0])
+                     pp_tps_sd = float(timings[-1])
+                 if f"tg{output_tokens:d}" in line:
+                     parts = line.split("|")
+                     timings = parts[-2].strip().split(" ")
+                     tg_tps = float(timings[0])
+                     tg_tps_sd = float(timings[-1])
+
+         except Exception as e:
+             error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
+             error_msg += f"Command: {' '.join(cmd)}"
+             raise Exception(error_msg)
+
+         # Determine max memory used
+         if save_max_memory_used:
+             # Wait for monitor thread to finish
+             monitor_thread.join(timeout=2)
+
+             # Track memory usage concurrently
+             peak_wset = memory_data.get("peak_wset", None)
+         else:
+             peak_wset = None
+
+         return prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset
+

  def get_hip_devices():
      """Get list of HIP devices with their IDs and names."""
@@ -841,7 +1199,9 @@ def get_hip_devices():
      try:
          libhip = ctypes.CDLL(matching_files[0])
      except OSError:
-         raise RuntimeError(f"Could not load HIP runtime library from {path}")
+         raise RuntimeError(
+             f"Could not load HIP runtime library from {matching_files[0]}"
+         )

      # Setup function signatures
      hipError_t = c_int