lemonade-sdk 8.1.4-py3-none-any.whl → 8.2.2-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- lemonade/cache.py +6 -1
- lemonade/cli.py +47 -5
- lemonade/common/inference_engines.py +13 -4
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +544 -1
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +303 -0
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +393 -33
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +60 -121
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +220 -553
- lemonade/tools/server/serve.py +684 -168
- lemonade/tools/server/static/js/chat.js +666 -342
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +597 -73
- lemonade/tools/server/static/js/shared.js +79 -14
- lemonade/tools/server/static/logs.html +191 -0
- lemonade/tools/server/static/styles.css +491 -66
- lemonade/tools/server/static/webapp.html +83 -31
- lemonade/tools/server/tray.py +158 -38
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
- lemonade/tools/server/webapp.py +4 -1
- lemonade/tools/server/wrapped_server.py +559 -0
- lemonade/version.py +1 -1
- lemonade_install/install.py +54 -611
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
- lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
- lemonade_server/cli.py +145 -37
- lemonade_server/model_manager.py +521 -37
- lemonade_server/pydantic_models.py +28 -1
- lemonade_server/server_models.json +246 -92
- lemonade_server/settings.py +39 -39
- lemonade/tools/quark/__init__.py +0 -0
- lemonade/tools/quark/quark_load.py +0 -173
- lemonade/tools/quark/quark_quantize.py +0 -439
- lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/tools/llamacpp/utils.py
CHANGED
@@ -3,19 +3,22 @@ import os
 import platform
 import shutil
 import sys
+import threading
+import time
 import zipfile
 from typing import Optional
+import psutil
 import subprocess
 import requests
+import lemonade.common.build as build
 import lemonade.common.printing as printing
 from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
-
 from lemonade.common.system_info import get_system_info
-
 from dotenv import set_key, load_dotenv

-LLAMA_VERSION_VULKAN = "
-LLAMA_VERSION_ROCM = "
+LLAMA_VERSION_VULKAN = "b6510"
+LLAMA_VERSION_ROCM = "b1066"
+LLAMA_VERSION_METAL = "b6510"


 def identify_rocm_arch_from_name(device_name: str) -> str | None:
@@ -126,8 +129,12 @@ def get_llama_version(backend: str) -> str:
         return LLAMA_VERSION_ROCM
     elif backend == "vulkan":
         return LLAMA_VERSION_VULKAN
+    elif backend == "metal":
+        return LLAMA_VERSION_METAL
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Unsupported backend: {backend}. Supported: vulkan, rocm, metal"
+        )


 def get_llama_folder_path(backend: str):
@@ -142,10 +149,12 @@ def get_llama_exe_path(exe_name: str, backend: str):
     Get path to platform-specific llama-server executable
     """
     base_dir = get_llama_folder_path(backend)
-
+    system = platform.system().lower()
+
+    if system == "windows":
         return os.path.join(base_dir, f"{exe_name}.exe")
-    else:  # Linux/Ubuntu
-        # Check if executable exists in build/bin subdirectory
+    else:  # Darwin/Linux/Ubuntu
+        # Check if executable exists in build/bin subdirectory
         build_bin_path = os.path.join(base_dir, "build", "bin", exe_name)
         if os.path.exists(build_bin_path):
             return build_bin_path
@@ -168,6 +177,13 @@ def get_llama_cli_exe_path(backend: str):
     return get_llama_exe_path("llama-cli", backend)


+def get_llama_bench_exe_path(backend: str):
+    """
+    Get path to platform-specific llama-bench executable
+    """
+    return get_llama_exe_path("llama-bench", backend)
+
+
 def get_version_txt_path(backend: str):
     """
     Get path to text file that contains version information
@@ -223,8 +239,24 @@ def get_binary_url_and_filename(backend: str, target_arch: str = None):
             raise NotImplementedError(
                 f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
             )
+
+    elif backend == "metal":
+        # Metal support for macOS Apple Silicon from ggml-org/llama.cpp
+        repo = "ggml-org/llama.cpp"
+        version = LLAMA_VERSION_METAL
+        if system == "darwin":
+            if platform.machine().lower() in ["arm64", "aarch64"]:
+                filename = f"llama-{version}-bin-macos-arm64.zip"
+            else:
+                raise NotImplementedError(
+                    "Metal backend only supports Apple Silicon (ARM64) processors"
+                )
+        else:
+            raise NotImplementedError(
+                f"Platform {system} not supported for Metal llamacpp. Metal is only supported on macOS"
+            )
     else:
-        supported_backends = ["vulkan", "rocm"]
+        supported_backends = ["vulkan", "rocm", "metal"]
         raise NotImplementedError(
             f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
         )
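Editor's note on the Metal hunk above: LLAMA_VERSION_METAL is pinned to b6510 and the asset name is built from the f-string shown in the diff. A minimal sketch of how those pieces would combine into a download location, assuming the standard GitHub release-asset URL layout (the URL-building code itself is outside this hunk):

    repo = "ggml-org/llama.cpp"
    version = "b6510"  # LLAMA_VERSION_METAL in this release
    filename = f"llama-{version}-bin-macos-arm64.zip"
    # Assumed release-asset layout; not shown in the hunk above.
    url = f"https://github.com/{repo}/releases/download/{version}/{filename}"
    print(url)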
@@ -239,10 +271,10 @@ def validate_platform_support():
     """
     system = platform.system().lower()

-    if system not in ["windows", "linux"]:
+    if system not in ["windows", "linux", "darwin"]:
         raise NotImplementedError(
             f"Platform {system} not supported for llamacpp. "
-            "Supported: Windows, Ubuntu Linux"
+            "Supported: Windows, Ubuntu Linux, macOS"
         )

     if system == "linux":
@@ -341,12 +373,39 @@ def install_llamacpp(backend):
     if filename.endswith(".zip"):
         with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
             zip_ref.extractall(llama_server_exe_dir)
+
+        # On Unix-like systems (macOS/Linux), make executables executable
+        if platform.system().lower() in ["darwin", "linux"]:
+            import stat
+
+            # Find and make executable files executable
+            for root, _, files in os.walk(llama_server_exe_dir):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    # Make files in bin/ directories executable
+                    if "bin" in root.split(os.sep) or file in [
+                        "llama-server",
+                        "llama-simple",
+                    ]:
+                        try:
+                            current_permissions = os.stat(file_path).st_mode
+                            os.chmod(file_path, current_permissions | stat.S_IEXEC)
+                            logging.debug(f"Made {file_path} executable")
+                        except Exception as e:
+                            raise RuntimeError(
+                                f"Failed to make {file_path} executable. This will prevent "
+                                f"llama-server from starting. Error: {e}"
+                            )
     else:
         raise NotImplementedError(f"Unsupported archive format: {filename}")

     # Identify and set HIP ID
     if backend == "rocm":
-
+        try:
+            hip_id = identify_hip_id()
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            hip_id = 0
+            logging.warning(f"Error identifying HIP ID: {e}. Falling back to 0.")
         env_file_path = os.path.join(llama_server_exe_dir, ".env")
         set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))

@@ -356,6 +415,7 @@ def install_llamacpp(backend):
     exe_paths = [
         (get_llama_server_exe_path(backend), "llama-server"),
         (get_llama_cli_exe_path(backend), "llama-cli"),
+        (get_llama_bench_exe_path(backend), "llama-bench"),
     ]

     for exe_path, exe_name in exe_paths:
@@ -496,7 +556,7 @@ def get_local_checkpoint_path(base_checkpoint, variant):


 def identify_gguf_models(
-    checkpoint: str, variant: str, mmproj: str
+    checkpoint: str, variant: Optional[str], mmproj: str
 ) -> tuple[dict, list[str]]:
     """
     Identifies the GGUF model files in the repository that match the variant.
@@ -506,12 +566,14 @@ def identify_gguf_models(
     The CHECKPOINT:VARIANT scheme is used to specify model files in Hugging Face repositories.

     The VARIANT format can be one of several types:
+    0. wildcard (*): download all .gguf files in the repo
     1. Full filename: exact file to download
     2. None/empty: gets the first .gguf file in the repository (excludes mmproj files)
     3. Quantization variant: find a single file ending with the variant name (case insensitive)
     4. Folder name: downloads all .gguf files in the folder that matches the variant name (case insensitive)

     Examples:
+    - "ggml-org/gpt-oss-120b-GGUF:*" -> downloads all .gguf files in repo
     - "unsloth/Qwen3-8B-GGUF:qwen3.gguf" -> downloads "qwen3.gguf"
     - "unsloth/Qwen3-30B-A3B-GGUF" -> downloads "Qwen3-30B-A3B-GGUF.gguf"
     - "unsloth/Qwen3-8B-GGUF:Q4_1" -> downloads "Qwen3-8B-GGUF-Q4_1.gguf"
@@ -523,8 +585,18 @@ def identify_gguf_models(
     repo_files = list_repo_files(checkpoint)
     sharded_files = []

+    # (case 0) Wildcard, download everything
+    if variant and variant == "*":
+        sharded_files = [f for f in repo_files if f.endswith(".gguf")]
+
+        # Sort to ensure consistent ordering
+        sharded_files.sort()
+
+        # Use first file as primary (this is how llamacpp handles it)
+        variant_name = sharded_files[0]
+
     # (case 1) If variant ends in .gguf, use it directly
-
+    elif variant and variant.endswith(".gguf"):
         variant_name = variant
         if variant_name not in repo_files:
             raise ValueError(
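Editor's note: the variant cases described in the updated docstring can be summarized with a small standalone sketch. This is a simplified, hypothetical helper for illustration only (cases 0-3; the folder case and the real identify_gguf_models implementation above are not reproduced here):

    from typing import Optional

    def pick_gguf_files(repo_files: list[str], variant: Optional[str]) -> list[str]:
        # Simplified illustration of VARIANT cases 0-3 from the docstring above.
        ggufs = sorted(f for f in repo_files if f.endswith(".gguf"))
        if variant == "*":  # case 0: wildcard, every .gguf file in the repo
            return ggufs
        if variant and variant.endswith(".gguf"):  # case 1: exact filename
            return [f for f in ggufs if f == variant]
        if not variant:  # case 2: first non-mmproj file
            return [f for f in ggufs if "mmproj" not in f.lower()][:1]
        # case 3: quantization suffix such as "Q4_1" (case-insensitive)
        return [f for f in ggufs if f.lower().endswith(f"{variant.lower()}.gguf")][:1]

    # pick_gguf_files(["m-Q4_1.gguf", "m-Q8_0.gguf"], "Q4_1") -> ["m-Q4_1.gguf"]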
@@ -585,15 +657,91 @@ def identify_gguf_models(
     return core_files, sharded_files


-def
+def resolve_local_gguf_model(
+    checkpoint: str, variant: str, config_mmproj: str = None
+) -> dict | None:
+    """
+    Attempts to resolve a GGUF model from the local HuggingFace cache.
     """
-
+    from huggingface_hub.constants import HF_HUB_CACHE
+
+    # Convert checkpoint to cache directory format
+    if checkpoint.startswith("models--"):
+        model_cache_dir = os.path.join(HF_HUB_CACHE, checkpoint)
+    else:
+        # This is a HuggingFace repo - convert to cache directory format
+        repo_cache_name = checkpoint.replace("/", "--")
+        model_cache_dir = os.path.join(HF_HUB_CACHE, f"models--{repo_cache_name}")
+
+    # Check if the cache directory exists
+    if not os.path.exists(model_cache_dir):
+        return None

-
-
+    gguf_file_found = None
+
+    # If variant is specified, look for that specific file
+    if variant:
+        search_term = variant if variant.endswith(".gguf") else f"{variant}.gguf"
+
+        for root, _, files in os.walk(model_cache_dir):
+            if search_term in files:
+                gguf_file_found = os.path.join(root, search_term)
+                break
+
+    # If no variant or variant not found, find any .gguf file (excluding mmproj)
+    if not gguf_file_found:
+        for root, _, files in os.walk(model_cache_dir):
+            gguf_files = [
+                f for f in files if f.endswith(".gguf") and "mmproj" not in f.lower()
+            ]
+            if gguf_files:
+                gguf_file_found = os.path.join(root, gguf_files[0])
+                break
+
+    # If no GGUF file found, model is not in cache
+    if not gguf_file_found:
+        return None
+
+    # Build result dictionary
+    result = {"variant": gguf_file_found}
+
+    # Search for mmproj file if provided
+    if config_mmproj:
+        for root, _, files in os.walk(model_cache_dir):
+            if config_mmproj in files:
+                result["mmproj"] = os.path.join(root, config_mmproj)
+                break
+
+    logging.info(f"Resolved local GGUF model: {result}")
+    return result
+
+
+def download_gguf(
+    config_checkpoint: str, config_mmproj=None, do_not_upgrade: bool = False
+) -> dict:
     """
+    Downloads the GGUF file for the given model configuration from HuggingFace.
+
+    This function downloads models from the internet. It does NOT check the local cache first.
+    Callers should use resolve_local_gguf_model() if they want to check for existing models first.
+
+    Args:
+        config_checkpoint: Checkpoint identifier (file path or HF repo with variant)
+        config_mmproj: Optional mmproj file to also download
+        do_not_upgrade: If True, use local cache only without attempting to download updates

-
+    Returns:
+        Dictionary with "variant" (and optionally "mmproj") file paths
+    """
+    # Handle direct file path case - if the checkpoint is an actual file on disk
+    if os.path.exists(config_checkpoint):
+        result = {"variant": config_checkpoint}
+        if config_mmproj:
+            result["mmproj"] = config_mmproj
+        return result
+
+    # Parse checkpoint to extract base and variant
+    # Checkpoint format: repo_name:variant (e.g., "unsloth/Qwen3-0.6B-GGUF:Q4_0")
     checkpoint, variant = parse_checkpoint(config_checkpoint)

     # Identify the GGUF model files in the repository that match the variant
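Editor's note: the docstrings above split responsibilities, with resolve_local_gguf_model consulting only the Hugging Face cache and download_gguf always going to the network. A minimal usage sketch of that pattern (module path inferred from the file name; the checkpoint string is the one used as an example in the diff's own comment):

    from lemonade.tools.llamacpp import utils

    checkpoint, variant = "unsloth/Qwen3-0.6B-GGUF", "Q4_0"
    files = utils.resolve_local_gguf_model(checkpoint, variant)
    if files is None:
        # Not in the local cache, so fetch from Hugging Face.
        files = utils.download_gguf(f"{checkpoint}:{variant}")
    print(files["variant"])  # path to the primary .gguf file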
@@ -624,6 +772,37 @@ def download_gguf(config_checkpoint, config_mmproj=None, do_not_upgrade=False) -
     }


+# Function to read a stream (stdout or stderr) into a list
+def stream_reader(stream, output_list):
+    for line in iter(stream.readline, b""):
+        decoded_line = line.decode().rstrip()
+        output_list.append(decoded_line)
+    stream.close()
+
+
+def monitor_process_memory(pid, memory_data, interval=0.5):
+    """Monitor memory usage of a process in a separate thread."""
+
+    try:
+        is_windows = platform.system() == "Windows"
+        if is_windows:
+            # We can only collect peak_wset in Windows
+            process = psutil.Process(pid)
+            while process.is_running():
+                try:
+                    mem_info = process.memory_info()
+                    peak_wset = mem_info.peak_wset
+                    if peak_wset is not None:
+                        memory_data["peak_wset"] = peak_wset
+                except psutil.NoSuchProcess:
+                    break
+                time.sleep(interval)
+    except Exception as e:
+        print(f"Error monitoring process: {e}")
+
+    return memory_data
+
+
 class LlamaCppTokenizerAdapter(PassthroughTokenizer):
     pass

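Editor's note: monitor_process_memory is consumed by the generate and benchmark paths further down in this diff: the thread is started against the llama subprocess PID, the parent blocks in process.communicate(), and the peak value is read after a short join. A standalone sketch of the same pattern (the child command here is purely illustrative; peak_wset exists only on Windows, so this sketch falls back to rss elsewhere):

    import subprocess
    import sys
    import threading
    import time

    import psutil

    def poll_peak(pid, out, interval=0.5):
        # Poll the child's memory counters until it exits.
        try:
            proc = psutil.Process(pid)
            while proc.is_running():
                info = proc.memory_info()
                out["peak_wset"] = getattr(info, "peak_wset", info.rss)
                time.sleep(interval)
        except psutil.NoSuchProcess:
            pass

    child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(2)"])
    stats = {}
    watcher = threading.Thread(target=poll_peak, args=(child.pid, stats), daemon=True)
    watcher.start()
    child.wait()
    watcher.join(timeout=2)
    print(stats.get("peak_wset"))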
@@ -637,8 +816,10 @@ class LlamaCppAdapter(ModelAdapter):
         context_size,
         threads,
         executable,
+        bench_executable,
         reasoning=False,
         lib_dir=None,
+        state=None,
     ):
         super().__init__()

@@ -650,8 +831,10 @@ class LlamaCppAdapter(ModelAdapter):
         self.context_size = context_size
         self.threads = threads
         self.executable = os.path.normpath(executable)
+        self.bench_executable = os.path.normpath(bench_executable)
         self.reasoning = reasoning
         self.lib_dir = lib_dir
+        self.state = state

     def generate(
         self,
@@ -661,6 +844,7 @@ class LlamaCppAdapter(ModelAdapter):
         top_p: float = 0.95,
         top_k: int = 40,
         return_raw: bool = False,
+        save_max_memory_used: bool = False,
         **kwargs,  # pylint: disable=unused-argument
     ):
         """
@@ -692,32 +876,54 @@ class LlamaCppAdapter(ModelAdapter):
             self.executable,
             "-m",
             self.model,
-            "--ctx-size",
+            "--ctx-size",  # size of the prompt context, 0 = loaded from model
             str(self.context_size),
-            "-n",
+            "-n",  # number of tokens to predict, -1 = infinity, -2 = until context filled
             str(n_predict),
-            "-t",
+            "-t",  # number of threads to use during generation
             str(self.threads),
             "-p",
             prompt,
+            "-b",  # logical maximum batch size
+            "1",
+            "-ub",  # physical maximum batch size
+            "1",
             "--temp",
             str(temperature),
             "--top-p",
             str(top_p),
             "--top-k",
             str(top_k),
-            "-e",
-            "
-            "--reasoning-format",
+            "-e",  # process escape sequences
+            "--no-conversation",  # disable conversation mode
+            "--reasoning-format",  # leaves thoughts unparsed in message content
             "none",
         ]

+        # If prompt exceeds 500 characters, then use a file
+        if len(prompt) < 500:
+            cmd += ["-p", prompt]
+        else:
+            # Create prompt file in cache directory
+            prompt_file = os.path.join(
+                build.output_dir(self.state.cache_dir, self.state.build_name),
+                "prompt.txt",
+            )
+            with open(prompt_file, "w", encoding="utf-8") as file:
+                file.write(prompt)
+            cmd += ["-f", prompt_file]
+
         # Configure GPU layers: 99 for GPU, 0 for CPU-only
         ngl_value = "99" if self.device == "igpu" else "0"
         cmd = cmd + ["-ngl", ngl_value]

         cmd = [str(m) for m in cmd]

+        # save llama-cli command
+        self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+            " ".join(cmd)
+        ]
+
         try:
             # Set up environment with library path for Linux
             env = os.environ.copy()
@@ -746,15 +952,35 @@ class LlamaCppAdapter(ModelAdapter):
                 env=env,
             )

-
+            # Start memory monitoring in a separate thread
+            if save_max_memory_used:
+                memory_data = {}
+                monitor_thread = threading.Thread(
+                    target=monitor_process_memory,
+                    args=(process.pid, memory_data),
+                    daemon=True,
+                )
+                monitor_thread.start()
+
+            # Communicate with the subprocess
+            stdout, stderr = process.communicate(timeout=600)
+
+            # save llama-cli command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_cli_stderr = getattr(
+                self.state, "llama_cli_stderr", []
+            ) + [
+                [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+            ]
+
             if process.returncode != 0:
                 error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                 error_msg += f"Command: {' '.join(cmd)}\n"
                 error_msg += f"Error output:\n{stderr}\n"
-                error_msg += f"Standard output:\n{
+                error_msg += f"Standard output:\n{stdout}"
                 raise Exception(error_msg)

-            if
+            if stdout is None:
                 raise Exception("No output received from llama.cpp process")

             # Parse information from llama.cpp output
@@ -785,14 +1011,19 @@ class LlamaCppAdapter(ModelAdapter):
                 else 0
             )

+            # Wait for monitor thread to finish and write peak_wset
+            if save_max_memory_used:
+                monitor_thread.join(timeout=2)
+                self.peak_wset = memory_data.get("peak_wset", None)
+
             if return_raw:
-                return [
+                return [stdout, stderr]

             # Find where the prompt ends and the generated text begins
             prompt_found = False
             output_text = ""
             prompt_first_line = prompt.split("\n")[0]
-            for line in
+            for line in stdout.splitlines():
                 if prompt_first_line in line:
                     prompt_found = True
                 if prompt_found:
@@ -803,7 +1034,7 @@ class LlamaCppAdapter(ModelAdapter):
                 raise Exception(
                     f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
                     "This usually means the model failed to process the prompt correctly.\n"
-                    f"Raw output:\n{
+                    f"Raw output:\n{stdout}\n"
                     f"Stderr:\n{stderr}"
                 )

@@ -811,10 +1042,137 @@ class LlamaCppAdapter(ModelAdapter):
             return [output_text]

         except Exception as e:
-            error_msg = f"Failed to run llama.
+            error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
             error_msg += f"Command: {' '.join(cmd)}"
             raise Exception(error_msg)

+    def benchmark(self, prompt, iterations, output_tokens):
+        """
+        Runs the llama-bench.exe tool to measure TTFT and TPS
+        """
+        cmd = [
+            self.bench_executable,
+            "-m",
+            self.model,
+            "-r",
+            iterations,
+            "-p",
+            str(prompt),
+            "-n",
+            output_tokens,
+            "-t",
+            self.threads if self.threads > 0 else 16,
+            "-b",
+            1,
+            "-ub",
+            1,
+        ]
+        ngl_value = "99" if self.device == "igpu" else "0"
+        cmd = cmd + ["-ngl", ngl_value]
+        cmd = [str(m) for m in cmd]
+
+        # save llama-bench command
+        self.state.llama_bench_cmd = " ".join(cmd)
+
+        try:
+            # Set up environment with library path for Linux
+            env = os.environ.copy()
+
+            # Load environment variables from .env file in the executable directory
+            exe_dir = os.path.dirname(self.executable)
+            env_file_path = os.path.join(exe_dir, ".env")
+            if os.path.exists(env_file_path):
+                load_dotenv(env_file_path, override=True)
+                env.update(os.environ)
+
+            if self.lib_dir and os.name != "nt":  # Not Windows
+                current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                if current_ld_path:
+                    env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                else:
+                    env["LD_LIBRARY_PATH"] = self.lib_dir
+
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+                encoding="utf-8",
+                errors="replace",
+                env=env,
+            )
+
+            # Start memory monitoring in a separate thread
+            save_max_memory_used = platform.system() == "Windows"
+            if save_max_memory_used:
+                memory_data = {}
+                monitor_thread = threading.Thread(
+                    target=monitor_process_memory,
+                    args=(process.pid, memory_data),
+                    daemon=True,
+                )
+                monitor_thread.start()
+
+            # Communicate with the subprocess
+            stdout, stderr = process.communicate(timeout=600)
+
+            # save llama-bench command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_bench_standard_output = stdout.splitlines()
+
+            if process.returncode != 0:
+                error_msg = (
+                    f"llama-bench.exe failed with return code {process.returncode}.\n"
+                )
+                error_msg += f"Command: {' '.join(cmd)}\n"
+                error_msg += f"Error output:\n{stderr}\n"
+                error_msg += f"Standard output:\n{stdout}"
+                raise Exception(error_msg)

+            if stdout is None:
+                error_msg = "No output received from llama-bench.exe process\n"
+                error_msg += f"Error output:\n{stderr}\n"
+                error_msg += f"Standard output:\n{stdout}"
+                raise Exception(error_msg)
+
+            # Parse information from llama-bench.exe output
+            prompt_length = None
+            pp_tps = None
+            pp_tps_sd = None
+            tg_tps = None
+            tg_tps_sd = None
+
+            for line in stdout.splitlines():
+                # Parse TPS information
+                if f"pp{prompt:d}" in line:
+                    parts = line.split("|")
+                    timings = parts[-2].strip().split(" ")
+                    prompt_length = prompt
+                    pp_tps = float(timings[0])
+                    pp_tps_sd = float(timings[-1])
+                if f"tg{output_tokens:d}" in line:
+                    parts = line.split("|")
+                    timings = parts[-2].strip().split(" ")
+                    tg_tps = float(timings[0])
+                    tg_tps_sd = float(timings[-1])
+
+        except Exception as e:
+            error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
+            error_msg += f"Command: {' '.join(cmd)}"
+            raise Exception(error_msg)
+
+        # Determine max memory used
+        if save_max_memory_used:
+            # Wait for monitor thread to finish
+            monitor_thread.join(timeout=2)
+
+            # Track memory usage concurrently
+            peak_wset = memory_data.get("peak_wset", None)
+        else:
+            peak_wset = None
+
+        return prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset
+

 def get_hip_devices():
     """Get list of HIP devices with their IDs and names."""
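Editor's note: the parsing loop in benchmark() assumes llama-bench's markdown-style result table, where each row ends with a "mean ± stddev" tokens-per-second cell; splitting on "|" and taking the second-to-last field therefore isolates that cell. A hedged illustration with a made-up row (column layout assumed from typical llama-bench output):

    row = "| llama 8B Q4_0 | 4.33 GiB | 8.03 B | Vulkan | 99 | pp512 | 601.52 ± 2.12 |"
    cell = row.split("|")[-2].strip()  # "601.52 ± 2.12"
    parts = cell.split(" ")
    tps, tps_sd = float(parts[0]), float(parts[-1])
    print(tps, tps_sd)  # 601.52 2.12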
@@ -841,7 +1199,9 @@ def get_hip_devices():
     try:
         libhip = ctypes.CDLL(matching_files[0])
     except OSError:
-        raise RuntimeError(
+        raise RuntimeError(
+            f"Could not load HIP runtime library from {matching_files[0]}"
+        )

     # Setup function signatures
     hipError_t = c_int