lemonade-sdk 8.1.11__py3-none-any.whl → 8.2.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/cache.py +6 -1
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +0 -26
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/utils.py +70 -22
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +317 -21
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +49 -123
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +2 -6
- lemonade/tools/server/llamacpp.py +43 -2
- lemonade/tools/server/serve.py +354 -18
- lemonade/tools/server/static/js/chat.js +15 -77
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +440 -37
- lemonade/tools/server/static/js/shared.js +61 -8
- lemonade/tools/server/static/logs.html +157 -13
- lemonade/tools/server/static/styles.css +204 -0
- lemonade/tools/server/static/webapp.html +39 -1
- lemonade/version.py +1 -1
- lemonade_install/install.py +33 -579
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +6 -4
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/RECORD +38 -37
- lemonade_server/cli.py +10 -0
- lemonade_server/model_manager.py +172 -11
- lemonade_server/pydantic_models.py +3 -0
- lemonade_server/server_models.json +102 -66
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/tools/llamacpp/utils.py
CHANGED
@@ -3,15 +3,17 @@ import os
 import platform
 import shutil
 import sys
+import threading
+import time
 import zipfile
 from typing import Optional
+import psutil
 import subprocess
 import requests
+import lemonade.common.build as build
 import lemonade.common.printing as printing
 from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
-
 from lemonade.common.system_info import get_system_info
-
 from dotenv import set_key, load_dotenv
 
 LLAMA_VERSION_VULKAN = "b6510"
@@ -175,6 +177,13 @@ def get_llama_cli_exe_path(backend: str):
     return get_llama_exe_path("llama-cli", backend)
 
 
+def get_llama_bench_exe_path(backend: str):
+    """
+    Get path to platform-specific llama-bench executable
+    """
+    return get_llama_exe_path("llama-bench", backend)
+
+
 def get_version_txt_path(backend: str):
     """
     Get path to text file that contains version information
@@ -370,7 +379,7 @@ def install_llamacpp(backend):
     import stat
 
     # Find and make executable files executable
-    for root,
+    for root, _, files in os.walk(llama_server_exe_dir):
         for file in files:
             file_path = os.path.join(root, file)
             # Make files in bin/ directories executable
@@ -406,6 +415,7 @@ def install_llamacpp(backend):
     exe_paths = [
         (get_llama_server_exe_path(backend), "llama-server"),
         (get_llama_cli_exe_path(backend), "llama-cli"),
+        (get_llama_bench_exe_path(backend), "llama-bench"),
     ]
 
     for exe_path, exe_name in exe_paths:
@@ -647,15 +657,91 @@ def identify_gguf_models(
     return core_files, sharded_files
 
 
-def
+def resolve_local_gguf_model(
+    checkpoint: str, variant: str, config_mmproj: str = None
+) -> dict | None:
     """
-
+    Attempts to resolve a GGUF model from the local HuggingFace cache.
+    """
+    from huggingface_hub.constants import HF_HUB_CACHE
+
+    # Convert checkpoint to cache directory format
+    if checkpoint.startswith("models--"):
+        model_cache_dir = os.path.join(HF_HUB_CACHE, checkpoint)
+    else:
+        # This is a HuggingFace repo - convert to cache directory format
+        repo_cache_name = checkpoint.replace("/", "--")
+        model_cache_dir = os.path.join(HF_HUB_CACHE, f"models--{repo_cache_name}")
+
+    # Check if the cache directory exists
+    if not os.path.exists(model_cache_dir):
+        return None
+
+    gguf_file_found = None
+
+    # If variant is specified, look for that specific file
+    if variant:
+        search_term = variant if variant.endswith(".gguf") else f"{variant}.gguf"
+
+        for root, _, files in os.walk(model_cache_dir):
+            if search_term in files:
+                gguf_file_found = os.path.join(root, search_term)
+                break
 
-
-
+    # If no variant or variant not found, find any .gguf file (excluding mmproj)
+    if not gguf_file_found:
+        for root, _, files in os.walk(model_cache_dir):
+            gguf_files = [
+                f for f in files if f.endswith(".gguf") and "mmproj" not in f.lower()
+            ]
+            if gguf_files:
+                gguf_file_found = os.path.join(root, gguf_files[0])
+                break
+
+    # If no GGUF file found, model is not in cache
+    if not gguf_file_found:
+        return None
+
+    # Build result dictionary
+    result = {"variant": gguf_file_found}
+
+    # Search for mmproj file if provided
+    if config_mmproj:
+        for root, _, files in os.walk(model_cache_dir):
+            if config_mmproj in files:
+                result["mmproj"] = os.path.join(root, config_mmproj)
+                break
+
+    logging.info(f"Resolved local GGUF model: {result}")
+    return result
+
+
+def download_gguf(
+    config_checkpoint: str, config_mmproj=None, do_not_upgrade: bool = False
+) -> dict:
     """
+    Downloads the GGUF file for the given model configuration from HuggingFace.
+
+    This function downloads models from the internet. It does NOT check the local cache first.
+    Callers should use resolve_local_gguf_model() if they want to check for existing models first.
 
-
+    Args:
+        config_checkpoint: Checkpoint identifier (file path or HF repo with variant)
+        config_mmproj: Optional mmproj file to also download
+        do_not_upgrade: If True, use local cache only without attempting to download updates
+
+    Returns:
+        Dictionary with "variant" (and optionally "mmproj") file paths
+    """
+    # Handle direct file path case - if the checkpoint is an actual file on disk
+    if os.path.exists(config_checkpoint):
+        result = {"variant": config_checkpoint}
+        if config_mmproj:
+            result["mmproj"] = config_mmproj
+        return result
+
+    # Parse checkpoint to extract base and variant
+    # Checkpoint format: repo_name:variant (e.g., "unsloth/Qwen3-0.6B-GGUF:Q4_0")
     checkpoint, variant = parse_checkpoint(config_checkpoint)
 
     # Identify the GGUF model files in the repository that match the variant
@@ -686,6 +772,37 @@ def download_gguf(config_checkpoint, config_mmproj=None, do_not_upgrade=False) -
     }
 
 
+# Function to read a stream (stdout or stderr) into a list
+def stream_reader(stream, output_list):
+    for line in iter(stream.readline, b""):
+        decoded_line = line.decode().rstrip()
+        output_list.append(decoded_line)
+    stream.close()
+
+
+def monitor_process_memory(pid, memory_data, interval=0.5):
+    """Monitor memory usage of a process in a separate thread."""
+
+    try:
+        is_windows = platform.system() == "Windows"
+        if is_windows:
+            # We can only collect peak_wset in Windows
+            process = psutil.Process(pid)
+            while process.is_running():
+                try:
+                    mem_info = process.memory_info()
+                    peak_wset = mem_info.peak_wset
+                    if peak_wset is not None:
+                        memory_data["peak_wset"] = peak_wset
+                except psutil.NoSuchProcess:
+                    break
+                time.sleep(interval)
+    except Exception as e:
+        print(f"Error monitoring process: {e}")
+
+    return memory_data
+
+
 class LlamaCppTokenizerAdapter(PassthroughTokenizer):
     pass
 
@@ -699,8 +816,10 @@ class LlamaCppAdapter(ModelAdapter):
         context_size,
         threads,
         executable,
+        bench_executable,
         reasoning=False,
         lib_dir=None,
+        state=None,
     ):
         super().__init__()
 
@@ -712,8 +831,10 @@ class LlamaCppAdapter(ModelAdapter):
         self.context_size = context_size
         self.threads = threads
         self.executable = os.path.normpath(executable)
+        self.bench_executable = os.path.normpath(bench_executable)
         self.reasoning = reasoning
         self.lib_dir = lib_dir
+        self.state = state
 
     def generate(
         self,
@@ -723,6 +844,7 @@ class LlamaCppAdapter(ModelAdapter):
         top_p: float = 0.95,
         top_k: int = 40,
         return_raw: bool = False,
+        save_max_memory_used: bool = False,
         **kwargs,  # pylint: disable=unused-argument
     ):
         """
@@ -754,32 +876,54 @@ class LlamaCppAdapter(ModelAdapter):
             self.executable,
             "-m",
             self.model,
-            "--ctx-size",
+            "--ctx-size",  # size of the prompt context, 0 = loaded from model
             str(self.context_size),
-            "-n",
+            "-n",  # number of tokens to predict, -1 = infinity, -2 = until context filled
            str(n_predict),
-            "-t",
+            "-t",  # number of threads to use during generation
             str(self.threads),
             "-p",
             prompt,
+            "-b",  # logical maximum batch size
+            "1",
+            "-ub",  # physical maximum batch size
+            "1",
             "--temp",
             str(temperature),
             "--top-p",
             str(top_p),
             "--top-k",
             str(top_k),
-            "-e",
-            "
-            "--reasoning-format",
+            "-e",  # process escape sequences
+            "--no-conversation",  # disable conversation mode
+            "--reasoning-format",  # leaves thoughts unparsed in message content
             "none",
         ]
 
+        # If prompt exceeds 500 characters, then use a file
+        if len(prompt) < 500:
+            cmd += ["-p", prompt]
+        else:
+            # Create prompt file in cache directory
+            prompt_file = os.path.join(
+                build.output_dir(self.state.cache_dir, self.state.build_name),
+                "prompt.txt",
+            )
+            with open(prompt_file, "w", encoding="utf-8") as file:
+                file.write(prompt)
+            cmd += ["-f", prompt_file]
+
         # Configure GPU layers: 99 for GPU, 0 for CPU-only
         ngl_value = "99" if self.device == "igpu" else "0"
         cmd = cmd + ["-ngl", ngl_value]
 
         cmd = [str(m) for m in cmd]
 
+        # save llama-cli command
+        self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+            " ".join(cmd)
+        ]
+
         try:
             # Set up environment with library path for Linux
             env = os.environ.copy()
@@ -808,15 +952,35 @@ class LlamaCppAdapter(ModelAdapter):
                 env=env,
             )
 
-
+            # Start memory monitoring in a separate thread
+            if save_max_memory_used:
+                memory_data = {}
+                monitor_thread = threading.Thread(
+                    target=monitor_process_memory,
+                    args=(process.pid, memory_data),
+                    daemon=True,
+                )
+                monitor_thread.start()
+
+            # Communicate with the subprocess
+            stdout, stderr = process.communicate(timeout=600)
+
+            # save llama-cli command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_cli_stderr = getattr(
+                self.state, "llama_cli_stderr", []
+            ) + [
+                [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+            ]
+
             if process.returncode != 0:
                 error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                 error_msg += f"Command: {' '.join(cmd)}\n"
                 error_msg += f"Error output:\n{stderr}\n"
-                error_msg += f"Standard output:\n{
+                error_msg += f"Standard output:\n{stdout}"
                 raise Exception(error_msg)
 
-            if
+            if stdout is None:
                 raise Exception("No output received from llama.cpp process")
 
             # Parse information from llama.cpp output
@@ -847,14 +1011,19 @@ class LlamaCppAdapter(ModelAdapter):
                 else 0
             )
 
+            # Wait for monitor thread to finish and write peak_wset
+            if save_max_memory_used:
+                monitor_thread.join(timeout=2)
+                self.peak_wset = memory_data.get("peak_wset", None)
+
             if return_raw:
-                return [
+                return [stdout, stderr]
 
             # Find where the prompt ends and the generated text begins
             prompt_found = False
             output_text = ""
             prompt_first_line = prompt.split("\n")[0]
-            for line in
+            for line in stdout.splitlines():
                 if prompt_first_line in line:
                     prompt_found = True
                 if prompt_found:
@@ -865,7 +1034,7 @@ class LlamaCppAdapter(ModelAdapter):
                 raise Exception(
                     f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
                     "This usually means the model failed to process the prompt correctly.\n"
-                    f"Raw output:\n{
+                    f"Raw output:\n{stdout}\n"
                     f"Stderr:\n{stderr}"
                 )
 
@@ -873,10 +1042,137 @@ class LlamaCppAdapter(ModelAdapter):
             return [output_text]
 
         except Exception as e:
-            error_msg = f"Failed to run llama.
+            error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
             error_msg += f"Command: {' '.join(cmd)}"
             raise Exception(error_msg)
 
+    def benchmark(self, prompt, iterations, output_tokens):
+        """
+        Runs the llama-bench.exe tool to measure TTFT and TPS
+        """
+        cmd = [
+            self.bench_executable,
+            "-m",
+            self.model,
+            "-r",
+            iterations,
+            "-p",
+            str(prompt),
+            "-n",
+            output_tokens,
+            "-t",
+            self.threads if self.threads > 0 else 16,
+            "-b",
+            1,
+            "-ub",
+            1,
+        ]
+        ngl_value = "99" if self.device == "igpu" else "0"
+        cmd = cmd + ["-ngl", ngl_value]
+        cmd = [str(m) for m in cmd]
+
+        # save llama-bench command
+        self.state.llama_bench_cmd = " ".join(cmd)
+
+        try:
+            # Set up environment with library path for Linux
+            env = os.environ.copy()
+
+            # Load environment variables from .env file in the executable directory
+            exe_dir = os.path.dirname(self.executable)
+            env_file_path = os.path.join(exe_dir, ".env")
+            if os.path.exists(env_file_path):
+                load_dotenv(env_file_path, override=True)
+                env.update(os.environ)
+
+            if self.lib_dir and os.name != "nt":  # Not Windows
+                current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                if current_ld_path:
+                    env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                else:
+                    env["LD_LIBRARY_PATH"] = self.lib_dir
+
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+                encoding="utf-8",
+                errors="replace",
+                env=env,
+            )
+
+            # Start memory monitoring in a separate thread
+            save_max_memory_used = platform.system() == "Windows"
+            if save_max_memory_used:
+                memory_data = {}
+                monitor_thread = threading.Thread(
+                    target=monitor_process_memory,
+                    args=(process.pid, memory_data),
+                    daemon=True,
+                )
+                monitor_thread.start()
+
+            # Communicate with the subprocess
+            stdout, stderr = process.communicate(timeout=600)
+
+            # save llama-bench command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_bench_standard_output = stdout.splitlines()
+
+            if process.returncode != 0:
+                error_msg = (
+                    f"llama-bench.exe failed with return code {process.returncode}.\n"
+                )
+                error_msg += f"Command: {' '.join(cmd)}\n"
+                error_msg += f"Error output:\n{stderr}\n"
+                error_msg += f"Standard output:\n{stdout}"
+                raise Exception(error_msg)
+
+            if stdout is None:
+                error_msg = "No output received from llama-bench.exe process\n"
+                error_msg += f"Error output:\n{stderr}\n"
+                error_msg += f"Standard output:\n{stdout}"
+                raise Exception(error_msg)
+
+            # Parse information from llama-bench.exe output
+            prompt_length = None
+            pp_tps = None
+            pp_tps_sd = None
+            tg_tps = None
+            tg_tps_sd = None
+
+            for line in stdout.splitlines():
+                # Parse TPS information
+                if f"pp{prompt:d}" in line:
+                    parts = line.split("|")
+                    timings = parts[-2].strip().split(" ")
+                    prompt_length = prompt
+                    pp_tps = float(timings[0])
+                    pp_tps_sd = float(timings[-1])
+                if f"tg{output_tokens:d}" in line:
+                    parts = line.split("|")
+                    timings = parts[-2].strip().split(" ")
+                    tg_tps = float(timings[0])
+                    tg_tps_sd = float(timings[-1])
+
+        except Exception as e:
+            error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
+            error_msg += f"Command: {' '.join(cmd)}"
+            raise Exception(error_msg)
+
+        # Determine max memory used
+        if save_max_memory_used:
+            # Wait for monitor thread to finish
+            monitor_thread.join(timeout=2)
+
+            # Track memory usage concurrently
+            peak_wset = memory_data.get("peak_wset", None)
+        else:
+            peak_wset = None
+
+        return prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset
+
 
 def get_hip_devices():
     """Get list of HIP devices with their IDs and names."""
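The added docstrings define a cache-first contract: resolve_local_gguf_model() returns a {"variant": ...} dictionary from the local HuggingFace cache (or None on a miss), while download_gguf() always goes to the network. A minimal usage sketch of that flow, assuming parse_checkpoint and both new helpers are importable from lemonade.tools.llamacpp.utils as shown in this file:

# Sketch only; the import path and the "unsloth/Qwen3-0.6B-GGUF:Q4_0" checkpoint string
# come from the diff above, everything else is illustrative.
from lemonade.tools.llamacpp.utils import (
    parse_checkpoint,
    resolve_local_gguf_model,
    download_gguf,
)

config_checkpoint = "unsloth/Qwen3-0.6B-GGUF:Q4_0"  # repo_name:variant format

checkpoint, variant = parse_checkpoint(config_checkpoint)
files = resolve_local_gguf_model(checkpoint, variant)  # {"variant": path} or None
if files is None:
    # Not in the local HuggingFace cache, so fetch it from the hub
    files = download_gguf(config_checkpoint)
print(files["variant"])  # path to the resolved .gguf file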
lemonade/tools/oga/bench.py
CHANGED
@@ -1,8 +1,8 @@
 import argparse
 import statistics
 from statistics import StatisticsError
+import psutil
 from lemonade.state import State
-from lemonade.cache import Keys
 from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
 from lemonade.tools.bench import Bench
 
@@ -20,16 +20,6 @@ class OgaBench(Bench):
 
     unique_name = "oga-bench"
 
-    def __init__(self):
-        super().__init__()
-
-        # Additional statistics generated by this bench tool
-        self.status_stats.insert(
-            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
-            Keys.STD_DEV_TOKENS_PER_SECOND,
-        )
-        self.std_dev_token_generation_tokens_per_second_list = []
-
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
@@ -62,7 +52,7 @@ class OgaBench(Bench):
         iterations: int,
         warmup_iterations: int,
         output_tokens: int,
-    )
+    ):
 
         model: ModelAdapter = state.model
         tokenizer: TokenizerAdapter = state.tokenizer
@@ -120,20 +110,9 @@ class OgaBench(Bench):
         except StatisticsError:
             # Less than 2 measurements
             self.std_dev_token_generation_tokens_per_second_list.append(None)
-
-
-
-
-        # Save additional statistics
-        if not all(
-            element is None
-            for element in self.std_dev_token_generation_tokens_per_second_list
-        ):
-            state.save_stat(
-                Keys.STD_DEV_TOKENS_PER_SECOND,
-                self.get_item_or_list(
-                    self.std_dev_token_generation_tokens_per_second_list
-                ),
+        if self.save_max_memory_used:
+            self.max_memory_used_gb_list.append(
+                psutil.Process().memory_info().peak_wset / 1024**3
             )
 
 
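The llama.cpp benchmark gates its memory monitoring on Windows because psutil only exposes the peak working set (peak_wset) field of Process.memory_info() there, and the OGA bench reads the same counter for its own process. A standalone sketch of that measurement (the helper name is illustrative, not from the package):

import platform

import psutil


def peak_memory_gb():
    """Peak working set of the current process in GiB, or None outside Windows."""
    if platform.system() != "Windows":
        return None  # psutil reports peak_wset only on Windows
    return psutil.Process().memory_info().peak_wset / 1024**3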