lemonade-sdk 8.1.10__py3-none-any.whl → 8.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/cache.py +6 -1
- lemonade/common/status.py +4 -4
- lemonade/tools/bench.py +22 -1
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +255 -0
- lemonade/tools/llamacpp/bench.py +111 -23
- lemonade/tools/llamacpp/load.py +20 -1
- lemonade/tools/llamacpp/utils.py +210 -17
- lemonade/tools/oga/bench.py +0 -26
- lemonade/tools/report/table.py +6 -0
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +23 -5
- lemonade/tools/server/serve.py +260 -135
- lemonade/tools/server/static/js/chat.js +165 -82
- lemonade/tools/server/static/js/models.js +87 -54
- lemonade/tools/server/static/js/shared.js +9 -6
- lemonade/tools/server/static/logs.html +57 -0
- lemonade/tools/server/static/styles.css +159 -8
- lemonade/tools/server/static/webapp.html +28 -10
- lemonade/tools/server/tray.py +94 -38
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
- lemonade/tools/server/webapp.py +4 -1
- lemonade/tools/server/wrapped_server.py +91 -25
- lemonade/version.py +1 -1
- lemonade_install/install.py +25 -2
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/METADATA +10 -6
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/RECORD +37 -32
- lemonade_server/cli.py +103 -14
- lemonade_server/model_manager.py +186 -45
- lemonade_server/pydantic_models.py +25 -1
- lemonade_server/server_models.json +175 -62
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/top_level.txt +0 -0
lemonade/tools/llamacpp/utils.py
CHANGED
@@ -7,6 +7,7 @@ import zipfile
 from typing import Optional
 import subprocess
 import requests
+import lemonade.common.build as build
 import lemonade.common.printing as printing
 from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
 
@@ -14,8 +15,9 @@ from lemonade.common.system_info import get_system_info
 
 from dotenv import set_key, load_dotenv
 
-LLAMA_VERSION_VULKAN = "
-LLAMA_VERSION_ROCM = "
+LLAMA_VERSION_VULKAN = "b6510"
+LLAMA_VERSION_ROCM = "b1066"
+LLAMA_VERSION_METAL = "b6510"
 
 
 def identify_rocm_arch_from_name(device_name: str) -> str | None:
@@ -126,8 +128,12 @@ def get_llama_version(backend: str) -> str:
         return LLAMA_VERSION_ROCM
     elif backend == "vulkan":
         return LLAMA_VERSION_VULKAN
+    elif backend == "metal":
+        return LLAMA_VERSION_METAL
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Unsupported backend: {backend}. Supported: vulkan, rocm, metal"
+        )
 
 
 def get_llama_folder_path(backend: str):
@@ -142,10 +148,12 @@ def get_llama_exe_path(exe_name: str, backend: str):
     Get path to platform-specific llama-server executable
     """
     base_dir = get_llama_folder_path(backend)
-
+    system = platform.system().lower()
+
+    if system == "windows":
         return os.path.join(base_dir, f"{exe_name}.exe")
-    else: # Linux/Ubuntu
-        # Check if executable exists in build/bin subdirectory
+    else: # Darwin/Linux/Ubuntu
+        # Check if executable exists in build/bin subdirectory
         build_bin_path = os.path.join(base_dir, "build", "bin", exe_name)
         if os.path.exists(build_bin_path):
             return build_bin_path
@@ -168,6 +176,13 @@ def get_llama_cli_exe_path(backend: str):
     return get_llama_exe_path("llama-cli", backend)
 
 
+def get_llama_bench_exe_path(backend: str):
+    """
+    Get path to platform-specific llama-bench executable
+    """
+    return get_llama_exe_path("llama-bench", backend)
+
+
 def get_version_txt_path(backend: str):
     """
     Get path to text file that contains version information
@@ -223,8 +238,24 @@ def get_binary_url_and_filename(backend: str, target_arch: str = None):
             raise NotImplementedError(
                 f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
             )
+
+    elif backend == "metal":
+        # Metal support for macOS Apple Silicon from ggml-org/llama.cpp
+        repo = "ggml-org/llama.cpp"
+        version = LLAMA_VERSION_METAL
+        if system == "darwin":
+            if platform.machine().lower() in ["arm64", "aarch64"]:
+                filename = f"llama-{version}-bin-macos-arm64.zip"
+            else:
+                raise NotImplementedError(
+                    "Metal backend only supports Apple Silicon (ARM64) processors"
+                )
+        else:
+            raise NotImplementedError(
+                f"Platform {system} not supported for Metal llamacpp. Metal is only supported on macOS"
+            )
     else:
-        supported_backends = ["vulkan", "rocm"]
+        supported_backends = ["vulkan", "rocm", "metal"]
         raise NotImplementedError(
             f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
         )
@@ -239,10 +270,10 @@ def validate_platform_support():
     """
     system = platform.system().lower()
 
-    if system not in ["windows", "linux"]:
+    if system not in ["windows", "linux", "darwin"]:
         raise NotImplementedError(
             f"Platform {system} not supported for llamacpp. "
-            "Supported: Windows, Ubuntu Linux"
+            "Supported: Windows, Ubuntu Linux, macOS"
         )
 
     if system == "linux":
@@ -341,6 +372,29 @@ def install_llamacpp(backend):
     if filename.endswith(".zip"):
         with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
             zip_ref.extractall(llama_server_exe_dir)
+
+        # On Unix-like systems (macOS/Linux), make executables executable
+        if platform.system().lower() in ["darwin", "linux"]:
+            import stat
+
+            # Find and make executable files executable
+            for root, dirs, files in os.walk(llama_server_exe_dir):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    # Make files in bin/ directories executable
+                    if "bin" in root.split(os.sep) or file in [
+                        "llama-server",
+                        "llama-simple",
+                    ]:
+                        try:
+                            current_permissions = os.stat(file_path).st_mode
+                            os.chmod(file_path, current_permissions | stat.S_IEXEC)
+                            logging.debug(f"Made {file_path} executable")
+                        except Exception as e:
+                            raise RuntimeError(
+                                f"Failed to make {file_path} executable. This will prevent "
+                                f"llama-server from starting. Error: {e}"
+                            )
     else:
         raise NotImplementedError(f"Unsupported archive format: {filename}")
 
@@ -360,6 +414,7 @@ def install_llamacpp(backend):
     exe_paths = [
         (get_llama_server_exe_path(backend), "llama-server"),
         (get_llama_cli_exe_path(backend), "llama-cli"),
+        (get_llama_bench_exe_path(backend), "llama-bench"),
     ]
 
     for exe_path, exe_name in exe_paths:
@@ -653,8 +708,10 @@ class LlamaCppAdapter(ModelAdapter):
         context_size,
         threads,
         executable,
+        bench_executable,
         reasoning=False,
         lib_dir=None,
+        state=None,
     ):
         super().__init__()
 
@@ -666,8 +723,10 @@ class LlamaCppAdapter(ModelAdapter):
         self.context_size = context_size
         self.threads = threads
         self.executable = os.path.normpath(executable)
+        self.bench_executable = os.path.normpath(bench_executable)
         self.reasoning = reasoning
         self.lib_dir = lib_dir
+        self.state = state
 
     def generate(
         self,
@@ -708,32 +767,54 @@ class LlamaCppAdapter(ModelAdapter):
             self.executable,
             "-m",
             self.model,
-            "--ctx-size",
+            "--ctx-size", # size of the prompt context, 0 = loaded from model
             str(self.context_size),
-            "-n",
+            "-n", # number of tokens to predict, -1 = infinity, =2 - until context filled
             str(n_predict),
-            "-t",
+            "-t", # number of threads to use during generation
             str(self.threads),
             "-p",
             prompt,
+            "-b", # logical maximum batch size
+            "1",
+            "-ub", # physical maximum batch size
+            "1",
             "--temp",
             str(temperature),
             "--top-p",
             str(top_p),
             "--top-k",
             str(top_k),
-            "-e",
-            "
-            "--reasoning-format",
+            "-e", # process escape sequences
+            "--no-conversation", # disable conversation mode
+            "--reasoning-format", # leaves thoughts unparsed in message content
             "none",
         ]
 
+        # If prompt exceeds 500 characters, then use a file
+        if len(prompt) < 500:
+            cmd += ["-p", prompt]
+        else:
+            # Create prompt file in cache directory
+            prompt_file = os.path.join(
+                build.output_dir(self.state.cache_dir, self.state.build_name),
+                "prompt.txt",
+            )
+            with open(prompt_file, "w", encoding="utf-8") as file:
+                file.write(prompt)
+            cmd += ["-f", prompt_file]
+
         # Configure GPU layers: 99 for GPU, 0 for CPU-only
         ngl_value = "99" if self.device == "igpu" else "0"
         cmd = cmd + ["-ngl", ngl_value]
 
         cmd = [str(m) for m in cmd]
 
+        # save llama-cli command
+        self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+            " ".join(cmd)
+        ]
+
         try:
             # Set up environment with library path for Linux
             env = os.environ.copy()
@@ -763,6 +844,15 @@ class LlamaCppAdapter(ModelAdapter):
             )
 
             raw_output, stderr = process.communicate(timeout=600)
+
+            # save llama-cli command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_cli_stderr = getattr(
+                self.state, "llama_cli_stderr", []
+            ) + [
+                [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+            ]
+
             if process.returncode != 0:
                 error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                 error_msg += f"Command: {' '.join(cmd)}\n"
@@ -827,7 +917,108 @@ class LlamaCppAdapter(ModelAdapter):
             return [output_text]
 
         except Exception as e:
-            error_msg = f"Failed to run llama.
+            error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
+            error_msg += f"Command: {' '.join(cmd)}"
+            raise Exception(error_msg)
+
+    def benchmark(self, prompts, iterations, output_tokens):
+        """
+        Runs the llama-bench.exe tool to measure TTFT and TPS
+        """
+        cmd = [
+            self.bench_executable,
+            "-m",
+            self.model,
+            "-r",
+            iterations,
+            "-p",
+            ",".join([str(p) for p in prompts]),
+            "-n",
+            output_tokens,
+            "-t",
+            self.threads if self.threads > 0 else 16,
+            "-b",
+            1,
+            "-ub",
+            1,
+        ]
+        cmd = [str(m) for m in cmd]
+
+        # save llama-bench command
+        self.state.llama_bench_cmd = " ".join(cmd)
+
+        try:
+            # Set up environment with library path for Linux
+            env = os.environ.copy()
+
+            # Load environment variables from .env file in the executable directory
+            exe_dir = os.path.dirname(self.executable)
+            env_file_path = os.path.join(exe_dir, ".env")
+            if os.path.exists(env_file_path):
+                load_dotenv(env_file_path, override=True)
+                env.update(os.environ)
+
+            if self.lib_dir and os.name != "nt": # Not Windows
+                current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                if current_ld_path:
+                    env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                else:
+                    env["LD_LIBRARY_PATH"] = self.lib_dir
+
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+                encoding="utf-8",
+                errors="replace",
+                env=env,
+            )
+
+            raw_output, stderr = process.communicate(timeout=600)
+
+            # save llama-bench command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_bench_standard_output = raw_output.splitlines()
+
+            if process.returncode != 0:
+                error_msg = (
+                    f"llama-bench.exe failed with return code {process.returncode}.\n"
+                )
+                error_msg += f"Command: {' '.join(cmd)}\n"
+                error_msg += f"Error output:\n{stderr}\n"
+                error_msg += f"Standard output:\n{raw_output}"
+                raise Exception(error_msg)
+
+            if raw_output is None:
+                raise Exception("No output received from llama-bench.exe process")
+
+            # Parse information from llama-bench.exe output
+            prompt_lengths = []
+            pp_tps = []
+            pp_tps_sd = []
+            tg_tps = None
+            tg_tps_sd = None
+
+            for line in self.state.llama_bench_standard_output:
+                # Parse TPS information
+                for p in prompts:
+                    if f"pp{p:d}" in line:
+                        parts = line.split("|")
+                        timings = parts[-2].strip().split(" ")
+                        prompt_lengths.append(p)
+                        pp_tps.append(float(timings[0]))
+                        pp_tps_sd.append(float(timings[-1]))
+                if f"tg{output_tokens:d}" in line:
+                    parts = line.split("|")
+                    timings = parts[-2].strip().split(" ")
+                    tg_tps = float(timings[0])
+                    tg_tps_sd = float(timings[-1])
+
+            return prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd
+
+        except Exception as e:
+            error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
             error_msg += f"Command: {' '.join(cmd)}"
             raise Exception(error_msg)
 
@@ -857,7 +1048,9 @@ def get_hip_devices():
     try:
         libhip = ctypes.CDLL(matching_files[0])
    except OSError:
-        raise RuntimeError(
+        raise RuntimeError(
+            f"Could not load HIP runtime library from {matching_files[0]}"
+        )
 
     # Setup function signatures
     hipError_t = c_int
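For context on the new LlamaCppAdapter.benchmark() method above: llama-bench prints its results as a markdown-style table, and the parsing pulls the mean and standard deviation out of the t/s column of each pp/tg row. A minimal standalone sketch of that parsing follows; the sample row is hypothetical, not output captured from this package.

# Sketch of the table parsing used by LlamaCppAdapter.benchmark().
# The sample row is hypothetical; real llama-bench rows have the same
# "| ... | test | t/s |" shape, with the t/s cell formatted "mean ± sd".
sample_row = "| llama 8B Q4_K - Medium | 4.58 GiB | 8.03 B | Vulkan | 16 | pp512 | 620.31 ± 4.12 |"

parts = sample_row.split("|")
timings = parts[-2].strip().split(" ")  # ["620.31", "±", "4.12"]

pp_tps = float(timings[0])      # mean prompt-processing tokens/second for pp512
pp_tps_sd = float(timings[-1])  # standard deviation across the -r repetitions

print(pp_tps, pp_tps_sd)  # 620.31 4.12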
lemonade/tools/oga/bench.py
CHANGED
@@ -2,7 +2,6 @@ import argparse
 import statistics
 from statistics import StatisticsError
 from lemonade.state import State
-from lemonade.cache import Keys
 from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
 from lemonade.tools.bench import Bench
 
@@ -20,16 +19,6 @@ class OgaBench(Bench):
 
     unique_name = "oga-bench"
 
-    def __init__(self):
-        super().__init__()
-
-        # Additional statistics generated by this bench tool
-        self.status_stats.insert(
-            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
-            Keys.STD_DEV_TOKENS_PER_SECOND,
-        )
-        self.std_dev_token_generation_tokens_per_second_list = []
-
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
@@ -121,21 +110,6 @@ class OgaBench(Bench):
             # Less than 2 measurements
             self.std_dev_token_generation_tokens_per_second_list.append(None)
 
-    def save_stats(self, state):
-        super().save_stats(state)
-
-        # Save additional statistics
-        if not all(
-            element is None
-            for element in self.std_dev_token_generation_tokens_per_second_list
-        ):
-            state.save_stat(
-                Keys.STD_DEV_TOKENS_PER_SECOND,
-                self.get_item_or_list(
-                    self.std_dev_token_generation_tokens_per_second_list
-                ),
-            )
-
 
 # This file was originally licensed under Apache 2.0. It has been modified.
 # Modifications Copyright (c) 2025 AMD
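The removed OgaBench code tracked a per-iteration tokens-per-second standard deviation; per the file list above, bench.py (+22 -1) now carries that bookkeeping in the shared Bench base class. A minimal sketch of the statistic itself, with illustrative values (the variable names are not taken from bench.py):

# Sketch of the std-dev bookkeeping that OgaBench previously did itself.
import statistics
from statistics import StatisticsError

per_iteration_tps = [41.2, 39.8, 40.5]  # tokens/second from each benchmark iteration

try:
    tps_std_dev = statistics.stdev(per_iteration_tps)
except StatisticsError:
    # Less than 2 measurements, so no standard deviation can be reported
    tps_std_dev = None

print(tps_std_dev)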
lemonade/tools/report/table.py
CHANGED
@@ -581,6 +581,12 @@ class LemonadePerfTable(Table):
             Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
             ".2f",
         ),
+        StatWithSD(
+            _wrap("Prefill Tokens per Second", 8),
+            Keys.PREFILL_TOKENS_PER_SECOND,
+            Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+            ".2f",
+        ),
         StatWithSD(
             _wrap("Tokens per Second", 8),
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
lemonade/tools/server/flm.py
ADDED
@@ -0,0 +1,133 @@
+import os
+import logging
+import subprocess
+import time
+import threading
+
+import requests
+
+from lemonade_server.pydantic_models import (
+    PullConfig,
+    ChatCompletionRequest,
+)
+
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+from lemonade.tools.flm.utils import install_flm, download_flm_model
+
+
+class FlmTelemetry(WrappedServerTelemetry):
+    """
+    Manages telemetry data collection and display for FLM server.
+    """
+
+    def parse_telemetry_line(self, line: str):
+        """
+        Parse telemetry data from FLM server output lines.
+
+        Note: as of FLM 0.9.10, no telemetry data is provided by the server CLI.
+        This function is required to be implemented, so we leave it empty
+        as a placeholder for now.
+        """
+
+        return
+
+
+class FlmServer(WrappedServer):
+    """
+    Routes OpenAI API requests to an FLM server instance and returns the result
+    back to Lemonade Server.
+    """
+
+    def __init__(self):
+        self.flm_model_name = None
+        super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
+
+    def address(self):
+        return f"http://localhost:{self.port}/v1"
+
+    def install_server(self):
+        """
+        Check if FLM is installed and at minimum version.
+        If not, download and run the GUI installer, then wait for completion.
+        """
+        install_flm()
+
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        download_flm_model(config_checkpoint, config_mmproj, do_not_upgrade)
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        self._choose_port()
+
+        # Keep track of the FLM model name so that we can use it later
+        self.flm_model_name = model_config.checkpoint
+
+        command = [
+            "flm",
+            "serve",
+            f"{self.flm_model_name}",
+            "--ctx-len",
+            str(ctx_size),
+            "--port",
+            str(self.port),
+        ]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
+        )
+
+        # Start background thread to log subprocess output
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=("FLM SERVER",),
+            daemon=True,
+        ).start()
+
+    def _wait_for_load(self):
+        """
+        FLM doesn't seem to have a health API, so we'll use the "list local models"
+        API to check if the server is up.
+        """
+        status_code = None
+        while not self.process.poll() and status_code != 200:
+            health_url = f"http://localhost:{self.port}/api/tags"
+            try:
+                health_response = requests.get(health_url)
+            except requests.exceptions.ConnectionError:
+                logging.debug(
+                    "Not able to connect to %s yet, will retry", self.server_name
+                )
+            else:
+                status_code = health_response.status_code
+                logging.debug(
+                    "Testing %s readiness (will retry until ready), result: %s",
+                    self.server_name,
+                    health_response.json(),
+                )
+            time.sleep(1)
+
+    def chat_completion(self, chat_completion_request: ChatCompletionRequest):
+        # FLM requires the correct model name to be in the request
+        # (whereas llama-server ignores the model name field in the request)
+        chat_completion_request.model = self.flm_model_name
+
+        return super().chat_completion(chat_completion_request)
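The new FlmServer polls FLM's "list local models" endpoint to detect readiness and then routes OpenAI-style requests to its /v1 address. A hedged sketch of that flow from a client's point of view; the port and model name are placeholders, and the full /v1/chat/completions path is assumed to follow the usual OpenAI layout rather than taken from the diff.

# Sketch of the readiness-then-request flow around FlmServer.
import time
import requests

port = 11434              # placeholder for the port picked by _choose_port()
model = "some-flm-model"  # placeholder; FlmServer substitutes self.flm_model_name

# Poll the "list local models" API until the server answers, as _wait_for_load() does
while True:
    try:
        if requests.get(f"http://localhost:{port}/api/tags").status_code == 200:
            break
    except requests.exceptions.ConnectionError:
        pass
    time.sleep(1)

# An OpenAI-style chat completion can then be sent to the /v1 address
response = requests.post(
    f"http://localhost:{port}/v1/chat/completions",
    json={"model": model, "messages": [{"role": "user", "content": "Hello"}]},
)
print(response.json())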
lemonade/tools/server/llamacpp.py
CHANGED
@@ -88,9 +88,8 @@ class LlamaTelemetry(WrappedServerTelemetry):
 
 class LlamaServer(WrappedServer):
     def __init__(self, backend: str):
-        self.telemetry = LlamaTelemetry()
         self.backend = backend
-        super().__init__(server_name="llama-server", telemetry=
+        super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())
 
     def install_server(self, backend=None):
         """
@@ -157,13 +156,23 @@ class LlamaServer(WrappedServer):
 
         # Find a port, and save it in the telemetry object for future reference
         # by other functions
-        self.
+        self._choose_port()
 
         # Add port and jinja to enable tool use
         base_command.extend(["--port", str(self.port), "--jinja"])
 
         # Enable context shift and avoid attention sink issues by preserving the initial tokens
-
+        # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+        # Only add context-shift for backends that support it
+        context_shift_supported_backends = ["vulkan", "rocm"]
+        if self.backend in context_shift_supported_backends:
+            base_command.extend(["--context-shift", "--keep", "16"])
+        else:
+            # For backends that don't support context-shift (e.g., Metal), just use keep
+            base_command.extend(["--keep", "16"])
+            logging.debug(
+                f"Skipped --context-shift for backend: {self.backend} (not supported)"
+            )
 
         # Use legacy reasoning formatting, since not all apps support the new
         # reasoning_content field
@@ -192,7 +201,8 @@ class LlamaServer(WrappedServer):
             env.update(os.environ)
             logging.debug(f"Loaded environment variables from {env_file_path}")
 
-
+        system = platform.system().lower()
+        if system == "linux":
             lib_dir = os.path.dirname(exe_path) # Same directory as the executable
             current_ld_path = env.get("LD_LIBRARY_PATH", "")
             if current_ld_path:
@@ -200,6 +210,14 @@ class LlamaServer(WrappedServer):
             else:
                 env["LD_LIBRARY_PATH"] = lib_dir
             logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+        elif system == "darwin":
+            lib_dir = os.path.dirname(exe_path)
+            current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+            if current_dyld_path:
+                env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+            else:
+                env["DYLD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
 
         # Start subprocess with output capture
         self.process = subprocess.Popen(