lemonade-sdk 8.1.11__py3-none-any.whl → 8.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of lemonade-sdk might be problematic.
- lemonade/cache.py +6 -1
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +0 -26
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/utils.py +70 -22
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +317 -21
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +49 -123
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +2 -6
- lemonade/tools/server/llamacpp.py +43 -2
- lemonade/tools/server/serve.py +354 -18
- lemonade/tools/server/static/js/chat.js +15 -77
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +440 -37
- lemonade/tools/server/static/js/shared.js +61 -8
- lemonade/tools/server/static/logs.html +157 -13
- lemonade/tools/server/static/styles.css +204 -0
- lemonade/tools/server/static/webapp.html +39 -1
- lemonade/version.py +1 -1
- lemonade_install/install.py +33 -579
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +6 -4
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/RECORD +38 -37
- lemonade_server/cli.py +10 -0
- lemonade_server/model_manager.py +172 -11
- lemonade_server/pydantic_models.py +3 -0
- lemonade_server/server_models.json +102 -66
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/tools/llamacpp/bench.py
CHANGED

@@ -2,9 +2,15 @@ import argparse
 import statistics
 from statistics import StatisticsError
 from lemonade.state import State
-from lemonade.
+from lemonade.tools.tool import Tool
 from lemonade.tools.llamacpp.utils import LlamaCppAdapter
-from lemonade.tools.bench import
+from lemonade.tools.bench import (
+    Bench,
+    default_prompt_length,
+    default_iterations,
+    default_output_tokens,
+    default_warmup_runs,
+)


 class LlamaCppBench(Bench):
@@ -14,16 +20,6 @@ class LlamaCppBench(Bench):

     unique_name = "llamacpp-bench"

-    def __init__(self):
-        super().__init__()
-
-        # Additional statistics generated by this bench tool
-        self.status_stats.insert(
-            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
-            Keys.STD_DEV_TOKENS_PER_SECOND,
-        )
-        self.std_dev_token_generation_tokens_per_second_list = []
-
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
@@ -33,8 +29,46 @@ class LlamaCppBench(Bench):

         parser = Bench.parser(parser)

+        parser.add_argument(
+            "--cli",
+            action="store_true",
+            help="Set this flag to use llama-cli.exe to benchmark model performance. "
+            "This executable will be called once per iteration. Otherwise, "
+            "llama-bench.exe is used by default. In this default behavior behavior, "
+            "the only valid prompt format is integer token lengths. Also, the "
+            "warmup-iterations parameter is ignored and the default value for number of "
+            "threads is 16.",
+        )
+
         return parser

+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+        """
+        Helper function to parse CLI arguments into the args expected by run()
+        """
+
+        # Call Tool parse method, NOT the Bench parse method
+        parsed_args = Tool.parse(self, state, args, known_only)
+
+        if parsed_args.cli:
+            parsed_args = super().parse(state, args, known_only)
+        else:
+            # Make sure prompts is a list of integers
+            if parsed_args.prompts is None:
+                parsed_args.prompts = [default_prompt_length]
+            prompt_ints = []
+            for prompt_item in parsed_args.prompts:
+                if prompt_item.isdigit():
+                    prompt_ints.append(int(prompt_item))
+                else:
+                    raise Exception(
+                        f"When not using the --cli flag to {self.unique_name}, the prompt format "
+                        "must be in integer format."
+                    )
+            parsed_args.prompts = prompt_ints
+
+        return parsed_args
+
     def run_prompt(
         self,
         state: State,
@@ -43,7 +77,7 @@ class LlamaCppBench(Bench):
         iterations: int,
         warmup_iterations: int,
         output_tokens: int,
-    )
+    ):
         """
         Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
         """
@@ -61,6 +95,7 @@ class LlamaCppBench(Bench):

         per_iteration_tokens_per_second = []
         per_iteration_time_to_first_token = []
+        per_iteration_peak_wset = []

         for iteration in range(iterations + warmup_iterations):
             try:
@@ -69,7 +104,10 @@ class LlamaCppBench(Bench):
                 model.time_to_first_token = None
                 model.tokens_per_second = None
                 raw_output, stderr = model.generate(
-                    prompt,
+                    prompt,
+                    max_new_tokens=output_tokens,
+                    return_raw=True,
+                    save_max_memory_used=self.save_max_memory_used,
                 )

                 if model.time_to_first_token is None or model.tokens_per_second is None:
@@ -85,6 +123,7 @@ class LlamaCppBench(Bench):
                 if iteration > warmup_iterations - 1:
                     per_iteration_tokens_per_second.append(model.tokens_per_second)
                     per_iteration_time_to_first_token.append(model.time_to_first_token)
+                    per_iteration_peak_wset.append(model.peak_wset)

             report_progress_fn((iteration + 1) / (warmup_iterations + iterations))

@@ -115,21 +154,101 @@ class LlamaCppBench(Bench):
         except StatisticsError:
             # Less than 2 measurements
             self.std_dev_token_generation_tokens_per_second_list.append(None)
+        if self.save_max_memory_used:
+            filtered_list = [
+                item for item in per_iteration_peak_wset if item is not None
+            ]
+            mean_gb_used = (
+                None
+                if len(filtered_list) == 0
+                else statistics.mean(filtered_list) / 1024**3
+            )
+            self.max_memory_used_gb_list.append(mean_gb_used)
+
+    def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+        if prompts is None:
+            prompts = [default_prompt_length]
+        elif isinstance(prompts, int):
+            prompts = [prompts]
+
+        state.save_stat("prompts", prompts)
+        state.save_stat("iterations", iterations)
+        state.save_stat("output_tokens", output_tokens)

[13 removed lines (original lines 119-131) are not captured in this view]
+        counter = 0
+        report_progress_fn = lambda x: self.set_percent_progress(
+            100 * (counter + x) / len(prompts)
+        )
+        self.first_run_prompt = True
+        for counter, prompt in enumerate(prompts):
+            report_progress_fn(0)
+
+            self.run_prompt_llama_bench_exe(
+                state,
+                prompt,
+                iterations,
+                output_tokens,
             )
+            self.first_run_prompt = False
+
+        self.set_percent_progress(None)
+        self.save_stats(state)
+        return state
+
+    def run_prompt_llama_bench_exe(self, state, prompt, iterations, output_tokens):
+
+        model: LlamaCppAdapter = state.model
+        prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset = (
+            model.benchmark(prompt, iterations, output_tokens)
+        )
+        self.input_ids_len_list.append(prompt_length)
+        self.prefill_tokens_per_second_list.append(pp_tps)
+        self.std_dev_prefill_tokens_per_second_list.append(pp_tps_sd)
+        self.mean_time_to_first_token_list.append(prompt_length / pp_tps)
+        self.token_generation_tokens_per_second_list.append(tg_tps)
+        self.std_dev_token_generation_tokens_per_second_list.append(tg_tps_sd)
+        self.tokens_out_len_list.append(output_tokens * iterations)
+        if self.save_max_memory_used:
+            if peak_wset is not None:
+                self.max_memory_used_gb_list.append(peak_wset / 1024**3)
+            else:
+                self.max_memory_used_gb_list.append(None)
+
+    def run(
+        self,
+        state: State,
+        prompts: list[str] = None,
+        iterations: int = default_iterations,
+        warmup_iterations: int = default_warmup_runs,
+        output_tokens: int = default_output_tokens,
+        cli: bool = False,
+        **kwargs,
+    ) -> State:
+        """
+        Args:
+          - prompts: List of input prompts used as starting points for LLM text generation
+          - iterations: Number of benchmarking samples to take; results are
+              reported as the median and mean of the samples.
+          - warmup_iterations: Subset of the iterations to treat as warmup,
+              and not included in the results.
+          - output_tokens: Number of new tokens LLM to create.
+          - cli: Use multiple calls to llama-cpp.exe instead of llama-bench.exe
+          - kwargs: Additional parameters used by bench tools
+        """
+
+        # Check that state has the attribute model and it is a LlamaCPP model
+        if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+            raise Exception("Load model using llamacpp-load first.")
+
+        if cli:
+            state = super().run(
+                state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+            )
+        else:
+            state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+        return state


 # This file was originally licensed under Apache 2.0. It has been modified.
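The memory tracking added above reduces the per-iteration peak working-set samples (reported by the model adapter in bytes) to a single mean value in GiB, skipping iterations where no sample was captured. A minimal standalone sketch of that reduction, using made-up sample values rather than real measurements:

import statistics

# Hypothetical per-iteration peak working-set samples in bytes; None marks an
# iteration where no measurement was captured (values are illustrative only).
per_iteration_peak_wset = [2_684_354_560, None, 2_726_297_600, 2_705_326_080]

# Keep only the iterations that produced a measurement.
filtered = [item for item in per_iteration_peak_wset if item is not None]

# Mean peak working set in GiB, or None if nothing was measured.
mean_gb_used = None if not filtered else statistics.mean(filtered) / 1024**3

print(mean_gb_used)  # about 2.52 for these sample values
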
lemonade/tools/llamacpp/load.py
CHANGED

@@ -93,9 +93,11 @@ class LoadLlamaCpp(FirstTool):
         from lemonade.tools.llamacpp.utils import (
             install_llamacpp,
             get_llama_cli_exe_path,
+            get_llama_bench_exe_path,
             get_llama_installed_version,
             parse_checkpoint,
             download_gguf,
+            resolve_local_gguf_model,
             get_local_checkpoint_path,
             LlamaCppTokenizerAdapter,
             LlamaCppAdapter,
@@ -103,6 +105,8 @@ class LoadLlamaCpp(FirstTool):

         install_llamacpp(backend)

+        extension = ""
+
         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
             # input is a local folder
@@ -121,6 +125,17 @@ class LoadLlamaCpp(FirstTool):
             )
             model_to_use = gguf_files[0]
             full_model_path = os.path.join(local_model_folder, model_to_use)
+            extension = ".gguf"
+
+        elif input.endswith(".gguf") and os.path.isfile(input):
+            # input is a local .gguf file
+            full_model_path = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+            model_to_use = os.path.basename(full_model_path)
+            extension = ".gguf"

         else:
             # Input is a model checkpoint
@@ -155,12 +170,21 @@ class LoadLlamaCpp(FirstTool):
             )

         else:
+            # First, try to resolve from local cache to avoid unnecessary downloads
+            base_checkpoint, variant = parse_checkpoint(checkpoint)
+            snapshot_files = resolve_local_gguf_model(
+                base_checkpoint, variant, None
+            )
+
+            # If not found locally, download from internet
+            if not snapshot_files:
+                snapshot_files = download_gguf(checkpoint)

-            snapshot_files = download_gguf(checkpoint)
             full_model_path = snapshot_files["variant"]
             model_to_use = os.path.basename(full_model_path)

         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+        llama_bench_exe_path = get_llama_bench_exe_path(backend)
         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")

         # Get the directory containing the executable for shared libraries
@@ -174,8 +198,10 @@ class LoadLlamaCpp(FirstTool):
             context_size=context_size,
             threads=threads,
             executable=llama_cli_exe_path,
+            bench_executable=llama_bench_exe_path,
             reasoning=reasoning,
             lib_dir=lib_dir,
+            state=state,
         )
         state.tokenizer = LlamaCppTokenizerAdapter()
         state.device = device
@@ -186,7 +212,9 @@ class LoadLlamaCpp(FirstTool):
             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
         )

-        status.add_to_state(
+        status.add_to_state(
+            state=state, name=input, model=model_to_use, extension=extension
+        )
         return state

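The loader's new checkpoint path resolves GGUF files from the local cache before falling back to a download. A minimal sketch of that flow, reusing the helper names and call shapes exactly as they appear in the diff (parse_checkpoint, resolve_local_gguf_model, download_gguf); the resolve_gguf wrapper itself is illustrative and not part of the package:

from lemonade.tools.llamacpp.utils import (
    parse_checkpoint,
    resolve_local_gguf_model,
    download_gguf,
)


def resolve_gguf(checkpoint: str) -> str:
    # Split the checkpoint into its base repo and variant, as the loader does.
    base_checkpoint, variant = parse_checkpoint(checkpoint)

    # First, look for an already-downloaded snapshot in the local cache
    # (third argument passed as None, mirroring the loader).
    snapshot_files = resolve_local_gguf_model(base_checkpoint, variant, None)

    # Only download from the internet if nothing was found locally.
    if not snapshot_files:
        snapshot_files = download_gguf(checkpoint)

    # The loader reads the selected GGUF file path from the "variant" key.
    return snapshot_files["variant"]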