lemonade-sdk 8.1.11__py3-none-any.whl → 8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)
  1. lemonade/cache.py +6 -1
  2. lemonade/common/status.py +4 -4
  3. lemonade/common/system_info.py +0 -26
  4. lemonade/tools/accuracy.py +143 -48
  5. lemonade/tools/adapter.py +6 -1
  6. lemonade/tools/bench.py +26 -8
  7. lemonade/tools/flm/utils.py +70 -22
  8. lemonade/tools/huggingface/bench.py +6 -1
  9. lemonade/tools/llamacpp/bench.py +146 -27
  10. lemonade/tools/llamacpp/load.py +30 -2
  11. lemonade/tools/llamacpp/utils.py +317 -21
  12. lemonade/tools/oga/bench.py +5 -26
  13. lemonade/tools/oga/load.py +49 -123
  14. lemonade/tools/oga/migration.py +403 -0
  15. lemonade/tools/report/table.py +76 -8
  16. lemonade/tools/server/flm.py +2 -6
  17. lemonade/tools/server/llamacpp.py +43 -2
  18. lemonade/tools/server/serve.py +354 -18
  19. lemonade/tools/server/static/js/chat.js +15 -77
  20. lemonade/tools/server/static/js/model-settings.js +24 -3
  21. lemonade/tools/server/static/js/models.js +440 -37
  22. lemonade/tools/server/static/js/shared.js +61 -8
  23. lemonade/tools/server/static/logs.html +157 -13
  24. lemonade/tools/server/static/styles.css +204 -0
  25. lemonade/tools/server/static/webapp.html +39 -1
  26. lemonade/version.py +1 -1
  27. lemonade_install/install.py +33 -579
  28. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +6 -4
  29. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/RECORD +38 -37
  30. lemonade_server/cli.py +10 -0
  31. lemonade_server/model_manager.py +172 -11
  32. lemonade_server/pydantic_models.py +3 -0
  33. lemonade_server/server_models.json +102 -66
  34. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  35. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  36. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  37. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  38. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/tools/llamacpp/bench.py

@@ -2,9 +2,15 @@ import argparse
 import statistics
 from statistics import StatisticsError
 from lemonade.state import State
-from lemonade.cache import Keys
+from lemonade.tools.tool import Tool
 from lemonade.tools.llamacpp.utils import LlamaCppAdapter
-from lemonade.tools.bench import Bench
+from lemonade.tools.bench import (
+    Bench,
+    default_prompt_length,
+    default_iterations,
+    default_output_tokens,
+    default_warmup_runs,
+)
 
 
 class LlamaCppBench(Bench):
@@ -14,16 +20,6 @@ class LlamaCppBench(Bench):
 
     unique_name = "llamacpp-bench"
 
-    def __init__(self):
-        super().__init__()
-
-        # Additional statistics generated by this bench tool
-        self.status_stats.insert(
-            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
-            Keys.STD_DEV_TOKENS_PER_SECOND,
-        )
-        self.std_dev_token_generation_tokens_per_second_list = []
-
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
@@ -33,8 +29,46 @@ class LlamaCppBench(Bench):
 
         parser = Bench.parser(parser)
 
+        parser.add_argument(
+            "--cli",
+            action="store_true",
+            help="Set this flag to use llama-cli.exe to benchmark model performance. "
+            "This executable will be called once per iteration. Otherwise, "
+            "llama-bench.exe is used by default. In this default behavior, "
+            "the only valid prompt format is integer token lengths. Also, the "
+            "warmup-iterations parameter is ignored and the default value for the "
+            "number of threads is 16.",
+        )
+
         return parser
 
+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+        """
+        Helper function to parse CLI arguments into the args expected by run()
+        """
+
+        # Call the Tool parse method, NOT the Bench parse method
+        parsed_args = Tool.parse(self, state, args, known_only)
+
+        if parsed_args.cli:
+            parsed_args = super().parse(state, args, known_only)
+        else:
+            # Make sure prompts is a list of integers
+            if parsed_args.prompts is None:
+                parsed_args.prompts = [default_prompt_length]
+            prompt_ints = []
+            for prompt_item in parsed_args.prompts:
+                if prompt_item.isdigit():
+                    prompt_ints.append(int(prompt_item))
+                else:
+                    raise Exception(
+                        f"When not using the --cli flag to {self.unique_name}, "
+                        "prompts must be given as integer token lengths."
+                    )
+            parsed_args.prompts = prompt_ints
+
+        return parsed_args
+
     def run_prompt(
         self,
         state: State,
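
When --cli is not set, llama-bench.exe only accepts prompts given as integer token lengths, which is why the new parse() converts each entry with isdigit()/int(). Below is a minimal standalone sketch of that validation rule, for illustration only; the helper name and the default of 256 tokens are assumptions, not values taken from the package:

def validate_prompt_lengths(prompts, default_prompt_length=256):
    # No prompts given: fall back to a single default token length
    if prompts is None:
        prompts = [str(default_prompt_length)]

    prompt_ints = []
    for prompt_item in prompts:
        # Only digit strings are accepted, e.g. "128" or "2048"
        if prompt_item.isdigit():
            prompt_ints.append(int(prompt_item))
        else:
            raise ValueError(
                f"Prompts must be integer token lengths, got {prompt_item!r}"
            )
    return prompt_ints

# validate_prompt_lengths(["128", "1024"]) returns [128, 1024]
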
@@ -43,7 +77,7 @@ class LlamaCppBench(Bench):
         iterations: int,
         warmup_iterations: int,
         output_tokens: int,
-    ) -> State:
+    ):
         """
         Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
         """
@@ -61,6 +95,7 @@ class LlamaCppBench(Bench):
 
         per_iteration_tokens_per_second = []
         per_iteration_time_to_first_token = []
+        per_iteration_peak_wset = []
 
         for iteration in range(iterations + warmup_iterations):
             try:
@@ -69,7 +104,10 @@ class LlamaCppBench(Bench):
                 model.time_to_first_token = None
                 model.tokens_per_second = None
                 raw_output, stderr = model.generate(
-                    prompt, max_new_tokens=output_tokens, return_raw=True
+                    prompt,
+                    max_new_tokens=output_tokens,
+                    return_raw=True,
+                    save_max_memory_used=self.save_max_memory_used,
                 )
 
                 if model.time_to_first_token is None or model.tokens_per_second is None:
@@ -85,6 +123,7 @@ class LlamaCppBench(Bench):
             if iteration > warmup_iterations - 1:
                 per_iteration_tokens_per_second.append(model.tokens_per_second)
                 per_iteration_time_to_first_token.append(model.time_to_first_token)
+                per_iteration_peak_wset.append(model.peak_wset)
 
             report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
 
@@ -115,21 +154,101 @@ class LlamaCppBench(Bench):
         except StatisticsError:
             # Less than 2 measurements
             self.std_dev_token_generation_tokens_per_second_list.append(None)
+        if self.save_max_memory_used:
+            filtered_list = [
+                item for item in per_iteration_peak_wset if item is not None
+            ]
+            mean_gb_used = (
+                None
+                if len(filtered_list) == 0
+                else statistics.mean(filtered_list) / 1024**3
+            )
+            self.max_memory_used_gb_list.append(mean_gb_used)
+
+    def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+        if prompts is None:
+            prompts = [default_prompt_length]
+        elif isinstance(prompts, int):
+            prompts = [prompts]
+
+        state.save_stat("prompts", prompts)
+        state.save_stat("iterations", iterations)
+        state.save_stat("output_tokens", output_tokens)
 
-    def save_stats(self, state):
-        super().save_stats(state)
-
-        # Save additional statistics
-        if not all(
-            element is None
-            for element in self.std_dev_token_generation_tokens_per_second_list
-        ):
-            state.save_stat(
-                Keys.STD_DEV_TOKENS_PER_SECOND,
-                self.get_item_or_list(
-                    self.std_dev_token_generation_tokens_per_second_list
-                ),
+        counter = 0
+        report_progress_fn = lambda x: self.set_percent_progress(
+            100 * (counter + x) / len(prompts)
+        )
+        self.first_run_prompt = True
+        for counter, prompt in enumerate(prompts):
+            report_progress_fn(0)
+
+            self.run_prompt_llama_bench_exe(
+                state,
+                prompt,
+                iterations,
+                output_tokens,
             )
+            self.first_run_prompt = False
+
+        self.set_percent_progress(None)
+        self.save_stats(state)
+        return state
+
+    def run_prompt_llama_bench_exe(self, state, prompt, iterations, output_tokens):
+
+        model: LlamaCppAdapter = state.model
+        prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset = (
+            model.benchmark(prompt, iterations, output_tokens)
+        )
+        self.input_ids_len_list.append(prompt_length)
+        self.prefill_tokens_per_second_list.append(pp_tps)
+        self.std_dev_prefill_tokens_per_second_list.append(pp_tps_sd)
+        self.mean_time_to_first_token_list.append(prompt_length / pp_tps)
+        self.token_generation_tokens_per_second_list.append(tg_tps)
+        self.std_dev_token_generation_tokens_per_second_list.append(tg_tps_sd)
+        self.tokens_out_len_list.append(output_tokens * iterations)
+        if self.save_max_memory_used:
+            if peak_wset is not None:
+                self.max_memory_used_gb_list.append(peak_wset / 1024**3)
+            else:
+                self.max_memory_used_gb_list.append(None)
+
+    def run(
+        self,
+        state: State,
+        prompts: list[str] = None,
+        iterations: int = default_iterations,
+        warmup_iterations: int = default_warmup_runs,
+        output_tokens: int = default_output_tokens,
+        cli: bool = False,
+        **kwargs,
+    ) -> State:
+        """
+        Args:
+          - prompts: List of input prompts used as starting points for LLM text generation
+          - iterations: Number of benchmarking samples to take; results are
+            reported as the median and mean of the samples.
+          - warmup_iterations: Subset of the iterations to treat as warmup,
+            and not included in the results.
+          - output_tokens: Number of new tokens for the LLM to create.
+          - cli: Use multiple calls to llama-cli.exe instead of llama-bench.exe
+          - kwargs: Additional parameters used by bench tools
+        """
+
+        # Check that state has the attribute model and that it is a llama.cpp model
+        if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+            raise Exception("Load model using llamacpp-load first.")
+
+        if cli:
+            state = super().run(
+                state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+            )
+        else:
+            state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+        return state
 
 
 # This file was originally licensed under Apache 2.0. It has been modified.
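
The benchmark path above aggregates per-iteration measurements with the statistics module: mean and standard deviation of tokens per second (StatisticsError covers the fewer-than-two-samples case) and peak working set converted from bytes to GiB with missing readings filtered out. A small self-contained sketch of that aggregation, using illustrative sample values rather than measurements from the package:

import statistics
from statistics import StatisticsError


def summarize_iterations(tokens_per_second, peak_wset_bytes):
    # Mean tokens/second across the measured (non-warmup) iterations
    mean_tps = statistics.mean(tokens_per_second)

    # Standard deviation requires at least two samples
    try:
        std_dev_tps = statistics.stdev(tokens_per_second)
    except StatisticsError:
        std_dev_tps = None

    # Peak working set: skip iterations with no reading, then report GiB
    measured = [m for m in peak_wset_bytes if m is not None]
    mean_gb_used = statistics.mean(measured) / 1024**3 if measured else None

    return mean_tps, std_dev_tps, mean_gb_used


# Three iterations, one of which produced no memory reading
print(summarize_iterations([41.8, 43.1, 42.5], [6_442_450_944, None, 6_500_000_000]))
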
lemonade/tools/llamacpp/load.py

@@ -93,9 +93,11 @@ class LoadLlamaCpp(FirstTool):
         from lemonade.tools.llamacpp.utils import (
             install_llamacpp,
             get_llama_cli_exe_path,
+            get_llama_bench_exe_path,
             get_llama_installed_version,
             parse_checkpoint,
             download_gguf,
+            resolve_local_gguf_model,
             get_local_checkpoint_path,
             LlamaCppTokenizerAdapter,
             LlamaCppAdapter,
@@ -103,6 +105,8 @@ class LoadLlamaCpp(FirstTool):
 
         install_llamacpp(backend)
 
+        extension = ""
+
         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
             # input is a local folder
@@ -121,6 +125,17 @@ class LoadLlamaCpp(FirstTool):
                 )
             model_to_use = gguf_files[0]
             full_model_path = os.path.join(local_model_folder, model_to_use)
+            extension = ".gguf"
+
+        elif input.endswith(".gguf") and os.path.isfile(input):
+            # input is a local .gguf file
+            full_model_path = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+            model_to_use = os.path.basename(full_model_path)
+            extension = ".gguf"
 
         else:
             # Input is a model checkpoint
@@ -155,12 +170,21 @@ class LoadLlamaCpp(FirstTool):
                 )
 
             else:
+                # First, try to resolve from local cache to avoid unnecessary downloads
+                base_checkpoint, variant = parse_checkpoint(checkpoint)
+                snapshot_files = resolve_local_gguf_model(
+                    base_checkpoint, variant, None
+                )
+
+                # If not found locally, download from internet
+                if not snapshot_files:
+                    snapshot_files = download_gguf(checkpoint)
 
-                snapshot_files = download_gguf(checkpoint)
                 full_model_path = snapshot_files["variant"]
                 model_to_use = os.path.basename(full_model_path)
 
         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+        llama_bench_exe_path = get_llama_bench_exe_path(backend)
         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
 
         # Get the directory containing the executable for shared libraries
@@ -174,8 +198,10 @@ class LoadLlamaCpp(FirstTool):
             context_size=context_size,
             threads=threads,
             executable=llama_cli_exe_path,
+            bench_executable=llama_bench_exe_path,
             reasoning=reasoning,
             lib_dir=lib_dir,
+            state=state,
         )
         state.tokenizer = LlamaCppTokenizerAdapter()
         state.device = device
@@ -186,7 +212,9 @@ class LoadLlamaCpp(FirstTool):
             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
         )
 
-        status.add_to_state(state=state, name=input, model=model_to_use)
+        status.add_to_state(
+            state=state, name=input, model=model_to_use, extension=extension
+        )
         return state
 
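
The loader now resolves GGUF checkpoints cache-first: parse_checkpoint() splits the checkpoint into a base name and variant, resolve_local_gguf_model() looks for a matching snapshot on disk, and download_gguf() only runs when nothing usable is found locally. A generic sketch of that cache-first pattern follows; the cache layout and helper below are hypothetical and do not mirror lemonade's actual cache structure:

from pathlib import Path


def resolve_gguf(checkpoint: str, cache_dir: str, download_fn):
    # Look for an already-downloaded .gguf file for this checkpoint first
    local_dir = Path(cache_dir) / checkpoint.replace("/", "--")
    if local_dir.is_dir():
        cached = sorted(local_dir.glob("*.gguf"))
        if cached:
            return str(cached[0])

    # Nothing usable on disk: fall back to downloading
    return download_fn(checkpoint)


# resolve_gguf("org/model-GGUF", "/tmp/gguf-cache", download_fn=lambda c: f"downloaded:{c}")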