lemonade-sdk 9.1.1 (lemonade_sdk-9.1.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +180 -0
  3. lemonade/cache.py +92 -0
  4. lemonade/cli.py +173 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/build.py +176 -0
  7. lemonade/common/cli_helpers.py +139 -0
  8. lemonade/common/exceptions.py +98 -0
  9. lemonade/common/filesystem.py +368 -0
  10. lemonade/common/inference_engines.py +408 -0
  11. lemonade/common/network.py +93 -0
  12. lemonade/common/printing.py +110 -0
  13. lemonade/common/status.py +471 -0
  14. lemonade/common/system_info.py +1411 -0
  15. lemonade/common/test_helpers.py +28 -0
  16. lemonade/profilers/__init__.py +1 -0
  17. lemonade/profilers/agt_power.py +437 -0
  18. lemonade/profilers/hwinfo_power.py +429 -0
  19. lemonade/profilers/memory_tracker.py +259 -0
  20. lemonade/profilers/profiler.py +58 -0
  21. lemonade/sequence.py +363 -0
  22. lemonade/state.py +159 -0
  23. lemonade/tools/__init__.py +1 -0
  24. lemonade/tools/accuracy.py +432 -0
  25. lemonade/tools/adapter.py +114 -0
  26. lemonade/tools/bench.py +302 -0
  27. lemonade/tools/flm/__init__.py +1 -0
  28. lemonade/tools/flm/utils.py +305 -0
  29. lemonade/tools/huggingface/bench.py +187 -0
  30. lemonade/tools/huggingface/load.py +235 -0
  31. lemonade/tools/huggingface/utils.py +359 -0
  32. lemonade/tools/humaneval.py +264 -0
  33. lemonade/tools/llamacpp/bench.py +255 -0
  34. lemonade/tools/llamacpp/load.py +222 -0
  35. lemonade/tools/llamacpp/utils.py +1260 -0
  36. lemonade/tools/management_tools.py +319 -0
  37. lemonade/tools/mmlu.py +319 -0
  38. lemonade/tools/oga/__init__.py +0 -0
  39. lemonade/tools/oga/bench.py +120 -0
  40. lemonade/tools/oga/load.py +804 -0
  41. lemonade/tools/oga/migration.py +403 -0
  42. lemonade/tools/oga/utils.py +462 -0
  43. lemonade/tools/perplexity.py +147 -0
  44. lemonade/tools/prompt.py +263 -0
  45. lemonade/tools/report/__init__.py +0 -0
  46. lemonade/tools/report/llm_report.py +203 -0
  47. lemonade/tools/report/table.py +899 -0
  48. lemonade/tools/server/__init__.py +0 -0
  49. lemonade/tools/server/flm.py +133 -0
  50. lemonade/tools/server/llamacpp.py +320 -0
  51. lemonade/tools/server/serve.py +2123 -0
  52. lemonade/tools/server/static/favicon.ico +0 -0
  53. lemonade/tools/server/static/index.html +279 -0
  54. lemonade/tools/server/static/js/chat.js +1059 -0
  55. lemonade/tools/server/static/js/model-settings.js +183 -0
  56. lemonade/tools/server/static/js/models.js +1395 -0
  57. lemonade/tools/server/static/js/shared.js +556 -0
  58. lemonade/tools/server/static/logs.html +191 -0
  59. lemonade/tools/server/static/styles.css +2654 -0
  60. lemonade/tools/server/static/webapp.html +321 -0
  61. lemonade/tools/server/tool_calls.py +153 -0
  62. lemonade/tools/server/tray.py +664 -0
  63. lemonade/tools/server/utils/macos_tray.py +226 -0
  64. lemonade/tools/server/utils/port.py +77 -0
  65. lemonade/tools/server/utils/thread.py +85 -0
  66. lemonade/tools/server/utils/windows_tray.py +408 -0
  67. lemonade/tools/server/webapp.py +34 -0
  68. lemonade/tools/server/wrapped_server.py +559 -0
  69. lemonade/tools/tool.py +374 -0
  70. lemonade/version.py +1 -0
  71. lemonade_install/__init__.py +1 -0
  72. lemonade_install/install.py +239 -0
  73. lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
  74. lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
  75. lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
  76. lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
  77. lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
  78. lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
  79. lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
  80. lemonade_server/cli.py +805 -0
  81. lemonade_server/model_manager.py +758 -0
  82. lemonade_server/pydantic_models.py +159 -0
  83. lemonade_server/server_models.json +643 -0
  84. lemonade_server/settings.py +39 -0
lemonade/tools/llamacpp/bench.py
@@ -0,0 +1,255 @@
+ import argparse
+ import statistics
+ from statistics import StatisticsError
+ from lemonade.state import State
+ from lemonade.tools.tool import Tool
+ from lemonade.tools.llamacpp.utils import LlamaCppAdapter
+ from lemonade.tools.bench import (
+     Bench,
+     default_prompt_length,
+     default_iterations,
+     default_output_tokens,
+     default_warmup_runs,
+ )
+
+
+ class LlamaCppBench(Bench):
+     """
+     Benchmark a llama.cpp model
+     """
+
+     unique_name = "llamacpp-bench"
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Benchmark an LLM in llama.cpp",
+             add_help=add_help,
+         )
+
+         parser = Bench.parser(parser)
+
+         parser.add_argument(
+             "--cli",
+             action="store_true",
+             help="Set this flag to use llama-cli.exe to benchmark model performance. "
+             "This executable will be called once per iteration. Otherwise, "
+             "llama-bench.exe is used by default. In this default behavior, "
+             "the only valid prompt format is integer token lengths. Also, the "
+             "warmup-iterations parameter is ignored and the default value for number of "
+             "threads is 16.",
+         )
+
+         return parser
+
+     def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+         """
+         Helper function to parse CLI arguments into the args expected by run()
+         """
+
+         # Call the Tool parse method, NOT the Bench parse method
+         parsed_args = Tool.parse(self, state, args, known_only)
+
+         if parsed_args.cli:
+             parsed_args = super().parse(state, args, known_only)
+         else:
+             # Make sure prompts is a list of integers
+             if parsed_args.prompts is None:
+                 parsed_args.prompts = [default_prompt_length]
+             prompt_ints = []
+             for prompt_item in parsed_args.prompts:
+                 if prompt_item.isdigit():
+                     prompt_ints.append(int(prompt_item))
+                 else:
+                     raise Exception(
+                         f"When not using the --cli flag to {self.unique_name}, prompts "
+                         "must be given as integer token lengths."
+                     )
+             parsed_args.prompts = prompt_ints
+
+         return parsed_args
+
+     def run_prompt(
+         self,
+         state: State,
+         report_progress_fn,
+         prompt: str,
+         iterations: int,
+         warmup_iterations: int,
+         output_tokens: int,
+     ):
+         """
+         Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
+         """
+
+         if self.first_run_prompt:
+
+             if not hasattr(state, "model") or not isinstance(
+                 state.model, LlamaCppAdapter
+             ):
+                 raise Exception(
+                     f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
+                     "loaded first. Please run llamacpp-load before this tool."
+                 )
+         model: LlamaCppAdapter = state.model
+
+         per_iteration_tokens_per_second = []
+         per_iteration_time_to_first_token = []
+         per_iteration_peak_wset = []
+
+         for iteration in range(iterations + warmup_iterations):
+             try:
+                 # Use the adapter's generate method which already has the timeout
+                 # and error handling
+                 model.time_to_first_token = None
+                 model.tokens_per_second = None
+                 raw_output, stderr = model.generate(
+                     prompt,
+                     max_new_tokens=output_tokens,
+                     return_raw=True,
+                     save_max_memory_used=self.save_max_memory_used,
+                 )
+
+                 if model.time_to_first_token is None or model.tokens_per_second is None:
+                     error_msg = (
+                         "Could not find timing information in llama.cpp output.\n"
+                     )
+                     error_msg += "Raw output:\n" + raw_output + "\n"
+                     error_msg += "Stderr:\n" + stderr
+                     raise Exception(error_msg)
+
+                 self.tokens_out_len_list.append(model.response_tokens)
+
+                 if iteration > warmup_iterations - 1:
+                     per_iteration_tokens_per_second.append(model.tokens_per_second)
+                     per_iteration_time_to_first_token.append(model.time_to_first_token)
+                     per_iteration_peak_wset.append(model.peak_wset)
+
+                 report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
+
+             except Exception as e:
+                 error_msg = f"Failed to run benchmark: {str(e)}"
+                 raise Exception(error_msg)
+
+         self.input_ids_len_list.append(model.prompt_tokens)
+         mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
+         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+         self.prefill_tokens_per_second_list.append(
+             model.prompt_tokens / mean_time_to_first_token
+         )
+         self.token_generation_tokens_per_second_list.append(
+             statistics.mean(per_iteration_tokens_per_second)
+         )
+         try:
+             self.std_dev_time_to_first_token_list.append(
+                 statistics.stdev(per_iteration_time_to_first_token)
+             )
+         except StatisticsError:
+             # Less than 2 measurements
+             self.std_dev_time_to_first_token_list.append(None)
+         try:
+             self.std_dev_token_generation_tokens_per_second_list.append(
+                 statistics.stdev(per_iteration_tokens_per_second)
+             )
+         except StatisticsError:
+             # Less than 2 measurements
+             self.std_dev_token_generation_tokens_per_second_list.append(None)
+         if self.save_max_memory_used:
+             filtered_list = [
+                 item for item in per_iteration_peak_wset if item is not None
+             ]
+             mean_gb_used = (
+                 None
+                 if len(filtered_list) == 0
+                 else statistics.mean(filtered_list) / 1024**3
+             )
+             self.max_memory_used_gb_list.append(mean_gb_used)
+
+     def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+         if prompts is None:
+             prompts = [default_prompt_length]
+         elif isinstance(prompts, int):
+             prompts = [prompts]
+
+         state.save_stat("prompts", prompts)
+         state.save_stat("iterations", iterations)
+         state.save_stat("output_tokens", output_tokens)
+
+         counter = 0
+         report_progress_fn = lambda x: self.set_percent_progress(
+             100 * (counter + x) / len(prompts)
+         )
+         self.first_run_prompt = True
+         for counter, prompt in enumerate(prompts):
+             report_progress_fn(0)
+
+             self.run_prompt_llama_bench_exe(
+                 state,
+                 prompt,
+                 iterations,
+                 output_tokens,
+             )
+             self.first_run_prompt = False
+
+         self.set_percent_progress(None)
+         self.save_stats(state)
+         return state
+
+     def run_prompt_llama_bench_exe(self, state, prompt, iterations, output_tokens):
+
+         model: LlamaCppAdapter = state.model
+         prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset = (
+             model.benchmark(prompt, iterations, output_tokens)
+         )
+         self.input_ids_len_list.append(prompt_length)
+         self.prefill_tokens_per_second_list.append(pp_tps)
+         self.std_dev_prefill_tokens_per_second_list.append(pp_tps_sd)
+         self.mean_time_to_first_token_list.append(prompt_length / pp_tps)
+         self.token_generation_tokens_per_second_list.append(tg_tps)
+         self.std_dev_token_generation_tokens_per_second_list.append(tg_tps_sd)
+         self.tokens_out_len_list.append(output_tokens * iterations)
+         if self.save_max_memory_used:
+             if peak_wset is not None:
+                 self.max_memory_used_gb_list.append(peak_wset / 1024**3)
+             else:
+                 self.max_memory_used_gb_list.append(None)
+
+     def run(
+         self,
+         state: State,
+         prompts: list[str] = None,
+         iterations: int = default_iterations,
+         warmup_iterations: int = default_warmup_runs,
+         output_tokens: int = default_output_tokens,
+         cli: bool = False,
+         **kwargs,
+     ) -> State:
+         """
+         Args:
+             - prompts: List of input prompts used as starting points for LLM text generation
+             - iterations: Number of benchmarking samples to take; results are
+                 reported as the median and mean of the samples.
+             - warmup_iterations: Subset of the iterations to treat as warmup,
+                 and not included in the results.
+             - output_tokens: Number of new tokens for the LLM to create.
+             - cli: Use multiple calls to llama-cli.exe instead of llama-bench.exe
+             - kwargs: Additional parameters used by bench tools
+         """
+
+         # Check that state has the attribute model and that it is a LlamaCppAdapter
+         if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+             raise Exception("Load model using llamacpp-load first.")
+
+         if cli:
+             state = super().run(
+                 state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+             )
+         else:
+             state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+         return state
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
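
The two benchmarking paths above can also be driven directly from Python. A minimal sketch, assuming `state` already carries a `LlamaCppAdapter` produced by the `llamacpp-load` tool shown in the next file (the construction of `state` itself is not part of this diff):

    from lemonade.tools.llamacpp.bench import LlamaCppBench

    bench = LlamaCppBench()

    # Default path: llama-bench.exe, prompts given as integer token lengths
    state = bench.run(state, prompts=[128, 256], iterations=5, output_tokens=64)

    # --cli path: llama-cli.exe is launched once per iteration, and the
    # warmup iterations are excluded from the reported statistics
    state = bench.run(state, prompts=[128], iterations=5, warmup_iterations=2, cli=True)
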
lemonade/tools/llamacpp/load.py
@@ -0,0 +1,222 @@
+ import argparse
+ import os
+ import lemonade.common.printing as printing
+ import lemonade.common.status as status
+ from lemonade.state import State
+ from lemonade.tools import FirstTool
+ from lemonade.cache import Keys
+
+
+ class LoadLlamaCpp(FirstTool):
+     unique_name = "llamacpp-load"
+
+     def __init__(self):
+         super().__init__(monitor_message="Loading llama.cpp model")
+
+         self.status_stats = [
+             Keys.DEVICE,
+         ]
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Wrap llama.cpp models with an API",
+             add_help=add_help,
+         )
+
+         parser.add_argument(
+             "-d",
+             "--device",
+             choices=["cpu", "igpu"],
+             default="igpu",
+             help="Which device to load the model onto (default: igpu)",
+         )
+
+         default_threads = -1
+         parser.add_argument(
+             "--threads",
+             required=False,
+             type=int,
+             default=default_threads,
+             help=f"Number of threads to use during generation (default: {default_threads})",
+         )
+
+         context_size = 4096
+         parser.add_argument(
+             "--context-size",
+             required=False,
+             type=int,
+             default=context_size,
+             help=f"Size of the prompt context (default: {context_size}; 0 = loaded from model)",
+         )
+
+         output_tokens = 512
+         parser.add_argument(
+             "--output-tokens",
+             required=False,
+             type=int,
+             default=output_tokens,
+             help=f"Maximum number of output tokens to generate (default: {output_tokens})",
+         )
+
+         parser.add_argument(
+             "--reasoning",
+             action="store_true",
+             help="Set this flag to indicate the model is a reasoning model",
+         )
+
+         parser.add_argument(
+             "--backend",
+             choices=["vulkan", "rocm", "cpu"],
+             default="vulkan",
+             help="Backend to use for llama.cpp (default: vulkan)",
+         )
+
+         return parser
+
+     def run(
+         self,
+         state: State,
+         input: str = "",
+         device: str = "igpu",
+         context_size: int = 512,
+         threads: int = 1,
+         output_tokens: int = 512,
+         reasoning: bool = False,
+         backend: str = "vulkan",
+     ) -> State:
+         """
+         Load a llama.cpp model
+         """
+
+         from lemonade.common.network import is_offline
+         from lemonade.tools.llamacpp.utils import (
+             install_llamacpp,
+             get_llama_cli_exe_path,
+             get_llama_bench_exe_path,
+             get_llama_installed_version,
+             parse_checkpoint,
+             download_gguf,
+             resolve_local_gguf_model,
+             get_local_checkpoint_path,
+             LlamaCppTokenizerAdapter,
+             LlamaCppAdapter,
+         )
+
+         install_llamacpp(backend)
+
+         extension = ""
+
+         # Check if input is a local folder containing a .GGUF model
+         if os.path.isdir(input):
+             # input is a local folder
+             local_model_folder = os.path.abspath(input)
+             checkpoint = "local_model"
+             state.checkpoint = checkpoint
+             state.save_stat(Keys.CHECKPOINT, checkpoint)
+             state.save_stat(Keys.LOCAL_MODEL_FOLDER, local_model_folder)
+
+             # See if there is a file ending in ".gguf" in this folder
+             dir = os.listdir(input)
+             gguf_files = [filename for filename in dir if filename.endswith(".gguf")]
+             if len(gguf_files) == 0:
+                 raise ValueError(
+                     f"The folder {input} does not contain a GGUF model file."
+                 )
+             model_to_use = gguf_files[0]
+             full_model_path = os.path.join(local_model_folder, model_to_use)
+             extension = ".gguf"
+
+         elif input.endswith(".gguf") and os.path.isfile(input):
+             # input is a local .gguf file
+             full_model_path = os.path.abspath(input)
+             checkpoint = "local_model"
+             state.checkpoint = checkpoint
+             state.save_stat(Keys.CHECKPOINT, checkpoint)
+             state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+             model_to_use = os.path.basename(full_model_path)
+             extension = ".gguf"
+
+         else:
+             # Input is a model checkpoint
+             checkpoint = input
+             state.checkpoint = checkpoint
+             state.save_stat(Keys.CHECKPOINT, checkpoint)
+
+             # Make sure that a variant is provided for the GGUF model
+             base_checkpoint, variant = parse_checkpoint(checkpoint)
+             if variant is None:
+                 raise ValueError(
+                     "You are required to provide a 'variant' when "
+                     "selecting a GGUF model. The variant is provided "
+                     "as CHECKPOINT:VARIANT. For example: "
+                     "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                     "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
+                 )
+
+             # Auto-detect offline status
+             offline = is_offline()
+             if offline:
+                 printing.log_warning(
+                     "Network connectivity to huggingface.co not detected. Running in offline mode."
+                 )
+                 full_model_path, model_to_use = get_local_checkpoint_path(
+                     base_checkpoint, variant
+                 )
+                 if not full_model_path:
+                     raise ValueError(
+                         f"Model {checkpoint} is not available locally. "
+                         "Cannot download in offline mode."
+                     )
+
+             else:
+                 # First, try to resolve from local cache to avoid unnecessary downloads
+                 base_checkpoint, variant = parse_checkpoint(checkpoint)
+                 snapshot_files = resolve_local_gguf_model(
+                     base_checkpoint, variant, None
+                 )
+
+                 # If not found locally, download from internet
+                 if not snapshot_files:
+                     snapshot_files = download_gguf(checkpoint)
+
+                 full_model_path = snapshot_files["variant"]
+                 model_to_use = os.path.basename(full_model_path)
+
+         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+         llama_bench_exe_path = get_llama_bench_exe_path(backend)
+         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
+
+         # Get the directory containing the executable for shared libraries
+         lib_dir = os.path.dirname(llama_cli_exe_path)
+
+         # Pass the model and inputs into state
+         state.model = LlamaCppAdapter(
+             model=full_model_path,
+             device=device,
+             output_tokens=output_tokens,
+             context_size=context_size,
+             threads=threads,
+             executable=llama_cli_exe_path,
+             bench_executable=llama_bench_exe_path,
+             reasoning=reasoning,
+             lib_dir=lib_dir,
+             state=state,
+         )
+         state.tokenizer = LlamaCppTokenizerAdapter()
+         state.device = device
+
+         # Save initial stats
+         state.save_stat(Keys.DEVICE, device)
+         state.save_stat(
+             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
+         )
+
+         status.add_to_state(
+             state=state, name=input, model=model_to_use, extension=extension
+         )
+         return state
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
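
The run() method above accepts three forms of input, matching the three branches in the code: a local folder containing a .gguf file, a specific local .gguf file, or a checkpoint in CHECKPOINT:VARIANT form. A minimal sketch of each, assuming an existing `state` object (its construction is not shown in this diff; the local paths are placeholders):

    from lemonade.tools.llamacpp.load import LoadLlamaCpp

    load = LoadLlamaCpp()

    # 1) A local folder that contains a .gguf file (the first one found is used)
    state = load.run(state, input="/models/my-model-dir", device="igpu")

    # 2) A specific local .gguf file
    state = load.run(state, input="/models/my-model-q4_0.gguf", backend="vulkan")

    # 3) A checkpoint in CHECKPOINT:VARIANT form, resolved from the local cache
    #    or downloaded (unless offline mode is detected)
    state = load.run(state, input="Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0")
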