lemonade-sdk 8.0.5__py3-none-any.whl → 8.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lemonade-sdk might be problematic.

lemonade/cache.py CHANGED
@@ -34,7 +34,7 @@ def build_name(input_name):
     """
     Name the lemonade build by concatenating these two factors:
     1. Sanitize the input name (typically a model checkpoint name) by
-        replacing any `/` characters with `_`.
+        replacing any `/` characters with `_` and ':' characters with '-'.
     2. Timestamp to ensure that builds in the same cache will not
         collide in the same build directory.
 
@@ -47,6 +47,7 @@ def build_name(input_name):
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")
+        input_name_sanitized = input_name_sanitized.replace(":", "-")
 
         # Get the formatted timestamp string
         timestamp = get_timestamp()
@@ -79,6 +80,7 @@ class Keys:
     MAX_MEMORY_USED_GB = "max_memory_used_GB"
     MAX_MEMORY_USED_GBYTE = "max_memory_used_gbyte"
     RYZEN_AI_VERSION_INFO = "ryzen_ai_version_info"
+    LLAMA_CLI_VERSION_INFO = "llama_cli_version_info"
 
 
 # This file was originally licensed under Apache 2.0. It has been modified.
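
For reference, a minimal sketch of the sanitization that build_name now applies before appending its timestamp. This is illustrative only; the helper name sanitize_checkpoint_name is hypothetical and the timestamp suffix is omitted.

    def sanitize_checkpoint_name(input_name: str) -> str:
        # Mirror the new behavior: "/" -> "_" and ":" -> "-", so GGUF
        # checkpoints written as CHECKPOINT:VARIANT yield valid directory names.
        sanitized = input_name.replace("/", "_")
        return sanitized.replace(":", "-")

    # "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0"
    #     -> "Qwen_Qwen2.5-Coder-3B-Instruct-GGUF-Q4_0"
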
lemonade/tools/adapter.py CHANGED
@@ -13,6 +13,9 @@ class ModelAdapter(abc.ABC):
         """
         self.tokens_per_second = None
         self.time_to_first_token = None
+        self.prompt_tokens = None
+        self.response_tokens = None
+
         self.type = "generic"
 
     @abc.abstractmethod
@@ -22,6 +25,9 @@ class ModelAdapter(abc.ABC):
 
         We try to keep the signature here minimal to allow for maximum compatibility
         with recipe components, which themselves may not support a lot of arguments.
+
+        The generate method should store prompt and response lengths (in tokens)
+        in the prompt_tokens and response_tokens members.
         """
 
 
@@ -108,7 +108,9 @@ class HuggingfaceAdapter(ModelAdapter):
         with torch.no_grad(), torch.inference_mode():
             outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
 
-        return outputs
+        self.prompt_tokens = input_ids.shape[1]
+        self.response_tokens = len(outputs[0]) - self.prompt_tokens
+        return outputs
 
     def _model_call(self, input_tensor):
         """Forward pass through the model to get logits
@@ -341,12 +343,11 @@ def benchmark_huggingface_llm(
 
         latency = end_time - start_time
 
-        token_len = outputs.shape[1] - input_ids.shape[1]
-        tokens_out_len_list.append(token_len)
+        tokens_out_len_list.append(model.response_tokens)
 
         # Only count an iteration if it produced enough tokens
-        if token_len >= target_output_tokens:
-            per_iteration_result.append((latency, token_len))
+        if model.response_tokens >= target_output_tokens:
+            per_iteration_result.append((latency, model.response_tokens))
 
         report_progress_fn(
             (warmup_iterations + count + 1) / (warmup_iterations + iterations)
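
As a hedged aside, per-iteration (latency, token count) pairs like the ones collected above can be reduced to a throughput summary roughly as follows; the numbers are made up and this is not the package's exact reporting code.

    import statistics

    # (seconds, response tokens) per measured iteration -- placeholder values
    per_iteration_result = [(2.10, 256), (2.05, 256), (2.20, 256)]

    # Naive throughput per iteration; real reporting may exclude time-to-first-token
    tokens_per_second = [tokens / latency for latency, tokens in per_iteration_result]
    print(f"mean tokens/s: {statistics.mean(tokens_per_second):.1f}")
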
@@ -3,27 +3,31 @@ import statistics
 from statistics import StatisticsError
 from lemonade.state import State
 from lemonade.cache import Keys
-from lemonade.tools.llamacpp.load import LlamaCppAdapter
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
 from lemonade.tools.bench import Bench
 
 
 class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """
 
-    unique_name = "llama-cpp-bench"
+    unique_name = "llamacpp-bench"
 
     def __init__(self):
         super().__init__()
 
         # Additional statistics generated by this bench tool
-        self.status_stats += [
+        self.status_stats.insert(
+            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
             Keys.STD_DEV_TOKENS_PER_SECOND,
-        ]
+        )
         self.std_dev_token_generation_tokens_per_second_list = []
 
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Benchmark a llama.cpp model",
+            short_description="Benchmark an LLM in llama.cpp",
             add_help=add_help,
         )
 
@@ -53,38 +57,20 @@ class LlamaCppBench(Bench):
                 f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
                 "loaded first. Please run load-llama-cpp before this tool."
             )
+        model: LlamaCppAdapter = state.model
 
-        iteration_tokens_per_second = []
-        iteration_time_to_first_token = []
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []
 
         for iteration in range(iterations + warmup_iterations):
             try:
                 # Use the adapter's generate method which already has the timeout
                 # and error handling
-                raw_output, stderr = state.model.generate(prompt, return_raw=True)
-
-                # Parse the timing information from the output
-                ms_per_token = None
-                time_to_first_token_ms = None
-                input_tokens = None
-
-                # Look for timing in both stdout and stderr
-                for output in [raw_output, stderr]:
-                    for line in output.splitlines():
-                        if "llama_perf_context_print: eval time =" in line:
-                            parts = line.split("(")[1].strip()
-                            parts = parts.split(",")
-                            ms_per_token = float(
-                                parts[0].split("ms per token")[0].strip()
-                            )
-                        if "llama_perf_context_print: prompt eval time =" in line:
-                            parts = line.split("=")[1].split("/")
-                            time_to_first_token_ms = float(
-                                parts[0].split("ms")[0].strip()
-                            )
-                            input_tokens = int(parts[1].split("tokens")[0].strip())
-
-                if ms_per_token is None or time_to_first_token_ms is None:
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(prompt, return_raw=True)
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
                     error_msg = (
                         "Could not find timing information in llama.cpp output.\n"
                     )
@@ -92,17 +78,11 @@ class LlamaCppBench(Bench):
                     error_msg += "Stderr:\n" + stderr
                     raise Exception(error_msg)
 
-                # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0
-                # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
-                # as performance data for generating a few tokens is not relevant.
-                tokens_per_second = 0
-                if output_tokens > 5 and ms_per_token > 0:
-                    tokens_per_second = 1000 / ms_per_token
-                time_to_first_token = time_to_first_token_ms / 1000
+                self.tokens_out_len_list.append(model.response_tokens)
 
                 if iteration > warmup_iterations - 1:
-                    iteration_tokens_per_second.append(tokens_per_second)
-                    iteration_time_to_first_token.append(time_to_first_token)
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)
 
                 report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
 
@@ -110,25 +90,25 @@ class LlamaCppBench(Bench):
                 error_msg = f"Failed to run benchmark: {str(e)}"
                 raise Exception(error_msg)
 
-        self.input_ids_len_list.append(input_tokens)
-        mean_time_to_first_token = statistics.mean(iteration_time_to_first_token)
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
         self.prefill_tokens_per_second_list.append(
-            input_tokens / mean_time_to_first_token
+            model.prompt_tokens / mean_time_to_first_token
         )
         self.token_generation_tokens_per_second_list.append(
-            statistics.mean(iteration_tokens_per_second)
+            statistics.mean(per_iteration_tokens_per_second)
         )
         try:
             self.std_dev_time_to_first_token_list.append(
-                statistics.stdev(iteration_time_to_first_token)
+                statistics.stdev(per_iteration_time_to_first_token)
             )
         except StatisticsError:
             # Less than 2 measurements
             self.std_dev_time_to_first_token_list.append(None)
         try:
             self.std_dev_token_generation_tokens_per_second_list.append(
-                statistics.stdev(iteration_tokens_per_second)
+                statistics.stdev(per_iteration_tokens_per_second)
             )
         except StatisticsError:
             # Less than 2 measurements
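
The constructor above switches from += to list.insert so that the standard-deviation stat sits immediately after its corresponding mean in the reported ordering. A small sketch of the pattern, using placeholder key strings rather than the actual Keys values:

    status_stats = ["time_to_first_token", "token_generation_tokens_per_second", "max_memory_used_GB"]

    # Place the std-dev key immediately after its corresponding mean key
    status_stats.insert(
        status_stats.index("token_generation_tokens_per_second") + 1,
        "std_dev_tokens_per_second",
    )
    # ["time_to_first_token", "token_generation_tokens_per_second",
    #  "std_dev_tokens_per_second", "max_memory_used_GB"]
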
@@ -1,166 +1,22 @@
 import argparse
 import os
-from typing import Optional
-import subprocess
-from lemonade.state import State
+import lemonade.common.printing as printing
 import lemonade.common.status as status
+from lemonade.state import State
 from lemonade.tools import FirstTool
-from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
 from lemonade.cache import Keys
 
 
-class LlamaCppAdapter(ModelAdapter):
-    def __init__(
-        self, model, output_tokens, context_size, threads, executable, lib_dir=None
-    ):
-        super().__init__()
-
-        self.model = os.path.normpath(model)
-        self.output_tokens = output_tokens
-        self.context_size = context_size
-        self.threads = threads
-        self.executable = os.path.normpath(executable)
-        self.lib_dir = lib_dir
-
-    def generate(
-        self,
-        input_ids: str,
-        max_new_tokens: Optional[int] = None,
-        temperature: float = 0.8,
-        top_p: float = 0.95,
-        top_k: int = 40,
-        return_raw: bool = False,
-        **kwargs,  # pylint: disable=unused-argument
-    ):
-        """
-        Pass a text prompt into the llamacpp inference CLI.
-
-        The input_ids arg here should receive the original text that
-        would normally be encoded by a tokenizer.
-
-        Args:
-            input_ids: The input text prompt
-            max_new_tokens: Maximum number of tokens to generate
-            temperature: Temperature for sampling (0.0 = greedy)
-            top_p: Top-p sampling threshold
-            top_k: Top-k sampling threshold
-            return_raw: If True, returns the complete raw output including timing info
-            **kwargs: Additional arguments (ignored)
-
-        Returns:
-            List containing a single string with the generated text, or raw output if
-            return_raw=True
-        """
-
-        prompt = input_ids
-        n_predict = max_new_tokens if max_new_tokens is not None else self.output_tokens
-
-        cmd = [
-            self.executable,
-            "-m",
-            self.model,
-            "--ctx-size",
-            str(self.context_size),
-            "-n",
-            str(n_predict),
-            "-t",
-            str(self.threads),
-            "-p",
-            prompt,
-            "--temp",
-            str(temperature),
-            "--top-p",
-            str(top_p),
-            "--top-k",
-            str(top_k),
-            "-e",
-            "-no-cnv",
-        ]
-
-        cmd = [str(m) for m in cmd]
-
-        try:
-            # Set up environment with library path for Linux
-            env = os.environ.copy()
-            if self.lib_dir and os.name != "nt":  # Not Windows
-                current_ld_path = env.get("LD_LIBRARY_PATH", "")
-                if current_ld_path:
-                    env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
-                else:
-                    env["LD_LIBRARY_PATH"] = self.lib_dir
-
-            process = subprocess.Popen(
-                cmd,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                universal_newlines=True,
-                encoding="utf-8",
-                errors="replace",
-                env=env,
-            )
-
-            raw_output, stderr = process.communicate(timeout=600)
-            if process.returncode != 0:
-                error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
-                error_msg += f"Command: {' '.join(cmd)}\n"
-                error_msg += f"Error output:\n{stderr}\n"
-                error_msg += f"Standard output:\n{raw_output}"
-                raise Exception(error_msg)
-
-            if raw_output is None:
-                raise Exception("No output received from llama.cpp process")
-
-            # Parse timing information
-            for line in raw_output.splitlines():
-                if "llama_perf_context_print: eval time =" in line:
-                    parts = line.split("(")[1].strip()
-                    parts = parts.split(",")
-                    ms_per_token = float(parts[0].split("ms per token")[0].strip())
-                    self.tokens_per_second = (
-                        1000 / ms_per_token if ms_per_token > 0 else 0
-                    )
-                if "llama_perf_context_print: prompt eval time =" in line:
-                    parts = line.split("=")[1].split("/")[0]
-                    time_to_first_token_ms = float(parts.split("ms")[0].strip())
-                    self.time_to_first_token = time_to_first_token_ms / 1000
-
-            if return_raw:
-                return [raw_output, stderr]
-
-            # Find where the prompt ends and the generated text begins
-            prompt_found = False
-            output_text = ""
-            prompt_first_line = prompt.split("\n")[0]
-            for line in raw_output.splitlines():
-                if prompt_first_line in line:
-                    prompt_found = True
-                if prompt_found:
-                    line = line.replace("</s> [end of text]", "")
-                    output_text = output_text + line
-
-            if not prompt_found:
-                raise Exception(
-                    f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
-                    "This usually means the model failed to process the prompt correctly.\n"
-                    f"Raw output:\n{raw_output}\n"
-                    f"Stderr:\n{stderr}"
-                )
-
-            # Return list containing the generated text
-            return [output_text]
-
-        except Exception as e:
-            error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
-            error_msg += f"Command: {' '.join(cmd)}"
-            raise Exception(error_msg)
-
-
 class LoadLlamaCpp(FirstTool):
-    unique_name = "load-llama-cpp"
+    unique_name = "llamacpp-load"
 
     def __init__(self):
         super().__init__(monitor_message="Loading llama.cpp model")
 
+        self.status_stats = [
+            Keys.DEVICE,
+        ]
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
@@ -169,28 +25,29 @@ class LoadLlamaCpp(FirstTool):
         )
 
         parser.add_argument(
-            "--executable",
-            required=True,
-            type=str,
-            help="Path to the llama.cpp executable (e.g., llama-cli or llama-cli.exe)",
+            "-d",
+            "--device",
+            choices=["cpu", "igpu"],
+            default="igpu",
+            help="Which device to load the model on to (default: igpu)",
         )
 
-        default_threads = 1
+        default_threads = -1
         parser.add_argument(
             "--threads",
             required=False,
             type=int,
             default=default_threads,
-            help=f"Number of threads to use for generation (default: {default_threads})",
+            help=f"Number of threads to use during generation (default: {default_threads})",
         )
 
-        context_size = 512
+        context_size = 4096
         parser.add_argument(
             "--context-size",
             required=False,
             type=int,
             default=context_size,
-            help=f"Context size of the prompt (default: {context_size})",
+            help=f"Size of the prompt context (default: {context_size}. 0 = loaded from model)",
         )
 
         output_tokens = 512
@@ -199,14 +56,13 @@ class LoadLlamaCpp(FirstTool):
             required=False,
             type=int,
             default=output_tokens,
-            help=f"Maximum number of output tokens the LLM should make (default: {output_tokens})",
+            help=f"Maximum number of output tokens to generate (default: {output_tokens})",
         )
 
         parser.add_argument(
-            "--model-binary",
-            required=True,
-            type=str,
-            help="Path to a .gguf model file",
+            "--reasoning",
+            action="store_true",
+            help="Set this flag to indicate the model is a reasoning model",
         )
 
         return parser
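
Taken together, the loader's new command-line surface looks roughly like the standalone argparse sketch below. helpful_parser is assumed to wrap argparse.ArgumentParser, the description string is a placeholder, and the --output-tokens flag name is inferred from the variable and help text rather than shown directly in this diff.

    import argparse

    parser = argparse.ArgumentParser(description="Load an LLM in llama.cpp")
    parser.add_argument("-d", "--device", choices=["cpu", "igpu"], default="igpu")
    parser.add_argument("--threads", type=int, default=-1)
    parser.add_argument("--context-size", type=int, default=4096)
    parser.add_argument("--output-tokens", type=int, default=512)  # flag name assumed
    parser.add_argument("--reasoning", action="store_true")

    args = parser.parse_args(["--context-size", "8192", "--reasoning"])
    print(args.device, args.context_size, args.reasoning)  # igpu 8192 True
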
@@ -215,61 +71,113 @@ class LoadLlamaCpp(FirstTool):
         self,
         state: State,
         input: str = "",
+        device: str = "igpu",
         context_size: int = 512,
         threads: int = 1,
         output_tokens: int = 512,
-        model_binary: Optional[str] = None,
-        executable: str = None,
-        lib_dir: Optional[str] = None,
+        reasoning: bool = False,
     ) -> State:
         """
         Load a llama.cpp model
         """
 
-        from lemonade.common.network import get_base_model
-
-        if executable is None:
-            raise Exception(f"{self.__class__.unique_name} requires an executable path")
+        from lemonade.common.network import is_offline
+        from lemonade.tools.llamacpp.utils import (
+            install_llamacpp,
+            get_llama_cli_exe_path,
+            get_llama_installed_version,
+            parse_checkpoint,
+            download_gguf,
+            get_local_checkpoint_path,
+            LlamaCppTokenizerAdapter,
+            LlamaCppAdapter,
+        )
 
-        # Convert paths to platform-specific format
-        executable = os.path.normpath(executable)
+        # Validate and install llama.cpp, if needed
+        install_llamacpp()
+
+        # Check if input is a local folder containing a .GGUF model
+        if os.path.isdir(input):
+            # input is a local folder
+            local_model_folder = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, local_model_folder)
+
+            # See if there is a file ending in ".gguf" in this folder
+            dir = os.listdir(input)
+            gguf_files = [filename for filename in dir if filename.endswith(".gguf")]
+            if len(gguf_files) == 0:
+                raise ValueError(
+                    f"The folder {input} does not contain a GGUF model file."
+                )
+            model_to_use = gguf_files[0]
+            full_model_path = os.path.join(local_model_folder, model_to_use)
 
-        if model_binary:
-            model_to_use = os.path.normpath(model_binary)
         else:
-            model_binary = input
-            model_to_use = os.path.normpath(model_binary) if model_binary else None
+            # Input is a model checkpoint
+            checkpoint = input
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+
+            # Make sure that a variant is provided for the GGUF model
+            base_checkpoint, variant = parse_checkpoint(checkpoint)
+            if variant is None:
+                raise ValueError(
+                    "You are required to provide a 'variant' when "
+                    "selecting a GGUF model. The variant is provided "
+                    "as CHECKPOINT:VARIANT. For example: "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
+                )
+
+            # Auto-detect offline status
+            offline = is_offline()
+            if offline:
+                printing.log_warning(
+                    "Network connectivity to huggingface.co not detected. Running in offline mode."
+                )
+                full_model_path, model_to_use = get_local_checkpoint_path(
+                    base_checkpoint, variant
+                )
+                if not full_model_path:
+                    raise ValueError(
+                        f"Model {checkpoint} is not available locally."
+                        f"Cannot download in offline mode."
+                    )
+
+            else:
+
+                snapshot_files = download_gguf(checkpoint)
+                full_model_path = snapshot_files["variant"]
+                model_to_use = os.path.basename(full_model_path)
 
-        if not model_binary:
-            model_to_use = state.get(Keys.MODEL)
+        llama_cli_exe_path = get_llama_cli_exe_path()
+        printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
 
-        if model_to_use is None:
-            raise Exception(
-                f"{self.__class__.unique_name} requires the preceding tool to pass a "
-                "Llamacpp model, "
-                "or for the user to supply a model with `--model-binary`"
-            )
+        # Get the directory containing the executable for shared libraries
+        lib_dir = os.path.dirname(llama_cli_exe_path)
 
+        # Pass the model and inputs into state
         state.model = LlamaCppAdapter(
-            model=model_to_use,
+            model=full_model_path,
+            device=device,
             output_tokens=output_tokens,
             context_size=context_size,
             threads=threads,
-            executable=executable,
+            executable=llama_cli_exe_path,
+            reasoning=reasoning,
            lib_dir=lib_dir,
         )
-        state.tokenizer = PassthroughTokenizer()
+        state.tokenizer = LlamaCppTokenizerAdapter()
+        state.device = device
 
-        # Save stats about the model
-        state.save_stat(Keys.CHECKPOINT, model_to_use)
-
-        # Get base model information if this is a converted HF model
-        base_model = get_base_model(input)
-        if base_model is not None:
-            state.save_stat("base_model", base_model)
+        # Save initial stats
+        state.save_stat(Keys.DEVICE, device)
+        state.save_stat(Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version())
 
         status.add_to_state(state=state, name=input, model=model_to_use)
-
         return state
 
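
The CHECKPOINT:VARIANT convention enforced above can be illustrated with a small sketch. parse_checkpoint itself lives in lemonade.tools.llamacpp.utils, which is not part of this diff, so the split shown here is an assumption about its behavior rather than its actual implementation.

    def split_checkpoint(checkpoint: str):
        # Hypothetical stand-in for parse_checkpoint: split on the last ":"
        # and report the variant as None when none was given.
        if ":" in checkpoint:
            base, variant = checkpoint.rsplit(":", 1)
            return base, variant
        return checkpoint, None

    print(split_checkpoint("Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0"))
    # ('Qwen/Qwen2.5-Coder-3B-Instruct-GGUF', 'Q4_0')
    print(split_checkpoint("Qwen/Qwen2.5-Coder-3B-Instruct-GGUF"))
    # ('Qwen/Qwen2.5-Coder-3B-Instruct-GGUF', None)
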