lemonade-sdk 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +125 -0
  3. lemonade/cache.py +85 -0
  4. lemonade/cli.py +135 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/analyze_model.py +26 -0
  7. lemonade/common/build.py +223 -0
  8. lemonade/common/cli_helpers.py +139 -0
  9. lemonade/common/exceptions.py +98 -0
  10. lemonade/common/filesystem.py +368 -0
  11. lemonade/common/labels.py +61 -0
  12. lemonade/common/onnx_helpers.py +176 -0
  13. lemonade/common/plugins.py +10 -0
  14. lemonade/common/printing.py +110 -0
  15. lemonade/common/status.py +490 -0
  16. lemonade/common/system_info.py +390 -0
  17. lemonade/common/tensor_helpers.py +83 -0
  18. lemonade/common/test_helpers.py +28 -0
  19. lemonade/profilers/__init__.py +1 -0
  20. lemonade/profilers/memory_tracker.py +257 -0
  21. lemonade/profilers/profiler.py +55 -0
  22. lemonade/sequence.py +363 -0
  23. lemonade/state.py +159 -0
  24. lemonade/tools/__init__.py +1 -0
  25. lemonade/tools/adapter.py +104 -0
  26. lemonade/tools/bench.py +284 -0
  27. lemonade/tools/huggingface_bench.py +267 -0
  28. lemonade/tools/huggingface_load.py +520 -0
  29. lemonade/tools/humaneval.py +258 -0
  30. lemonade/tools/llamacpp.py +261 -0
  31. lemonade/tools/llamacpp_bench.py +154 -0
  32. lemonade/tools/management_tools.py +273 -0
  33. lemonade/tools/mmlu.py +327 -0
  34. lemonade/tools/ort_genai/__init__.py +0 -0
  35. lemonade/tools/ort_genai/oga.py +1129 -0
  36. lemonade/tools/ort_genai/oga_bench.py +142 -0
  37. lemonade/tools/perplexity.py +146 -0
  38. lemonade/tools/prompt.py +228 -0
  39. lemonade/tools/quark/__init__.py +0 -0
  40. lemonade/tools/quark/quark_load.py +172 -0
  41. lemonade/tools/quark/quark_quantize.py +439 -0
  42. lemonade/tools/report/__init__.py +0 -0
  43. lemonade/tools/report/llm_report.py +203 -0
  44. lemonade/tools/report/table.py +739 -0
  45. lemonade/tools/server/__init__.py +0 -0
  46. lemonade/tools/server/serve.py +1354 -0
  47. lemonade/tools/server/tool_calls.py +146 -0
  48. lemonade/tools/tool.py +374 -0
  49. lemonade/version.py +1 -0
  50. lemonade_install/__init__.py +1 -0
  51. lemonade_install/install.py +774 -0
  52. lemonade_sdk-7.0.0.dist-info/METADATA +116 -0
  53. lemonade_sdk-7.0.0.dist-info/RECORD +61 -0
  54. lemonade_sdk-7.0.0.dist-info/WHEEL +5 -0
  55. lemonade_sdk-7.0.0.dist-info/entry_points.txt +4 -0
  56. lemonade_sdk-7.0.0.dist-info/licenses/LICENSE +201 -0
  57. lemonade_sdk-7.0.0.dist-info/licenses/NOTICE.md +21 -0
  58. lemonade_sdk-7.0.0.dist-info/top_level.txt +3 -0
  59. lemonade_server/cli.py +260 -0
  60. lemonade_server/model_manager.py +98 -0
  61. lemonade_server/server_models.json +142 -0
lemonade/tools/humaneval.py
@@ -0,0 +1,258 @@
import argparse
import os
import csv
from typing import Dict, Optional, Any
import requests
from human_eval.data import write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness

from lemonade.state import State
from lemonade.tools import Tool
import lemonade.common.printing as printing
import lemonade.common.build as build


class AccuracyHumaneval(Tool):
    """
    HumanEval accuracy measurement tool.

    This tool evaluates language models on the HumanEval dataset, which consists of
    Python programming problems. It measures the model's ability to:
    1. Generate functionally correct code completions
    2. Pass unit tests for each programming problem

    Metrics:
    - pass@1: Percentage of problems solved with 1 generation attempt
    - pass@10: Percentage of problems solved within 10 generation attempts
    - pass@100: Percentage of problems solved within 100 generation attempts

    See docs/lemonade/humaneval_accuracy.md for more details
    """

    unique_name = "accuracy-humaneval"
    DATASET = "https://github.com/openai/human-eval/blob/master/data/HumanEval.jsonl.gz?raw=true"
    TOTAL_PROBLEMS = 164  # Total number of problems in the HumanEval dataset

    def __init__(self):
        super().__init__(monitor_message="Measuring accuracy with HumanEval")
        self.status_stats = []
        # Enable code evaluation for HumanEval
        os.environ["HF_ALLOW_CODE_EVAL"] = "1"

    @staticmethod
    def parser(add_help: bool = True) -> argparse.ArgumentParser:
        parser = __class__.helpful_parser(
            short_description="Measure coding accuracy with HumanEval",
            add_help=add_help,
        )
        parser.add_argument(
            "--k-samples",
            type=int,
            default=1,
            help="Number of completions to generate per prompt for pass@k calculation"
            " (default: %(default)s)",
        )
        parser.add_argument(
            "--first-n-samples",
            type=int,
            default=AccuracyHumaneval.TOTAL_PROBLEMS,
            help=f"Evaluate only the first N problems from the dataset (default: "
            f"%(default)s, evaluates all {AccuracyHumaneval.TOTAL_PROBLEMS} problems)",
        )
        parser.add_argument(
            "--timeout",
            type=float,
            default=30.0,
            help="Timeout in seconds for each test case (default: %(default)s)",
        )
        parser.add_argument(
            "--data-dir",
            type=str,
            default=None,
            help="Custom directory for dataset storage (default: %(default)s, "
            "uses <lemonade_cache_dir>/data/humaneval)",
        )
        return parser

    def run(
        self,
        state: State,
        data_dir: Optional[str] = None,
        k_samples: int = 1,
        first_n_samples: Optional[int] = TOTAL_PROBLEMS,
        timeout: float = 30.0,
    ) -> State:
        """
        Run HumanEval evaluation on the model.

        Args:
            state: Current state containing model and tokenizer
            data_dir: Optional custom directory for dataset storage
            k_samples: Number of completions to generate per prompt for pass@k calculation
            first_n_samples: Number of first N problems to evaluate
            timeout: Timeout in seconds for each test case

        Returns:
            Updated state with evaluation results
        """
        # Validate required state components
        if not hasattr(state, "model") or not hasattr(state, "tokenizer"):
            raise ValueError("State must contain both 'model' and 'tokenizer'")

        # Setup directories
        data_dir_to_use = data_dir or os.path.join(state.cache_dir, "data", "humaneval")
        data_path = os.path.join(data_dir_to_use, "HumanEval.jsonl.gz")
        model_results_dir = os.path.join(
            build.output_dir(state.cache_dir, state.build_name), "humaneval"
        )
        os.makedirs(model_results_dir, exist_ok=True)

        # Download dataset if needed
        self._download_dataset(data_path)

        # Run evaluation
        results = self._evaluate_model(
            state.model,
            state.tokenizer,
            data_path,
            k_samples,
            timeout,
            model_results_dir,
            first_n_samples,
        )

        # Save metrics
        self._save_metrics(state, results)

        return state

    def _download_dataset(self, output_path: str) -> None:
        """Download HumanEval dataset if not already present."""
        if os.path.exists(output_path):
            printing.log_info(f"Dataset already exists at: {output_path}")
            return

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        response = requests.get(self.DATASET, stream=True)

        if response.status_code == 200:
            with open(output_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            printing.log_info(f"Dataset downloaded successfully to: {output_path}")
        else:
            raise RuntimeError(
                f"Failed to download dataset. Status code: {response.status_code}"
            )

    def _evaluate_model(
        self,
        model: Any,
        tokenizer: Any,
        data_path: str,
        k_samples: int,
        timeout: float,
        results_dir: str,
        first_n_samples: Optional[int] = TOTAL_PROBLEMS,
    ) -> Dict[str, float]:
        """
        Evaluate model on HumanEval dataset.

        Args:
            model: The language model to evaluate
            tokenizer: The tokenizer for the model
            data_path: Path to the HumanEval dataset
            k_samples: Number of completions per prompt for pass@k calculation
            timeout: Test case timeout in seconds
            results_dir: Directory to save results
            first_n_samples: Number of first N problems to evaluate

        Returns:
            Dictionary containing evaluation metrics
        """
        dataset = read_problems(data_path)

        # Limit to first N problems
        dataset_keys = list(dataset.keys())[:first_n_samples]
        ignore_incomplete = True

        samples = []

        # Update Tool progress monitor
        self.set_percent_progress(0.0)
        questions_completed = 0
        number_of_questions = first_n_samples * k_samples

        # Save completions and expected answers
        csv_path = os.path.join(results_dir, "evaluation_results.csv")
        with open(
            csv_path, mode="w", newline="", encoding="utf-8", errors="replace"
        ) as file:
            writer = csv.writer(file)
            writer.writerow(["Prompt", "Completion", "Expected Answer"])

            for task_id in dataset_keys:
                try:
                    for _ in range(k_samples):
                        prompt = dataset[task_id]["prompt"]
                        expected = dataset[task_id]["canonical_solution"]

                        # Generate completion
                        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
                        completion = model.generate(
                            input_ids,
                            max_new_tokens=512,
                            do_sample=False,
                        )
                        completion_text = tokenizer.decode(
                            completion[0], skip_special_tokens=True
                        )

                        # Save results
                        samples.append(
                            {"task_id": task_id, "completion": completion_text}
                        )
                        writer.writerow([prompt, completion_text, expected])

                    # Update progress monitor after completing all samples for a question
                    questions_completed = questions_completed + 1
                    percent_completed = (
                        questions_completed / number_of_questions * 100
                    )
                    self.set_percent_progress(percent_completed)

                # pylint: disable=W0718
                except Exception as e:
                    printing.log_info(f"Error processing task {task_id}: {str(e)}")
                    continue

        # Save predictions and evaluate
        pred_path = os.path.join(results_dir, "humaneval_predictions.jsonl")
        write_jsonl(pred_path, samples)
        printing.log_info(f"Results saved in: {results_dir}")

        # Run functional correctness evaluation
        k_values = [k_samples]
        results = evaluate_functional_correctness(
            pred_path,
            k_values,
            n_workers=1,
            timeout=timeout,
            problem_file=data_path,
            ignore_incomplete=ignore_incomplete,
        )
        return results

    def _save_metrics(self, state: State, results: Dict[str, float]) -> None:
        """Save evaluation metrics to state."""
        for metric, value in results.items():
            metric_name = f"humaneval_{metric}"
            state.save_stat(
                metric_name, float(value) * 100 if value is not None else None
            )
            state.save_stat(f"{metric_name}_units", "%")
            self.status_stats.append(metric_name)


# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD
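For reference, the pass@k values returned by `evaluate_functional_correctness` use the unbiased estimator from the original HumanEval paper: for each problem, given n generated samples of which c pass the unit tests, pass@k = 1 - C(n-c, k)/C(n, k), averaged over problems. A minimal standalone sketch of that calculation (the helper below is illustrative, not part of this package):

from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate for one problem: n samples generated, c correct."""
    if n - c < k:
        return 1.0
    # 1 minus the probability that k samples drawn without replacement are all incorrect
    return 1.0 - comb(n - c, k) / comb(n, k)

# Example: 10 completions per prompt, 3 passed their unit tests
print(pass_at_k(n=10, c=3, k=1))   # 0.3
print(pass_at_k(n=10, c=3, k=10))  # 1.0 (at least one of the 10 is correct)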
lemonade/tools/llamacpp.py
@@ -0,0 +1,261 @@
import argparse
import os
from typing import Optional
import subprocess
from lemonade.state import State
import lemonade.common.status as status
from lemonade.tools import FirstTool
from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
from lemonade.cache import Keys
from lemonade.tools.huggingface_load import get_base_model


class LlamaCppAdapter(ModelAdapter):
    def __init__(self, model, output_tokens, context_size, threads, executable):
        super().__init__()

        self.model = os.path.normpath(model)
        self.output_tokens = output_tokens
        self.context_size = context_size
        self.threads = threads
        self.executable = os.path.normpath(executable)

    def generate(
        self,
        input_ids: str,
        max_new_tokens: Optional[int] = None,
        temperature: float = 0.8,
        top_p: float = 0.95,
        top_k: int = 40,
        return_raw: bool = False,
        **kwargs,  # pylint: disable=unused-argument
    ):
        """
        Pass a text prompt into the llamacpp inference CLI.

        The input_ids arg here should receive the original text that
        would normally be encoded by a tokenizer.

        Args:
            input_ids: The input text prompt
            max_new_tokens: Maximum number of tokens to generate
            temperature: Temperature for sampling (0.0 = greedy)
            top_p: Top-p sampling threshold
            top_k: Top-k sampling threshold
            return_raw: If True, returns the complete raw output including timing info
            **kwargs: Additional arguments (ignored)

        Returns:
            List containing a single string with the generated text, or raw output if
            return_raw=True
        """

        prompt = input_ids
        n_predict = max_new_tokens if max_new_tokens is not None else self.output_tokens

        cmd = [
            self.executable,
            "-m",
            self.model,
            "--ctx-size",
            str(self.context_size),
            "-n",
            str(n_predict),
            "-t",
            str(self.threads),
            "-p",
            prompt,
            "--temp",
            str(temperature),
            "--top-p",
            str(top_p),
            "--top-k",
            str(top_k),
            "-e",
            "-no-cnv",
        ]

        cmd = [str(m) for m in cmd]

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                encoding="utf-8",
                errors="replace",
            )

            raw_output, stderr = process.communicate(timeout=600)
            if process.returncode != 0:
                error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                error_msg += f"Command: {' '.join(cmd)}\n"
                error_msg += f"Error output:\n{stderr}\n"
                error_msg += f"Standard output:\n{raw_output}"
                raise Exception(error_msg)

            if raw_output is None:
                raise Exception("No output received from llama.cpp process")

            # Parse timing information
            for line in raw_output.splitlines():
                if "llama_perf_context_print: eval time =" in line:
                    parts = line.split("(")[1].strip()
                    parts = parts.split(",")
                    ms_per_token = float(parts[0].split("ms per token")[0].strip())
                    self.tokens_per_second = (
                        1000 / ms_per_token if ms_per_token > 0 else 0
                    )
                if "llama_perf_context_print: prompt eval time =" in line:
                    parts = line.split("=")[1].split("/")[0]
                    time_to_first_token_ms = float(parts.split("ms")[0].strip())
                    self.time_to_first_token = time_to_first_token_ms / 1000

            if return_raw:
                return [raw_output, stderr]

            # Find where the prompt ends and the generated text begins
            prompt_found = False
            output_text = ""
            prompt_first_line = prompt.split("\n")[0]
            for line in raw_output.splitlines():
                if prompt_first_line in line:
                    prompt_found = True
                if prompt_found:
                    line = line.replace("</s> [end of text]", "")
                    output_text = output_text + line

            if not prompt_found:
                raise Exception(
                    f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
                    "This usually means the model failed to process the prompt correctly.\n"
                    f"Raw output:\n{raw_output}\n"
                    f"Stderr:\n{stderr}"
                )

            # Return list containing the generated text
            return [output_text]

        except Exception as e:
            error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
            error_msg += f"Command: {' '.join(cmd)}"
            raise Exception(error_msg)


class LoadLlamaCpp(FirstTool):
    unique_name = "load-llama-cpp"

    def __init__(self):
        super().__init__(monitor_message="Loading llama.cpp model")

    @staticmethod
    def parser(add_help: bool = True) -> argparse.ArgumentParser:
        parser = __class__.helpful_parser(
            short_description="Wrap llama.cpp models with an API",
            add_help=add_help,
        )

        parser.add_argument(
            "--executable",
            required=True,
            type=str,
            help="Path to the llama.cpp executable (e.g., llama-cli or llama-cli.exe)",
        )

        default_threads = 1
        parser.add_argument(
            "--threads",
            required=False,
            type=int,
            default=default_threads,
            help=f"Number of threads to use for generation (default: {default_threads})",
        )

        context_size = 512
        parser.add_argument(
            "--context-size",
            required=False,
            type=int,
            default=context_size,
            help=f"Context size of the prompt (default: {context_size})",
        )

        output_tokens = 512
        parser.add_argument(
            "--output-tokens",
            required=False,
            type=int,
            default=output_tokens,
            help=f"Maximum number of output tokens the LLM should make (default: {output_tokens})",
        )

        parser.add_argument(
            "--model-binary",
            required=True,
            type=str,
            help="Path to a .gguf model file",
        )

        return parser

    def run(
        self,
        state: State,
        input: str = "",
        context_size: int = 512,
        threads: int = 1,
        output_tokens: int = 512,
        model_binary: Optional[str] = None,
        executable: str = None,
    ) -> State:
        """
        Load a llama.cpp model
        """

        if executable is None:
            raise Exception(f"{self.__class__.unique_name} requires an executable path")

        # Convert paths to platform-specific format
        executable = os.path.normpath(executable)

        if model_binary:
            model_to_use = os.path.normpath(model_binary)
        else:
            model_binary = input
            model_to_use = os.path.normpath(model_binary) if model_binary else None

        if not model_binary:
            model_to_use = state.get(Keys.MODEL)

        if model_to_use is None:
            raise Exception(
                f"{self.__class__.unique_name} requires the preceding tool to pass a "
                "Llamacpp model, "
                "or for the user to supply a model with `--model-binary`"
            )

        state.model = LlamaCppAdapter(
            model=model_to_use,
            output_tokens=output_tokens,
            context_size=context_size,
            threads=threads,
            executable=executable,
        )
        state.tokenizer = PassthroughTokenizer()

        # Save stats about the model
        state.save_stat(Keys.CHECKPOINT, model_to_use)

        # Get base model information if this is a converted HF model
        base_model = get_base_model(input)
        if base_model is not None:
            state.save_stat("base_model", base_model)

        status.add_to_state(state=state, name=input, model=model_to_use)

        return state


# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD
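A rough sketch of how the adapter could be driven directly from Python, outside the lemonade CLI. The paths below are placeholders, and this is an illustration based on the constructor and `generate` signature shown above rather than documented usage:

from lemonade.tools.llamacpp import LlamaCppAdapter

# Placeholder paths: point these at a real llama-cli build and a .gguf model file
adapter = LlamaCppAdapter(
    model="models/example.gguf",
    output_tokens=64,
    context_size=512,
    threads=4,
    executable="llama.cpp/build/bin/llama-cli",
)

# The adapter takes raw prompt text (no tokenizer), shells out to llama-cli,
# and returns a single-element list containing the generated text. When the
# llama_perf_context_print lines appear in the output, it also records
# tokens_per_second and time_to_first_token on the adapter instance.
text = adapter.generate("Write a haiku about lemons.", max_new_tokens=32)[0]
print(text)
print(getattr(adapter, "tokens_per_second", None))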
lemonade/tools/llamacpp_bench.py
@@ -0,0 +1,154 @@
import argparse
import statistics
from statistics import StatisticsError
from lemonade.state import State
from lemonade.cache import Keys
from lemonade.tools.llamacpp import LlamaCppAdapter
from lemonade.tools.bench import Bench


class LlamaCppBench(Bench):

    unique_name = "llama-cpp-bench"

    def __init__(self):
        super().__init__()

        # Additional statistics generated by this bench tool
        self.status_stats += [
            Keys.STD_DEV_TOKENS_PER_SECOND,
        ]
        self.std_dev_token_generation_tokens_per_second_list = []

    @staticmethod
    def parser(add_help: bool = True) -> argparse.ArgumentParser:
        parser = __class__.helpful_parser(
            short_description="Benchmark a llama.cpp model",
            add_help=add_help,
        )

        parser = Bench.parser(parser)

        return parser

    def run_prompt(
        self,
        state: State,
        report_progress_fn,
        prompt: str,
        iterations: int,
        warmup_iterations: int,
        output_tokens: int,
    ) -> State:
        """
        Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
        """

        if self.first_run_prompt:

            if not hasattr(state, "model") or not isinstance(
                state.model, LlamaCppAdapter
            ):
                raise Exception(
                    f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
                    "loaded first. Please run load-llama-cpp before this tool."
                )

        iteration_tokens_per_second = []
        iteration_time_to_first_token = []

        for iteration in range(iterations + warmup_iterations):
            try:
                # Use the adapter's generate method which already has the timeout
                # and error handling
                raw_output, stderr = state.model.generate(prompt, return_raw=True)

                # Parse the timing information from the output
                ms_per_token = None
                time_to_first_token_ms = None
                input_tokens = None

                # Look for timing in both stdout and stderr
                for output in [raw_output, stderr]:
                    for line in output.splitlines():
                        if "llama_perf_context_print: eval time =" in line:
                            parts = line.split("(")[1].strip()
                            parts = parts.split(",")
                            ms_per_token = float(
                                parts[0].split("ms per token")[0].strip()
                            )
                        if "llama_perf_context_print: prompt eval time =" in line:
                            parts = line.split("=")[1].split("/")
                            time_to_first_token_ms = float(
                                parts[0].split("ms")[0].strip()
                            )
                            input_tokens = int(parts[1].split("tokens")[0].strip())

                if ms_per_token is None or time_to_first_token_ms is None:
                    error_msg = (
                        "Could not find timing information in llama.cpp output.\n"
                    )
                    error_msg += "Raw output:\n" + raw_output + "\n"
                    error_msg += "Stderr:\n" + stderr
                    raise Exception(error_msg)

                # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0
                # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
                # as performance data for generating a few tokens is not relevant.
                tokens_per_second = 0
                if output_tokens > 5 and ms_per_token > 0:
                    tokens_per_second = 1000 / ms_per_token
                time_to_first_token = time_to_first_token_ms / 1000

                if iteration > warmup_iterations - 1:
                    iteration_tokens_per_second.append(tokens_per_second)
                    iteration_time_to_first_token.append(time_to_first_token)

                report_progress_fn((iteration + 1) / (warmup_iterations + iterations))

            except Exception as e:
                error_msg = f"Failed to run benchmark: {str(e)}"
                raise Exception(error_msg)

        self.input_ids_len_list.append(input_tokens)
        mean_time_to_first_token = statistics.mean(iteration_time_to_first_token)
        self.mean_time_to_first_token_list.append(mean_time_to_first_token)
        self.prefill_tokens_per_second_list.append(
            input_tokens / mean_time_to_first_token
        )
        self.token_generation_tokens_per_second_list.append(
            statistics.mean(iteration_tokens_per_second)
        )
        try:
            self.std_dev_time_to_first_token_list.append(
                statistics.stdev(iteration_time_to_first_token)
            )
        except StatisticsError:
            # Less than 2 measurements
            self.std_dev_time_to_first_token_list.append(None)
        try:
            self.std_dev_token_generation_tokens_per_second_list.append(
                statistics.stdev(iteration_tokens_per_second)
            )
        except StatisticsError:
            # Less than 2 measurements
            self.std_dev_token_generation_tokens_per_second_list.append(None)

    def save_stats(self, state):
        super().save_stats(state)

        # Save additional statistics
        if not all(
            element is None
            for element in self.std_dev_token_generation_tokens_per_second_list
        ):
            state.save_stat(
                Keys.STD_DEV_TOKENS_PER_SECOND,
                self.get_item_or_list(
                    self.std_dev_token_generation_tokens_per_second_list
                ),
            )


# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD
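The arithmetic the bench applies to the scraped perf values is straightforward; a worked example with illustrative numbers (not taken from a real run):

import statistics

ms_per_token = 20.0          # from the "eval time" line: "( 20.0 ms per token, ...)"
prompt_eval_time_ms = 250.0  # from the "prompt eval time" line
input_tokens = 12            # prompt token count reported on the same line

tokens_per_second = 1000 / ms_per_token               # 50.0 generated tokens/s
time_to_first_token = prompt_eval_time_ms / 1000      # 0.25 s
prefill_tokens_per_second = input_tokens / time_to_first_token  # 48.0 tokens/s

# Measured iterations are then aggregated with mean/stdev; stdev needs at least
# two samples, which is why the tool records None when only one iteration is measured.
measured = [49.1, 50.4, 50.0]
print(statistics.mean(measured), statistics.stdev(measured))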