lemonade-sdk 7.0.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- lemonade/__init__.py +5 -0
- lemonade/api.py +125 -0
- lemonade/cache.py +85 -0
- lemonade/cli.py +135 -0
- lemonade/common/__init__.py +0 -0
- lemonade/common/analyze_model.py +26 -0
- lemonade/common/build.py +223 -0
- lemonade/common/cli_helpers.py +139 -0
- lemonade/common/exceptions.py +98 -0
- lemonade/common/filesystem.py +368 -0
- lemonade/common/labels.py +61 -0
- lemonade/common/onnx_helpers.py +176 -0
- lemonade/common/plugins.py +10 -0
- lemonade/common/printing.py +110 -0
- lemonade/common/status.py +490 -0
- lemonade/common/system_info.py +390 -0
- lemonade/common/tensor_helpers.py +83 -0
- lemonade/common/test_helpers.py +28 -0
- lemonade/profilers/__init__.py +1 -0
- lemonade/profilers/memory_tracker.py +257 -0
- lemonade/profilers/profiler.py +55 -0
- lemonade/sequence.py +363 -0
- lemonade/state.py +159 -0
- lemonade/tools/__init__.py +1 -0
- lemonade/tools/adapter.py +104 -0
- lemonade/tools/bench.py +284 -0
- lemonade/tools/huggingface_bench.py +267 -0
- lemonade/tools/huggingface_load.py +520 -0
- lemonade/tools/humaneval.py +258 -0
- lemonade/tools/llamacpp.py +261 -0
- lemonade/tools/llamacpp_bench.py +154 -0
- lemonade/tools/management_tools.py +273 -0
- lemonade/tools/mmlu.py +327 -0
- lemonade/tools/ort_genai/__init__.py +0 -0
- lemonade/tools/ort_genai/oga.py +1129 -0
- lemonade/tools/ort_genai/oga_bench.py +142 -0
- lemonade/tools/perplexity.py +146 -0
- lemonade/tools/prompt.py +228 -0
- lemonade/tools/quark/__init__.py +0 -0
- lemonade/tools/quark/quark_load.py +172 -0
- lemonade/tools/quark/quark_quantize.py +439 -0
- lemonade/tools/report/__init__.py +0 -0
- lemonade/tools/report/llm_report.py +203 -0
- lemonade/tools/report/table.py +739 -0
- lemonade/tools/server/__init__.py +0 -0
- lemonade/tools/server/serve.py +1354 -0
- lemonade/tools/server/tool_calls.py +146 -0
- lemonade/tools/tool.py +374 -0
- lemonade/version.py +1 -0
- lemonade_install/__init__.py +1 -0
- lemonade_install/install.py +774 -0
- lemonade_sdk-7.0.0.dist-info/METADATA +116 -0
- lemonade_sdk-7.0.0.dist-info/RECORD +61 -0
- lemonade_sdk-7.0.0.dist-info/WHEEL +5 -0
- lemonade_sdk-7.0.0.dist-info/entry_points.txt +4 -0
- lemonade_sdk-7.0.0.dist-info/licenses/LICENSE +201 -0
- lemonade_sdk-7.0.0.dist-info/licenses/NOTICE.md +21 -0
- lemonade_sdk-7.0.0.dist-info/top_level.txt +3 -0
- lemonade_server/cli.py +260 -0
- lemonade_server/model_manager.py +98 -0
- lemonade_server/server_models.json +142 -0
lemonade/state.py
ADDED
```python
import os
import sys
from typing import Dict, Optional, Any
import yaml
import lemonade.common.build as build
import lemonade.common.filesystem as fs
from lemonade.version import __version__ as lemonade_version


def _is_nice_to_write(value):
    """
    Checks whether a value is nice to write to YAML.
    Returns True if the value is a string, int, float, bool, list, dict, or tuple.
    Returns False otherwise.
    """
    if isinstance(value, (str, int, float, bool)):
        return True
    elif isinstance(value, list) or isinstance(value, tuple):
        # Check if all elements in the list are nice to write
        return all(_is_nice_to_write(item) for item in value)
    elif isinstance(value, dict):
        # Check if all values in the dictionary are nice to write
        return all(_is_nice_to_write(item) for item in value.values())
    return False


def _sanitize_for_yaml(input_dict: Dict) -> Dict:
    """
    Creates a new dictionary containing only nice-to-write values
    from the original dictionary.
    """
    result = {}
    for key, value in input_dict.items():
        if _is_nice_to_write(value):
            result[key] = value
    return result


class State:
    """
    The State class is meant to carry build state, starting with the user's
    initial arguments, through each build Tool in the Sequence, and finally
    to the disk, where it is used to assess cache hits.

    State is initialized with the key members that are shared by every build,
    and reasonable default values are assigned as appropriate.

    Tool developers can also add any members they wish. To get or set an
    attribute, reference it as an attribute:
        1. get: `my_variable = state.attribute_name`
        2. set: `state.attribute_name = my_variable`

    Build State can be saved and loaded from disk in the form of a state.yaml file
    via State.save() and load_state(), respectively. Note that while State can
    contain members of any type, only YAML-safe members (str, int, bool, float,
    list, dict, tuple) will be saved and loaded.
    """

    def __init__(
        self,
        cache_dir: str,
        build_name: Optional[str] = None,
        sequence_info: Dict[str, Dict] = None,
        **kwargs,
    ):

        # The default model name is the name of the python file that calls build_model()
        if build_name is None:
            build_name = os.path.basename(sys.argv[0])

        # Support "~" in the cache_dir argument
        parsed_cache_dir = os.path.expanduser(cache_dir)

        # Save settings as State members
        self.cache_dir = parsed_cache_dir
        self.build_name = build_name
        self.sequence_info = sequence_info
        self.lemonade_version = lemonade_version
        self.build_status = build.FunctionStatus.NOT_STARTED
        self.downcast_applied = False
        self.uid = build.unique_id()
        self.results = None

        # Store any additional kwargs as members
        for key, value in kwargs.items():
            self.__dict__[key] = value

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Tool developers can add a new member to State by simply
        assigning it as an attribute, i.e., `state.new_member = value`.
        """
        return super().__setattr__(name, value)

    def save_stat(self, key: str, value):
        """
        Save statistics to a yaml file in the build directory
        """

        stats = fs.Stats(self.cache_dir, self.build_name)
        stats.save_stat(key, value)

    def save_sub_stat(self, parent_key: str, key: str, value):
        """
        Save statistics to a yaml file in the build directory
        """

        stats = fs.Stats(self.cache_dir, self.build_name)
        stats.save_sub_stat(parent_key, key, value)

    def save(self):
        """
        Save all YAML-friendly members to disk as a state.yaml file.

        Note that `model` and `inputs` will typically not be saved since
        they are typically in non-YAML-friendly types such as `torch.nn.Module`
        and `torch.tensor`.
        """

        state_to_save = _sanitize_for_yaml(vars(self))

        # Create a build directory in the cache
        fs.make_build_dir(self.cache_dir, self.build_name)

        with open(
            build.state_file(self.cache_dir, self.build_name),
            "w",
            encoding="utf8",
        ) as outfile:
            yaml.dump(state_to_save, outfile)


def load_state(
    cache_dir=None,
    build_name=None,
    state_path=None,
) -> State:
    """
    Read a state.yaml file corresponding to a specific build in a specific
    cache, and use its contents to initialize a State instance.
    """

    if state_path is not None:
        file_path = state_path
    elif build_name is not None and cache_dir is not None:
        file_path = build.state_file(cache_dir, build_name)
    else:
        raise ValueError(
            "This function requires either build_name and cache_dir to be set, "
            "or state_path to be set, not both or neither"
        )

    state_dict = build.load_yaml(file_path)

    return State(**state_dict)


# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD
```
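For orientation, here is a minimal sketch of how this State class is typically used; the cache path, build name, and the `checkpoint` attribute are illustrative, not part of the package.

```python
from lemonade.state import State, load_state

# Illustrative cache location and build name
state = State(cache_dir="~/.cache/lemonade", build_name="example_build")

# Tool developers can attach arbitrary members; only YAML-safe values
# (str, int, float, bool, list, dict, tuple) survive a save/load round trip
state.checkpoint = "example/checkpoint"  # str: written to state.yaml
state.model = object()                   # non-YAML-safe: skipped by _sanitize_for_yaml

# Persist to the build directory as state.yaml, then read it back
state.save()
reloaded = load_state(cache_dir=state.cache_dir, build_name="example_build")
print(reloaded.checkpoint)
```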
lemonade/tools/__init__.py
ADDED
```python
from .tool import Tool, FirstTool, NiceHelpFormatter
```
lemonade/tools/adapter.py
ADDED
```python
import abc
from transformers import AutoTokenizer


class ModelAdapter(abc.ABC):
    """
    Base class for adapting an LLM to work with lemonade's standardized tools
    """

    def __init__(self):
        """
        Self-benchmarking ModelAdapters can store their results in the
        tokens_per_second and time_to_first_token members.
        """
        self.tokens_per_second = None
        self.time_to_first_token = None
        self.type = "generic"

    @abc.abstractmethod
    def generate(self, input_ids, max_new_tokens=512):
        """
        Generate is the primary method required by lemonade's accuracy tools.

        We try to keep the signature here minimal to allow for maximum compatibility
        with recipe components, which themselves may not support a lot of arguments.
        """


class TokenizerAdapter(abc.ABC):
    """
    Base class for adapting an LLM's tokenizer to work with lemonade's standard tools
    """

    def __init__(self, tokenizer: AutoTokenizer = None):
        self.auto_tokenizer = tokenizer

    @abc.abstractmethod
    def __call__(self, prompt: str):
        """
        Args:
            prompt: text that should be encoded and passed to the LLM as input_ids

        Returns: input_ids
        """

    @abc.abstractmethod
    def decode(self, response) -> str:
        """
        Args:
            response: tokens from the LLM that should be decoded into text

        Returns: text response of the LLM
        """

    def apply_chat_template(self, *args, **kwargs):
        """
        Convert messages into a single tokenizable string
        """
        return self.auto_tokenizer.apply_chat_template(*args, **kwargs)

    @property
    def chat_template(self):
        return self.auto_tokenizer.chat_template

    @property
    def eos_token(self):
        return self.auto_tokenizer.eos_token


class PassthroughTokenizerResult:
    """
    Data structure for holding a tokenizer result where the input_ids
    are packaged in a non-standard way, but we still want to adhere to
    standard interfaces (e.g., result.input_ids).

    For example: CLI-based tools that have their own internal tokenizer that
    isn't exposed to the user. In this case we can pass the prompt through as
    a string.
    """

    def __init__(self, prompt):
        self.input_ids = prompt


class PassthroughTokenizer(TokenizerAdapter):
    """
    Tokenizer adapter that forwards the prompt to input_ids as text,
    and then forwards a text LLM response through decode() as text.

    Useful for CLI-based tools that have their own internal tokenizer that
    isn't exposed to the user.
    """

    # pylint: disable=unused-argument
    def __call__(self, prompt: str, **kwargs):
        return PassthroughTokenizerResult(prompt)

    # pylint: disable=unused-argument
    def decode(self, response: str, **kwargs):
        return response


# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD
```
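As a rough illustration of these adapter interfaces (not code from the package), a backend that simply echoes its input could be wired up as follows; `EchoAdapter` and its behavior are invented for the example.

```python
from lemonade.tools.adapter import ModelAdapter, PassthroughTokenizer


class EchoAdapter(ModelAdapter):
    """Toy adapter that 'generates' by echoing its input (illustrative only)."""

    def generate(self, input_ids, max_new_tokens=512):
        # A real adapter would call its inference backend here and could also
        # populate self.tokens_per_second / self.time_to_first_token
        return input_ids


tokenizer = PassthroughTokenizer()
model = EchoAdapter()

# PassthroughTokenizer forwards the prompt as plain text, so input_ids is a str
encoded = tokenizer("Hello, world")
response = model.generate(encoded.input_ids)
print(tokenizer.decode(response))  # -> Hello, world
```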
lemonade/tools/bench.py
ADDED
```python
from abc import ABC, abstractmethod
import argparse
import os
import platform
import psutil
from lemonade.state import State
from lemonade.tools import Tool
from lemonade.cache import Keys

default_iterations = 10
default_warmup_runs = 5
default_prompt_length = 64
default_output_tokens = 32
default_prompt = "Hello, I am conscious and"


class Bench(Tool, ABC):
    """
    Abstract parent class for tools that benchmark the performance of the generate()
    method of an LLM.
    """

    def __init__(self, monitor_message="Benchmarking LLM"):
        super().__init__(monitor_message)

        # The minimum set of statistics that a benchmark tool will produce
        # Inherited tools should append any additional statistics they generate to this list
        self.status_stats = [
            Keys.SECONDS_TO_FIRST_TOKEN,
            Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
            Keys.PREFILL_TOKENS_PER_SECOND,
            Keys.PROMPT_TOKENS,
            Keys.RESPONSE_TOKENS,
            Keys.MAX_MEMORY_USED_GBYTE,
        ]

        # Minimum per measurement statistics
        # Inherited tools should add additional lists for other per prompt statistics
        self.input_ids_len_list = []
        self.tokens_out_len_list = []
        self.mean_time_to_first_token_list = []
        self.std_dev_time_to_first_token_list = []
        self.prefill_tokens_per_second_list = []
        self.token_generation_tokens_per_second_list = []
        self.max_memory_used_gb_list = []

        # Max memory used can only be measured on Windows systems
        self.save_max_memory_used = platform.system() == "Windows"

        # This is set to True only for the duration of the first call to run_prompt
        self.first_run_prompt = None

    @staticmethod
    def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
        # Allow inherited classes to initialize and pass in a parser, add parameters to it if so
        if parser is None:
            parser = __class__.helpful_parser(
                short_description="Benchmark an LLM", add_help=add_help
            )

        parser.add_argument(
            "--iterations",
            "-i",
            required=False,
            type=int,
            default=default_iterations,
            help="Number of benchmarking iterations to run (default: "
            f"{default_iterations})",
        )

        parser.add_argument(
            "--warmup-iterations",
            "-w",
            required=False,
            type=int,
            default=default_warmup_runs,
            help="Number of benchmarking iterations to use for cache warmup "
            "(the results of these iterations "
            f"are not included in the results; default: {default_warmup_runs})",
        )

        parser.add_argument(
            "--prompts",
            "-p",
            nargs="+",
            required=False,
            default=[str(default_prompt_length)],
            metavar="PROMPT",
            help="Input one or more prompts to the LLM. Three formats are supported. "
            "1) integer: use a synthetic prompt with the specified length "
            "2) str: use a user-provided prompt string "
            "3) path/to/prompt.txt: load the prompt from a text file. "
            f"(default: {default_prompt_length}) ",
        )

        parser.add_argument(
            "--output-tokens",
            required=False,
            type=int,
            default=default_output_tokens,
            help="Number of new tokens the LLM should make (default: "
            f"{default_output_tokens})",
        )

        return parser

    def get_prompt_str(self, _state, token_length):
        """
        Returns a string with approximately the prescribed token length.
        Note: Actual token length is dependent on the tokenizer.
        """
        return "word " * (token_length - 1)

    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
        """
        Helper function to parse CLI arguments into the args expected by run()
        """

        parsed_args = super().parse(state, args, known_only)

        if parsed_args.prompts is None:
            parsed_args.prompts = [str(default_prompt_length)]

        # Decode prompt arg into a list of prompt strings
        prompt_strings = []
        for prompt_item in parsed_args.prompts:
            if prompt_item.isdigit():
                # Generate a prompt with the requested length
                token_length = int(prompt_item)
                prompt_strings.append(self.get_prompt_str(state, token_length))

            elif os.path.exists(prompt_item):
                with open(prompt_item, "r", encoding="utf-8") as f:
                    prompt_strings.append(f.read())

            else:
                # No change to the prompt
                prompt_strings.append(prompt_item)
        parsed_args.prompts = prompt_strings

        return parsed_args

    def run(
        self,
        state: State,
        prompts: list[str] = None,
        iterations: int = default_iterations,
        warmup_iterations: int = default_warmup_runs,
        output_tokens: int = default_output_tokens,
        **kwargs,
    ) -> State:
        """
        Args:
            - prompts: List of input prompts used as starting points for LLM text generation
            - iterations: number of benchmarking samples to take; results are
                reported as the median and mean of the samples.
            - warmup_iterations: subset of the iterations to treat as warmup,
                and not included in the results.
            - output_tokens: Number of new tokens the LLM should create.
            - kwargs: Additional parameters used by bench tools
        """

        if prompts is None:
            prompts = ["word " * (default_prompt_length - 2)]
        elif isinstance(prompts, str):
            prompts = [prompts]

        state.save_stat("prompts", prompts)
        state.save_stat("iterations", iterations)
        state.save_stat("warmup_iterations", warmup_iterations)
        state.save_stat("output_tokens", output_tokens)

        counter = 0
        report_progress_fn = lambda x: self.set_percent_progress(
            100 * (counter + x) / len(prompts)
        )
        self.first_run_prompt = True
        for counter, prompt in enumerate(prompts):
            report_progress_fn(0)

            self.run_prompt(
                state,
                report_progress_fn,
                prompt,
                iterations,
                warmup_iterations,
                output_tokens,
                **kwargs,
            )
            self.first_run_prompt = False

            if self.save_max_memory_used:
                self.max_memory_used_gb_list.append(
                    psutil.Process().memory_info().peak_wset / 1024**3
                )

        self.set_percent_progress(None)
        self.save_stats(state)

        return state

    @abstractmethod
    def run_prompt(
        self,
        state,
        report_progress_fn,
        prompt,
        iterations,
        warmup_iterations,
        output_tokens,
        **kwargs,
    ):
        pass

    @staticmethod
    def get_item_or_list(lst):
        """
        If the list is just a single item then return the item, else return the list
        """
        if len(lst) == 1:
            return lst[0]
        else:
            return lst

    def save_stats(self, state):
        # Save performance data to stats
        state.save_stat(
            Keys.PROMPT_TOKENS, self.get_item_or_list(self.input_ids_len_list)
        )
        state.save_stat(
            Keys.RESPONSE_TOKENS, self.get_item_or_list(self.tokens_out_len_list)
        )
        state.save_stat(
            Keys.SECONDS_TO_FIRST_TOKEN,
            self.get_item_or_list(self.mean_time_to_first_token_list),
        )
        if not all(
            element is None for element in self.std_dev_time_to_first_token_list
        ):
            state.save_stat(
                Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
                self.get_item_or_list(self.std_dev_time_to_first_token_list),
            )
        state.save_stat(
            Keys.PREFILL_TOKENS_PER_SECOND,
            self.get_item_or_list(self.prefill_tokens_per_second_list),
        )
        state.save_stat(
            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
            self.get_item_or_list(self.token_generation_tokens_per_second_list),
        )
        if self.save_max_memory_used:
            state.save_stat(
                Keys.MAX_MEMORY_USED_GBYTE,
                self.get_item_or_list(self.max_memory_used_gb_list),
            )

    @staticmethod
    def not_enough_tokens(output_tokens: int):
        """
        Raise an exception that explains why a benchmark did not produce any results
        """

        raise ValueError(
            "Your model was benchmarked, however none of the benchmarking "
            "iterations produced the requested amount of output tokens "
            f"(currently {output_tokens}), so "
            "the results have been discarded. You have the following options "
            "to solve this: \n"
            "1. Use the -p option to change the prompt to something that will "
            "produce more output tokens. For example, 'The extremely long "
            "story of my life, told in excruciating details is:' "
            "is an example of a prompt that will result in a lot of output. \n"
            "2. Set a lower value for --output-tokens to make it more likely "
            "that the model will produce enough. \n"
            "3. Set more verbose hyperparameters. \n"
            "4. Run more benchmarking iterations, to improve the chance of "
            "getting at least one with enough output tokens. \n"
        )


# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD
```
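To show how the pieces fit together, here is a skeletal `Bench` subclass; `DummyBench`, its placeholder timings, and the assumption that `iterations > warmup_iterations` are all illustrative, and a real tool (such as the huggingface, llama.cpp, or OGA bench tools listed above) would drive an actual model inside `run_prompt`.

```python
import time

from lemonade.tools.bench import Bench


class DummyBench(Bench):
    """Skeletal example subclass; a real tool would invoke a model here."""

    def run_prompt(
        self,
        state,
        report_progress_fn,
        prompt,
        iterations,
        warmup_iterations,
        output_tokens,
        **kwargs,
    ):
        ttft_samples = []
        for i in range(iterations):
            start = time.perf_counter()
            # ... run the model on `prompt`, requesting `output_tokens` tokens ...
            if i >= warmup_iterations:
                ttft_samples.append(time.perf_counter() - start)
            report_progress_fn((i + 1) / iterations)

        # Append per-prompt results to the lists that save_stats() reads
        # (assumes iterations > warmup_iterations so ttft_samples is non-empty)
        self.input_ids_len_list.append(len(prompt.split()))
        self.tokens_out_len_list.append(output_tokens)
        self.mean_time_to_first_token_list.append(
            sum(ttft_samples) / len(ttft_samples)
        )
        self.std_dev_time_to_first_token_list.append(None)
        self.prefill_tokens_per_second_list.append(0.0)
        self.token_generation_tokens_per_second_list.append(0.0)
```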