lemonade-sdk 7.0.4__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/api.py +3 -3
- lemonade/cli.py +11 -17
- lemonade/common/build.py +0 -47
- lemonade/common/network.py +50 -0
- lemonade/common/status.py +2 -21
- lemonade/common/system_info.py +19 -4
- lemonade/profilers/memory_tracker.py +3 -1
- lemonade/tools/accuracy.py +3 -4
- lemonade/tools/adapter.py +1 -2
- lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
- lemonade/tools/humaneval.py +9 -3
- lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
- lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
- lemonade/tools/mmlu.py +7 -15
- lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
- lemonade/tools/oga/utils.py +423 -0
- lemonade/tools/perplexity.py +4 -3
- lemonade/tools/prompt.py +2 -1
- lemonade/tools/quark/quark_load.py +2 -1
- lemonade/tools/quark/quark_quantize.py +5 -5
- lemonade/tools/report/table.py +3 -3
- lemonade/tools/server/llamacpp.py +154 -29
- lemonade/tools/server/serve.py +169 -146
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/styles.css +568 -0
- lemonade/tools/server/static/webapp.html +439 -0
- lemonade/tools/server/tray.py +458 -0
- lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
- lemonade/tools/server/utils/system_tray.py +395 -0
- lemonade/tools/server/{instructions.py → webapp.py} +4 -10
- lemonade/version.py +1 -1
- lemonade_install/install.py +46 -28
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/METADATA +84 -22
- lemonade_sdk-8.0.0.dist-info/RECORD +70 -0
- lemonade_server/cli.py +182 -27
- lemonade_server/model_manager.py +192 -20
- lemonade_server/pydantic_models.py +9 -4
- lemonade_server/server_models.json +5 -3
- lemonade/common/analyze_model.py +0 -26
- lemonade/common/labels.py +0 -61
- lemonade/common/onnx_helpers.py +0 -176
- lemonade/common/plugins.py +0 -10
- lemonade/common/tensor_helpers.py +0 -83
- lemonade/tools/server/static/instructions.html +0 -262
- lemonade_sdk-7.0.4.dist-info/RECORD +0 -69
- /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
- /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
- /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/top_level.txt +0 -0
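Most of the churn above is a reorganization of lemonade.tools into per-backend packages (huggingface/, llamacpp/, oga/), a new lemonade/common/network.py for connectivity helpers, and lazy imports of heavy dependencies. For downstream code, the renames visible in this listing imply import-path updates along the following lines; this is a sketch inferred only from the paths shown above, and any exported names not appearing in the diffs below are not confirmed by this listing.

# Before (7.0.4): loader, adapters, and network helpers lived in one module
# from lemonade.tools.huggingface_load import HuggingfaceAdapter, is_offline, get_base_model
# from lemonade.tools.llamacpp import LlamaCppAdapter

# After (8.0.0): per-backend packages plus a shared network module
from lemonade.tools.huggingface.load import HuggingfaceLoad
from lemonade.tools.huggingface.utils import HuggingfaceAdapter, HuggingfaceTokenizerAdapter
from lemonade.common.network import is_offline, get_base_model
from lemonade.tools.llamacpp.load import LlamaCppAdapter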
lemonade/tools/huggingface/load.py
ADDED
@@ -0,0 +1,235 @@
+import argparse
+from typing import Dict, Optional
+import json
+from lemonade.tools import FirstTool
+from lemonade.state import State
+import lemonade.common.status as status
+import lemonade.common.printing as printing
+from lemonade.cache import Keys
+
+
+class HuggingfaceLoad(FirstTool):
+    """
+    Load an LLM as a torch.nn.Module using the Hugging Face transformers
+    from_pretrained() API.
+
+    Expected input: a checkpoint to load
+
+    Output state produced:
+        - state.model: instance of torch.nn.Module that implements an LLM.
+        - state.inputs: tokenized example inputs to the model, in the form of a
+          dictionary of kwargs.
+        - state.tokenizer: instance of Hugging Face PretrainedTokenizer.
+        - state.dtype: data type of the model.
+        - state.checkpoint: pretrained checkpoint used to load the model.
+    """
+
+    unique_name = "huggingface-load"
+
+    def _imports(self):
+        pass
+
+    def __init__(self):
+        super().__init__(monitor_message="Loading Huggingface checkpoint")
+
+        self.status_stats = [Keys.DTYPE]
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Load an LLM in PyTorch using huggingface transformers",
+            add_help=add_help,
+        )
+
+        default_dtype = "float32"
+        parser.add_argument(
+            "--dtype",
+            "-d",
+            required=False,
+            default=default_dtype,
+            help=f"Data type to load the model in (default: {default_dtype}).",
+        )
+
+        choices = ["cpu", "cuda"]
+        for cuda in range(15):
+            choices.append(f"cuda:{cuda}")
+        parser.add_argument(
+            "--device",
+            required=False,
+            default=None,
+            choices=choices,
+            help="Move the model and inputs to a device using the .to() method "
+            "(default: don't call the .to() method)",
+        )
+
+        parser.add_argument(
+            "--load-kwargs",
+            required=False,
+            default="{}",
+            type=json.loads,
+            help="Arbitrary kwargs, in json format, that will be passed as "
+            "from_pretrained(**kwargs). "
+            r"Example: --load-kwargs='{\"trust_remote_code\": true} would result in "
+            "from_pretrained(trust_remote_code=True)",
+        )
+
+        parser.add_argument(
+            "--channels-last",
+            default=True,
+            type=bool,
+            help="Whether to format the model in memory using "
+            "channels-last (default: True)",
+        )
+
+        return parser
+
+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+
+        from lemonade.tools.huggingface.utils import str_to_dtype
+
+        parsed_args = super().parse(state, args, known_only)
+
+        # Save stats about the user's input (do this prior to decoding)
+        state.save_stat(Keys.CHECKPOINT, parsed_args.input)
+        state.save_stat(Keys.DTYPE, parsed_args.dtype)
+
+        # Decode dtype arg into a torch value
+        parsed_args.dtype = str_to_dtype[parsed_args.dtype]
+
+        return parsed_args
+
+    def run(
+        self,
+        state: State,
+        input: str = "",
+        dtype: "torch.dtype" = None,
+        device: Optional[str] = None,
+        load_kwargs: Optional[Dict] = None,
+        channels_last: bool = True,
+    ) -> State:
+        # Import expensive modules at runtime
+        import transformers
+        import torch
+
+        from lemonade.tools.huggingface.utils import (
+            HuggingfaceTokenizerAdapter,
+            HuggingfaceAdapter,
+        )
+        from lemonade.common.network import (
+            is_offline,
+            get_base_model,
+        )
+
+        # Set default dtype
+        if dtype is None:
+            dtype_to_use = torch.float32
+        else:
+            dtype_to_use = dtype
+
+        # Auto-detect offline status
+        offline = is_offline()
+        if offline:
+            printing.log_warning(
+                "Network connectivity to huggingface.co not detected. Running in offline mode."
+            )
+
+        checkpoint = input
+
+        if load_kwargs is None:
+            load_kwargs_to_use = {}
+        else:
+            load_kwargs_to_use = load_kwargs
+
+        # Add local_files_only to kwargs in offline mode
+        if offline:
+            load_kwargs_to_use["local_files_only"] = True
+
+        if vars(state).get(Keys.MODEL):
+            raise ValueError("HuggingfaceLoad must be the first tool in the sequence")
+
+        try:
+            model = transformers.AutoModelForCausalLM.from_pretrained(
+                checkpoint,
+                torch_dtype=dtype_to_use,
+                low_cpu_mem_usage=True,
+                **load_kwargs_to_use,
+            )
+        except Exception as e:
+            if offline and "Can't load config for" in str(e):
+                raise ValueError(
+                    f"Cannot load model {checkpoint} in offline mode. "
+                    f"The model files may not be available locally. Original error: {str(e)}"
+                )
+            raise
+
+        # Only call the model.to() method if an argument to this function
+        # provides a reason to do so
+        to_args = {}
+        if channels_last:
+            to_args["memory_format"] = torch.channels_last
+        if device:
+            to_args["device"] = device
+        if to_args:
+            model.to(**to_args)
+
+        model = model.eval()
+
+        try:
+            tokenizer_kwargs = {
+                "use_fast": False,
+                "model_max_length": 4096,
+                "padding_side": "left",
+            }
+            if offline:
+                tokenizer_kwargs["local_files_only"] = True
+
+            tokenizer = transformers.AutoTokenizer.from_pretrained(
+                checkpoint, **tokenizer_kwargs
+            )
+        except ValueError as e:
+            # Sometimes those specific tokenizer flags are not supported, in which
+            # case we try to just load a simple tokenizer
+            tokenizer_kwargs = {}
+            if offline:
+                tokenizer_kwargs["local_files_only"] = True
+
+            try:
+                tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    checkpoint, **tokenizer_kwargs
+                )
+            except Exception as e:
+                if offline and "Can't load tokenizer for" in str(e):
+                    raise ValueError(
+                        f"Cannot load tokenizer for {checkpoint} in offline mode. "
+                        f"The tokenizer files may not be available locally. "
+                        f"Original error: {str(e)}"
+                    )
+                raise
+
+        # Pass the model and inputs into state
+        state.model = HuggingfaceAdapter(model, dtype_to_use, device, tokenizer)
+
+        state.tokenizer = HuggingfaceTokenizerAdapter(tokenizer, device)
+        state.dtype = dtype_to_use
+        state.checkpoint = checkpoint
+        state.device = device
+
+        # Save stats about the model
+        state.save_stat(Keys.CHECKPOINT, checkpoint)
+        state.save_stat(Keys.DTYPE, str(dtype_to_use).split(".")[1])
+        state.save_stat(Keys.DEVICE, device)
+
+        # Get base model information
+        base_model = get_base_model(checkpoint)
+        if base_model is not None:
+            state.save_stat("base_model", base_model)
+
+        # Create a UniqueInvocationInfo and ModelInfo so that we can display status
+        # at the end of the sequence
+        status.add_to_state(state=state, name=input, model=model)
+
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
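The offline handling in the new loader reduces to a simple pattern: probe huggingface.co, and if it is unreachable, force local_files_only=True on every from_pretrained() call so that transformers only reads the local cache. A minimal standalone sketch of that pattern follows; the checkpoint name and the probe helper are illustrative, not part of the package.

import socket

import torch
import transformers


def probably_offline(host: str = "huggingface.co") -> bool:
    # Failed DNS resolution is used as a cheap proxy for "no connectivity",
    # mirroring the is_offline() helper that moved to lemonade/common/network.py
    try:
        socket.gethostbyname(host)
        return False
    except socket.gaierror:
        return True


checkpoint = "facebook/opt-125m"  # illustrative; any Hugging Face causal LM checkpoint
load_kwargs = {"torch_dtype": torch.float32, "low_cpu_mem_usage": True}
if probably_offline():
    # Only read files already present in the local cache; never attempt a download
    load_kwargs["local_files_only"] = True

model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint, **load_kwargs).eval()
tokenizer = transformers.AutoTokenizer.from_pretrained(
    checkpoint, local_files_only=load_kwargs.get("local_files_only", False)
)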
lemonade/tools/{huggingface_load.py → huggingface/utils.py}
RENAMED
@@ -1,16 +1,12 @@
-import argparse
-
-import json
-import socket
+from typing import Dict, List, Tuple
+import time
+from contextlib import nullcontext
 import transformers
 import torch
-from huggingface_hub import model_info
 from lemonade.state import State
-
-
-from lemonade.tools import FirstTool
-from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
-from lemonade.cache import Keys
+from lemonade.tools.adapter import TokenizerAdapter
+from lemonade.tools.adapter import ModelAdapter
+from lemonade.tools.bench import Bench
 
 # Command line interfaces for tools will use string inputs for data
 # types, however the internal tool logic will need to know the actual
@@ -62,249 +58,6 @@ class HuggingfaceTokenizerAdapter(TokenizerAdapter):
         return self.tokenizer.save_pretrained(model_dir, **kwargs)
 
 
-def is_offline():
-    """
-    Check if the system is offline by attempting to connect to huggingface.co.
-
-    Returns:
-        bool: True if the system is offline (cannot connect to huggingface.co),
-        False otherwise.
-    """
-    try:
-        socket.gethostbyname("huggingface.co")
-        return False
-    except socket.gaierror:
-        return True
-
-
-def get_base_model(checkpoint: str) -> Optional[str]:
-    """
-    Get the base model information for a given checkpoint from the Hugging Face Hub.
-    Will auto-detect if we're offline and skip the network call in that case.
-
-    Args:
-        checkpoint: The model checkpoint to query
-
-    Returns:
-        The base model name if found, or None if not found or error occurs
-    """
-    # Skip network call in offline mode
-    if is_offline():
-        return None
-
-    try:
-        info = model_info(checkpoint)
-        if info.cardData and "base_model" in info.cardData:
-            if info.cardData["base_model"] is not None:
-                # This is a derived model
-                return info.cardData["base_model"]
-            else:
-                # This is itself a base model
-                return [checkpoint]
-    except Exception:  # pylint: disable=broad-except
-        pass
-    return None
-
-
-class HuggingfaceLoad(FirstTool):
-    """
-    Load an LLM as a torch.nn.Module using the Hugging Face transformers
-    from_pretrained() API.
-
-    Expected input: a checkpoint to load
-
-    Output state produced:
-        - state.model: instance of torch.nn.Module that implements an LLM.
-        - state.inputs: tokenized example inputs to the model, in the form of a
-          dictionary of kwargs.
-        - state.tokenizer: instance of Hugging Face PretrainedTokenizer.
-        - state.dtype: data type of the model.
-        - state.checkpoint: pretrained checkpoint used to load the model.
-    """
-
-    unique_name = "huggingface-load"
-
-    def __init__(self):
-        super().__init__(monitor_message="Loading Huggingface checkpoint")
-
-        self.status_stats = [Keys.DTYPE]
-
-    @staticmethod
-    def parser(add_help: bool = True) -> argparse.ArgumentParser:
-        parser = __class__.helpful_parser(
-            short_description="Load an LLM in PyTorch using huggingface transformers",
-            add_help=add_help,
-        )
-
-        default_dtype = "float32"
-        parser.add_argument(
-            "--dtype",
-            "-d",
-            required=False,
-            default=default_dtype,
-            help=f"Data type to load the model in (default: {default_dtype}).",
-        )
-
-        choices = ["cpu", "cuda"]
-        for cuda in range(15):
-            choices.append(f"cuda:{cuda}")
-        parser.add_argument(
-            "--device",
-            required=False,
-            default=None,
-            choices=choices,
-            help="Move the model and inputs to a device using the .to() method "
-            "(default: don't call the .to() method)",
-        )
-
-        parser.add_argument(
-            "--load-kwargs",
-            required=False,
-            default="{}",
-            type=json.loads,
-            help="Arbitrary kwargs, in json format, that will be passed as "
-            "from_pretrained(**kwargs). "
-            r"Example: --load-kwargs='{\"trust_remote_code\": true} would result in "
-            "from_pretrained(trust_remote_code=True)",
-        )
-
-        parser.add_argument(
-            "--channels-last",
-            default=True,
-            type=bool,
-            help="Whether to format the model in memory using "
-            "channels-last (default: True)",
-        )
-
-        return parser
-
-    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
-
-        parsed_args = super().parse(state, args, known_only)
-
-        # Save stats about the user's input (do this prior to decoding)
-        state.save_stat(Keys.CHECKPOINT, parsed_args.input)
-        state.save_stat(Keys.DTYPE, parsed_args.dtype)
-
-        # Decode dtype arg into a torch value
-        parsed_args.dtype = str_to_dtype[parsed_args.dtype]
-
-        return parsed_args
-
-    def run(
-        self,
-        state: State,
-        input: str = "",
-        dtype: torch.dtype = torch.float32,
-        device: Optional[str] = None,
-        load_kwargs: Optional[Dict] = None,
-        channels_last: bool = True,
-    ) -> State:
-        # Auto-detect offline status
-        offline = is_offline()
-        if offline:
-            printing.log_warning(
-                "Network connectivity to huggingface.co not detected. Running in offline mode."
-            )
-
-        checkpoint = input
-
-        if load_kwargs is None:
-            load_kwargs_to_use = {}
-        else:
-            load_kwargs_to_use = load_kwargs
-
-        # Add local_files_only to kwargs in offline mode
-        if offline:
-            load_kwargs_to_use["local_files_only"] = True
-
-        if vars(state).get(Keys.MODEL):
-            raise ValueError("HuggingfaceLoad must be the first tool in the sequence")
-
-        try:
-            model = transformers.AutoModelForCausalLM.from_pretrained(
-                checkpoint,
-                torch_dtype=dtype,
-                low_cpu_mem_usage=True,
-                **load_kwargs_to_use,
-            )
-        except Exception as e:
-            if offline and "Can't load config for" in str(e):
-                raise ValueError(
-                    f"Cannot load model {checkpoint} in offline mode. "
-                    f"The model files may not be available locally. Original error: {str(e)}"
-                )
-            raise
-
-        # Only call the model.to() method if an argument to this function
-        # provides a reason to do so
-        to_args = {}
-        if channels_last:
-            to_args["memory_format"] = torch.channels_last
-        if device:
-            to_args["device"] = device
-        if to_args:
-            model.to(**to_args)
-
-        model = model.eval()
-
-        try:
-            tokenizer_kwargs = {
-                "use_fast": False,
-                "model_max_length": 4096,
-                "padding_side": "left",
-            }
-            if offline:
-                tokenizer_kwargs["local_files_only"] = True
-
-            tokenizer = transformers.AutoTokenizer.from_pretrained(
-                checkpoint, **tokenizer_kwargs
-            )
-        except ValueError as e:
-            # Sometimes those specific tokenizer flags are not supported, in which
-            # case we try to just load a simple tokenizer
-            tokenizer_kwargs = {}
-            if offline:
-                tokenizer_kwargs["local_files_only"] = True
-
-            try:
-                tokenizer = transformers.AutoTokenizer.from_pretrained(
-                    checkpoint, **tokenizer_kwargs
-                )
-            except Exception as e:
-                if offline and "Can't load tokenizer for" in str(e):
-                    raise ValueError(
-                        f"Cannot load tokenizer for {checkpoint} in offline mode. "
-                        f"The tokenizer files may not be available locally. "
-                        f"Original error: {str(e)}"
-                    )
-                raise
-
-        # Pass the model and inputs into state
-        state.model = HuggingfaceAdapter(model, dtype, device, tokenizer)
-
-        state.tokenizer = HuggingfaceTokenizerAdapter(tokenizer, device)
-        state.dtype = dtype
-        state.checkpoint = checkpoint
-        state.device = device
-
-        # Save stats about the model
-        state.save_stat(Keys.CHECKPOINT, checkpoint)
-        state.save_stat(Keys.DTYPE, str(dtype).split(".")[1])
-        state.save_stat(Keys.DEVICE, device)
-
-        # Get base model information
-        base_model = get_base_model(checkpoint)
-        if base_model is not None:
-            state.save_stat("base_model", base_model)
-
-        # Create a UniqueInvocationInfo and ModelInfo so that we can display status
-        # at the end of the sequence
-        status.add_to_state(state=state, name=input, model=model)
-
-        return state
-
-
 class HuggingfaceAdapter(ModelAdapter):
     """
     Wrapper class for Huggingface LLMs that handle generation arguments
@@ -522,5 +275,84 @@ class HuggingfaceAdapter(ModelAdapter):
         return text_offset, token_log_probs, token_strings, top_logprobs_list
 
 
-
-
+def benchmark_huggingface_llm(
+    model: torch.nn.Module,
+    tokenizer,
+    input_ids,
+    dtype,
+    num_beams: int,
+    target_output_tokens: int,
+    iterations: int,
+    warmup_iterations: int,
+    report_progress_fn,
+) -> List[Tuple[float, int]]:
+
+    amp_enabled = True if (dtype == torch.float16 or dtype == torch.bfloat16) else False
+    # The "if amp_enabled else nullcontext()" is to get around a bug in PyTorch 2.1
+    # where torch.cpu.amp.autocast(enabled=False) does nothing
+    with (
+        torch.cpu.amp.autocast(enabled=amp_enabled, dtype=dtype)
+        if amp_enabled
+        else nullcontext()
+    ):
+
+        per_iteration_result = []
+        tokens_out_len_list = []
+
+        # Early stopping is only a valid parameter with multiple beams
+        early_stopping = num_beams > 1
+
+        with torch.no_grad(), torch.inference_mode():
+            # Don't capture time for warmup
+            for count in range(warmup_iterations):
+                outputs = model.generate(
+                    input_ids,
+                    num_beams=num_beams,
+                    max_new_tokens=target_output_tokens,
+                    min_new_tokens=target_output_tokens,
+                    early_stopping=early_stopping,
+                    pad_token_id=tokenizer.eos_token_id,
+                )
+                tokens_out_len_list.append(outputs.shape[1] - input_ids.shape[1])
+                report_progress_fn((count + 1) / (warmup_iterations + iterations))
+
+            for count in range(iterations):
+                # CUDA synchronization is required prior to GPU benchmarking
+                # This has no negative effect on CPU-only benchmarks, and is more robust than
+                # checking `model.device == "cuda"` since it applies to multi-GPU environments
+                # Synchronization is done before collecting the start time because this will
+                # ensure that the GPU has finished initialization tasks such as loading weights
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                start_time = time.perf_counter()
+
+                outputs = model.generate(
+                    input_ids,
+                    num_beams=num_beams,
+                    max_new_tokens=target_output_tokens,
+                    min_new_tokens=target_output_tokens,
+                    early_stopping=early_stopping,
+                    pad_token_id=tokenizer.eos_token_id,
+                )
+
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                end_time = time.perf_counter()
+
+                latency = end_time - start_time
+
+                token_len = outputs.shape[1] - input_ids.shape[1]
+                tokens_out_len_list.append(token_len)
+
+                # Only count an iteration if it produced enough tokens
+                if token_len >= target_output_tokens:
+                    per_iteration_result.append((latency, token_len))
+
+                report_progress_fn(
+                    (warmup_iterations + count + 1) / (warmup_iterations + iterations)
+                )
+
+        if not per_iteration_result:
+            raise Bench.not_enough_tokens(target_output_tokens)
+
+        return per_iteration_result, tokens_out_len_list
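Because benchmark_huggingface_llm takes an already-loaded model and tokenizer, it can be exercised on its own. A hedged usage sketch follows; only the function signature and return shape come from the diff above, while the checkpoint, prompt, and no-op progress callback are illustrative.

import torch
import transformers

from lemonade.tools.huggingface.utils import benchmark_huggingface_llm

checkpoint = "facebook/opt-125m"  # illustrative small model
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForCausalLM.from_pretrained(
    checkpoint, torch_dtype=torch.float32
).eval()

# Tokenize a fixed prompt so every iteration benchmarks the same input length
input_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids

results, token_counts = benchmark_huggingface_llm(
    model=model,
    tokenizer=tokenizer,
    input_ids=input_ids,
    dtype=torch.float32,
    num_beams=1,
    target_output_tokens=32,
    iterations=5,
    warmup_iterations=1,
    report_progress_fn=lambda fraction: None,  # no-op progress callback
)

# Each entry is (latency_seconds, tokens_generated) for one measured iteration
for latency, tokens in results:
    print(f"{tokens} tokens in {latency:.2f} s -> {tokens / latency:.1f} tok/s")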
lemonade/tools/humaneval.py
CHANGED
@@ -2,9 +2,7 @@ import argparse
 import os
 import csv
 from typing import Dict, Optional, Any
-
-from human_eval.data import write_jsonl, read_problems
-from human_eval.evaluation import evaluate_functional_correctness
+
 
 from lemonade.state import State
 from lemonade.tools import Tool
@@ -95,6 +93,7 @@ class AccuracyHumaneval(Tool):
         Returns:
             Updated state with evaluation results
         """
+
         # Validate required state components
         if not hasattr(state, "model") or not hasattr(state, "tokenizer"):
             raise ValueError("State must contain both 'model' and 'tokenizer'")
@@ -128,6 +127,9 @@
 
     def _download_dataset(self, output_path: str) -> None:
         """Download HumanEval dataset if not already present."""
+
+        import requests
+
         if os.path.exists(output_path):
             printing.log_info(f"Dataset already exists at: {output_path}")
             return
@@ -170,6 +172,10 @@
         Returns:
             Dictionary containing evaluation metrics
         """
+
+        from human_eval.data import write_jsonl, read_problems
+        from human_eval.evaluation import evaluate_functional_correctness
+
         dataset = read_problems(data_path)
 
         # Limit to first N problems

lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py}
RENAMED
@@ -3,7 +3,7 @@ import statistics
 from statistics import StatisticsError
 from lemonade.state import State
 from lemonade.cache import Keys
-from lemonade.tools.llamacpp import LlamaCppAdapter
+from lemonade.tools.llamacpp.load import LlamaCppAdapter
 from lemonade.tools.bench import Bench
 
 
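The humaneval.py and load.py changes above share one pattern worth calling out: imports of heavy or optional dependencies (human_eval, requests, transformers, torch) move from module level into the methods that need them, so importing the tool no longer requires those packages to be installed. A generic sketch of the pattern, with illustrative names:

class DatasetTool:
    """Tool whose optional dependency is only needed when it actually runs."""

    def download(self, url: str, output_path: str) -> None:
        # Deferred import: 'requests' is resolved only if download() is called,
        # not when this module is imported.
        import requests

        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(output_path, "wb") as file:
            file.write(response.content)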