lemonade-sdk 8.1.11__py3-none-any.whl → 8.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/cache.py +6 -1
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +0 -26
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/utils.py +70 -22
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +317 -21
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +49 -123
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +2 -6
- lemonade/tools/server/llamacpp.py +43 -2
- lemonade/tools/server/serve.py +354 -18
- lemonade/tools/server/static/js/chat.js +15 -77
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +440 -37
- lemonade/tools/server/static/js/shared.js +61 -8
- lemonade/tools/server/static/logs.html +157 -13
- lemonade/tools/server/static/styles.css +204 -0
- lemonade/tools/server/static/webapp.html +39 -1
- lemonade/version.py +1 -1
- lemonade_install/install.py +33 -579
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +6 -4
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/RECORD +38 -37
- lemonade_server/cli.py +10 -0
- lemonade_server/model_manager.py +172 -11
- lemonade_server/pydantic_models.py +3 -0
- lemonade_server/server_models.json +102 -66
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/cache.py
CHANGED
@@ -43,7 +43,11 @@ def build_name(input_name):
     """

     if os.path.isdir(input_name):
+        # Input is a folder so no good way to determine a model name
         input_name_sanitized = "local_model"
+    elif os.path.isfile(input_name):
+        # Use the filename without its extension
+        input_name_sanitized = os.path.splitext(os.path.basename(input_name))[0]
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")
@@ -63,8 +67,9 @@ class Keys:
     TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
-    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
+    STD_DEV_PREFILL_TOKENS_PER_SECOND = "std_dev_prefill_tokens_per_second"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
     PROMPT = "prompt"
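For reference, a minimal standalone sketch of the name-sanitization logic this change introduces in build_name (the helper name sanitize_input_name and the example paths are illustrative, not part of the package):

    import os

    def sanitize_input_name(input_name: str) -> str:
        # Folders get a generic name, files use their basename without the
        # extension, and anything else is treated as a checkpoint-style name.
        if os.path.isdir(input_name):
            return "local_model"
        elif os.path.isfile(input_name):
            return os.path.splitext(os.path.basename(input_name))[0]
        return input_name.replace("/", "_")

    # e.g. "models/llama-3.2-1b.gguf" -> "llama-3.2-1b" (when that file exists),
    # "meta-llama/Llama-3.2-1B" -> "meta-llama_Llama-3.2-1B"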
lemonade/common/status.py
CHANGED
@@ -112,10 +112,10 @@ class UniqueInvocationInfo(BasicInfo):
         if print_file_name:
             print(f"{self.script_name}{self.extension}:")

-        # Print invocation about the model (only applies to scripts, not ONNX files
+        # Print invocation about the model (only applies to scripts, not ONNX or GGUF files, nor
         # LLMs, which have no extension)
         if not (
-            self.extension
+            self.extension in [".onnx", ".gguf"]
             or self.extension == build.state_file_name
             or self.extension == ""
         ):
@@ -138,7 +138,7 @@ class UniqueInvocationInfo(BasicInfo):

         if self.depth == 0:
             print(f"{self.indent}\tLocation:\t{self.file}", end="")
-            if self.extension
+            if self.extension in [".onnx", ".gguf"]:
                 print()
             else:
                 print(f", line {self.line}")
@@ -314,7 +314,7 @@ class UniqueInvocationInfo(BasicInfo):
         Print information about a given model or submodel.
         """

-        if self.extension
+        if self.extension in [".onnx", ".gguf"] or self.extension == "":
             self.indent = "\t" * (2 * self.depth)
         else:
             self.indent = "\t" * (2 * self.depth + 1)
lemonade/common/system_info.py
CHANGED
@@ -1110,32 +1110,6 @@ class LinuxSystemInfo(SystemInfo):

         return ""

-    def _get_nvidia_vram_smi_linux(self) -> float:
-        """
-        Get NVIDIA GPU VRAM on Linux using nvidia-smi command.
-
-        Returns:
-            float: VRAM in GB, or 0.0 if detection fails
-        """
-        try:
-            output = (
-                subprocess.check_output(
-                    "nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits",
-                    shell=True,
-                    stderr=subprocess.DEVNULL,
-                )
-                .decode()
-                .strip()
-            )
-
-            # nvidia-smi returns memory in MB
-            vram_mb = int(output.split("\n")[0])
-            vram_gb = round(vram_mb / 1024, 1)
-            return vram_gb
-        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
-            pass
-        return 0.0
-
     @staticmethod
     def get_processor_name() -> str:
         """
lemonade/tools/accuracy.py
CHANGED
@@ -83,42 +83,116 @@ class LMEvalHarness(Tool):

         return parser

-    def
-        """
+    def _scale_metric(self, metric_name, value):
+        """
+        Scale metric value appropriately based on type and range
+
+        Args:
+            metric_name: Name of the metric (e.g., "acc,none", "ppl")
+            value: Numeric value of the metric
+
+        Returns:
+            tuple: (scaled_value, units, display_string)
+        """
+        fraction_metrics = {
+            "acc",
+            "accuracy",
+            "f1",
+            "exact_match",
+            "em",
+            "win_rate",
+            "recall",
+            "precision",
+            "rouge",
+            "bleu",
+            "meteor",
+            "bertscore",
+            "match",
+            "correct",
+            "pass",
+            "success_rate",
+        }
+
+        metric_base = metric_name.split(",")[0].lower()
+        is_fraction = any(
+            frac_metric in metric_base for frac_metric in fraction_metrics
+        )
+        is_in_unit_range = 0 <= value <= 1

-        if
+        if is_fraction and is_in_unit_range:
+            scaled_value = float(value) * 100
+            units = "%"
+            display_str = f"{value:.4f} ({scaled_value:.2f}%)"
+        else:
+            scaled_value = float(value)
+            units = "raw"
+            display_str = f"{value:.4f}"
+
+        return scaled_value, units, display_str
+
+    def _process_results(self, results_path, state):
+        """
+        Process evaluation results and save to state stats
+
+        Args:
+            results_path: Can be either a direct JSON file path or a directory path
+            state: State object to save metrics to
+        """
+        results_file_path = None
+
+        # Determine if this is a file or directory and find the JSON file
+        if os.path.isfile(results_path) and results_path.endswith(".json"):
+            # Direct JSON file path (modern format)
+            results_file_path = results_path
+        elif os.path.isdir(results_path):
+            # Look for model subdirectories
+            model_dirs = [
+                d
+                for d in os.listdir(results_path)
+                if os.path.isdir(os.path.join(results_path, d))
+            ]
+
+            if model_dirs:
+                # Format: results_dir/model_name/results_*.json
+                model_dir = os.path.join(results_path, model_dirs[0])
+                printing.log_info(f"Found model directory: {model_dir}")
+
+                results_files = [
+                    f
+                    for f in os.listdir(model_dir)
+                    if f.startswith("results_") and f.endswith(".json")
+                ]
+
+                if results_files:
+                    results_files.sort(reverse=True)
+                    results_file_path = os.path.join(model_dir, results_files[0])
+                else:
+                    printing.log_warning(f"No results files found in {model_dir}")
+                    return
+            else:
+                printing.log_warning(f"No model directories found in {results_path}")
+                return
+        else:
+            # Handle case where lm-eval adds timestamp to expected filename
+            results_dir = os.path.dirname(results_path)
+            if os.path.exists(results_dir):
+                json_files = [f for f in os.listdir(results_dir) if f.endswith(".json")]
+                if json_files:
+                    results_file_path = os.path.join(results_dir, json_files[0])
+                    printing.log_info(f"Found results file: {results_file_path}")
+                else:
+                    printing.log_warning(f"No JSON results file found in {results_dir}")
+                    return
+            else:
+                printing.log_warning(f"Results path not found at {results_path}")
+                return
+
+        if not results_file_path or not os.path.exists(results_file_path):
+            printing.log_warning(f"Results file not found at {results_file_path}")
             return

-        # Sort by timestamp
-        results_files.sort(reverse=True)
-        results_file_path = os.path.join(model_dir, results_files[0])
         printing.log_info(f"Processing results from {results_file_path}")

-        # Read and process results
         try:
             with open(results_file_path, "r", encoding="utf-8") as f:
                 results = json.load(f)
@@ -132,18 +206,21 @@ class LMEvalHarness(Tool):
                     if isinstance(value, (int, float)) and not metric.startswith(
                         "alias"
                     ):
-                        # Format metric name for stats
-                        clean_metric = metric.
+                        # Format metric name for stats - remove ,none suffix
+                        clean_metric = metric.split(",")[0]  # Remove ,none suffix
                         stat_name = f"lm_eval_{task_name}_{clean_metric}"

-                        #
+                        # Scale metric appropriately
+                        scaled_value, units, value_str = self._scale_metric(
+                            metric, value
+                        )
+                        display_str = f" {metric}: {value_str}"
+
+                        state.save_stat(stat_name, scaled_value)
+                        state.save_stat(f"{stat_name}_units", units)
                         self.status_stats.append(stat_name)

-                        printing.log_info(
-                            f" {metric}: {value:.4f} ({value*100:.2f}%)"
-                        )
+                        printing.log_info(display_str)

                 # Save summary metrics if available
                 avg_metrics = {}
@@ -167,12 +244,17 @@ class LMEvalHarness(Tool):
                     if values:
                         avg_value = sum(values) / len(values)
                         stat_name = f"lm_eval_average_{metric}"
-                        self.
-                            f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
+
+                        # Apply same scaling logic as individual metrics
+                        scaled_avg, units, value_str = self._scale_metric(
+                            metric, avg_value
                         )
+                        display_str = f"Average {metric}: {value_str}"
+
+                        state.save_stat(stat_name, scaled_avg)
+                        state.save_stat(f"{stat_name}_units", units)
+                        self.status_stats.append(stat_name)
+                        printing.log_info(display_str)

             except (IOError, json.JSONDecodeError) as e:
                 printing.log_error(f"Error processing results: {e}")
@@ -189,6 +271,20 @@ class LMEvalHarness(Tool):
         output_path: Optional[str] = None,
     ) -> State:

+        # Check if lm-eval is available
+        try:
+            # pylint: disable=unused-import
+            import lm_eval
+        except ImportError:
+            error_msg = (
+                "lm-eval-harness is required but not installed. "
+                "Please install it using one of the following commands:\n"
+                " pip install lemonade-sdk[dev]\n"
+                " pip install -e .[dev]\n"
+            )
+            printing.log_error(error_msg)
+            raise ImportError(error_msg)
+
         import requests
         from lemonade.tools.server.utils.thread import ServerRunner

@@ -261,7 +357,7 @@ class LMEvalHarness(Tool):
             raise RuntimeError("Failed to start the server")

         # Build API URL
-        results_file = os.path.join(output_path, f"{task}_results")
+        results_file = os.path.join(output_path, f"{task}_results.json")

         printing.log_info(f"Running lm-eval-harness on {task}...")

@@ -312,9 +408,8 @@ class LMEvalHarness(Tool):
                     "Results obtained successfully but couldn't display due to encoding issues"
                 )

-            # Process results from the
-            self._process_results(results_dir, state)
+            # Process results from the JSON file
+            self._process_results(results_file, state)

         except subprocess.CalledProcessError as e:
             printing.log_error(f"Error running lm-eval-harness: {e}")
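As a quick illustration of the scaling rule that _scale_metric applies, here is a standalone re-implementation sketch (the fraction_metrics set is abridged and the metric values are made up):

    def scale_metric(metric_name: str, value: float):
        # Accuracy-style fractions in [0, 1] are reported as percentages; everything else stays raw.
        fraction_metrics = {"acc", "accuracy", "f1", "exact_match", "em"}
        metric_base = metric_name.split(",")[0].lower()
        if any(m in metric_base for m in fraction_metrics) and 0 <= value <= 1:
            return value * 100, "%", f"{value:.4f} ({value * 100:.2f}%)"
        return float(value), "raw", f"{value:.4f}"

    print(scale_metric("acc,none", 0.8125))  # (81.25, '%', '0.8125 (81.25%)')
    print(scale_metric("ppl", 7.42))         # (7.42, 'raw', '7.4200')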
lemonade/tools/adapter.py
CHANGED
@@ -10,11 +10,14 @@ class ModelAdapter(abc.ABC):
         """
         Self-benchmarking ModelAdapters can store their results in the
         tokens_per_second and time_to_first_token members.
+        ModelAdapters that run generate in a different process can store the
+        peak memory used (bytes) by that process in the peak_wset member.
         """
         self.tokens_per_second = None
         self.time_to_first_token = None
         self.prompt_tokens = None
         self.response_tokens = None
+        self.peak_wset = None

         self.type = "generic"

@@ -27,7 +30,9 @@ class ModelAdapter(abc.ABC):
         with recipe components, which themselves may not support a lot of arguments.

         The generate method should store prompt and response lengths (in tokens)
-        in the prompt_tokens and response_tokens members.
+        in the prompt_tokens and response_tokens members. If a different process is used,
+        the generate method can also store the peak memory used by that process in the
+        peak_wset member.
         """

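The new peak_wset member holds a peak working-set measurement in bytes, the same quantity the benchmarking tools read via psutil (see the HuggingfaceBench change below). A minimal sketch of that measurement, assuming a Windows host since psutil only exposes peak_wset there:

    import os
    import psutil

    proc = psutil.Process()
    if os.name == "nt":
        # Peak working set of the current process, in bytes
        peak_wset_bytes = proc.memory_info().peak_wset
        print(f"Peak memory: {peak_wset_bytes / 1024**3:.2f} GB")
    else:
        print("peak_wset is not available on this platform")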
lemonade/tools/bench.py
CHANGED
@@ -2,7 +2,6 @@ from abc import ABC, abstractmethod
 import argparse
 import os
 import platform
-import psutil
 from lemonade.state import State
 from lemonade.tools import Tool
 from lemonade.cache import Keys
@@ -29,7 +28,9 @@ class Bench(Tool, ABC):
         Keys.SECONDS_TO_FIRST_TOKEN,
         Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
         Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+        Keys.STD_DEV_TOKENS_PER_SECOND,
         Keys.PREFILL_TOKENS_PER_SECOND,
+        Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
         Keys.PROMPT_TOKENS,
         Keys.RESPONSE_TOKENS,
         Keys.MAX_MEMORY_USED_GBYTE,
@@ -42,7 +43,9 @@ class Bench(Tool, ABC):
         self.mean_time_to_first_token_list = []
         self.std_dev_time_to_first_token_list = []
         self.prefill_tokens_per_second_list = []
+        self.std_dev_prefill_tokens_per_second_list = []
         self.token_generation_tokens_per_second_list = []
+        self.std_dev_token_generation_tokens_per_second_list = []
         self.max_memory_used_gb_list = []

         # Max memory used can only be measured on Windows systems
@@ -88,7 +91,7 @@ class Bench(Tool, ABC):
             default=[str(default_prompt_length)],
             metavar="PROMPT",
             help="Input one or more prompts to the LLM. Three formats are supported. "
-            "1) integer: use a synthetic prompt with the specified length "
+            "1) integer: use a synthetic prompt with the specified token length "
             "2) str: use a user-provided prompt string "
            "3) path/to/prompt.txt: load the prompt from a text file. "
             f"(default: {default_prompt_length}) ",
@@ -190,11 +193,6 @@ class Bench(Tool, ABC):
             )
             self.first_run_prompt = False

-            if self.save_max_memory_used:
-                self.max_memory_used_gb_list.append(
-                    psutil.Process().memory_info().peak_wset / 1024**3
-                )
-
         self.set_percent_progress(None)
         self.save_stats(state)

@@ -211,7 +209,10 @@ class Bench(Tool, ABC):
         output_tokens,
         **kwargs,
     ):
-
+        """
+        The run_prompt method should append the appropriate value to each of the per prompt
+        measurement statistics lists that are members of the Bench class.
+        """

     @staticmethod
     def get_item_or_list(lst):
@@ -246,10 +247,27 @@ class Bench(Tool, ABC):
             Keys.PREFILL_TOKENS_PER_SECOND,
             self.get_item_or_list(self.prefill_tokens_per_second_list),
         )
+        if not all(
+            element is None for element in self.std_dev_prefill_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+                self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+            )
         state.save_stat(
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             self.get_item_or_list(self.token_generation_tokens_per_second_list),
         )
+        if not all(
+            element is None
+            for element in self.std_dev_token_generation_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_TOKENS_PER_SECOND,
+                self.get_item_or_list(
+                    self.std_dev_token_generation_tokens_per_second_list
+                ),
+            )
         if self.save_max_memory_used:
             state.save_stat(
                 Keys.MAX_MEMORY_USED_GBYTE,
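The std-dev stats added above are only persisted when at least one measurement was actually recorded; a small sketch of the all-None guard used in save_stats (the list values are made up):

    std_devs = [None, None, 1.7]  # per-prompt std-dev values; None when a backend cannot report one

    # Mirrors the guard in Bench.save_stats: skip the stat entirely if nothing was measured
    if not all(element is None for element in std_devs):
        print("saving std_dev_prefill_tokens_per_second:", std_devs)
    else:
        print("no std-dev measurements; stat not saved")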
lemonade/tools/flm/utils.py
CHANGED
@@ -10,16 +10,46 @@ import time
 from typing import List, Optional

 import requests
-from packaging.version import Version
+from packaging.version import Version, InvalidVersion


+def get_flm_latest_version() -> Optional[str]:
+    """
+    Get and return the latest FLM version from "https://github.com/FastFlowLM/FastFlowLM/tags"
+    This uses the GitHub tags API.
+    """
+    url = "https://api.github.com/repos/FastFlowLM/FastFlowLM/tags"
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        tags = response.json()
+        if not tags:
+            return None
+        # Tags are sorted in reverse chronological order; find the first that looks like a version
+        for tag in tags:
+            tag_name = tag.get("name", "")
+            # Accept tags of the form v0.9.10, 0.9.10, etc.
+            if tag_name.startswith("v"):
+                version_candidate = tag_name[1:]
+            else:
+                version_candidate = tag_name
+            try:
+                # validate it's a version string
+                _ = Version(version_candidate)
+                return version_candidate
+            except InvalidVersion:
+                continue
+        return None
+    except requests.exceptions.RequestException as e:
+        logging.debug("Error retrieving latest FLM version: %s", e)
+        return None


 def check_flm_version() -> Optional[str]:
     """
     Check if FLM is installed and return version, or None if not available.
     """
+    latest_version_str = get_flm_latest_version()
     try:
         result = subprocess.run(
             ["flm", "version"],
@@ -34,11 +64,11 @@ def check_flm_version() -> Optional[str]:
         output = result.stdout.strip()
         if output.startswith("FLM v"):
             version_str = output[5:]  # Remove "FLM v" prefix
-            return version_str
-        return None
+            return version_str, latest_version_str
+        return None, latest_version_str

     except (subprocess.CalledProcessError, FileNotFoundError):
-        return None
+        return None, latest_version_str


 def refresh_environment():
@@ -76,31 +106,42 @@ def install_flm():
     If not, download and run the GUI installer, then wait for completion.
     """
     # Check current FLM installation
-    current_version = check_flm_version()
+    current_version, latest_version = check_flm_version()

-    if
+    if (
+        current_version
+        and latest_version
+        and Version(current_version) == Version(latest_version)
+    ):
         logging.info(
-            "FLM v%s is already installed and
+            "FLM v%s is already installed and is up to date (latest version: v%s).",
             current_version,
+            latest_version,
         )
         return

     if current_version:
+        if not latest_version:
+            logging.info(
+                "Unable to detect the latest FLM version; continuing with installed FLM v%s.",
+                current_version,
+            )
+            return
         logging.info(
-            "FLM v%s is installed but below
+            "FLM v%s is installed but below latest version v%s. Upgrading...",
             current_version,
+            latest_version,
         )
+        verysilent = True
     else:
-        logging.info(
-        )
+        logging.info("FLM not found. Installing FLM v%s or later...", latest_version)
+        verysilent = False

     # Download the installer
     # pylint: disable=line-too-long
     installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
     installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+    installer_args = [installer_path, "/VERYSILENT"] if verysilent else [installer_path]

     try:
         # Remove existing installer if present
@@ -123,13 +164,15 @@ def install_flm():
         # Launch the installer GUI
         logging.warning(
             "Launching FLM installer GUI. Please complete the installation..."
+            if not verysilent
+            else "Installing FLM..."
         )

         # Launch installer and wait for it to complete
         if os.name == "nt":  # Windows
-            process = subprocess.Popen(
+            process = subprocess.Popen(installer_args, shell=True)
         else:
-            process = subprocess.Popen(
+            process = subprocess.Popen(installer_args)

         # Wait for installer to complete
         process.wait()
@@ -150,8 +193,8 @@ def install_flm():
         # Verify installation
         max_retries = 10
         for attempt in range(max_retries):
-            new_version = check_flm_version()
-            if new_version and Version(new_version)
+            new_version, latest_version = check_flm_version()
+            if new_version and Version(new_version) == Version(latest_version):
                 logging.info("FLM v%s successfully installed and verified", new_version)
                 return

@@ -240,7 +283,12 @@ def get_flm_installed_models() -> List[str]:

         return installed_checkpoints

-    except (
+    except (
+        subprocess.CalledProcessError,
+        FileNotFoundError,
+        AttributeError,
+        NotADirectoryError,
+    ):
         # FLM not installed, not available, or output parsing failed
         return []

@@ -249,7 +297,7 @@ def is_flm_available() -> bool:
     """
     Check if FLM is available and meets minimum version requirements.
     """
-    current_version = check_flm_version()
-    return current_version is not None and Version(current_version)
+    current_version, latest_version = check_flm_version()
+    return current_version is not None and Version(current_version) == Version(
+        latest_version
     )

lemonade/tools/huggingface/bench.py
CHANGED
@@ -1,6 +1,7 @@
 import argparse
 import statistics
 from statistics import StatisticsError
+import psutil
 from lemonade.state import State
 from lemonade.cache import Keys
 from lemonade.tools.bench import Bench
@@ -75,7 +76,7 @@ class HuggingfaceBench(Bench):
         warmup_iterations: int,
         output_tokens: int,
         num_beams: int = default_beams,
-    )
+    ):
         """
         We don't have access to the internal timings of generate(), so time to first
         token (TTFT, aka prefill latency) and token/s are calculated using the following formulae:
@@ -176,6 +177,10 @@ class HuggingfaceBench(Bench):
         self.token_generation_tokens_per_second_list.append(
             (mean_token_len - 1) / mean_decode_latency
         )
+        if self.save_max_memory_used:
+            self.max_memory_used_gb_list.append(
+                psutil.Process().memory_info().peak_wset / 1024**3
+            )


 # This file was originally licensed under Apache 2.0. It has been modified.
|