lemonade-sdk 8.1.7__py3-none-any.whl → 8.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic. Click here for more details.
- lemonade/cli.py +47 -5
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/tools/llamacpp/utils.py +15 -4
- lemonade/tools/oga/load.py +15 -2
- lemonade/tools/report/table.py +1 -1
- lemonade/tools/server/llamacpp.py +19 -13
- lemonade/tools/server/serve.py +39 -9
- lemonade/tools/server/static/js/chat.js +545 -242
- lemonade/tools/server/static/js/models.js +112 -24
- lemonade/tools/server/static/js/shared.js +15 -5
- lemonade/tools/server/static/styles.css +145 -75
- lemonade/tools/server/static/webapp.html +23 -27
- lemonade/tools/server/wrapped_server.py +8 -0
- lemonade/version.py +1 -1
- lemonade_install/install.py +15 -49
- {lemonade_sdk-8.1.7.dist-info → lemonade_sdk-8.1.9.dist-info}/METADATA +16 -64
- {lemonade_sdk-8.1.7.dist-info → lemonade_sdk-8.1.9.dist-info}/RECORD +26 -27
- lemonade_server/cli.py +12 -9
- lemonade_server/model_manager.py +48 -0
- lemonade_server/server_models.json +24 -6
- lemonade/tools/quark/__init__.py +0 -0
- lemonade/tools/quark/quark_load.py +0 -173
- lemonade/tools/quark/quark_quantize.py +0 -439
- {lemonade_sdk-8.1.7.dist-info → lemonade_sdk-8.1.9.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.7.dist-info → lemonade_sdk-8.1.9.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.7.dist-info → lemonade_sdk-8.1.9.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.7.dist-info → lemonade_sdk-8.1.9.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.7.dist-info → lemonade_sdk-8.1.9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
#
|
|
2
|
+
# This power profiler uses an external tool called HWiNFO.
|
|
3
|
+
# Please see the power profiling documentation for download and install instructions.
|
|
4
|
+
#
|
|
5
|
+
# The power profiling functionality is currently not part of our continuous integration
|
|
6
|
+
# testing framework, primarily due to the setup overhead of installing the external HWiNFO tool.
|
|
7
|
+
# We will revisit in the future if we face issues.
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
import ctypes
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
import os
|
|
13
|
+
import platform
|
|
14
|
+
import textwrap
|
|
15
|
+
import time
|
|
16
|
+
import subprocess
|
|
17
|
+
import psutil
|
|
18
|
+
import matplotlib.pyplot as plt
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
import lemonade.common.printing as printing
|
|
22
|
+
from lemonade.profilers import Profiler
|
|
23
|
+
from lemonade.tools.report.table import LemonadePerfTable, DictListStat
|
|
24
|
+
|
|
25
|
+
# Value passed to HWiNFO's -poll_rate option (milliseconds, per the constant's name)
DEFAULT_TRACK_POWER_INTERVAL_MS = 500
# Seconds slept after launching HWiNFO and again before terminating it
DEFAULT_TRACK_POWER_WARMUP_PERIOD = 5

# Environment variable that, when set, overrides DEFAULT_HWINFO_PATH
HWINFO_PATH_ENV_VAR = "HWINFO_PATH"
DEFAULT_HWINFO_PATH = r"C:\Program Files\HWiNFO64\HWiNFO64.exe"
# Base file names used for the logged sensor data and the generated plot
POWER_USAGE_CSV_FILENAME = "power_usage_hwinfo.csv"
POWER_USAGE_PNG_FILENAME = "power_usage_hwinfo.png"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Keys:
    """Names of the statistics this profiler saves to the build state."""

    # Path to the file containing the power usage plot
    POWER_USAGE_PLOT = "power_usage_plot_hwinfo"
    # Per-tool power statistics (name, duration, energy consumed, ...)
    POWER_USAGE_DATA = "power_usage_data_hwinfo"
    # Path to the CSV file containing the raw data logged by HWiNFO
    POWER_USAGE_DATA_CSV = "power_usage_data_file_hwinfo"
    # Maximum power consumed by the CPU processor package during the tools sequence
    PEAK_PROCESSOR_PACKAGE_POWER = "peak_processor_package_power_hwinfo"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Add column to the Lemonade performance report table for the power data.
# Each entry pairs a key of the per-tool statistics with the format used to print it.
LemonadePerfTable.table_descriptor["stat_columns"].append(
    DictListStat(
        "Power Usage (HWiNFO)",
        Keys.POWER_USAGE_DATA,
        [
            ("name", "{0}:"),
            ("duration", "{0:.1f}s,"),
            ("energy consumed", "{0:.1f} J"),
        ],
    )
)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def is_user_admin() -> bool:
    """Return True if the platform is Windows and the user is an Administrator.

    Returns False on every other platform, and also when the Windows shell
    API cannot be reached through ctypes.
    """
    if platform.system() != "Windows":
        return False

    try:
        return ctypes.windll.shell32.IsUserAnAdmin() == 1
    except AttributeError:
        # ctypes.windll (or the shell32 entry point) is not available
        return False
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def is_process_running(executable_name):
    """Check whether a process with the given executable name is running.

    The comparison is case-insensitive.

    Args:
        executable_name: file name of the executable, e.g. "HWiNFO64.exe".

    Returns:
        True if at least one running process has that name, False otherwise.
    """
    target = executable_name.lower()
    for process in psutil.process_iter(["pid", "name"]):
        # psutil reports None for attributes it could not read (e.g. access
        # denied), so fall back to an empty string instead of crashing
        process_name = process.info.get("name") or ""
        if process_name.lower() == target:
            return True
    return False
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def read_data_from_csv(
    csv_path, columns_dict, encoding="utf-8"
) -> "pd.DataFrame | None":
    """Read selected measurement columns from a CSV file.

    Args:
        csv_path: path of the CSV file to read.
        columns_dict: mapping from short column name to the column header used
            in the CSV file. Headers that are absent from the file are skipped.
        encoding: text encoding of the CSV file.

    Returns:
        A DataFrame holding the columns that were found, renamed to their short
        names, or None if the file does not exist or could not be parsed (a
        message is logged in both cases).
    """
    try:
        # Read only the header first, so that missing measurements are skipped
        # rather than making pandas reject the `usecols` request
        header = pd.read_csv(csv_path, nrows=0, encoding=encoding).columns
        columns_to_read = list(set(columns_dict.values()) & set(header))
        df = pd.read_csv(csv_path, usecols=columns_to_read, encoding=encoding)
    except FileNotFoundError as e:
        printing.log_info(f"Power profiler file not found: {e.filename}")
        return None
    except ValueError as e:
        printing.log_info(f"Error reading power data from {csv_path}: {e}")
        return None

    # Rename columns to simple name
    short_names = {
        full: short for short, full in columns_dict.items() if full in columns_to_read
    }
    return df.rename(columns=short_names)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class HWINFOPowerProfiler(Profiler):
    """Profiler that tracks power usage by running HWiNFO in logging mode.

    `start` launches the HWiNFO executable so that it logs sensor readings to a
    CSV file in the build directory, `stop` terminates it, and
    `generate_results` converts the CSV into per-tool energy statistics and a
    plot, which are saved to the build state.
    """

    unique_name = "power-hwinfo"

    # mapping from short name to full name of the measurement in the CSV file produced by HWiNFO
    columns_dict = {
        "time": "Time",
        "cpu_package_power": "CPU Package Power [W]",
        "npu_clock": "NPU Clock [MHz]",
        "gpu_clock": "GPU Clock [MHz]",
        "total_cpu_usage": "Total CPU Usage [%]",
        "apu_stapm_limit": "APU STAPM Limit [%]",
        "cpu_tdc_limit": "CPU TDC Limit [%]",
        "cpu_edc_limit": "CPU EDC Limit [%]",
        "cpu_ppt_fast_limit": "CPU PPT FAST Limit [%]",
        "cpu_ppt_slow_limit": "CPU PPT SLOW Limit [%]",
        "thermal_limit": "Thermal Limit [%]",
    }

    # Measurements without which no power statistics can be produced; every
    # other column is plotted only when HWiNFO actually logged it
    _required_columns = ("time", "cpu_package_power")

    @staticmethod
    def time_to_seconds(time_str):
        """Convert an "HH:MM:SS.ffffff" time string to seconds since midnight.

        Raises:
            ValueError: if `time_str` is not a string or does not match the format.
        """
        try:
            time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
        except TypeError as e:
            # Non-string cells (e.g. NaN from a truncated row) are reported the
            # same way as badly formatted strings
            raise ValueError(f"Could not parse {time_str}") from e

        return (
            time_obj.hour * 3600
            + time_obj.minute * 60
            + time_obj.second
            + time_obj.microsecond / 1_000_000
        )

    @staticmethod
    def add_arguments_to_parser(parser):
        """Register the `--power-hwinfo [WARMUP_PERIOD]` command line option."""
        parser.add_argument(
            f"--{HWINFOPowerProfiler.unique_name}",
            nargs="?",
            metavar="WARMUP_PERIOD",
            type=int,
            default=None,
            const=DEFAULT_TRACK_POWER_WARMUP_PERIOD,
            help="Track power consumption using the HWiNFO application and plot the results. "
            "HWiNFO is a commercial product from a third party (https://www.hwinfo.com/) "
            "and should be acquired/licensed appropriately. "
            "Optionally, set the warmup period in seconds "
            f"(default: {DEFAULT_TRACK_POWER_WARMUP_PERIOD}). If the application is not "
            f"installed at {DEFAULT_HWINFO_PATH}, set the {HWINFO_PATH_ENV_VAR} environment "
            "variable to point at it. This is a Windows only feature and Lemonade must be run "
            "from a CMD window with Administrator privileges.",
        )

    def __init__(self, parser_arg_value):
        """Store the warmup period (seconds) given on the command line."""
        super().__init__()
        self.warmup_period = parser_arg_value
        self.status_stats += [Keys.PEAK_PROCESSOR_PACKAGE_POWER, Keys.POWER_USAGE_PLOT]
        self.tracking_active = False
        self.build_dir = None
        self.csv_path = None
        self.hwinfo_process = None
        self.data = None

    def start(self, build_dir):
        """Launch HWiNFO so that it logs sensor data to a CSV file in `build_dir`.

        Blocks for the warmup period after the process has been started.

        Raises:
            RuntimeError: if tracking is already active, the platform is not
                Windows, the user is not an Administrator, the CSV path contains
                a space, or HWiNFO is already running.
            FileNotFoundError: if the HWiNFO executable does not exist.
            OSError: if the HWiNFO process could not be started.
        """
        if self.tracking_active:
            raise RuntimeError("Cannot start power tracking while already tracking")

        if platform.system() != "Windows":
            raise RuntimeError("Power usage tracking is only enabled in Windows.")

        # Check that the user is running in Admin mode
        if not is_user_admin():
            raise RuntimeError(
                "For power usage tracking, run Lemonade as an Administrator."
            )

        # Save the folder where data and plot will be stored
        self.build_dir = build_dir

        # The csv file where power data will be stored
        self.csv_path = os.path.join(build_dir, POWER_USAGE_CSV_FILENAME)
        if " " in self.csv_path:
            raise RuntimeError(
                "Can't log HWiNFO data to a file with a <space> in the path. "
                "Please use the `-d` flag to specify a Lemonade cache path with no spaces."
            )

        # Use the HWINFO_PATH environment variable, if set, instead of the default path
        hwinfo_path = os.environ.get(HWINFO_PATH_ENV_VAR, DEFAULT_HWINFO_PATH)

        # Check the HWINFO executable exists
        if not os.path.isfile(hwinfo_path):
            raise FileNotFoundError(hwinfo_path)

        # Check that executable is not already running. basename() copes with
        # both kinds of path separator, unlike splitting on os.sep
        executable = os.path.basename(hwinfo_path)
        if is_process_running(executable):
            raise RuntimeError(
                f"{executable} is already running. Quit it and try again."
            )

        # Start HWiNFO executable
        command = [
            hwinfo_path,
            f"-l{self.csv_path}",
            f"-poll_rate={DEFAULT_TRACK_POWER_INTERVAL_MS}",
        ]
        try:
            self.hwinfo_process = subprocess.Popen(
                command,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError as e:
            if "[WinError 740]" in str(e):
                print(
                    "\nTo avoid `requested operation requires elevation` error, please make sure"
                )
                print(
                    "HWiNFO.exe has Properties->Compatibility->`Run this program as an "
                    "administrator` checked."
                )
                print(
                    "You may also need to set Windows User Account Control to `Never notify`.\n"
                )
            raise
        self.tracking_active = True
        time.sleep(self.warmup_period)

    def stop(self):
        """Wait for the cool-down period, then terminate HWiNFO (no-op if not tracking)."""
        if self.tracking_active:
            self.tracking_active = False
            time.sleep(self.warmup_period)
            self.hwinfo_process.terminate()
            self.hwinfo_process.wait()

    @staticmethod
    def _stage_statistics(df, start_times, first_tool_time):
        """Integrate CPU package power over the time window of each tool.

        `df["time"]` must already be expressed relative to `first_tool_time`.
        Returns one dict per entry of `start_times`, ordered by start time.
        """
        tool_starts = sorted(start_times.items(), key=lambda item: item[1])

        # Adjust to common time frame as power measurements; a tool cannot
        # start before the first power sample
        first_sample_time = df["time"].iloc[0]
        tool_start_list = [
            max(first_sample_time, start - first_tool_time) for _, start in tool_starts
        ]
        # Each tool ends where the next one starts; the last ends with the data
        tool_stop_list = tool_start_list[1:] + [df["time"].values[-1]]

        # Extract power data time series
        x_time = df["time"].to_numpy()
        y_power = df["cpu_package_power"].to_numpy()

        # np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid
        integrate = getattr(np, "trapezoid", None) or np.trapz

        stages = []
        for (name, _), t0, tf in zip(tool_starts, tool_start_list, tool_stop_list):
            in_window = (x_time >= t0) & (x_time <= tf)
            # Add the exact window boundaries so the integral covers [t0, tf]
            x = np.concatenate(([t0], x_time[in_window], [tf]))
            y = np.interp(x, x_time, y_power)
            energy = integrate(y, x)
            duration = tf - t0
            # A zero-length stage has no meaningful energy/duration ratio;
            # report the interpolated power at that instant instead
            avg_power = energy / duration if duration != 0 else np.mean(y)
            stages.append(
                {
                    "name": name,
                    "t": x.tolist(),
                    "power": y.tolist(),
                    "duration": float(duration),
                    "energy consumed": float(energy),
                    "average power": float(avg_power),
                }
            )
        return stages

    def generate_results(self, state, timestamp, start_times):
        """Turn the logged CSV data into statistics and a plot saved to `state`.

        Args:
            state: build state; its `save_stat` method and `build_name`
                attribute are used.
            timestamp: prefix for the plot file name.
            start_times: mapping from tool name to start time (passed to
                `time.localtime`, so seconds since the epoch). May be empty.

        If HWiNFO was never started nothing happens. If the data cannot be read
        or parsed, the plot statistic is set to "NONE" and no other statistic
        is saved.
        """
        if self.hwinfo_process is None:
            return

        if self.tracking_active:
            self.stop()

        df = read_data_from_csv(self.csv_path, self.columns_dict, encoding="latin1")
        if df is None:
            state.save_stat(Keys.POWER_USAGE_PLOT, "NONE")
            return

        # read_data_from_csv silently drops measurements HWiNFO did not log, so
        # make sure the essential ones are present and contain samples
        missing = [col for col in self._required_columns if col not in df.columns]
        if missing or df.empty:
            printing.log_info(
                f"No usable power data in {self.csv_path} "
                f"(missing columns: {missing}, rows: {len(df)}). "
                f"HWiNFO may have closed unexpectedly."
            )
            state.save_stat(Keys.POWER_USAGE_PLOT, "NONE")
            return

        # Remap csv data time to elapsed seconds (i.e., subtract out initial time)
        try:
            initial_data_time = self.time_to_seconds(df["time"].iloc[0])
            df["time"] = df["time"].apply(
                lambda x: (self.time_to_seconds(x) - initial_data_time)
            )
        except ValueError as e:
            printing.log_info(
                f"Badly formatted time data in {self.csv_path}: {e}. "
                f"HWiNFO may have closed unexpectedly."
            )
            state.save_stat(Keys.POWER_USAGE_PLOT, "NONE")
            return

        # Make time 0 the time of the first tool starting (after the warmup period)
        first_tool_time = None
        if start_times:
            tool_start_times = sorted(start_times.values())
            # First tool after warmup (if no tools, then will be time of start of cool down).
            # Fall back to the only entry when there is just one.
            first_tool_time = tool_start_times[min(1, len(tool_start_times) - 1)]

            # Map the measurement data so that zero in the measurement data aligns with
            # the first_tool_time
            #
            # Find the difference between the timestamp first_tool_time and initial_data_time
            # which is a count of seconds since midnight
            t = time.localtime(first_tool_time)
            since_midnight = (
                t.tm_hour * 3600 + t.tm_min * 60 + t.tm_sec + (first_tool_time % 1)
            )
            delta = since_midnight - initial_data_time
            df["time"] = df["time"] - delta

        peak_power = df["cpu_package_power"].max()

        # Create a figure
        fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(16, 8))
        try:
            if start_times:
                # Extract data for each stage in the build
                self.data = self._stage_statistics(df, start_times, first_tool_time)

                for stage in self.data:
                    # Plot power usage time series
                    p = ax1.plot(
                        stage["t"],
                        stage["power"],
                        label=f"{stage['name']} ({stage['duration']:.1f}s, "
                        f"{stage['energy consumed']:0.1f} J)",
                    )
                    # Add a dashed line to show average power
                    ax1.plot(
                        [stage["t"][0], stage["t"][-1]],
                        [stage["average power"], stage["average power"]],
                        linestyle="--",
                        c=p[0].get_c(),
                    )
                    # Add average power text to plot
                    ax1.text(
                        stage["t"][0],
                        stage["average power"],
                        f"{stage['average power']:.1f} W ",
                        horizontalalignment="right",
                        verticalalignment="center",
                        c=p[0].get_c(),
                    )
            else:
                ax1.plot(
                    df["time"],
                    df["cpu_package_power"],
                )
            # Add title and labels to plots
            ax1.set_ylabel(self.columns_dict["cpu_package_power"])
            title_str = "HWiNFO Stats\n" + "\n".join(
                textwrap.wrap(state.build_name, 60)
            )
            ax1.set_title(title_str)
            # Only draw a legend when there is something labelled to show
            if ax1.get_legend_handles_labels()[0]:
                ax1.legend()
            ax1.grid(True)

            # Create second plot (clock frequencies, when they were logged)
            for col_str in ("npu_clock", "gpu_clock"):
                if col_str in df.columns:
                    ax2.plot(
                        df["time"],
                        df[col_str],
                        label=self.columns_dict[col_str],
                    )
            ax2.set_xlabel("Time [s]")
            ax2.set_ylabel("Clock Frequency [MHz]")
            if len(ax2.lines):
                ax2.legend(loc=2)
            ax2.grid(True)
            # Add second y-axis for %
            ax2_twin = ax2.twinx()
            if "total_cpu_usage" in df.columns:
                ax2_twin.plot(
                    df["time"],
                    df["total_cpu_usage"],
                    label=self.columns_dict["total_cpu_usage"],
                    c="g",
                )
            ax2_twin.set_ylim([0, 100])
            vals = ax2_twin.get_yticks()
            ax2_twin.set_yticks(vals)
            ax2_twin.set_yticklabels([f"{v:.0f}%" for v in vals])
            if len(ax2_twin.lines):
                ax2_twin.legend(loc=1)

            # Create third plot (all remaining columns)
            plot3_columns = [
                "apu_stapm_limit",
                "cpu_tdc_limit",
                "cpu_edc_limit",
                "cpu_ppt_fast_limit",
                "cpu_ppt_slow_limit",
                "thermal_limit",
            ]
            for col_str in plot3_columns:
                if col_str in df.columns:
                    ax3.plot(
                        df["time"],
                        df[col_str],
                        label=self.columns_dict[col_str],
                    )
            ax3.set_xlabel("Time [s]")
            ax3.set_ylim([0, 100])
            vals = ax3.get_yticks()
            ax3.set_yticks(vals)
            ax3.set_yticklabels([f"{v:.0f}%" for v in vals])
            if len(ax3.lines):
                ax3.legend()
            ax3.grid(True)

            # Save plot to current folder AND save to cache
            plot_path = os.path.join(
                self.build_dir, f"{timestamp}_{POWER_USAGE_PNG_FILENAME}"
            )
            fig.savefig(plot_path, dpi=300, bbox_inches="tight")
            plot_path = os.path.join(
                os.getcwd(), f"{timestamp}_{POWER_USAGE_PNG_FILENAME}"
            )
            fig.savefig(plot_path, dpi=300, bbox_inches="tight")
        finally:
            # pyplot keeps every figure alive until it is closed explicitly
            plt.close(fig)

        # The recorded plot path is the copy in the current working directory
        state.save_stat(Keys.POWER_USAGE_PLOT, plot_path)
        state.save_stat(Keys.POWER_USAGE_DATA, self.data)
        state.save_stat(Keys.POWER_USAGE_DATA_CSV, self.csv_path)
        state.save_stat(Keys.PEAK_PROCESSOR_PACKAGE_POWER, f"{peak_power:0.1f} W")
|
lemonade/tools/llamacpp/utils.py
CHANGED
|
@@ -14,8 +14,8 @@ from lemonade.common.system_info import get_system_info
|
|
|
14
14
|
|
|
15
15
|
from dotenv import set_key, load_dotenv
|
|
16
16
|
|
|
17
|
-
LLAMA_VERSION_VULKAN = "
|
|
18
|
-
LLAMA_VERSION_ROCM = "
|
|
17
|
+
LLAMA_VERSION_VULKAN = "b6431"
|
|
18
|
+
LLAMA_VERSION_ROCM = "b1057"
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def identify_rocm_arch_from_name(device_name: str) -> str | None:
|
|
@@ -500,7 +500,7 @@ def get_local_checkpoint_path(base_checkpoint, variant):
|
|
|
500
500
|
|
|
501
501
|
|
|
502
502
|
def identify_gguf_models(
|
|
503
|
-
checkpoint: str, variant: str, mmproj: str
|
|
503
|
+
checkpoint: str, variant: Optional[str], mmproj: str
|
|
504
504
|
) -> tuple[dict, list[str]]:
|
|
505
505
|
"""
|
|
506
506
|
Identifies the GGUF model files in the repository that match the variant.
|
|
@@ -510,12 +510,14 @@ def identify_gguf_models(
|
|
|
510
510
|
The CHECKPOINT:VARIANT scheme is used to specify model files in Hugging Face repositories.
|
|
511
511
|
|
|
512
512
|
The VARIANT format can be one of several types:
|
|
513
|
+
0. wildcard (*): download all files in the repo
|
|
513
514
|
1. Full filename: exact file to download
|
|
514
515
|
2. None/empty: gets the first .gguf file in the repository (excludes mmproj files)
|
|
515
516
|
3. Quantization variant: find a single file ending with the variant name (case insensitive)
|
|
516
517
|
4. Folder name: downloads all .gguf files in the folder that matches the variant name (case insensitive)
|
|
517
518
|
|
|
518
519
|
Examples:
|
|
520
|
+
- "ggml-org/gpt-oss-120b-GGUF:*" -> downloads all files in repo
|
|
519
521
|
- "unsloth/Qwen3-8B-GGUF:qwen3.gguf" -> downloads "qwen3.gguf"
|
|
520
522
|
- "unsloth/Qwen3-30B-A3B-GGUF" -> downloads "Qwen3-30B-A3B-GGUF.gguf"
|
|
521
523
|
- "unsloth/Qwen3-8B-GGUF:Q4_1" -> downloads "Qwen3-8B-GGUF-Q4_1.gguf"
|
|
@@ -527,8 +529,17 @@ def identify_gguf_models(
|
|
|
527
529
|
repo_files = list_repo_files(checkpoint)
|
|
528
530
|
sharded_files = []
|
|
529
531
|
|
|
532
|
+
# (case 0) Wildcard, download everything
|
|
533
|
+
if variant and variant == "*":
|
|
534
|
+
sharded_files = repo_files
|
|
535
|
+
|
|
536
|
+
# Sort to ensure consistent ordering
|
|
537
|
+
sharded_files.sort()
|
|
538
|
+
|
|
539
|
+
# Use first file as primary (this is how llamacpp handles it)
|
|
540
|
+
variant_name = sharded_files[0]
|
|
530
541
|
# (case 1) If variant ends in .gguf, use it directly
|
|
531
|
-
|
|
542
|
+
elif variant and variant.endswith(".gguf"):
|
|
532
543
|
variant_name = variant
|
|
533
544
|
if variant_name not in repo_files:
|
|
534
545
|
raise ValueError(
|
lemonade/tools/oga/load.py
CHANGED
|
@@ -74,6 +74,17 @@ def _get_npu_driver_version():
|
|
|
74
74
|
return None
|
|
75
75
|
|
|
76
76
|
|
|
77
|
+
def _compare_driver_versions(current_version, required_version):
|
|
78
|
+
"""
|
|
79
|
+
Compare two driver version strings.
|
|
80
|
+
Returns True if current_version >= required_version, False otherwise.
|
|
81
|
+
Uses packaging.version for proper semantic version comparison.
|
|
82
|
+
"""
|
|
83
|
+
from packaging.version import Version
|
|
84
|
+
|
|
85
|
+
return Version(current_version) >= Version(required_version)
|
|
86
|
+
|
|
87
|
+
|
|
77
88
|
def import_error_heler(e: Exception):
|
|
78
89
|
"""
|
|
79
90
|
Print a helpful message in the event of an import error
|
|
@@ -343,11 +354,13 @@ class OgaLoad(FirstTool):
|
|
|
343
354
|
)
|
|
344
355
|
_open_driver_install_page()
|
|
345
356
|
|
|
346
|
-
elif
|
|
357
|
+
elif not _compare_driver_versions(
|
|
358
|
+
current_driver_version, required_driver_version
|
|
359
|
+
):
|
|
347
360
|
printing.log_warning(
|
|
348
361
|
f"Incorrect NPU driver version detected: {current_driver_version}\n"
|
|
349
362
|
f"{device.upper()} inference with RyzenAI 1.5.0 requires driver "
|
|
350
|
-
f"version {required_driver_version}.\n"
|
|
363
|
+
f"version {required_driver_version} or higher.\n"
|
|
351
364
|
"Please download and install the correct NPU Driver from:\n"
|
|
352
365
|
f"{NPU_DRIVER_DOWNLOAD_URL}\n"
|
|
353
366
|
"NPU functionality may not work properly."
|
lemonade/tools/report/table.py
CHANGED
|
@@ -591,7 +591,7 @@ class LemonadePerfTable(Table):
|
|
|
591
591
|
_wrap("Total Generated Tokens", 9),
|
|
592
592
|
Keys.RESPONSE_TOKENS,
|
|
593
593
|
"d",
|
|
594
|
-
stat_fn=sum,
|
|
594
|
+
stat_fn=lambda x: sum(_to_list(x)),
|
|
595
595
|
),
|
|
596
596
|
SimpleStat(
|
|
597
597
|
_wrap("Memory Used (GB)", 8), Keys.MAX_MEMORY_USED_GBYTE, ".3f"
|
|
@@ -6,6 +6,7 @@ import threading
|
|
|
6
6
|
import platform
|
|
7
7
|
|
|
8
8
|
from dotenv import load_dotenv
|
|
9
|
+
from fastapi import HTTPException, status
|
|
9
10
|
|
|
10
11
|
from lemonade_server.pydantic_models import (
|
|
11
12
|
PullConfig,
|
|
@@ -28,6 +29,20 @@ class LlamaTelemetry(WrappedServerTelemetry):
|
|
|
28
29
|
Parse telemetry data from llama server output lines.
|
|
29
30
|
"""
|
|
30
31
|
|
|
32
|
+
if "vk::PhysicalDevice::createDevice: ErrorExtensionNotPresent" in line:
|
|
33
|
+
msg = (
|
|
34
|
+
"Your AMD GPU driver version is not compatible with this software.\n"
|
|
35
|
+
"Please update and try again: "
|
|
36
|
+
"https://www.amd.com/en/support/download/drivers.html"
|
|
37
|
+
)
|
|
38
|
+
logging.error(msg)
|
|
39
|
+
raise HTTPException(
|
|
40
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
41
|
+
detail=msg,
|
|
42
|
+
)
|
|
43
|
+
elif "error" in line.lower():
|
|
44
|
+
logging.error(line)
|
|
45
|
+
|
|
31
46
|
# Parse Vulkan device detection
|
|
32
47
|
vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
|
|
33
48
|
if vulkan_match:
|
|
@@ -147,21 +162,12 @@ class LlamaServer(WrappedServer):
|
|
|
147
162
|
# Add port and jinja to enable tool use
|
|
148
163
|
base_command.extend(["--port", str(self.port), "--jinja"])
|
|
149
164
|
|
|
150
|
-
#
|
|
151
|
-
|
|
152
|
-
self.backend == "vulkan"
|
|
153
|
-
and "gpt-oss-120b" in snapshot_files["variant"].lower()
|
|
154
|
-
):
|
|
155
|
-
base_command.remove("--jinja")
|
|
156
|
-
logging.warning(
|
|
157
|
-
"Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
|
|
158
|
-
"(see https://github.com/ggml-org/llama.cpp/issues/15274). "
|
|
159
|
-
"The model cannot use tools. If needed, use the ROCm backend instead."
|
|
160
|
-
)
|
|
165
|
+
# Enable context shift and avoid attention sink issues by preserving the initial tokens
|
|
166
|
+
base_command.extend(["--context-shift", "--keep", "16"])
|
|
161
167
|
|
|
162
168
|
# Use legacy reasoning formatting, since not all apps support the new
|
|
163
169
|
# reasoning_content field
|
|
164
|
-
base_command.extend(["--reasoning-format", "
|
|
170
|
+
base_command.extend(["--reasoning-format", "auto"])
|
|
165
171
|
|
|
166
172
|
# Add embeddings support if the model supports it
|
|
167
173
|
if supports_embeddings:
|
|
@@ -182,7 +188,7 @@ class LlamaServer(WrappedServer):
|
|
|
182
188
|
exe_dir = os.path.dirname(exe_path)
|
|
183
189
|
env_file_path = os.path.join(exe_dir, ".env")
|
|
184
190
|
if os.path.exists(env_file_path):
|
|
185
|
-
load_dotenv(env_file_path, override=
|
|
191
|
+
load_dotenv(env_file_path, override=False)
|
|
186
192
|
env.update(os.environ)
|
|
187
193
|
logging.debug(f"Loaded environment variables from {env_file_path}")
|
|
188
194
|
|
lemonade/tools/server/serve.py
CHANGED
|
@@ -133,6 +133,21 @@ class StopOnEvent:
|
|
|
133
133
|
return self.stop_event.is_set()
|
|
134
134
|
|
|
135
135
|
|
|
136
|
+
class NoCacheStaticFiles(StaticFiles):
    """Custom StaticFiles class with no-cache headers"""

    # Headers set on every response produced by file_response
    _NO_CACHE_HEADERS = (
        ("Cache-Control", "no-cache, no-store, must-revalidate"),
        ("Pragma", "no-cache"),
        ("Expires", "0"),
    )

    def file_response(self, *args, **kwargs) -> Response:
        """Build the parent's response, then add the no-cache headers to it."""
        response = super().file_response(*args, **kwargs)
        # Add no-cache headers for all static files
        for header_name, header_value in self._NO_CACHE_HEADERS:
            response.headers[header_name] = header_value
        return response
|
|
149
|
+
|
|
150
|
+
|
|
136
151
|
class Server:
|
|
137
152
|
"""
|
|
138
153
|
Open a web server that apps can use to communicate with the LLM.
|
|
@@ -198,7 +213,7 @@ class Server:
|
|
|
198
213
|
# as the Web App
|
|
199
214
|
static_dir = Path(__file__).parent / "static"
|
|
200
215
|
self.app.mount(
|
|
201
|
-
"/static",
|
|
216
|
+
"/static", NoCacheStaticFiles(directory=static_dir), name="static_assets"
|
|
202
217
|
)
|
|
203
218
|
|
|
204
219
|
# Performance stats that are set during /ws and can be
|
|
@@ -1145,18 +1160,33 @@ class Server:
|
|
|
1145
1160
|
)
|
|
1146
1161
|
self.input_tokens = len(input_ids[0])
|
|
1147
1162
|
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1163
|
+
max_prompt_length = self.ctx_size # Default fallback
|
|
1164
|
+
# For OGA models, try to read the actual max prompt length from config
|
|
1165
|
+
if "oga-" in self.llm_loaded.recipe:
|
|
1166
|
+
try:
|
|
1167
|
+
if model.config and model.config.get("max_prompt_length"):
|
|
1168
|
+
max_prompt_length = model.config["max_prompt_length"]
|
|
1169
|
+
logging.debug(
|
|
1170
|
+
f"Using OGA model max_prompt_length: {max_prompt_length}"
|
|
1171
|
+
)
|
|
1172
|
+
# pylint: disable=broad-exception-caught
|
|
1173
|
+
except Exception as e:
|
|
1174
|
+
logging.debug(f"Could not read OGA model config, using ctx_size: {e}")
|
|
1153
1175
|
|
|
1176
|
+
# Apply truncation if input exceeds the limit
|
|
1177
|
+
if self.input_tokens > max_prompt_length:
|
|
1178
|
+
# Truncate input ids
|
|
1179
|
+
truncate_amount = self.input_tokens - max_prompt_length
|
|
1180
|
+
input_ids = input_ids[:max_prompt_length]
|
|
1154
1181
|
# Update token count
|
|
1155
|
-
self.
|
|
1182
|
+
if "oga-" in self.llm_loaded.recipe:
|
|
1183
|
+
self.input_tokens = len(input_ids)
|
|
1184
|
+
else:
|
|
1185
|
+
self.input_tokens = len(input_ids[0])
|
|
1156
1186
|
|
|
1157
|
-
#
|
|
1187
|
+
# Log warning message instead of raising exception
|
|
1158
1188
|
truncation_message = (
|
|
1159
|
-
f"Input exceeded {
|
|
1189
|
+
f"Input exceeded {max_prompt_length} tokens. "
|
|
1160
1190
|
f"Truncated {truncate_amount} tokens from the beginning."
|
|
1161
1191
|
)
|
|
1162
1192
|
logging.warning(truncation_message)
|