lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic. Click here for more details.

Files changed (53) hide show
  1. lemonade/cache.py +6 -1
  2. lemonade/cli.py +47 -5
  3. lemonade/common/inference_engines.py +13 -4
  4. lemonade/common/status.py +4 -4
  5. lemonade/common/system_info.py +544 -1
  6. lemonade/profilers/agt_power.py +437 -0
  7. lemonade/profilers/hwinfo_power.py +429 -0
  8. lemonade/tools/accuracy.py +143 -48
  9. lemonade/tools/adapter.py +6 -1
  10. lemonade/tools/bench.py +26 -8
  11. lemonade/tools/flm/__init__.py +1 -0
  12. lemonade/tools/flm/utils.py +303 -0
  13. lemonade/tools/huggingface/bench.py +6 -1
  14. lemonade/tools/llamacpp/bench.py +146 -27
  15. lemonade/tools/llamacpp/load.py +30 -2
  16. lemonade/tools/llamacpp/utils.py +393 -33
  17. lemonade/tools/oga/bench.py +5 -26
  18. lemonade/tools/oga/load.py +60 -121
  19. lemonade/tools/oga/migration.py +403 -0
  20. lemonade/tools/report/table.py +76 -8
  21. lemonade/tools/server/flm.py +133 -0
  22. lemonade/tools/server/llamacpp.py +220 -553
  23. lemonade/tools/server/serve.py +684 -168
  24. lemonade/tools/server/static/js/chat.js +666 -342
  25. lemonade/tools/server/static/js/model-settings.js +24 -3
  26. lemonade/tools/server/static/js/models.js +597 -73
  27. lemonade/tools/server/static/js/shared.js +79 -14
  28. lemonade/tools/server/static/logs.html +191 -0
  29. lemonade/tools/server/static/styles.css +491 -66
  30. lemonade/tools/server/static/webapp.html +83 -31
  31. lemonade/tools/server/tray.py +158 -38
  32. lemonade/tools/server/utils/macos_tray.py +226 -0
  33. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  34. lemonade/tools/server/webapp.py +4 -1
  35. lemonade/tools/server/wrapped_server.py +559 -0
  36. lemonade/version.py +1 -1
  37. lemonade_install/install.py +54 -611
  38. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
  39. lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
  40. lemonade_server/cli.py +145 -37
  41. lemonade_server/model_manager.py +521 -37
  42. lemonade_server/pydantic_models.py +28 -1
  43. lemonade_server/server_models.json +246 -92
  44. lemonade_server/settings.py +39 -39
  45. lemonade/tools/quark/__init__.py +0 -0
  46. lemonade/tools/quark/quark_load.py +0 -173
  47. lemonade/tools/quark/quark_quantize.py +0 -439
  48. lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
  49. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  50. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  51. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  52. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  53. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,429 @@
1
+ #
2
+ # This power profiler uses an external tool called HWiNFO.
3
+ # Please see the power profiling documentation for download and install instructions.
4
+ #
5
+ # The power profiling functionality is currently not part of our continuous integration
6
+ # testing framework, primarily due to the setup overhead required from the above three items.
7
+ # We will revisit in the future if we face issues.
8
+ #
9
+
10
+ import ctypes
11
+ from datetime import datetime
12
+ import os
13
+ import platform
14
+ import textwrap
15
+ import time
16
+ import subprocess
17
+ import psutil
18
+ import matplotlib.pyplot as plt
19
+ import numpy as np
20
+ import pandas as pd
21
+ import lemonade.common.printing as printing
22
+ from lemonade.profilers import Profiler
23
+ from lemonade.tools.report.table import LemonadePerfTable, DictListStat
24
+
25
+ DEFAULT_TRACK_POWER_INTERVAL_MS = 500
26
+ DEFAULT_TRACK_POWER_WARMUP_PERIOD = 5
27
+
28
+ HWINFO_PATH_ENV_VAR = "HWINFO_PATH"
29
+ DEFAULT_HWINFO_PATH = r"C:\Program Files\HWiNFO64\HWiNFO64.exe"
30
+ POWER_USAGE_CSV_FILENAME = "power_usage_hwinfo.csv"
31
+ POWER_USAGE_PNG_FILENAME = "power_usage_hwinfo.png"
32
+
33
+
34
+ class Keys:
35
+ # Path to the file containing the power usage plot
36
+ POWER_USAGE_PLOT = "power_usage_plot_hwinfo"
37
+ # Per-tool-stage power usage data (name, duration, energy consumed)
38
+ POWER_USAGE_DATA = "power_usage_data_hwinfo"
39
+ # Path to the CSV file containing the raw power usage data
40
+ POWER_USAGE_DATA_CSV = "power_usage_data_file_hwinfo"
41
+ # Maximum power consumed by the CPU processor package during the tools sequence
42
+ PEAK_PROCESSOR_PACKAGE_POWER = "peak_processor_package_power_hwinfo"
43
+
44
+
45
+ # Add column to the Lemonade performance report table for the power data
46
+ LemonadePerfTable.table_descriptor["stat_columns"].append(
47
+ DictListStat(
48
+ "Power Usage (HWiNFO)",
49
+ Keys.POWER_USAGE_DATA,
50
+ [
51
+ ("name", "{0}:"),
52
+ ("duration", "{0:.1f}s,"),
53
+ ("energy consumed", "{0:.1f} J"),
54
+ ],
55
+ )
56
+ )
57
+
58
+
59
+ def is_user_admin() -> bool:
60
+ """Return true if platform is Windows and user is Admin"""
61
+ os_type = platform.system()
62
+ if os_type == "Windows":
63
+ try:
64
+ return ctypes.windll.shell32.IsUserAnAdmin() == 1
65
+ except AttributeError:
66
+ pass
67
+ return False
68
+
69
+
70
+ def is_process_running(executable_name):
71
+ """Checks if an executable is currently running."""
72
+ executable_name = executable_name.lower()
73
+ for process in psutil.process_iter(["pid", "name"]):
74
+ if process.info["name"].lower() == executable_name:
75
+ return True
76
+ return False
77
+
78
+
79
+ def read_data_from_csv(csv_path, columns_dict, encoding="utf-8") -> pd.DataFrame:
80
+ try:
81
+ available_columns = pd.read_csv(csv_path, nrows=0, encoding=encoding).columns
82
+ columns_to_read = list(set(columns_dict.values()) & set(available_columns))
83
+ df = pd.read_csv(csv_path, usecols=columns_to_read, encoding=encoding)
84
+ except FileNotFoundError as e:
85
+ printing.log_info(f"Power profiler file not found: {e.filename}")
86
+ return None
87
+ except ValueError as e:
88
+ printing.log_info(f"Error reading power data from {csv_path}: {e}")
89
+ return None
90
+
91
+ # Rename columns to simple name
92
+ df.rename(
93
+ columns={v: k for k, v in columns_dict.items() if v in columns_to_read},
94
+ inplace=True,
95
+ )
96
+
97
+ return df
98
+
99
+
100
+ class HWINFOPowerProfiler(Profiler):
101
+
102
+ unique_name = "power-hwinfo"
103
+
104
+ # mapping from short name to full name of the measurement in the CSV file produced by HWiNFO
105
+ columns_dict = {
106
+ "time": "Time",
107
+ "cpu_package_power": "CPU Package Power [W]",
108
+ "npu_clock": "NPU Clock [MHz]",
109
+ "gpu_clock": "GPU Clock [MHz]",
110
+ "total_cpu_usage": "Total CPU Usage [%]",
111
+ "apu_stapm_limit": "APU STAPM Limit [%]",
112
+ "cpu_tdc_limit": "CPU TDC Limit [%]",
113
+ "cpu_edc_limit": "CPU EDC Limit [%]",
114
+ "cpu_ppt_fast_limit": "CPU PPT FAST Limit [%]",
115
+ "cpu_ppt_slow_limit": "CPU PPT SLOW Limit [%]",
116
+ "thermal_limit": "Thermal Limit [%]",
117
+ }
118
+
119
+ @staticmethod
120
+ def time_to_seconds(time_str):
121
+ # Parse the time string
122
+ try:
123
+ time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
124
+ except TypeError:
125
+ raise ValueError(f"Could not parse {time_str}")
126
+
127
+ # Calculate the total seconds
128
+ total_seconds = (
129
+ time_obj.hour * 3600
130
+ + time_obj.minute * 60
131
+ + time_obj.second
132
+ + time_obj.microsecond / 1_000_000
133
+ )
134
+ return total_seconds
135
+
136
+ @staticmethod
137
+ def add_arguments_to_parser(parser):
138
+ parser.add_argument(
139
+ f"--{HWINFOPowerProfiler.unique_name}",
140
+ nargs="?",
141
+ metavar="WARMUP_PERIOD",
142
+ type=int,
143
+ default=None,
144
+ const=DEFAULT_TRACK_POWER_WARMUP_PERIOD,
145
+ help="Track power consumption using the HWiNFO application and plot the results. "
146
+ "HWiNFO is a commercial product from a third party (https://www.hwinfo.com/) "
147
+ "and should be acquired/licensed appropriately. "
148
+ "Optionally, set the warmup period in seconds "
149
+ f"(default: {DEFAULT_TRACK_POWER_WARMUP_PERIOD}). If the application is not "
150
+ f"installed at {DEFAULT_HWINFO_PATH}, set the {HWINFO_PATH_ENV_VAR} environment "
151
+ f"variable to point at it. This is a Windows only feature and Lemonade must be run "
152
+ f"from a CMD window with Administrator privileges.",
153
+ )
154
+
155
+ def __init__(self, parser_arg_value):
156
+ super().__init__()
157
+ self.warmup_period = parser_arg_value
158
+ self.status_stats += [Keys.PEAK_PROCESSOR_PACKAGE_POWER, Keys.POWER_USAGE_PLOT]
159
+ self.tracking_active = False
160
+ self.build_dir = None
161
+ self.csv_path = None
162
+ self.hwinfo_process = None
163
+ self.data = None
164
+
165
+ def start(self, build_dir):
166
+ if self.tracking_active:
167
+ raise RuntimeError("Cannot start power tracking while already tracking")
168
+
169
+ if platform.system() != "Windows":
170
+ raise RuntimeError("Power usage tracking is only enabled in Windows.")
171
+
172
+ # Check that user is running in Admin mode
173
+ if not is_user_admin():
174
+ raise RuntimeError(
175
+ "For power usage tracking, run Lemonade as an Administrator."
176
+ )
177
+
178
+ # Save the folder where data and plot will be stored
179
+ self.build_dir = build_dir
180
+
181
+ # The csv file where power data will be stored
182
+ self.csv_path = os.path.join(build_dir, POWER_USAGE_CSV_FILENAME)
183
+ if " " in self.csv_path:
184
+ raise RuntimeError(
185
+ "Can't log HWiNFO data to a file with a <space> in the path. "
186
+ "Please use the `-d` flag to specify a Lemonade cache path with no spaces."
187
+ )
188
+
189
+ # See if the HWINFO_PATH environment variables exists
190
+ # If so, use it instead of the default path
191
+ if HWINFO_PATH_ENV_VAR in os.environ:
192
+ hwinfo_path = os.getenv(HWINFO_PATH_ENV_VAR)
193
+ else:
194
+ hwinfo_path = DEFAULT_HWINFO_PATH
195
+
196
+ # Check the HWINFO executable exists
197
+ if not os.path.isfile(hwinfo_path):
198
+ raise FileNotFoundError(hwinfo_path)
199
+
200
+ # Check that executable is not already running
201
+ executable = hwinfo_path.split(os.sep)[-1]
202
+ if is_process_running(executable):
203
+ raise RuntimeError(
204
+ f"{executable} is already running. Quit it and try again."
205
+ )
206
+
207
+ # Start HWiNFO executable
208
+ try:
209
+ command = [
210
+ hwinfo_path,
211
+ f"-l{self.csv_path}",
212
+ f"-poll_rate={DEFAULT_TRACK_POWER_INTERVAL_MS}",
213
+ ]
214
+ self.hwinfo_process = subprocess.Popen(
215
+ command,
216
+ stdin=subprocess.PIPE,
217
+ stderr=subprocess.PIPE,
218
+ )
219
+ except OSError as e:
220
+ if "[WinError 740]" in str(e):
221
+ print(
222
+ "\nTo avoid `requested operation requires elevation` error, please make sure"
223
+ )
224
+ print(
225
+ "HWiNFO.exe has Properties->Compatibility->`Run this program as an "
226
+ "administrator` checked."
227
+ )
228
+ print(
229
+ "You may also need to set Windows User Account Control to `Never notify`.\n"
230
+ )
231
+ raise
232
+ self.tracking_active = True
233
+ time.sleep(self.warmup_period)
234
+
235
+ def stop(self):
236
+ if self.tracking_active:
237
+ self.tracking_active = False
238
+ time.sleep(self.warmup_period)
239
+ self.hwinfo_process.terminate()
240
+ self.hwinfo_process.wait()
241
+
242
+ def generate_results(self, state, timestamp, start_times):
243
+ if self.hwinfo_process is None:
244
+ return
245
+
246
+ if self.tracking_active:
247
+ self.stop()
248
+
249
+ df = read_data_from_csv(self.csv_path, self.columns_dict, encoding="latin1")
250
+ if df is None:
251
+ state.save_stat(Keys.POWER_USAGE_PLOT, "NONE")
252
+ return
253
+
254
+ # Remap time to seconds from start of profiling data
255
+ # Remap csv data time to elapsed seconds (i.e., subtract out initial time)
256
+ try:
257
+ initial_data_time = self.time_to_seconds(df["time"].iloc[0])
258
+ df["time"] = df["time"].apply(
259
+ lambda x: (self.time_to_seconds(x) - initial_data_time)
260
+ )
261
+ except ValueError as e:
262
+ printing.log_info(
263
+ f"Badly formatted time data in {self.csv_path}: {e}. "
264
+ f"HWiNFO may have closed unexpectedly."
265
+ )
266
+ state.save_stat(Keys.POWER_USAGE_PLOT, "NONE")
267
+ return
268
+
269
+ # Make time 0 the time of the first tool starting (after the warmup period)
270
+ if start_times:
271
+ tool_start_times = sorted(start_times.values())
272
+ # First tool after warmup (if no tools, then will be time of start of cool down)
273
+ first_tool_time = tool_start_times[1]
274
+
275
+ # Map the measurement data so that zero in the measurement data aligns with
276
+ # the first_tool_time
277
+ #
278
+ # Find the difference between the timestamp first_tool_time and initial_data_time
279
+ # which is a count of seconds since midnight
280
+ #
281
+ # Find midnight prior to first_tool_time
282
+ t = time.localtime(first_tool_time)
283
+ since_midnight = (
284
+ t.tm_hour * 3600 + t.tm_min * 60 + t.tm_sec + (first_tool_time % 1)
285
+ )
286
+ delta = since_midnight - initial_data_time
287
+ df["time"] = df["time"] - delta
288
+
289
+ peak_power = max(df["cpu_package_power"])
290
+
291
+ # Create a figure
292
+ fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(16, 8))
293
+
294
+ if start_times:
295
+ tool_starts = sorted(start_times.items(), key=lambda item: item[1])
296
+ tool_name_list = [item[0] for item in tool_starts]
297
+
298
+ # Adjust to common time frame as power measurements
299
+ tool_start_list = [
300
+ max(df["time"].iloc[0], item[1] - first_tool_time)
301
+ for item in tool_starts
302
+ ]
303
+ tool_stop_list = tool_start_list[1:] + [df["time"].values[-1]]
304
+
305
+ # Extract power data time series
306
+ x_time = df["time"].to_numpy()
307
+ y_power = df["cpu_package_power"].to_numpy()
308
+
309
+ # Extract data for each stage in the build
310
+ self.data = []
311
+ for name, t0, tf in zip(tool_name_list, tool_start_list, tool_stop_list):
312
+ x = x_time[(x_time >= t0) * (x_time <= tf)]
313
+ x = np.insert(x, 0, t0)
314
+ x = np.insert(x, len(x), tf)
315
+ y = np.interp(x, x_time, y_power)
316
+ energy = np.trapz(y, x)
317
+ avg_power = energy / (tf - t0)
318
+ stage = {
319
+ "name": name,
320
+ "t": x.tolist(),
321
+ "power": y.tolist(),
322
+ "duration": float(tf - t0),
323
+ "energy consumed": float(energy),
324
+ "average power": float(avg_power),
325
+ }
326
+ self.data.append(stage)
327
+
328
+ for stage in self.data:
329
+ # Plot power usage time series
330
+ p = ax1.plot(
331
+ stage["t"],
332
+ stage["power"],
333
+ label=f"{stage['name']} ({stage['duration']:.1f}s, "
334
+ f"{stage['energy consumed']:0.1f} J)",
335
+ )
336
+ # Add a dashed line to show average power
337
+ ax1.plot(
338
+ [stage["t"][0], stage["t"][-1]],
339
+ [stage["average power"], stage["average power"]],
340
+ linestyle="--",
341
+ c=p[0].get_c(),
342
+ )
343
+ # Add average power text to plot
344
+ ax1.text(
345
+ stage["t"][0],
346
+ stage["average power"],
347
+ f"{stage['average power']:.1f} W ",
348
+ horizontalalignment="right",
349
+ verticalalignment="center",
350
+ c=p[0].get_c(),
351
+ )
352
+ else:
353
+ ax1.plot(
354
+ df["time"],
355
+ df["cpu_package_power"],
356
+ )
357
+ # Add title and labels to plots
358
+ ax1.set_ylabel(self.columns_dict["cpu_package_power"])
359
+ title_str = "HWiNFO Stats\n" + "\n".join(textwrap.wrap(state.build_name, 60))
360
+ ax1.set_title(title_str)
361
+ ax1.legend()
362
+ ax1.grid(True)
363
+
364
+ # Create second plot
365
+ ax2.plot(
366
+ df["time"],
367
+ df["npu_clock"],
368
+ label=self.columns_dict["npu_clock"],
369
+ )
370
+ ax2.plot(
371
+ df["time"],
372
+ df["gpu_clock"],
373
+ label=self.columns_dict["gpu_clock"],
374
+ )
375
+ ax2.set_xlabel("Time [s]")
376
+ ax2.set_ylabel("Clock Frequency [MHz]")
377
+ ax2.legend(loc=2)
378
+ ax2.grid(True)
379
+ # Add second y-axis for %
380
+ ax2_twin = ax2.twinx()
381
+ ax2_twin.plot(
382
+ df["time"],
383
+ df["total_cpu_usage"],
384
+ label=self.columns_dict["total_cpu_usage"],
385
+ c="g",
386
+ )
387
+ ax2_twin.set_ylim([0, 100])
388
+ vals = ax2_twin.get_yticks()
389
+ ax2_twin.set_yticks(vals)
390
+ ax2_twin.set_yticklabels([f"{v:.0f}%" for v in vals])
391
+ ax2_twin.legend(loc=1)
392
+
393
+ # Create third plot (all remaining columns)
394
+ plot3_columns = [
395
+ "apu_stapm_limit",
396
+ "cpu_tdc_limit",
397
+ "cpu_edc_limit",
398
+ "cpu_ppt_fast_limit",
399
+ "cpu_ppt_slow_limit",
400
+ "thermal_limit",
401
+ ]
402
+ for col_str in plot3_columns:
403
+ if col_str in df.columns:
404
+ ax3.plot(
405
+ df["time"],
406
+ df[col_str],
407
+ label=self.columns_dict[col_str],
408
+ )
409
+ ax3.set_xlabel("Time [s]")
410
+ ax3.set_ylim([0, 100])
411
+ vals = ax3.get_yticks()
412
+ ax3.set_yticks(vals)
413
+ ax3.set_yticklabels([f"{v:.0f}%" for v in vals])
414
+ if len(ax3.lines):
415
+ ax3.legend()
416
+ ax3.grid(True)
417
+
418
+ # Save plot to current folder AND save to cache
419
+ plot_path = os.path.join(
420
+ self.build_dir, f"{timestamp}_{POWER_USAGE_PNG_FILENAME}"
421
+ )
422
+ fig.savefig(plot_path, dpi=300, bbox_inches="tight")
423
+ plot_path = os.path.join(os.getcwd(), f"{timestamp}_{POWER_USAGE_PNG_FILENAME}")
424
+ fig.savefig(plot_path, dpi=300, bbox_inches="tight")
425
+
426
+ state.save_stat(Keys.POWER_USAGE_PLOT, plot_path)
427
+ state.save_stat(Keys.POWER_USAGE_DATA, self.data)
428
+ state.save_stat(Keys.POWER_USAGE_DATA_CSV, self.csv_path)
429
+ state.save_stat(Keys.PEAK_PROCESSOR_PACKAGE_POWER, f"{peak_power:0.1f} W")
@@ -83,42 +83,116 @@ class LMEvalHarness(Tool):
83
83
 
84
84
  return parser
85
85
 
86
- def _process_results(self, results_dir, state):
87
- """Process evaluation results and save to state stats"""
88
- if not os.path.exists(results_dir) or not os.path.isdir(results_dir):
89
- printing.log_warning(f"Results directory not found at {results_dir}")
90
- return
91
-
92
- model_dirs = [
93
- d
94
- for d in os.listdir(results_dir)
95
- if os.path.isdir(os.path.join(results_dir, d))
96
- ]
97
-
98
- if not model_dirs:
99
- printing.log_warning(f"No model directories found in {results_dir}")
100
- return
101
-
102
- model_dir = os.path.join(results_dir, model_dirs[0])
103
- printing.log_info(f"Found model directory: {model_dir}")
104
-
105
- # Find the results JSON file with timestamp
106
- results_files = [
107
- f
108
- for f in os.listdir(model_dir)
109
- if f.startswith("results_") and f.endswith(".json")
110
- ]
86
+ def _scale_metric(self, metric_name, value):
87
+ """
88
+ Scale metric value appropriately based on type and range
89
+
90
+ Args:
91
+ metric_name: Name of the metric (e.g., "acc,none", "ppl")
92
+ value: Numeric value of the metric
93
+
94
+ Returns:
95
+ tuple: (scaled_value, units, display_string)
96
+ """
97
+ fraction_metrics = {
98
+ "acc",
99
+ "accuracy",
100
+ "f1",
101
+ "exact_match",
102
+ "em",
103
+ "win_rate",
104
+ "recall",
105
+ "precision",
106
+ "rouge",
107
+ "bleu",
108
+ "meteor",
109
+ "bertscore",
110
+ "match",
111
+ "correct",
112
+ "pass",
113
+ "success_rate",
114
+ }
115
+
116
+ metric_base = metric_name.split(",")[0].lower()
117
+ is_fraction = any(
118
+ frac_metric in metric_base for frac_metric in fraction_metrics
119
+ )
120
+ is_in_unit_range = 0 <= value <= 1
111
121
 
112
- if not results_files:
113
- printing.log_warning(f"No results files found in {model_dir}")
122
+ if is_fraction and is_in_unit_range:
123
+ scaled_value = float(value) * 100
124
+ units = "%"
125
+ display_str = f"{value:.4f} ({scaled_value:.2f}%)"
126
+ else:
127
+ scaled_value = float(value)
128
+ units = "raw"
129
+ display_str = f"{value:.4f}"
130
+
131
+ return scaled_value, units, display_str
132
+
133
+ def _process_results(self, results_path, state):
134
+ """
135
+ Process evaluation results and save to state stats
136
+
137
+ Args:
138
+ results_path: Can be either a direct JSON file path or a directory path
139
+ state: State object to save metrics to
140
+ """
141
+ results_file_path = None
142
+
143
+ # Determine if this is a file or directory and find the JSON file
144
+ if os.path.isfile(results_path) and results_path.endswith(".json"):
145
+ # Direct JSON file path (modern format)
146
+ results_file_path = results_path
147
+ elif os.path.isdir(results_path):
148
+ # Look for model subdirectories
149
+ model_dirs = [
150
+ d
151
+ for d in os.listdir(results_path)
152
+ if os.path.isdir(os.path.join(results_path, d))
153
+ ]
154
+
155
+ if model_dirs:
156
+ # Format: results_dir/model_name/results_*.json
157
+ model_dir = os.path.join(results_path, model_dirs[0])
158
+ printing.log_info(f"Found model directory: {model_dir}")
159
+
160
+ results_files = [
161
+ f
162
+ for f in os.listdir(model_dir)
163
+ if f.startswith("results_") and f.endswith(".json")
164
+ ]
165
+
166
+ if results_files:
167
+ results_files.sort(reverse=True)
168
+ results_file_path = os.path.join(model_dir, results_files[0])
169
+ else:
170
+ printing.log_warning(f"No results files found in {model_dir}")
171
+ return
172
+ else:
173
+ printing.log_warning(f"No model directories found in {results_path}")
174
+ return
175
+ else:
176
+ # Handle case where lm-eval adds timestamp to expected filename
177
+ results_dir = os.path.dirname(results_path)
178
+ if os.path.exists(results_dir):
179
+ json_files = [f for f in os.listdir(results_dir) if f.endswith(".json")]
180
+ if json_files:
181
+ results_file_path = os.path.join(results_dir, json_files[0])
182
+ printing.log_info(f"Found results file: {results_file_path}")
183
+ else:
184
+ printing.log_warning(f"No JSON results file found in {results_dir}")
185
+ return
186
+ else:
187
+ printing.log_warning(f"Results path not found at {results_path}")
188
+ return
189
+
190
+ if not results_file_path or not os.path.exists(results_file_path):
191
+ printing.log_warning(f"Results file not found at {results_file_path}")
114
192
  return
115
193
 
116
- # Sort by timestamp
117
- results_files.sort(reverse=True)
118
- results_file_path = os.path.join(model_dir, results_files[0])
119
194
  printing.log_info(f"Processing results from {results_file_path}")
120
195
 
121
- # Read and process results
122
196
  try:
123
197
  with open(results_file_path, "r", encoding="utf-8") as f:
124
198
  results = json.load(f)
@@ -132,18 +206,21 @@ class LMEvalHarness(Tool):
132
206
  if isinstance(value, (int, float)) and not metric.startswith(
133
207
  "alias"
134
208
  ):
135
- # Format metric name for stats
136
- clean_metric = metric.replace(",", "_")
209
+ # Format metric name for stats - remove ,none suffix
210
+ clean_metric = metric.split(",")[0] # Remove ,none suffix
137
211
  stat_name = f"lm_eval_{task_name}_{clean_metric}"
138
212
 
139
- # Save to state stats as percentage
140
- state.save_stat(stat_name, float(value) * 100)
141
- state.save_stat(f"{stat_name}_units", "%")
213
+ # Scale metric appropriately
214
+ scaled_value, units, value_str = self._scale_metric(
215
+ metric, value
216
+ )
217
+ display_str = f" {metric}: {value_str}"
218
+
219
+ state.save_stat(stat_name, scaled_value)
220
+ state.save_stat(f"{stat_name}_units", units)
142
221
  self.status_stats.append(stat_name)
143
222
 
144
- printing.log_info(
145
- f" {metric}: {value:.4f} ({value*100:.2f}%)"
146
- )
223
+ printing.log_info(display_str)
147
224
 
148
225
  # Save summary metrics if available
149
226
  avg_metrics = {}
@@ -167,12 +244,17 @@ class LMEvalHarness(Tool):
167
244
  if values:
168
245
  avg_value = sum(values) / len(values)
169
246
  stat_name = f"lm_eval_average_{metric}"
170
- state.save_stat(stat_name, float(avg_value) * 100)
171
- state.save_stat(f"{stat_name}_units", "%")
172
- self.status_stats.append(stat_name)
173
- printing.log_info(
174
- f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
247
+
248
+ # Apply same scaling logic as individual metrics
249
+ scaled_avg, units, value_str = self._scale_metric(
250
+ metric, avg_value
175
251
  )
252
+ display_str = f"Average {metric}: {value_str}"
253
+
254
+ state.save_stat(stat_name, scaled_avg)
255
+ state.save_stat(f"{stat_name}_units", units)
256
+ self.status_stats.append(stat_name)
257
+ printing.log_info(display_str)
176
258
 
177
259
  except (IOError, json.JSONDecodeError) as e:
178
260
  printing.log_error(f"Error processing results: {e}")
@@ -189,6 +271,20 @@ class LMEvalHarness(Tool):
189
271
  output_path: Optional[str] = None,
190
272
  ) -> State:
191
273
 
274
+ # Check if lm-eval is available
275
+ try:
276
+ # pylint: disable=unused-import
277
+ import lm_eval
278
+ except ImportError:
279
+ error_msg = (
280
+ "lm-eval-harness is required but not installed. "
281
+ "Please install it using one of the following commands:\n"
282
+ " pip install lemonade-sdk[dev]\n"
283
+ " pip install -e .[dev]\n"
284
+ )
285
+ printing.log_error(error_msg)
286
+ raise ImportError(error_msg)
287
+
192
288
  import requests
193
289
  from lemonade.tools.server.utils.thread import ServerRunner
194
290
 
@@ -261,7 +357,7 @@ class LMEvalHarness(Tool):
261
357
  raise RuntimeError("Failed to start the server")
262
358
 
263
359
  # Build API URL
264
- results_file = os.path.join(output_path, f"{task}_results")
360
+ results_file = os.path.join(output_path, f"{task}_results.json")
265
361
 
266
362
  printing.log_info(f"Running lm-eval-harness on {task}...")
267
363
 
@@ -312,9 +408,8 @@ class LMEvalHarness(Tool):
312
408
  "Results obtained successfully but couldn't display due to encoding issues"
313
409
  )
314
410
 
315
- # Process results from the correct location
316
- results_dir = os.path.join(output_path, f"{task}_results")
317
- self._process_results(results_dir, state)
411
+ # Process results from the JSON file
412
+ self._process_results(results_file, state)
318
413
 
319
414
  except subprocess.CalledProcessError as e:
320
415
  printing.log_error(f"Error running lm-eval-harness: {e}")