lemonade-sdk 7.0.0.tar.gz → 7.0.2.tar.gz

This diff compares the contents of two publicly released versions of the package as published to their registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.

Potentially problematic release.

Files changed (75)
  1. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/PKG-INFO +4 -7
  2. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/setup.py +4 -7
  3. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/cli.py +2 -0
  4. lemonade_sdk-7.0.2/src/lemonade/tools/accuracy.py +335 -0
  5. lemonade_sdk-7.0.2/src/lemonade/tools/server/instructions.py +294 -0
  6. lemonade_sdk-7.0.2/src/lemonade/tools/server/llamacpp.py +315 -0
  7. lemonade_sdk-7.0.2/src/lemonade/tools/server/port_utils.py +57 -0
  8. lemonade_sdk-7.0.2/src/lemonade/tools/server/pydantic_models.py +83 -0
  9. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/server/serve.py +225 -167
  10. lemonade_sdk-7.0.2/src/lemonade/tools/server/static/styles.css +313 -0
  11. lemonade_sdk-7.0.2/src/lemonade/tools/server/thread_utils.py +87 -0
  12. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/server/tool_calls.py +50 -43
  13. lemonade_sdk-7.0.2/src/lemonade/version.py +1 -0
  14. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_sdk.egg-info/PKG-INFO +4 -7
  15. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_sdk.egg-info/SOURCES.txt +7 -0
  16. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_sdk.egg-info/requires.txt +3 -6
  17. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_server/cli.py +4 -2
  18. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_server/model_manager.py +34 -17
  19. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_server/server_models.json +52 -3
  20. lemonade_sdk-7.0.0/src/lemonade/version.py +0 -1
  21. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/LICENSE +0 -0
  22. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/NOTICE.md +0 -0
  23. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/README.md +0 -0
  24. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/pyproject.toml +0 -0
  25. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/setup.cfg +0 -0
  26. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/__init__.py +0 -0
  27. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/api.py +0 -0
  28. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/cache.py +0 -0
  29. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/__init__.py +0 -0
  30. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/analyze_model.py +0 -0
  31. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/build.py +0 -0
  32. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/cli_helpers.py +0 -0
  33. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/exceptions.py +0 -0
  34. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/filesystem.py +0 -0
  35. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/labels.py +0 -0
  36. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/onnx_helpers.py +0 -0
  37. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/plugins.py +0 -0
  38. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/printing.py +0 -0
  39. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/status.py +0 -0
  40. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/system_info.py +0 -0
  41. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/tensor_helpers.py +0 -0
  42. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/common/test_helpers.py +0 -0
  43. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/profilers/__init__.py +0 -0
  44. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/profilers/memory_tracker.py +0 -0
  45. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/profilers/profiler.py +0 -0
  46. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/sequence.py +0 -0
  47. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/state.py +0 -0
  48. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/__init__.py +0 -0
  49. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/adapter.py +0 -0
  50. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/bench.py +0 -0
  51. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/huggingface_bench.py +0 -0
  52. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/huggingface_load.py +0 -0
  53. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/humaneval.py +0 -0
  54. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/llamacpp.py +0 -0
  55. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/llamacpp_bench.py +0 -0
  56. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/management_tools.py +0 -0
  57. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/mmlu.py +0 -0
  58. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/ort_genai/__init__.py +0 -0
  59. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/ort_genai/oga.py +0 -0
  60. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/ort_genai/oga_bench.py +0 -0
  61. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/perplexity.py +0 -0
  62. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/prompt.py +0 -0
  63. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/quark/__init__.py +0 -0
  64. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/quark/quark_load.py +0 -0
  65. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/quark/quark_quantize.py +0 -0
  66. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/report/__init__.py +0 -0
  67. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/report/llm_report.py +0 -0
  68. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/report/table.py +0 -0
  69. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/server/__init__.py +0 -0
  70. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade/tools/tool.py +0 -0
  71. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_install/__init__.py +0 -0
  72. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_install/install.py +0 -0
  73. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  74. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  75. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.2}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
--- lemonade_sdk-7.0.0/PKG-INFO
+++ lemonade_sdk-7.0.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 7.0.0
+Version: 7.0.2
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.12
@@ -25,8 +25,8 @@ Requires-Dist: matplotlib
 Requires-Dist: tabulate
 Requires-Dist: huggingface-hub==0.30.2
 Provides-Extra: llm
-Requires-Dist: torch>=2.0.0; extra == "llm"
-Requires-Dist: transformers; extra == "llm"
+Requires-Dist: torch>=2.6.0; extra == "llm"
+Requires-Dist: transformers<=4.51.3; extra == "llm"
 Requires-Dist: accelerate; extra == "llm"
 Requires-Dist: py-cpuinfo; extra == "llm"
 Requires-Dist: sentencepiece; extra == "llm"
@@ -34,23 +34,20 @@ Requires-Dist: datasets; extra == "llm"
 Requires-Dist: human-eval-windows==1.0.4; extra == "llm"
 Requires-Dist: fastapi; extra == "llm"
 Requires-Dist: uvicorn[standard]; extra == "llm"
-Requires-Dist: openai>=1.66.0; extra == "llm"
+Requires-Dist: openai>=1.81.0; extra == "llm"
 Requires-Dist: lm-eval[api]; extra == "llm"
 Provides-Extra: llm-oga-cpu
 Requires-Dist: onnxruntime-genai==0.6.0; extra == "llm-oga-cpu"
 Requires-Dist: onnxruntime<1.22.0,>=1.10.1; extra == "llm-oga-cpu"
-Requires-Dist: torch<2.4,>=2.0.0; extra == "llm-oga-cpu"
 Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-cpu"
 Provides-Extra: llm-oga-igpu
 Requires-Dist: onnxruntime-genai-directml==0.6.0; extra == "llm-oga-igpu"
 Requires-Dist: onnxruntime-directml<1.22.0,>=1.19.0; extra == "llm-oga-igpu"
-Requires-Dist: torch<2.4,>=2.0.0; extra == "llm-oga-igpu"
 Requires-Dist: transformers<4.45.0; extra == "llm-oga-igpu"
 Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-igpu"
 Provides-Extra: llm-oga-cuda
 Requires-Dist: onnxruntime-genai-cuda==0.6.0; extra == "llm-oga-cuda"
 Requires-Dist: onnxruntime-gpu<1.22.0,>=1.19.1; extra == "llm-oga-cuda"
-Requires-Dist: torch<2.4,>=2.0.0; extra == "llm-oga-cuda"
 Requires-Dist: transformers<4.45.0; extra == "llm-oga-cuda"
 Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-cuda"
 Provides-Extra: llm-oga-npu
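
In short, 7.0.2 raises the torch floor from 2.0.0 to 2.6.0, caps transformers at 4.51.3, bumps the openai floor to 1.81.0, and drops the per-backend torch<2.4 pins in favor of the single constraint inherited from the llm extra. A minimal sketch (not part of the package; assumes the third-party packaging library is installed) for checking an existing environment against the new pins:

    # Hypothetical local check, not part of lemonade-sdk: verify installed
    # versions against the tightened 7.0.2 pins shown in the hunk above.
    from importlib.metadata import version
    from packaging.specifiers import SpecifierSet

    pins = {"torch": ">=2.6.0", "transformers": "<=4.51.3", "openai": ">=1.81.0"}
    for pkg, spec in pins.items():
        installed = version(pkg)  # raises PackageNotFoundError if absent
        status = "ok" if installed in SpecifierSet(spec) else f"violates {spec}"
        print(f"{pkg} {installed}: {status}")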
--- lemonade_sdk-7.0.0/setup.py
+++ lemonade_sdk-7.0.2/setup.py
@@ -3,7 +3,6 @@ from setuptools import setup
 with open("src/lemonade/version.py", encoding="utf-8") as fp:
     version = fp.read().split('"')[1]
 
-
 setup(
     name="lemonade-sdk",
     version=version,
@@ -46,8 +45,8 @@ setup(
     ],
     extras_require={
         "llm": [
-            "torch>=2.0.0",
-            "transformers",
+            "torch>=2.6.0",
+            "transformers<=4.51.3",
             "accelerate",
             "py-cpuinfo",
             "sentencepiece",
@@ -57,26 +56,23 @@ setup(
             "human-eval-windows==1.0.4",
             "fastapi",
             "uvicorn[standard]",
-            "openai>=1.66.0",
+            "openai>=1.81.0",
             "lm-eval[api]",
         ],
         "llm-oga-cpu": [
            "onnxruntime-genai==0.6.0",
            "onnxruntime >=1.10.1,<1.22.0",
-            "torch>=2.0.0,<2.4",
            "lemonade-sdk[llm]",
         ],
         "llm-oga-igpu": [
            "onnxruntime-genai-directml==0.6.0",
            "onnxruntime-directml>=1.19.0,<1.22.0",
-            "torch>=2.0.0,<2.4",
            "transformers<4.45.0",
            "lemonade-sdk[llm]",
         ],
         "llm-oga-cuda": [
            "onnxruntime-genai-cuda==0.6.0",
            "onnxruntime-gpu >=1.19.1,<1.22.0",
-            "torch>=2.0.0,<2.4",
            "transformers<4.45.0",
            "lemonade-sdk[llm]",
         ],
@@ -111,6 +107,7 @@ setup(
     include_package_data=True,
     package_data={
         "lemonade_server": ["server_models.json"],
+        "lemonade": ["tools/server/static/styles.css"],
     },
 )
 
--- lemonade_sdk-7.0.0/src/lemonade/cli.py
+++ lemonade_sdk-7.0.2/src/lemonade/cli.py
@@ -19,6 +19,7 @@ import lemonade.cache as cache
 from lemonade.tools.mmlu import AccuracyMMLU
 from lemonade.tools.humaneval import AccuracyHumaneval
 from lemonade.tools.perplexity import AccuracyPerplexity
+from lemonade.tools.accuracy import LMEvalHarness
 from lemonade.tools.prompt import LLMPrompt
 from lemonade.tools.quark.quark_load import QuarkLoad
 from lemonade.tools.quark.quark_quantize import QuarkQuantize
@@ -36,6 +37,7 @@ def main():
         AccuracyMMLU,
         AccuracyHumaneval,
         AccuracyPerplexity,
+        LMEvalHarness,
         LLMPrompt,
         HuggingfaceBench,
         OgaBench,
--- /dev/null
+++ lemonade_sdk-7.0.2/src/lemonade/tools/accuracy.py
@@ -0,0 +1,335 @@
+import argparse
+import json
+import os
+import socket
+import subprocess
+import sys
+import time
+from typing import Optional
+
+import requests
+
+from lemonade.state import State
+from lemonade.tools import Tool
+import lemonade.common.printing as printing
+import lemonade.common.build as build
+
+from lemonade.tools.server.thread_utils import ServerRunner
+
+
+def is_port_in_use(port, host="localhost"):
+    """
+    Check if a port is in use
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return s.connect_ex((host, port)) == 0
+
+
+class LMEvalHarness(Tool):
+    """
+    Tool for evaluating LLMs using lm-eval-harness on industry standard benchmarks
+    like MMLU, GSM8k, and more. See docs/lemonade/lm_eval.md for more details.
+    """
+
+    unique_name = "lm-eval-harness"
+
+    def __init__(self):
+        super().__init__(
+            monitor_message="Evaluate model accuracy using EleutherAI's lm-eval-harness"
+        )
+        self.status_stats = []
+        self.server_runner = None
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Evaluate model using lm-eval-harness",
+            add_help=add_help,
+        )
+
+        parser.add_argument(
+            "--task",
+            type=str,
+            required=True,
+            help="Task(s) to evaluate on (e.g., gsm8k, mmlu)",
+        )
+
+        parser.add_argument(
+            "--server-port", type=int, default=8000, help="Port to use for the server"
+        )
+
+        parser.add_argument(
+            "--num-fewshot",
+            type=int,
+            default=0,
+            help="Number of examples in few-shot prompts",
+        )
+
+        parser.add_argument(
+            "--limit",
+            type=int,
+            default=None,
+            help="Limit the number of examples per task",
+        )
+
+        parser.add_argument(
+            "--log-samples",
+            action="store_true",
+            help="Log samples for each task to log file",
+        )
+
+        parser.add_argument(
+            "--output-path",
+            type=str,
+            default=None,
+            help="Path to save evaluation results",
+        )
+
+        return parser
+
+    def _process_results(self, results_dir, state):
+        """Process evaluation results and save to state stats"""
+        if not os.path.exists(results_dir) or not os.path.isdir(results_dir):
+            printing.log_warning(f"Results directory not found at {results_dir}")
+            return
+
+        model_dirs = [
+            d
+            for d in os.listdir(results_dir)
+            if os.path.isdir(os.path.join(results_dir, d))
+        ]
+
+        if not model_dirs:
+            printing.log_warning(f"No model directories found in {results_dir}")
+            return
+
+        model_dir = os.path.join(results_dir, model_dirs[0])
+        printing.log_info(f"Found model directory: {model_dir}")
+
+        # Find the results JSON file with timestamp
+        results_files = [
+            f
+            for f in os.listdir(model_dir)
+            if f.startswith("results_") and f.endswith(".json")
+        ]
+
+        if not results_files:
+            printing.log_warning(f"No results files found in {model_dir}")
+            return
+
+        # Sort by timestamp
+        results_files.sort(reverse=True)
+        results_file_path = os.path.join(model_dir, results_files[0])
+        printing.log_info(f"Processing results from {results_file_path}")
+
+        # Read and process results
+        try:
+            with open(results_file_path, "r", encoding="utf-8") as f:
+                results = json.load(f)
+
+            # Extract and display metrics
+            if "results" in results:
+                for task_name, metrics in results["results"].items():
+                    printing.log_info(f"Results for {task_name}:")
+
+                    for metric, value in metrics.items():
+                        if isinstance(value, (int, float)) and not metric.startswith(
+                            "alias"
+                        ):
+                            # Format metric name for stats
+                            clean_metric = metric.replace(",", "_")
+                            stat_name = f"lm_eval_{task_name}_{clean_metric}"
+
+                            # Save to state stats as percentage
+                            state.save_stat(stat_name, float(value) * 100)
+                            state.save_stat(f"{stat_name}_units", "%")
+                            self.status_stats.append(stat_name)
+
+                            printing.log_info(
+                                f" {metric}: {value:.4f} ({value*100:.2f}%)"
+                            )
+
+            # Save summary metrics if available
+            avg_metrics = {}
+            if "higher_is_better" in results:
+                for metric_type in results["higher_is_better"].values():
+                    for metric in metric_type.keys():
+                        if metric not in avg_metrics:
+                            avg_metrics[metric] = []
+
+                for task_metrics in results["results"].values():
+                    for metric, value in task_metrics.items():
+                        if isinstance(value, (int, float)) and not metric.startswith(
+                            "alias"
+                        ):
+                            base_metric = metric.split(",")[0]
+                            if base_metric in avg_metrics:
+                                avg_metrics[base_metric].append(value)
+
+                # Calculate and save averages
+                for metric, values in avg_metrics.items():
+                    if values:
+                        avg_value = sum(values) / len(values)
+                        stat_name = f"lm_eval_average_{metric}"
+                        state.save_stat(stat_name, float(avg_value) * 100)
+                        state.save_stat(f"{stat_name}_units", "%")
+                        self.status_stats.append(stat_name)
+                        printing.log_info(
+                            f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
+                        )
+
+        except (IOError, json.JSONDecodeError) as e:
+            printing.log_error(f"Error processing results: {e}")
+
+    def run(
+        self,
+        state: State,
+        task: str,
+        server_port: int = 8000,
+        server_host: str = "localhost",
+        num_fewshot: int = 0,
+        limit: Optional[int] = None,
+        log_samples: bool = False,
+        output_path: Optional[str] = None,
+    ) -> State:
+
+        model = state.model
+        tokenizer = state.tokenizer
+
+        if model is None or tokenizer is None:
+            raise ValueError(
+                "Model and tokenizer must be loaded in state before running lm-eval-harness"
+            )
+
+        # Set up output path
+        if output_path is None:
+            output_path = os.path.join(
+                build.output_dir(state.cache_dir, state.build_name), "lm_eval_results"
+            )
+
+        os.makedirs(output_path, exist_ok=True)
+
+        # Check if port is already in use
+        if is_port_in_use(server_port, server_host):
+            error_msg = (
+                f"Port {server_port} is already in use. "
+                "Please close all applications using this port and try again."
+            )
+            printing.log_error(error_msg)
+            raise RuntimeError(error_msg)
+
+        # Retroactively determine recipe based on model type to select correct iterator
+        # The model is already loaded in server, so we only need recipe for iterator selection
+        checkpoint = getattr(state, "checkpoint", "unknown")
+        if "OrtGenaiModel" in str(type(model)):
+            recipe = "oga-"
+        else:
+            recipe = "unknown"
+
+        # Start the server thread
+        self.server_runner = ServerRunner(
+            model=model,
+            tokenizer=tokenizer,
+            checkpoint=checkpoint,
+            recipe=recipe,
+            host=server_host,
+            port=server_port,
+        )
+        self.server_runner.start()
+
+        # Wait for server initialization
+        printing.log_info("Waiting for server initialization...")
+
+        # Wait for server to start and be responsive
+        server_url = f"http://{server_host}:{server_port}"
+        max_retries = 30
+        retry_delay = 1
+
+        printing.log_info(f"Checking if server is available at {server_url}...")
+        for i in range(max_retries):
+            try:
+                response = requests.get(f"{server_url}/api/v0/health", timeout=2)
+                if response.status_code == 200:
+                    printing.log_info(f"Server is ready after {i+1} attempts")
+                    break
+            except requests.exceptions.RequestException:
+                if i < max_retries - 1:
+                    time.sleep(retry_delay)
+                else:
+                    printing.log_error(
+                        f"Server did not start after {max_retries} attempts"
+                    )
+                    raise RuntimeError("Failed to start the server")
+
+        # Build the results path
+        results_file = os.path.join(output_path, f"{task}_results")
+
+        printing.log_info(f"Running lm-eval-harness on {task}...")
+
+        # Build lm-eval-harness command
+        cmd = [
+            "lm_eval",
+            "--model",
+            "local-completions",
+            "--tasks",
+            task,
+            "--model_args",
+            (
+                f"model={checkpoint},"
+                f"base_url={server_url}/api/v0/completions,"
+                f"num_concurrent=1,"
+                f"max_retries=5,"
+                f"retry_timeout=10,"
+                f"tokenized_requests=False"
+            ),
+            "--num_fewshot",
+            str(num_fewshot),
+            "--output_path",
+            results_file,
+        ]
+
+        if limit is not None:
+            cmd.extend(["--limit", str(limit)])
+
+        if log_samples:
+            cmd.extend(["--log_samples"])
+
+        try:
+            # On Windows, set UTF-8 mode to handle Unicode output
+            env = os.environ.copy()
+            if sys.platform == "win32":
+                env["PYTHONIOENCODING"] = "utf-8"
+
+            # Execute lm-eval-harness command
+            result = subprocess.run(
+                cmd, check=True, text=True, capture_output=True, env=env
+            )
+
+            # Log relevant output and skip any parts that might cause encoding issues
+            try:
+                printing.log_info(result.stdout)
+            except UnicodeEncodeError:
+                printing.log_info(
+                    "Results obtained successfully but couldn't display due to encoding issues"
+                )
+
+            # Process results from the correct location
+            results_dir = os.path.join(output_path, f"{task}_results")
+            self._process_results(results_dir, state)
+
+        except subprocess.CalledProcessError as e:
+            printing.log_error(f"Error running lm-eval-harness: {e}")
+            printing.log_error(f"stderr: {e.stderr}")
+        except (IOError, ValueError, requests.RequestException) as e:
+            printing.log_error(f"Error: {e}")
+        finally:
+            # Shut down server
+            if self.server_runner and self.server_runner.is_alive():
+                printing.log_info("Shutting down server runner...")
+                self.server_runner.shutdown()
+
+            # Make sure we don't have any lingering references to state's model/tokenizer
+            # that could prevent garbage collection
+            self.server_runner = None
+
+        return state
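
One detail of _process_results worth noting: every numeric metric is stored as a percentage under a flattened stat name of the form lm_eval_<task>_<metric>. A standalone sketch of that naming convention (the sample metrics dict is made up for illustration):

    # Mirrors the stat-naming logic above, outside of the State/stats machinery.
    sample = {"gsm8k": {"exact_match,strict-match": 0.42, "alias": "gsm8k"}}

    for task_name, task_metrics in sample.items():
        for metric, value in task_metrics.items():
            if isinstance(value, (int, float)) and not metric.startswith("alias"):
                stat_name = f"lm_eval_{task_name}_{metric.replace(',', '_')}"
                print(stat_name, f"{value * 100:.2f}%")
                # -> lm_eval_gsm8k_exact_match_strict-match 42.00%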