lemonade-sdk 7.0.1__tar.gz → 7.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.
Files changed (77)
  1. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/PKG-INFO +1 -1
  2. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/setup.py +1 -1
  3. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/cli.py +2 -0
  4. lemonade_sdk-7.0.3/src/lemonade/tools/accuracy.py +335 -0
  5. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/huggingface_load.py +6 -0
  6. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/ort_genai/oga.py +6 -4
  7. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/prompt.py +28 -1
  8. lemonade_sdk-7.0.3/src/lemonade/tools/server/instructions.py +37 -0
  9. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/llamacpp.py +45 -19
  10. lemonade_sdk-7.0.3/src/lemonade/tools/server/port_utils.py +57 -0
  11. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/serve.py +96 -44
  12. lemonade_sdk-7.0.3/src/lemonade/tools/server/static/instructions.html +262 -0
  13. lemonade_sdk-7.0.3/src/lemonade/tools/server/thread_utils.py +87 -0
  14. lemonade_sdk-7.0.3/src/lemonade/version.py +1 -0
  15. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/PKG-INFO +1 -1
  16. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/SOURCES.txt +5 -1
  17. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_server/model_manager.py +45 -12
  18. {lemonade_sdk-7.0.1/src/lemonade/tools/server → lemonade_sdk-7.0.3/src/lemonade_server}/pydantic_models.py +2 -0
  19. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_server/server_models.json +25 -4
  20. lemonade_sdk-7.0.1/src/lemonade/tools/server/instructions.py +0 -294
  21. lemonade_sdk-7.0.1/src/lemonade/version.py +0 -1
  22. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/LICENSE +0 -0
  23. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/NOTICE.md +0 -0
  24. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/README.md +0 -0
  25. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/pyproject.toml +0 -0
  26. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/setup.cfg +0 -0
  27. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/__init__.py +0 -0
  28. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/api.py +0 -0
  29. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/cache.py +0 -0
  30. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/__init__.py +0 -0
  31. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/analyze_model.py +0 -0
  32. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/build.py +0 -0
  33. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/cli_helpers.py +0 -0
  34. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/exceptions.py +0 -0
  35. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/filesystem.py +0 -0
  36. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/labels.py +0 -0
  37. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/onnx_helpers.py +0 -0
  38. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/plugins.py +0 -0
  39. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/printing.py +0 -0
  40. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/status.py +0 -0
  41. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/system_info.py +0 -0
  42. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/tensor_helpers.py +0 -0
  43. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/test_helpers.py +0 -0
  44. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/profilers/__init__.py +0 -0
  45. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/profilers/memory_tracker.py +0 -0
  46. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/profilers/profiler.py +0 -0
  47. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/sequence.py +0 -0
  48. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/state.py +0 -0
  49. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/__init__.py +0 -0
  50. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/adapter.py +0 -0
  51. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/bench.py +0 -0
  52. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/huggingface_bench.py +0 -0
  53. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/humaneval.py +0 -0
  54. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/llamacpp.py +0 -0
  55. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/llamacpp_bench.py +0 -0
  56. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/management_tools.py +0 -0
  57. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/mmlu.py +0 -0
  58. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/ort_genai/__init__.py +0 -0
  59. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/ort_genai/oga_bench.py +0 -0
  60. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/perplexity.py +0 -0
  61. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/quark/__init__.py +0 -0
  62. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/quark/quark_load.py +0 -0
  63. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/quark/quark_quantize.py +0 -0
  64. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/report/__init__.py +0 -0
  65. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/report/llm_report.py +0 -0
  66. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/report/table.py +0 -0
  67. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/__init__.py +0 -0
  68. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/static/styles.css +0 -0
  69. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/tool_calls.py +0 -0
  70. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/tool.py +0 -0
  71. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_install/__init__.py +0 -0
  72. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_install/install.py +0 -0
  73. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  74. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  75. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/requires.txt +0 -0
  76. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
  77. {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_server/cli.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lemonade-sdk
- Version: 7.0.1
+ Version: 7.0.3
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
  Author-email: lemonade@amd.com
  Requires-Python: >=3.10, <3.12
setup.py
@@ -107,7 +107,7 @@ setup(
      include_package_data=True,
      package_data={
          "lemonade_server": ["server_models.json"],
-         "lemonade": ["tools/server/static/styles.css"],
+         "lemonade": ["tools/server/static/*"],
      },
  )
src/lemonade/cli.py
@@ -19,6 +19,7 @@ import lemonade.cache as cache
  from lemonade.tools.mmlu import AccuracyMMLU
  from lemonade.tools.humaneval import AccuracyHumaneval
  from lemonade.tools.perplexity import AccuracyPerplexity
+ from lemonade.tools.accuracy import LMEvalHarness
  from lemonade.tools.prompt import LLMPrompt
  from lemonade.tools.quark.quark_load import QuarkLoad
  from lemonade.tools.quark.quark_quantize import QuarkQuantize
@@ -36,6 +37,7 @@ def main():
          AccuracyMMLU,
          AccuracyHumaneval,
          AccuracyPerplexity,
+         LMEvalHarness,
          LLMPrompt,
          HuggingfaceBench,
          OgaBench,
src/lemonade/tools/accuracy.py (new file)
@@ -0,0 +1,335 @@
+ import argparse
+ import json
+ import os
+ import socket
+ import subprocess
+ import sys
+ import time
+ from typing import Optional
+
+ import requests
+
+ from lemonade.state import State
+ from lemonade.tools import Tool
+ import lemonade.common.printing as printing
+ import lemonade.common.build as build
+
+ from lemonade.tools.server.thread_utils import ServerRunner
+
+
+ def is_port_in_use(port, host="localhost"):
+     """
+     Check if a port is in use
+     """
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         return s.connect_ex((host, port)) == 0
+
+
+ class LMEvalHarness(Tool):
+     """
+     Tool for evaluating LLMs using lm-eval-harness on industry standard benchmarks
+     like MMLU, GSM8k, and more. See docs/lemonade/lm_eval.md for more details.
+     """
+
+     unique_name = "lm-eval-harness"
+
+     def __init__(self):
+         super().__init__(
+             monitor_message="Evaluate model accuracy using ElutherAI's lm-eval-harness"
+         )
+         self.status_stats = []
+         self.server_runner = None
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Evaluate model using lm-eval-harness",
+             add_help=add_help,
+         )
+
+         parser.add_argument(
+             "--task",
+             type=str,
+             required=True,
+             help="Task(s) to evaluate on (e.g., gsm8k, mmlu)",
+         )
+
+         parser.add_argument(
+             "--server-port", type=int, default=8000, help="Port to use for the server"
+         )
+
+         parser.add_argument(
+             "--num-fewshot",
+             type=int,
+             default=0,
+             help="Number of examples in few-shot prompts",
+         )
+
+         parser.add_argument(
+             "--limit",
+             type=int,
+             default=None,
+             help="Limit the number of examples per task",
+         )
+
+         parser.add_argument(
+             "--log-samples",
+             action="store_true",
+             help="Log samples for each task to log file",
+         )
+
+         parser.add_argument(
+             "--output-path",
+             type=str,
+             default=None,
+             help="Path to save evaluation results",
+         )
+
+         return parser
+
+     def _process_results(self, results_dir, state):
+         """Process evaluation results and save to state stats"""
+         if not os.path.exists(results_dir) or not os.path.isdir(results_dir):
+             printing.log_warning(f"Results directory not found at {results_dir}")
+             return
+
+         model_dirs = [
+             d
+             for d in os.listdir(results_dir)
+             if os.path.isdir(os.path.join(results_dir, d))
+         ]
+
+         if not model_dirs:
+             printing.log_warning(f"No model directories found in {results_dir}")
+             return
+
+         model_dir = os.path.join(results_dir, model_dirs[0])
+         printing.log_info(f"Found model directory: {model_dir}")
+
+         # Find the results JSON file with timestamp
+         results_files = [
+             f
+             for f in os.listdir(model_dir)
+             if f.startswith("results_") and f.endswith(".json")
+         ]
+
+         if not results_files:
+             printing.log_warning(f"No results files found in {model_dir}")
+             return
+
+         # Sort by timestamp
+         results_files.sort(reverse=True)
+         results_file_path = os.path.join(model_dir, results_files[0])
+         printing.log_info(f"Processing results from {results_file_path}")
+
+         # Read and process results
+         try:
+             with open(results_file_path, "r", encoding="utf-8") as f:
+                 results = json.load(f)
+
+             # Extract and display metrics
+             if "results" in results:
+                 for task_name, metrics in results["results"].items():
+                     printing.log_info(f"Results for {task_name}:")
+
+                     for metric, value in metrics.items():
+                         if isinstance(value, (int, float)) and not metric.startswith(
+                             "alias"
+                         ):
+                             # Format metric name for stats
+                             clean_metric = metric.replace(",", "_")
+                             stat_name = f"lm_eval_{task_name}_{clean_metric}"
+
+                             # Save to state stats as percentage
+                             state.save_stat(stat_name, float(value) * 100)
+                             state.save_stat(f"{stat_name}_units", "%")
+                             self.status_stats.append(stat_name)
+
+                             printing.log_info(
+                                 f" {metric}: {value:.4f} ({value*100:.2f}%)"
+                             )
+
+             # Save summary metrics if available
+             avg_metrics = {}
+             if "higher_is_better" in results:
+                 for metric_type in results["higher_is_better"].values():
+                     for metric in metric_type.keys():
+                         if metric not in avg_metrics:
+                             avg_metrics[metric] = []
+
+                 for task_metrics in results["results"].values():
+                     for metric, value in task_metrics.items():
+                         if isinstance(value, (int, float)) and not metric.startswith(
+                             "alias"
+                         ):
+                             base_metric = metric.split(",")[0]
+                             if base_metric in avg_metrics:
+                                 avg_metrics[base_metric].append(value)
+
+                 # Calculate and save averages
+                 for metric, values in avg_metrics.items():
+                     if values:
+                         avg_value = sum(values) / len(values)
+                         stat_name = f"lm_eval_average_{metric}"
+                         state.save_stat(stat_name, float(avg_value) * 100)
+                         state.save_stat(f"{stat_name}_units", "%")
+                         self.status_stats.append(stat_name)
+                         printing.log_info(
+                             f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
+                         )
+
+         except (IOError, json.JSONDecodeError) as e:
+             printing.log_error(f"Error processing results: {e}")
+
+     def run(
+         self,
+         state: State,
+         task: str,
+         server_port: int = 8000,
+         server_host: str = "localhost",
+         num_fewshot: int = 0,
+         limit: Optional[int] = None,
+         log_samples: bool = False,
+         output_path: Optional[str] = None,
+     ) -> State:
+
+         model = state.model
+         tokenizer = state.tokenizer
+
+         if model is None or tokenizer is None:
+             raise ValueError(
+                 "Model and tokenizer must be loaded in state before running lm-eval-harness"
+             )
+
+         # Set up output path
+         if output_path is None:
+             output_path = os.path.join(
+                 build.output_dir(state.cache_dir, state.build_name), "lm_eval_results"
+             )
+
+         os.makedirs(output_path, exist_ok=True)
+
+         # Check if port is already in use
+         if is_port_in_use(server_port, server_host):
+             error_msg = (
+                 f"Port {server_port} is already in use. "
+                 "Please close all applications using this port and try again."
+             )
+             printing.log_error(error_msg)
+             raise RuntimeError(error_msg)
+
+         # Retroactively determine recipe based on model type to select correct iterator
+         # The model is already loaded in server, so we only need recipe for iterator selection
+         checkpoint = getattr(state, "checkpoint", "unknown")
+         if "OrtGenaiModel" in str(type(model)):
+             recipe = "oga-"
+         else:
+             recipe = "unknown"
+
+         # Start the server thread
+         self.server_runner = ServerRunner(
+             model=model,
+             tokenizer=tokenizer,
+             checkpoint=checkpoint,
+             recipe=recipe,
+             host=server_host,
+             port=server_port,
+         )
+         self.server_runner.start()
+
+         # Wait for server initialization
+         printing.log_info("Waiting for server initialization...")
+
+         # Wait for server to start and be responsive
+         server_url = f"http://{server_host}:{server_port}"
+         max_retries = 30
+         retry_delay = 1
+
+         printing.log_info(f"Checking if server is available at {server_url}...")
+         for i in range(max_retries):
+             try:
+                 response = requests.get(f"{server_url}/api/v0/health", timeout=2)
+                 if response.status_code == 200:
+                     printing.log_info(f"Server is ready after {i+1} attempts")
+                     break
+             except requests.exceptions.RequestException:
+                 if i < max_retries - 1:
+                     time.sleep(retry_delay)
+                 else:
+                     printing.log_error(
+                         f"Server did not start after {max_retries} attempts"
+                     )
+                     raise RuntimeError("Failed to start the server")
+
+         # Build API URL
+         results_file = os.path.join(output_path, f"{task}_results")
+
+         printing.log_info(f"Running lm-eval-harness on {task}...")
+
+         # Build lm-eval-harness command
+         cmd = [
+             "lm_eval",
+             "--model",
+             "local-completions",
+             "--tasks",
+             task,
+             "--model_args",
+             (
+                 f"model={checkpoint},"
+                 f"base_url={server_url}/api/v0/completions,"
+                 f"num_concurrent=1,"
+                 f"max_retries=5,"
+                 f"retry_timeout=10,"
+                 f"tokenized_requests=False"
+             ),
+             "--num_fewshot",
+             str(num_fewshot),
+             "--output_path",
+             results_file,
+         ]
+
+         if limit is not None:
+             cmd.extend(["--limit", str(limit)])
+
+         if log_samples:
+             cmd.extend(["--log_samples"])
+
+         try:
+             # On Windows, set UTF-8 mode to handle Unicode output
+             env = os.environ.copy()
+             if sys.platform == "win32":
+                 env["PYTHONIOENCODING"] = "utf-8"
+
+             # Execute lm-eval-harness command
+             result = subprocess.run(
+                 cmd, check=True, text=True, capture_output=True, env=env
+             )
+
+             # Log relevant output and skip any parts that might cause encoding issues
+             try:
+                 printing.log_info(result.stdout)
+             except UnicodeEncodeError:
+                 printing.log_info(
+                     "Results obtained successfully but couldn't display due to encoding issues"
+                 )
+
+             # Process results from the correct location
+             results_dir = os.path.join(output_path, f"{task}_results")
+             self._process_results(results_dir, state)
+
+         except subprocess.CalledProcessError as e:
+             printing.log_error(f"Error running lm-eval-harness: {e}")
+             printing.log_error(f"stderr: {e.stderr}")
+         except (IOError, ValueError, requests.RequestException) as e:
+             printing.log_error(f"Error: {e}")
+         finally:
+             # Shut down server
+             if self.server_runner and self.server_runner.is_alive():
+                 printing.log_info("Shutting down server runner...")
+                 self.server_runner.shutdown()
+
+             # Make sure we don't have any lingering references to state's model/tokenizer
+             # that could prevent garbage collection
+             self.server_runner = None
+
+         return state
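For readers unfamiliar with lm-eval-harness output, the stat-naming scheme that `_process_results` applies above can be illustrated with a small standalone sketch. The toy `results` dict below is hypothetical and only mimics the shape of an lm-eval-harness results file; it is not real benchmark data.

# Illustrative only: a miniature, made-up lm-eval-harness results payload
results = {
    "results": {
        "gsm8k": {"exact_match,strict-match": 0.85, "alias": "gsm8k"},
    },
}

stats = {}
for task_name, metrics in results["results"].items():
    for metric, value in metrics.items():
        if isinstance(value, (int, float)) and not metric.startswith("alias"):
            # Commas in lm-eval metric names become underscores in the stat name
            stat_name = f"lm_eval_{task_name}_{metric.replace(',', '_')}"
            stats[stat_name] = float(value) * 100  # stored as a percentage

print(stats)  # {'lm_eval_gsm8k_exact_match_strict-match': 85.0}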
src/lemonade/tools/huggingface_load.py
@@ -326,6 +326,7 @@ class HuggingfaceAdapter(ModelAdapter):
      def generate(
          self,
          input_ids,
+         random_seed=1,
          **kwargs,
      ):
@@ -346,6 +347,11 @@
              **kwargs,
          }
  
+         if random_seed is None:
+             torch.random.seed()
+         else:
+             torch.random.manual_seed(random_seed)
+
          with torch.no_grad(), torch.inference_mode():
              outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
src/lemonade/tools/ort_genai/oga.py
@@ -139,6 +139,7 @@ class OrtGenaiModel(ModelAdapter):
          pad_token_id=None,
          stopping_criteria=None,
          max_length=None,
+         random_seed=1,
      ):
          params = og.GeneratorParams(self.model)
@@ -179,6 +180,9 @@
          if use_oga_pre_6_api:
              params.input_ids = input_ids
  
+         if random_seed is None:
+             random_seed = -1  # In og.Generator, -1 = seed with random device
+
          if self.config and "search" in self.config:
              search_config = self.config["search"]
              params.set_search_options(
@@ -196,10 +200,7 @@
                  past_present_share_buffer=search_config.get(
                      "past_present_share_buffer", True
                  ),
-                 # Make sure that results do not vary across laptops
-                 # by default, random_seed=-1 causes different laptops to give
-                 # different results
-                 random_seed=1,
+                 random_seed=random_seed,
                  # Not currently supported by OGA
                  # diversity_penalty=search_config.get('diversity_penalty', 0.0),
                  # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
@@ -212,6 +213,7 @@
                  temperature=temperature,
                  max_length=max_length_to_use,
                  min_length=min_length,
+                 random_seed=random_seed,
              )
          params.try_graph_capture_with_max_batch_size(1)
src/lemonade/tools/prompt.py
@@ -15,6 +15,7 @@ DEFAULT_GENERATE_PARAMS = {
      "temperature": 0.7,
  }
  
+ DEFAULT_RANDOM_SEED = 1
  DEFAULT_MAX_NEW_TOKENS = 512
  DEFAULT_N_TRIALS = 1
@@ -108,6 +109,19 @@ class LLMPrompt(Tool):
              f"(useful for testing, default is {DEFAULT_N_TRIALS})",
          )
  
+         parser.add_argument(
+             "--random-seed",
+             "-r",
+             default=str(DEFAULT_RANDOM_SEED),
+             help="Positive integer seed for random number generator used in "
+             "sampling tokens "
+             f"(default is {DEFAULT_RANDOM_SEED}). If the number of trials is "
+             "greater than one, then the seed is incremented by one for each "
+             "trial. Set to `None` for random, non-repeatable results. This "
+             "random seed behavior only applies to models loaded with "
+             "`oga-load` or `huggingface-load`.",
+         )
+
          return parser
  
      def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
@@ -123,6 +137,11 @@ class LLMPrompt(Tool):
              with open(parsed_args.prompt, "r", encoding="utf-8") as f:
                  parsed_args.prompt = f.read()
  
+         if parsed_args.random_seed == "None":
+             parsed_args.random_seed = None
+         else:
+             parsed_args.random_seed = int(parsed_args.random_seed)
+
          return parsed_args
  
      def run(
@@ -132,6 +151,7 @@ class LLMPrompt(Tool):
          max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
          n_trials: int = DEFAULT_N_TRIALS,
          template: bool = False,
+         random_seed: int = DEFAULT_RANDOM_SEED,
      ) -> State:
  
          model: ModelAdapter = state.model
@@ -170,9 +190,16 @@
  
              # Get the response from the LLM, which may include the prompt in it
              response = model.generate(
-                 input_ids, max_new_tokens=max_new_tokens, **DEFAULT_GENERATE_PARAMS
+                 input_ids,
+                 max_new_tokens=max_new_tokens,
+                 random_seed=random_seed,
+                 **DEFAULT_GENERATE_PARAMS,
              )
  
+             # Increment random seed if not none
+             if random_seed is not None:
+                 random_seed += 1
+
              # Flatten the input and response
              input_ids_array = (
                  input_ids if isinstance(input_ids, (list, str)) else input_ids[0]
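The seeding behavior added above is easiest to see in isolation. The sketch below assumes a Hugging Face-style model, where `random_seed` maps onto `torch.random.manual_seed` as shown in the huggingface_load.py hunk; the `run_trial` callable is a hypothetical stand-in for the actual generate call.

import torch

def seeded_trials(run_trial, n_trials=2, random_seed=1):
    """Illustrative only: mirrors how LLMPrompt reseeds, then increments the seed per trial."""
    for _ in range(n_trials):
        if random_seed is None:
            torch.random.seed()  # non-repeatable results
        else:
            torch.random.manual_seed(random_seed)
        run_trial()
        if random_seed is not None:
            random_seed += 1  # the next trial gets a different but repeatable seed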
src/lemonade/tools/server/instructions.py (new file)
@@ -0,0 +1,37 @@
+ from pathlib import Path
+ import json
+ from fastapi.responses import HTMLResponse
+ from lemonade_server.model_manager import ModelManager
+
+
+ def get_instructions_html(port=8000):
+     """
+     Show instructions on how to use the server.
+     """
+     # Load server models from JSON
+     server_models_path = (
+         Path(__file__).parent.parent.parent.parent
+         / "lemonade_server"
+         / "server_models.json"
+     )
+     with open(server_models_path, "r", encoding="utf-8") as f:
+         server_models = json.load(f)
+
+     # Use shared filter function from model_manager.py
+     filtered_models = ModelManager().filter_models_by_backend(server_models)
+
+     # Pass filtered server_models to JS
+     server_models_js = (
+         f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
+     )
+
+     # Load HTML template
+     template_path = Path(__file__).parent / "static" / "instructions.html"
+     with open(template_path, "r", encoding="utf-8") as f:
+         html_template = f.read()
+
+     # Replace template variables
+     html_content = html_template.replace("{{SERVER_PORT}}", str(port))
+     html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)
+
+     return HTMLResponse(content=html_content)
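The serve.py changes that mount this page are not shown in this extract, so the exact route is not visible here; a plausible wiring sketch (hypothetical route path and app object) would look like this:

# Hypothetical wiring sketch; the real route registration lives in serve.py,
# whose diff is not included in this extract.
from fastapi import FastAPI
from lemonade.tools.server.instructions import get_instructions_html

app = FastAPI()

@app.get("/")
async def home():
    # get_instructions_html returns a ready-to-serve HTMLResponse
    return get_instructions_html(port=8000)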
src/lemonade/tools/server/llamacpp.py
@@ -14,11 +14,11 @@ from fastapi.responses import StreamingResponse
  
  from openai import OpenAI
  
+ from lemonade_server.pydantic_models import ChatCompletionRequest
  from lemonade_server.model_manager import ModelManager
- from lemonade.tools.server.pydantic_models import ChatCompletionRequest
+ from lemonade.tools.server.port_utils import find_free_port
  
  LLAMA_VERSION = "b5543"
- LLAMA_SERVER_PORT = "8081"
  
  LLAMA_SERVER_EXE_DIR = os.path.join(
      os.path.dirname(sys.executable),
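The new port_utils.py module (+57 lines) is listed in the file summary but its body is not included in this extract, so the implementation of `find_free_port` is unknown here; a typical helper of that name looks roughly like the following sketch, which is an assumption rather than the shipped code:

import socket
from typing import Optional

def find_free_port() -> Optional[int]:
    """Ask the OS for a free ephemeral port; return None if binding fails."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("localhost", 0))  # port 0 lets the OS choose a free port
            return s.getsockname()[1]
    except OSError:
        return None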
@@ -43,6 +43,23 @@ class LlamaTelemetry:
          self.tokens_per_second = None
          self.prompt_eval_time = None
          self.eval_time = None
+         self.port = None
+
+     def choose_port(self):
+         """
+         Users probably don't care what port we start llama-server on, so let's
+         search for an empty port
+         """
+
+         self.port = find_free_port()
+
+         if self.port is None:
+             msg = "Failed to find an empty port to start llama-server on"
+             logging.error(msg)
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=msg,
+             )
  
      def parse_telemetry_line(self, line: str):
          """
@@ -128,10 +145,12 @@ def _log_subprocess_output(
              break
  
  
- def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+ def _wait_for_load(
+     llama_server_process: subprocess.Popen, port: int, fail_message: str
+ ):
      status_code = None
      while not llama_server_process.poll() and status_code != 200:
-         health_url = f"http://localhost:{LLAMA_SERVER_PORT}/health"
+         health_url = f"http://localhost:{port}/health"
          try:
              health_response = requests.get(health_url)
          except requests.exceptions.ConnectionError:
@@ -146,19 +165,25 @@ def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
  
  
  def _launch_llama_subprocess(
-     model_path: str, use_gpu: bool, telemetry: LlamaTelemetry
+     snapshot_files: dict, use_gpu: bool, telemetry: LlamaTelemetry
  ) -> subprocess.Popen:
      """
      Launch llama server subprocess with GPU or CPU configuration
      """
  
-     base_command = [
-         LLAMA_SERVER_EXE_PATH,
-         "-m",
-         model_path,
-         "--port",
-         LLAMA_SERVER_PORT,
-     ]
+     # Build the base command
+     base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
+     if "mmproj" in snapshot_files:
+         base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+         if not use_gpu:
+             base_command.extend(["--no-mmproj-offload"])
+
+     # Find a port, and save it in the telemetry object for future reference
+     # by other functions
+     telemetry.choose_port()
+
+     # Add port and jinja to enable tool use
+     base_command.extend(["--port", str(telemetry.port), "--jinja"])
  
      # Configure GPU layers: 99 for GPU, 0 for CPU-only
      ngl_value = "99" if use_gpu else "0"
@@ -180,7 +205,7 @@
      return process
  
  
- def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
+ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
      # Download llama.cpp server if it isn't already available
      if not os.path.exists(LLAMA_SERVER_EXE_DIR):
          # Download llama.cpp server zip
@@ -212,33 +237,34 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
          logging.info("Cleaned up zip file")
  
      # Download the gguf to the hugging face cache
-     snapshot_path = ModelManager().download_gguf(checkpoint)
-     model_path = os.path.join(snapshot_path, os.listdir(snapshot_path)[0])
-     logging.debug(f"GGUF file path: {model_path}")
+     snapshot_files = ModelManager().download_gguf(model_config)
+     logging.debug(f"GGUF file paths: {snapshot_files}")
  
      # Start the llama-serve.exe process
      logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")
  
      # Attempt loading on GPU first
      llama_server_process = _launch_llama_subprocess(
-         model_path, use_gpu=True, telemetry=telemetry
+         snapshot_files, use_gpu=True, telemetry=telemetry
      )
  
      # Check the /health endpoint until GPU server is ready
      _wait_for_load(
          llama_server_process,
+         telemetry.port,
          f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
      )
  
      # If loading on GPU failed, try loading on CPU
      if llama_server_process.poll():
          llama_server_process = _launch_llama_subprocess(
-             model_path, use_gpu=False, telemetry=telemetry
+             snapshot_files, use_gpu=False, telemetry=telemetry
          )
  
          # Check the /health endpoint until CPU server is ready
          _wait_for_load(
              llama_server_process,
+             telemetry.port,
              f"Loading {model_reference} on CPU didn't work",
          )
@@ -254,7 +280,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
  def chat_completion(
      chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
  ):
-     base_url = f"http://127.0.0.1:{LLAMA_SERVER_PORT}/v1"
+     base_url = f"http://127.0.0.1:{telemetry.port}/v1"
      client = OpenAI(
          base_url=base_url,
          api_key="lemonade",
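Since the llama-server port is now chosen dynamically, clients have to read it from the telemetry object rather than assuming 8081. A minimal sketch of talking to the wrapped server directly (the port value and model name below are hypothetical, for illustration only):

from openai import OpenAI

# Assume telemetry.port was populated by choose_port(); 8081 is no longer hardcoded
port = 8123  # hypothetical value of telemetry.port
client = OpenAI(base_url=f"http://127.0.0.1:{port}/v1", api_key="lemonade")

completion = client.chat.completions.create(
    model="any-loaded-gguf",  # hypothetical model reference
    messages=[{"role": "user", "content": "Hello"}],
)
print(completion.choices[0].message.content)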