lemonade-sdk 8.0.4__py3-none-any.whl → 8.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of lemonade-sdk has been flagged as potentially problematic.

@@ -1,12 +1,17 @@
 import argparse
 import abc
+import json
 from typing import List
 import lemonade.common.filesystem as fs
 import lemonade.common.exceptions as exp
 import lemonade.common.printing as printing
 from lemonade.tools.tool import ToolParser
 from lemonade.version import __version__ as lemonade_version
-from lemonade.common.system_info import get_system_info_dict
+from lemonade.common.system_info import (
+    get_system_info_dict,
+    get_device_info_dict,
+    get_system_info,
+)
 from lemonade.common.build import output_dir
 import lemonade.cache as lemonade_cache
 
@@ -245,28 +250,69 @@ class SystemInfo(ManagementTool):
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Print system information",
+            short_description="Print system and device information",
             add_help=add_help,
         )
 
+        parser.add_argument(
+            "--format", choices=["table", "json"], default="table", help="Output format"
+        )
+
+        parser.add_argument(
+            "--verbose",
+            action="store_true",
+            help="Show detailed system information",
+        )
+
         return parser
 
     @staticmethod
     def pretty_print(my_dict: dict, level=0):
         for k, v in my_dict.items():
+            if k == "available" and v is True:
+                continue
+
             if isinstance(v, dict):
-                print(" " * level + f"{k}:")
-                SystemInfo.pretty_print(v, level + 1)
+                # Special handling for device availability
+                if v.get("available") is False:
+                    error_msg = v.get("error", "Not available")
+                    print(" " * level + f"{k}: {error_msg}")
+                else:
+                    print(" " * level + f"{k}:")
+                    SystemInfo.pretty_print(v, level + 1)
             elif isinstance(v, list):
                 print(" " * level + f"{k}:")
                 for item in v:
-                    print(" " * (level + 1) + f"{item}")
+                    if isinstance(item, dict):
+                        SystemInfo.pretty_print(item, level + 1)
+                        print()
+                    else:
+                        print(" " * (level + 1) + f"{item}")
             else:
                 print(" " * level + f"{k}: {v}")
 
-    def run(self, _):
+    def run(self, _, format="table", verbose=False):
+        # Get basic system info
         system_info_dict = get_system_info_dict()
-        self.pretty_print(system_info_dict)
+
+        # Always include devices
+        system_info_dict["Devices"] = get_device_info_dict()
+
+        # Filter out verbose-only information if not in verbose mode
+        if not verbose:
+            essential_keys = ["OS Version", "Processor", "Physical Memory", "Devices"]
+            system_info_dict = {
+                k: v for k, v in system_info_dict.items() if k in essential_keys
+            }
+        else:
+            # In verbose mode, add Python packages at the end
+            system_info = get_system_info()
+            system_info_dict["Python Packages"] = system_info.get_python_packages()
+
+        if format == "json":
+            print(json.dumps(system_info_dict, indent=2))
+        else:
+            self.pretty_print(system_info_dict)
 
 
 # This file was originally licensed under Apache 2.0. It has been modified.
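
For reference, here is a minimal sketch of what the new non-verbose "--format json" path assembles. It mirrors the run() logic above and uses only functions imported in this diff; it is illustrative, not part of the package:

    import json
    from lemonade.common.system_info import get_system_info_dict, get_device_info_dict

    # Reproduce the non-verbose JSON output of the updated SystemInfo tool
    info = get_system_info_dict()
    info["Devices"] = get_device_info_dict()
    essential_keys = ["OS Version", "Processor", "Physical Memory", "Devices"]
    info = {k: v for k, v in info.items() if k in essential_keys}
    print(json.dumps(info, indent=2))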
@@ -74,12 +74,12 @@ class OgaBench(Bench):
 
         # Don't capture time for warmup
         for count in range(warmup_iterations):
-            outputs = model.generate(input_ids, max_new_tokens=output_tokens)
-            self.tokens_out_len_list.append(len(outputs[0]) - len(input_ids))
+            _ = model.generate(input_ids, max_new_tokens=output_tokens)
+            self.tokens_out_len_list.append(model.response_tokens)
             report_progress_fn((count + 1) / (warmup_iterations + iterations))
 
         for count in range(iterations):
-            outputs = model.generate(
+            _ = model.generate(
                 input_ids,
                 max_new_tokens=output_tokens,
                 min_new_tokens=output_tokens,
@@ -88,11 +88,10 @@ class OgaBench(Bench):
                 (warmup_iterations + count + 1) / (warmup_iterations + iterations)
             )
 
-            token_len = len(outputs[0]) - len(input_ids)
-            self.tokens_out_len_list.append(token_len)
+            self.tokens_out_len_list.append(model.response_tokens)
 
             # Only count an iteration if it produced enough tokens
-            if token_len >= output_tokens:
+            if model.response_tokens >= output_tokens:
                 per_iteration_time_to_first_token.append(model.time_to_first_token)
                 per_iteration_tokens_per_second.append(model.tokens_per_second)
 
@@ -99,13 +99,16 @@ class OrtGenaiModel(ModelAdapter):
     ):
         params = og.GeneratorParams(self.model)
 
+        # OGA models return a list of tokens (older versions) or 1d numpy array (newer versions)
         prompt_length = len(input_ids)
+
         max_prompt_length = self.config.get("max_prompt_length")
         if max_prompt_length and prompt_length > max_prompt_length:
             raise ValueError(
                 f"This prompt (length {prompt_length}) exceeds the model's "
                 f"maximum allowed prompt length ({max_prompt_length})."
             )
+        self.prompt_tokens = prompt_length
 
         # There is a breaking API change in OGA 0.6.0
         # Determine whether we should use the old or new APIs
@@ -206,18 +209,21 @@ class OrtGenaiModel(ModelAdapter):
             )
             self.tokens_per_second = 1 / avg_token_gen_latency_s
 
-            return [generator.get_sequence(0)]
+            response = generator.get_sequence(0)
+            self.response_tokens = len(response) - self.prompt_tokens
+            return [response]
         else:
             if use_oga_post_6_api:
                 generator.append_tokens(input_ids)
             tokenizer_stream = streamer.tokenizer.tokenizer.create_stream()
-
+            self.response_tokens = 0
             stop_early = False
 
             while not generator.is_done() and not stop_early:
                 if use_oga_pre_6_api:
                     generator.compute_logits()
                 generator.generate_next_token()
+                self.response_tokens += 1
 
                 new_token = generator.get_next_tokens()[0]
                 new_text = tokenizer_stream.decode(new_token)
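
The two OrtGenaiModel hunks above add prompt_tokens and response_tokens attributes that downstream tools (OgaBench, LLMPrompt) now read instead of re-deriving token counts from the returned sequence. A hedged sketch of how a caller consumes them; model and input_ids are placeholders for an already-constructed adapter and tokenized prompt:

    # After generate(), the adapter exposes the token counts directly
    _ = model.generate(input_ids, max_new_tokens=64)
    print("prompt tokens:", model.prompt_tokens)      # set from len(input_ids)
    print("response tokens:", model.response_tokens)  # counted during generation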
lemonade/tools/prompt.py CHANGED
@@ -161,7 +161,11 @@ class LLMPrompt(Tool):
         # If template flag is set, then wrap prompt in template
         if template:
             # Embed prompt in model's chat template
-            if tokenizer.chat_template:
+            if not hasattr(tokenizer, "prompt_template"):
+                printing.log_warning(
+                    "Templates for this model type are not yet implemented."
+                )
+            elif tokenizer.chat_template:
                 # Use the model's built-in chat template if available
                 messages_dict = [{"role": "user", "content": prompt}]
                 prompt = tokenizer.apply_chat_template(
@@ -175,25 +179,10 @@ class LLMPrompt(Tool):
                 state.save_stat(Keys.PROMPT_TEMPLATE, "Default")
 
         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-        if isinstance(input_ids, (list, str)):
-            # OGA models return a list of tokens (older versions)
-            # Our llama.cpp adapter returns a string
-            len_tokens_in = len(input_ids)
-        elif hasattr(input_ids, "shape"):
-            # HF models return a 2-D tensor
-            # OGA models with newer versions may return numpy arrays
-            if len(input_ids.shape) == 1:
-                # 1-D array from newer OGA versions
-                len_tokens_in = len(input_ids)
-            else:
-                # 2-D tensor from HF models
-                len_tokens_in = input_ids.shape[1]
-        else:
-            # Fallback: try to get length directly
-            len_tokens_in = len(input_ids)
 
         len_tokens_out = []
         response_texts = []
+        prompt_tokens = None  # will be determined in generate function
         for trial in range(n_trials):
             if n_trials > 1:
                 self.set_percent_progress(100.0 * trial / n_trials)
@@ -222,19 +211,22 @@ class LLMPrompt(Tool):
 
             response_array = response if isinstance(response, str) else response[0]
 
-            # Separate the prompt from the response
-            len_tokens_out.append(len(response_array) - len_tokens_in)
+            prompt_tokens = model.prompt_tokens
+            len_tokens_out.append(model.response_tokens)
 
-            input_token = 0
+            # Remove the input from the response
+            # (up to the point they diverge, which they should not)
+            counter = 0
+            len_input_ids = len(input_ids_array)
             while (
-                input_token < len_tokens_in
-                and input_ids_array[input_token] == response_array[input_token]
+                counter < len_input_ids
+                and input_ids_array[counter] == response_array[counter]
             ):
-                input_token += 1
+                counter += 1
 
             # Only decode the actual response (not the prompt)
             response_text = tokenizer.decode(
-                response_array[input_token:], skip_special_tokens=True
+                response_array[counter:], skip_special_tokens=True
             ).strip()
             response_texts.append(response_text)
 
@@ -259,7 +251,7 @@ class LLMPrompt(Tool):
             plt.savefig(figure_path)
             state.save_stat(Keys.RESPONSE_LENGTHS_HISTOGRAM, figure_path)
 
-        state.save_stat(Keys.PROMPT_TOKENS, len_tokens_in)
+        state.save_stat(Keys.PROMPT_TOKENS, prompt_tokens)
         state.save_stat(Keys.PROMPT, prompt)
         state.save_stat(Keys.RESPONSE_TOKENS, len_tokens_out)
         state.save_stat(Keys.RESPONSE, sanitize_text(response_texts))
@@ -758,15 +758,18 @@ class LemonadePerfTable(Table):
             data[key] = model_stats.get(key, "")
 
         # Create a new entry with Driver Versions and relevant Python Packages
-        sw_versions = [
-            key + ": " + value
-            for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
-        ]
-        sw_versions += [
-            pkg
-            for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
-            if any(name in pkg for name in PYTHON_PACKAGES)
-        ]
+        sw_versions = []
+        if "Driver Versions" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                key + ": " + value
+                for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
+            ]
+        if "Python Packages" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                pkg
+                for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
+                if any(name in pkg for name in PYTHON_PACKAGES)
+            ]
         if isinstance(data[Keys.RYZEN_AI_VERSION_INFO], dict):
             sw_versions += [
                 "Ryzen AI: " + value
@@ -1,13 +1,11 @@
-import sys
 import os
+import sys
 import logging
 import time
 import subprocess
-import zipfile
 import re
 import threading
 import platform
-import shutil
 
 import requests
 from tabulate import tabulate
@@ -18,12 +16,18 @@ from openai import OpenAI
 
 from lemonade_server.pydantic_models import (
     ChatCompletionRequest,
+    CompletionRequest,
     PullConfig,
     EmbeddingsRequest,
     RerankingRequest,
 )
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.server.utils.port import find_free_port
+from lemonade.tools.llamacpp.utils import (
+    get_llama_server_exe_path,
+    install_llamacpp,
+    download_gguf,
+)
 
 LLAMA_VERSION = "b5787"
 
@@ -80,39 +84,6 @@ def get_binary_url_and_filename(version):
     return url, filename
 
 
-def validate_platform_support():
-    """
-    Validate platform support before attempting download
-    """
-    system = platform.system().lower()
-
-    if system not in ["windows", "linux"]:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=(
-                f"Platform {system} not supported for llamacpp. "
-                "Supported: Windows, Ubuntu Linux"
-            ),
-        )
-
-    if system == "linux":
-        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
-        try:
-            with open("/etc/os-release", "r", encoding="utf-8") as f:
-                os_info = f.read().lower()
-                if "ubuntu" not in os_info and "debian" not in os_info:
-                    logging.warning(
-                        "llamacpp binaries are built for Ubuntu. "
-                        "Compatibility with other Linux distributions is not guaranteed."
-                    )
-        except (FileNotFoundError, PermissionError, OSError) as e:
-            logging.warning(
-                "Could not determine Linux distribution (%s). "
-                "llamacpp binaries are built for Ubuntu.",
-                str(e),
-            )
-
-
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
@@ -283,7 +254,7 @@ def _launch_llama_subprocess(
     """
 
     # Get the current executable path (handles both Windows and Ubuntu structures)
-    _, exe_path = get_llama_server_paths()
+    exe_path = get_llama_server_exe_path()
 
     # Build the base command
     base_command = [exe_path, "-m", snapshot_files["variant"]]
@@ -350,68 +321,23 @@
 
 
 def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
-    # Validate platform support before proceeding
-    validate_platform_support()
+    # Install and/or update llama.cpp if needed
+    try:
+        install_llamacpp()
+    except NotImplementedError as e:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
+        )
 
     # Get platform-specific paths at runtime
-    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
-
-    # Check whether the llamacpp install needs an upgrade
-    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
-    if os.path.exists(version_txt_path):
-        with open(version_txt_path, "r", encoding="utf-8") as f:
-            llamacpp_installed_version = f.read()
-
-        if llamacpp_installed_version != LLAMA_VERSION:
-            # Remove the existing install, which will trigger a new install
-            # in the next code block
-            shutil.rmtree(llama_server_exe_dir)
-
-    # Download llama.cpp server if it isn't already available
-    if not os.path.exists(llama_server_exe_dir):
-        # Download llama.cpp server zip
-        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
-        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
-        logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
-
-        with requests.get(llama_zip_url, stream=True) as r:
-            r.raise_for_status()
-            with open(llama_zip_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-
-        # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
-        with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(llama_server_exe_dir)
-
-        # Make executable on Linux - need to update paths after extraction
-        if platform.system().lower() == "linux":
-            # Re-get the paths since extraction might have changed the directory structure
-            _, updated_exe_path = get_llama_server_paths()
-            if os.path.exists(updated_exe_path):
-                os.chmod(updated_exe_path, 0o755)
-                logging.info(f"Set executable permissions for {updated_exe_path}")
-            else:
-                logging.warning(
-                    f"Could not find llama-server executable at {updated_exe_path}"
-                )
-
-        # Save version.txt
-        with open(version_txt_path, "w", encoding="utf-8") as vf:
-            vf.write(LLAMA_VERSION)
-
-        # Delete zip file
-        os.remove(llama_zip_path)
-        logging.info("Cleaned up zip file")
+    llama_server_exe_path = get_llama_server_exe_path()
 
     # Download the gguf to the hugging face cache
-    model_manager = ModelManager()
-    snapshot_files = model_manager.download_gguf(model_config)
+    snapshot_files = download_gguf(model_config.checkpoint, model_config.mmproj)
     logging.debug(f"GGUF file paths: {snapshot_files}")
 
     # Check if model supports embeddings
-    supported_models = model_manager.supported_models
+    supported_models = ModelManager().supported_models
     model_info = supported_models.get(model_config.model_name, {})
    supports_embeddings = "embeddings" in model_info.get("labels", [])
    supports_reranking = "reranking" in model_info.get("labels", [])
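
server_load() now delegates installation and model download to helpers in lemonade.tools.llamacpp.utils. A hedged sketch of that flow in isolation; argument values are placeholders, and the call signatures follow only the usage shown in this diff:

    from lemonade.tools.llamacpp.utils import (
        get_llama_server_exe_path,
        install_llamacpp,
        download_gguf,
    )

    install_llamacpp()                      # install or upgrade the pinned llama.cpp build
    exe_path = get_llama_server_exe_path()  # platform-specific llama-server binary
    # arguments mirror model_config.checkpoint / model_config.mmproj above
    snapshot_files = download_gguf("<checkpoint>", None)
    print(exe_path, snapshot_files)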
@@ -523,6 +449,68 @@ def chat_completion(
     )
 
 
+def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry):
+    """
+    Handle text completions using the llamacpp server.
+
+    Args:
+        completion_request: The completion request containing prompt and parameters
+        telemetry: Telemetry object containing the server port
+
+    Returns:
+        Completion response from the llamacpp server
+    """
+    base_url = llamacpp_address(telemetry.port)
+    client = OpenAI(
+        base_url=base_url,
+        api_key="lemonade",
+    )
+
+    # Convert Pydantic model to dict and remove unset/null values
+    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
+
+    # Check if streaming is requested
+    if completion_request.stream:
+
+        def event_stream():
+            try:
+                # Enable streaming
+                for chunk in client.completions.create(**request_dict):
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                yield "data: [DONE]\n\n"
+
+                # Show telemetry after completion
+                telemetry.show_telemetry()
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+        return StreamingResponse(
+            event_stream(),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+    else:
+        # Non-streaming response
+        try:
+            # Disable streaming for non-streaming requests
+            response = client.completions.create(**request_dict)
+
+            # Show telemetry after completion
+            telemetry.show_telemetry()
+
+            return response
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Completion error: {str(e)}",
+            )
+
+
 def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
     """
     Generate embeddings using the llamacpp server.
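
The new completion() handler proxies OpenAI-style text completions to the llama.cpp server. A hedged client-side sketch of exercising it through Lemonade Server; the base_url, port, and model name are assumptions for illustration, so adjust them to your running instance:

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")
    response = client.completions.create(
        model="<served-gguf-model>",   # placeholder model name
        prompt="Hello, my name is",
        max_tokens=16,
    )
    print(response.choices[0].text)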
@@ -228,6 +228,7 @@ class Server(ManagementTool):
         self.app.get(f"{prefix}/health")(self.health)
         self.app.get(f"{prefix}/halt")(self.halt_generation)
         self.app.get(f"{prefix}/stats")(self.send_stats)
+        self.app.get(f"{prefix}/system-info")(self.get_system_info)
         self.app.post(f"{prefix}/completions")(self.completions)
         self.app.post(f"{prefix}/responses")(self.responses)
 
@@ -486,6 +487,9 @@
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)
 
+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.completion(completion_request, self.llama_telemetry)
+
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
 
@@ -1276,6 +1280,34 @@
             ),
         }
 
+    async def get_system_info(self, request: Request):
+        """
+        Return system and device enumeration information.
+        Supports optional 'verbose' query parameter.
+        """
+        from lemonade.common.system_info import (
+            get_system_info_dict,
+            get_device_info_dict,
+            get_system_info as get_system_info_obj,
+        )
+
+        # Get verbose parameter from query string (default to False)
+        verbose = request.query_params.get("verbose", "false").lower() in ["true", "1"]
+
+        info = get_system_info_dict()
+        info["devices"] = get_device_info_dict()
+
+        # Filter out verbose-only information if not in verbose mode
+        if not verbose:
+            essential_keys = ["OS Version", "Processor", "Physical Memory", "devices"]
+            info = {k: v for k, v in info.items() if k in essential_keys}
+        else:
+            # In verbose mode, add Python packages at the end
+            system_info_obj = get_system_info_obj()
+            info["Python Packages"] = system_info_obj.get_python_packages()
+
+        return info
+
     def model_load_failure(self, model_reference: str, message: Optional[str] = None):
         """
         Clean up after a model load failure, then log it and raise
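
The get_system_info handler above is registered at {prefix}/system-info and honors a verbose query parameter. A hedged sketch of querying it; the host, port, and "/api/v1" prefix are assumptions, while the parameter name comes from the handler itself:

    import requests

    resp = requests.get(
        "http://localhost:8000/api/v1/system-info",
        params={"verbose": "true"},  # omit for the essential-keys summary
    )
    print(resp.json().get("devices"))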