lemonade-sdk 8.1.1__tar.gz → 8.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

Files changed (84)
  1. {lemonade_sdk-8.1.1/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.3}/PKG-INFO +7 -6
  2. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/README.md +1 -1
  3. lemonade_sdk-8.1.3/pyproject.toml +8 -0
  4. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/setup.py +8 -7
  5. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/inference_engines.py +1 -1
  6. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/utils.py +114 -14
  7. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/management_tools.py +1 -1
  8. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/utils.py +54 -33
  9. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/llamacpp.py +96 -4
  10. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/serve.py +80 -10
  11. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/chat.js +735 -0
  12. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/model-settings.js +162 -0
  13. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/models.js +865 -0
  14. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/shared.js +491 -0
  15. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/static/styles.css +652 -26
  16. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/webapp.html +257 -0
  17. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/tray.py +1 -1
  18. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/port.py +5 -4
  19. lemonade_sdk-8.1.3/src/lemonade/version.py +1 -0
  20. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3/src/lemonade_sdk.egg-info}/PKG-INFO +7 -6
  21. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/SOURCES.txt +5 -0
  22. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/entry_points.txt +1 -0
  23. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/requires.txt +7 -5
  24. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_server/cli.py +66 -17
  25. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_server/model_manager.py +1 -1
  26. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_server/pydantic_models.py +15 -3
  27. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_server/server_models.json +54 -3
  28. lemonade_sdk-8.1.1/src/lemonade/tools/server/static/webapp.html +0 -1203
  29. lemonade_sdk-8.1.1/src/lemonade/version.py +0 -1
  30. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/LICENSE +0 -0
  31. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/NOTICE.md +0 -0
  32. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/setup.cfg +0 -0
  33. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/__init__.py +0 -0
  34. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/api.py +0 -0
  35. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/cache.py +0 -0
  36. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/cli.py +0 -0
  37. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/__init__.py +0 -0
  38. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/build.py +0 -0
  39. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/cli_helpers.py +0 -0
  40. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/exceptions.py +0 -0
  41. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/filesystem.py +0 -0
  42. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/network.py +0 -0
  43. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/printing.py +0 -0
  44. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/status.py +0 -0
  45. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/system_info.py +0 -0
  46. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/test_helpers.py +0 -0
  47. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/profilers/__init__.py +0 -0
  48. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/profilers/memory_tracker.py +0 -0
  49. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/profilers/profiler.py +0 -0
  50. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/sequence.py +0 -0
  51. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/state.py +0 -0
  52. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/__init__.py +0 -0
  53. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/accuracy.py +0 -0
  54. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/adapter.py +0 -0
  55. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/bench.py +0 -0
  56. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/bench.py +0 -0
  57. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/load.py +0 -0
  58. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/utils.py +0 -0
  59. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/humaneval.py +0 -0
  60. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/bench.py +0 -0
  61. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/load.py +0 -0
  62. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/mmlu.py +0 -0
  63. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/__init__.py +0 -0
  64. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/bench.py +0 -0
  65. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/load.py +0 -0
  66. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/perplexity.py +0 -0
  67. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/prompt.py +0 -0
  68. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/__init__.py +0 -0
  69. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/quark_load.py +0 -0
  70. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/quark_quantize.py +0 -0
  71. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/__init__.py +0 -0
  72. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/llm_report.py +0 -0
  73. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/table.py +0 -0
  74. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/__init__.py +0 -0
  75. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/static/favicon.ico +0 -0
  76. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/tool_calls.py +0 -0
  77. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/system_tray.py +0 -0
  78. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/thread.py +0 -0
  79. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/webapp.py +0 -0
  80. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/tool.py +0 -0
  81. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_install/__init__.py +0 -0
  82. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_install/install.py +0 -0
  83. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  84. {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,18 +1,18 @@
  Metadata-Version: 2.4
  Name: lemonade-sdk
- Version: 8.1.1
+ Version: 8.1.3
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
  Author-email: lemonade@amd.com
- Requires-Python: >=3.10, <3.13
+ Requires-Python: >=3.10, <3.14
  Description-Content-Type: text/markdown
  License-File: LICENSE
  License-File: NOTICE.md
  Requires-Dist: invoke>=2.0.0
- Requires-Dist: onnx<1.18.0,>=1.11.0
+ Requires-Dist: onnx==1.18.0
  Requires-Dist: pyyaml>=5.4
  Requires-Dist: typeguard>=2.3.13
  Requires-Dist: packaging>=20.9
- Requires-Dist: numpy<2.0.0
+ Requires-Dist: numpy
  Requires-Dist: fasteners
  Requires-Dist: GitPython>=3.1.40
  Requires-Dist: psutil>=6.1.1
@@ -41,9 +41,10 @@ Requires-Dist: accelerate; extra == "dev"
  Requires-Dist: datasets; extra == "dev"
  Requires-Dist: pandas>=1.5.3; extra == "dev"
  Requires-Dist: matplotlib; extra == "dev"
- Requires-Dist: model-generate==1.5.0; (platform_system == "Windows" and python_version == "3.10") and extra == "dev"
  Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
  Requires-Dist: lm-eval[api]; extra == "dev"
+ Provides-Extra: model-generate
+ Requires-Dist: model-generate==1.5.0; (platform_system == "Windows" and python_version == "3.10") and extra == "model-generate"
  Provides-Extra: oga-hybrid
  Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-hybrid"
  Provides-Extra: oga-unified
@@ -105,7 +106,7 @@ Dynamic: summary
  <img src="https://img.shields.io/badge/Ubuntu-24.04%20%7C%2025.04-E95420?logo=ubuntu&logoColor=white" alt="Ubuntu 24.04 | 25.04" />
  </a>
  <a href="docs/README.md#installation" title="Check out our instructions">
- <img src="https://img.shields.io/badge/Python-3.10%20%7C%203.12-blue?logo=python&logoColor=white" alt="Made with Python" />
+ <img src="https://img.shields.io/badge/Python-3.10--3.13-blue?logo=python&logoColor=white" alt="Made with Python" />
  </a>
  <a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/contribute.md" title="Contribution Guide">
  <img src="https://img.shields.io/badge/PRs-welcome-brightgreen.svg" alt="PRs Welcome" />
README.md
@@ -14,7 +14,7 @@
  <img src="https://img.shields.io/badge/Ubuntu-24.04%20%7C%2025.04-E95420?logo=ubuntu&logoColor=white" alt="Ubuntu 24.04 | 25.04" />
  </a>
  <a href="docs/README.md#installation" title="Check out our instructions">
- <img src="https://img.shields.io/badge/Python-3.10%20%7C%203.12-blue?logo=python&logoColor=white" alt="Made with Python" />
+ <img src="https://img.shields.io/badge/Python-3.10--3.13-blue?logo=python&logoColor=white" alt="Made with Python" />
  </a>
  <a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/contribute.md" title="Contribution Guide">
  <img src="https://img.shields.io/badge/PRs-welcome-brightgreen.svg" alt="PRs Welcome" />
pyproject.toml
@@ -0,0 +1,8 @@
+ [build-system]
+ requires = [
+     "setuptools>=68",
+     "wheel"
+ ]
+ build-backend = "setuptools.build_meta"
+
+
setup.py
@@ -28,13 +28,11 @@ setup(
          # Minimal dependencies required for end-users who are running
          # apps deployed on Lemonade SDK
          "invoke>=2.0.0",
-         "onnx>=1.11.0,<1.18.0",
+         "onnx==1.18.0",
          "pyyaml>=5.4",
          "typeguard>=2.3.13",
          "packaging>=20.9",
-         # Necessary until upstream packages account for the breaking
-         # change to numpy
-         "numpy<2.0.0",
+         "numpy",
          "fasteners",
          "GitPython>=3.1.40",
          "psutil>=6.1.1",
@@ -74,12 +72,14 @@ setup(
              "datasets",
              "pandas>=1.5.3",
              "matplotlib",
-             "model-generate==1.5.0; platform_system=='Windows' and python_version=='3.10'",
              # Install human-eval from a forked repo with Windows support until the
              # PR (https://github.com/openai/human-eval/pull/53) is merged
              "human-eval-windows==1.0.4",
              "lm-eval[api]",
          ],
+         "model-generate": [
+             "model-generate==1.5.0; platform_system=='Windows' and python_version=='3.10'",
+         ],
          # Keep backwards compatibility for old extras names
          "oga-hybrid": ["lemonade-sdk[oga-ryzenai]"],
          "oga-unified": ["lemonade-sdk[oga-ryzenai]"],
@@ -125,15 +125,16 @@ setup(
              "lemonade=lemonade:lemonadecli",
              "lemonade-install=lemonade_install:installcli",
              "lemonade-server-dev=lemonade_server.cli:main",
+             "lsdev=lemonade_server.cli:developer_entrypoint",
          ]
      },
-     python_requires=">=3.10, <3.13",
+     python_requires=">=3.10, <3.14",
      long_description=open("README.md", "r", encoding="utf-8").read(),
      long_description_content_type="text/markdown",
      include_package_data=True,
      package_data={
          "lemonade_server": ["server_models.json"],
-         "lemonade": ["tools/server/static/*"],
+         "lemonade": ["tools/server/static/**/*"],
      },
  )

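Note on the package_data change above: setuptools globs are not recursive, so the old pattern tools/server/static/* only matched files sitting directly in static/ and would have missed the new static/js/ modules added in this release. A minimal sketch of the distinction, using illustrative names rather than the package's full setup.py:

    from setuptools import setup, find_packages

    setup(
        name="example-pkg",  # hypothetical package, for illustration only
        package_dir={"": "src"},
        packages=find_packages("src"),
        include_package_data=True,
        package_data={
            # "tools/server/static/*" would skip static/js/chat.js and friends;
            # the recursive "**/*" pattern picks up subdirectories as well
            "lemonade": ["tools/server/static/**/*"],
        },
    )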
src/lemonade/common/inference_engines.py
@@ -5,7 +5,6 @@ import importlib.metadata
  import subprocess
  from abc import ABC, abstractmethod
  from typing import Dict, Optional
- import transformers


  class InferenceEngineDetector:
@@ -352,6 +351,7 @@ class TransformersDetector(BaseEngineDetector):

          try:
              import torch
+             import transformers

              if device_type == "cpu":
                  result = {
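Moving import transformers from module scope into the detector's try block is a lazy-import fix: importing inference_engines no longer requires transformers to be installed, and the dependency is only resolved when the detector actually runs. A minimal sketch of the pattern (hypothetical function, not the package's API):

    def detect_transformers_engine():
        """Return engine info if the transformers stack is importable, else None."""
        try:
            import torch  # noqa: F401  (heavy optional dependency, resolved lazily)
            import transformers
        except ImportError:
            return None  # engine unavailable; other detectors are unaffected
        return {"engine": "transformers", "version": transformers.__version__}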
src/lemonade/tools/llamacpp/utils.py
@@ -57,7 +57,7 @@ def identify_rocm_arch_from_name(device_name: str) -> str | None:
      return None


- def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
+ def identify_rocm_arch() -> str:
      """
      Identify the appropriate ROCm target architecture based on the device info
      Returns tuple of (architecture, gpu_type) where gpu_type is 'igpu' or 'dgpu'
@@ -68,21 +68,54 @@ def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
      amd_igpu = system_info.get_amd_igpu_device()
      amd_dgpu = system_info.get_amd_dgpu_devices()
      target_arch = None
-     gpu_count = 0
      for gpu in [amd_igpu] + amd_dgpu:
          if gpu.get("available") and gpu.get("name"):
-             gpu_count += 1
              target_arch = identify_rocm_arch_from_name(gpu["name"].lower())
              if target_arch:
                  break

-     # Get HIP ID based on the number of GPUs available
-     # Here, we assume that the iGPU will always show up before the dGPUs (if available)
-     # We also assume that selecting the dGPU is preferred over the iGPU
-     # Multiple GPUs are not supported at the moment
-     hip_id = str(gpu_count - 1)
+     return target_arch

-     return target_arch, hip_id
+
+ def identify_hip_id() -> str:
+     """
+     Identify the HIP ID
+     """
+     # Get HIP devices
+     hip_devices = get_hip_devices()
+     logging.debug(f"HIP devices found: {hip_devices}")
+     if len(hip_devices) == 0:
+         raise ValueError("No HIP devices found when identifying HIP ID")
+
+     # Identify HIP devices that are compatible with our ROCm builds
+     rocm_devices = []
+     for device in hip_devices:
+         device_id, device_name = device
+         if identify_rocm_arch_from_name(device_name):
+             rocm_devices.append([device_id, device_name])
+     logging.debug(f"ROCm devices found: {rocm_devices}")
+
+     # If no ROCm devices are found, use the last HIP device
+     # This might be needed in some scenarios where HIP reports generic device names
+     # Example: "AMD Radeon Graphics" for STX Halo iGPU on Ubuntu 24.04
+     if len(rocm_devices) == 0:
+         rocm_devices = [hip_devices[-1]]
+         logging.warning(
+             "No ROCm devices found when identifying HIP ID. "
+             f"Falling back to the following device: {rocm_devices[0]}"
+         )
+     elif len(rocm_devices) > 1:
+         logging.warning(
+             f"Multiple ROCm devices found when identifying HIP ID: {rocm_devices}"
+             "The last device will be used."
+         )
+
+     # Select the last device
+     device_selected = rocm_devices[-1]
+     logging.debug(f"Selected ROCm device: {device_selected}")
+
+     # Return the device ID
+     return device_selected[0]


  def get_llama_version(backend: str) -> str:
@@ -277,7 +310,7 @@ def install_llamacpp(backend):
      target_arch = None
      if backend == "rocm":
          # Identify the target architecture
-         target_arch, hip_id = identify_rocm_arch_and_hip_id()
+         target_arch = identify_rocm_arch()
          if not target_arch:
              system = platform.system().lower()
              if system == "linux":
@@ -293,10 +326,6 @@
                  f"for supported configurations. {hint}"
              )

-         # Set HIP_VISIBLE_DEVICES=0 for igpu, =1 for dgpu
-         env_file_path = os.path.join(llama_server_exe_dir, ".env")
-         set_key(env_file_path, "HIP_VISIBLE_DEVICES", hip_id)
-
      # Direct download for Vulkan/ROCm
      llama_archive_url, filename = get_binary_url_and_filename(backend, target_arch)
      llama_archive_path = os.path.join(llama_server_exe_dir, filename)
@@ -315,6 +344,12 @@
      else:
          raise NotImplementedError(f"Unsupported archive format: {filename}")

+     # Identify and set HIP ID
+     if backend == "rocm":
+         hip_id = identify_hip_id()
+         env_file_path = os.path.join(llama_server_exe_dir, ".env")
+         set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))
+
      # Make executable on Linux - need to update paths after extraction
      if platform.system().lower() == "linux":
          # Re-get the paths since extraction might have changed the directory structure
@@ -778,3 +813,68 @@ class LlamaCppAdapter(ModelAdapter):
              error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
              error_msg += f"Command: {' '.join(cmd)}"
              raise Exception(error_msg)
+
+
+ def get_hip_devices():
+     """Get list of HIP devices with their IDs and names."""
+     import ctypes
+     import sys
+     import os
+     import glob
+     from ctypes import c_int, POINTER
+     from ctypes.util import find_library
+
+     # Get llama.cpp path
+     rocm_path = get_llama_folder_path("rocm")
+
+     # Load HIP library
+     hip_library_pattern = (
+         "amdhip64*.dll" if sys.platform.startswith("win") else "libamdhip64*.so"
+     )
+     search_pattern = os.path.join(rocm_path, hip_library_pattern)
+     matching_files = glob.glob(search_pattern)
+     if not matching_files:
+         raise RuntimeError(
+             f"Could not find HIP runtime library matching pattern: {search_pattern}"
+         )
+     try:
+         libhip = ctypes.CDLL(matching_files[0])
+     except OSError:
+         raise RuntimeError(f"Could not load HIP runtime library from {path}")
+
+     # Setup function signatures
+     hipError_t = c_int
+     hipDeviceProp_t = ctypes.c_char * 2048
+     libhip.hipGetDeviceCount.restype = hipError_t
+     libhip.hipGetDeviceCount.argtypes = [POINTER(c_int)]
+     libhip.hipGetDeviceProperties.restype = hipError_t
+     libhip.hipGetDeviceProperties.argtypes = [POINTER(hipDeviceProp_t), c_int]
+     libhip.hipGetErrorString.restype = ctypes.c_char_p
+     libhip.hipGetErrorString.argtypes = [hipError_t]
+
+     # Get device count
+     device_count = c_int()
+     err = libhip.hipGetDeviceCount(ctypes.byref(device_count))
+     if err != 0:
+         logging.error(
+             "hipGetDeviceCount failed:", libhip.hipGetErrorString(err).decode()
+         )
+         return []
+
+     # Get device properties
+     devices = []
+     for i in range(device_count.value):
+         prop = hipDeviceProp_t()
+         err = libhip.hipGetDeviceProperties(ctypes.byref(prop), i)
+         if err != 0:
+             logging.error(
+                 f"hipGetDeviceProperties failed for device {i}:",
+                 libhip.hipGetErrorString(err).decode(),
+             )
+             continue
+
+         # Extract device name from HIP device properties
+         device_name = ctypes.string_at(prop, 256).decode("utf-8").rstrip("\x00")
+         devices.append([i, device_name])
+
+     return devices
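The ordering in install_llamacpp matters: get_hip_devices loads the HIP runtime (amdhip64) out of the extracted llama.cpp folder itself, so the HIP ID can only be probed after the ROCm archive has been downloaded and unpacked. That is presumably why the HIP_VISIBLE_DEVICES step moved from before the download to after extraction. A simplified sketch of the resulting flow for the rocm backend, condensed from the diff above:

    # 1) Before download: map reported GPU names to a supported gfx architecture
    target_arch = identify_rocm_arch()
    # 2) Download and extract the ROCm llama.cpp build for target_arch
    # ...
    # 3) After extraction: enumerate GPUs via the bundled HIP runtime and pin one
    hip_id = identify_hip_id()
    env_file_path = os.path.join(llama_server_exe_dir, ".env")
    set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))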
src/lemonade/tools/management_tools.py
@@ -109,7 +109,7 @@ class Cache(ManagementTool):
      # pylint: disable=pointless-statement,f-string-without-interpolation
      f"""
      A set of functions for managing the lemonade build cache. The default
-     cache location is {lemonade_cache.DEFAULT_CACHE_DIR}, and can also be 
+     cache location is {lemonade_cache.DEFAULT_CACHE_DIR}, and can also be
      selected with
      the global --cache-dir option or the LEMONADE_CACHE_DIR environment variable.

src/lemonade/tools/oga/utils.py
@@ -100,9 +100,10 @@ class OrtGenaiModel(ModelAdapter):
          max_new_tokens=512,
          min_new_tokens=0,
          do_sample=True,
-         top_k=50,
-         top_p=1.0,
-         temperature=0.7,
+         top_k=None,
+         top_p=None,
+         temperature=None,
+         repeat_penalty=None,
          streamer: OrtGenaiStreamer = None,
          pad_token_id=None,
          stopping_criteria=None,
@@ -154,38 +155,58 @@
          if random_seed is None:
              random_seed = -1 # In og.Generator, -1 = seed with random device

+         # Get search config if available, otherwise use empty dict
+         # Thanks to the empty dict, if the model doesn't have a built-in search
+         # config, the .get() calls will all just use the default values
+         search_config = {}
          if self.config and "search" in self.config:
              search_config = self.config["search"]
-             params.set_search_options(
-                 do_sample=search_config.get("do_sample", do_sample),
-                 top_k=search_config.get("top_k", top_k),
-                 top_p=search_config.get("top_p", top_p),
-                 temperature=search_config.get("temperature", temperature),
-                 max_length=max_length_to_use,
-                 min_length=min_length,
-                 early_stopping=search_config.get("early_stopping", False),
-                 length_penalty=search_config.get("length_penalty", 1.0),
-                 num_beams=search_config.get("num_beams", 1),
-                 num_return_sequences=search_config.get("num_return_sequences", 1),
-                 repetition_penalty=search_config.get("repetition_penalty", 1.0),
-                 past_present_share_buffer=search_config.get(
-                     "past_present_share_buffer", True
-                 ),
-                 random_seed=random_seed,
-                 # Not currently supported by OGA
-                 # diversity_penalty=search_config.get('diversity_penalty', 0.0),
-                 # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
-             )
-         else:
-             params.set_search_options(
-                 do_sample=do_sample,
-                 top_k=top_k,
-                 top_p=top_p,
-                 temperature=temperature,
-                 max_length=max_length_to_use,
-                 min_length=min_length,
-                 random_seed=random_seed,
-             )
+
+         # Apply parameter hierarchy: user provided > search config > defaults
+         default_top_k = 50
+         default_top_p = 1.0
+         default_temperature = 0.7
+         default_repetition_penalty = 1.0
+
+         top_k_to_use = (
+             top_k if top_k is not None else search_config.get("top_k", default_top_k)
+         )
+         top_p_to_use = (
+             top_p if top_p is not None else search_config.get("top_p", default_top_p)
+         )
+         temperature_to_use = (
+             temperature
+             if temperature is not None
+             else search_config.get("temperature", default_temperature)
+         )
+         # Map the llamacpp name, `repeat_penalty`, to the OGA name, `repetition_penalty`
+         repetition_penalty_to_use = (
+             repeat_penalty
+             if repeat_penalty is not None
+             else search_config.get("repetition_penalty", default_repetition_penalty)
+         )
+
+         # Set search options once with all parameters
+         params.set_search_options(
+             do_sample=search_config.get("do_sample", do_sample),
+             top_k=top_k_to_use,
+             top_p=top_p_to_use,
+             temperature=temperature_to_use,
+             repetition_penalty=repetition_penalty_to_use,
+             max_length=max_length_to_use,
+             min_length=min_length,
+             early_stopping=search_config.get("early_stopping", False),
+             length_penalty=search_config.get("length_penalty", 1.0),
+             num_beams=search_config.get("num_beams", 1),
+             num_return_sequences=search_config.get("num_return_sequences", 1),
+             past_present_share_buffer=search_config.get(
+                 "past_present_share_buffer", True
+             ),
+             random_seed=random_seed,
+             # Not currently supported by OGA
+             # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+             # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+         )
          params.try_graph_capture_with_max_batch_size(1)

          generator = og.Generator(self.model, params)
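The rewrite collapses the old if/else into a single set_search_options call and makes the precedence explicit: a value the caller actually passed wins, then the model's built-in search config, then a hard-coded default. Switching the keyword defaults to None is what makes this work, since it lets the code distinguish "caller passed top_k=50" from "caller passed nothing". A minimal sketch of the rule (hypothetical helper, not part of the package):

    def _resolve(user_value, search_config: dict, key: str, default):
        """Caller-supplied value > model search config > hard-coded default."""
        return user_value if user_value is not None else search_config.get(key, default)

    search_config = {"temperature": 0.6}
    assert _resolve(None, search_config, "temperature", 0.7) == 0.6  # config beats default
    assert _resolve(0.2, search_config, "temperature", 0.7) == 0.2   # caller beats config
    assert _resolve(None, search_config, "top_k", 50) == 50          # default as fallback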
src/lemonade/tools/server/llamacpp.py
@@ -43,6 +43,72 @@ def llamacpp_address(port: int) -> str:
      return f"http://127.0.0.1:{port}/v1"


+ def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
+     """
+     Separate standard OpenAI parameters from custom llama.cpp parameters.
+
+     Args:
+         request_dict: Dictionary of all request parameters
+         endpoint_type: Type of endpoint ("chat" or "completion")
+
+     Returns:
+         Dictionary with parameters properly separated for OpenAI client
+     """
+     openai_client_params = {}
+     extra_params = {}
+
+     # Common OpenAI parameters for both endpoint types
+     common_params = {
+         "model",
+         "frequency_penalty",
+         "logit_bias",
+         "logprobs",
+         "max_tokens",
+         "n",
+         "presence_penalty",
+         "seed",
+         "stop",
+         "stream",
+         "temperature",
+         "top_p",
+         "user",
+     }
+
+     # Standard OpenAI parameters by endpoint type
+     if endpoint_type == "chat":
+         chat_specific_params = {
+             "messages",
+             "top_logprobs",
+             "response_format",
+             "service_tier",
+             "stream_options",
+             "tools",
+             "tool_choice",
+             "parallel_tool_calls",
+         }
+         openai_params = common_params | chat_specific_params
+     else: # completion
+         completion_specific_params = {
+             "prompt",
+             "best_of",
+             "echo",
+             "suffix",
+         }
+         openai_params = common_params | completion_specific_params
+
+     for key, value in request_dict.items():
+         if key in openai_params:
+             openai_client_params[key] = value
+         else:
+             extra_params[key] = value
+
+     # If there are custom parameters, use extra_body to pass them through
+     if extra_params:
+         openai_client_params["extra_body"] = extra_params
+
+     return openai_client_params
+
+
  class LlamaTelemetry:
      """
      Manages telemetry data collection and display for llama server.
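A worked example of what _separate_openai_params returns: anything outside the OpenAI schema, such as the llama.cpp samplers top_k or min_p, is routed into extra_body, which the OpenAI Python client merges back into the request JSON so llama-server still receives it. Values below are illustrative:

    request = {
        "model": "Llama-3.2-1B-Instruct-GGUF",            # illustrative model name
        "messages": [{"role": "user", "content": "hi"}],
        "temperature": 0.2,  # standard OpenAI parameter, passed through as-is
        "top_k": 40,         # llama.cpp-specific, routed into extra_body
        "min_p": 0.05,       # llama.cpp-specific, routed into extra_body
    }
    params = _separate_openai_params(request, "chat")
    # params == {"model": ..., "messages": ..., "temperature": 0.2,
    #            "extra_body": {"top_k": 40, "min_p": 0.05}}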
@@ -226,6 +292,11 @@ def _launch_llama_subprocess(
          "--ctx-size",
          str(ctx_size),
      ]
+
+     # Lock random seed for deterministic behavior in CI
+     if os.environ.get("LEMONADE_CI_MODE"):
+         base_command.extend(["--seed", "42"])
+
      if "mmproj" in snapshot_files:
          base_command.extend(["--mmproj", snapshot_files["mmproj"]])
      if not use_gpu:
@@ -238,6 +309,15 @@
      # Add port and jinja to enable tool use
      base_command.extend(["--port", str(telemetry.port), "--jinja"])

+     # Disable jinja for gpt-oss-120b on Vulkan
+     if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
+         base_command.remove("--jinja")
+         logging.warning(
+             "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+             "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+             "The model cannot use tools. If needed, use the ROCm backend instead."
+         )
+
      # Use legacy reasoning formatting, since not all apps support the new
      # reasoning_content field
      base_command.extend(["--reasoning-format", "none"])
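The CI toggle keys off the presence of LEMONADE_CI_MODE rather than its value, so any non-empty string enables the fixed seed (illustrative):

    import os

    os.environ["LEMONADE_CI_MODE"] = "1"  # e.g. exported by the CI workflow
    # _launch_llama_subprocess() then appends ["--seed", "42"] to the llama-server
    # command line, making token sampling reproducible across runs.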
@@ -384,13 +464,17 @@
          exclude_unset=True, exclude_none=True
      )

+     # Separate standard OpenAI parameters from custom llama.cpp parameters
+     openai_client_params = _separate_openai_params(request_dict, "chat")
+
      # Check if streaming is requested
      if chat_completion_request.stream:

          def event_stream():
              try:
                  # Enable streaming
-                 for chunk in client.chat.completions.create(**request_dict):
+                 # pylint: disable=missing-kwoa
+                 for chunk in client.chat.completions.create(**openai_client_params):
                      yield f"data: {chunk.model_dump_json()}\n\n"
                  yield "data: [DONE]\n\n"

@@ -412,7 +496,8 @@
      # Non-streaming response
      try:
          # Disable streaming for non-streaming requests
-         response = client.chat.completions.create(**request_dict)
+         # pylint: disable=missing-kwoa
+         response = client.chat.completions.create(**openai_client_params)

          # Show telemetry after completion
          telemetry.show_telemetry()
@@ -420,6 +505,7 @@

          return response

      except Exception as e: # pylint: disable=broad-exception-caught
+         logging.error("Error during chat completion: %s", str(e))
          raise HTTPException(
              status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
              detail=f"Chat completion error: {str(e)}",
@@ -446,13 +532,17 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
      # Convert Pydantic model to dict and remove unset/null values
      request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)

+     # Separate standard OpenAI parameters from custom llama.cpp parameters
+     openai_client_params = _separate_openai_params(request_dict, "completion")
+
      # Check if streaming is requested
      if completion_request.stream:

          def event_stream():
              try:
                  # Enable streaming
-                 for chunk in client.completions.create(**request_dict):
+                 # pylint: disable=missing-kwoa
+                 for chunk in client.completions.create(**openai_client_params):
                      yield f"data: {chunk.model_dump_json()}\n\n"
                  yield "data: [DONE]\n\n"

@@ -474,7 +564,8 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
      # Non-streaming response
      try:
          # Disable streaming for non-streaming requests
-         response = client.completions.create(**request_dict)
+         # pylint: disable=missing-kwoa
+         response = client.completions.create(**openai_client_params)

          # Show telemetry after completion
          telemetry.show_telemetry()
@@ -482,6 +573,7 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)

          return response

      except Exception as e: # pylint: disable=broad-exception-caught
+         logging.error("Error during completion: %s", str(e))
          raise HTTPException(
              status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
              detail=f"Completion error: {str(e)}",