lemonade-sdk 8.1.1.tar.gz → 8.1.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk has been flagged as potentially problematic.
- {lemonade_sdk-8.1.1/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.3}/PKG-INFO +7 -6
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/README.md +1 -1
- lemonade_sdk-8.1.3/pyproject.toml +8 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/setup.py +8 -7
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/inference_engines.py +1 -1
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/utils.py +114 -14
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/management_tools.py +1 -1
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/utils.py +54 -33
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/llamacpp.py +96 -4
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/serve.py +80 -10
- lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/chat.js +735 -0
- lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/model-settings.js +162 -0
- lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/models.js +865 -0
- lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/shared.js +491 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/static/styles.css +652 -26
- lemonade_sdk-8.1.3/src/lemonade/tools/server/static/webapp.html +257 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/tray.py +1 -1
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/port.py +5 -4
- lemonade_sdk-8.1.3/src/lemonade/version.py +1 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3/src/lemonade_sdk.egg-info}/PKG-INFO +7 -6
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/SOURCES.txt +5 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/entry_points.txt +1 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/requires.txt +7 -5
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_server/cli.py +66 -17
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_server/model_manager.py +1 -1
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_server/pydantic_models.py +15 -3
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_server/server_models.json +54 -3
- lemonade_sdk-8.1.1/src/lemonade/tools/server/static/webapp.html +0 -1203
- lemonade_sdk-8.1.1/src/lemonade/version.py +0 -1
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/LICENSE +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/NOTICE.md +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/setup.cfg +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/api.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/cache.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/cli.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/network.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/status.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/system_info.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/state.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/bench.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/bench.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/load.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/utils.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/bench.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/load.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/bench.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/load.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/prompt.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/quark_load.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/quark_quantize.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/table.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/static/favicon.ico +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/system_tray.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/thread.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/webapp.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_install/install.py +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-8.1.1 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,18 +1,18 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.1.1
+Version: 8.1.3
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
-Requires-Python: >=3.10, <3.
+Requires-Python: >=3.10, <3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: NOTICE.md
 Requires-Dist: invoke>=2.0.0
-Requires-Dist: onnx
+Requires-Dist: onnx==1.18.0
 Requires-Dist: pyyaml>=5.4
 Requires-Dist: typeguard>=2.3.13
 Requires-Dist: packaging>=20.9
-Requires-Dist: numpy<2.0.0
+Requires-Dist: numpy
 Requires-Dist: fasteners
 Requires-Dist: GitPython>=3.1.40
 Requires-Dist: psutil>=6.1.1
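The notable dependency changes above are the new onnx==1.18.0 pin and the removal of the numpy<2.0.0 cap. A quick stdlib check of what an installed copy declares (a sketch; assumes lemonade-sdk is installed in the current environment):

```python
# Sketch: list the onnx/numpy requirement strings an installed
# lemonade-sdk advertises (importlib.metadata is stdlib).
from importlib.metadata import requires

for req in requires("lemonade-sdk") or []:
    if req.startswith(("onnx", "numpy")):
        print(req)  # expect "onnx==1.18.0" and an uncapped "numpy"
```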
@@ -41,9 +41,10 @@ Requires-Dist: accelerate; extra == "dev"
 Requires-Dist: datasets; extra == "dev"
 Requires-Dist: pandas>=1.5.3; extra == "dev"
 Requires-Dist: matplotlib; extra == "dev"
-Requires-Dist: model-generate==1.5.0; (platform_system == "Windows" and python_version == "3.10") and extra == "dev"
 Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
 Requires-Dist: lm-eval[api]; extra == "dev"
+Provides-Extra: model-generate
+Requires-Dist: model-generate==1.5.0; (platform_system == "Windows" and python_version == "3.10") and extra == "model-generate"
 Provides-Extra: oga-hybrid
 Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-hybrid"
 Provides-Extra: oga-unified
@@ -105,7 +106,7 @@ Dynamic: summary
     <img src="https://img.shields.io/badge/Ubuntu-24.04%20%7C%2025.04-E95420?logo=ubuntu&logoColor=white" alt="Ubuntu 24.04 | 25.04" />
   </a>
   <a href="docs/README.md#installation" title="Check out our instructions">
-    <img src="https://img.shields.io/badge/Python-3.10
+    <img src="https://img.shields.io/badge/Python-3.10--3.13-blue?logo=python&logoColor=white" alt="Made with Python" />
   </a>
   <a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/contribute.md" title="Contribution Guide">
     <img src="https://img.shields.io/badge/PRs-welcome-brightgreen.svg" alt="PRs Welcome" />
README.md

@@ -14,7 +14,7 @@
     <img src="https://img.shields.io/badge/Ubuntu-24.04%20%7C%2025.04-E95420?logo=ubuntu&logoColor=white" alt="Ubuntu 24.04 | 25.04" />
   </a>
   <a href="docs/README.md#installation" title="Check out our instructions">
-    <img src="https://img.shields.io/badge/Python-3.10
+    <img src="https://img.shields.io/badge/Python-3.10--3.13-blue?logo=python&logoColor=white" alt="Made with Python" />
   </a>
   <a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/contribute.md" title="Contribution Guide">
     <img src="https://img.shields.io/badge/PRs-welcome-brightgreen.svg" alt="PRs Welcome" />
setup.py

@@ -28,13 +28,11 @@ setup(
         # Minimal dependencies required for end-users who are running
         # apps deployed on Lemonade SDK
         "invoke>=2.0.0",
-        "onnx
+        "onnx==1.18.0",
         "pyyaml>=5.4",
         "typeguard>=2.3.13",
         "packaging>=20.9",
-
-        # change to numpy
-        "numpy<2.0.0",
+        "numpy",
         "fasteners",
         "GitPython>=3.1.40",
         "psutil>=6.1.1",
@@ -74,12 +72,14 @@ setup(
             "datasets",
             "pandas>=1.5.3",
             "matplotlib",
-            "model-generate==1.5.0; platform_system=='Windows' and python_version=='3.10'",
             # Install human-eval from a forked repo with Windows support until the
             # PR (https://github.com/openai/human-eval/pull/53) is merged
             "human-eval-windows==1.0.4",
             "lm-eval[api]",
         ],
+        "model-generate": [
+            "model-generate==1.5.0; platform_system=='Windows' and python_version=='3.10'",
+        ],
         # Keep backwards compatibility for old extras names
         "oga-hybrid": ["lemonade-sdk[oga-ryzenai]"],
         "oga-unified": ["lemonade-sdk[oga-ryzenai]"],
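model-generate moves from the dev extra into its own opt-in extra, so `pip install lemonade-sdk[dev]` no longer pulls it; it now requires an explicit `pip install lemonade-sdk[model-generate]`, and still only resolves on Windows with Python 3.10 per its environment marker. A sketch of evaluating that marker with the packaging library (a declared dependency above):

```python
# Sketch: evaluate the environment marker guarding the new extra.
from packaging.markers import Marker

marker = Marker("platform_system == 'Windows' and python_version == '3.10'")
print(marker.evaluate())  # True only on Windows + Python 3.10
```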
@@ -125,15 +125,16 @@ setup(
             "lemonade=lemonade:lemonadecli",
             "lemonade-install=lemonade_install:installcli",
             "lemonade-server-dev=lemonade_server.cli:main",
+            "lsdev=lemonade_server.cli:developer_entrypoint",
         ]
     },
-    python_requires=">=3.10, <3.
+    python_requires=">=3.10, <3.14",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
     include_package_data=True,
     package_data={
         "lemonade_server": ["server_models.json"],
-        "lemonade": ["tools/server/static
+        "lemonade": ["tools/server/static/**/*"],
     },
 )
 
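Besides widening python_requires to <3.14 and recursively globbing the web app's new static tree, this hunk adds an `lsdev` console script. Resolving it programmatically, as a sketch (importlib.metadata, Python 3.10+; assumes 8.1.3 is installed):

```python
# Sketch: look up the new console script's target.
from importlib.metadata import entry_points

(lsdev,) = entry_points(group="console_scripts", name="lsdev")
print(lsdev.value)  # lemonade_server.cli:developer_entrypoint
```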
src/lemonade/common/inference_engines.py

@@ -5,7 +5,6 @@ import importlib.metadata
 import subprocess
 from abc import ABC, abstractmethod
 from typing import Dict, Optional
-import transformers
 
 
 class InferenceEngineDetector:
@@ -352,6 +351,7 @@ class TransformersDetector(BaseEngineDetector):
 
         try:
             import torch
+            import transformers
 
             if device_type == "cpu":
                 result = {
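Together these two hunks defer `import transformers` from module scope into the detection path, so importing the detector module no longer requires transformers at all. The pattern in isolation (a sketch, not package code):

```python
# Deferred-import sketch: the heavy dependency is only imported,
# and only required, when detection actually runs.
def transformers_available() -> bool:
    try:
        import transformers  # noqa: F401  # deferred optional dependency
    except ImportError:
        return False
    return True
```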
src/lemonade/tools/llamacpp/utils.py

@@ -57,7 +57,7 @@ def identify_rocm_arch_from_name(device_name: str) -> str | None:
     return None
 
 
-def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
+def identify_rocm_arch() -> str:
     """
     Identify the appropriate ROCm target architecture based on the device info
     Returns tuple of (architecture, gpu_type) where gpu_type is 'igpu' or 'dgpu'
@@ -68,21 +68,54 @@ def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
     amd_igpu = system_info.get_amd_igpu_device()
     amd_dgpu = system_info.get_amd_dgpu_devices()
     target_arch = None
-    gpu_count = 0
     for gpu in [amd_igpu] + amd_dgpu:
         if gpu.get("available") and gpu.get("name"):
-            gpu_count += 1
             target_arch = identify_rocm_arch_from_name(gpu["name"].lower())
             if target_arch:
                 break
 
-
-    # Here, we assume that the iGPU will always show up before the dGPUs (if available)
-    # We also assume that selecting the dGPU is preferred over the iGPU
-    # Multiple GPUs are not supported at the moment
-    hip_id = str(gpu_count - 1)
+    return target_arch
 
-
+
+def identify_hip_id() -> str:
+    """
+    Identify the HIP ID
+    """
+    # Get HIP devices
+    hip_devices = get_hip_devices()
+    logging.debug(f"HIP devices found: {hip_devices}")
+    if len(hip_devices) == 0:
+        raise ValueError("No HIP devices found when identifying HIP ID")
+
+    # Identify HIP devices that are compatible with our ROCm builds
+    rocm_devices = []
+    for device in hip_devices:
+        device_id, device_name = device
+        if identify_rocm_arch_from_name(device_name):
+            rocm_devices.append([device_id, device_name])
+    logging.debug(f"ROCm devices found: {rocm_devices}")
+
+    # If no ROCm devices are found, use the last HIP device
+    # This might be needed in some scenarios where HIP reports generic device names
+    # Example: "AMD Radeon Graphics" for STX Halo iGPU on Ubuntu 24.04
+    if len(rocm_devices) == 0:
+        rocm_devices = [hip_devices[-1]]
+        logging.warning(
+            "No ROCm devices found when identifying HIP ID. "
+            f"Falling back to the following device: {rocm_devices[0]}"
+        )
+    elif len(rocm_devices) > 1:
+        logging.warning(
+            f"Multiple ROCm devices found when identifying HIP ID: {rocm_devices}"
+            "The last device will be used."
+        )
+
+    # Select the last device
+    device_selected = rocm_devices[-1]
+    logging.debug(f"Selected ROCm device: {device_selected}")
+
+    # Return the device ID
+    return device_selected[0]
 
 
 def get_llama_version(backend: str) -> str:
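The old combined identify_rocm_arch_and_hip_id() is split in two: architecture selection keeps using the system-info device inventory, while the device index now comes from the HIP runtime itself via get_hip_devices() (added further down in this file). A hedged sketch of how the pieces compose, mirroring install_llamacpp's rocm branch below:

```python
# Composition sketch; function names come from the diffs in this release.
from lemonade.tools.llamacpp.utils import identify_hip_id, identify_rocm_arch

arch = identify_rocm_arch()     # falsy if no supported AMD GPU is detected
if arch:
    hip_id = identify_hip_id()  # device index pinned via HIP_VISIBLE_DEVICES
```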
@@ -277,7 +310,7 @@ def install_llamacpp(backend):
     target_arch = None
     if backend == "rocm":
         # Identify the target architecture
-        target_arch, hip_id = identify_rocm_arch_and_hip_id()
+        target_arch = identify_rocm_arch()
         if not target_arch:
             system = platform.system().lower()
             if system == "linux":
@@ -293,10 +326,6 @@ def install_llamacpp(backend):
             f"for supported configurations. {hint}"
         )
 
-    # Set HIP_VISIBLE_DEVICES=0 for igpu, =1 for dgpu
-    env_file_path = os.path.join(llama_server_exe_dir, ".env")
-    set_key(env_file_path, "HIP_VISIBLE_DEVICES", hip_id)
-
     # Direct download for Vulkan/ROCm
     llama_archive_url, filename = get_binary_url_and_filename(backend, target_arch)
     llama_archive_path = os.path.join(llama_server_exe_dir, filename)
@@ -315,6 +344,12 @@ def install_llamacpp(backend):
     else:
         raise NotImplementedError(f"Unsupported archive format: {filename}")
 
+    # Identify and set HIP ID
+    if backend == "rocm":
+        hip_id = identify_hip_id()
+        env_file_path = os.path.join(llama_server_exe_dir, ".env")
+        set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))
+
     # Make executable on Linux - need to update paths after extraction
     if platform.system().lower() == "linux":
         # Re-get the paths since extraction might have changed the directory structure
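The HIP_VISIBLE_DEVICES pin is now written after the archive is extracted, which it has to be: identify_hip_id() loads the HIP runtime out of the extracted folder. The set_key call matches python-dotenv's API, so the pin can be read back the same way (a sketch, path illustrative):

```python
# Sketch: read the pinned device back, assuming python-dotenv
# (the set_key call above matches its signature).
from dotenv import dotenv_values

env = dotenv_values("/path/to/llama-server/.env")  # illustrative path
print(env.get("HIP_VISIBLE_DEVICES"))
```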
@@ -778,3 +813,68 @@ class LlamaCppAdapter(ModelAdapter):
             error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
             error_msg += f"Command: {' '.join(cmd)}"
             raise Exception(error_msg)
+
+
+def get_hip_devices():
+    """Get list of HIP devices with their IDs and names."""
+    import ctypes
+    import sys
+    import os
+    import glob
+    from ctypes import c_int, POINTER
+    from ctypes.util import find_library
+
+    # Get llama.cpp path
+    rocm_path = get_llama_folder_path("rocm")
+
+    # Load HIP library
+    hip_library_pattern = (
+        "amdhip64*.dll" if sys.platform.startswith("win") else "libamdhip64*.so"
+    )
+    search_pattern = os.path.join(rocm_path, hip_library_pattern)
+    matching_files = glob.glob(search_pattern)
+    if not matching_files:
+        raise RuntimeError(
+            f"Could not find HIP runtime library matching pattern: {search_pattern}"
+        )
+    try:
+        libhip = ctypes.CDLL(matching_files[0])
+    except OSError:
+        raise RuntimeError(f"Could not load HIP runtime library from {path}")
+
+    # Setup function signatures
+    hipError_t = c_int
+    hipDeviceProp_t = ctypes.c_char * 2048
+    libhip.hipGetDeviceCount.restype = hipError_t
+    libhip.hipGetDeviceCount.argtypes = [POINTER(c_int)]
+    libhip.hipGetDeviceProperties.restype = hipError_t
+    libhip.hipGetDeviceProperties.argtypes = [POINTER(hipDeviceProp_t), c_int]
+    libhip.hipGetErrorString.restype = ctypes.c_char_p
+    libhip.hipGetErrorString.argtypes = [hipError_t]
+
+    # Get device count
+    device_count = c_int()
+    err = libhip.hipGetDeviceCount(ctypes.byref(device_count))
+    if err != 0:
+        logging.error(
+            "hipGetDeviceCount failed:", libhip.hipGetErrorString(err).decode()
+        )
+        return []
+
+    # Get device properties
+    devices = []
+    for i in range(device_count.value):
+        prop = hipDeviceProp_t()
+        err = libhip.hipGetDeviceProperties(ctypes.byref(prop), i)
+        if err != 0:
+            logging.error(
+                f"hipGetDeviceProperties failed for device {i}:",
+                libhip.hipGetErrorString(err).decode(),
+            )
+            continue
+
+        # Extract device name from HIP device properties
+        device_name = ctypes.string_at(prop, 256).decode("utf-8").rstrip("\x00")
+        devices.append([i, device_name])
+
+    return devices
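get_hip_devices() returns `[device_id, device_name]` pairs read straight from the bundled HIP runtime via ctypes. Note that the `except OSError` branch references `path`, which is never defined in this function, so a failed library load would surface as a NameError rather than the intended RuntimeError. Usage sketch (requires a ROCm llama.cpp build on disk, since the HIP runtime is loaded from that install folder):

```python
# Usage sketch for the helper above.
for device_id, device_name in get_hip_devices():
    print(f"HIP device {device_id}: {device_name}")
```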
src/lemonade/tools/management_tools.py

@@ -109,7 +109,7 @@ class Cache(ManagementTool):
     # pylint: disable=pointless-statement,f-string-without-interpolation
     f"""
     A set of functions for managing the lemonade build cache. The default
-    cache location is {lemonade_cache.DEFAULT_CACHE_DIR}, and can also be
+    cache location is {lemonade_cache.DEFAULT_CACHE_DIR}, and can also be
     selected with
     the global --cache-dir option or the LEMONADE_CACHE_DIR environment variable.

(The removed and added lines differ only in whitespace.)
src/lemonade/tools/oga/utils.py

@@ -100,9 +100,10 @@ class OrtGenaiModel(ModelAdapter):
         max_new_tokens=512,
         min_new_tokens=0,
         do_sample=True,
-        top_k=
-        top_p=
-        temperature=
+        top_k=None,
+        top_p=None,
+        temperature=None,
+        repeat_penalty=None,
         streamer: OrtGenaiStreamer = None,
         pad_token_id=None,
         stopping_criteria=None,
@@ -154,38 +155,58 @@ class OrtGenaiModel(ModelAdapter):
         if random_seed is None:
             random_seed = -1  # In og.Generator, -1 = seed with random device
 
+        # Get search config if available, otherwise use empty dict
+        # Thanks to the empty dict, if the model doesn't have a built-in search
+        # config, the .get() calls will all just use the default values
+        search_config = {}
         if self.config and "search" in self.config:
             search_config = self.config["search"]
-        ... (old lines 159-188: the previous search-options setup; their content was not captured in the registry's diff view)
+
+        # Apply parameter hierarchy: user provided > search config > defaults
+        default_top_k = 50
+        default_top_p = 1.0
+        default_temperature = 0.7
+        default_repetition_penalty = 1.0
+
+        top_k_to_use = (
+            top_k if top_k is not None else search_config.get("top_k", default_top_k)
+        )
+        top_p_to_use = (
+            top_p if top_p is not None else search_config.get("top_p", default_top_p)
+        )
+        temperature_to_use = (
+            temperature
+            if temperature is not None
+            else search_config.get("temperature", default_temperature)
+        )
+        # Map the llamacpp name, `repeat_penalty`, to the OGA name, `repetition_penalty`
+        repetition_penalty_to_use = (
+            repeat_penalty
+            if repeat_penalty is not None
+            else search_config.get("repetition_penalty", default_repetition_penalty)
+        )
+
+        # Set search options once with all parameters
+        params.set_search_options(
+            do_sample=search_config.get("do_sample", do_sample),
+            top_k=top_k_to_use,
+            top_p=top_p_to_use,
+            temperature=temperature_to_use,
+            repetition_penalty=repetition_penalty_to_use,
+            max_length=max_length_to_use,
+            min_length=min_length,
+            early_stopping=search_config.get("early_stopping", False),
+            length_penalty=search_config.get("length_penalty", 1.0),
+            num_beams=search_config.get("num_beams", 1),
+            num_return_sequences=search_config.get("num_return_sequences", 1),
+            past_present_share_buffer=search_config.get(
+                "past_present_share_buffer", True
+            ),
+            random_seed=random_seed,
+            # Not currently supported by OGA
+            # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+            # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+        )
         params.try_graph_capture_with_max_batch_size(1)
 
         generator = og.Generator(self.model, params)
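The None sentinels introduced in the previous hunk are what make this hierarchy work: an omitted argument stays distinguishable from an explicit value, letting the model's bundled search config (if any) fill the gap before the hard-coded default does. Distilled to one illustrative helper (not package code):

```python
# The precedence rule above, distilled: explicit arg > search config > default.
def resolve(explicit, search_config: dict, key: str, default):
    return explicit if explicit is not None else search_config.get(key, default)

assert resolve(0.2, {"temperature": 0.6}, "temperature", 0.7) == 0.2
assert resolve(None, {"temperature": 0.6}, "temperature", 0.7) == 0.6
assert resolve(None, {}, "temperature", 0.7) == 0.7
```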
src/lemonade/tools/server/llamacpp.py

@@ -43,6 +43,72 @@ def llamacpp_address(port: int) -> str:
     return f"http://127.0.0.1:{port}/v1"
 
 
+def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
+    """
+    Separate standard OpenAI parameters from custom llama.cpp parameters.
+
+    Args:
+        request_dict: Dictionary of all request parameters
+        endpoint_type: Type of endpoint ("chat" or "completion")
+
+    Returns:
+        Dictionary with parameters properly separated for OpenAI client
+    """
+    openai_client_params = {}
+    extra_params = {}
+
+    # Common OpenAI parameters for both endpoint types
+    common_params = {
+        "model",
+        "frequency_penalty",
+        "logit_bias",
+        "logprobs",
+        "max_tokens",
+        "n",
+        "presence_penalty",
+        "seed",
+        "stop",
+        "stream",
+        "temperature",
+        "top_p",
+        "user",
+    }
+
+    # Standard OpenAI parameters by endpoint type
+    if endpoint_type == "chat":
+        chat_specific_params = {
+            "messages",
+            "top_logprobs",
+            "response_format",
+            "service_tier",
+            "stream_options",
+            "tools",
+            "tool_choice",
+            "parallel_tool_calls",
+        }
+        openai_params = common_params | chat_specific_params
+    else:  # completion
+        completion_specific_params = {
+            "prompt",
+            "best_of",
+            "echo",
+            "suffix",
+        }
+        openai_params = common_params | completion_specific_params
+
+    for key, value in request_dict.items():
+        if key in openai_params:
+            openai_client_params[key] = value
+        else:
+            extra_params[key] = value
+
+    # If there are custom parameters, use extra_body to pass them through
+    if extra_params:
+        openai_client_params["extra_body"] = extra_params
+
+    return openai_client_params
+
+
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
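Anything not in the allow-lists rides through to llama.cpp in the OpenAI client's extra_body, which is how llama.cpp-only sampling knobs survive the proxy. A worked example (illustrative values):

```python
# top_k and min_p are llama.cpp-side knobs, not standard OpenAI fields.
request = {"model": "m", "temperature": 0.8, "top_k": 40, "min_p": 0.05}
print(_separate_openai_params(request, "chat"))
# {'model': 'm', 'temperature': 0.8,
#  'extra_body': {'top_k': 40, 'min_p': 0.05}}
```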
@@ -226,6 +292,11 @@ def _launch_llama_subprocess(
         "--ctx-size",
         str(ctx_size),
     ]
+
+    # Lock random seed for deterministic behavior in CI
+    if os.environ.get("LEMONADE_CI_MODE"):
+        base_command.extend(["--seed", "42"])
+
     if "mmproj" in snapshot_files:
         base_command.extend(["--mmproj", snapshot_files["mmproj"]])
     if not use_gpu:
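Since the check is a plain truthiness test on the environment variable, any non-empty value enables the deterministic path:

```python
# Sketch: opt into the fixed-seed CI behavior before launching the server.
import os

os.environ["LEMONADE_CI_MODE"] = "1"  # llama-server then gets --seed 42
```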
@@ -238,6 +309,15 @@ def _launch_llama_subprocess(
     # Add port and jinja to enable tool use
     base_command.extend(["--port", str(telemetry.port), "--jinja"])
 
+    # Disable jinja for gpt-oss-120b on Vulkan
+    if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
+        base_command.remove("--jinja")
+        logging.warning(
+            "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+            "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+            "The model cannot use tools. If needed, use the ROCm backend instead."
+        )
+
     # Use legacy reasoning formatting, since not all apps support the new
     # reasoning_content field
     base_command.extend(["--reasoning-format", "none"])
@@ -384,13 +464,17 @@ def chat_completion(
         exclude_unset=True, exclude_none=True
     )
 
+    # Separate standard OpenAI parameters from custom llama.cpp parameters
+    openai_client_params = _separate_openai_params(request_dict, "chat")
+
     # Check if streaming is requested
     if chat_completion_request.stream:
 
         def event_stream():
             try:
                 # Enable streaming
-                for chunk in client.chat.completions.create(**request_dict):
+                # pylint: disable=missing-kwoa
+                for chunk in client.chat.completions.create(**openai_client_params):
                     yield f"data: {chunk.model_dump_json()}\n\n"
                 yield "data: [DONE]\n\n"
|
|
|
412
496
|
# Non-streaming response
|
|
413
497
|
try:
|
|
414
498
|
# Disable streaming for non-streaming requests
|
|
415
|
-
|
|
499
|
+
# pylint: disable=missing-kwoa
|
|
500
|
+
response = client.chat.completions.create(**openai_client_params)
|
|
416
501
|
|
|
417
502
|
# Show telemetry after completion
|
|
418
503
|
telemetry.show_telemetry()
|
|
@@ -420,6 +505,7 @@ def chat_completion(
         return response
 
     except Exception as e:  # pylint: disable=broad-exception-caught
+        logging.error("Error during chat completion: %s", str(e))
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Chat completion error: {str(e)}",
@@ -446,13 +532,17 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
     # Convert Pydantic model to dict and remove unset/null values
     request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
 
+    # Separate standard OpenAI parameters from custom llama.cpp parameters
+    openai_client_params = _separate_openai_params(request_dict, "completion")
+
     # Check if streaming is requested
     if completion_request.stream:
 
         def event_stream():
             try:
                 # Enable streaming
-                for chunk in client.completions.create(**request_dict):
+                # pylint: disable=missing-kwoa
+                for chunk in client.completions.create(**openai_client_params):
                     yield f"data: {chunk.model_dump_json()}\n\n"
                 yield "data: [DONE]\n\n"
@@ -474,7 +564,8 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
     # Non-streaming response
     try:
         # Disable streaming for non-streaming requests
-        response = client.completions.create(**request_dict)
+        # pylint: disable=missing-kwoa
+        response = client.completions.create(**openai_client_params)
 
         # Show telemetry after completion
         telemetry.show_telemetry()
@@ -482,6 +573,7 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
         return response
 
     except Exception as e:  # pylint: disable=broad-exception-caught
+        logging.error("Error during completion: %s", str(e))
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Completion error: {str(e)}",