lemonade-sdk 7.0.2__tar.gz → 7.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/PKG-INFO +1 -1
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/setup.py +1 -1
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/huggingface_load.py +6 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/ort_genai/oga.py +6 -4
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/prompt.py +28 -1
- lemonade_sdk-7.0.4/src/lemonade/tools/server/instructions.py +37 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/server/llamacpp.py +22 -22
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/server/serve.py +5 -6
- lemonade_sdk-7.0.4/src/lemonade/tools/server/static/instructions.html +262 -0
- lemonade_sdk-7.0.4/src/lemonade/version.py +1 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_sdk.egg-info/PKG-INFO +1 -1
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_sdk.egg-info/SOURCES.txt +2 -1
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_server/model_manager.py +45 -12
- {lemonade_sdk-7.0.2/src/lemonade/tools/server → lemonade_sdk-7.0.4/src/lemonade_server}/pydantic_models.py +2 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_server/server_models.json +14 -0
- lemonade_sdk-7.0.2/src/lemonade/tools/server/instructions.py +0 -294
- lemonade_sdk-7.0.2/src/lemonade/version.py +0 -1
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/LICENSE +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/NOTICE.md +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/README.md +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/pyproject.toml +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/setup.cfg +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/api.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/cache.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/cli.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/analyze_model.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/labels.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/onnx_helpers.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/plugins.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/status.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/system_info.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/tensor_helpers.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/state.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/bench.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/huggingface_bench.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/llamacpp.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/llamacpp_bench.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/ort_genai/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/ort_genai/oga_bench.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/quark/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/quark/quark_load.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/quark/quark_quantize.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/report/table.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/server/port_utils.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/server/static/styles.css +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/server/thread_utils.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_install/install.py +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_sdk.egg-info/requires.txt +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
- {lemonade_sdk-7.0.2 → lemonade_sdk-7.0.4}/src/lemonade_server/cli.py +0 -0
src/lemonade/tools/huggingface_load.py (+6 -0)

@@ -326,6 +326,7 @@ class HuggingfaceAdapter(ModelAdapter):
     def generate(
         self,
         input_ids,
+        random_seed=1,
         **kwargs,
     ):

@@ -346,6 +347,11 @@ class HuggingfaceAdapter(ModelAdapter):
             **kwargs,
         }

+        if random_seed is None:
+            torch.random.seed()
+        else:
+            torch.random.manual_seed(random_seed)
+
         with torch.no_grad(), torch.inference_mode():
             outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)

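The seeding calls above follow standard PyTorch practice: a fixed seed makes token sampling repeatable, while `None` re-seeds from the OS. A minimal standalone sketch of the behavior the new `random_seed` parameter enables (illustrative only; the helper name and fake logits are assumptions, not package code):

```python
# Illustrative only: mirrors the seeding behavior added to
# HuggingfaceAdapter.generate(); the helper and fake logits are assumptions.
import torch


def seeded_sample(logits: torch.Tensor, random_seed=1) -> torch.Tensor:
    """Sample one token id from logits, repeatably when a seed is given."""
    if random_seed is None:
        torch.random.seed()  # non-deterministic: seed from the OS
    else:
        torch.random.manual_seed(random_seed)  # repeatable across runs
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)


logits = torch.randn(1, 32000)  # stand-in for real model logits
assert torch.equal(seeded_sample(logits, 1), seeded_sample(logits, 1))
```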
src/lemonade/tools/ort_genai/oga.py (+6 -4)

@@ -139,6 +139,7 @@ class OrtGenaiModel(ModelAdapter):
         pad_token_id=None,
         stopping_criteria=None,
         max_length=None,
+        random_seed=1,
     ):
         params = og.GeneratorParams(self.model)

@@ -179,6 +180,9 @@ class OrtGenaiModel(ModelAdapter):
         if use_oga_pre_6_api:
             params.input_ids = input_ids

+        if random_seed is None:
+            random_seed = -1  # In og.Generator, -1 = seed with random device
+
         if self.config and "search" in self.config:
             search_config = self.config["search"]
             params.set_search_options(
@@ -196,10 +200,7 @@ class OrtGenaiModel(ModelAdapter):
                 past_present_share_buffer=search_config.get(
                     "past_present_share_buffer", True
                 ),
-
-                # by default, random_seed=-1 causes different laptops to give
-                # different results
-                random_seed=1,
+                random_seed=random_seed,
                 # Not currently supported by OGA
                 # diversity_penalty=search_config.get('diversity_penalty', 0.0),
                 # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
@@ -212,6 +213,7 @@ class OrtGenaiModel(ModelAdapter):
                 temperature=temperature,
                 max_length=max_length_to_use,
                 min_length=min_length,
+                random_seed=random_seed,
             )
         params.try_graph_capture_with_max_batch_size(1)

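onnxruntime-genai treats `random_seed=-1` in its search options as "seed from a random device", so the adapter maps `None` to `-1` before the seed reaches `params.set_search_options(...)`. The mapping in isolation (illustrative helper, not part of the package):

```python
def resolve_oga_seed(random_seed):
    """Illustrative helper (not in the package): None at the adapter level
    becomes -1 for og.GeneratorParams, which OGA reads as "seed randomly"."""
    return -1 if random_seed is None else random_seed


assert resolve_oga_seed(None) == -1  # non-repeatable generation
assert resolve_oga_seed(42) == 42    # repeatable generation
```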
src/lemonade/tools/prompt.py (+28 -1)

@@ -15,6 +15,7 @@ DEFAULT_GENERATE_PARAMS = {
     "temperature": 0.7,
 }

+DEFAULT_RANDOM_SEED = 1
 DEFAULT_MAX_NEW_TOKENS = 512
 DEFAULT_N_TRIALS = 1

@@ -108,6 +109,19 @@ class LLMPrompt(Tool):
            f"(useful for testing, default is {DEFAULT_N_TRIALS})",
        )

+        parser.add_argument(
+            "--random-seed",
+            "-r",
+            default=str(DEFAULT_RANDOM_SEED),
+            help="Positive integer seed for random number generator used in "
+            "sampling tokens "
+            f"(default is {DEFAULT_RANDOM_SEED}). If the number of trials is "
+            "greater than one, then the seed is incremented by one for each "
+            "trial. Set to `None` for random, non-repeatable results. This "
+            "random seed behavior only applies to models loaded with "
+            "`oga-load` or `huggingface-load`.",
+        )
+
        return parser

    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
@@ -123,6 +137,11 @@ class LLMPrompt(Tool):
            with open(parsed_args.prompt, "r", encoding="utf-8") as f:
                parsed_args.prompt = f.read()

+        if parsed_args.random_seed == "None":
+            parsed_args.random_seed = None
+        else:
+            parsed_args.random_seed = int(parsed_args.random_seed)
+
        return parsed_args

    def run(
@@ -132,6 +151,7 @@ class LLMPrompt(Tool):
        max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
        n_trials: int = DEFAULT_N_TRIALS,
        template: bool = False,
+        random_seed: int = DEFAULT_RANDOM_SEED,
    ) -> State:

        model: ModelAdapter = state.model
@@ -170,9 +190,16 @@ class LLMPrompt(Tool):

            # Get the response from the LLM, which may include the prompt in it
            response = model.generate(
-                input_ids,
+                input_ids,
+                max_new_tokens=max_new_tokens,
+                random_seed=random_seed,
+                **DEFAULT_GENERATE_PARAMS,
            )

+            # Increment random seed if not none
+            if random_seed is not None:
+                random_seed += 1
+
            # Flatten the input and response
            input_ids_array = (
                input_ids if isinstance(input_ids, (list, str)) else input_ids[0]
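With these changes, a run with a concrete `--random-seed` and multiple trials uses consecutive seeds, so each trial differs but the whole run is repeatable; `--random-seed None` leaves every trial non-deterministic. A small sketch of the implied seed sequence (illustrative helper, not package code):

```python
def trial_seeds(random_seed, n_trials):
    """Illustrative only: the per-trial seed sequence implied by LLMPrompt.run()."""
    seeds = []
    for _ in range(n_trials):
        seeds.append(random_seed)
        if random_seed is not None:
            random_seed += 1  # next trial gets the next seed
    return seeds


print(trial_seeds(1, 3))     # [1, 2, 3]
print(trial_seeds(None, 3))  # [None, None, None]
```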
src/lemonade/tools/server/instructions.py (new in 7.0.4, +37)

@@ -0,0 +1,37 @@
+from pathlib import Path
+import json
+from fastapi.responses import HTMLResponse
+from lemonade_server.model_manager import ModelManager
+
+
+def get_instructions_html(port=8000):
+    """
+    Show instructions on how to use the server.
+    """
+    # Load server models from JSON
+    server_models_path = (
+        Path(__file__).parent.parent.parent.parent
+        / "lemonade_server"
+        / "server_models.json"
+    )
+    with open(server_models_path, "r", encoding="utf-8") as f:
+        server_models = json.load(f)
+
+    # Use shared filter function from model_manager.py
+    filtered_models = ModelManager().filter_models_by_backend(server_models)
+
+    # Pass filtered server_models to JS
+    server_models_js = (
+        f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
+    )
+
+    # Load HTML template
+    template_path = Path(__file__).parent / "static" / "instructions.html"
+    with open(template_path, "r", encoding="utf-8") as f:
+        html_template = f.read()
+
+    # Replace template variables
+    html_content = html_template.replace("{{SERVER_PORT}}", str(port))
+    html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)
+
+    return HTMLResponse(content=html_content)
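get_instructions_html() returns a ready-made HTMLResponse, so the server only has to mount it on a route. A hypothetical wiring sketch (the route path and port below are assumptions, not taken from this diff):

```python
# Hypothetical wiring sketch: the route path ("/") and port are assumptions.
from fastapi import FastAPI

from lemonade.tools.server.instructions import get_instructions_html

app = FastAPI()


@app.get("/")
async def home():
    # get_instructions_html() already returns an HTMLResponse
    return get_instructions_html(port=8000)
```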
src/lemonade/tools/server/llamacpp.py (+22 -22)

@@ -14,8 +14,8 @@ from fastapi.responses import StreamingResponse

 from openai import OpenAI

+from lemonade_server.pydantic_models import ChatCompletionRequest
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.pydantic_models import ChatCompletionRequest
 from lemonade.tools.server.port_utils import find_free_port

 LLAMA_VERSION = "b5543"
@@ -145,16 +145,14 @@ def _log_subprocess_output(
            break


-def _wait_for_load(
-    llama_server_process: subprocess.Popen, port: int, fail_message: str
-):
+def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
    status_code = None
    while not llama_server_process.poll() and status_code != 200:
        health_url = f"http://localhost:{port}/health"
        try:
            health_response = requests.get(health_url)
        except requests.exceptions.ConnectionError:
-            logging.
+            logging.debug("Not able to connect to llama-server yet, will retry")
        else:
            status_code = health_response.status_code
            logging.debug(
@@ -165,24 +163,25 @@


 def _launch_llama_subprocess(
-
+    snapshot_files: dict, use_gpu: bool, telemetry: LlamaTelemetry
 ) -> subprocess.Popen:
    """
    Launch llama server subprocess with GPU or CPU configuration
    """

+    # Build the base command
+    base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
+    if "mmproj" in snapshot_files:
+        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+        if not use_gpu:
+            base_command.extend(["--no-mmproj-offload"])
+
    # Find a port, and save it in the telemetry object for future reference
    # by other functions
    telemetry.choose_port()

-
-
-        "-m",
-        model_path,
-        "--port",
-        str(telemetry.port),
-        "--jinja",
-    ]
+    # Add port and jinja to enable tool use
+    base_command.extend(["--port", str(telemetry.port), "--jinja"])

    # Configure GPU layers: 99 for GPU, 0 for CPU-only
    ngl_value = "99" if use_gpu else "0"
@@ -204,7 +203,7 @@ def _launch_llama_subprocess(
    return process


-def server_load(
+def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
    # Download llama.cpp server if it isn't already available
    if not os.path.exists(LLAMA_SERVER_EXE_DIR):
        # Download llama.cpp server zip
@@ -236,36 +235,37 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
        logging.info("Cleaned up zip file")

    # Download the gguf to the hugging face cache
-
-
-    logging.debug(f"GGUF file path: {model_path}")
+    snapshot_files = ModelManager().download_gguf(model_config)
+    logging.debug(f"GGUF file paths: {snapshot_files}")

    # Start the llama-serve.exe process
    logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")

    # Attempt loading on GPU first
    llama_server_process = _launch_llama_subprocess(
-
+        snapshot_files, use_gpu=True, telemetry=telemetry
    )

    # Check the /health endpoint until GPU server is ready
    _wait_for_load(
        llama_server_process,
        telemetry.port,
-        f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
    )

    # If loading on GPU failed, try loading on CPU
    if llama_server_process.poll():
+        logging.warning(
+            f"Loading {model_reference} on GPU didn't work, re-attempting on CPU"
+        )
+
        llama_server_process = _launch_llama_subprocess(
-
+            snapshot_files, use_gpu=False, telemetry=telemetry
        )

        # Check the /health endpoint until CPU server is ready
        _wait_for_load(
            llama_server_process,
            telemetry.port,
-            f"Loading {model_reference} on CPU didn't work",
        )

    if llama_server_process.poll():
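server_load() now logs the GPU failure itself and retries on CPU, with _wait_for_load() polling /health until the subprocess either answers or exits. A generic sketch of that pattern (the timeout, sleep interval, and the `launch` placeholder callable are assumptions, not package code):

```python
# Sketch of the load-then-fallback pattern; timeout, sleep, and the `launch`
# placeholder callable are assumptions, not taken from the package.
import logging
import subprocess
import time

import requests


def wait_until_healthy(proc: subprocess.Popen, port: int, timeout_s: float = 60.0) -> bool:
    """Poll /health until the server answers 200 or the process exits."""
    deadline = time.time() + timeout_s
    while proc.poll() is None and time.time() < deadline:
        try:
            if requests.get(f"http://localhost:{port}/health").status_code == 200:
                return True
        except requests.exceptions.ConnectionError:
            logging.debug("llama-server not reachable yet, retrying")
        time.sleep(0.5)
    return False


def load_with_fallback(launch, port: int) -> subprocess.Popen:
    """Try GPU first, then CPU, mirroring server_load()'s new flow."""
    proc = launch(use_gpu=True)
    if not wait_until_healthy(proc, port):
        logging.warning("GPU load didn't work, re-attempting on CPU")
        proc = launch(use_gpu=False)
        wait_until_healthy(proc, port)
    return proc
```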
src/lemonade/tools/server/serve.py (+5 -6)

@@ -46,9 +46,7 @@ from openai.types.responses import (

 import lemonade.api as lemonade_api
 from lemonade_server.model_manager import ModelManager
-from
-import lemonade.tools.server.llamacpp as llamacpp
-from lemonade.tools.server.pydantic_models import (
+from lemonade_server.pydantic_models import (
    DEFAULT_MAX_NEW_TOKENS,
    LoadConfig,
    CompletionRequest,
@@ -56,6 +54,8 @@ from lemonade.tools.server.pydantic_models import (
    ResponsesRequest,
    PullConfig,
 )
+from lemonade.tools.management_tools import ManagementTool
+import lemonade.tools.server.llamacpp as llamacpp
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.instructions import get_instructions_html
 from lemonade.tools.server.port_utils import lifespan
@@ -1103,7 +1103,6 @@ class Server(ManagementTool):
        """
        Report server health information to the client.
        """
-        self.stop_event.set()

        return {
            "status": "ok",
@@ -1200,7 +1199,7 @@ class Server(ManagementTool):
        # We will populate a LoadConfig that has all of the required fields
        config_to_use: LoadConfig

-        # First,
+        # First, ensure that the arguments are valid
        if config.model_name:
            # Get the dictionary of supported model from disk
            supported_models = ModelManager().supported_models
@@ -1293,7 +1292,7 @@ class Server(ManagementTool):
        try:
            if config_to_use.recipe == "llamacpp":
                self.llama_server_process = llamacpp.server_load(
-
+                    model_config=config_to_use,
                    model_reference=model_reference,
                    telemetry=self.llama_telemetry,
                )
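The request/response models moved from lemonade.tools.server.pydantic_models to the new lemonade_server.pydantic_models module, so any downstream code importing the old path needs the new one:

```python
# Before (7.0.2): models lived under the server tool package
# from lemonade.tools.server.pydantic_models import ChatCompletionRequest

# After (7.0.4): import them from the lemonade_server package instead
from lemonade_server.pydantic_models import (
    ChatCompletionRequest,
    LoadConfig,
    PullConfig,
)
```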
src/lemonade/tools/server/static/instructions.html (new in 7.0.4, +262)

@@ -0,0 +1,262 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Lemonade Server</title>
+    <link rel="icon" href="data:,">
+    <link rel="stylesheet" href="/static/styles.css">
+    <script>
+        window.SERVER_PORT = {{SERVER_PORT}};
+    </script>
+    {{SERVER_MODELS_JS}}
+</head>
+<body>
+    <nav class="navbar">
+        <a href="https://github.com/lemonade-sdk/lemonade">GitHub</a>
+        <a href="https://lemonade-server.ai/docs/">Docs</a>
+        <a href="https://lemonade-server.ai/docs/server/server_models/">Models</a>
+        <a href="https://lemonade-server.ai/docs/server/apps/">Featured Apps</a>
+    </nav>
+    <main class="main">
+        <div class="title">🍋 Lemonade Server</div>
+        <div class="tab-container">
+            <div class="tabs">
+                <button class="tab active" id="tab-chat" onclick="showTab('chat')">LLM Chat</button>
+                <button class="tab" id="tab-models" onclick="showTab('models')">Model Management</button>
+            </div>
+            <div class="tab-content active" id="content-chat">
+                <div class="chat-container">
+                    <div class="chat-history" id="chat-history"></div>
+                    <div class="chat-input-row">
+                        <select id="model-select"></select>
+                        <input type="text" id="chat-input" placeholder="Type your message..." />
+                        <button id="send-btn">Send</button>
+                    </div>
+                </div>
+            </div>
+            <div class="tab-content" id="content-models">
+                <div class="model-mgmt-container">
+                    <div class="model-mgmt-pane">
+                        <h3>Installed Models</h3>
+                        <table class="model-table" id="installed-models-table">
+                            <colgroup><col style="width:100%"></colgroup>
+                            <tbody id="installed-models-tbody"></tbody>
+                        </table>
+                    </div>
+                    <div class="model-mgmt-pane">
+                        <h3>Suggested Models</h3>
+                        <table class="model-table" id="suggested-models-table">
+                            <tbody id="suggested-models-tbody"></tbody>
+                        </table>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </main>
+    <footer class="site-footer">
+        <div class="dad-joke">When life gives you LLMs, make an LLM aide.</div>
+        <div class="copyright">Copyright 2025 AMD</div>
+    </footer>
+    <script src="https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js"></script>
+    <script>
+        // Tab switching logic
+        function showTab(tab) {
+            document.getElementById('tab-chat').classList.remove('active');
+            document.getElementById('tab-models').classList.remove('active');
+            document.getElementById('content-chat').classList.remove('active');
+            document.getElementById('content-models').classList.remove('active');
+            if (tab === 'chat') {
+                document.getElementById('tab-chat').classList.add('active');
+                document.getElementById('content-chat').classList.add('active');
+            } else {
+                document.getElementById('tab-models').classList.add('active');
+                document.getElementById('content-models').classList.add('active');
+            }
+        }
+
+        // Helper to get server base URL
+        function getServerBaseUrl() {
+            const port = window.SERVER_PORT || 8000;
+            return `http://localhost:${port}`;
+        }
+
+        // Populate model dropdown from /api/v1/models endpoint
+        async function loadModels() {
+            try {
+                const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+                const data = await resp.json();
+                const select = document.getElementById('model-select');
+                select.innerHTML = '';
+                if (!data.data || !Array.isArray(data.data)) {
+                    select.innerHTML = '<option>No models found (malformed response)</option>';
+                    return;
+                }
+                if (data.data.length === 0) {
+                    select.innerHTML = '<option>No models available</option>';
+                    return;
+                }
+                let defaultIndex = 0;
+                data.data.forEach(function(model, index) {
+                    const modelId = model.id || model.name || model;
+                    const opt = document.createElement('option');
+                    opt.value = modelId;
+                    opt.textContent = modelId;
+                    if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {
+                        defaultIndex = index;
+                    }
+                    select.appendChild(opt);
+                });
+                select.selectedIndex = defaultIndex;
+            } catch (e) {
+                const select = document.getElementById('model-select');
+                select.innerHTML = `<option>Error loading models: ${e.message}</option>`;
+                console.error('Error loading models:', e);
+            }
+        }
+        loadModels();
+
+        // Model Management Tab Logic
+        async function refreshModelMgmtUI() {
+            // Get installed models from /api/v1/models
+            let installed = [];
+            try {
+                const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+                const data = await resp.json();
+                if (data.data && Array.isArray(data.data)) {
+                    installed = data.data.map(m => m.id || m.name || m);
+                }
+            } catch (e) {}
+            // All models from server_models.json (window.SERVER_MODELS)
+            const allModels = window.SERVER_MODELS || {};
+            // Filter suggested models not installed
+            const suggested = Object.keys(allModels).filter(
+                k => allModels[k].suggested && !installed.includes(k)
+            );
+            // Render installed models as a table (two columns, second is invisible)
+            const installedTbody = document.getElementById('installed-models-tbody');
+            installedTbody.innerHTML = '';
+            installed.forEach(function(mid) {
+                var tr = document.createElement('tr');
+                var tdName = document.createElement('td');
+                tdName.textContent = mid;
+                var tdEmpty = document.createElement('td');
+                tdEmpty.style.width = '0';
+                tdEmpty.style.padding = '0';
+                tdEmpty.style.border = 'none';
+                tr.appendChild(tdName);
+                tr.appendChild(tdEmpty);
+                installedTbody.appendChild(tr);
+            });
+            // Render suggested models as a table
+            const suggestedTbody = document.getElementById('suggested-models-tbody');
+            suggestedTbody.innerHTML = '';
+            suggested.forEach(mid => {
+                const tr = document.createElement('tr');
+                const tdName = document.createElement('td');
+                tdName.textContent = mid;
+                tdName.style.paddingRight = '1em';
+                tdName.style.verticalAlign = 'middle';
+                const tdBtn = document.createElement('td');
+                tdBtn.style.width = '1%';
+                tdBtn.style.verticalAlign = 'middle';
+                const btn = document.createElement('button');
+                btn.textContent = '+';
+                btn.title = 'Install model';
+                btn.onclick = async function() {
+                    btn.disabled = true;
+                    btn.textContent = 'Installing...';
+                    btn.classList.add('installing-btn');
+                    try {
+                        await fetch(getServerBaseUrl() + '/api/v1/pull', {
+                            method: 'POST',
+                            headers: { 'Content-Type': 'application/json' },
+                            body: JSON.stringify({ model_name: mid })
+                        });
+                        await refreshModelMgmtUI();
+                        await loadModels(); // update chat dropdown too
+                    } catch (e) {
+                        btn.textContent = 'Error';
+                    }
+                };
+                tdBtn.appendChild(btn);
+                tr.appendChild(tdName);
+                tr.appendChild(tdBtn);
+                suggestedTbody.appendChild(tr);
+            });
+        }
+        // Initial load
+        refreshModelMgmtUI();
+        // Optionally, refresh when switching to the tab
+        document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
+
+        // Chat logic (streaming with OpenAI JS client placeholder)
+        const chatHistory = document.getElementById('chat-history');
+        const chatInput = document.getElementById('chat-input');
+        const sendBtn = document.getElementById('send-btn');
+        const modelSelect = document.getElementById('model-select');
+        let messages = [];
+
+        function appendMessage(role, text) {
+            const div = document.createElement('div');
+            div.className = 'chat-message ' + role;
+            // Add a bubble for iMessage style
+            const bubble = document.createElement('div');
+            bubble.className = 'chat-bubble ' + role;
+            bubble.innerHTML = text;
+            div.appendChild(bubble);
+            chatHistory.appendChild(div);
+            chatHistory.scrollTop = chatHistory.scrollHeight;
+        }
+
+        async function sendMessage() {
+            const text = chatInput.value.trim();
+            if (!text) return;
+            appendMessage('user', text);
+            messages.push({ role: 'user', content: text });
+            chatInput.value = '';
+            sendBtn.disabled = true;
+            // Streaming OpenAI completions (placeholder, adapt as needed)
+            let llmText = '';
+            appendMessage('llm', '...');
+            const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
+            try {
+                // Use the correct endpoint for chat completions
+                const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({
+                        model: modelSelect.value,
+                        messages: messages,
+                        stream: true
+                    })
+                });
+                if (!resp.body) throw new Error('No stream');
+                const reader = resp.body.getReader();
+                let decoder = new TextDecoder();
+                llmDiv.textContent = '';
+                while (true) {
+                    const { done, value } = await reader.read();
+                    if (done) break;
+                    const chunk = decoder.decode(value);
+                    if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
+                    // Try to extract the content from the OpenAI chunk
+                    const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
+                    if (match && match[1]) {
+                        llmText += match[1];
+                        llmDiv.textContent = llmText;
+                    }
+                }
+                messages.push({ role: 'assistant', content: llmText });
+            } catch (e) {
+                llmDiv.textContent = '[Error: ' + e.message + ']';
+            }
+            sendBtn.disabled = false;
+        }
+        sendBtn.onclick = sendMessage;
+        chatInput.addEventListener('keydown', function(e) {
+            if (e.key === 'Enter') sendMessage();
+        });
+    </script>
+</body>
+</html>
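The chat tab streams from the OpenAI-compatible /api/v1/chat/completions endpoint with stream: true. Roughly the same request from Python (a sketch that assumes OpenAI-style `data:`-prefixed SSE chunks; the model name and prompt are placeholders):

```python
# Sketch only: assumes OpenAI-style "data: "-prefixed SSE chunks.
import json

import requests

resp = requests.post(
    "http://localhost:8000/api/v1/chat/completions",
    json={
        "model": "Llama-3.2-1B-Instruct-Hybrid",  # placeholder model name
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": True,
    },
    stream=True,
)

for line in resp.iter_lines():
    if not line or line.strip() == b"data: [DONE]":
        continue
    if line.startswith(b"data: "):
        chunk = json.loads(line[len(b"data: "):])
        delta = chunk.get("choices", [{}])[0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
```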
src/lemonade/version.py (new in 7.0.4, +1)

@@ -0,0 +1 @@
+__version__ = "7.0.4"
src/lemonade_sdk.egg-info/SOURCES.txt (+2 -1)

@@ -54,10 +54,10 @@ src/lemonade/tools/server/__init__.py
 src/lemonade/tools/server/instructions.py
 src/lemonade/tools/server/llamacpp.py
 src/lemonade/tools/server/port_utils.py
-src/lemonade/tools/server/pydantic_models.py
 src/lemonade/tools/server/serve.py
 src/lemonade/tools/server/thread_utils.py
 src/lemonade/tools/server/tool_calls.py
+src/lemonade/tools/server/static/instructions.html
 src/lemonade/tools/server/static/styles.css
 src/lemonade_install/__init__.py
 src/lemonade_install/install.py
@@ -69,4 +69,5 @@ src/lemonade_sdk.egg-info/requires.txt
 src/lemonade_sdk.egg-info/top_level.txt
 src/lemonade_server/cli.py
 src/lemonade_server/model_manager.py
+src/lemonade_server/pydantic_models.py
 src/lemonade_server/server_models.json