lemonade-sdk 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/__init__.py +5 -0
- lemonade/api.py +180 -0
- lemonade/cache.py +92 -0
- lemonade/cli.py +173 -0
- lemonade/common/__init__.py +0 -0
- lemonade/common/build.py +176 -0
- lemonade/common/cli_helpers.py +139 -0
- lemonade/common/exceptions.py +98 -0
- lemonade/common/filesystem.py +368 -0
- lemonade/common/inference_engines.py +408 -0
- lemonade/common/network.py +93 -0
- lemonade/common/printing.py +110 -0
- lemonade/common/status.py +471 -0
- lemonade/common/system_info.py +1411 -0
- lemonade/common/test_helpers.py +28 -0
- lemonade/profilers/__init__.py +1 -0
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/profilers/memory_tracker.py +259 -0
- lemonade/profilers/profiler.py +58 -0
- lemonade/sequence.py +363 -0
- lemonade/state.py +159 -0
- lemonade/tools/__init__.py +1 -0
- lemonade/tools/accuracy.py +432 -0
- lemonade/tools/adapter.py +114 -0
- lemonade/tools/bench.py +302 -0
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +305 -0
- lemonade/tools/huggingface/bench.py +187 -0
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/huggingface/utils.py +359 -0
- lemonade/tools/humaneval.py +264 -0
- lemonade/tools/llamacpp/bench.py +255 -0
- lemonade/tools/llamacpp/load.py +222 -0
- lemonade/tools/llamacpp/utils.py +1260 -0
- lemonade/tools/management_tools.py +319 -0
- lemonade/tools/mmlu.py +319 -0
- lemonade/tools/oga/__init__.py +0 -0
- lemonade/tools/oga/bench.py +120 -0
- lemonade/tools/oga/load.py +804 -0
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/oga/utils.py +462 -0
- lemonade/tools/perplexity.py +147 -0
- lemonade/tools/prompt.py +263 -0
- lemonade/tools/report/__init__.py +0 -0
- lemonade/tools/report/llm_report.py +203 -0
- lemonade/tools/report/table.py +899 -0
- lemonade/tools/server/__init__.py +0 -0
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +320 -0
- lemonade/tools/server/serve.py +2123 -0
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/index.html +279 -0
- lemonade/tools/server/static/js/chat.js +1059 -0
- lemonade/tools/server/static/js/model-settings.js +183 -0
- lemonade/tools/server/static/js/models.js +1395 -0
- lemonade/tools/server/static/js/shared.js +556 -0
- lemonade/tools/server/static/logs.html +191 -0
- lemonade/tools/server/static/styles.css +2654 -0
- lemonade/tools/server/static/webapp.html +321 -0
- lemonade/tools/server/tool_calls.py +153 -0
- lemonade/tools/server/tray.py +664 -0
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/port.py +77 -0
- lemonade/tools/server/utils/thread.py +85 -0
- lemonade/tools/server/utils/windows_tray.py +408 -0
- lemonade/tools/server/webapp.py +34 -0
- lemonade/tools/server/wrapped_server.py +559 -0
- lemonade/tools/tool.py +374 -0
- lemonade/version.py +1 -0
- lemonade_install/__init__.py +1 -0
- lemonade_install/install.py +239 -0
- lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
- lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
- lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
- lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
- lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
- lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
- lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
- lemonade_server/cli.py +805 -0
- lemonade_server/model_manager.py +758 -0
- lemonade_server/pydantic_models.py +159 -0
- lemonade_server/server_models.json +643 -0
- lemonade_server/settings.py +39 -0
lemonade_server/cli.py
ADDED
|
@@ -0,0 +1,805 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
import platform
|
|
5
|
+
from typing import Tuple, Optional
|
|
6
|
+
import psutil
|
|
7
|
+
from typing import List
|
|
8
|
+
from lemonade_server.pydantic_models import (
|
|
9
|
+
DEFAULT_PORT,
|
|
10
|
+
DEFAULT_HOST,
|
|
11
|
+
DEFAULT_LOG_LEVEL,
|
|
12
|
+
DEFAULT_LLAMACPP_BACKEND,
|
|
13
|
+
DEFAULT_CTX_SIZE,
|
|
14
|
+
)
|
|
15
|
+
from lemonade_server.settings import load_setting
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Error codes for different CLI scenarios
|
|
19
|
+
class ExitCodes:
    """Process exit codes returned by the CLI via ``sys.exit``."""

    SUCCESS = 0  # command completed normally
    GENERAL_ERROR = 1  # unspecified failure
    SERVER_ALREADY_RUNNING = 2  # refused to start a second server instance
    TIMEOUT_STOPPING_SERVER = 3  # `stop` timed out waiting for shutdown
    ERROR_STOPPING_SERVER = 4  # `stop` hit an unexpected error
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class PullError(Exception):
    """
    The pull command has failed to install an LLM.
    """
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class DeleteError(Exception):
    """
    The delete command has failed to delete an LLM.
    """
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ServerTimeoutError(Exception):
    """
    The server failed to start within the timeout period.
    """
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ModelNotAvailableError(Exception):
    """
    The specified model is not available on the server.
    """
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ModelLoadError(Exception):
    """
    The model failed to load on the server.
    """
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def serve(
    port: int = None,
    host: str = None,
    log_level: str = None,
    tray: bool = False,
    use_thread: bool = False,
    llamacpp_backend: str = None,
    ctx_size: int = None,
):
    """
    Execute the serve command.

    Args:
        port: port to listen on (defaults to DEFAULT_PORT).
        host: address to bind (defaults to DEFAULT_HOST).
        log_level: server log level (defaults to DEFAULT_LOG_LEVEL).
        tray: show a system-tray icon while the server runs.
        use_thread: run the server in a daemon thread instead of blocking.
        llamacpp_backend: llama.cpp backend (defaults to DEFAULT_LLAMACPP_BACKEND).
        ctx_size: model context size (defaults to DEFAULT_CTX_SIZE).

    Returns:
        None in blocking mode; ``(port, server_thread)`` when ``use_thread``
        is True and the server became ready.

    Raises:
        ServerTimeoutError: in threaded mode, if the server does not become
            ready within the startup timeout. (Previously this condition was
            silently ignored and callers proceeded against a dead server.)
    """

    print("Starting Lemonade Server...")
    from lemonade.tools.server.serve import Server

    # Fall back to package-wide defaults for any unset option
    port = port if port is not None else DEFAULT_PORT
    host = host if host is not None else DEFAULT_HOST
    log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
    llamacpp_backend = (
        llamacpp_backend if llamacpp_backend is not None else DEFAULT_LLAMACPP_BACKEND
    )
    ctx_size = ctx_size if ctx_size is not None else DEFAULT_CTX_SIZE

    server = Server(
        port=port,
        host=host,
        log_level=log_level,
        ctx_size=ctx_size,
        tray=tray,
        llamacpp_backend=llamacpp_backend,
    )

    if not use_thread:
        # Blocking mode: does not return until the server shuts down
        server.run()
        return None

    from threading import Thread

    # Run the server in a background daemon thread so the caller can continue
    server_thread = Thread(
        target=server.run,
        daemon=True,
    )
    server_thread.start()

    _wait_for_server_ready(host, port)

    return port, server_thread


def _wait_for_server_ready(
    host: str, port: int, max_wait_time: float = 30, wait_interval: float = 0.5
):
    """
    Poll until the server accepts connections, or raise ServerTimeoutError.

    On macOS an HTTP health check is used because scanning processes (the
    approach used on other platforms) requires elevated permissions there
    and is slower.
    """
    import time

    waited = 0.0

    if platform.system() == "Darwin":
        import requests

        while waited < max_wait_time:
            time.sleep(wait_interval)
            try:
                response = requests.get(
                    f"http://{host}:{port}/api/v1/health", timeout=1
                )
                if response.status_code == 200:
                    return
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
            ):
                pass  # Server not ready yet
            waited += wait_interval
    else:
        # Other platforms: look for a listening Lemonade Server process
        while waited < max_wait_time:
            time.sleep(wait_interval)
            _, running_port = get_server_info()
            if running_port is not None:
                return
            waited += wait_interval

    raise ServerTimeoutError(
        f"Lemonade Server did not become ready within {max_wait_time} seconds"
    )
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def stop():
    """
    Stop the Lemonade Server.

    Terminates the main server process and any llama-server child processes,
    escalating from terminate() to kill() after a 5-second grace period.
    Exits with TIMEOUT_STOPPING_SERVER or ERROR_STOPPING_SERVER on failure.
    """

    # Check if Lemonade Server is running
    running_pid, running_port = get_server_info()
    if running_port is None:
        # Note: was an f-string with no placeholders
        print("Lemonade Server is not running\n")
        return

    # Stop the server
    try:
        process = psutil.Process(running_pid)

        # Snapshot all child processes (including llama-server) before
        # terminating the parent, since the tree is unreachable afterwards
        children = process.children(recursive=True)

        # Terminate the main process first
        process.terminate()

        # Then terminate llama-server child processes (known to be stubborn).
        # We avoid killing other child processes, such as the installer.
        for child in children:
            if "llama-server" in child.name():
                try:
                    child.terminate()
                except psutil.NoSuchProcess:
                    pass  # Child already terminated

        # Wait for the main process to terminate gracefully;
        # kill it if it doesn't
        try:
            process.wait(timeout=5)
        except psutil.TimeoutExpired:
            process.kill()

        # Kill llama-server child processes that survived terminate()
        for child in children:
            if "llama-server" in child.name():
                try:
                    if child.is_running():
                        child.kill()
                except psutil.NoSuchProcess:
                    pass  # Child already terminated
    except psutil.NoSuchProcess:
        # Process already terminated
        pass
    except psutil.TimeoutExpired:
        print("Timed out waiting for Lemonade Server to stop.")
        sys.exit(ExitCodes.TIMEOUT_STOPPING_SERVER)
    except Exception as e:  # pylint: disable=broad-exception-caught
        print(f"Error stopping Lemonade Server: {e}")
        sys.exit(ExitCodes.ERROR_STOPPING_SERVER)
    print("Lemonade Server stopped successfully.")
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def pull(
    model_names: List[str],
    checkpoint: Optional[str] = None,
    recipe: Optional[str] = None,
    reasoning: bool = False,
    vision: bool = False,
    mmproj: str = "",
):
    """
    Install one or more LLMs by their Lemonade Server model names.

    When a Lemonade Server is running, its /pull endpoint is used so that
    the running instance is aware of the new model; otherwise ModelManager
    performs the download directly.

    Raises:
        PullError: the server rejected an installation request.
    """

    server_running, port = status(verbose=False)

    if not server_running:
        from lemonade_server.model_manager import ModelManager

        ModelManager().download_models(
            model_names,
            checkpoint=checkpoint,
            recipe=recipe,
            reasoning=reasoning,
            vision=vision,
            mmproj=mmproj,
            # The pull command will download an upgraded model if available,
            # even if we already have a local copy of the model
            do_not_upgrade=False,
        )
        return

    import requests

    base_url = f"http://localhost:{port}/api/v1"
    optional_fields = {
        "checkpoint": checkpoint,
        "recipe": recipe,
        "reasoning": reasoning,
        "vision": vision,
        "mmproj": mmproj,
    }

    for model_name in model_names:
        # Only forward the parameters that were actually provided (truthy)
        payload = {"model_name": model_name}
        payload.update({key: val for key, val in optional_fields.items() if val})

        # Install the model via the running server
        pull_response = requests.post(f"{base_url}/pull", json=payload)

        if pull_response.status_code != 200:
            raise PullError(
                f"Failed to install {model_name}. Check the "
                "Lemonade Server log for more information. You can list "
                "supported models with `lemonade-server list`"
            )
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def delete(model_names: List[str]):
    """
    Delete one or more LLMs by their Lemonade Server model names.

    When a Lemonade Server is running, its /delete endpoint is used so that
    the running instance is aware of the deletion; otherwise ModelManager
    removes the model directly.

    Raises:
        DeleteError: the server rejected a deletion request.
    """

    server_running, port = status(verbose=False)

    if not server_running:
        from lemonade_server.model_manager import ModelManager

        for model_name in model_names:
            ModelManager().delete_model(model_name)
        return

    import requests

    base_url = f"http://localhost:{port}/api/v1"

    for model_name in model_names:
        # Ask the running server to delete the model
        delete_response = requests.post(
            f"{base_url}/delete", json={"model_name": model_name}
        )

        if delete_response.status_code != 200:
            raise DeleteError(
                f"Failed to delete {model_name}. Check the "
                "Lemonade Server log for more information."
            )
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def run(
    model_name: str,
    port: int = None,
    host: str = "localhost",
    log_level: str = None,
    tray: bool = False,
    llamacpp_backend: str = None,
    ctx_size: int = None,
):
    """
    Start the server if not running and open the chat interface with the
    specified model.

    Args:
        model_name: Lemonade Server model name to pull, load, and chat with.
        port/host/log_level/tray/llamacpp_backend/ctx_size: forwarded to
            serve() when a new server has to be started.
    """
    import webbrowser
    import time

    # Note: the redundant local `import os` was removed; os is already
    # imported at module level.

    # Disable tray on macOS for run command due to threading issues
    if platform.system() == "Darwin":
        tray = False

    # Start the server if not running
    _, running_port = get_server_info()
    server_previously_running = running_port is not None
    if not server_previously_running:
        port, server_thread = serve(
            port=port,
            host=host,
            log_level=log_level,
            tray=tray,
            use_thread=True,
            llamacpp_backend=llamacpp_backend,
            ctx_size=ctx_size,
        )
    else:
        # macOS: refuse to proceed when the caller asked for a different port
        # than the one the existing server is bound to
        if platform.system() == "Darwin":
            requested_port = port if port is not None else DEFAULT_PORT
            if running_port != requested_port:
                print(
                    f"Lemonade Server is already running on port {running_port}\n"
                    f"You requested port {requested_port}. Please stop the existing server first "
                )
                sys.exit(ExitCodes.SERVER_ALREADY_RUNNING)

        port = running_port

    # Pull model
    pull([model_name])

    # Load model
    load(model_name, port)

    # Open the chat interface with the specified model
    url = f"http://{host}:{port}/?model={model_name}#llm-chat"
    print(f"You can now chat with {model_name} at {url}")

    # Only open browser if not disabled via environment variable
    if not os.environ.get("LEMONADE_DISABLE_BROWSER"):
        webbrowser.open(url)

    # Keep this process alive while the server thread we started is running
    if not server_previously_running:
        while server_thread.is_alive():
            time.sleep(0.5)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def load(model_name: str, port: int):
    """
    Load *model_name* on the running server via its /load endpoint.

    Raises:
        ModelLoadError: the server failed to load the model.
    """
    import requests

    # Ask the server to load the model
    load_response = requests.post(
        f"http://localhost:{port}/api/v1/load",
        json={"model_name": model_name},
    )

    if load_response.status_code != 200:
        raise ModelLoadError(
            f"Failed to load {model_name}. Check the "
            "Lemonade Server log for more information."
        )
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def version():
    """
    Print the installed Lemonade version number.
    """
    from lemonade import __version__ as version_number

    print(version_number)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def status(verbose: bool = True) -> Tuple[bool, int]:
    """
    Report whether the server is running.

    Args:
        verbose: when True, print a human-readable status line.

    Returns:
        A tuple of (running, port): ``running`` is True when a server was
        found; ``port`` is its listening port, or None when not running.
    """
    _, port = get_server_info()
    running = port is not None

    if verbose:
        if running:
            print(f"Server is running on port {port}")
        else:
            print("Server is not running")

    return (True, port) if running else (False, None)
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def is_lemonade_server(pid):
    """
    Check whether or not a given PID corresponds to a Lemonade server.

    Walks up the process tree starting at *pid*, returning True if any
    ancestor's executable name matches a known lemonade-server entry point.
    """
    # macOS only: Self-exclusion to prevent blocking server startup
    if platform.system() == "Darwin":
        current_pid = os.getpid()
        if pid == current_pid:
            return False

        # Exclude children of current process to avoid detecting status commands
        try:
            current_process = psutil.Process(current_pid)
            child_pids = [
                child.pid for child in current_process.children(recursive=True)
            ]
            if pid in child_pids:
                return False
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    try:
        process = psutil.Process(pid)

        # Walk up the parent chain until a match is found or the root is hit
        while True:
            process_name = process.name()
            if process_name in [  # Windows
                "lemonade-server-dev.exe",
                "lemonade-server.exe",
                "lsdev.exe",
            ] or process_name in [  # Linux
                "lemonade-server-dev",
                "lemonade-server",
                "lsdev",
            ]:
                return True
            # macOS only: Python scripts appear as "python3.x", check command line
            elif process_name.startswith("python") and platform.system() == "Darwin":
                try:
                    cmdline = process.cmdline()
                    if len(cmdline) >= 2:
                        # cmdline[1] is assumed to be the script path — the
                        # patterns below match the installed console scripts
                        script_path = cmdline[1]
                        # Check for various lemonade server command patterns (macOS only)
                        lemonade_patterns = [
                            "lemonade-server-dev",
                            "lemonade-server",
                            "lsdev",  # Short alias for lemonade-server-dev
                        ]
                        if any(pattern in script_path for pattern in lemonade_patterns):
                            return True
                except (psutil.AccessDenied, psutil.NoSuchProcess):
                    pass
            elif "llama-server" in process_name:
                # llama-server is a worker child, not the Lemonade server itself
                return False
            if not process.parent():
                # Reached the root of the process tree without a match
                return False
            process = process.parent()
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False
    return False
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def get_server_info() -> Tuple[int | None, int | None]:
    """
    Returns a tuple of:
    1. Lemonade Server's PID
    2. The port that Lemonade Server is running on

    Both values are None when no running server is found.
    """

    # Try the global approach first (works on Windows/Linux without permissions)
    try:
        connections = psutil.net_connections(kind="tcp4")
        for conn in connections:
            if conn.status == "LISTEN" and conn.laddr and conn.pid is not None:
                if is_lemonade_server(conn.pid):
                    return conn.pid, conn.laddr.port
    except (psutil.AccessDenied, PermissionError):
        # Global approach needs elevated permissions on macOS, fall back to per-process approach
        pass
    except Exception:  # pylint: disable=broad-exception-caught
        pass

    # Per-process approach (macOS only - needs this due to permission requirements)
    if platform.system() == "Darwin":
        try:
            for proc in psutil.process_iter(["pid", "name"]):
                try:
                    pid = proc.info["pid"]
                    if is_lemonade_server(pid):
                        # Found a lemonade server, check its listening ports
                        connections = proc.net_connections(kind="inet")
                        for conn in connections:
                            if conn.status == "LISTEN" and conn.laddr:
                                return pid, conn.laddr.port
                        # If no listening connections found, this process is not actually serving
                        # Continue looking for other processes
                except (
                    psutil.NoSuchProcess,
                    psutil.AccessDenied,
                    psutil.ZombieProcess,
                ):
                    # Some processes may be inaccessible, continue to next
                    continue
        except Exception:  # pylint: disable=broad-exception-caught
            pass

    # No running Lemonade Server found
    return None, None
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def list_models():
    """
    List recommended models and their download status as a text table.
    """
    from tabulate import tabulate
    from lemonade_server.model_manager import ModelManager

    model_manager = ModelManager()

    # Restrict the full catalog to models usable with the active backend
    supported_models = model_manager.supported_models
    filtered_models = model_manager.filter_models_by_backend(supported_models)
    downloaded_models = model_manager.downloaded_models

    rows = []
    for model_name, model_info in filtered_models.items():
        # Only recommended ("suggested") models are shown
        if not model_info.get("suggested", False):
            continue

        labels = model_info.get("labels", [])
        rows.append(
            [
                model_name,
                "Yes" if model_name in downloaded_models else "No",
                ", ".join(labels) if labels else "-",
            ]
        )

    # Downloaded models first, then alphabetical (case-insensitive)
    rows.sort(key=lambda row: (row[1] == "No", row[0].lower()))

    headers = ["Model Name", "Downloaded", "Details"]
    print(tabulate(rows, headers=headers, tablefmt="simple"))
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def developer_entrypoint():
    """
    Developer entry point that starts the server with debug logging.

    Equivalent to: lemonade-server-dev serve --log-level debug [additional args]

    Any arguments passed to the lsdev command are appended after the injected
    "serve --log-level debug" prefix.
    """
    # Preserve sys.argv so it can be restored after main() returns
    saved_argv = sys.argv.copy()

    try:
        # Rewrite argv to simulate "serve --log-level debug" plus whatever
        # the user passed on the lsdev command line
        extra_args = sys.argv[1:]
        sys.argv = [sys.argv[0], "serve", "--log-level", "debug", *extra_args]
        main()
    finally:
        sys.argv = saved_argv
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def _add_server_arguments(parser):
    """Register the server options shared by the `serve` and `run` commands."""

    # A previously persisted log level (if any) becomes the default
    persisted_log_level = load_setting("log_level", DEFAULT_LOG_LEVEL)

    parser.add_argument(
        "--port",
        type=int,
        default=DEFAULT_PORT,
        help="Port number to serve on",
    )
    parser.add_argument(
        "--host",
        type=str,
        default=DEFAULT_HOST,
        help="Address to bind for connections",
    )
    parser.add_argument(
        "--log-level",
        type=str,
        default=persisted_log_level,
        choices=["critical", "error", "warning", "info", "debug", "trace"],
        help="Log level for the server",
    )
    parser.add_argument(
        "--llamacpp",
        type=str,
        default=DEFAULT_LLAMACPP_BACKEND,
        choices=["vulkan", "rocm", "metal", "cpu"],
        help="LlamaCpp backend to use",
    )
    parser.add_argument(
        "--ctx-size",
        type=int,
        default=DEFAULT_CTX_SIZE,
        help=(
            f"Context size for the model (default: {DEFAULT_CTX_SIZE} for llamacpp, "
            "truncates prompts for other recipes)"
        ),
    )

    # The tray icon is only available on Windows and macOS
    if os.name == "nt" or platform.system() == "Darwin":
        parser.add_argument(
            "--no-tray",
            action="store_true",
            help="Do not show a tray icon when the server is running",
        )
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def _show_deprecation_notice():
|
|
631
|
+
"""Display deprecation notice for Python server, unless in CI mode."""
|
|
632
|
+
if os.environ.get("LEMONADE_CI_MODE"):
|
|
633
|
+
return
|
|
634
|
+
|
|
635
|
+
print("=" * 80)
|
|
636
|
+
print("DEPRECATION NOTICE")
|
|
637
|
+
print("=" * 80)
|
|
638
|
+
print("The Python-based 'lemonade-server-dev' command is deprecated.")
|
|
639
|
+
print("Please use the C++ Lemonade Server instead:")
|
|
640
|
+
print()
|
|
641
|
+
print(" • Windows and Linux: Download the installer from")
|
|
642
|
+
print(" https://github.com/lemonade-sdk/lemonade/releases/latest")
|
|
643
|
+
print()
|
|
644
|
+
print("The C++ server offers better performance and is the recommended option.")
|
|
645
|
+
print("This Python server will be removed in a future release.")
|
|
646
|
+
print("=" * 80)
|
|
647
|
+
print()
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def _parse_bool_flag(value: str) -> bool:
    """
    Parse a CLI string value into a bool.

    argparse's ``type=bool`` treats ANY non-empty string — including
    "False" — as True; this helper accepts the common spellings of false.
    """
    return value.strip().lower() not in ("", "0", "false", "no")


def main():
    """
    Parse the lemonade-server command line and dispatch to the
    serve/status/stop/list/pull/delete/run handlers.
    """
    # Show deprecation notice for --help/-h before argparse handles it
    if "--help" in sys.argv or "-h" in sys.argv or len(sys.argv) == 1:
        _show_deprecation_notice()

    parser = argparse.ArgumentParser(
        description="Serve LLMs on CPU, GPU, and NPU.",
        usage=argparse.SUPPRESS,
    )

    # Add version flag
    parser.add_argument(
        "-v", "--version", action="store_true", help="Show version number"
    )

    # Create subparsers for commands
    subparsers = parser.add_subparsers(
        title="Available Commands", dest="command", metavar=""
    )

    # Serve command
    serve_parser = subparsers.add_parser("serve", help="Start server")
    _add_server_arguments(serve_parser)

    # Status, stop, and list commands take no extra arguments
    subparsers.add_parser("status", help="Check if server is running")
    subparsers.add_parser("stop", help="Stop the server")
    subparsers.add_parser(
        "list", help="List recommended models and their download status"
    )

    # Pull command
    pull_parser = subparsers.add_parser(
        "pull",
        help="Install an LLM",
        epilog=(
            "More information: "
            "https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
        ),
    )
    pull_parser.add_argument(
        "model",
        help="Lemonade Server model name",
        nargs="+",
    )
    pull_parser.add_argument(
        "--checkpoint",
        help="For registering a new model: Hugging Face checkpoint to source the model from",
    )
    pull_parser.add_argument(
        "--recipe",
        help="For registering a new model: lemonade.api recipe to use with the model",
    )
    pull_parser.add_argument(
        "--reasoning",
        help="For registering a new model: whether the model is a reasoning model or not",
        # Fixed: was type=bool, which parsed "--reasoning False" as True
        type=_parse_bool_flag,
        default=False,
    )
    pull_parser.add_argument(
        "--vision",
        # New: pull() already supports vision models, but the flag was
        # missing from the CLI
        help="For registering a new model: whether the model is a vision model or not",
        action="store_true",
    )
    pull_parser.add_argument(
        "--mmproj",
        help="For registering a new multimodal model: full file name of the .mmproj file in the checkpoint",
    )

    # Delete command
    delete_parser = subparsers.add_parser(
        "delete",
        help="Delete an LLM",
        epilog=(
            "More information: "
            "https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
        ),
    )
    delete_parser.add_argument(
        "model",
        help="Lemonade Server model name",
        nargs="+",
    )

    # Run command
    run_parser = subparsers.add_parser(
        "run",
        help="Chat with specified model (starts server if needed)",
    )
    run_parser.add_argument(
        "model",
        help="Lemonade Server model name to run",
    )
    _add_server_arguments(run_parser)

    args = parser.parse_args()

    # --no-tray is only registered on Windows/macOS; default it elsewhere
    if os.name != "nt" and platform.system() != "Darwin":
        args.no_tray = True

    if args.version:
        version()
    elif args.command == "serve":
        _show_deprecation_notice()
        _, running_port = get_server_info()
        if running_port is not None:
            print(
                (
                    f"Lemonade Server is already running on port {running_port}\n"
                    "Please stop the existing server before starting a new instance."
                ),
            )
            sys.exit(ExitCodes.SERVER_ALREADY_RUNNING)
        serve(
            port=args.port,
            host=args.host,
            log_level=args.log_level,
            tray=not args.no_tray,
            llamacpp_backend=args.llamacpp,
            ctx_size=args.ctx_size,
        )
    elif args.command == "status":
        status()
    elif args.command == "list":
        list_models()
    elif args.command == "pull":
        pull(
            args.model,
            checkpoint=args.checkpoint,
            recipe=args.recipe,
            reasoning=args.reasoning,
            vision=args.vision,
            mmproj=args.mmproj,
        )
    elif args.command == "delete":
        delete(args.model)
    elif args.command == "stop":
        stop()
    elif args.command == "run":
        _show_deprecation_notice()
        run(
            args.model,
            port=args.port,
            host=args.host,
            log_level=args.log_level,
            tray=not args.no_tray,
            llamacpp_backend=args.llamacpp,
            ctx_size=args.ctx_size,
        )
    elif args.command == "help" or not args.command:
        parser.print_help()
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
# Allow the CLI to be invoked directly as a script
if __name__ == "__main__":
    main()
|
|
803
|
+
|
|
804
|
+
# This file was originally licensed under Apache 2.0. It has been modified.
|
|
805
|
+
# Modifications Copyright (c) 2025 AMD
|