lemonade-sdk 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +180 -0
  3. lemonade/cache.py +92 -0
  4. lemonade/cli.py +173 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/build.py +176 -0
  7. lemonade/common/cli_helpers.py +139 -0
  8. lemonade/common/exceptions.py +98 -0
  9. lemonade/common/filesystem.py +368 -0
  10. lemonade/common/inference_engines.py +408 -0
  11. lemonade/common/network.py +93 -0
  12. lemonade/common/printing.py +110 -0
  13. lemonade/common/status.py +471 -0
  14. lemonade/common/system_info.py +1411 -0
  15. lemonade/common/test_helpers.py +28 -0
  16. lemonade/profilers/__init__.py +1 -0
  17. lemonade/profilers/agt_power.py +437 -0
  18. lemonade/profilers/hwinfo_power.py +429 -0
  19. lemonade/profilers/memory_tracker.py +259 -0
  20. lemonade/profilers/profiler.py +58 -0
  21. lemonade/sequence.py +363 -0
  22. lemonade/state.py +159 -0
  23. lemonade/tools/__init__.py +1 -0
  24. lemonade/tools/accuracy.py +432 -0
  25. lemonade/tools/adapter.py +114 -0
  26. lemonade/tools/bench.py +302 -0
  27. lemonade/tools/flm/__init__.py +1 -0
  28. lemonade/tools/flm/utils.py +305 -0
  29. lemonade/tools/huggingface/bench.py +187 -0
  30. lemonade/tools/huggingface/load.py +235 -0
  31. lemonade/tools/huggingface/utils.py +359 -0
  32. lemonade/tools/humaneval.py +264 -0
  33. lemonade/tools/llamacpp/bench.py +255 -0
  34. lemonade/tools/llamacpp/load.py +222 -0
  35. lemonade/tools/llamacpp/utils.py +1260 -0
  36. lemonade/tools/management_tools.py +319 -0
  37. lemonade/tools/mmlu.py +319 -0
  38. lemonade/tools/oga/__init__.py +0 -0
  39. lemonade/tools/oga/bench.py +120 -0
  40. lemonade/tools/oga/load.py +804 -0
  41. lemonade/tools/oga/migration.py +403 -0
  42. lemonade/tools/oga/utils.py +462 -0
  43. lemonade/tools/perplexity.py +147 -0
  44. lemonade/tools/prompt.py +263 -0
  45. lemonade/tools/report/__init__.py +0 -0
  46. lemonade/tools/report/llm_report.py +203 -0
  47. lemonade/tools/report/table.py +899 -0
  48. lemonade/tools/server/__init__.py +0 -0
  49. lemonade/tools/server/flm.py +133 -0
  50. lemonade/tools/server/llamacpp.py +320 -0
  51. lemonade/tools/server/serve.py +2123 -0
  52. lemonade/tools/server/static/favicon.ico +0 -0
  53. lemonade/tools/server/static/index.html +279 -0
  54. lemonade/tools/server/static/js/chat.js +1059 -0
  55. lemonade/tools/server/static/js/model-settings.js +183 -0
  56. lemonade/tools/server/static/js/models.js +1395 -0
  57. lemonade/tools/server/static/js/shared.js +556 -0
  58. lemonade/tools/server/static/logs.html +191 -0
  59. lemonade/tools/server/static/styles.css +2654 -0
  60. lemonade/tools/server/static/webapp.html +321 -0
  61. lemonade/tools/server/tool_calls.py +153 -0
  62. lemonade/tools/server/tray.py +664 -0
  63. lemonade/tools/server/utils/macos_tray.py +226 -0
  64. lemonade/tools/server/utils/port.py +77 -0
  65. lemonade/tools/server/utils/thread.py +85 -0
  66. lemonade/tools/server/utils/windows_tray.py +408 -0
  67. lemonade/tools/server/webapp.py +34 -0
  68. lemonade/tools/server/wrapped_server.py +559 -0
  69. lemonade/tools/tool.py +374 -0
  70. lemonade/version.py +1 -0
  71. lemonade_install/__init__.py +1 -0
  72. lemonade_install/install.py +239 -0
  73. lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
  74. lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
  75. lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
  76. lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
  77. lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
  78. lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
  79. lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
  80. lemonade_server/cli.py +805 -0
  81. lemonade_server/model_manager.py +758 -0
  82. lemonade_server/pydantic_models.py +159 -0
  83. lemonade_server/server_models.json +643 -0
  84. lemonade_server/settings.py +39 -0
lemonade_server/cli.py ADDED
@@ -0,0 +1,805 @@
1
+ import argparse
2
+ import sys
3
+ import os
4
+ import platform
5
+ from typing import Tuple, Optional
6
+ import psutil
7
+ from typing import List
8
+ from lemonade_server.pydantic_models import (
9
+ DEFAULT_PORT,
10
+ DEFAULT_HOST,
11
+ DEFAULT_LOG_LEVEL,
12
+ DEFAULT_LLAMACPP_BACKEND,
13
+ DEFAULT_CTX_SIZE,
14
+ )
15
+ from lemonade_server.settings import load_setting
16
+
17
+
18
# Error codes for different CLI scenarios
class ExitCodes:
    """Process exit codes used by the lemonade-server CLI via sys.exit()."""

    SUCCESS = 0
    GENERAL_ERROR = 1
    # Returned by `serve`/`run` when another Lemonade Server instance is detected
    SERVER_ALREADY_RUNNING = 2
    # Returned by `stop` when waiting for shutdown times out
    TIMEOUT_STOPPING_SERVER = 3
    # Returned by `stop` on any other unexpected failure
    ERROR_STOPPING_SERVER = 4
25
+
26
+
27
class PullError(Exception):
    """
    The pull command has failed to install an LLM

    Raised by pull() when the server's /pull endpoint returns a non-200 status.
    """
31
+
32
+
33
class DeleteError(Exception):
    """
    The delete command has failed to delete an LLM

    Raised by delete() when the server's /delete endpoint returns a non-200 status.
    """
37
+
38
+
39
class ServerTimeoutError(Exception):
    """
    The server failed to start within the timeout period

    NOTE(review): not raised anywhere in this module's visible code;
    presumably reserved for callers that enforce startup timeouts.
    """
43
+
44
+
45
class ModelNotAvailableError(Exception):
    """
    The specified model is not available on the server

    NOTE(review): not raised anywhere in this module's visible code;
    presumably used by callers that query model availability.
    """
49
+
50
+
51
class ModelLoadError(Exception):
    """
    The model failed to load on the server

    Raised by load() when the server's /load endpoint returns a non-200 status.
    """
55
+
56
+
57
def serve(
    port: Optional[int] = None,
    host: Optional[str] = None,
    log_level: Optional[str] = None,
    tray: bool = False,
    use_thread: bool = False,
    llamacpp_backend: Optional[str] = None,
    ctx_size: Optional[int] = None,
):
    """
    Execute the serve command

    Starts Lemonade Server either in the foreground (blocking, when
    `use_thread` is False) or on a background daemon thread. In threaded
    mode, waits up to 30 seconds for the server to become reachable
    before returning; the wait is best-effort and this function returns
    even if readiness was never observed.

    Returns a tuple of (port, server_thread). `server_thread` is None in
    blocking mode.
    """

    # Deferred import keeps CLI startup fast for non-serve commands
    print("Starting Lemonade Server...")
    from lemonade.tools.server.serve import Server

    # Fall back to the package defaults for any unset option
    port = port if port is not None else DEFAULT_PORT
    host = host if host is not None else DEFAULT_HOST
    log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
    llamacpp_backend = (
        llamacpp_backend if llamacpp_backend is not None else DEFAULT_LLAMACPP_BACKEND
    )
    ctx_size = ctx_size if ctx_size is not None else DEFAULT_CTX_SIZE

    # Start the server
    server = Server(
        port=port,
        host=host,
        log_level=log_level,
        ctx_size=ctx_size,
        tray=tray,
        llamacpp_backend=llamacpp_backend,
    )

    # BUGFIX: previously unbound in the blocking branch, so the final
    # `return port, server_thread` raised UnboundLocalError once
    # server.run() returned (e.g. after a graceful shutdown).
    server_thread = None

    if not use_thread:
        # Blocking mode: run the server in this thread until it exits
        server.run()
    else:
        from threading import Thread
        import time

        # Start a background thread to run the server
        # (daemon so it cannot keep the process alive on exit)
        server_thread = Thread(
            target=server.run,
            daemon=True,
        )
        server_thread.start()

        # Wait for the server to be ready
        max_wait_time = 30
        wait_interval = 0.5
        waited = 0

        if platform.system() == "Darwin":
            # On macOS, use direct HTTP health check instead of process scanning for better
            # performance
            import requests

            while waited < max_wait_time:
                time.sleep(wait_interval)
                try:
                    response = requests.get(
                        f"http://{host}:{port}/api/v1/health", timeout=1
                    )
                    if response.status_code == 200:
                        break
                except (
                    requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout,
                ):
                    pass  # Server not ready yet
                waited += wait_interval
        else:
            # On other platforms, use the existing approach:
            # poll process/port scanning until a Lemonade Server shows up
            while waited < max_wait_time:
                time.sleep(wait_interval)
                _, running_port = get_server_info()
                if running_port is not None:
                    break
                waited += wait_interval

    return port, server_thread
138
+
139
+
140
def stop():
    """
    Stop the Lemonade Server

    Finds the running server process, terminates it and any llama-server
    child processes, and escalates from terminate() to kill() for
    anything that does not exit within the grace period. Calls
    sys.exit() with a nonzero code on timeout or unexpected errors.
    """

    # Check if Lemonade Server is running
    running_pid, running_port = get_server_info()
    if running_port is None:
        # Not an error: stopping an already-stopped server is a no-op.
        # (BUGFIX: removed pointless f-string prefix; output unchanged.)
        print("Lemonade Server is not running\n")
        return

    # Stop the server
    try:
        process = psutil.Process(running_pid)

        # Get all child processes (including llama-server)
        children = process.children(recursive=True)

        # Terminate the main process first
        process.terminate()

        # Then terminate llama-server child process (known to be stubborn)
        # We avoid killing other child processes, such as the installer
        for child in children:
            if "llama-server" in child.name():
                try:
                    child.terminate()
                except psutil.NoSuchProcess:
                    pass  # Child already terminated

        # Wait for main process to terminate gracefully
        # kill if it doesn't terminate gracefully
        try:
            process.wait(timeout=5)
        except psutil.TimeoutExpired:
            process.kill()

        # Kill llama-server child process if it didn't terminate gracefully
        for child in children:
            if "llama-server" in child.name():
                try:
                    if child.is_running():
                        child.kill()
                except psutil.NoSuchProcess:
                    pass  # Child already terminated
    except psutil.NoSuchProcess:
        # Process already terminated
        pass
    except psutil.TimeoutExpired:
        # Defensive: only reachable if a wait above re-raises past the
        # inner handler
        print("Timed out waiting for Lemonade Server to stop.")
        sys.exit(ExitCodes.TIMEOUT_STOPPING_SERVER)
    except Exception as e:  # pylint: disable=broad-exception-caught
        print(f"Error stopping Lemonade Server: {e}")
        sys.exit(ExitCodes.ERROR_STOPPING_SERVER)
    print("Lemonade Server stopped successfully.")
195
+
196
+
197
def pull(
    model_names: List[str],
    checkpoint: Optional[str] = None,
    recipe: Optional[str] = None,
    reasoning: bool = False,
    vision: bool = False,
    mmproj: str = "",
):
    """
    Install an LLM based on its Lemonade Server model name

    If Lemonade Server is running, use the pull endpoint to download the model
    so that the Lemonade Server instance is aware of the pull.

    Otherwise, use ModelManager to install the model.
    """

    server_running, port = status(verbose=False)

    if not server_running:
        # No server running: install directly via the model manager
        from lemonade_server.model_manager import ModelManager

        ModelManager().download_models(
            model_names,
            checkpoint=checkpoint,
            recipe=recipe,
            reasoning=reasoning,
            vision=vision,
            mmproj=mmproj,
            # The pull command will download an upgraded model if available, even
            # if we already have a local copy of the model
            do_not_upgrade=False,
        )
        return

    import requests

    base_url = f"http://localhost:{port}/api/v1"

    # Optional registration parameters; only truthy values are sent
    extras = {
        "checkpoint": checkpoint,
        "recipe": recipe,
        "reasoning": reasoning,
        "vision": vision,
        "mmproj": mmproj,
    }

    for model_name in model_names:
        payload = {"model_name": model_name}
        payload.update({key: value for key, value in extras.items() if value})

        # Install the model through the running server
        pull_response = requests.post(f"{base_url}/pull", json=payload)

        if pull_response.status_code != 200:
            raise PullError(
                f"Failed to install {model_name}. Check the "
                "Lemonade Server log for more information. You can list "
                "supported models with `lemonade-server list`"
            )
258
+
259
+
260
def delete(model_names: List[str]):
    """
    Delete an LLM based on its Lemonade Server model name

    If Lemonade Server is running, use the delete endpoint to delete the model
    so that the Lemonade Server instance is aware of the deletion.

    Otherwise, use ModelManager to delete the model.
    """

    server_running, port = status(verbose=False)

    if not server_running:
        # No server to notify: remove the models directly
        from lemonade_server.model_manager import ModelManager

        for model_name in model_names:
            ModelManager().delete_model(model_name)
        return

    import requests

    base_url = f"http://localhost:{port}/api/v1"

    for model_name in model_names:
        # Ask the running server to delete the model
        delete_response = requests.post(
            f"{base_url}/delete", json={"model_name": model_name}
        )

        if delete_response.status_code != 200:
            raise DeleteError(
                f"Failed to delete {model_name}. Check the "
                "Lemonade Server log for more information."
            )
293
+
294
+
295
def run(
    model_name: str,
    port: Optional[int] = None,
    host: str = "localhost",
    log_level: Optional[str] = None,
    tray: bool = False,
    llamacpp_backend: Optional[str] = None,
    ctx_size: Optional[int] = None,
):
    """
    Start the server if not running and open the chat interface with the specified model

    Pulls and loads `model_name`, prints the chat URL, opens it in a
    browser (unless LEMONADE_DISABLE_BROWSER is set), and — if this call
    started the server — blocks until the server thread exits.
    """
    import webbrowser
    import time
    import os

    # Disable tray on macOS for run command due to threading issues
    if platform.system() == "Darwin":
        tray = False

    # Start the server if not running
    _, running_port = get_server_info()
    server_previously_running = running_port is not None
    if not server_previously_running:
        # serve() in threaded mode returns (port, daemon server thread)
        port, server_thread = serve(
            port=port,
            host=host,
            log_level=log_level,
            tray=tray,
            use_thread=True,
            llamacpp_backend=llamacpp_backend,
            ctx_size=ctx_size,
        )
    else:
        # macOS: Check for port conflicts when server is already running
        if platform.system() == "Darwin":
            requested_port = port if port is not None else DEFAULT_PORT
            if running_port != requested_port:
                print(
                    f"Lemonade Server is already running on port {running_port}\n"
                    f"You requested port {requested_port}. Please stop the existing server first "
                )
                sys.exit(ExitCodes.SERVER_ALREADY_RUNNING)

        # Reuse the existing server's port for all subsequent requests
        port = running_port

    # Pull model
    pull([model_name])

    # Load model
    load(model_name, port)

    # Open the chat interface with the specified model
    url = f"http://{host}:{port}/?model={model_name}#llm-chat"
    print(f"You can now chat with {model_name} at {url}")

    # Only open browser if not disabled via environment variable
    if not os.environ.get("LEMONADE_DISABLE_BROWSER"):
        webbrowser.open(url)

    # Keep the server running if we started it
    # (the server thread is a daemon, so we must block here to keep the
    # process — and therefore the server — alive)
    if not server_previously_running:
        while server_thread.is_alive():
            time.sleep(0.5)
359
+
360
+
361
def load(model_name: str, port: int):
    """
    Load a model using the endpoint
    """
    import requests

    # Ask the running server to load the requested model
    response = requests.post(
        f"http://localhost:{port}/api/v1/load", json={"model_name": model_name}
    )

    if response.status_code != 200:
        raise ModelLoadError(
            f"Failed to load {model_name}. Check the "
            "Lemonade Server log for more information."
        )
376
+
377
+
378
def version():
    """
    Print the version number
    """
    from lemonade import __version__ as version_number

    # print() applies str() itself, so no f-string is needed
    print(version_number)
385
+
386
+
387
def status(verbose: bool = True) -> Tuple[bool, int]:
    """
    Print the status of the server

    Returns a tuple of:
    1. Whether the server is running
    2. What port the server is running on (None if server is not running)
    """
    _, port = get_server_info()
    running = port is not None

    if verbose:
        if running:
            print(f"Server is running on port {port}")
        else:
            print("Server is not running")

    # port is already None when no server was found
    return running, port
404
+
405
+
406
def is_lemonade_server(pid):
    """
    Check whether or not a given PID corresponds to a Lemonade server

    Walks up the process tree starting at `pid`: a process counts as a
    Lemonade server if it, or any of its ancestors, matches a known
    server executable name (or, on macOS, a python process whose command
    line references a lemonade-server script). llama-server workers and
    inaccessible processes return False.
    """
    # macOS only: Self-exclusion to prevent blocking server startup
    if platform.system() == "Darwin":
        current_pid = os.getpid()
        if pid == current_pid:
            return False

        # Exclude children of current process to avoid detecting status commands
        try:
            current_process = psutil.Process(current_pid)
            child_pids = [
                child.pid for child in current_process.children(recursive=True)
            ]
            if pid in child_pids:
                return False
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    try:
        process = psutil.Process(pid)

        # Walk the ancestor chain until a match or the tree root
        while True:
            process_name = process.name()
            if process_name in [  # Windows
                "lemonade-server-dev.exe",
                "lemonade-server.exe",
                "lsdev.exe",
            ] or process_name in [  # Linux
                "lemonade-server-dev",
                "lemonade-server",
                "lsdev",
            ]:
                return True
            # macOS only: Python scripts appear as "python3.x", check command line
            elif process_name.startswith("python") and platform.system() == "Darwin":
                try:
                    cmdline = process.cmdline()
                    if len(cmdline) >= 2:
                        script_path = cmdline[1]
                        # Check for various lemonade server command patterns (macOS only)
                        lemonade_patterns = [
                            "lemonade-server-dev",
                            "lemonade-server",
                            "lsdev",  # Short alias for lemonade-server-dev
                        ]
                        if any(pattern in script_path for pattern in lemonade_patterns):
                            return True
                except (psutil.AccessDenied, psutil.NoSuchProcess):
                    pass
            elif "llama-server" in process_name:
                # llama-server is a model worker, not the server itself
                return False
            if not process.parent():
                return False
            process = process.parent()
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False
    # Unreachable (the loop always returns), kept for safety
    return False
466
+
467
+
468
def get_server_info() -> Tuple[int | None, int | None]:
    """
    Returns a tuple of:
    1. Lemonade Server's PID
    2. The port that Lemonade Server is running on

    Both values are None when no running Lemonade Server is found.
    """

    # Try the global approach first (works on Windows/Linux without permissions)
    try:
        connections = psutil.net_connections(kind="tcp4")
        for conn in connections:
            if conn.status == "LISTEN" and conn.laddr and conn.pid is not None:
                if is_lemonade_server(conn.pid):
                    return conn.pid, conn.laddr.port
    except (psutil.AccessDenied, PermissionError):
        # Global approach needs elevated permissions on macOS, fall back to per-process approach
        pass
    except Exception:  # pylint: disable=broad-exception-caught
        # Best-effort scan: any other failure falls through to the
        # macOS per-process path (or to "not found")
        pass

    # Per-process approach (macOS only - needs this due to permission requirements)
    if platform.system() == "Darwin":
        try:
            for proc in psutil.process_iter(["pid", "name"]):
                try:
                    pid = proc.info["pid"]
                    if is_lemonade_server(pid):
                        # Found a lemonade server, check its listening ports
                        connections = proc.net_connections(kind="inet")
                        for conn in connections:
                            if conn.status == "LISTEN" and conn.laddr:
                                return pid, conn.laddr.port
                        # If no listening connections found, this process is not actually serving
                        # Continue looking for other processes
                except (
                    psutil.NoSuchProcess,
                    psutil.AccessDenied,
                    psutil.ZombieProcess,
                ):
                    # Some processes may be inaccessible, continue to next
                    continue
        except Exception:  # pylint: disable=broad-exception-caught
            pass

    return None, None
513
+
514
+
515
def list_models():
    """
    List recommended models and their download status
    """
    from tabulate import tabulate
    from lemonade_server.model_manager import ModelManager

    model_manager = ModelManager()

    # Gather supported (backend-filtered) and locally downloaded models
    supported_models = model_manager.supported_models
    filtered_models = model_manager.filter_models_by_backend(supported_models)
    downloaded_models = model_manager.downloaded_models

    rows = []
    for name, info in filtered_models.items():
        # Only recommended ("suggested") models are shown
        if not info.get("suggested", False):
            continue

        downloaded = "Yes" if name in downloaded_models else "No"

        # Get model labels/type
        labels = info.get("labels", [])
        details = ", ".join(labels) if labels else "-"

        rows.append([name, downloaded, details])

    # Downloaded models first, then case-insensitive alphabetical order
    rows.sort(key=lambda row: (row[1] == "No", row[0].lower()))

    # Display table
    print(
        tabulate(rows, headers=["Model Name", "Downloaded", "Details"], tablefmt="simple")
    )
554
+
555
+
556
def developer_entrypoint():
    """
    Developer entry point that starts the server with debug logging
    Equivalent to running: lemonade-server-dev serve --log-level debug [additional args]

    This function automatically prepends "serve --log-level debug" to any arguments
    passed to the lsdev command.
    """
    # Preserve argv so we can restore it even if main() raises
    saved_argv = sys.argv.copy()

    try:
        # Any extra arguments given to lsdev are forwarded after
        # "serve --log-level debug"
        extra = sys.argv[1:]

        sys.argv = [sys.argv[0], "serve", "--log-level", "debug", *extra]
        main()
    finally:
        sys.argv = saved_argv
578
+
579
+
580
def _add_server_arguments(parser):
    """Add common server arguments to a parser"""

    # The persisted setting (if any) becomes the default log level
    saved_log_level = load_setting("log_level", DEFAULT_LOG_LEVEL)

    parser.add_argument(
        "--port", type=int, default=DEFAULT_PORT, help="Port number to serve on"
    )
    parser.add_argument(
        "--host",
        type=str,
        default=DEFAULT_HOST,
        help="Address to bind for connections",
    )
    parser.add_argument(
        "--log-level",
        type=str,
        default=saved_log_level,
        choices=["critical", "error", "warning", "info", "debug", "trace"],
        help="Log level for the server",
    )
    parser.add_argument(
        "--llamacpp",
        type=str,
        default=DEFAULT_LLAMACPP_BACKEND,
        choices=["vulkan", "rocm", "metal", "cpu"],
        help="LlamaCpp backend to use",
    )
    parser.add_argument(
        "--ctx-size",
        type=int,
        default=DEFAULT_CTX_SIZE,
        help=(
            f"Context size for the model (default: {DEFAULT_CTX_SIZE} for llamacpp, "
            "truncates prompts for other recipes)"
        ),
    )

    # The tray icon is only offered on Windows and macOS
    if os.name == "nt" or platform.system() == "Darwin":
        parser.add_argument(
            "--no-tray",
            action="store_true",
            help="Do not show a tray icon when the server is running",
        )
628
+
629
+
630
+ def _show_deprecation_notice():
631
+ """Display deprecation notice for Python server, unless in CI mode."""
632
+ if os.environ.get("LEMONADE_CI_MODE"):
633
+ return
634
+
635
+ print("=" * 80)
636
+ print("DEPRECATION NOTICE")
637
+ print("=" * 80)
638
+ print("The Python-based 'lemonade-server-dev' command is deprecated.")
639
+ print("Please use the C++ Lemonade Server instead:")
640
+ print()
641
+ print(" • Windows and Linux: Download the installer from")
642
+ print(" https://github.com/lemonade-sdk/lemonade/releases/latest")
643
+ print()
644
+ print("The C++ server offers better performance and is the recommended option.")
645
+ print("This Python server will be removed in a future release.")
646
+ print("=" * 80)
647
+ print()
648
+
649
+
650
+ def main():
651
+ # Show deprecation notice for --help/-h before argparse handles it
652
+ if "--help" in sys.argv or "-h" in sys.argv or len(sys.argv) == 1:
653
+ _show_deprecation_notice()
654
+
655
+ parser = argparse.ArgumentParser(
656
+ description="Serve LLMs on CPU, GPU, and NPU.",
657
+ usage=argparse.SUPPRESS,
658
+ )
659
+
660
+ # Add version flag
661
+ parser.add_argument(
662
+ "-v", "--version", action="store_true", help="Show version number"
663
+ )
664
+
665
+ # Create subparsers for commands
666
+ subparsers = parser.add_subparsers(
667
+ title="Available Commands", dest="command", metavar=""
668
+ )
669
+
670
+ # Serve command
671
+ serve_parser = subparsers.add_parser("serve", help="Start server")
672
+ _add_server_arguments(serve_parser)
673
+
674
+ # Status command
675
+ status_parser = subparsers.add_parser("status", help="Check if server is running")
676
+
677
+ # Stop command
678
+ stop_parser = subparsers.add_parser("stop", help="Stop the server")
679
+
680
+ # List command
681
+ list_parser = subparsers.add_parser(
682
+ "list", help="List recommended models and their download status"
683
+ )
684
+
685
+ # Pull command
686
+ pull_parser = subparsers.add_parser(
687
+ "pull",
688
+ help="Install an LLM",
689
+ epilog=(
690
+ "More information: "
691
+ "https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
692
+ ),
693
+ )
694
+ pull_parser.add_argument(
695
+ "model",
696
+ help="Lemonade Server model name",
697
+ nargs="+",
698
+ )
699
+ pull_parser.add_argument(
700
+ "--checkpoint",
701
+ help="For registering a new model: Hugging Face checkpoint to source the model from",
702
+ )
703
+ pull_parser.add_argument(
704
+ "--recipe",
705
+ help="For registering a new model: lemonade.api recipe to use with the model",
706
+ )
707
+ pull_parser.add_argument(
708
+ "--reasoning",
709
+ help="For registering a new model: whether the model is a reasoning model or not",
710
+ type=bool,
711
+ default=False,
712
+ )
713
+ pull_parser.add_argument(
714
+ "--mmproj",
715
+ help="For registering a new multimodal model: full file name of the .mmproj file in the checkpoint",
716
+ )
717
+
718
+ # Delete command
719
+ delete_parser = subparsers.add_parser(
720
+ "delete",
721
+ help="Delete an LLM",
722
+ epilog=(
723
+ "More information: "
724
+ "https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
725
+ ),
726
+ )
727
+ delete_parser.add_argument(
728
+ "model",
729
+ help="Lemonade Server model name",
730
+ nargs="+",
731
+ )
732
+
733
+ # Run command
734
+ run_parser = subparsers.add_parser(
735
+ "run",
736
+ help="Chat with specified model (starts server if needed)",
737
+ )
738
+ run_parser.add_argument(
739
+ "model",
740
+ help="Lemonade Server model name to run",
741
+ )
742
+ _add_server_arguments(run_parser)
743
+
744
+ args = parser.parse_args()
745
+
746
+ if os.name != "nt" and platform.system() != "Darwin":
747
+ args.no_tray = True
748
+
749
+ if args.version:
750
+ version()
751
+ elif args.command == "serve":
752
+ _show_deprecation_notice()
753
+ _, running_port = get_server_info()
754
+ if running_port is not None:
755
+ print(
756
+ (
757
+ f"Lemonade Server is already running on port {running_port}\n"
758
+ "Please stop the existing server before starting a new instance."
759
+ ),
760
+ )
761
+ sys.exit(ExitCodes.SERVER_ALREADY_RUNNING)
762
+ serve(
763
+ port=args.port,
764
+ host=args.host,
765
+ log_level=args.log_level,
766
+ tray=not args.no_tray,
767
+ llamacpp_backend=args.llamacpp,
768
+ ctx_size=args.ctx_size,
769
+ )
770
+ elif args.command == "status":
771
+ status()
772
+ elif args.command == "list":
773
+ list_models()
774
+ elif args.command == "pull":
775
+ pull(
776
+ args.model,
777
+ checkpoint=args.checkpoint,
778
+ recipe=args.recipe,
779
+ reasoning=args.reasoning,
780
+ mmproj=args.mmproj,
781
+ )
782
+ elif args.command == "delete":
783
+ delete(args.model)
784
+ elif args.command == "stop":
785
+ stop()
786
+ elif args.command == "run":
787
+ _show_deprecation_notice()
788
+ run(
789
+ args.model,
790
+ port=args.port,
791
+ host=args.host,
792
+ log_level=args.log_level,
793
+ tray=not args.no_tray,
794
+ llamacpp_backend=args.llamacpp,
795
+ ctx_size=args.ctx_size,
796
+ )
797
+ elif args.command == "help" or not args.command:
798
+ parser.print_help()
799
+
800
+
801
+ if __name__ == "__main__":
802
+ main()
803
+
804
+ # This file was originally licensed under Apache 2.0. It has been modified.
805
+ # Modifications Copyright (c) 2025 AMD