lemonade-sdk 8.0.5__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

This version of lemonade-sdk has been flagged as potentially problematic.

@@ -758,15 +758,18 @@ class LemonadePerfTable(Table):
             data[key] = model_stats.get(key, "")
 
         # Create a new entry with Driver Versions and relevant Python Packages
-        sw_versions = [
-            key + ": " + value
-            for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
-        ]
-        sw_versions += [
-            pkg
-            for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
-            if any(name in pkg for name in PYTHON_PACKAGES)
-        ]
+        sw_versions = []
+        if "Driver Versions" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                key + ": " + value
+                for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
+            ]
+        if "Python Packages" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                pkg
+                for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
+                if any(name in pkg for name in PYTHON_PACKAGES)
+            ]
         if isinstance(data[Keys.RYZEN_AI_VERSION_INFO], dict):
             sw_versions += [
                 "Ryzen AI: " + value
@@ -1,13 +1,11 @@
-import sys
 import os
+import sys
 import logging
 import time
 import subprocess
-import zipfile
 import re
 import threading
 import platform
-import shutil
 
 import requests
 from tabulate import tabulate
@@ -18,12 +16,18 @@ from openai import OpenAI
 
 from lemonade_server.pydantic_models import (
     ChatCompletionRequest,
+    CompletionRequest,
     PullConfig,
     EmbeddingsRequest,
     RerankingRequest,
 )
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.server.utils.port import find_free_port
+from lemonade.tools.llamacpp.utils import (
+    get_llama_server_exe_path,
+    install_llamacpp,
+    download_gguf,
+)
 
 LLAMA_VERSION = "b5787"
 
@@ -80,39 +84,6 @@ def get_binary_url_and_filename(version):
     return url, filename
 
 
-def validate_platform_support():
-    """
-    Validate platform support before attempting download
-    """
-    system = platform.system().lower()
-
-    if system not in ["windows", "linux"]:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=(
-                f"Platform {system} not supported for llamacpp. "
-                "Supported: Windows, Ubuntu Linux"
-            ),
-        )
-
-    if system == "linux":
-        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
-        try:
-            with open("/etc/os-release", "r", encoding="utf-8") as f:
-                os_info = f.read().lower()
-                if "ubuntu" not in os_info and "debian" not in os_info:
-                    logging.warning(
-                        "llamacpp binaries are built for Ubuntu. "
-                        "Compatibility with other Linux distributions is not guaranteed."
-                    )
-        except (FileNotFoundError, PermissionError, OSError) as e:
-            logging.warning(
-                "Could not determine Linux distribution (%s). "
-                "llamacpp binaries are built for Ubuntu.",
-                str(e),
-            )
-
-
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
@@ -283,7 +254,7 @@ def _launch_llama_subprocess(
     """
 
     # Get the current executable path (handles both Windows and Ubuntu structures)
-    _, exe_path = get_llama_server_paths()
+    exe_path = get_llama_server_exe_path()
 
     # Build the base command
     base_command = [exe_path, "-m", snapshot_files["variant"]]
@@ -350,68 +321,23 @@ def _launch_llama_subprocess(
 
 
 def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
-    # Validate platform support before proceeding
-    validate_platform_support()
+    # Install and/or update llama.cpp if needed
+    try:
+        install_llamacpp()
+    except NotImplementedError as e:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
+        )
 
     # Get platform-specific paths at runtime
-    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
-
-    # Check whether the llamacpp install needs an upgrade
-    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
-    if os.path.exists(version_txt_path):
-        with open(version_txt_path, "r", encoding="utf-8") as f:
-            llamacpp_installed_version = f.read()
-
-        if llamacpp_installed_version != LLAMA_VERSION:
-            # Remove the existing install, which will trigger a new install
-            # in the next code block
-            shutil.rmtree(llama_server_exe_dir)
-
-    # Download llama.cpp server if it isn't already available
-    if not os.path.exists(llama_server_exe_dir):
-        # Download llama.cpp server zip
-        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
-        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
-        logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
-
-        with requests.get(llama_zip_url, stream=True) as r:
-            r.raise_for_status()
-            with open(llama_zip_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-
-        # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
-        with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(llama_server_exe_dir)
-
-        # Make executable on Linux - need to update paths after extraction
-        if platform.system().lower() == "linux":
-            # Re-get the paths since extraction might have changed the directory structure
-            _, updated_exe_path = get_llama_server_paths()
-            if os.path.exists(updated_exe_path):
-                os.chmod(updated_exe_path, 0o755)
-                logging.info(f"Set executable permissions for {updated_exe_path}")
-            else:
-                logging.warning(
-                    f"Could not find llama-server executable at {updated_exe_path}"
-                )
-
-        # Save version.txt
-        with open(version_txt_path, "w", encoding="utf-8") as vf:
-            vf.write(LLAMA_VERSION)
-
-        # Delete zip file
-        os.remove(llama_zip_path)
-        logging.info("Cleaned up zip file")
+    llama_server_exe_path = get_llama_server_exe_path()
 
     # Download the gguf to the hugging face cache
-    model_manager = ModelManager()
-    snapshot_files = model_manager.download_gguf(model_config)
+    snapshot_files = download_gguf(model_config.checkpoint, model_config.mmproj)
     logging.debug(f"GGUF file paths: {snapshot_files}")
 
     # Check if model supports embeddings
-    supported_models = model_manager.supported_models
+    supported_models = ModelManager().supported_models
     model_info = supported_models.get(model_config.model_name, {})
     supports_embeddings = "embeddings" in model_info.get("labels", [])
     supports_reranking = "reranking" in model_info.get("labels", [])
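
server_load no longer validates the platform or downloads and unzips the llama.cpp binaries inline; that logic moves behind install_llamacpp, get_llama_server_exe_path, and download_gguf in lemonade.tools.llamacpp.utils. A rough standalone sketch (not package code) of the error-translation pattern the new code uses, with a stub standing in for the real installer:

# Sketch: a NotImplementedError from the installer becomes an HTTP 422 for the
# API caller instead of a generic 500. The stub below stands in for
# lemonade.tools.llamacpp.utils.install_llamacpp.
from fastapi import HTTPException, status

def install_llamacpp_stub(system: str = "darwin"):
    if system not in ("windows", "linux"):
        raise NotImplementedError(f"Platform {system} not supported for llamacpp")

def load_backend():
    try:
        install_llamacpp_stub()
    except NotImplementedError as e:
        # Surface the failure as a well-formed API error
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
        )

try:
    load_backend()
except HTTPException as exc:
    print(exc.status_code, exc.detail)  # 422, "Platform darwin not supported ..."
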
@@ -523,6 +449,68 @@ def chat_completion(
         )
 
 
+def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry):
+    """
+    Handle text completions using the llamacpp server.
+
+    Args:
+        completion_request: The completion request containing prompt and parameters
+        telemetry: Telemetry object containing the server port
+
+    Returns:
+        Completion response from the llamacpp server
+    """
+    base_url = llamacpp_address(telemetry.port)
+    client = OpenAI(
+        base_url=base_url,
+        api_key="lemonade",
+    )
+
+    # Convert Pydantic model to dict and remove unset/null values
+    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
+
+    # Check if streaming is requested
+    if completion_request.stream:
+
+        def event_stream():
+            try:
+                # Enable streaming
+                for chunk in client.completions.create(**request_dict):
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                yield "data: [DONE]\n\n"
+
+                # Show telemetry after completion
+                telemetry.show_telemetry()
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+        return StreamingResponse(
+            event_stream(),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+    else:
+        # Non-streaming response
+        try:
+            # Disable streaming for non-streaming requests
+            response = client.completions.create(**request_dict)
+
+            # Show telemetry after completion
+            telemetry.show_telemetry()
+
+            return response
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Completion error: {str(e)}",
+            )
+
+
 def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
     """
     Generate embeddings using the llamacpp server.
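
The new completion() handler forwards OpenAI-style text completions to the bundled llama-server, so standard OpenAI clients can call the endpoint directly. A hedged usage sketch (not from the diff); the base URL with port 8000 and an /api/v1 prefix, and the model name, are assumptions to adjust for your local server configuration:

# Hypothetical client-side usage of the new completions route.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")

response = client.completions.create(
    model="Qwen2.5-0.5B-Instruct-GGUF",  # example model name, not from the diff
    prompt="def fibonacci(n):",
    max_tokens=64,
    stream=False,
)
print(response.choices[0].text)
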
@@ -284,7 +284,7 @@ class Server(ManagementTool):
     def _setup_server_common(
         self,
         port: int,
-        truncate_inputs: bool = False,
+        truncate_inputs: Optional[int] = None,
         log_level: str = DEFAULT_LOG_LEVEL,
         tray: bool = False,
         log_file: str = None,
@@ -295,7 +295,7 @@ class Server(ManagementTool):
 
         Args:
             port: Port number for the server
-            truncate_inputs: Whether to truncate inputs if they exceed max length
+            truncate_inputs: Truncate messages to this length
             log_level: Logging level to configure
             threaded_mode: Whether this is being set up for threaded execution
         """
@@ -372,7 +372,7 @@ class Server(ManagementTool):
         _=None,
         port: int = DEFAULT_PORT,
         log_level: str = DEFAULT_LOG_LEVEL,
-        truncate_inputs: bool = False,
+        truncate_inputs: Optional[int] = None,
         tray: bool = False,
         log_file: str = None,
     ):
@@ -393,7 +393,7 @@ class Server(ManagementTool):
         port: int = DEFAULT_PORT,
         host: str = "localhost",
         log_level: str = "warning",
-        truncate_inputs: bool = False,
+        truncate_inputs: Optional[int] = None,
     ):
         """
         Set up the server for running in a thread.
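
Taken together, these signature changes turn truncate_inputs from a boolean flag into an explicit token budget, with None meaning no truncation. A standalone illustration (not package code) of how such a parameter is typically interpreted:

from typing import Optional

def resolve_budget(truncate_inputs: Optional[int], input_tokens: int) -> int:
    """Return how many tokens to keep: the whole input when no budget is set,
    otherwise at most `truncate_inputs` tokens."""
    if truncate_inputs is None:
        return input_tokens
    return min(input_tokens, truncate_inputs)

assert resolve_budget(None, 5000) == 5000   # no truncation requested
assert resolve_budget(2048, 5000) == 2048   # explicit token budget
assert resolve_budget(2048, 100) == 100     # short inputs pass through
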
@@ -487,6 +487,9 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)
 
+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.completion(completion_request, self.llama_telemetry)
+
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
 
@@ -1096,29 +1099,20 @@ class Server(ManagementTool):
         )
         self.input_tokens = len(input_ids[0])
 
-        if (
-            self.llm_loaded.max_prompt_length
-            and self.input_tokens > self.llm_loaded.max_prompt_length
-        ):
-            if self.truncate_inputs:
-                # Truncate input ids
-                truncate_amount = self.input_tokens - self.llm_loaded.max_prompt_length
-                input_ids = input_ids[: self.llm_loaded.max_prompt_length]
-
-                # Update token count
-                self.input_tokens = len(input_ids)
-
-                # Show warning message
-                truncation_message = (
-                    f"Input exceeded {self.llm_loaded.max_prompt_length} tokens. "
-                    f"Truncated {truncate_amount} tokens."
-                )
-                logging.warning(truncation_message)
-            else:
-                raise RuntimeError(
-                    f"Prompt tokens ({self.input_tokens}) cannot be greater "
-                    f"than the model's max prompt length ({self.llm_loaded.max_prompt_length})"
-                )
+        if self.truncate_inputs and self.truncate_inputs > self.input_tokens:
+            # Truncate input ids
+            truncate_amount = self.input_tokens - self.truncate_inputs
+            input_ids = input_ids[: self.truncate_inputs]
+
+            # Update token count
+            self.input_tokens = len(input_ids)
+
+            # Show warning message
+            truncation_message = (
+                f"Input exceeded {self.truncate_inputs} tokens. "
+                f"Truncated {truncate_amount} tokens."
+            )
+            logging.warning(truncation_message)
 
         # Log the input tokens early to avoid this not showing due to potential crashes
         logging.debug(f"Input Tokens: {self.input_tokens}")
@@ -1314,7 +1308,7 @@ class Server(ManagementTool):
         self.tokenizer = None
         self.model = None
 
-        default_message = f"model {model_reference} not found"
+        default_message = "see stack trace and error message below"
         if message:
             detail = message
         else:
@@ -1,33 +1,92 @@
+/* === CSS Variables === */
+:root {
+    /* Colors */
+    --primary-yellow: #ffe066;
+    --primary-yellow-dark: #ffd43b;
+    --accent-gold: #e6b800;
+    --accent-gold-dark: #bfa100;
+
+    --text-primary: #222;
+    --text-secondary: #555;
+    --text-muted: #666;
+
+    --bg-primary: #fffbe9;
+    --bg-secondary: #fff8dd;
+    --bg-tertiary: #fff5d1;
+
+    /* Transitions */
+    --transition-fast: 0.2s ease;
+    --transition-medium: 0.3s ease;
+}
+
 body {
     margin: 0;
     font-family: 'Segoe UI', 'Arial', sans-serif;
-    background: #fffbe9;
-    color: #222;
+    background: linear-gradient(135deg, var(--bg-primary) 0%, var(--bg-secondary) 50%, var(--bg-tertiary) 100%);
+    color: var(--text-primary);
     min-height: 100vh;
     display: flex;
     flex-direction: column;
-    padding-bottom: 5rem;
+}
+
+body::before {
+    content: '';
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    background:
+        radial-gradient(circle at 20% 20%, rgba(255, 224, 102, 0.1) 0%, transparent 50%),
+        radial-gradient(circle at 80% 80%, rgba(255, 212, 59, 0.1) 0%, transparent 50%);
+    pointer-events: none;
+    z-index: -1;
 }
 
 .navbar {
     display: flex;
-    justify-content: center;
-    gap: 2.5rem;
-    padding: 2rem 0 1.5rem 0;
+    justify-content: space-between;
+    align-items: center;
+    padding: 1rem 3rem 0.5rem 1rem;
     font-size: 1.25rem;
     font-weight: 500;
     background: transparent;
     letter-spacing: 0.02em;
+    position: relative;
+    transition: var(--transition-medium);
 }
 
-.navbar a {
+.navbar-brand {
+    display: flex;
+    align-items: center;
+}
+
+.brand-title {
+    font-size: 1.5rem;
+    font-weight: 700;
+    color: var(--text-primary);
+    text-decoration: none;
+    letter-spacing: 0.01em;
+}
+
+.brand-title a {
+    color: inherit;
+    text-decoration: none;
+}
+
+.navbar-links {
+    display: flex;
+    gap: 2.5rem;
+}
+
+.navbar-links a {
     color: #444;
     text-decoration: none;
-    transition: color 0.2s;
+    transition: var(--transition-fast);
 }
 
-.navbar a:hover {
-    color: #e6b800;
+.navbar-links a:hover {
+    color: var(--accent-gold);
 }
 
 .main {
@@ -37,26 +96,14 @@ body {
     align-items: center;
     justify-content: flex-start;
     min-height: 60vh;
-    margin-top: 3rem;
-}
-
-.title {
-    font-size: 3rem;
-    font-weight: 700;
-    margin-bottom: 2.5rem;
-    letter-spacing: 0.01em;
-    text-align: center;
-    color: #222;
+    margin-top: 2rem;
+    padding-top: 1rem;
 }
 
 .site-footer {
-    position: fixed;
-    left: 0;
-    bottom: 0;
-    width: 100%;
-    background-color: #fffbe9;
+    background: transparent;
     padding-top: 0.5rem;
-    z-index: 100;
+    margin-top: auto;
 }
 
 .dad-joke {
@@ -483,6 +530,10 @@ body {
     background-color: #ca4747;
 }
 
+.model-label.coding {
+    background-color: #ff6b35;
+}
+
 .model-labels-container {
     display: flex;
     align-items: center;
@@ -983,6 +1034,50 @@ body {
     }
 }
 
+/* === Responsive Navbar === */
+@media (max-width: 800px) {
+    .navbar {
+        flex-direction: column;
+        gap: 1rem;
+        padding: 1rem 1rem 0.5rem 1rem;
+        align-items: center;
+    }
+
+    .navbar-brand {
+        margin-bottom: 0.5rem;
+    }
+
+    .brand-title {
+        font-size: 1.3rem;
+    }
+
+    .navbar-links {
+        gap: 1.5rem;
+        font-size: 1rem;
+    }
+}
+
+@media (max-width: 600px) {
+    .navbar {
+        padding: 0.5rem 0.5rem 0.25rem 0.5rem;
+    }
+
+    .brand-title {
+        font-size: 1.2rem;
+    }
+
+    .navbar-links {
+        gap: 1rem;
+        font-size: 0.9rem;
+        flex-wrap: wrap;
+        justify-content: center;
+    }
+
+    .main {
+        margin-top: 0.5rem;
+    }
+}
+
 /* Ensure form container allows tooltip overflow */
 .model-mgmt-register-form {
     position: relative;
@@ -12,14 +12,19 @@
     {{SERVER_MODELS_JS}}
 </head>
 <body>
-    <nav class="navbar">
-        <a href="https://github.com/lemonade-sdk/lemonade" target="_blank">GitHub</a>
-        <a href="https://lemonade-server.ai/docs/" target="_blank">Docs</a>
-        <a href="https://lemonade-server.ai/docs/server/server_models/" target="_blank">Models</a>
-        <a href="https://lemonade-server.ai/docs/server/apps/" target="_blank">Featured Apps</a>
+    <nav class="navbar" id="navbar">
+        <div class="navbar-brand">
+            <span class="brand-title"><a href="https://lemonade-server.ai">🍋 Lemonade Server</a></span>
+        </div>
+        <div class="navbar-links">
+            <a href="https://github.com/lemonade-sdk/lemonade" target="_blank">GitHub</a>
+            <a href="https://lemonade-server.ai/docs/" target="_blank">Docs</a>
+            <a href="https://lemonade-server.ai/docs/server/server_models/" target="_blank">Models</a>
+            <a href="https://lemonade-server.ai/docs/server/apps/" target="_blank">Featured Apps</a>
+            <a href="https://lemonade-server.ai/news/" target="_blank">News</a>
+        </div>
     </nav>
     <main class="main">
-        <div class="title">🍋 Lemonade Server</div>
         <div class="tab-container">
             <div class="tabs">
                 <button class="tab active" id="tab-chat" onclick="showTab('chat')">LLM Chat</button>
@@ -104,6 +109,7 @@
                 </label>
                 <select id="register-recipe" name="recipe" required>
                     <option value="llamacpp">llamacpp</option>
+                    <option value="oga-npu">oga-npu</option>
                     <option value="oga-hybrid">oga-hybrid</option>
                     <option value="oga-cpu">oga-cpu</option>
                 </select>
@@ -408,6 +414,8 @@
                 labelClass = 'reasoning';
             } else if (labelLower === 'reranking') {
                 labelClass = 'reranking';
+            } else if (labelLower === 'coding') {
+                labelClass = 'coding';
             }
             labelSpan.className = `model-label ${labelClass}`;
             labelSpan.textContent = label;
@@ -87,8 +87,15 @@ class LemonadeTray(SystemTray):
         Update the latest version information.
         """
         try:
+            # Prepare headers for GitHub API request
+            headers = {}
+            github_token = os.environ.get("GITHUB_TOKEN")
+            if github_token:
+                headers["Authorization"] = f"token {github_token}"
+
             response = requests.get(
                 "https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest",
+                headers=headers,
                 timeout=10,  # Add timeout to prevent hanging
             )
             response.raise_for_status()
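
The update check now sends an Authorization header when a GITHUB_TOKEN environment variable is set, which moves the request from GitHub's low unauthenticated rate limit (60 requests per hour per IP) to the much higher authenticated limit. A standalone sketch (not package code) of the same header-building pattern against GitHub's rate_limit endpoint, where the effect is directly visible:

import os
import requests

# Build headers the same way as the hunk above, then query the rate_limit API
headers = {}
token = os.environ.get("GITHUB_TOKEN")
if token:
    headers["Authorization"] = f"token {token}"

resp = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
resp.raise_for_status()
print(resp.json()["resources"]["core"])  # "limit" is higher when a token is set
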
lemonade/version.py CHANGED
@@ -1 +1 @@
-__version__ = "8.0.5"
+__version__ = "8.1.0"