lemonade-sdk 8.0.5-py3-none-any.whl → 8.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

lemonade/tools/prompt.py CHANGED
@@ -161,7 +161,11 @@ class LLMPrompt(Tool):
         # If template flag is set, then wrap prompt in template
         if template:
             # Embed prompt in model's chat template
-            if tokenizer.chat_template:
+            if not hasattr(tokenizer, "prompt_template"):
+                printing.log_warning(
+                    "Templates for this model type are not yet implemented."
+                )
+            elif tokenizer.chat_template:
                 # Use the model's built-in chat template if available
                 messages_dict = [{"role": "user", "content": prompt}]
                 prompt = tokenizer.apply_chat_template(
@@ -175,25 +179,10 @@ class LLMPrompt(Tool):
         state.save_stat(Keys.PROMPT_TEMPLATE, "Default")

         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-        if isinstance(input_ids, (list, str)):
-            # OGA models return a list of tokens (older versions)
-            # Our llama.cpp adapter returns a string
-            len_tokens_in = len(input_ids)
-        elif hasattr(input_ids, "shape"):
-            # HF models return a 2-D tensor
-            # OGA models with newer versions may return numpy arrays
-            if len(input_ids.shape) == 1:
-                # 1-D array from newer OGA versions
-                len_tokens_in = len(input_ids)
-            else:
-                # 2-D tensor from HF models
-                len_tokens_in = input_ids.shape[1]
-        else:
-            # Fallback: try to get length directly
-            len_tokens_in = len(input_ids)

         len_tokens_out = []
         response_texts = []
+        prompt_tokens = None  # will be determined in generate function
         for trial in range(n_trials):
             if n_trials > 1:
                 self.set_percent_progress(100.0 * trial / n_trials)
@@ -222,19 +211,22 @@ class LLMPrompt(Tool):

             response_array = response if isinstance(response, str) else response[0]

-            # Separate the prompt from the response
-            len_tokens_out.append(len(response_array) - len_tokens_in)
+            prompt_tokens = model.prompt_tokens
+            len_tokens_out.append(model.response_tokens)

-            input_token = 0
+            # Remove the input from the response
+            # (up to the point they diverge, which they should not)
+            counter = 0
+            len_input_ids = len(input_ids_array)
             while (
-                input_token < len_tokens_in
-                and input_ids_array[input_token] == response_array[input_token]
+                counter < len_input_ids
+                and input_ids_array[counter] == response_array[counter]
             ):
-                input_token += 1
+                counter += 1

             # Only decode the actual response (not the prompt)
             response_text = tokenizer.decode(
-                response_array[input_token:], skip_special_tokens=True
+                response_array[counter:], skip_special_tokens=True
             ).strip()
             response_texts.append(response_text)

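The trimming logic above walks the shared token prefix between the prompt and the returned sequence, then decodes only what follows. A minimal standalone sketch of the same idea, using plain lists and illustrative names rather than the package's tokenizer objects:

    def trim_prompt_tokens(input_ids, response_ids):
        """Return only the response tokens that follow the shared prefix with the prompt."""
        counter = 0
        while counter < len(input_ids) and input_ids[counter] == response_ids[counter]:
            counter += 1
        return response_ids[counter:]

    # Example: the model echoes the prompt ids back, followed by new tokens
    prompt_ids = [1, 42, 7, 9]
    full_output = [1, 42, 7, 9, 55, 88, 13]
    print(trim_prompt_tokens(prompt_ids, full_output))  # -> [55, 88, 13]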
@@ -259,7 +251,7 @@ class LLMPrompt(Tool):
             plt.savefig(figure_path)
             state.save_stat(Keys.RESPONSE_LENGTHS_HISTOGRAM, figure_path)

-        state.save_stat(Keys.PROMPT_TOKENS, len_tokens_in)
+        state.save_stat(Keys.PROMPT_TOKENS, prompt_tokens)
         state.save_stat(Keys.PROMPT, prompt)
         state.save_stat(Keys.RESPONSE_TOKENS, len_tokens_out)
         state.save_stat(Keys.RESPONSE, sanitize_text(response_texts))
@@ -758,15 +758,18 @@ class LemonadePerfTable(Table):
             data[key] = model_stats.get(key, "")

         # Create a new entry with Driver Versions and relevant Python Packages
-        sw_versions = [
-            key + ": " + value
-            for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
-        ]
-        sw_versions += [
-            pkg
-            for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
-            if any(name in pkg for name in PYTHON_PACKAGES)
-        ]
+        sw_versions = []
+        if "Driver Versions" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                key + ": " + value
+                for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
+            ]
+        if "Python Packages" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                pkg
+                for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
+                if any(name in pkg for name in PYTHON_PACKAGES)
+            ]
         if isinstance(data[Keys.RYZEN_AI_VERSION_INFO], dict):
             sw_versions += [
                 "Ryzen AI: " + value
@@ -1,13 +1,11 @@
-import sys
 import os
+import sys
 import logging
 import time
 import subprocess
-import zipfile
 import re
 import threading
 import platform
-import shutil

 import requests
 from tabulate import tabulate
@@ -18,12 +16,18 @@ from openai import OpenAI

 from lemonade_server.pydantic_models import (
     ChatCompletionRequest,
+    CompletionRequest,
     PullConfig,
     EmbeddingsRequest,
     RerankingRequest,
 )
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.server.utils.port import find_free_port
+from lemonade.tools.llamacpp.utils import (
+    get_llama_server_exe_path,
+    install_llamacpp,
+    download_gguf,
+)

 LLAMA_VERSION = "b5787"

@@ -80,39 +84,6 @@ def get_binary_url_and_filename(version):
     return url, filename


-def validate_platform_support():
-    """
-    Validate platform support before attempting download
-    """
-    system = platform.system().lower()
-
-    if system not in ["windows", "linux"]:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=(
-                f"Platform {system} not supported for llamacpp. "
-                "Supported: Windows, Ubuntu Linux"
-            ),
-        )
-
-    if system == "linux":
-        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
-        try:
-            with open("/etc/os-release", "r", encoding="utf-8") as f:
-                os_info = f.read().lower()
-                if "ubuntu" not in os_info and "debian" not in os_info:
-                    logging.warning(
-                        "llamacpp binaries are built for Ubuntu. "
-                        "Compatibility with other Linux distributions is not guaranteed."
-                    )
-        except (FileNotFoundError, PermissionError, OSError) as e:
-            logging.warning(
-                "Could not determine Linux distribution (%s). "
-                "llamacpp binaries are built for Ubuntu.",
-                str(e),
-            )
-
-
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
@@ -283,7 +254,7 @@ def _launch_llama_subprocess(
     """

     # Get the current executable path (handles both Windows and Ubuntu structures)
-    _, exe_path = get_llama_server_paths()
+    exe_path = get_llama_server_exe_path()

     # Build the base command
     base_command = [exe_path, "-m", snapshot_files["variant"]]
@@ -350,68 +321,23 @@ def _launch_llama_subprocess(


 def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
-    # Validate platform support before proceeding
-    validate_platform_support()
+    # Install and/or update llama.cpp if needed
+    try:
+        install_llamacpp()
+    except NotImplementedError as e:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
+        )

     # Get platform-specific paths at runtime
-    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
-
-    # Check whether the llamacpp install needs an upgrade
-    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
-    if os.path.exists(version_txt_path):
-        with open(version_txt_path, "r", encoding="utf-8") as f:
-            llamacpp_installed_version = f.read()
-
-        if llamacpp_installed_version != LLAMA_VERSION:
-            # Remove the existing install, which will trigger a new install
-            # in the next code block
-            shutil.rmtree(llama_server_exe_dir)
-
-    # Download llama.cpp server if it isn't already available
-    if not os.path.exists(llama_server_exe_dir):
-        # Download llama.cpp server zip
-        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
-        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
-        logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
-
-        with requests.get(llama_zip_url, stream=True) as r:
-            r.raise_for_status()
-            with open(llama_zip_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-
-        # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
-        with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(llama_server_exe_dir)
-
-        # Make executable on Linux - need to update paths after extraction
-        if platform.system().lower() == "linux":
-            # Re-get the paths since extraction might have changed the directory structure
-            _, updated_exe_path = get_llama_server_paths()
-            if os.path.exists(updated_exe_path):
-                os.chmod(updated_exe_path, 0o755)
-                logging.info(f"Set executable permissions for {updated_exe_path}")
-            else:
-                logging.warning(
-                    f"Could not find llama-server executable at {updated_exe_path}"
-                )
-
-        # Save version.txt
-        with open(version_txt_path, "w", encoding="utf-8") as vf:
-            vf.write(LLAMA_VERSION)
-
-        # Delete zip file
-        os.remove(llama_zip_path)
-        logging.info("Cleaned up zip file")
+    llama_server_exe_path = get_llama_server_exe_path()

     # Download the gguf to the hugging face cache
-    model_manager = ModelManager()
-    snapshot_files = model_manager.download_gguf(model_config)
+    snapshot_files = download_gguf(model_config.checkpoint, model_config.mmproj)
     logging.debug(f"GGUF file paths: {snapshot_files}")

     # Check if model supports embeddings
-    supported_models = model_manager.supported_models
+    supported_models = ModelManager().supported_models
     model_info = supported_models.get(model_config.model_name, {})
     supports_embeddings = "embeddings" in model_info.get("labels", [])
     supports_reranking = "reranking" in model_info.get("labels", [])
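The refactored server_load above delegates installation to install_llamacpp() and surfaces unsupported platforms as an HTTP 422 response instead of crashing. An isolated sketch of that error-translation pattern (the wrapper name ensure_backend_installed is hypothetical, not part of the package):

    from fastapi import HTTPException, status

    def ensure_backend_installed(install_fn):
        """Run an installer callable and convert unsupported-platform errors
        into a client-visible validation error."""
        try:
            install_fn()
        except NotImplementedError as e:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
            )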
@@ -523,6 +449,68 @@ def chat_completion(
         )


+def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry):
+    """
+    Handle text completions using the llamacpp server.
+
+    Args:
+        completion_request: The completion request containing prompt and parameters
+        telemetry: Telemetry object containing the server port
+
+    Returns:
+        Completion response from the llamacpp server
+    """
+    base_url = llamacpp_address(telemetry.port)
+    client = OpenAI(
+        base_url=base_url,
+        api_key="lemonade",
+    )
+
+    # Convert Pydantic model to dict and remove unset/null values
+    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
+
+    # Check if streaming is requested
+    if completion_request.stream:
+
+        def event_stream():
+            try:
+                # Enable streaming
+                for chunk in client.completions.create(**request_dict):
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                yield "data: [DONE]\n\n"
+
+                # Show telemetry after completion
+                telemetry.show_telemetry()
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+        return StreamingResponse(
+            event_stream(),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+    else:
+        # Non-streaming response
+        try:
+            # Disable streaming for non-streaming requests
+            response = client.completions.create(**request_dict)
+
+            # Show telemetry after completion
+            telemetry.show_telemetry()
+
+            return response
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Completion error: {str(e)}",
+            )
+
+
 def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
     """
     Generate embeddings using the llamacpp server.
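With completion() wired up (and routed by the server in the next hunk), llama.cpp-backed models can answer plain text-completion requests. A hypothetical client-side call, assuming a local Lemonade Server exposing the OpenAI-compatible API at http://localhost:8000/api/v1 and a GGUF model already pulled (the base URL and model name are assumptions, not taken from the diff):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")

    completion = client.completions.create(
        model="Qwen2.5-0.5B-Instruct-GGUF",  # placeholder model name
        prompt="Lemonade is",
        max_tokens=32,
        stream=False,
    )
    print(completion.choices[0].text)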
@@ -487,6 +487,9 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.completion(completion_request, self.llama_telemetry)
+
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning

@@ -1,33 +1,93 @@
+/* === CSS Variables === */
+:root {
+    /* Colors */
+    --primary-yellow: #ffe066;
+    --primary-yellow-dark: #ffd43b;
+    --accent-gold: #e6b800;
+    --accent-gold-dark: #bfa100;
+
+    --text-primary: #222;
+    --text-secondary: #555;
+    --text-muted: #666;
+
+    --bg-primary: #fffbe9;
+    --bg-secondary: #fff8dd;
+    --bg-tertiary: #fff5d1;
+
+    /* Transitions */
+    --transition-fast: 0.2s ease;
+    --transition-medium: 0.3s ease;
+}
+
 body {
     margin: 0;
     font-family: 'Segoe UI', 'Arial', sans-serif;
-    background: #fffbe9;
-    color: #222;
+    background: linear-gradient(135deg, var(--bg-primary) 0%, var(--bg-secondary) 50%, var(--bg-tertiary) 100%);
+    color: var(--text-primary);
     min-height: 100vh;
     display: flex;
     flex-direction: column;
     padding-bottom: 5rem;
 }

+body::before {
+    content: '';
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    background:
+        radial-gradient(circle at 20% 20%, rgba(255, 224, 102, 0.1) 0%, transparent 50%),
+        radial-gradient(circle at 80% 80%, rgba(255, 212, 59, 0.1) 0%, transparent 50%);
+    pointer-events: none;
+    z-index: -1;
+}
+
 .navbar {
     display: flex;
-    justify-content: center;
-    gap: 2.5rem;
-    padding: 2rem 0 1.5rem 0;
+    justify-content: space-between;
+    align-items: center;
+    padding: 1rem 3rem 0.5rem 1rem;
     font-size: 1.25rem;
     font-weight: 500;
     background: transparent;
     letter-spacing: 0.02em;
+    position: relative;
+    transition: var(--transition-medium);
+}
+
+.navbar-brand {
+    display: flex;
+    align-items: center;
 }

-.navbar a {
+.brand-title {
+    font-size: 1.5rem;
+    font-weight: 700;
+    color: var(--text-primary);
+    text-decoration: none;
+    letter-spacing: 0.01em;
+}
+
+.brand-title a {
+    color: inherit;
+    text-decoration: none;
+}
+
+.navbar-links {
+    display: flex;
+    gap: 2.5rem;
+}
+
+.navbar-links a {
     color: #444;
     text-decoration: none;
-    transition: color 0.2s;
+    transition: var(--transition-fast);
 }

-.navbar a:hover {
-    color: #e6b800;
+.navbar-links a:hover {
+    color: var(--accent-gold);
 }

 .main {
@@ -37,16 +97,8 @@ body {
     align-items: center;
     justify-content: flex-start;
     min-height: 60vh;
-    margin-top: 3rem;
-}
-
-.title {
-    font-size: 3rem;
-    font-weight: 700;
-    margin-bottom: 2.5rem;
-    letter-spacing: 0.01em;
-    text-align: center;
-    color: #222;
+    margin-top: 2rem;
+    padding-top: 1rem;
 }

 .site-footer {
@@ -54,7 +106,7 @@ body {
     left: 0;
     bottom: 0;
     width: 100%;
-    background-color: #fffbe9;
+    background: transparent;
     padding-top: 0.5rem;
     z-index: 100;
 }
@@ -983,6 +1035,50 @@ body {
     }
 }

+/* === Responsive Navbar === */
+@media (max-width: 800px) {
+    .navbar {
+        flex-direction: column;
+        gap: 1rem;
+        padding: 1rem 1rem 0.5rem 1rem;
+        align-items: center;
+    }
+
+    .navbar-brand {
+        margin-bottom: 0.5rem;
+    }
+
+    .brand-title {
+        font-size: 1.3rem;
+    }
+
+    .navbar-links {
+        gap: 1.5rem;
+        font-size: 1rem;
+    }
+}
+
+@media (max-width: 600px) {
+    .navbar {
+        padding: 0.5rem 0.5rem 0.25rem 0.5rem;
+    }
+
+    .brand-title {
+        font-size: 1.2rem;
+    }
+
+    .navbar-links {
+        gap: 1rem;
+        font-size: 0.9rem;
+        flex-wrap: wrap;
+        justify-content: center;
+    }
+
+    .main {
+        margin-top: 0.5rem;
+    }
+}
+
 /* Ensure form container allows tooltip overflow */
 .model-mgmt-register-form {
     position: relative;
@@ -12,14 +12,19 @@
     {{SERVER_MODELS_JS}}
 </head>
 <body>
-    <nav class="navbar">
-        <a href="https://github.com/lemonade-sdk/lemonade" target="_blank">GitHub</a>
-        <a href="https://lemonade-server.ai/docs/" target="_blank">Docs</a>
-        <a href="https://lemonade-server.ai/docs/server/server_models/" target="_blank">Models</a>
-        <a href="https://lemonade-server.ai/docs/server/apps/" target="_blank">Featured Apps</a>
+    <nav class="navbar" id="navbar">
+        <div class="navbar-brand">
+            <span class="brand-title"><a href="https://lemonade-server.ai">🍋 Lemonade Server</a></span>
+        </div>
+        <div class="navbar-links">
+            <a href="https://github.com/lemonade-sdk/lemonade" target="_blank">GitHub</a>
+            <a href="https://lemonade-server.ai/docs/" target="_blank">Docs</a>
+            <a href="https://lemonade-server.ai/docs/server/server_models/" target="_blank">Models</a>
+            <a href="https://lemonade-server.ai/docs/server/apps/" target="_blank">Featured Apps</a>
+            <a href="https://lemonade-server.ai/news/" target="_blank">News</a>
+        </div>
     </nav>
     <main class="main">
-        <div class="title">🍋 Lemonade Server</div>
         <div class="tab-container">
             <div class="tabs">
                 <button class="tab active" id="tab-chat" onclick="showTab('chat')">LLM Chat</button>
@@ -87,8 +87,15 @@ class LemonadeTray(SystemTray):
         Update the latest version information.
         """
         try:
+            # Prepare headers for GitHub API request
+            headers = {}
+            github_token = os.environ.get("GITHUB_TOKEN")
+            if github_token:
+                headers["Authorization"] = f"token {github_token}"
+
             response = requests.get(
                 "https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest",
+                headers=headers,
                 timeout=10,  # Add timeout to prevent hanging
             )
             response.raise_for_status()
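The tray's update check above only adds an Authorization header when GITHUB_TOKEN is set, which raises the GitHub API rate limit for authenticated requests. The same pattern in isolation (a standalone sketch, not the tray class itself):

    import os
    import requests

    headers = {}
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        # Authenticated requests get a higher GitHub API rate limit
        headers["Authorization"] = f"token {token}"

    resp = requests.get(
        "https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest",
        headers=headers,
        timeout=10,
    )
    resp.raise_for_status()
    print(resp.json().get("tag_name"))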
lemonade/version.py CHANGED
@@ -1 +1 @@
-__version__ = "8.0.5"
+__version__ = "8.0.6"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.0.5
+Version: 8.0.6
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.13
@@ -23,7 +23,7 @@ Requires-Dist: zstandard
 Requires-Dist: fastapi
 Requires-Dist: uvicorn[standard]
 Requires-Dist: openai>=1.81.0
-Requires-Dist: transformers<=4.51.3
+Requires-Dist: transformers<=4.53.2
 Requires-Dist: jinja2
 Requires-Dist: tabulate
 Requires-Dist: sentencepiece
@@ -284,7 +284,7 @@ New contributors can find beginner-friendly issues tagged with "Good First Issue

 ## Maintainers

-This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), email [lemonade@amd.com](mailto:lemonade@amd.com), or join our [Discord](https://discord.gg/5xXzkMu8Zk).
+This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), emailing [lemonade@amd.com](mailto:lemonade@amd.com), or joining our [Discord](https://discord.gg/5xXzkMu8Zk).

 ## License