lemonade_sdk-9.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +180 -0
  3. lemonade/cache.py +92 -0
  4. lemonade/cli.py +173 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/build.py +176 -0
  7. lemonade/common/cli_helpers.py +139 -0
  8. lemonade/common/exceptions.py +98 -0
  9. lemonade/common/filesystem.py +368 -0
  10. lemonade/common/inference_engines.py +408 -0
  11. lemonade/common/network.py +93 -0
  12. lemonade/common/printing.py +110 -0
  13. lemonade/common/status.py +471 -0
  14. lemonade/common/system_info.py +1411 -0
  15. lemonade/common/test_helpers.py +28 -0
  16. lemonade/profilers/__init__.py +1 -0
  17. lemonade/profilers/agt_power.py +437 -0
  18. lemonade/profilers/hwinfo_power.py +429 -0
  19. lemonade/profilers/memory_tracker.py +259 -0
  20. lemonade/profilers/profiler.py +58 -0
  21. lemonade/sequence.py +363 -0
  22. lemonade/state.py +159 -0
  23. lemonade/tools/__init__.py +1 -0
  24. lemonade/tools/accuracy.py +432 -0
  25. lemonade/tools/adapter.py +114 -0
  26. lemonade/tools/bench.py +302 -0
  27. lemonade/tools/flm/__init__.py +1 -0
  28. lemonade/tools/flm/utils.py +305 -0
  29. lemonade/tools/huggingface/bench.py +187 -0
  30. lemonade/tools/huggingface/load.py +235 -0
  31. lemonade/tools/huggingface/utils.py +359 -0
  32. lemonade/tools/humaneval.py +264 -0
  33. lemonade/tools/llamacpp/bench.py +255 -0
  34. lemonade/tools/llamacpp/load.py +222 -0
  35. lemonade/tools/llamacpp/utils.py +1260 -0
  36. lemonade/tools/management_tools.py +319 -0
  37. lemonade/tools/mmlu.py +319 -0
  38. lemonade/tools/oga/__init__.py +0 -0
  39. lemonade/tools/oga/bench.py +120 -0
  40. lemonade/tools/oga/load.py +804 -0
  41. lemonade/tools/oga/migration.py +403 -0
  42. lemonade/tools/oga/utils.py +462 -0
  43. lemonade/tools/perplexity.py +147 -0
  44. lemonade/tools/prompt.py +263 -0
  45. lemonade/tools/report/__init__.py +0 -0
  46. lemonade/tools/report/llm_report.py +203 -0
  47. lemonade/tools/report/table.py +899 -0
  48. lemonade/tools/server/__init__.py +0 -0
  49. lemonade/tools/server/flm.py +133 -0
  50. lemonade/tools/server/llamacpp.py +320 -0
  51. lemonade/tools/server/serve.py +2123 -0
  52. lemonade/tools/server/static/favicon.ico +0 -0
  53. lemonade/tools/server/static/index.html +279 -0
  54. lemonade/tools/server/static/js/chat.js +1059 -0
  55. lemonade/tools/server/static/js/model-settings.js +183 -0
  56. lemonade/tools/server/static/js/models.js +1395 -0
  57. lemonade/tools/server/static/js/shared.js +556 -0
  58. lemonade/tools/server/static/logs.html +191 -0
  59. lemonade/tools/server/static/styles.css +2654 -0
  60. lemonade/tools/server/static/webapp.html +321 -0
  61. lemonade/tools/server/tool_calls.py +153 -0
  62. lemonade/tools/server/tray.py +664 -0
  63. lemonade/tools/server/utils/macos_tray.py +226 -0
  64. lemonade/tools/server/utils/port.py +77 -0
  65. lemonade/tools/server/utils/thread.py +85 -0
  66. lemonade/tools/server/utils/windows_tray.py +408 -0
  67. lemonade/tools/server/webapp.py +34 -0
  68. lemonade/tools/server/wrapped_server.py +559 -0
  69. lemonade/tools/tool.py +374 -0
  70. lemonade/version.py +1 -0
  71. lemonade_install/__init__.py +1 -0
  72. lemonade_install/install.py +239 -0
  73. lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
  74. lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
  75. lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
  76. lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
  77. lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
  78. lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
  79. lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
  80. lemonade_server/cli.py +805 -0
  81. lemonade_server/model_manager.py +758 -0
  82. lemonade_server/pydantic_models.py +159 -0
  83. lemonade_server/server_models.json +643 -0
  84. lemonade_server/settings.py +39 -0
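
The listing above can be reproduced against a local copy of the wheel with nothing but the standard library. A minimal sketch, assuming the wheel has been downloaded under its conventional filename (the path below is hypothetical, not taken from this diff):

import zipfile

# Hypothetical local path; wheels follow the <name>-<version>-<tags>.whl convention
WHEEL = "lemonade_sdk-9.1.1-py3-none-any.whl"

with zipfile.ZipFile(WHEEL) as whl:
    for info in whl.infolist():
        # Print each archived file with its uncompressed size in bytes
        print(f"{info.file_size:>10}  {info.filename}")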
lemonade/tools/server/flm.py (new file)
@@ -0,0 +1,133 @@
+ import os
+ import logging
+ import subprocess
+ import time
+ import threading
+
+ import requests
+
+ from lemonade_server.pydantic_models import (
+     PullConfig,
+     ChatCompletionRequest,
+ )
+
+ from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+ from lemonade.tools.flm.utils import install_flm, download_flm_model
+
+
+ class FlmTelemetry(WrappedServerTelemetry):
+     """
+     Manages telemetry data collection and display for FLM server.
+     """
+
+     def parse_telemetry_line(self, line: str):
+         """
+         Parse telemetry data from FLM server output lines.
+
+         Note: as of FLM 0.9.10, no telemetry data is provided by the server CLI.
+         This function is required to be implemented, so we leave it empty
+         as a placeholder for now.
+         """
+
+         return
+
+
+ class FlmServer(WrappedServer):
+     """
+     Routes OpenAI API requests to an FLM server instance and returns the result
+     back to Lemonade Server.
+     """
+
+     def __init__(self):
+         self.flm_model_name = None
+         super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
+
+     def address(self):
+         return f"http://localhost:{self.port}/v1"
+
+     def install_server(self):
+         """
+         Check if FLM is installed and at minimum version.
+         If not, download and run the GUI installer, then wait for completion.
+         """
+         install_flm()
+
+     def download_model(
+         self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+     ) -> dict:
+         download_flm_model(config_checkpoint, config_mmproj, do_not_upgrade)
+
+     def _launch_server_subprocess(
+         self,
+         model_config: PullConfig,
+         snapshot_files: dict,
+         ctx_size: int,
+         supports_embeddings: bool = False,
+         supports_reranking: bool = False,
+     ):
+
+         self._choose_port()
+
+         # Keep track of the FLM model name so that we can use it later
+         self.flm_model_name = model_config.checkpoint
+
+         command = [
+             "flm",
+             "serve",
+             f"{self.flm_model_name}",
+             "--ctx-len",
+             str(ctx_size),
+             "--port",
+             str(self.port),
+         ]
+
+         # Set up environment with library path for Linux
+         env = os.environ.copy()
+
+         self.process = subprocess.Popen(
+             command,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             encoding="utf-8",
+             errors="replace",
+             bufsize=1,
+             env=env,
+         )
+
+         # Start background thread to log subprocess output
+         threading.Thread(
+             target=self._log_subprocess_output,
+             args=("FLM SERVER",),
+             daemon=True,
+         ).start()
+
+     def _wait_for_load(self):
+         """
+         FLM doesn't seem to have a health API, so we'll use the "list local models"
+         API to check if the server is up.
+         """
+         status_code = None
+         while not self.process.poll() and status_code != 200:
+             health_url = f"http://localhost:{self.port}/api/tags"
+             try:
+                 health_response = requests.get(health_url)
+             except requests.exceptions.ConnectionError:
+                 logging.debug(
+                     "Not able to connect to %s yet, will retry", self.server_name
+                 )
+             else:
+                 status_code = health_response.status_code
+                 logging.debug(
+                     "Testing %s readiness (will retry until ready), result: %s",
+                     self.server_name,
+                     health_response.json(),
+                 )
+             time.sleep(1)
+
+     def chat_completion(self, chat_completion_request: ChatCompletionRequest):
+         # FLM requires the correct model name to be in the request
+         # (whereas llama-server ignores the model name field in the request)
+         chat_completion_request.model = self.flm_model_name
+
+         return super().chat_completion(chat_completion_request)
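
lemonade/tools/server/flm.py wraps the FLM CLI as a subprocess and, because FLM exposes no dedicated health endpoint, polls its "list local models" route until it answers. A minimal standalone sketch of that readiness-polling pattern (the function name, timeout, and port handling below are illustrative, not part of the package):

import time

import requests


def wait_until_ready(port: int, timeout_s: float = 60.0) -> bool:
    """Poll an FLM-style /api/tags route until it returns HTTP 200 or time runs out."""
    url = f"http://localhost:{port}/api/tags"
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.exceptions.ConnectionError:
            pass  # Server process is not accepting connections yet
        time.sleep(1)
    return False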
lemonade/tools/server/llamacpp.py (new file)
@@ -0,0 +1,320 @@
+ import os
+ import logging
+ import subprocess
+ import re
+ import threading
+ import platform
+
+ from dotenv import load_dotenv
+ from fastapi import HTTPException, status
+
+ from lemonade_server.pydantic_models import (
+     PullConfig,
+ )
+ from lemonade.tools.llamacpp.utils import (
+     get_llama_server_exe_path,
+     install_llamacpp,
+     download_gguf,
+     resolve_local_gguf_model,
+     parse_checkpoint,
+ )
+ from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+
+ # Embedding model batch configuration set to 8192 as default
+ EMBEDDING_CTX_SIZE = 8192
+ EMBEDDING_BATCH_SIZE = 8192
+ EMBEDDING_UBATCH_SIZE = 8192
+
+
+ class LlamaTelemetry(WrappedServerTelemetry):
+     """
+     Manages telemetry data collection and display for llama server.
+     """
+
+     def parse_telemetry_line(self, line: str):
+         """
+         Parse telemetry data from llama server output lines.
+         """
+
+         if "vk::PhysicalDevice::createDevice: ErrorExtensionNotPresent" in line:
+             msg = (
+                 "Your AMD GPU driver version is not compatible with this software.\n"
+                 "Please update and try again: "
+                 "https://www.amd.com/en/support/download/drivers.html"
+             )
+             logging.error(msg)
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=msg,
+             )
+         elif "error" in line.lower():
+             logging.error(line)
+
+         # Parse Vulkan device detection
+         vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
+         if vulkan_match:
+             device_count = int(vulkan_match.group(1))
+             if device_count > 0:
+                 logging.info(
+                     f"GPU acceleration active: {device_count} device(s) "
+                     "detected by llama-server"
+                 )
+             return
+
+         # Parse prompt evaluation line
+         prompt_match = re.search(
+             r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+             r"([\d.]+)\s*tokens per second",
+             line,
+         )
+         if prompt_match:
+             prompt_time_ms = float(prompt_match.group(1))
+             input_tokens = int(prompt_match.group(2))
+
+             self.prompt_eval_time = prompt_time_ms / 1000.0
+             self.input_tokens = input_tokens
+             self.time_to_first_token = prompt_time_ms / 1000.0
+             return
+
+         # Parse generation evaluation line
+         eval_match = re.search(
+             r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+             r"([\d.]+)\s*tokens per second",
+             line,
+         )
+         if eval_match:
+             eval_time_ms = float(eval_match.group(1))
+             output_tokens = int(eval_match.group(2))
+             tokens_per_second = float(eval_match.group(3))
+
+             self.eval_time = eval_time_ms / 1000.0
+             self.output_tokens = output_tokens
+             self.tokens_per_second = tokens_per_second
+             return
+
+
+ class LlamaServer(WrappedServer):
+     def __init__(self, backend: str):
+         self.backend = backend
+         super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())
+
+     def install_server(self, backend=None):
+         """
+         Install the wrapped server
+         """
+         install_llamacpp(self.backend)
+
+     def download_model(
+         self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+     ) -> dict:
+         """
+         Download a model for the wrapper server.
+         First checks local cache, then downloads from internet if needed.
+         """
+         # If it's a direct file path, just return it
+
+         if os.path.exists(config_checkpoint):
+             result = {"variant": config_checkpoint}
+             if config_mmproj:
+                 result["mmproj"] = config_mmproj
+             return result
+
+         # Try to resolve from local cache first to avoid unnecessary downloads
+         checkpoint, variant = parse_checkpoint(config_checkpoint)
+         local_result = resolve_local_gguf_model(checkpoint, variant, config_mmproj)
+
+         if local_result:
+             return local_result
+
+         # Not found locally - download from internet
+         return download_gguf(
+             config_checkpoint=config_checkpoint,
+             config_mmproj=config_mmproj,
+             do_not_upgrade=do_not_upgrade,
+         )
+
+     def _launch_device_backend_subprocess(
+         self,
+         snapshot_files: dict,
+         use_gpu: bool,
+         ctx_size: int,
+         supports_embeddings: bool = False,
+         supports_reranking: bool = False,
+     ) -> subprocess.Popen:
+         """
+         Launch llama server subprocess with appropriate configuration.
+
+         Args:
+             snapshot_files: Dictionary of model files to load
+             use_gpu: Whether to use GPU acceleration
+             telemetry: Telemetry object for tracking performance metrics
+             backend: Backend to use (e.g., 'vulkan', 'rocm', 'cpu')
+             supports_embeddings: Whether the model supports embeddings
+             supports_reranking: Whether the model supports reranking
+
+         Returns:
+             Subprocess handle for the llama server
+         """
+
+         # Get the current executable path (handles both Windows and Ubuntu structures)
+         exe_path = get_llama_server_exe_path(self.backend)
+
+         # For embedding models, use a larger context size to support longer individual
+         # strings. Embedding requests can include multiple strings in a batch, and each
+         # string needs to fit within the context window.
+         if supports_embeddings and ctx_size < EMBEDDING_CTX_SIZE:
+             ctx_size = EMBEDDING_CTX_SIZE
+
+         # Build the base command
+         base_command = [
+             exe_path,
+             "-m",
+             snapshot_files["variant"],
+             "--ctx-size",
+             str(ctx_size),
+         ]
+
+         # Lock random seed for deterministic behavior in CI
+         if os.environ.get("LEMONADE_CI_MODE"):
+             base_command.extend(["--seed", "42"])
+             logging.info(f"Seed applied to base command: {base_command}")
+
+         if "mmproj" in snapshot_files:
+             base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+             if not use_gpu:
+                 base_command.extend(["--no-mmproj-offload"])
+
+         # Find a port, and save it in the telemetry object for future reference
+         # by other functions
+         self._choose_port()
+
+         # Add port and jinja to enable tool use
+         base_command.extend(["--port", str(self.port), "--jinja"])
+
+         # Enable context shift and avoid attention sink issues by preserving the initial tokens
+         # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+         # Only add context-shift for backends that support it
+         context_shift_supported_backends = ["vulkan", "rocm"]
+         if self.backend in context_shift_supported_backends:
+             base_command.extend(["--context-shift", "--keep", "16"])
+         else:
+             # For backends that don't support context-shift (e.g., Metal), just use keep
+             base_command.extend(["--keep", "16"])
+             logging.debug(
+                 f"Skipped --context-shift for backend: {self.backend} (not supported)"
+             )
+
+         # Use legacy reasoning formatting, since not all apps support the new
+         # reasoning_content field
+         base_command.extend(["--reasoning-format", "auto"])
+
+         # Add embeddings support if the model supports it
+         if supports_embeddings:
+             # For embedding models, set batch sizes to handle multiple documents in a single request
+             # batch-size: logical batch size (total tokens across all sequences)
+             # ubatch-size: physical batch size (tokens processed in a single forward pass)
+             base_command.extend(
+                 [
+                     "--embeddings",
+                     "--batch-size",
+                     str(EMBEDDING_BATCH_SIZE),
+                     "--ubatch-size",
+                     str(EMBEDDING_UBATCH_SIZE),
+                 ]
+             )
+
+         # Add reranking support if the model supports it
+         if supports_reranking:
+             base_command.append("--reranking")
+
+         # Configure GPU layers: 99 for GPU, 0 for CPU-only
+         ngl_value = "99" if use_gpu else "0"
+         command = base_command + ["-ngl", ngl_value]
+
+         # Set up environment with library path for Linux
+         env = os.environ.copy()
+
+         # Load environment variables from .env file in the executable directory
+         exe_dir = os.path.dirname(exe_path)
+         env_file_path = os.path.join(exe_dir, ".env")
+         if os.path.exists(env_file_path):
+             load_dotenv(env_file_path, override=False)
+             env.update(os.environ)
+             logging.debug(f"Loaded environment variables from {env_file_path}")
+
+         system = platform.system().lower()
+         if system == "linux":
+             lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+             current_ld_path = env.get("LD_LIBRARY_PATH", "")
+             if current_ld_path:
+                 env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+             else:
+                 env["LD_LIBRARY_PATH"] = lib_dir
+             logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+         elif system == "darwin":
+             lib_dir = os.path.dirname(exe_path)
+             current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+             if current_dyld_path:
+                 env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+             else:
+                 env["DYLD_LIBRARY_PATH"] = lib_dir
+             logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
+
+         # Start subprocess with output capture
+         self.process = subprocess.Popen(
+             command,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             encoding="utf-8",
+             errors="replace",
+             bufsize=1,
+             env=env,
+         )
+
+         # Start background thread to log subprocess output
+         device_type = "GPU" if use_gpu else "CPU"
+         threading.Thread(
+             target=self._log_subprocess_output,
+             args=(f"LLAMA SERVER {device_type}",),
+             daemon=True,
+         ).start()
+
+     def _launch_server_subprocess(
+         self,
+         model_config: PullConfig,
+         snapshot_files: dict,
+         ctx_size: int,
+         supports_embeddings: bool = False,
+         supports_reranking: bool = False,
+     ):
+
+         # Attempt loading on GPU first
+         self._launch_device_backend_subprocess(
+             snapshot_files,
+             use_gpu=True,
+             ctx_size=ctx_size,
+             supports_embeddings=supports_embeddings,
+             supports_reranking=supports_reranking,
+         )
+
+         # Check the /health endpoint until GPU server is ready
+         self._wait_for_load()
+
+         # If loading on GPU failed, try loading on CPU
+         if self.process.poll():
+             logging.warning(
+                 f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
+             )
+
+             if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
+                 # Used for testing, when the test should fail if GPU didn't work
+                 raise Exception("llamacpp GPU loading failed")
+
+             self._launch_device_backend_subprocess(
+                 snapshot_files,
+                 use_gpu=False,
+                 ctx_size=ctx_size,
+                 supports_embeddings=supports_embeddings,
+                 supports_reranking=supports_reranking,
+             )
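
lemonade/tools/server/llamacpp.py derives its telemetry from llama-server's timing log lines using the regexes in LlamaTelemetry.parse_telemetry_line. A minimal standalone sketch of that extraction; the sample line below is illustrative (shaped like llama-server's prompt-eval timing output), not captured from a real run:

import re

# Illustrative line in the shape the prompt-eval regex expects; values are made up
SAMPLE = (
    "prompt eval time =     100.00 ms /    10 tokens "
    "(   10.00 ms per token,   100.00 tokens per second)"
)

match = re.search(
    r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
    r"([\d.]+)\s*tokens per second",
    SAMPLE,
)
if match:
    # Group 1 is milliseconds; the class above converts it to seconds for time-to-first-token
    time_to_first_token_s = float(match.group(1)) / 1000.0
    input_tokens = int(match.group(2))
    prompt_tokens_per_second = float(match.group(3))  # third capture, reported by the log line
    print(time_to_first_token_s, input_tokens, prompt_tokens_per_second)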