lemonade-sdk 7.0.0-py3-none-any.whl → 7.0.2-py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of lemonade-sdk might be problematic.

@@ -0,0 +1,315 @@
+ import sys
+ import os
+ import logging
+ import time
+ import subprocess
+ import zipfile
+ import re
+ import threading
+
+ import requests
+ from tabulate import tabulate
+ from fastapi import HTTPException, status
+ from fastapi.responses import StreamingResponse
+
+ from openai import OpenAI
+
+ from lemonade_server.model_manager import ModelManager
+ from lemonade.tools.server.pydantic_models import ChatCompletionRequest
+ from lemonade.tools.server.port_utils import find_free_port
+
+ LLAMA_VERSION = "b5543"
+
+ LLAMA_SERVER_EXE_DIR = os.path.join(
+     os.path.dirname(sys.executable),
+     "llama_server",
+ )
+
+ LLAMA_SERVER_EXE_PATH = os.path.join(
+     LLAMA_SERVER_EXE_DIR,
+     "llama-server.exe",
+ )
+
+
+ class LlamaTelemetry:
+     """
+     Manages telemetry data collection and display for llama server.
+     """
+
+     def __init__(self):
+         self.input_tokens = None
+         self.output_tokens = None
+         self.time_to_first_token = None
+         self.tokens_per_second = None
+         self.prompt_eval_time = None
+         self.eval_time = None
+         self.port = None
+
+     def choose_port(self):
+         """
+         Users probably don't care what port we start llama-server on, so let's
+         search for an empty port
+         """
+
+         self.port = find_free_port()
+
+         if self.port is None:
+             msg = "Failed to find an empty port to start llama-server on"
+             logging.error(msg)
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=msg,
+             )
+
+     def parse_telemetry_line(self, line: str):
+         """
+         Parse telemetry data from llama server output lines.
+         """
+
+         # Parse prompt evaluation line
+         prompt_match = re.search(
+             # pylint: disable=C0301
+             r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+             line,
+         )
+         if prompt_match:
+             prompt_time_ms = float(prompt_match.group(1))
+             input_tokens = int(prompt_match.group(2))
+
+             self.prompt_eval_time = prompt_time_ms / 1000.0
+             self.input_tokens = input_tokens
+             self.time_to_first_token = prompt_time_ms / 1000.0
+             return
+
+         # Parse generation evaluation line
+         eval_match = re.search(
+             r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+             line,
+         )
+         if eval_match:
+             eval_time_ms = float(eval_match.group(1))
+             output_tokens = int(eval_match.group(2))
+             tokens_per_second = float(eval_match.group(3))
+
+             self.eval_time = eval_time_ms / 1000.0
+             self.output_tokens = output_tokens
+             self.tokens_per_second = tokens_per_second
+             return
+
+     def get_telemetry_data(self):
+         return {
+             "input_tokens": self.input_tokens,
+             "output_tokens": self.output_tokens,
+             "time_to_first_token": self.time_to_first_token,
+             "tokens_per_second": self.tokens_per_second,
+             "decode_token_times": None,
+         }
+
+     def show_telemetry(self):
+         # Check if debug logging is enabled
+         if not logging.getLogger().isEnabledFor(logging.DEBUG):
+             return
+
+         # Prepare telemetry data (transposed format)
+         telemetry = [
+             ["Input tokens", self.input_tokens],
+             ["Output tokens", self.output_tokens],
+             ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
+             ["TPS", f"{self.tokens_per_second:.2f}"],
+         ]
+
+         table = tabulate(
+             telemetry, headers=["Metric", "Value"], tablefmt="fancy_grid"
+         ).split("\n")
+
+         # Show telemetry in debug while complying with uvicorn's log indentation
+         logging.debug("\n ".join(table))
+
+
+ def _log_subprocess_output(
+     process: subprocess.Popen, prefix: str, telemetry: LlamaTelemetry
+ ):
+     """
+     Read subprocess output line by line, log to debug, and parse telemetry
+     """
+
+     if process.stdout:
+         for line in iter(process.stdout.readline, ""):
+             if line:
+                 line_stripped = line.strip()
+                 logging.debug("%s: %s", prefix, line_stripped)
+
+                 telemetry.parse_telemetry_line(line_stripped)
+
+             if process.poll() is not None:
+                 break
+
+
+ def _wait_for_load(
+     llama_server_process: subprocess.Popen, port: int, fail_message: str
+ ):
+     status_code = None
+     while not llama_server_process.poll() and status_code != 200:
+         health_url = f"http://localhost:{port}/health"
+         try:
+             health_response = requests.get(health_url)
+         except requests.exceptions.ConnectionError:
+             logging.warning(fail_message)
+         else:
+             status_code = health_response.status_code
+             logging.debug(
+                 "Testing llama-server readiness (will retry until ready), "
+                 f"result: {health_response.json()}"
+             )
+         time.sleep(1)
+
+
+ def _launch_llama_subprocess(
+     model_path: str, use_gpu: bool, telemetry: LlamaTelemetry
+ ) -> subprocess.Popen:
+     """
+     Launch llama server subprocess with GPU or CPU configuration
+     """
+
+     # Find a port, and save it in the telemetry object for future reference
+     # by other functions
+     telemetry.choose_port()
+
+     base_command = [
+         LLAMA_SERVER_EXE_PATH,
+         "-m",
+         model_path,
+         "--port",
+         str(telemetry.port),
+         "--jinja",
+     ]
+
+     # Configure GPU layers: 99 for GPU, 0 for CPU-only
+     ngl_value = "99" if use_gpu else "0"
+     command = base_command + ["-ngl", ngl_value]
+
+     # Start subprocess with output capture
+     process = subprocess.Popen(
+         command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1
+     )
+
+     # Start background thread to log subprocess output
+     device_type = "GPU" if use_gpu else "CPU"
+     threading.Thread(
+         target=_log_subprocess_output,
+         args=(process, f"LLAMA SERVER {device_type}", telemetry),
+         daemon=True,
+     ).start()
+
+     return process
+
+
+ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
+     # Download llama.cpp server if it isn't already available
+     if not os.path.exists(LLAMA_SERVER_EXE_DIR):
+         # Download llama.cpp server zip
+         # pylint: disable=C0301
+         llama_zip_url = f"https://github.com/ggml-org/llama.cpp/releases/download/{LLAMA_VERSION}/llama-{LLAMA_VERSION}-bin-win-vulkan-x64.zip"
+         llama_zip_path = os.path.join(
+             os.path.dirname(sys.executable), "llama-server.zip"
+         )
+         logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
+
+         with requests.get(llama_zip_url, stream=True) as r:
+             r.raise_for_status()
+             with open(llama_zip_path, "wb") as f:
+                 for chunk in r.iter_content(chunk_size=8192):
+                     f.write(chunk)
+
+         # Extract zip
+         logging.info(f"Extracting {llama_zip_path} to {LLAMA_SERVER_EXE_DIR}")
+         with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
+             zip_ref.extractall(LLAMA_SERVER_EXE_DIR)
+
+         # Save version.txt
+         version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
+         with open(version_txt_path, "w", encoding="utf-8") as vf:
+             vf.write(LLAMA_VERSION)
+
+         # Delete zip file
+         os.remove(llama_zip_path)
+         logging.info("Cleaned up zip file")
+
+     # Download the gguf to the hugging face cache
+     snapshot_path = ModelManager().download_gguf(checkpoint)
+     model_path = os.path.join(snapshot_path, os.listdir(snapshot_path)[0])
+     logging.debug(f"GGUF file path: {model_path}")
+
+     # Start the llama-server.exe process
+     logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")
+
+     # Attempt loading on GPU first
+     llama_server_process = _launch_llama_subprocess(
+         model_path, use_gpu=True, telemetry=telemetry
+     )
+
+     # Check the /health endpoint until GPU server is ready
+     _wait_for_load(
+         llama_server_process,
+         telemetry.port,
+         f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
+     )
+
+     # If loading on GPU failed, try loading on CPU
+     if llama_server_process.poll():
+         llama_server_process = _launch_llama_subprocess(
+             model_path, use_gpu=False, telemetry=telemetry
+         )
+
+         # Check the /health endpoint until CPU server is ready
+         _wait_for_load(
+             llama_server_process,
+             telemetry.port,
+             f"Loading {model_reference} on CPU didn't work",
+         )
+
+     if llama_server_process.poll():
+         raise HTTPException(
+             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+             detail=f"Failed to load {model_reference} with llama.cpp",
+         )
+
+     return llama_server_process
+
+
+ def chat_completion(
+     chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
+ ):
+     base_url = f"http://127.0.0.1:{telemetry.port}/v1"
+     client = OpenAI(
+         base_url=base_url,
+         api_key="lemonade",
+     )
+
+     # Convert Pydantic model to dict and remove unset/null values
+     request_dict = chat_completion_request.model_dump(
+         exclude_unset=True, exclude_none=True
+     )
+
+     def event_stream():
+         try:
+             # Enable streaming
+             request_dict["stream"] = True
+             for chunk in client.chat.completions.create(**request_dict):
+                 yield f"data: {chunk.model_dump_json()}\n\n"
+             yield "data: [DONE]\n\n"
+
+             # Show telemetry after completion
+             telemetry.show_telemetry()
+
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+     return StreamingResponse(
+         event_stream(),
+         media_type="text/event-stream",
+         headers={
+             "Cache-Control": "no-cache",
+             "Connection": "keep-alive",
+         },
+     )
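
For reference, the prompt-eval regex in parse_telemetry_line can be exercised on its own. The sketch below is not part of the package, and the sample log line is synthetic, shaped to match the pattern rather than captured from a real llama-server run:

import re

# Synthetic llama-server style log line (illustrative only)
line = "prompt eval time =    94.32 ms /    17 tokens (    5.55 ms per token,   180.24 tokens per second)"

prompt_match = re.search(
    r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
    line,
)
if prompt_match:
    print(float(prompt_match.group(1)))  # prompt eval time in ms -> 94.32
    print(int(prompt_match.group(2)))    # input (prompt) tokens -> 17
    print(float(prompt_match.group(3)))  # prompt tokens per second -> 180.24

parse_telemetry_line converts the first group to seconds and also records it as time_to_first_token, which is why TTFT equals the prompt evaluation time in the telemetry table.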
@@ -0,0 +1,57 @@
+ import socketserver
+ import sys
+ import logging
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI
+
+
+ def find_free_port():
+     """
+     Scans for an unoccupied TCP port
+
+     Returns the port number as an int on success
+     Returns None if no port can be found
+     """
+
+     try:
+         with socketserver.TCPServer(("localhost", 0), None) as s:
+             return s.server_address[1]
+     # pylint: disable=broad-exception-caught
+     except Exception:
+         return None
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Code here will run when the application starts up
+     # Check if console can handle Unicode by testing emoji encoding
+
+     try:
+         if sys.stdout.encoding:
+             "🍋".encode(sys.stdout.encoding)
+         use_emojis = True
+     except (UnicodeEncodeError, AttributeError):
+         use_emojis = False
+
+     if use_emojis:
+         logging.info(
+             "\n"
+             "\n"
+             "🍋 Lemonade Server Ready!\n"
+             f"🍋 Open http://localhost:{app.port} in your browser for:\n"
+             "🍋 💬 chat\n"
+             "🍋 💻 model management\n"
+             "🍋 📄 docs\n"
+         )
+     else:
+         logging.info(
+             "\n"
+             "\n"
+             "[Lemonade] Lemonade Server Ready!\n"
+             f"[Lemonade] Open http://localhost:{app.port} in your browser for:\n"
+             "[Lemonade] chat\n"
+             "[Lemonade] model management\n"
+             "[Lemonade] docs\n"
+         )
+
+     yield
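
This second file appears to be lemonade.tools.server.port_utils, the module the llama-server code above imports find_free_port from. A minimal sketch of how lifespan and find_free_port could be wired into an app follows; it is not part of the package, and it assumes uvicorn as the ASGI server and that the application assigns app.port itself, since port is not a standard FastAPI attribute but is read by the startup banner:

import uvicorn
from fastapi import FastAPI

from lemonade.tools.server.port_utils import find_free_port, lifespan

app = FastAPI(lifespan=lifespan)

# lifespan() reads app.port when printing the startup banner, so set it first.
app.port = find_free_port() or 8000  # fall back to a fixed port if scanning fails

if __name__ == "__main__":
    uvicorn.run(app, host="localhost", port=app.port)

find_free_port relies on binding to port 0, which asks the OS for any unused port; the probe socket is closed immediately, so the port is only probably still free by the time the server binds it.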
@@ -0,0 +1,83 @@
+ from typing import Optional
+
+ from pydantic import BaseModel
+
+ # Set to a high number to allow for interesting experiences in real apps
+ # Tests should use the max_new_tokens argument to set a lower value
+ DEFAULT_MAX_NEW_TOKENS = 1500
+
+
+ class LoadConfig(BaseModel):
+     """
+     Configuration for loading a language model.
+
+     Specifies the model checkpoint, generation parameters,
+     and hardware/framework configuration (recipe) for model loading.
+     """
+
+     model_name: Optional[str] = None
+     checkpoint: Optional[str] = None
+     max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
+     recipe: Optional[str] = None
+     # Indicates the maximum prompt length allowed for that specific
+     # checkpoint + recipe combination
+     max_prompt_length: Optional[int] = None
+     # Indicates whether the model is a reasoning model, like DeepSeek
+     reasoning: Optional[bool] = False
+
+
+ class CompletionRequest(BaseModel):
+     """
+     Request model for text completion API endpoint.
+
+     Contains a prompt, a model identifier, and a streaming
+     flag to control response delivery.
+     """
+
+     prompt: str
+     model: str
+     echo: bool = False
+     stream: bool = False
+     logprobs: int | None = False
+     stop: list[str] | str | None = None
+     temperature: float | None = None
+     max_tokens: int | None = None
+
+
+ class ChatCompletionRequest(BaseModel):
+     """
+     Request model for chat completion API endpoint.
+
+     Contains a list of chat messages, a model identifier,
+     and a streaming flag to control response delivery.
+     """
+
+     messages: list[dict]
+     model: str
+     stream: bool = False
+     logprobs: int | None = False
+     stop: list[str] | str | None = None
+     temperature: float | None = None
+     tools: list[dict] | None = None
+     max_tokens: int | None = None
+     max_completion_tokens: int | None = None
+
+
+ class ResponsesRequest(BaseModel):
+     """
+     Request model for responses API endpoint.
+     """
+
+     input: list[dict] | str
+     model: str
+     max_output_tokens: int | None = None
+     temperature: float | None = None
+     stream: bool = False
+
+
+ class PullConfig(BaseModel):
+     """
+     Configuration for installing a supported LLM.
+     """
+
+     model_name: str
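
These schemas are plain pydantic (v2) models, which is what lets chat_completion in the first file forward only the fields a client actually sent. A small sketch of the model_dump call it relies on (not from the package; the model name is made up):

from lemonade.tools.server.pydantic_models import ChatCompletionRequest

request = ChatCompletionRequest(
    model="some-gguf-model",  # illustrative name, not a real Lemonade model id
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)

# exclude_unset/exclude_none drop the optional fields (stop, temperature, tools, ...)
# that the client never provided, so the forwarded request stays minimal.
print(request.model_dump(exclude_unset=True, exclude_none=True))
# {'messages': [{'role': 'user', 'content': 'Hello!'}], 'model': 'some-gguf-model', 'stream': True}

Only messages and model are required; every other field has a default, so the minimal request above is valid.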