lemonade-sdk 7.0.0__py3-none-any.whl → 7.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

@@ -7,12 +7,16 @@ import logging
 import traceback
 from typing import Optional, Union
 import json
+import subprocess
+from pathlib import Path
 
 from fastapi import FastAPI, HTTPException, status, Request
-from fastapi.responses import StreamingResponse, HTMLResponse
+from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
+from fastapi.staticfiles import StaticFiles
 import uvicorn
+from uvicorn.config import Config
+from uvicorn.server import Server as UvicornServer
 
 from transformers import TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
 from tabulate import tabulate
 
@@ -24,7 +28,11 @@ from openai.types.chat.chat_completion_message_tool_call import (
     Function,
 )
 from openai.types.chat.chat_completion import Choice
-from openai.types.chat.chat_completion_chunk import ChoiceDelta
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDelta,
+    ChoiceDeltaToolCall,
+    ChoiceDeltaToolCallFunction,
+)
 from openai.types.completion_choice import Logprobs
 from openai.types.model import Model
 from openai.types.responses import (
@@ -39,11 +47,18 @@ from openai.types.responses import (
 import lemonade.api as lemonade_api
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.management_tools import ManagementTool
-from lemonade.tools.server.tool_calls import extract_tool_calls
-
-# Set to a high number to allow for interesting experiences in real apps
-# Tests should use the max_new_tokens argument to set a lower value
-DEFAULT_MAX_NEW_TOKENS = 1500
+import lemonade.tools.server.llamacpp as llamacpp
+from lemonade.tools.server.pydantic_models import (
+    DEFAULT_MAX_NEW_TOKENS,
+    LoadConfig,
+    CompletionRequest,
+    ChatCompletionRequest,
+    ResponsesRequest,
+    PullConfig,
+)
+from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
+from lemonade.tools.server.instructions import get_instructions_html
+from lemonade.tools.server.port_utils import lifespan
 
 DEFAULT_PORT = 8000
 DEFAULT_LOG_LEVEL = "info"
@@ -101,97 +116,21 @@ class StopOnEvent(StoppingCriteria):
         return self.stop_event.is_set()
 
 
-class PullConfig(BaseModel):
-    """
-    Configurating for installing a supported LLM.
-    """
-
-    model_name: str
-
-
-class LoadConfig(BaseModel):
-    """
-    Configuration for loading a language model.
-
-    Specifies the model checkpoint, generation parameters,
-    and hardware/framework configuration (recipe) for model loading.
-    """
-
-    model_name: Optional[str] = None
-    checkpoint: Optional[str] = None
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
-    recipe: Optional[str] = None
-    # Indicates the maximum prompt length allowed for that specific
-    # checkpoint + recipe combination
-    max_prompt_length: Optional[int] = None
-    # Indicates whether the model is a reasoning model, like DeepSeek
-    reasoning: Optional[bool] = False
-
-
-class CompletionRequest(BaseModel):
-    """
-    Request model for text completion API endpoint.
-
-    Contains a prompt, a model identifier, and a streaming
-    flag to control response delivery.
-    """
-
-    prompt: str
-    model: str
-    echo: bool = False
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    max_tokens: int | None = None
-
-
-class ChatCompletionRequest(BaseModel):
-    """
-    Request model for chat completion API endpoint.
-
-    Contains a list of chat messages, a model identifier,
-    and a streaming flag to control response delivery.
-    """
-
-    messages: list[dict]
-    model: str
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    tools: list[dict] | None = None
-    max_tokens: int | None = None
-    max_completion_tokens: int | None = None
-
-
-class ResponsesRequest(BaseModel):
-    """
-    Request model for responses API endpoint.
-    """
-
-    input: list[dict] | str
-    model: str
-    max_output_tokens: int | None = None
-    temperature: float | None = None
-    stream: bool = False
-
-
 class Server(ManagementTool):
     """
     Open a web server that apps can use to communicate with the LLM.
 
     The server exposes these endpoints:
-    - /api/v0/pull: install an LLM by its Lemonade Server Model Name.
-    - /api/v0/load: load a model checkpoint.
-    - /api/v0/unload: unload a model checkpoint.
-    - /api/v0/health: check whether a model is loaded and ready to serve.
-    - /api/v0/stats: performance statistics for the generation.
-    - /api/v0/halt: stop an in-progress generation from make more tokens.
-    - /api/v0/completions: completion responses using HTTP chunked transfer encoding.
-    - /api/v0/chat/completions: chat completion responses using HTTP chunked transfer encoding.
-    - /api/v0/responses: responses API using HTTP chunked transfer encoding.
-    - /api/v0/models: list all available models.
+    - /api/v1/pull: install an LLM by its Lemonade Server Model Name.
+    - /api/v1/load: load a model checkpoint.
+    - /api/v1/unload: unload a model checkpoint.
+    - /api/v1/health: check whether a model is loaded and ready to serve.
+    - /api/v1/stats: performance statistics for the generation.
+    - /api/v1/halt: stop an in-progress generation from make more tokens.
+    - /api/v1/completions: completion responses using HTTP chunked transfer encoding.
+    - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
+    - /api/v1/responses: responses API using HTTP chunked transfer encoding.
+    - /api/v1/models: list all available models.
     """
 
     unique_name = "serve"
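
The docstring above now advertises the endpoints under /api/v1 while the handlers stay OpenAI-compatible. A rough sketch of a client against the new prefix, assuming the default port 8000 from this file and a placeholder model name (nothing here is prescribed by the diff beyond the route prefix):

# Sketch: point the OpenAI Python client at the /api/v1 prefix.
# The model name is a placeholder; use one that has already been pulled.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/api/v1",
    api_key="lemonade",  # placeholder value
)

completion = client.chat.completions.create(
    model="Llama-3.2-1B-Instruct-Hybrid",  # placeholder model name
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
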
@@ -200,7 +139,7 @@ class Server(ManagementTool):
         super().__init__()
 
         # Initialize FastAPI app
-        self.app = FastAPI()
+        self.app = FastAPI(lifespan=lifespan)
 
         # Add CORS middleware
         self.app.add_middleware(
@@ -212,23 +151,18 @@ class Server(ManagementTool):
         )
 
         # Set up custom routes
-        self.app.post("/api/v0/pull")(self.pull)
-        self.app.post("/api/v0/load")(self.load_llm)
-        self.app.post("/api/v0/unload")(self.unload_llm)
-        self.app.get("/api/v0/health")(self.health)
-        self.app.get("/api/v0/halt")(self.halt_generation)
-        self.app.get("/api/v0/stats")(self.send_stats)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.post("/api/v0/responses")(self.responses)
-
-        # Set up OpenAI-compatible routes
-        self.app.post("/api/v0/chat/completions")(self.chat_completions)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.get("/api/v0/models")(self.models)
+        self.setup_routes(["/api/v0", "/api/v1"])
 
         # Set up instructions
         self.app.get("/")(self.instructions)
 
+        # Mount a static assets dir for HTML responses, such
+        # as the instructions
+        static_dir = Path(__file__).parent / "static"
+        self.app.mount(
+            "/static", StaticFiles(directory=static_dir), name="static_assets"
+        )
+
         # Performance stats that are set during /ws and can be
         # fetched in /stats
         self.time_to_first_token = None
@@ -263,6 +197,28 @@ class Server(ManagementTool):
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()
 
+        # Subprocess handle for llama_server.exe
+        self.llama_server_process: subprocess.Popen = None
+
+        # Telemetry instance for llama server
+        self.llama_telemetry = llamacpp.LlamaTelemetry()
+
+    def setup_routes(self, api_prefixes: list[str]):
+        for prefix in api_prefixes:
+            # Custom routes
+            self.app.post(f"{prefix}/pull")(self.pull)
+            self.app.post(f"{prefix}/load")(self.load_llm)
+            self.app.post(f"{prefix}/unload")(self.unload_llm)
+            self.app.get(f"{prefix}/health")(self.health)
+            self.app.get(f"{prefix}/halt")(self.halt_generation)
+            self.app.get(f"{prefix}/stats")(self.send_stats)
+            self.app.post(f"{prefix}/completions")(self.completions)
+            self.app.post(f"{prefix}/responses")(self.responses)
+
+            # OpenAI-compatible routes
+            self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
+            self.app.get(f"{prefix}/models")(self.models)
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
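
Because setup_routes() registers every handler under each prefix it is given, existing /api/v0 clients keep working while /api/v1 becomes the documented path. A quick sanity check against a locally running server (host and port assumed) might look like:

# Sketch: the same health handler should answer under both API prefixes.
# Assumes a Lemonade Server instance is already listening on localhost:8000.
import requests

for prefix in ("/api/v0", "/api/v1"):
    resp = requests.get(f"http://localhost:8000{prefix}/health", timeout=10)
    print(prefix, resp.status_code, resp.json())
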
@@ -288,15 +244,22 @@ class Server(ManagementTool):
 
         return parser
 
-    def run(
+    def _setup_server_common(
         self,
-        # ManagementTool has a required cache_dir arg, but
-        # we always use the default cache directory
-        _=None,
-        port: int = DEFAULT_PORT,
-        log_level: str = DEFAULT_LOG_LEVEL,
+        port: int,
         truncate_inputs: bool = False,
+        log_level: str = DEFAULT_LOG_LEVEL,
+        threaded_mode: bool = False,
     ):
+        """
+        Common setup logic shared between run() and run_in_thread().
+
+        Args:
+            port: Port number for the server
+            truncate_inputs: Whether to truncate inputs if they exceed max length
+            log_level: Logging level to configure
+            threaded_mode: Whether this is being set up for threaded execution
+        """
         # Store truncation settings
         self.truncate_inputs = truncate_inputs
 
@@ -310,22 +273,27 @@ class Server(ManagementTool):
 
         logging.trace = trace
 
-        # Configure logging to match uvicorn's format
-        logging_level = getattr(logging, log_level.upper())
-        logging.basicConfig(
-            level=logging_level,
-            format="%(levelprefix)s %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
+        # Configure logging based on mode
+        if threaded_mode:
+            # Configure logging for warning level (to reduce noise in threaded execution)
+            logging.getLogger("uvicorn.error").setLevel(logging.WARNING)
+        else:
+            # Configure logging to match uvicorn's format
+            logging_level = getattr(logging, log_level.upper())
+            logging.basicConfig(
+                level=logging_level,
+                format="%(levelprefix)s %(message)s",
+                datefmt="%Y-%m-%d %H:%M:%S",
+            )
 
-        # Add uvicorn's log formatter
-        logging.root.handlers[0].formatter = uvicorn.logging.DefaultFormatter(
-            fmt="%(levelprefix)s %(message)s",
-            use_colors=True,
-        )
+            # Add uvicorn's log formatter
+            logging.root.handlers[0].formatter = uvicorn.logging.DefaultFormatter(
+                fmt="%(levelprefix)s %(message)s",
+                use_colors=True,
+            )
 
-        # Ensure the log level is properly set
-        logging.getLogger().setLevel(logging_level)
+            # Ensure the log level is properly set
+            logging.getLogger().setLevel(logging_level)
 
         # Update debug logging state after setting log level
         self.debug_logging_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)
@@ -334,8 +302,66 @@ class Server(ManagementTool):
         # Print the elapsed time for each request
         self.setup_middleware_timer()
 
+        # Let the app know what port it's running on, so
+        # that the lifespan can access it
+        self.app.port = port
+
+    def run(
+        self,
+        # ManagementTool has a required cache_dir arg, but
+        # we always use the default cache directory
+        _=None,
+        port: int = DEFAULT_PORT,
+        log_level: str = DEFAULT_LOG_LEVEL,
+        truncate_inputs: bool = False,
+    ):
+        # Common setup
+        self._setup_server_common(
+            port=port,
+            truncate_inputs=truncate_inputs,
+            log_level=log_level,
+            threaded_mode=False,
+        )
+
         uvicorn.run(self.app, host="localhost", port=port, log_level=log_level)
 
+    def run_in_thread(
+        self,
+        port: int = DEFAULT_PORT,
+        host: str = "localhost",
+        log_level: str = "warning",
+        truncate_inputs: bool = False,
+    ):
+        """
+        Set up the server for running in a thread.
+        Returns a uvicorn server instance that can be controlled externally.
+        """
+        # Common setup
+        self._setup_server_common(
+            port=port,
+            truncate_inputs=truncate_inputs,
+            log_level=log_level,
+            threaded_mode=True,
+        )
+
+        class CustomServer(UvicornServer):
+            """Custom Uvicorn server that can be properly shutdown from another thread"""
+
+            def install_signal_handlers(self):
+                pass
+
+        # Configure the server
+        config = Config(
+            app=self.app,
+            host=host,
+            port=port,
+            log_level=log_level,
+            log_config=None,
+        )
+
+        # Create and return the uvicorn server
+        return CustomServer(config=config)
+
     async def _show_telemetry(self):
         """
         Show telemetry data in debug mode.
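
Unlike run(), which calls uvicorn.run() and blocks, run_in_thread() hands back a uvicorn server object so the caller decides when it starts and stops. A sketch of driving that handle from a worker thread; the import path for Server is an assumption, while .run() and .should_exit come from uvicorn's Server API:

# Sketch: start the returned uvicorn server on a worker thread, then stop it.
import threading
import time

from lemonade.tools.server.serve import Server  # assumed module path

server_tool = Server()
uvicorn_server = server_tool.run_in_thread(port=8000, log_level="warning")

thread = threading.Thread(target=uvicorn_server.run, daemon=True)
thread.start()

time.sleep(5)  # ... interact with http://localhost:8000/api/v1 here ...

# Ask the uvicorn server to exit and wait for the thread to wind down.
uvicorn_server.should_exit = True
thread.join()
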
@@ -363,31 +389,8 @@ class Server(ManagementTool):
         """
         Show instructions on how to use the server.
         """
-        html_content = """
-        <!DOCTYPE html>
-        <html>
-        <head>
-            <title>Lemonade Server</title>
-            <link rel="icon" href="data:,">
-        </head>
-        <body>
-            <h1>🍋 Welcome to Lemonade Server!</h1>
-            <p>
-                A standards-compliant server that provides REST APIs for LLM communication.
-                To get started, simply point your OpenAI-compatible application at the server's endpoint.
-            </p>
-            <div class="links">
-                <h3>Documentation:</h3>
-                <ul>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/tree/main/docs/server/apps/README.md">Examples & Usage</a></li>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_integration.md">Integration Guide</a></li>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_spec.md">Server Specification</a></li>
-                </ul>
-            </div>
-        </body>
-        </html>
-        """
-        return HTMLResponse(content=html_content, status_code=200)
+
+        return get_instructions_html(port=self.app.port)
 
     def initialize_load_config(
         self, request: Union[ChatCompletionRequest, CompletionRequest]
@@ -530,10 +533,6 @@ class Server(ManagementTool):
         Stream chat completion responses using HTTP chunked transfer encoding.
         """
 
-        if chat_completion_request.tools and chat_completion_request.stream:
-            logging.warning(
-                "tools are only supported on non-streaming chat completions"
-            )
         if chat_completion_request.logprobs:
             logging.warning("logprobs is not supported on chat completion")
 
@@ -542,14 +541,15 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc, internal_call=True)
 
+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.chat_completion(
+                chat_completion_request, self.llama_telemetry
+            )
+
         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
             chat_completion_request.messages,
-            tools=(
-                chat_completion_request.tools
-                if not chat_completion_request.stream
-                else None
-            ),
+            tools=chat_completion_request.tools,
         )
 
         # If the model supports reasoning, we:
@@ -585,6 +585,12 @@ class Server(ManagementTool):
             "max_new_tokens": max_new_tokens,
         }
 
+        if chat_completion_request.tools:
+            # Get the tool call pattern
+            tool_call_pattern = get_tool_call_pattern(
+                self.tokenizer.auto_tokenizer.added_tokens_decoder
+            )
+
         if chat_completion_request.stream:
 
             # Stream the response
@@ -594,7 +600,38 @@ class Server(ManagementTool):
                 # in the inner function
                 nonlocal reasoning_first_token
 
+                # Keep track of the full response for tool call extraction
+                full_response = ""
+
                 async for token in self._generate_tokens(**generation_args):
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:
+
+                        # Append the token to the full response
+                        full_response += token
+
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
+                        )
+
+                        # If there are tool calls, reset the full response for the next tool call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(tool_call["arguments"]),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
+                                )
 
                     # Create a ChatCompletionChunk
                     chunk = ChatCompletionChunk.model_construct(
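
With the streaming path now accumulating tokens and running extract_tool_calls() on the fly, a chunk's delta can carry tool_calls entries as soon as a complete call is detected. On the consumer side that surfaces through the normal OpenAI streaming interface; a hedged sketch, with an illustrative model name and tool schema:

# Sketch: watch for tool calls while streaming a chat completion.
import json
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="-")

stream = client.chat.completions.create(
    model="Qwen2.5-7B-Instruct-Hybrid",  # illustrative model name
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        },
    }],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.tool_calls:
        for call in delta.tool_calls:
            print(call.function.name, json.loads(call.function.arguments))
    elif delta.content:
        print(delta.content, end="", flush=True)
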
@@ -613,7 +650,7 @@ class Server(ManagementTool):
                                     ),
                                     function_call=None,
                                     role="assistant",
-                                    tool_calls=None,
+                                    tool_calls=openai_tool_calls,
                                     refusal=None,
                                 ),
                                 finish_reason=None,
@@ -648,7 +685,7 @@ class Server(ManagementTool):
         openai_tool_calls = None
         if chat_completion_request.tools:
             tool_calls, full_response = extract_tool_calls(
-                full_response, self.tokenizer.auto_tokenizer.added_tokens_decoder
+                full_response, tool_call_pattern
             )
             if tool_calls:
                 openai_tool_calls = []
@@ -767,6 +804,7 @@ class Server(ManagementTool):
             created_event = ResponseCreatedEvent(
                 response=response,
                 type="response.created",
+                sequence_number=0,
             )
             yield f"data: {created_event.model_dump_json()}\n\n".encode("utf-8")
 
@@ -781,6 +819,7 @@ class Server(ManagementTool):
                 item_id="0 ",
                 output_index=0,
                 type="response.output_text.delta",
+                sequence_number=0,
             )
             full_response += token
 
@@ -815,6 +854,7 @@ class Server(ManagementTool):
             completed_event = ResponseCompletedEvent(
                 response=response,
                 type="response.completed",
+                sequence_number=0,
             )
             yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")
 
@@ -1035,6 +1075,11 @@ class Server(ManagementTool):
         """
         Send performance statistics to the client.
         """
+        # If using llama server, get telemetry from the telemetry instance
+        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
+            return self.llama_telemetry.get_telemetry_data()
+
+        # For built-in server, use the existing telemetry
         return {
             "time_to_first_token": self.time_to_first_token,
             "tokens_per_second": self.tokens_per_second,
@@ -1246,15 +1291,25 @@ class Server(ManagementTool):
 
         logging.info(f"Loading llm: {model_reference}")
         try:
-            self.model, self.tokenizer = lemonade_api.from_pretrained(
-                checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
-            )
+            if config_to_use.recipe == "llamacpp":
+                self.llama_server_process = llamacpp.server_load(
+                    checkpoint=config_to_use.checkpoint,
+                    model_reference=model_reference,
+                    telemetry=self.llama_telemetry,
+                )
+
+            else:
+                self.model, self.tokenizer = lemonade_api.from_pretrained(
+                    checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
+                )
             self.llm_loaded = config_to_use
 
             return {
                 "status": "success",
                 "message": f"Loaded model: {model_reference}",
             }
+        except HTTPException:
+            raise
         except Exception:  # pylint: disable=broad-exception-caught
             self.model_load_failure(model_reference)
 
@@ -1279,6 +1334,9 @@ class Server(ManagementTool):
         for _ in range(self.max_concurrent_generations):
             await self._generate_semaphore.acquire()
 
+        if self.llm_loaded.recipe == "llamacpp":
+            self.llama_server_process.terminate()
+
         self.llm_loaded = None
         self.tokenizer = None
         self.model = None
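
Taken together, load_llm() spawns a separate llama_server process for the "llamacpp" recipe (falling back to lemonade_api.from_pretrained() otherwise), and unload_llm() terminates that process before clearing the model state. Over HTTP the round trip might look like the sketch below; the checkpoint and model name are placeholders, while the request fields mirror the LoadConfig model moved into pydantic_models:

# Sketch: load a checkpoint through the llamacpp recipe, then unload it.
# Checkpoint and model name are placeholders; use a model you have pulled.
import requests

BASE = "http://localhost:8000/api/v1"

resp = requests.post(
    f"{BASE}/load",
    json={
        "model_name": "my-gguf-model",             # placeholder
        "checkpoint": "some-org/some-model-GGUF",  # placeholder
        "recipe": "llamacpp",
    },
    timeout=300,
)
print(resp.json())  # {"status": "success", "message": "Loaded model: ..."} on success

requests.post(f"{BASE}/unload", timeout=60)
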