lemonade-sdk 7.0.0__py3-none-any.whl → 7.0.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry, and is provided for informational purposes only.

The registry has flagged this version of lemonade-sdk as potentially problematic.

```diff
@@ -7,11 +7,14 @@ import logging
 import traceback
 from typing import Optional, Union
 import json
+import subprocess
+from contextlib import asynccontextmanager
+from pathlib import Path
 
 from fastapi import FastAPI, HTTPException, status, Request
-from fastapi.responses import StreamingResponse, HTMLResponse
+from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
+from fastapi.staticfiles import StaticFiles
 import uvicorn
 from transformers import TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
 from tabulate import tabulate
@@ -24,7 +27,11 @@ from openai.types.chat.chat_completion_message_tool_call import (
     Function,
 )
 from openai.types.chat.chat_completion import Choice
-from openai.types.chat.chat_completion_chunk import ChoiceDelta
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDelta,
+    ChoiceDeltaToolCall,
+    ChoiceDeltaToolCallFunction,
+)
 from openai.types.completion_choice import Logprobs
 from openai.types.model import Model
 from openai.types.responses import (
@@ -39,11 +46,18 @@ from openai.types.responses import (
 import lemonade.api as lemonade_api
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.management_tools import ManagementTool
-from lemonade.tools.server.tool_calls import extract_tool_calls
+import lemonade.tools.server.llamacpp as llamacpp
+from lemonade.tools.server.pydantic_models import (
+    DEFAULT_MAX_NEW_TOKENS,
+    LoadConfig,
+    CompletionRequest,
+    ChatCompletionRequest,
+    ResponsesRequest,
+    PullConfig,
+)
+from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
+from lemonade.tools.server.instructions import get_instructions_html
 
-# Set to a high number to allow for interesting experiences in real apps
-# Tests should use the max_new_tokens argument to set a lower value
-DEFAULT_MAX_NEW_TOKENS = 1500
 
 DEFAULT_PORT = 8000
 DEFAULT_LOG_LEVEL = "info"
```

```diff
@@ -101,97 +115,21 @@ class StopOnEvent(StoppingCriteria):
         return self.stop_event.is_set()
 
 
-class PullConfig(BaseModel):
-    """
-    Configurating for installing a supported LLM.
-    """
-
-    model_name: str
-
-
-class LoadConfig(BaseModel):
-    """
-    Configuration for loading a language model.
-
-    Specifies the model checkpoint, generation parameters,
-    and hardware/framework configuration (recipe) for model loading.
-    """
-
-    model_name: Optional[str] = None
-    checkpoint: Optional[str] = None
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
-    recipe: Optional[str] = None
-    # Indicates the maximum prompt length allowed for that specific
-    # checkpoint + recipe combination
-    max_prompt_length: Optional[int] = None
-    # Indicates whether the model is a reasoning model, like DeepSeek
-    reasoning: Optional[bool] = False
-
-
-class CompletionRequest(BaseModel):
-    """
-    Request model for text completion API endpoint.
-
-    Contains a prompt, a model identifier, and a streaming
-    flag to control response delivery.
-    """
-
-    prompt: str
-    model: str
-    echo: bool = False
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    max_tokens: int | None = None
-
-
-class ChatCompletionRequest(BaseModel):
-    """
-    Request model for chat completion API endpoint.
-
-    Contains a list of chat messages, a model identifier,
-    and a streaming flag to control response delivery.
-    """
-
-    messages: list[dict]
-    model: str
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    tools: list[dict] | None = None
-    max_tokens: int | None = None
-    max_completion_tokens: int | None = None
-
-
-class ResponsesRequest(BaseModel):
-    """
-    Request model for responses API endpoint.
-    """
-
-    input: list[dict] | str
-    model: str
-    max_output_tokens: int | None = None
-    temperature: float | None = None
-    stream: bool = False
-
-
 class Server(ManagementTool):
     """
     Open a web server that apps can use to communicate with the LLM.
 
     The server exposes these endpoints:
-    - /api/v0/pull: install an LLM by its Lemonade Server Model Name.
-    - /api/v0/load: load a model checkpoint.
-    - /api/v0/unload: unload a model checkpoint.
-    - /api/v0/health: check whether a model is loaded and ready to serve.
-    - /api/v0/stats: performance statistics for the generation.
-    - /api/v0/halt: stop an in-progress generation from make more tokens.
-    - /api/v0/completions: completion responses using HTTP chunked transfer encoding.
-    - /api/v0/chat/completions: chat completion responses using HTTP chunked transfer encoding.
-    - /api/v0/responses: responses API using HTTP chunked transfer encoding.
-    - /api/v0/models: list all available models.
+    - /api/v1/pull: install an LLM by its Lemonade Server Model Name.
+    - /api/v1/load: load a model checkpoint.
+    - /api/v1/unload: unload a model checkpoint.
+    - /api/v1/health: check whether a model is loaded and ready to serve.
+    - /api/v1/stats: performance statistics for the generation.
+    - /api/v1/halt: stop an in-progress generation from make more tokens.
+    - /api/v1/completions: completion responses using HTTP chunked transfer encoding.
+    - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
+    - /api/v1/responses: responses API using HTTP chunked transfer encoding.
+    - /api/v1/models: list all available models.
     """
 
     unique_name = "serve"
```

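The request/response models (PullConfig, LoadConfig, CompletionRequest, ChatCompletionRequest, ResponsesRequest) move out of this file into lemonade.tools.server.pydantic_models, and the documented endpoints gain an /api/v1 prefix; the old /api/v0 prefix is still registered (see setup_routes below), so existing clients keep working. A minimal client sketch against the OpenAI-compatible routes, assuming a local server on the default port 8000; the model name is a placeholder for whatever model has been pulled:

```python
# Minimal client sketch for the new /api/v1 prefix. The model name below is a
# placeholder; the server address assumes the defaults (localhost:8000).
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/api/v1",
    api_key="unused",  # the client requires a value; a local server typically ignores it
)

completion = client.chat.completions.create(
    model="<your-model-name>",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(completion.choices[0].message.content)
```
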
```diff
@@ -200,7 +138,7 @@ class Server(ManagementTool):
         super().__init__()
 
         # Initialize FastAPI app
-        self.app = FastAPI()
+        self.app = FastAPI(lifespan=lifespan)
 
         # Add CORS middleware
         self.app.add_middleware(
@@ -212,23 +150,18 @@ class Server(ManagementTool):
         )
 
         # Set up custom routes
-        self.app.post("/api/v0/pull")(self.pull)
-        self.app.post("/api/v0/load")(self.load_llm)
-        self.app.post("/api/v0/unload")(self.unload_llm)
-        self.app.get("/api/v0/health")(self.health)
-        self.app.get("/api/v0/halt")(self.halt_generation)
-        self.app.get("/api/v0/stats")(self.send_stats)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.post("/api/v0/responses")(self.responses)
-
-        # Set up OpenAI-compatible routes
-        self.app.post("/api/v0/chat/completions")(self.chat_completions)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.get("/api/v0/models")(self.models)
+        self.setup_routes(["/api/v0", "/api/v1"])
 
         # Set up instructions
         self.app.get("/")(self.instructions)
 
+        # Mount a static assets dir for HTML responses, such
+        # as the instructions
+        static_dir = Path(__file__).parent / "static"
+        self.app.mount(
+            "/static", StaticFiles(directory=static_dir), name="static_assets"
+        )
+
         # Performance stats that are set during /ws and can be
         # fetched in /stats
         self.time_to_first_token = None
@@ -263,6 +196,28 @@ class Server(ManagementTool):
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()
 
+        # Subprocess handle for llama_server.exe
+        self.llama_server_process: subprocess.Popen = None
+
+        # Telemetry instance for llama server
+        self.llama_telemetry = llamacpp.LlamaTelemetry()
+
+    def setup_routes(self, api_prefixes: list[str]):
+        for prefix in api_prefixes:
+            # Custom routes
+            self.app.post(f"{prefix}/pull")(self.pull)
+            self.app.post(f"{prefix}/load")(self.load_llm)
+            self.app.post(f"{prefix}/unload")(self.unload_llm)
+            self.app.get(f"{prefix}/health")(self.health)
+            self.app.get(f"{prefix}/halt")(self.halt_generation)
+            self.app.get(f"{prefix}/stats")(self.send_stats)
+            self.app.post(f"{prefix}/completions")(self.completions)
+            self.app.post(f"{prefix}/responses")(self.responses)
+
+            # OpenAI-compatible routes
+            self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
+            self.app.get(f"{prefix}/models")(self.models)
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
```

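Route registration is factored into setup_routes, which binds the same handler methods under every prefix. This works because FastAPI's app.post(path) / app.get(path) return decorators, so calling the decorator with a bound method registers it directly. A standalone sketch of the pattern with toy handlers (not Lemonade's):

```python
# Standalone sketch of the multi-prefix registration pattern.
# The handlers are toy stand-ins, not Lemonade's.
from fastapi import FastAPI

app = FastAPI()


async def health():
    return {"status": "ok"}


async def models():
    return {"data": []}


for prefix in ["/api/v0", "/api/v1"]:
    # app.get(path) returns a decorator; calling it with a function
    # registers that function as the handler for the path
    app.get(f"{prefix}/health")(health)
    app.get(f"{prefix}/models")(models)
```

Running this with uvicorn, GET /api/v0/health and GET /api/v1/health serve the same handler, which is how the server stays backwards compatible while introducing the v1 prefix.
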
```diff
@@ -334,6 +289,10 @@ class Server(ManagementTool):
         # Print the elapsed time for each request
         self.setup_middleware_timer()
 
+        # Let the app know what port it's running on, so
+        # that the lifespan can access it
+        self.app.port = port
+
         uvicorn.run(self.app, host="localhost", port=port, log_level=log_level)
 
     async def _show_telemetry(self):
@@ -363,31 +322,8 @@ class Server(ManagementTool):
         """
         Show instructions on how to use the server.
         """
-        html_content = """
-        <!DOCTYPE html>
-        <html>
-        <head>
-            <title>Lemonade Server</title>
-            <link rel="icon" href="data:,">
-        </head>
-        <body>
-            <h1>🍋 Welcome to Lemonade Server!</h1>
-            <p>
-                A standards-compliant server that provides REST APIs for LLM communication.
-                To get started, simply point your OpenAI-compatible application at the server's endpoint.
-            </p>
-            <div class="links">
-                <h3>Documentation:</h3>
-                <ul>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/tree/main/docs/server/apps/README.md">Examples & Usage</a></li>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_integration.md">Integration Guide</a></li>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_spec.md">Server Specification</a></li>
-                </ul>
-            </div>
-        </body>
-        </html>
-        """
-        return HTMLResponse(content=html_content, status_code=200)
+
+        return get_instructions_html(port=self.app.port)
 
     def initialize_load_config(
         self, request: Union[ChatCompletionRequest, CompletionRequest]
```

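The inline welcome page is replaced by get_instructions_html from lemonade.tools.server.instructions, with supporting assets served from the newly mounted /static directory and the runtime port passed in. The real helper is not shown in this diff; purely as an illustration, such a helper could render a template from the static directory, along these lines (file name and placeholder syntax are assumptions):

```python
# Illustrative sketch only: one way a get_instructions_html(port=...) helper
# could work. The template name and {{port}} placeholder are assumptions; the
# actual lemonade.tools.server.instructions module may differ.
from pathlib import Path

from fastapi.responses import HTMLResponse

STATIC_DIR = Path(__file__).parent / "static"


def get_instructions_html(port: int) -> HTMLResponse:
    template = (STATIC_DIR / "instructions.html").read_text(encoding="utf-8")
    # Substitute the runtime port so links on the page point at this server
    html = template.replace("{{port}}", str(port))
    return HTMLResponse(content=html, status_code=200)
```
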
```diff
@@ -530,10 +466,6 @@ class Server(ManagementTool):
         Stream chat completion responses using HTTP chunked transfer encoding.
         """
 
-        if chat_completion_request.tools and chat_completion_request.stream:
-            logging.warning(
-                "tools are only supported on non-streaming chat completions"
-            )
         if chat_completion_request.logprobs:
             logging.warning("logprobs is not supported on chat completion")
 
@@ -542,14 +474,15 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc, internal_call=True)
 
+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.chat_completion(
+                chat_completion_request, self.llama_telemetry
+            )
+
         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
             chat_completion_request.messages,
-            tools=(
-                chat_completion_request.tools
-                if not chat_completion_request.stream
-                else None
-            ),
+            tools=chat_completion_request.tools,
         )
 
         # If the model supports reasoning, we:
```

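When the loaded recipe is "llamacpp", chat completions are now handed off to the llamacpp helper module together with a telemetry object, bypassing the in-process generation path. The helper itself is not in this diff; since llama.cpp's llama-server exposes an OpenAI-compatible HTTP API, this kind of delegation typically amounts to forwarding the request to the subprocess's local endpoint. A hedged sketch under that assumption (port, payload shape, and the absence of streaming handling are assumptions, not Lemonade's actual code):

```python
# Hedged sketch of delegating a chat completion to a llama.cpp server
# subprocess over its OpenAI-compatible HTTP API. The port and payload shape
# are assumptions for illustration; lemonade.tools.server.llamacpp may differ.
import requests

LLAMA_SERVER_URL = "http://localhost:8080/v1/chat/completions"  # assumed default port


def forward_chat_completion(messages: list[dict], model: str) -> dict:
    response = requests.post(
        LLAMA_SERVER_URL,
        json={"model": model, "messages": messages},
        timeout=600,
    )
    response.raise_for_status()
    return response.json()
```
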
```diff
@@ -585,6 +518,12 @@ class Server(ManagementTool):
             "max_new_tokens": max_new_tokens,
         }
 
+        if chat_completion_request.tools:
+            # Get the tool call pattern
+            tool_call_pattern = get_tool_call_pattern(
+                self.tokenizer.auto_tokenizer.added_tokens_decoder
+            )
+
         if chat_completion_request.stream:
 
             # Stream the response
```

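Instead of handing the tokenizer's added_tokens_decoder to extract_tool_calls on every call, the tool-call pattern is now computed once per request via get_tool_call_pattern and reused by both the streaming and non-streaming paths. For intuition only: many chat models emit tool calls as JSON wrapped in special tokens in the generated text, and extraction then reduces to scanning for that delimiter pattern. A toy extractor under that assumption (the delimiters below are illustrative; the real pattern comes from the tokenizer's special tokens):

```python
# Toy illustration of extracting tool calls embedded in generated text.
# The <tool_call>...</tool_call> delimiters are an assumption; the actual
# pattern is derived from the tokenizer's added special tokens.
import json
import re

TOOL_CALL_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)


def toy_extract_tool_calls(text: str) -> list[dict]:
    calls = []
    for match in TOOL_CALL_RE.finditer(text):
        try:
            # Expected shape: {"name": ..., "arguments": {...}}
            calls.append(json.loads(match.group(1)))
        except json.JSONDecodeError:
            # Incomplete JSON: the model is still mid-generation, keep waiting
            continue
    return calls
```
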
```diff
@@ -594,7 +533,38 @@ class Server(ManagementTool):
                 # in the inner function
                 nonlocal reasoning_first_token
 
+                # Keep track of the full response for tool call extraction
+                full_response = ""
+
                 async for token in self._generate_tokens(**generation_args):
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:
+
+                        # Append the token to the full response
+                        full_response += token
+
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
+                        )
+
+                        # If there are tool calls, reset the full response for the next tool call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(tool_call["arguments"]),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
+                                )
 
                     # Create a ChatCompletionChunk
                     chunk = ChatCompletionChunk.model_construct(
@@ -613,7 +583,7 @@ class Server(ManagementTool):
                         ),
                         function_call=None,
                         role="assistant",
-                        tool_calls=None,
+                        tool_calls=openai_tool_calls,
                         refusal=None,
                     ),
                     finish_reason=None,
@@ -648,7 +618,7 @@ class Server(ManagementTool):
             openai_tool_calls = None
             if chat_completion_request.tools:
                 tool_calls, full_response = extract_tool_calls(
-                    full_response, self.tokenizer.auto_tokenizer.added_tokens_decoder
+                    full_response, tool_call_pattern
                 )
                 if tool_calls:
                     openai_tool_calls = []
```

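The streaming path now emits tool calls as ChoiceDeltaToolCall entries on the chunk's delta, rather than supporting tools only on non-streaming responses; that is why the earlier "tools are only supported on non-streaming chat completions" warning was removed. A sketch of a client consuming those streamed deltas with the OpenAI client (model name and tool schema are placeholders; the server address assumes the defaults):

```python
# Sketch of a client consuming streamed tool-call deltas. The model name and
# tool definition are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="unused")

stream = client.chat.completions.create(
    model="<your-model-name>",
    messages=[{"role": "user", "content": "What's the weather in Austin?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        },
    }],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.tool_calls:
        for tool_call in delta.tool_calls:
            print(tool_call.function.name, tool_call.function.arguments)
    elif delta.content:
        print(delta.content, end="")
```
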
```diff
@@ -767,6 +737,7 @@ class Server(ManagementTool):
             created_event = ResponseCreatedEvent(
                 response=response,
                 type="response.created",
+                sequence_number=0,
             )
             yield f"data: {created_event.model_dump_json()}\n\n".encode("utf-8")
 
@@ -781,6 +752,7 @@ class Server(ManagementTool):
                     item_id="0 ",
                     output_index=0,
                     type="response.output_text.delta",
+                    sequence_number=0,
                 )
                 full_response += token
 
@@ -815,6 +787,7 @@ class Server(ManagementTool):
             completed_event = ResponseCompletedEvent(
                 response=response,
                 type="response.completed",
+                sequence_number=0,
             )
             yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")
 
```

```diff
@@ -1035,6 +1008,11 @@ class Server(ManagementTool):
         """
         Send performance statistics to the client.
         """
+        # If using llama server, get telemetry from the telemetry instance
+        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
+            return self.llama_telemetry.get_telemetry_data()
+
+        # For built-in server, use the existing telemetry
         return {
             "time_to_first_token": self.time_to_first_token,
             "tokens_per_second": self.tokens_per_second,
```

```diff
@@ -1246,9 +1224,17 @@ class Server(ManagementTool):
 
         logging.info(f"Loading llm: {model_reference}")
         try:
-            self.model, self.tokenizer = lemonade_api.from_pretrained(
-                checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
-            )
+            if config_to_use.recipe == "llamacpp":
+                self.llama_server_process = llamacpp.server_load(
+                    checkpoint=config_to_use.checkpoint,
+                    model_reference=model_reference,
+                    telemetry=self.llama_telemetry,
+                )
+
+            else:
+                self.model, self.tokenizer = lemonade_api.from_pretrained(
+                    checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
+                )
             self.llm_loaded = config_to_use
 
             return {
@@ -1279,6 +1265,9 @@ class Server(ManagementTool):
         for _ in range(self.max_concurrent_generations):
             await self._generate_semaphore.acquire()
 
+        if self.llm_loaded.recipe == "llamacpp":
+            self.llama_server_process.terminate()
+
         self.llm_loaded = None
         self.tokenizer = None
         self.model = None
```

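Loading a model whose recipe is "llamacpp" now starts a llama server subprocess via llamacpp.server_load, and unloading terminates that process before clearing the loaded state. Driving this over HTTP looks roughly as follows; the field names follow the LoadConfig model (model_name, checkpoint, recipe, ...), while the checkpoint value and the empty-body unload call are placeholders and assumptions, not verified behavior:

```python
# Drive the load/unload endpoints over HTTP. Field names follow LoadConfig;
# the checkpoint value is a placeholder, and sending /unload with no body
# is an assumption.
import requests

BASE = "http://localhost:8000/api/v1"

# Load a GGUF checkpoint through the llamacpp recipe
requests.post(
    f"{BASE}/load",
    json={"checkpoint": "<org>/<repo>:<file>.gguf", "recipe": "llamacpp"},
    timeout=600,
).raise_for_status()

# ...generate as usual, then shut the llama server subprocess down
requests.post(f"{BASE}/unload", timeout=60).raise_for_status()
```
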
```diff
@@ -1350,5 +1339,22 @@ class Server(ManagementTool):
         return response
 
 
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Code here will run when the application starts up
+
+    logging.info(
+        "\n"
+        "\n"
+        "🍋 Lemonade Server Ready!\n"
+        f"🍋 Open http://localhost:{app.port} in your browser for:\n"
+        "🍋 💬 chat\n"
+        "🍋 💻 model management\n"
+        "🍋 📄 docs\n"
+    )
+
+    yield
+
+
 # This file was originally licensed under Apache 2.0. It has been modified.
 # Modifications Copyright (c) 2025 AMD
```

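Startup logging moves into a FastAPI lifespan context manager, and run() stashes the chosen port on the app object (self.app.port = port) before calling uvicorn so the lifespan can read it. A minimal standalone sketch of that pattern, separate from the Lemonade code:

```python
# Standalone sketch of the lifespan + app-attribute pattern used above.
import logging
from contextlib import asynccontextmanager

import uvicorn
from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Runs once at startup, before any requests are served
    logging.info("Server ready on http://localhost:%s", app.port)
    yield
    # Anything after the yield would run at shutdown


app = FastAPI(lifespan=lifespan)

if __name__ == "__main__":
    port = 8000
    app.port = port  # plain attribute; FastAPI instances accept arbitrary attributes
    uvicorn.run(app, host="localhost", port=port, log_level="info")
```
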