lemonade-sdk 7.0.0-py3-none-any.whl → 7.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/tools/server/instructions.py +294 -0
- lemonade/tools/server/llamacpp.py +289 -0
- lemonade/tools/server/pydantic_models.py +83 -0
- lemonade/tools/server/serve.py +152 -146
- lemonade/tools/server/static/styles.css +313 -0
- lemonade/tools/server/tool_calls.py +50 -43
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/METADATA +4 -7
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/RECORD +17 -13
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/WHEEL +1 -1
- lemonade_server/cli.py +4 -2
- lemonade_server/model_manager.py +34 -17
- lemonade_server/server_models.json +42 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.1.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py
CHANGED
@@ -7,11 +7,14 @@ import logging
 import traceback
 from typing import Optional, Union
 import json
+import subprocess
+from contextlib import asynccontextmanager
+from pathlib import Path
 
 from fastapi import FastAPI, HTTPException, status, Request
-from fastapi.responses import StreamingResponse
+from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
-from
+from fastapi.staticfiles import StaticFiles
 import uvicorn
 from transformers import TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
 from tabulate import tabulate
@@ -24,7 +27,11 @@ from openai.types.chat.chat_completion_message_tool_call import (
     Function,
 )
 from openai.types.chat.chat_completion import Choice
-from openai.types.chat.chat_completion_chunk import
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDelta,
+    ChoiceDeltaToolCall,
+    ChoiceDeltaToolCallFunction,
+)
 from openai.types.completion_choice import Logprobs
 from openai.types.model import Model
 from openai.types.responses import (
@@ -39,11 +46,18 @@ from openai.types.responses import (
 import lemonade.api as lemonade_api
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.management_tools import ManagementTool
-
+import lemonade.tools.server.llamacpp as llamacpp
+from lemonade.tools.server.pydantic_models import (
+    DEFAULT_MAX_NEW_TOKENS,
+    LoadConfig,
+    CompletionRequest,
+    ChatCompletionRequest,
+    ResponsesRequest,
+    PullConfig,
+)
+from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
+from lemonade.tools.server.instructions import get_instructions_html
 
-# Set to a high number to allow for interesting experiences in real apps
-# Tests should use the max_new_tokens argument to set a lower value
-DEFAULT_MAX_NEW_TOKENS = 1500
 
 DEFAULT_PORT = 8000
 DEFAULT_LOG_LEVEL = "info"
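The request and configuration models that serve.py previously defined inline (removed in the next hunk) are now imported from the new lemonade/tools/server/pydantic_models.py listed above. A minimal sketch of what that module presumably contains, mirroring the removed definitions rather than the released file:

    # Hypothetical reconstruction of lemonade/tools/server/pydantic_models.py,
    # mirroring the definitions removed from serve.py below; not verified
    # against the released file.
    from typing import Optional

    from pydantic import BaseModel

    # Set to a high number to allow for interesting experiences in real apps
    DEFAULT_MAX_NEW_TOKENS = 1500


    class PullConfig(BaseModel):
        """Configuration for installing a supported LLM."""

        model_name: str


    class LoadConfig(BaseModel):
        """Configuration for loading a language model."""

        model_name: Optional[str] = None
        checkpoint: Optional[str] = None
        max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
        recipe: Optional[str] = None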
@@ -101,97 +115,21 @@ class StopOnEvent(StoppingCriteria):
         return self.stop_event.is_set()
 
 
-class PullConfig(BaseModel):
-    """
-    Configurating for installing a supported LLM.
-    """
-
-    model_name: str
-
-
-class LoadConfig(BaseModel):
-    """
-    Configuration for loading a language model.
-
-    Specifies the model checkpoint, generation parameters,
-    and hardware/framework configuration (recipe) for model loading.
-    """
-
-    model_name: Optional[str] = None
-    checkpoint: Optional[str] = None
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
-    recipe: Optional[str] = None
-    # Indicates the maximum prompt length allowed for that specific
-    # checkpoint + recipe combination
-    max_prompt_length: Optional[int] = None
-    # Indicates whether the model is a reasoning model, like DeepSeek
-    reasoning: Optional[bool] = False
-
-
-class CompletionRequest(BaseModel):
-    """
-    Request model for text completion API endpoint.
-
-    Contains a prompt, a model identifier, and a streaming
-    flag to control response delivery.
-    """
-
-    prompt: str
-    model: str
-    echo: bool = False
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    max_tokens: int | None = None
-
-
-class ChatCompletionRequest(BaseModel):
-    """
-    Request model for chat completion API endpoint.
-
-    Contains a list of chat messages, a model identifier,
-    and a streaming flag to control response delivery.
-    """
-
-    messages: list[dict]
-    model: str
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    tools: list[dict] | None = None
-    max_tokens: int | None = None
-    max_completion_tokens: int | None = None
-
-
-class ResponsesRequest(BaseModel):
-    """
-    Request model for responses API endpoint.
-    """
-
-    input: list[dict] | str
-    model: str
-    max_output_tokens: int | None = None
-    temperature: float | None = None
-    stream: bool = False
-
-
 class Server(ManagementTool):
     """
     Open a web server that apps can use to communicate with the LLM.
 
     The server exposes these endpoints:
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
+    - /api/v1/pull: install an LLM by its Lemonade Server Model Name.
+    - /api/v1/load: load a model checkpoint.
+    - /api/v1/unload: unload a model checkpoint.
+    - /api/v1/health: check whether a model is loaded and ready to serve.
+    - /api/v1/stats: performance statistics for the generation.
+    - /api/v1/halt: stop an in-progress generation from make more tokens.
+    - /api/v1/completions: completion responses using HTTP chunked transfer encoding.
+    - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
+    - /api/v1/responses: responses API using HTTP chunked transfer encoding.
+    - /api/v1/models: list all available models.
     """
 
     unique_name = "serve"
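Because the OpenAI-compatible routes are now registered under both /api/v0 and /api/v1, any OpenAI client can point at the v1 prefix. An illustrative call, assuming the server is running on the default port 8000; the model name and API key are placeholders:

    # Illustrative client call against the new /api/v1 prefix; the model name
    # below is a placeholder, not a guaranteed Lemonade Server model.
    from openai import OpenAI

    client = OpenAI(
        base_url="http://localhost:8000/api/v1",
        api_key="lemonade",  # placeholder; a local server typically ignores this
    )

    response = client.chat.completions.create(
        model="Llama-3.2-1B-Instruct-Hybrid",  # placeholder model name
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)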
@@ -200,7 +138,7 @@ class Server(ManagementTool):
         super().__init__()
 
         # Initialize FastAPI app
-        self.app = FastAPI()
+        self.app = FastAPI(lifespan=lifespan)
 
         # Add CORS middleware
         self.app.add_middleware(
@@ -212,23 +150,18 @@ class Server(ManagementTool):
         )
 
         # Set up custom routes
-        self.
-        self.app.post("/api/v0/load")(self.load_llm)
-        self.app.post("/api/v0/unload")(self.unload_llm)
-        self.app.get("/api/v0/health")(self.health)
-        self.app.get("/api/v0/halt")(self.halt_generation)
-        self.app.get("/api/v0/stats")(self.send_stats)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.post("/api/v0/responses")(self.responses)
-
-        # Set up OpenAI-compatible routes
-        self.app.post("/api/v0/chat/completions")(self.chat_completions)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.get("/api/v0/models")(self.models)
+        self.setup_routes(["/api/v0", "/api/v1"])
 
         # Set up instructions
         self.app.get("/")(self.instructions)
 
+        # Mount a static assets dir for HTML responses, such
+        # as the instructions
+        static_dir = Path(__file__).parent / "static"
+        self.app.mount(
+            "/static", StaticFiles(directory=static_dir), name="static_assets"
+        )
+
         # Performance stats that are set during /ws and can be
         # fetched in /stats
         self.time_to_first_token = None
@@ -263,6 +196,28 @@ class Server(ManagementTool):
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()
 
+        # Subprocess handle for llama_server.exe
+        self.llama_server_process: subprocess.Popen = None
+
+        # Telemetry instance for llama server
+        self.llama_telemetry = llamacpp.LlamaTelemetry()
+
+    def setup_routes(self, api_prefixes: list[str]):
+        for prefix in api_prefixes:
+            # Custom routes
+            self.app.post(f"{prefix}/pull")(self.pull)
+            self.app.post(f"{prefix}/load")(self.load_llm)
+            self.app.post(f"{prefix}/unload")(self.unload_llm)
+            self.app.get(f"{prefix}/health")(self.health)
+            self.app.get(f"{prefix}/halt")(self.halt_generation)
+            self.app.get(f"{prefix}/stats")(self.send_stats)
+            self.app.post(f"{prefix}/completions")(self.completions)
+            self.app.post(f"{prefix}/responses")(self.responses)
+
+            # OpenAI-compatible routes
+            self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
+            self.app.get(f"{prefix}/models")(self.models)
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
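setup_routes registers every handler under each prefix by calling FastAPI's route decorators as plain functions. A self-contained sketch of the same pattern with a placeholder handler:

    # Minimal standalone demonstration of registering one handler under
    # several URL prefixes, the pattern setup_routes uses; the handler is a placeholder.
    from fastapi import FastAPI

    app = FastAPI()


    async def health() -> dict:
        # Placeholder health handler
        return {"status": "ok"}


    for prefix in ["/api/v0", "/api/v1"]:
        # app.get(...) returns a decorator; calling it directly registers `health`
        # at both /api/v0/health and /api/v1/health.
        app.get(f"{prefix}/health")(health)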
@@ -334,6 +289,10 @@ class Server(ManagementTool):
         # Print the elapsed time for each request
         self.setup_middleware_timer()
 
+        # Let the app know what port it's running on, so
+        # that the lifespan can access it
+        self.app.port = port
+
         uvicorn.run(self.app, host="localhost", port=port, log_level=log_level)
 
     async def _show_telemetry(self):
@@ -363,31 +322,8 @@ class Server(ManagementTool):
         """
         Show instructions on how to use the server.
         """
-
-
-        <html>
-            <head>
-                <title>Lemonade Server</title>
-                <link rel="icon" href="data:,">
-            </head>
-            <body>
-                <h1>🍋 Welcome to Lemonade Server!</h1>
-                <p>
-                    A standards-compliant server that provides REST APIs for LLM communication.
-                    To get started, simply point your OpenAI-compatible application at the server's endpoint.
-                </p>
-                <div class="links">
-                    <h3>Documentation:</h3>
-                    <ul>
-                        <li><a href="https://github.com/lemonade-sdk/lemonade/tree/main/docs/server/apps/README.md">Examples & Usage</a></li>
-                        <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_integration.md">Integration Guide</a></li>
-                        <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_spec.md">Server Specification</a></li>
-                    </ul>
-                </div>
-            </body>
-        </html>
-        """
-        return HTMLResponse(content=html_content, status_code=200)
+
+        return get_instructions_html(port=self.app.port)
 
     def initialize_load_config(
         self, request: Union[ChatCompletionRequest, CompletionRequest]
@@ -530,10 +466,6 @@ class Server(ManagementTool):
         Stream chat completion responses using HTTP chunked transfer encoding.
         """
 
-        if chat_completion_request.tools and chat_completion_request.stream:
-            logging.warning(
-                "tools are only supported on non-streaming chat completions"
-            )
         if chat_completion_request.logprobs:
             logging.warning("logprobs is not supported on chat completion")
 
@@ -542,14 +474,15 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc, internal_call=True)
 
+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.chat_completion(
+                chat_completion_request, self.llama_telemetry
+            )
+
         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
             chat_completion_request.messages,
-            tools=
-                chat_completion_request.tools
-                if not chat_completion_request.stream
-                else None
-            ),
+            tools=chat_completion_request.tools,
         )
 
         # If the model supports reasoning, we:
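With the old stream-only restriction on tools removed, tool definitions are now passed into the chat template for both streaming and non-streaming requests. For context, a sketch of how a Hugging Face tokenizer renders tool definitions into the prompt; recent transformers releases accept a tools argument on apply_chat_template, and the model name and tool schema here are illustrative (Lemonade's own apply_chat_template wrapper is not shown in this diff):

    # Sketch of rendering tools into a prompt with a Hugging Face chat template.
    # Model name and tool schema are illustrative only.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]

    text = tokenizer.apply_chat_template(
        [{"role": "user", "content": "What's the weather in Austin?"}],
        tools=tools,
        tokenize=False,
        add_generation_prompt=True,
    )
    print(text)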
@@ -585,6 +518,12 @@ class Server(ManagementTool):
             "max_new_tokens": max_new_tokens,
         }
 
+        if chat_completion_request.tools:
+            # Get the tool call pattern
+            tool_call_pattern = get_tool_call_pattern(
+                self.tokenizer.auto_tokenizer.added_tokens_decoder
+            )
+
         if chat_completion_request.stream:
 
             # Stream the response
@@ -594,7 +533,38 @@ class Server(ManagementTool):
                 # in the inner function
                 nonlocal reasoning_first_token
 
+                # Keep track of the full response for tool call extraction
+                full_response = ""
+
                 async for token in self._generate_tokens(**generation_args):
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:
+
+                        # Append the token to the full response
+                        full_response += token
+
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
+                        )
+
+                        # If there are tool calls, reset the full response for the next tool call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(tool_call["arguments"]),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
+                                )
 
                     # Create a ChatCompletionChunk
                     chunk = ChatCompletionChunk.model_construct(
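The streaming path now accumulates generated text and re-scans it for embedded tool calls on every token, resetting the buffer once a complete call is found. A simplified, standalone illustration of that accumulate-and-scan idea; the real parsing lives in lemonade/tools/server/tool_calls.py and derives its pattern from the tokenizer's special tokens:

    import json
    import re

    # Toy pattern: a tool call emitted as <tool_call>{...}</tool_call> in the text.
    # The real extract_tool_calls() is pattern-aware and handles more formats.
    TOOL_CALL_RE = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)


    def scan_for_tool_calls(buffer: str):
        """Return (tool_calls, remaining_text) for any complete tool calls in buffer."""
        calls = [json.loads(match) for match in TOOL_CALL_RE.findall(buffer)]
        return calls, TOOL_CALL_RE.sub("", buffer)


    buffer = ""
    tokens = [
        '<tool_call>{"name": "get_weather", ',
        '"arguments": {"city": "Austin"}}',
        "</tool_call>",
    ]
    for token in tokens:
        buffer += token
        tool_calls, _ = scan_for_tool_calls(buffer)
        if tool_calls:
            # A complete call was found: emit it and reset the buffer,
            # mirroring the `full_response = ""` reset in the diff above.
            print(tool_calls)
            buffer = ""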
@@ -613,7 +583,7 @@ class Server(ManagementTool):
                                ),
                                function_call=None,
                                role="assistant",
-                               tool_calls=
+                               tool_calls=openai_tool_calls,
                                refusal=None,
                            ),
                            finish_reason=None,
@@ -648,7 +618,7 @@ class Server(ManagementTool):
            openai_tool_calls = None
            if chat_completion_request.tools:
                tool_calls, full_response = extract_tool_calls(
-                    full_response,
+                    full_response, tool_call_pattern
                )
                if tool_calls:
                    openai_tool_calls = []
@@ -767,6 +737,7 @@ class Server(ManagementTool):
            created_event = ResponseCreatedEvent(
                response=response,
                type="response.created",
+                sequence_number=0,
            )
            yield f"data: {created_event.model_dump_json()}\n\n".encode("utf-8")
 
@@ -781,6 +752,7 @@ class Server(ManagementTool):
                    item_id="0 ",
                    output_index=0,
                    type="response.output_text.delta",
+                    sequence_number=0,
                )
                full_response += token
 
@@ -815,6 +787,7 @@ class Server(ManagementTool):
            completed_event = ResponseCompletedEvent(
                response=response,
                type="response.completed",
+                sequence_number=0,
            )
            yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")
 
@@ -1035,6 +1008,11 @@ class Server(ManagementTool):
        """
        Send performance statistics to the client.
        """
+        # If using llama server, get telemetry from the telemetry instance
+        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
+            return self.llama_telemetry.get_telemetry_data()
+
+        # For built-in server, use the existing telemetry
        return {
            "time_to_first_token": self.time_to_first_token,
            "tokens_per_second": self.tokens_per_second,
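In both cases clients read the numbers from the stats route. An illustrative query, assuming the default port; the two fields printed are the ones named in the built-in telemetry above, and the fields returned by the llama.cpp telemetry path are not shown in this diff:

    # Illustrative stats query against a locally running Lemonade Server.
    import requests

    stats = requests.get("http://localhost:8000/api/v1/stats", timeout=10).json()
    print(stats.get("time_to_first_token"), stats.get("tokens_per_second"))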
@@ -1246,9 +1224,17 @@ class Server(ManagementTool):
 
         logging.info(f"Loading llm: {model_reference}")
         try:
-
-
-
+            if config_to_use.recipe == "llamacpp":
+                self.llama_server_process = llamacpp.server_load(
+                    checkpoint=config_to_use.checkpoint,
+                    model_reference=model_reference,
+                    telemetry=self.llama_telemetry,
+                )
+
+            else:
+                self.model, self.tokenizer = lemonade_api.from_pretrained(
+                    checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
+                )
             self.llm_loaded = config_to_use
 
             return {
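llamacpp.server_load lives in the new lemonade/tools/server/llamacpp.py, which is not shown in this diff. The general pattern is to spawn the llama.cpp server binary as a child process and keep the Popen handle so it can be terminated on unload; a hedged sketch of that pattern only, not the actual implementation (binary name, flags, and port are placeholders):

    # General spawn-and-track pattern; the executable path, flags, and port are
    # placeholders and do not reflect lemonade/tools/server/llamacpp.py.
    import subprocess


    def start_llama_server(model_path: str, port: int = 8081) -> subprocess.Popen:
        process = subprocess.Popen(
            ["llama-server", "--model", model_path, "--port", str(port)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
        return process


    # Later, mirroring unload_llm in the next hunk:
    # process.terminate()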
@@ -1279,6 +1265,9 @@ class Server(ManagementTool):
         for _ in range(self.max_concurrent_generations):
             await self._generate_semaphore.acquire()
 
+        if self.llm_loaded.recipe == "llamacpp":
+            self.llama_server_process.terminate()
+
         self.llm_loaded = None
         self.tokenizer = None
         self.model = None
@@ -1350,5 +1339,22 @@ class Server(ManagementTool):
         return response
 
 
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Code here will run when the application starts up
+
+    logging.info(
+        "\n"
+        "\n"
+        "🍋 Lemonade Server Ready!\n"
+        f"🍋 Open http://localhost:{app.port} in your browser for:\n"
+        "🍋 💬 chat\n"
+        "🍋 💻 model management\n"
+        "🍋 📄 docs\n"
+    )
+
+    yield
+
+
 # This file was originally licensed under Apache 2.0. It has been modified.
 # Modifications Copyright (c) 2025 AMD