lemonade-sdk 7.0.0-py3-none-any.whl → 7.0.2-py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Potentially problematic release: this version of lemonade-sdk has been flagged as possibly problematic.
- lemonade/cli.py +2 -0
- lemonade/tools/accuracy.py +335 -0
- lemonade/tools/server/instructions.py +294 -0
- lemonade/tools/server/llamacpp.py +315 -0
- lemonade/tools/server/port_utils.py +57 -0
- lemonade/tools/server/pydantic_models.py +83 -0
- lemonade/tools/server/serve.py +225 -167
- lemonade/tools/server/static/styles.css +313 -0
- lemonade/tools/server/thread_utils.py +87 -0
- lemonade/tools/server/tool_calls.py +50 -43
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/METADATA +4 -7
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/RECORD +21 -14
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/WHEEL +1 -1
- lemonade_server/cli.py +4 -2
- lemonade_server/model_manager.py +34 -17
- lemonade_server/server_models.json +52 -3
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py
CHANGED
@@ -7,12 +7,16 @@ import logging
 import traceback
 from typing import Optional, Union
 import json
+import subprocess
+from pathlib import Path

 from fastapi import FastAPI, HTTPException, status, Request
-from fastapi.responses import StreamingResponse
+from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
-from
+from fastapi.staticfiles import StaticFiles
 import uvicorn
+from uvicorn.config import Config
+from uvicorn.server import Server as UvicornServer
 from transformers import TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
 from tabulate import tabulate

@@ -24,7 +28,11 @@ from openai.types.chat.chat_completion_message_tool_call import (
     Function,
 )
 from openai.types.chat.chat_completion import Choice
-from openai.types.chat.chat_completion_chunk import
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDelta,
+    ChoiceDeltaToolCall,
+    ChoiceDeltaToolCallFunction,
+)
 from openai.types.completion_choice import Logprobs
 from openai.types.model import Model
 from openai.types.responses import (
@@ -39,11 +47,18 @@ from openai.types.responses import (
 import lemonade.api as lemonade_api
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.management_tools import ManagementTool
-
-
-
-
-
+import lemonade.tools.server.llamacpp as llamacpp
+from lemonade.tools.server.pydantic_models import (
+    DEFAULT_MAX_NEW_TOKENS,
+    LoadConfig,
+    CompletionRequest,
+    ChatCompletionRequest,
+    ResponsesRequest,
+    PullConfig,
+)
+from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
+from lemonade.tools.server.instructions import get_instructions_html
+from lemonade.tools.server.port_utils import lifespan

 DEFAULT_PORT = 8000
 DEFAULT_LOG_LEVEL = "info"
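The request and configuration schemas (PullConfig, LoadConfig, CompletionRequest, ChatCompletionRequest, ResponsesRequest) now live in the new lemonade.tools.server.pydantic_models module and are imported here rather than defined in serve.py; their old definitions are removed further down in this diff. A minimal sketch (not part of the diff) of constructing these models in 7.0.2, assuming the field names match those removed definitions; the model name and checkpoint strings are placeholders:

# Sketch: build the schema objects from their new home.
# Field names are taken from the class bodies removed from serve.py below;
# the concrete values are hypothetical placeholders.
from lemonade.tools.server.pydantic_models import LoadConfig, ChatCompletionRequest

load_cfg = LoadConfig(
    model_name="placeholder-model-name",
    checkpoint="placeholder-org/placeholder-checkpoint",
    recipe="llamacpp",  # the recipe string used by the new llama.cpp code path
)

chat_req = ChatCompletionRequest(
    model="placeholder-model-name",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)

print(load_cfg.model_dump())
print(chat_req.model_dump())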
@@ -101,97 +116,21 @@ class StopOnEvent(StoppingCriteria):
         return self.stop_event.is_set()


-class PullConfig(BaseModel):
-    """
-    Configurating for installing a supported LLM.
-    """
-
-    model_name: str
-
-
-class LoadConfig(BaseModel):
-    """
-    Configuration for loading a language model.
-
-    Specifies the model checkpoint, generation parameters,
-    and hardware/framework configuration (recipe) for model loading.
-    """
-
-    model_name: Optional[str] = None
-    checkpoint: Optional[str] = None
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
-    recipe: Optional[str] = None
-    # Indicates the maximum prompt length allowed for that specific
-    # checkpoint + recipe combination
-    max_prompt_length: Optional[int] = None
-    # Indicates whether the model is a reasoning model, like DeepSeek
-    reasoning: Optional[bool] = False
-
-
-class CompletionRequest(BaseModel):
-    """
-    Request model for text completion API endpoint.
-
-    Contains a prompt, a model identifier, and a streaming
-    flag to control response delivery.
-    """
-
-    prompt: str
-    model: str
-    echo: bool = False
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    max_tokens: int | None = None
-
-
-class ChatCompletionRequest(BaseModel):
-    """
-    Request model for chat completion API endpoint.
-
-    Contains a list of chat messages, a model identifier,
-    and a streaming flag to control response delivery.
-    """
-
-    messages: list[dict]
-    model: str
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    tools: list[dict] | None = None
-    max_tokens: int | None = None
-    max_completion_tokens: int | None = None
-
-
-class ResponsesRequest(BaseModel):
-    """
-    Request model for responses API endpoint.
-    """
-
-    input: list[dict] | str
-    model: str
-    max_output_tokens: int | None = None
-    temperature: float | None = None
-    stream: bool = False
-
-
 class Server(ManagementTool):
     """
     Open a web server that apps can use to communicate with the LLM.

     The server exposes these endpoints:
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
+    - /api/v1/pull: install an LLM by its Lemonade Server Model Name.
+    - /api/v1/load: load a model checkpoint.
+    - /api/v1/unload: unload a model checkpoint.
+    - /api/v1/health: check whether a model is loaded and ready to serve.
+    - /api/v1/stats: performance statistics for the generation.
+    - /api/v1/halt: stop an in-progress generation from make more tokens.
+    - /api/v1/completions: completion responses using HTTP chunked transfer encoding.
+    - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
+    - /api/v1/responses: responses API using HTTP chunked transfer encoding.
+    - /api/v1/models: list all available models.
     """

     unique_name = "serve"
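The docstring above now lists the endpoints under the new versioned /api/v1 prefix. A small usage sketch (not part of the diff) that exercises two of them with the requests library, assuming a Lemonade Server instance is already listening on the DEFAULT_PORT of 8000:

# Sketch: query two of the endpoints listed in the Server docstring.
import requests

BASE = "http://localhost:8000/api/v1"

# /health: check whether a model is loaded and ready to serve
print(requests.get(f"{BASE}/health").json())

# /models: list all available models (OpenAI-compatible shape)
print(requests.get(f"{BASE}/models").json())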
@@ -200,7 +139,7 @@ class Server(ManagementTool):
         super().__init__()

         # Initialize FastAPI app
-        self.app = FastAPI()
+        self.app = FastAPI(lifespan=lifespan)

         # Add CORS middleware
         self.app.add_middleware(
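The FastAPI app is now created with a lifespan handler imported from the new lemonade.tools.server.port_utils module, whose contents are not part of this hunk. The following is only a generic sketch of the FastAPI lifespan pattern that line plugs into; the handler body is a hypothetical placeholder that reads the app.port attribute set later in _setup_server_common:

# Generic FastAPI lifespan sketch; the real handler lives in
# lemonade/tools/server/port_utils.py and is not shown in this diff.
from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: hypothetical use of the port stored on the app
    # (serve.py sets self.app.port = port in _setup_server_common)
    print(f"starting on port {getattr(app, 'port', None)}")
    yield
    # Shutdown: cleanup would go here
    print("shutting down")


app = FastAPI(lifespan=lifespan)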
@@ -212,23 +151,18 @@ class Server(ManagementTool):
         )

         # Set up custom routes
-        self.
-        self.app.post("/api/v0/load")(self.load_llm)
-        self.app.post("/api/v0/unload")(self.unload_llm)
-        self.app.get("/api/v0/health")(self.health)
-        self.app.get("/api/v0/halt")(self.halt_generation)
-        self.app.get("/api/v0/stats")(self.send_stats)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.post("/api/v0/responses")(self.responses)
-
-        # Set up OpenAI-compatible routes
-        self.app.post("/api/v0/chat/completions")(self.chat_completions)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.get("/api/v0/models")(self.models)
+        self.setup_routes(["/api/v0", "/api/v1"])

         # Set up instructions
         self.app.get("/")(self.instructions)

+        # Mount a static assets dir for HTML responses, such
+        # as the instructions
+        static_dir = Path(__file__).parent / "static"
+        self.app.mount(
+            "/static", StaticFiles(directory=static_dir), name="static_assets"
+        )
+
         # Performance stats that are set during /ws and can be
         # fetched in /stats
         self.time_to_first_token = None
@@ -263,6 +197,28 @@ class Server(ManagementTool):
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()

+        # Subprocess handle for llama_server.exe
+        self.llama_server_process: subprocess.Popen = None
+
+        # Telemetry instance for llama server
+        self.llama_telemetry = llamacpp.LlamaTelemetry()
+
+    def setup_routes(self, api_prefixes: list[str]):
+        for prefix in api_prefixes:
+            # Custom routes
+            self.app.post(f"{prefix}/pull")(self.pull)
+            self.app.post(f"{prefix}/load")(self.load_llm)
+            self.app.post(f"{prefix}/unload")(self.unload_llm)
+            self.app.get(f"{prefix}/health")(self.health)
+            self.app.get(f"{prefix}/halt")(self.halt_generation)
+            self.app.get(f"{prefix}/stats")(self.send_stats)
+            self.app.post(f"{prefix}/completions")(self.completions)
+            self.app.post(f"{prefix}/responses")(self.responses)
+
+            # OpenAI-compatible routes
+            self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
+            self.app.get(f"{prefix}/models")(self.models)
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
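setup_routes registers each handler once per prefix, so the legacy /api/v0 paths and the new /api/v1 paths are served by the same bound methods. A sketch (not part of the diff) of checking that with FastAPI's TestClient; constructing Server() with no arguments is an assumption based on the __init__ shown above, and TestClient requires the httpx package:

# Sketch: the same route answers under both prefixes.
from fastapi.testclient import TestClient

from lemonade.tools.server.serve import Server

server = Server()  # assumed to need no constructor arguments
client = TestClient(server.app)

for prefix in ("/api/v0", "/api/v1"):
    response = client.get(f"{prefix}/models")
    print(prefix, response.status_code)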
@@ -288,15 +244,22 @@ class Server(ManagementTool):

         return parser

-    def
+    def _setup_server_common(
         self,
-
-        # we always use the default cache directory
-        _=None,
-        port: int = DEFAULT_PORT,
-        log_level: str = DEFAULT_LOG_LEVEL,
+        port: int,
         truncate_inputs: bool = False,
+        log_level: str = DEFAULT_LOG_LEVEL,
+        threaded_mode: bool = False,
     ):
+        """
+        Common setup logic shared between run() and run_in_thread().
+
+        Args:
+            port: Port number for the server
+            truncate_inputs: Whether to truncate inputs if they exceed max length
+            log_level: Logging level to configure
+            threaded_mode: Whether this is being set up for threaded execution
+        """
         # Store truncation settings
         self.truncate_inputs = truncate_inputs

@@ -310,22 +273,27 @@ class Server(ManagementTool):

         logging.trace = trace

-        # Configure logging
-
-
-
-
-
-
+        # Configure logging based on mode
+        if threaded_mode:
+            # Configure logging for warning level (to reduce noise in threaded execution)
+            logging.getLogger("uvicorn.error").setLevel(logging.WARNING)
+        else:
+            # Configure logging to match uvicorn's format
+            logging_level = getattr(logging, log_level.upper())
+            logging.basicConfig(
+                level=logging_level,
+                format="%(levelprefix)s %(message)s",
+                datefmt="%Y-%m-%d %H:%M:%S",
+            )

-
-
-
-
-
+            # Add uvicorn's log formatter
+            logging.root.handlers[0].formatter = uvicorn.logging.DefaultFormatter(
+                fmt="%(levelprefix)s %(message)s",
+                use_colors=True,
+            )

-
-
+            # Ensure the log level is properly set
+            logging.getLogger().setLevel(logging_level)

         # Update debug logging state after setting log level
         self.debug_logging_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)
@@ -334,8 +302,66 @@ class Server(ManagementTool):
         # Print the elapsed time for each request
         self.setup_middleware_timer()

+        # Let the app know what port it's running on, so
+        # that the lifespan can access it
+        self.app.port = port
+
+    def run(
+        self,
+        # ManagementTool has a required cache_dir arg, but
+        # we always use the default cache directory
+        _=None,
+        port: int = DEFAULT_PORT,
+        log_level: str = DEFAULT_LOG_LEVEL,
+        truncate_inputs: bool = False,
+    ):
+        # Common setup
+        self._setup_server_common(
+            port=port,
+            truncate_inputs=truncate_inputs,
+            log_level=log_level,
+            threaded_mode=False,
+        )
+
         uvicorn.run(self.app, host="localhost", port=port, log_level=log_level)

+    def run_in_thread(
+        self,
+        port: int = DEFAULT_PORT,
+        host: str = "localhost",
+        log_level: str = "warning",
+        truncate_inputs: bool = False,
+    ):
+        """
+        Set up the server for running in a thread.
+        Returns a uvicorn server instance that can be controlled externally.
+        """
+        # Common setup
+        self._setup_server_common(
+            port=port,
+            truncate_inputs=truncate_inputs,
+            log_level=log_level,
+            threaded_mode=True,
+        )
+
+        class CustomServer(UvicornServer):
+            """Custom Uvicorn server that can be properly shutdown from another thread"""
+
+            def install_signal_handlers(self):
+                pass
+
+        # Configure the server
+        config = Config(
+            app=self.app,
+            host=host,
+            port=port,
+            log_level=log_level,
+            log_config=None,
+        )
+
+        # Create and return the uvicorn server
+        return CustomServer(config=config)
+
     async def _show_telemetry(self):
         """
         Show telemetry data in debug mode.
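run_in_thread performs the shared setup and hands back a uvicorn server object instead of blocking, so the caller owns the thread. A usage sketch (not part of the diff) that assumes the returned CustomServer behaves like a standard uvicorn.Server, where run() blocks and setting should_exit requests shutdown:

# Sketch: drive the threaded server from a test or script.
import threading
import time

from lemonade.tools.server.serve import Server

server = Server()
uvicorn_server = server.run_in_thread(port=8123, log_level="warning")

thread = threading.Thread(target=uvicorn_server.run, daemon=True)
thread.start()

# ... issue requests against http://localhost:8123/api/v1/... here ...
time.sleep(1)

# Request shutdown and wait for the server thread to finish
uvicorn_server.should_exit = True
thread.join(timeout=10)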
@@ -363,31 +389,8 @@ class Server(ManagementTool):
         """
         Show instructions on how to use the server.
         """
-
-
-        <html>
-        <head>
-            <title>Lemonade Server</title>
-            <link rel="icon" href="data:,">
-        </head>
-        <body>
-            <h1>🍋 Welcome to Lemonade Server!</h1>
-            <p>
-                A standards-compliant server that provides REST APIs for LLM communication.
-                To get started, simply point your OpenAI-compatible application at the server's endpoint.
-            </p>
-            <div class="links">
-                <h3>Documentation:</h3>
-                <ul>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/tree/main/docs/server/apps/README.md">Examples & Usage</a></li>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_integration.md">Integration Guide</a></li>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_spec.md">Server Specification</a></li>
-                </ul>
-            </div>
-        </body>
-        </html>
-        """
-        return HTMLResponse(content=html_content, status_code=200)
+
+        return get_instructions_html(port=self.app.port)

     def initialize_load_config(
         self, request: Union[ChatCompletionRequest, CompletionRequest]
@@ -530,10 +533,6 @@ class Server(ManagementTool):
         Stream chat completion responses using HTTP chunked transfer encoding.
         """

-        if chat_completion_request.tools and chat_completion_request.stream:
-            logging.warning(
-                "tools are only supported on non-streaming chat completions"
-            )
         if chat_completion_request.logprobs:
             logging.warning("logprobs is not supported on chat completion")

@@ -542,14 +541,15 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc, internal_call=True)

+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.chat_completion(
+                chat_completion_request, self.llama_telemetry
+            )
+
         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
             chat_completion_request.messages,
-            tools=
-                chat_completion_request.tools
-                if not chat_completion_request.stream
-                else None
-            ),
+            tools=chat_completion_request.tools,
         )

         # If the model supports reasoning, we:
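When the loaded model's recipe is llamacpp, chat completions are delegated to the new llamacpp module (which fronts an external llama server process) instead of the built-in generation path; the client-facing API is unchanged. A hedged client sketch (not part of the diff) with the openai package; the model name is a placeholder that must match a model the server knows about:

# Sketch: the same OpenAI-style call works regardless of whether the server
# routes it to the built-in generator or to the llama.cpp backend.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/api/v1",
    api_key="lemonade",  # placeholder; a local Lemonade Server is not expected to validate it
)

completion = client.chat.completions.create(
    model="placeholder-model-name",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(completion.choices[0].message.content)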
@@ -585,6 +585,12 @@ class Server(ManagementTool):
             "max_new_tokens": max_new_tokens,
         }

+        if chat_completion_request.tools:
+            # Get the tool call pattern
+            tool_call_pattern = get_tool_call_pattern(
+                self.tokenizer.auto_tokenizer.added_tokens_decoder
+            )
+
         if chat_completion_request.stream:

             # Stream the response
@@ -594,7 +600,38 @@ class Server(ManagementTool):
                 # in the inner function
                 nonlocal reasoning_first_token

+                # Keep track of the full response for tool call extraction
+                full_response = ""
+
                 async for token in self._generate_tokens(**generation_args):
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:
+
+                        # Append the token to the full response
+                        full_response += token
+
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
+                        )
+
+                        # If there are tool calls, reset the full response for the next tool call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(tool_call["arguments"]),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
+                                )

                     # Create a ChatCompletionChunk
                     chunk = ChatCompletionChunk.model_construct(
@@ -613,7 +650,7 @@ class Server(ManagementTool):
                                 ),
                                 function_call=None,
                                 role="assistant",
-                                tool_calls=
+                                tool_calls=openai_tool_calls,
                                 refusal=None,
                             ),
                             finish_reason=None,
@@ -648,7 +685,7 @@ class Server(ManagementTool):
         openai_tool_calls = None
         if chat_completion_request.tools:
             tool_calls, full_response = extract_tool_calls(
-                full_response,
+                full_response, tool_call_pattern
             )
             if tool_calls:
                 openai_tool_calls = []
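Together with dropping the earlier warning, the streaming generator above scans the accumulated text for tool-call patterns and emits any matches as ChoiceDeltaToolCall entries on the chunk deltas, so tools now work with stream=True. A client-side sketch (not part of the diff) using the openai package; the tool definition and model name are illustrative placeholders:

# Sketch: consume streamed tool calls from the server.
import json
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # placeholder tool
            "description": "Get the weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

stream = client.chat.completions.create(
    model="placeholder-model-name",
    messages=[{"role": "user", "content": "What's the weather in Lisbon?"}],
    tools=tools,
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.tool_calls:
        for call in delta.tool_calls:
            print("tool call:", call.function.name, json.loads(call.function.arguments))
    elif delta.content:
        print(delta.content, end="", flush=True)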
@@ -767,6 +804,7 @@ class Server(ManagementTool):
             created_event = ResponseCreatedEvent(
                 response=response,
                 type="response.created",
+                sequence_number=0,
             )
             yield f"data: {created_event.model_dump_json()}\n\n".encode("utf-8")

@@ -781,6 +819,7 @@ class Server(ManagementTool):
                     item_id="0 ",
                     output_index=0,
                     type="response.output_text.delta",
+                    sequence_number=0,
                 )
                 full_response += token

@@ -815,6 +854,7 @@ class Server(ManagementTool):
             completed_event = ResponseCompletedEvent(
                 response=response,
                 type="response.completed",
+                sequence_number=0,
             )
             yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")

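These three hunks add the sequence_number field that newer versions of the openai event types require when constructing Responses API events. On the wire the endpoint still emits data: ... chunks over HTTP chunked transfer; a sketch (not part of the diff) of reading them directly with requests, where the model name is a placeholder:

# Sketch: stream the /api/v1/responses endpoint directly.
import json
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/responses",
    json={
        "model": "placeholder-model-name",
        "input": "Write a haiku about lemons.",
        "stream": True,
    },
    stream=True,
)

for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    event = json.loads(line[len(b"data: "):])
    # Events mirror the openai Responses API: response.created,
    # response.output_text.delta, response.completed, each with a sequence_number
    if event.get("type") == "response.output_text.delta":
        print(event.get("delta", ""), end="", flush=True)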
@@ -1035,6 +1075,11 @@ class Server(ManagementTool):
         """
         Send performance statistics to the client.
         """
+        # If using llama server, get telemetry from the telemetry instance
+        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
+            return self.llama_telemetry.get_telemetry_data()
+
+        # For built-in server, use the existing telemetry
         return {
             "time_to_first_token": self.time_to_first_token,
             "tokens_per_second": self.tokens_per_second,
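send_stats now returns telemetry from the llama.cpp backend when a llamacpp-recipe model is loaded and falls back to the built-in counters otherwise; the exact shape of the llama.cpp payload comes from LlamaTelemetry.get_telemetry_data() in the new llamacpp module and is not shown here. Fetching it looks the same either way (sketch, not part of the diff):

# Sketch: read generation telemetry after a request.
import requests

stats = requests.get("http://localhost:8000/api/v1/stats").json()
# The built-in backend reports fields such as time_to_first_token and
# tokens_per_second; the llama.cpp backend returns its own telemetry dict.
print(stats)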
@@ -1246,15 +1291,25 @@ class Server(ManagementTool):

         logging.info(f"Loading llm: {model_reference}")
         try:
-
-
-
+            if config_to_use.recipe == "llamacpp":
+                self.llama_server_process = llamacpp.server_load(
+                    checkpoint=config_to_use.checkpoint,
+                    model_reference=model_reference,
+                    telemetry=self.llama_telemetry,
+                )
+
+            else:
+                self.model, self.tokenizer = lemonade_api.from_pretrained(
+                    checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
+                )
             self.llm_loaded = config_to_use

             return {
                 "status": "success",
                 "message": f"Loaded model: {model_reference}",
             }
+        except HTTPException:
+            raise
         except Exception:  # pylint: disable=broad-exception-caught
             self.model_load_failure(model_reference)

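load_llm now branches on the recipe: llamacpp checkpoints are launched as an external llama server subprocess via llamacpp.server_load, every other recipe keeps using lemonade_api.from_pretrained, and HTTPExceptions raised inside the block are re-raised rather than being converted into a generic load failure. A request sketch against the endpoint (not part of the diff); the payload fields follow the LoadConfig model referenced earlier and the values are placeholders:

# Sketch: ask the server to load a llama.cpp-backed model.
import requests

payload = {
    "model_name": "placeholder-model-name",
    "checkpoint": "placeholder-org/placeholder-model-GGUF",
    "recipe": "llamacpp",
}
print(requests.post("http://localhost:8000/api/v1/load", json=payload).json())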
@@ -1279,6 +1334,9 @@ class Server(ManagementTool):
         for _ in range(self.max_concurrent_generations):
             await self._generate_semaphore.acquire()

+        if self.llm_loaded.recipe == "llamacpp":
+            self.llama_server_process.terminate()
+
         self.llm_loaded = None
         self.tokenizer = None
         self.model = None
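Unloading now also terminates the llama server subprocess when the loaded recipe is llamacpp; from the client it remains a single call. A sketch (not part of the diff) assuming the unload endpoint accepts an empty POST body, which the route registration above suggests:

# Sketch: unload whatever model is currently loaded.
import requests

print(requests.post("http://localhost:8000/api/v1/unload").json())
# A follow-up health check should report that no model is loaded.
print(requests.get("http://localhost:8000/api/v1/health").json())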