lemonade-sdk 8.1.5-py3-none-any.whl → 8.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- lemonade/tools/llamacpp/utils.py +5 -1
- lemonade/tools/server/llamacpp.py +164 -562
- lemonade/tools/server/serve.py +15 -22
- lemonade/tools/server/wrapped_server.py +485 -0
- lemonade/version.py +1 -1
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/METADATA +1 -1
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/RECORD +14 -13
- lemonade_server/cli.py +18 -9
- lemonade_server/model_manager.py +201 -20
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py
CHANGED
@@ -9,7 +9,6 @@ import tempfile
 import traceback
 from typing import Optional, Union
 import json
-import subprocess
 from pathlib import Path

 from fastapi import FastAPI, HTTPException, status, Request
@@ -47,7 +46,8 @@ from openai.types.responses import (
 )

 import lemonade.api as lemonade_api
-
+from lemonade.tools.server.wrapped_server import WrappedServer
+from lemonade.tools.server.llamacpp import LlamaServer
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.webapp import get_webapp_html
 from lemonade.tools.server.utils.port import lifespan
@@ -232,11 +232,8 @@ class Server:
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()

-        # Subprocess handle for llama_server.exe
-        self.
-
-        # Telemetry instance for llama server
-        self.llama_telemetry = llamacpp.LlamaTelemetry()
+        # Subprocess handle for wrapped instance of llama_server.exe, etc.
+        self.wrapped_server: WrappedServer = None

     def setup_routes(self, api_prefixes: list[str]):
         for prefix in api_prefixes:
@@ -521,7 +518,7 @@ class Server:
         await self.load_llm(lc)

         if self.llm_loaded.recipe == "llamacpp":
-            return
+            return self.wrapped_server.completion(completion_request)

         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
@@ -656,9 +653,7 @@ class Server:
         await self.load_llm(lc)

         if self.llm_loaded.recipe == "llamacpp":
-            return
-                chat_completion_request, self.llama_telemetry
-            )
+            return self.wrapped_server.chat_completion(chat_completion_request)

         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
@@ -861,7 +856,7 @@ class Server:

         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return
+                return self.wrapped_server.embeddings(embeddings_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has embeddings label
                 model_info = ModelManager().supported_models.get(
@@ -884,7 +879,7 @@ class Server:

     async def reranking(self, reranking_request: RerankingRequest):
         """
-        Rerank documents based on their relevance to a query
+        Rerank documents based on their relevance to a query.
         """
         # Initialize load config from reranking request
         lc = LoadConfig(model_name=reranking_request.model)
@@ -894,7 +889,7 @@ class Server:

         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return
+                return self.wrapped_server.reranking(reranking_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has reranking label
                 model_info = ModelManager().supported_models.get(
@@ -1287,7 +1282,7 @@ class Server:
         """
         # If using llama server, get telemetry from the telemetry instance
         if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
-            return self.
+            return self.wrapped_server.telemetry.get_telemetry_data()

         # For built-in server, use the existing telemetry
         return {
@@ -1466,9 +1461,9 @@ class Server:
         ):
             if (
                 self.llm_loaded.recipe == "llamacpp"
-                and self.
+                and self.wrapped_server.process.poll()
             ):
-                #
+                # wrapped server process has gone away for some reason, so we should
                 # proceed with loading to get it back
                 pass
             else:
@@ -1484,12 +1479,10 @@ class Server:
         logging.info(f"Loading llm: {config.model_name}")
         try:
             if config_to_use.recipe == "llamacpp":
-                self.
+                self.wrapped_server = LlamaServer(self.llamacpp_backend)
+                self.wrapped_server.load(
                     model_config=config_to_use,
-                    telemetry=self.llama_telemetry,
-                    backend=self.llamacpp_backend,
                     ctx_size=self.ctx_size,
-                    # Models should only upgrade when using the pull endpoint
                     do_not_upgrade=True,
                 )

@@ -1530,7 +1523,7 @@ class Server:
         await self._generate_semaphore.acquire()

         if self.llm_loaded.recipe == "llamacpp":
-            self.
+            self.wrapped_server.process.terminate()

         self.llm_loaded = None
         self.tokenizer = None
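Taken together, the serve.py hunks above swap the llama-server-specific plumbing (a raw subprocess handle plus a llama_telemetry instance) for a single self.wrapped_server handle that every llamacpp code path delegates to. The sketch below assembles that flow in one place. SketchServer is a hypothetical, heavily simplified stand-in for the real Server class (which also handles locking, semaphores, and error reporting); the WrappedServer and LlamaServer names and the call signatures are taken from the hunks above.

from lemonade.tools.server.wrapped_server import WrappedServer
from lemonade.tools.server.llamacpp import LlamaServer


class SketchServer:
    """Hypothetical, simplified stand-in for lemonade.tools.server.serve.Server."""

    def __init__(self, llamacpp_backend, ctx_size: int):
        self.llamacpp_backend = llamacpp_backend
        self.ctx_size = ctx_size
        # Subprocess handle for the wrapped instance of llama_server.exe, etc.
        self.wrapped_server: WrappedServer = None

    def load_llamacpp(self, config_to_use):
        # Construct the wrapper, then let it install the server, download the
        # model, launch the subprocess, and wait for /health to report ready.
        self.wrapped_server = LlamaServer(self.llamacpp_backend)
        self.wrapped_server.load(
            model_config=config_to_use,
            ctx_size=self.ctx_size,
            do_not_upgrade=True,
        )

    def chat_completions(self, chat_completion_request):
        # The OpenAI-compatible endpoints now forward llamacpp requests to the
        # wrapped server instead of calling llamacpp helpers directly.
        return self.wrapped_server.chat_completion(chat_completion_request)

    def unload(self):
        if self.wrapped_server and self.wrapped_server.process:
            self.wrapped_server.process.terminate()
        self.wrapped_server = None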
lemonade/tools/server/wrapped_server.py
ADDED
@@ -0,0 +1,485 @@
+import logging
+import time
+import subprocess
+from abc import ABC, abstractmethod
+
+import requests
+from tabulate import tabulate
+from fastapi import HTTPException, status
+from fastapi.responses import StreamingResponse
+
+from openai import OpenAI
+
+from lemonade_server.pydantic_models import (
+    ChatCompletionRequest,
+    CompletionRequest,
+    PullConfig,
+    EmbeddingsRequest,
+    RerankingRequest,
+)
+from lemonade_server.model_manager import ModelManager
+from lemonade.tools.server.utils.port import find_free_port
+
+
+class WrappedServerTelemetry(ABC):
+    """
+    Manages telemetry data collection and display for wrapped server.
+    """
+
+    def __init__(self):
+        self.input_tokens = None
+        self.output_tokens = None
+        self.time_to_first_token = None
+        self.tokens_per_second = None
+        self.prompt_eval_time = None
+        self.eval_time = None
+
+    @abstractmethod
+    def parse_telemetry_line(self, line: str):
+        """
+        Parse telemetry data from wrapped server output lines.
+        """
+
+    def get_telemetry_data(self):
+        return {
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "time_to_first_token": self.time_to_first_token,
+            "tokens_per_second": self.tokens_per_second,
+            "decode_token_times": None,
+        }
+
+    def show_telemetry(self):
+        # Check if debug logging is enabled
+        if not logging.getLogger().isEnabledFor(logging.DEBUG):
+            return
+
+        # Prepare telemetry data (transposed format)
+        telemetry = [
+            ["Input tokens", self.input_tokens],
+            ["Output tokens", self.output_tokens],
+            ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
+            ["TPS", f"{self.tokens_per_second:.2f}"],
+        ]
+
+        table = tabulate(
+            telemetry, headers=["Metric", "Value"], tablefmt="fancy_grid"
+        ).split("\n")
+
+        # Show telemetry in debug while complying with uvicorn's log indentation
+        logging.debug("\n ".join(table))
+
+
+class WrappedServer(ABC):
+    """
+    Abstract base class that defines the interface for Lemonade to "wrap" a server
+    like llama-server.
+    """
+
+    def __init__(self, server_name: str, telemetry: WrappedServerTelemetry):
+        self.port: int = None
+        self.process: subprocess.Popen = None
+        self.server_name: str = server_name
+        self.telemetry: WrappedServerTelemetry = telemetry
+
+    def choose_port(self):
+        """
+        Users probably don't care what port we start the wrapped server on, so let's
+        search for an empty port
+        """
+
+        self.port = find_free_port()
+
+        if self.port is None:
+            msg = f"Failed to find an empty port to start {self.server_name} on"
+            logging.error(msg)
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=msg,
+            )
+
+    def address(self) -> str:
+        """
+        Generate the base URL for the server.
+
+        Returns:
+            The base URL for the wrapped server
+        """
+        return f"http://127.0.0.1:{self.port}/v1"
+
+    def _separate_openai_params(
+        self,
+        request_dict: dict,
+        endpoint_type: str = "chat",
+    ) -> dict:
+        """
+        Separate standard OpenAI parameters from custom wrapped server parameters.
+
+        Args:
+            request_dict: Dictionary of all request parameters
+            endpoint_type: Type of endpoint ("chat" or "completion")
+
+        Returns:
+            Dictionary with parameters properly separated for OpenAI client
+        """
+        openai_client_params = {}
+        extra_params = {}
+
+        # Common OpenAI parameters for both endpoint types
+        common_params = {
+            "model",
+            "frequency_penalty",
+            "logit_bias",
+            "logprobs",
+            "max_tokens",
+            "n",
+            "presence_penalty",
+            "seed",
+            "stop",
+            "stream",
+            "temperature",
+            "top_p",
+            "user",
+        }
+
+        # Standard OpenAI parameters by endpoint type
+        if endpoint_type == "chat":
+            chat_specific_params = {
+                "messages",
+                "top_logprobs",
+                "response_format",
+                "service_tier",
+                "stream_options",
+                "tools",
+                "tool_choice",
+                "parallel_tool_calls",
+            }
+            openai_params = common_params | chat_specific_params
+        else:  # completion
+            completion_specific_params = {
+                "prompt",
+                "best_of",
+                "echo",
+                "suffix",
+            }
+            openai_params = common_params | completion_specific_params
+
+        for key, value in request_dict.items():
+            if key in openai_params:
+                openai_client_params[key] = value
+            else:
+                extra_params[key] = value
+
+        # If there are custom parameters, use extra_body to pass them through
+        if extra_params:
+            openai_client_params["extra_body"] = extra_params
+
+        return openai_client_params
+
+    def _log_subprocess_output(self, prefix: str):
+        """
+        Read subprocess output line by line, log to debug, and parse telemetry
+        """
+
+        if self.process.stdout:
+            try:
+                for line in iter(self.process.stdout.readline, ""):
+                    if line:
+                        line_stripped = line.strip()
+                        logging.debug("%s: %s", prefix, line_stripped)
+
+                        self.telemetry.parse_telemetry_line(line_stripped)
+
+                    if self.process.poll() is not None:
+                        break
+            except UnicodeDecodeError as e:
+                logging.debug(
+                    "Unicode decode error reading subprocess output: %s", str(e)
+                )
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                logging.error("Unexpected error reading subprocess output: %s", str(e))
+
+    def _wait_for_load(self):
+        status_code = None
+        while not self.process.poll() and status_code != 200:
+            health_url = f"http://localhost:{self.port}/health"
+            try:
+                health_response = requests.get(health_url)
+            except requests.exceptions.ConnectionError:
+                logging.debug(
+                    f"Not able to connect to {self.server_name} yet, will retry"
+                )
+            else:
+                status_code = health_response.status_code
+                logging.debug(
+                    f"Testing {self.server_name} readiness (will retry until ready), "
+                    f"result: {health_response.json()}"
+                )
+            time.sleep(1)
+
+    @abstractmethod
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+        """
+        Launch wrapped server subprocess with appropriate configuration.
+
+        Args:
+            snapshot_files: Dictionary of model files to load
+            supports_embeddings: Whether the model supports embeddings
+            supports_reranking: Whether the model supports reranking
+        """
+
+    @abstractmethod
+    def install_server(self, backend=None):
+        """
+        Install the wrapped server
+        """
+
+    @abstractmethod
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        """
+        Download a model for the wrapper server
+        """
+
+    def load(
+        self,
+        model_config: PullConfig,
+        ctx_size: int,
+        do_not_upgrade: bool = False,
+    ):
+        # Install and/or update the wrapped server if needed
+        try:
+            self.install_server()
+        except NotImplementedError as e:
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
+            )
+
+        # Download the model to the hugging face cache
+        snapshot_files = self.download_model(
+            model_config.checkpoint, model_config.mmproj, do_not_upgrade=do_not_upgrade
+        )
+        logging.debug(f"Model file paths: {snapshot_files}")
+
+        # Check if model supports embeddings
+        supported_models = ModelManager().supported_models
+        model_info = supported_models.get(model_config.model_name, {})
+        supports_embeddings = "embeddings" in model_info.get("labels", [])
+        supports_reranking = "reranking" in model_info.get("labels", [])
+
+        self._launch_server_subprocess(
+            model_config=model_config,
+            snapshot_files=snapshot_files,
+            ctx_size=ctx_size,
+            supports_embeddings=supports_embeddings,
+            supports_reranking=supports_reranking,
+        )
+
+        # Check the /health endpoint until server is ready
+        self._wait_for_load()
+
+        if self.process.poll():
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Failed to load {model_config.model_name} with {self.server_name}",
+            )
+
+    def chat_completion(self, chat_completion_request: ChatCompletionRequest):
+        client = OpenAI(
+            base_url=self.address(),
+            api_key="lemonade",
+        )
+
+        # Convert Pydantic model to dict and remove unset/null values
+        request_dict = chat_completion_request.model_dump(
+            exclude_unset=True, exclude_none=True
+        )
+
+        # Separate standard OpenAI parameters from custom llama.cpp parameters
+        openai_client_params = self._separate_openai_params(request_dict, "chat")
+
+        # Check if streaming is requested
+        if chat_completion_request.stream:
+
+            def event_stream():
+                try:
+                    # Enable streaming
+                    # pylint: disable=missing-kwoa
+                    for chunk in client.chat.completions.create(**openai_client_params):
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+                    yield "data: [DONE]\n\n"
+
+                    # Show telemetry after completion
+                    self.telemetry.show_telemetry()
+
+                except Exception as e:  # pylint: disable=broad-exception-caught
+                    yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+            return StreamingResponse(
+                event_stream(),
+                media_type="text/event-stream",
+                headers={
+                    "Cache-Control": "no-cache",
+                    "Connection": "keep-alive",
+                },
+            )
+        else:
+            # Non-streaming response
+            try:
+                # Disable streaming for non-streaming requests
+                # pylint: disable=missing-kwoa
+                response = client.chat.completions.create(**openai_client_params)
+
+                # Show telemetry after completion
+                self.telemetry.show_telemetry()
+
+                return response
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                logging.error("Error during chat completion: %s", str(e))
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail=f"Chat completion error: {str(e)}",
+                )
+
+    def completion(self, completion_request: CompletionRequest):
+        """
+        Handle text completions using the wrapped server.
+
+        Args:
+            completion_request: The completion request containing prompt and parameters
+            telemetry: Telemetry object containing the server port
+
+        Returns:
+            Completion response from the wrapped server
+        """
+
+        client = OpenAI(
+            base_url=self.address(),
+            api_key="lemonade",
+        )
+
+        # Convert Pydantic model to dict and remove unset/null values
+        request_dict = completion_request.model_dump(
+            exclude_unset=True, exclude_none=True
+        )
+
+        # Separate standard OpenAI parameters from custom llama.cpp parameters
+        openai_client_params = self._separate_openai_params(request_dict, "completion")
+
+        # Check if streaming is requested
+        if completion_request.stream:
+
+            def event_stream():
+                try:
+                    # Enable streaming
+                    # pylint: disable=missing-kwoa
+                    for chunk in client.completions.create(**openai_client_params):
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+                    yield "data: [DONE]\n\n"
+
+                    # Show telemetry after completion
+                    self.telemetry.show_telemetry()
+
+                except Exception as e:  # pylint: disable=broad-exception-caught
+                    yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+            return StreamingResponse(
+                event_stream(),
+                media_type="text/event-stream",
+                headers={
+                    "Cache-Control": "no-cache",
+                    "Connection": "keep-alive",
+                },
+            )
+        else:
+            # Non-streaming response
+            try:
+                # Disable streaming for non-streaming requests
+                # pylint: disable=missing-kwoa
+                response = client.completions.create(**openai_client_params)
+
+                # Show telemetry after completion
+                self.telemetry.show_telemetry()
+
+                return response
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                logging.error("Error during completion: %s", str(e))
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail=f"Completion error: {str(e)}",
+                )
+
+    def embeddings(self, embeddings_request: EmbeddingsRequest):
+        """
+        Generate embeddings using the wrapped server.
+
+        Args:
+            embeddings_request: The embeddings request containing input text/tokens
+            telemetry: Telemetry object containing the server port
+
+        Returns:
+            Embeddings response from the wrapped server
+        """
+        client = OpenAI(
+            base_url=self.address(),
+            api_key="lemonade",
+        )
+
+        # Convert Pydantic model to dict and remove unset/null values
+        request_dict = embeddings_request.model_dump(
+            exclude_unset=True, exclude_none=True
+        )
+
+        try:
+            # Call the embeddings endpoint
+            response = client.embeddings.create(**request_dict)
+            return response
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Embeddings error: {str(e)}",
+            )
+
+    def reranking(self, reranking_request: RerankingRequest):
+        """
+        Rerank documents based on their relevance to a query using the wrapped server.
+
+        Args:
+            reranking_request: The reranking request containing query and documents
+            telemetry: Telemetry object containing the server port
+
+        Returns:
+            Reranking response from the wrapped server containing ranked documents and scores
+        """
+
+        try:
+            # Convert Pydantic model to dict and exclude unset/null values
+            request_dict = reranking_request.model_dump(
+                exclude_unset=True, exclude_none=True
+            )
+
+            # Call the reranking endpoint directly since it's not supported by the OpenAI API
+            response = requests.post(
+                f"{self.address()}/rerank",
+                json=request_dict,
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except Exception as e:
+            logging.error("Error during reranking: %s", str(e))
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Reranking error: {str(e)}",
+            ) from e
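As a sketch of how this interface is meant to be extended, the example below implements the abstract methods on a hypothetical wrapper. EchoTelemetry, EchoServer, and the echo-server command line are illustrative assumptions; the real concrete subclass shipped in this release is the LlamaServer in lemonade/tools/server/llamacpp.py, and only the WrappedServer/WrappedServerTelemetry interface itself comes from the file above.

import subprocess

from lemonade.tools.server.wrapped_server import WrappedServer, WrappedServerTelemetry


class EchoTelemetry(WrappedServerTelemetry):
    def parse_telemetry_line(self, line: str):
        # A real implementation would parse the wrapped server's log output and
        # populate input_tokens, output_tokens, time_to_first_token, and
        # tokens_per_second so get_telemetry_data()/show_telemetry() have values.
        pass


class EchoServer(WrappedServer):
    """Hypothetical wrapper used only to illustrate the abstract interface."""

    def __init__(self):
        super().__init__(server_name="echo-server", telemetry=EchoTelemetry())

    def install_server(self, backend=None):
        # Install or update the server binary; raising NotImplementedError here
        # is surfaced by WrappedServer.load() as an HTTP 422 error.
        pass

    def download_model(
        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
    ) -> dict:
        # Fetch the model files and return their local paths.
        return {"model": f"/models/{config_checkpoint}.gguf"}

    def _launch_server_subprocess(
        self,
        model_config,
        snapshot_files: dict,
        ctx_size: int,
        supports_embeddings: bool = False,
        supports_reranking: bool = False,
    ):
        # Pick a free port, then start the wrapped server as a subprocess whose
        # stdout is later consumed by _log_subprocess_output().
        self.choose_port()
        self.process = subprocess.Popen(
            [
                "echo-server",
                "--model", snapshot_files["model"],
                "--port", str(self.port),
                "--ctx-size", str(ctx_size),
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

With a subclass like this in place, WrappedServer.load() drives the full sequence shown above (install the server, download the model, launch the subprocess, poll /health), and the inherited chat_completion, completion, embeddings, and reranking methods proxy OpenAI-style requests to the subprocess on the chosen port.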
lemonade/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "8.1.5"
+__version__ = "8.1.6"
{lemonade_sdk-8.1.5.dist-info → lemonade_sdk-8.1.6.dist-info}/RECORD
CHANGED
@@ -4,7 +4,7 @@ lemonade/cache.py,sha256=5iZbk273TiTMqK_vdzPOPYTo6VsWW2gNByOISA9zi1w,3002
 lemonade/cli.py,sha256=9Pcs3PcrWC2F8_pcBaz09xHUICIJTvpemBdPGyXkjIk,4395
 lemonade/sequence.py,sha256=KSH7BPsiyDKsOsg_ziQKEGsDwMmuO_YbgPRBxkZd0pw,13267
 lemonade/state.py,sha256=sdSezla7Cd7KYL90xY3p9kcNV4ndSyN6UvNLOr3vBMA,5261
-lemonade/version.py,sha256=
+lemonade/version.py,sha256=pLCv7QgPYSVaHwGzgyRgNowjqjppvm3TmtmQCvVkOrA,22
 lemonade/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lemonade/common/build.py,sha256=zTb0m1-kuUx6zw5QHp2SNnVuN6jOTMQ2FCdj9iH374U,6140
 lemonade/common/cli_helpers.py,sha256=hjBfXrTtFl8gmCFlL-ksviXR0mOcdPtTWVNKoEp3PG4,4993
@@ -34,7 +34,7 @@ lemonade/tools/huggingface/load.py,sha256=KsSGOBBD-tNEIfYC8mCWV_jpnkjHMhN3juVmC1
 lemonade/tools/huggingface/utils.py,sha256=j1S-IgjDsznUIVwkHSqqChmFyqIx9f3WcEelzohWwvU,13955
 lemonade/tools/llamacpp/bench.py,sha256=1fkE02ecg-jRk92i5dTAXz6re14WH8bd-Z9l-m3lbDA,4844
 lemonade/tools/llamacpp/load.py,sha256=DFCvQN548Ch9H8U_rHOiYviinzw6vixb5-V7xLj7XE4,6499
-lemonade/tools/llamacpp/utils.py,sha256=
+lemonade/tools/llamacpp/utils.py,sha256=96POJXoIBE_zLArusiOrgyCcz8D5vR3IuXkMzyg79CU,32608
 lemonade/tools/oga/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lemonade/tools/oga/bench.py,sha256=PJXv4UchcS2YPwijNzef8DY4DSAKYxIYY1ycHuH3T34,5005
 lemonade/tools/oga/load.py,sha256=BH5ChYbZgeP_ZN4E6HoboJD3kZcUIAPgPEVbgUZpVjQ,33778
@@ -46,11 +46,12 @@ lemonade/tools/report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
 lemonade/tools/report/llm_report.py,sha256=bVHhwCINA-Ok2EdSwAsLubsc83N3KWOVuwTguw7jDcE,6676
 lemonade/tools/report/table.py,sha256=ssqy1bZqF-wptNzKEOj6_9REtCNZyXO8R5vakAtg3R4,27973
 lemonade/tools/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-lemonade/tools/server/llamacpp.py,sha256=
-lemonade/tools/server/serve.py,sha256=
+lemonade/tools/server/llamacpp.py,sha256=w-M0JXrgXVpfICnHBHhJm_yBshfwZ8zge-e1o1kH0R4,8751
+lemonade/tools/server/serve.py,sha256=YVsO7m9E1OsQtbcjkv_1ir8oCSAdr0IXGNEvJ6FKamw,60311
 lemonade/tools/server/tool_calls.py,sha256=xrAlQwKG-nv2xLlf8f9CDSaUbyMn8ZtHkds9iZLG9K8,5230
 lemonade/tools/server/tray.py,sha256=a9z6hdqlfj91H00j6hAExRPQkzWHhE3dnqSumzEgq0U,19599
 lemonade/tools/server/webapp.py,sha256=8Das5yXOaSBLZmSZ_eddJajQFxBhvl5D6GI_hHlGbE0,1040
+lemonade/tools/server/wrapped_server.py,sha256=-knOr2ycmrebVPYrOlCNRJH0ySZPnVlWzbKYLsfTRhE,16441
 lemonade/tools/server/static/favicon.ico,sha256=hMmP9qGJNeZ0mFS86JIqPbZstXMZn0Z76_HfHQpREAU,126745
 lemonade/tools/server/static/styles.css,sha256=5HQQCpm8N_fzLcolPiDuhyZw_5nbO8aIl60xAn4RKmg,43385
 lemonade/tools/server/static/webapp.html,sha256=FX2MZUsljfgxxuF12KBdgvNkso_z-sHewWc0SEGGcGM,18138
@@ -63,15 +64,15 @@ lemonade/tools/server/utils/system_tray.py,sha256=b9lvNv9chJKQxvmH7qzAuUe6H9HsLu
 lemonade/tools/server/utils/thread.py,sha256=Z-PDzGcpgfN2qxTmtlROWqrUN0B2fXdPrqo_J10fR_w,2772
 lemonade_install/__init__.py,sha256=26zohKg2jgr_5y7tObduWMYQg8zCTWMZHL8lfi2zZVQ,40
 lemonade_install/install.py,sha256=onndA2a-ygyLtDfupI8JQFhU_XpK8McGZtGujFasXww,28304
-lemonade_sdk-8.1.
-lemonade_sdk-8.1.
-lemonade_server/cli.py,sha256
-lemonade_server/model_manager.py,sha256=
+lemonade_sdk-8.1.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lemonade_sdk-8.1.6.dist-info/licenses/NOTICE.md,sha256=RSca9LE5e6pvdWA_LXAUCcACIHPmINKqkRX-AVRqBGo,3499
+lemonade_server/cli.py,sha256=GsYMg095XMADHjWr1ytLYvbbrtSDPwQJEUn0gI-W99o,19108
+lemonade_server/model_manager.py,sha256=DVznjSHG5R_W3E5xTI-7p4qYS_NTS3ssXh4zujeey-Y,20453
 lemonade_server/pydantic_models.py,sha256=49MyOlb5feLUlKsGcI75tWaflWckrItqcSVkdCY4e3A,3269
 lemonade_server/server_models.json,sha256=DAdG4ebIt5Dy5MM3kmXn1pO0XbNMph1gdpzbacBDVuc,11664
 lemonade_server/settings.py,sha256=6nsmPLFJD-UokQDmlx9ZBYMbpnn48So_PuBGWP7Fmfg,1299
-lemonade_sdk-8.1.
-lemonade_sdk-8.1.
-lemonade_sdk-8.1.
-lemonade_sdk-8.1.
-lemonade_sdk-8.1.
+lemonade_sdk-8.1.6.dist-info/METADATA,sha256=N6NlWOitXA4TrHkebCpHS9d2QZ4LtaY0aOWJvaz8TXU,16852
+lemonade_sdk-8.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lemonade_sdk-8.1.6.dist-info/entry_points.txt,sha256=7sRvpNhi1E7amnM7RZo57e8yFF9iA5uuRaIeJ1Xre6w,193
+lemonade_sdk-8.1.6.dist-info/top_level.txt,sha256=10ap5GNiPhalO4V50LRoxA1FqRT9g3Xkia6BITu880k,42
+lemonade_sdk-8.1.6.dist-info/RECORD,,