lemonade-sdk 8.1.9__py3-none-any.whl → 8.1.11__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- lemonade/common/inference_engines.py +13 -4
- lemonade/common/system_info.py +570 -1
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +255 -0
- lemonade/tools/llamacpp/utils.py +62 -13
- lemonade/tools/server/flm.py +137 -0
- lemonade/tools/server/llamacpp.py +23 -5
- lemonade/tools/server/serve.py +292 -135
- lemonade/tools/server/static/js/chat.js +165 -82
- lemonade/tools/server/static/js/models.js +87 -54
- lemonade/tools/server/static/js/shared.js +5 -3
- lemonade/tools/server/static/logs.html +47 -0
- lemonade/tools/server/static/styles.css +159 -8
- lemonade/tools/server/static/webapp.html +28 -10
- lemonade/tools/server/tray.py +158 -38
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
- lemonade/tools/server/webapp.py +4 -1
- lemonade/tools/server/wrapped_server.py +91 -25
- lemonade/version.py +1 -1
- lemonade_install/install.py +25 -2
- {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/METADATA +9 -6
- {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/RECORD +33 -28
- lemonade_server/cli.py +105 -14
- lemonade_server/model_manager.py +186 -45
- lemonade_server/pydantic_models.py +25 -1
- lemonade_server/server_models.json +162 -62
- lemonade_server/settings.py +39 -39
- {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py
CHANGED
@@ -10,11 +10,13 @@ import traceback
 from typing import Optional, Union
 import json
 from pathlib import Path
+import os

-from fastapi import FastAPI, HTTPException, status, Request
+from fastapi import FastAPI, HTTPException, status, Request, WebSocket
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
+from starlette.websockets import WebSocketDisconnect, WebSocketState
 import uvicorn
 from uvicorn.config import Config
 from uvicorn.server import Server as UvicornServer
@@ -48,6 +50,7 @@ from openai.types.responses import (
 import lemonade.api as lemonade_api
 from lemonade.tools.server.wrapped_server import WrappedServer
 from lemonade.tools.server.llamacpp import LlamaServer
+from lemonade.tools.server.flm import FlmServer
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.webapp import get_webapp_html
 from lemonade.tools.server.utils.port import lifespan
@@ -75,12 +78,62 @@ from lemonade_server.settings import save_setting
 # Tests should use the max_new_tokens argument to set a lower value
 DEFAULT_MAX_NEW_TOKENS = 1500

-
-if platform.system() == "Windows":
+if platform.system() in ["Windows", "Darwin"]:
     # pylint: disable=ungrouped-imports
     from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


+class WebsocketTextFilter(logging.Filter):
+    def filter(self, record: logging.LogRecord) -> bool:
+        # Only allow logs that don't include "> TEXT"
+        return "> TEXT" not in record.getMessage()
+
+
+async def log_streamer(websocket: WebSocket, path: str, interval: float = 1.0):
+    logger = logging.getLogger()
+    await websocket.accept()
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            f.seek(0, os.SEEK_END)  # start at end
+            while True:
+                # Try reading a line
+                line = f.readline()
+                if not line:
+                    await asyncio.sleep(interval)
+                    continue
+
+                # Send defensively: if disconnected, bail out
+                if websocket.application_state != WebSocketState.CONNECTED:
+                    # Server-side state says we're not connected anymore
+                    break
+
+                try:
+                    await websocket.send_text(line)
+                except WebSocketDisconnect:
+                    # Client closed — normal path out
+                    break
+                except RuntimeError as re:
+                    # Starlette will raise this if a close has already been sent
+                    logger.debug("RuntimeError during send: %s", re)
+                    break
+
+    except WebSocketDisconnect:
+        # Client closed the socket; do not try to send or close again
+        pass
+    except Exception as e:  # pylint: disable=broad-except
+        # Log server-side; do not attempt to send error over a possibly closed socket
+        logger.exception("Error in log_streamer: %s", e)
+    finally:
+        # Only close if Starlette still thinks we're connected.
+        # This prevents "Cannot call send once a close message has been sent."
+        try:
+            if websocket.application_state == WebSocketState.CONNECTED:
+                await websocket.close()
+        except Exception:  # pylint: disable=broad-except
+            # If close itself races, swallow — we're shutting down anyway.
+            pass
+
+
 class ServerModel(Model):
     """
     An extension of OpenAI's Model class that adds
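
The `log_streamer` helper added above tails the server log file and forwards each new line to a connected WebSocket client; a later hunk in this diff registers it under `{prefix}/logs/ws`. As a rough illustration only, a client could consume it like the sketch below. The host, port, and the third-party `websockets` package are assumptions, not part of this package.

```python
# Hypothetical client sketch for the new log-streaming WebSocket route.
# Assumes a Lemonade Server instance on localhost:8000 and `pip install websockets`.
import asyncio

import websockets


async def tail_logs(url: str = "ws://localhost:8000/api/v1/logs/ws") -> None:
    async with websockets.connect(url) as ws:
        while True:
            # log_streamer sends one log line per text frame
            line = await ws.recv()
            print(line, end="")


if __name__ == "__main__":
    asyncio.run(tail_logs())
```
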
@@ -164,6 +217,7 @@ class Server:
     - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
     - /api/v1/responses: responses API using HTTP chunked transfer encoding.
     - /api/v1/models: list all available models.
+    - /api/v1/models/{model_id}: retrieve a specific model by ID.
     """

     def __init__(
@@ -203,6 +257,12 @@ class Server:
             allow_headers=["*"],  # Allows all headers
         )

+        # Set up debug middleware if debug logging is enabled
+        # This must be done during app initialization, not at runtime
+        self.debug_logging_enabled = log_level == "debug"
+        if self.debug_logging_enabled:
+            self.setup_middleware_timer()
+
         # Set up custom routes
         self.setup_routes(["/api/v0", "/api/v1"])

@@ -264,11 +324,13 @@ class Server:
             self.app.post(f"{prefix}/completions")(self.completions)
             self.app.post(f"{prefix}/responses")(self.responses)
             self.app.post(f"{prefix}/log-level")(self.set_log_level)
+            self.app.websocket(f"{prefix}/logs/ws")(self.logs_ws)

             # OpenAI-compatible routes
             self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
             self.app.post(f"{prefix}/embeddings")(self.embeddings)
             self.app.get(f"{prefix}/models")(self.models)
+            self.app.get(f"{prefix}/models/{{model_id}}")(self.retrieve_model)

             # JinaAI routes (jina.ai/reranker/)
             self.app.post(f"{prefix}/reranking")(self.reranking)
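
Alongside the existing list route, the table above now registers `GET {prefix}/models/{model_id}`, backed by the new `retrieve_model` handler added later in this diff. A minimal sketch of calling it, assuming a local server on port 8000, an illustrative model ID, and the `requests` package:

```python
# Sketch: fetch a single model from GET /api/v1/models/{model_id}.
# Host, port, and the model ID below are placeholders.
import requests

resp = requests.get("http://localhost:8000/api/v1/models/example-model-id")
if resp.status_code == 404:
    # Unknown IDs come back in an OpenAI-style error envelope
    print(resp.json()["detail"]["message"])
else:
    info = resp.json()
    print(info["id"], info["checkpoint"], info["recipe"])
```
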
@@ -392,11 +454,13 @@ class Server:
         )
         file_handler.setLevel(logging_level)
         file_handler.setFormatter(uvicorn_formatter)
+        file_handler.addFilter(WebsocketTextFilter())

         # Set up console handler
         console_handler = logging.StreamHandler()
         console_handler.setLevel(logging_level)
         console_handler.setFormatter(uvicorn_formatter)
+        console_handler.addFilter(WebsocketTextFilter())

         # Configure root logger with both handlers
         logging.basicConfig(
@@ -419,10 +483,6 @@ class Server:
             ).run()
             sys.exit(0)

-        if self.debug_logging_enabled:
-            # Print the elapsed time for each request
-            self.setup_middleware_timer()
-
         # Let the app know what port it's running on, so
         # that the lifespan can access it
         self.app.port = self.port
@@ -519,7 +579,9 @@ class Server:

         return lc

-    async def completions(self, completion_request: CompletionRequest):
+    async def completions(
+        self, completion_request: CompletionRequest, request: Request
+    ):
         """
         Stream completion responses using HTTP chunked transfer encoding.
         """
@@ -532,7 +594,7 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             return self.wrapped_server.completion(completion_request)

         # Check if the model supports reasoning
@@ -571,29 +633,43 @@ class Server:
             # This is necessary because the variable is modified
             # in the inner function
             nonlocal reasoning_first_token
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-            async for token in self._generate_tokens(**generation_args):
-                choice = CompletionChoice(
-                    text="<think>" + token if reasoning_first_token else token,
-                    index=0,
-                    finish_reason="stop",
-                    logprobs=None,
-                )
+                    choice = CompletionChoice(
+                        text=(
+                            "<think>" + token if reasoning_first_token else token
+                        ),
+                        index=0,
+                        finish_reason="stop",
+                        logprobs=None,
+                    )

-                completion = Completion(
-                    id="0",
-                    choices=[choice],
-                    model=self.llm_loaded.checkpoint,
-                    object="text_completion",
-                    created=int(time.time()),
-                )
+                    completion = Completion(
+                        id="0",
+                        choices=[choice],
+                        model=self.llm_loaded.checkpoint,
+                        object="text_completion",
+                        created=int(time.time()),
+                    )

-                # Format as SSE
-                reasoning_first_token = False
-                yield f"data: {completion.model_dump_json()}\n\n".encode("utf-8")
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {completion.model_dump_json()}\n\n".encode(
+                        "utf-8"
+                    )

-            # Send the [DONE] marker
-            yield b"data: [DONE]\n\n"
+                # Send the [DONE] marker only if still connected
+                if not await request.is_disconnected():
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                # Propagate cancellation to the generator loop
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
@@ -651,7 +727,9 @@ class Server:
             created=int(time.time()),
         )

-    async def chat_completions(self, chat_completion_request: ChatCompletionRequest):
+    async def chat_completions(
+        self, chat_completion_request: ChatCompletionRequest, request: Request
+    ):
         """
         Stream chat completion responses using HTTP chunked transfer encoding.
         """
@@ -667,7 +745,7 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             return self.wrapped_server.chat_completion(chat_completion_request)

         # Convert chat messages to text using the model's chat template
@@ -729,69 +807,80 @@ class Server:

             # Keep track of the full response for tool call extraction
             full_response = ""
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-            async for token in self._generate_tokens(**generation_args):
-                # Continuously look for tool calls embedded into the generated text
-                openai_tool_calls = None
-                if chat_completion_request.tools:
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:

-                    # Append the token to the full response
-                    full_response += token
+                        # Append the token to the full response
+                        full_response += token

-                    tool_calls, _ = extract_tool_calls(
-                        full_response,
-                        tool_call_pattern,
-                    )
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
+                        )

-                    # If there are tool calls, reset the full response for the next call
-                    if tool_calls:
-                        openai_tool_calls = []
-                        full_response = ""
-                        for tool_call in tool_calls:
-                            openai_tool_calls.append(
-                                ChoiceDeltaToolCall(
-                                    index=0,
-                                    id="-",
-                                    function=ChoiceDeltaToolCallFunction(
-                                        arguments=json.dumps(tool_call["arguments"]),
-                                        name=tool_call["name"],
-                                    ),
-                                    type="function",
+                        # If there are tool calls, reset the full response for the next call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(
+                                                tool_call["arguments"]
+                                            ),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
                                 )
-                            )

-                # Create a ChatCompletionChunk
-                chunk = ChatCompletionChunk.model_construct(
-                    id="0",
-                    object="chat.completion.chunk",
-                    created=int(time.time()),
-                    model=self.llm_loaded.checkpoint,
-                    choices=[
-                        Choice.model_construct(
-                            index=0,
-                            delta=ChoiceDelta(
-                                content=(
-                                    "<think>" + token
-                                    if reasoning_first_token
-                                    else token
+                    # Create a ChatCompletionChunk
+                    chunk = ChatCompletionChunk.model_construct(
+                        id="0",
+                        object="chat.completion.chunk",
+                        created=int(time.time()),
+                        model=self.llm_loaded.checkpoint,
+                        choices=[
+                            Choice.model_construct(
+                                index=0,
+                                delta=ChoiceDelta(
+                                    content=(
+                                        "<think>" + token
+                                        if reasoning_first_token
+                                        else token
+                                    ),
+                                    function_call=None,
+                                    role="assistant",
+                                    tool_calls=openai_tool_calls,
+                                    refusal=None,
                                 ),
-                                function_call=None,
-                                role="assistant",
-                                tool_calls=openai_tool_calls,
-                                refusal=None,
-                            ),
-                            finish_reason=None,
-                            logprobs=None,
-                        )
-                    ],
-                )
+                                finish_reason=None,
+                                logprobs=None,
+                            )
+                        ],
+                    )

-                # Format as SSE
-                reasoning_first_token = False
-                yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")

-            # Send the [DONE] marker
-            yield b"data: [DONE]\n\n"
+                # Send the [DONE] marker only if still connected
+                if not await request.is_disconnected():
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
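
With the changes above, the streaming chat endpoint checks `request.is_disconnected()` on every token and sets `stop_event` when the client goes away, so dropping the HTTP connection mid-stream should now halt generation server-side. A hedged client-side sketch using the OpenAI Python SDK follows; the base URL, API key, and model name are assumptions:

```python
# Sketch: stream a chat completion and abandon it early. Closing the stream drops
# the connection, which the server detects via request.is_disconnected().
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")

stream = client.chat.completions.create(
    model="example-model-id",  # placeholder; use a locally installed model
    messages=[{"role": "user", "content": "Tell me a very long story."}],
    stream=True,
)
for i, chunk in enumerate(stream):
    print(chunk.choices[0].delta.content or "", end="", flush=True)
    if i >= 50:
        stream.close()  # client disconnect; the server stops generating
        break
```
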
@@ -950,7 +1039,7 @@ class Server:
             formatted_messages.append(f"{role_marker}\n{content} <|end|>")
         return "\n".join(formatted_messages) + "\n<|assistant|>"

-    async def responses(self, responses_request: ResponsesRequest):
+    async def responses(self, responses_request: ResponsesRequest, request: Request):
         """
         Stream responses using HTTP chunked transfer encoding.
         """
@@ -963,6 +1052,12 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

+        if self.llm_loaded.recipe == "llamacpp":
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Responses API not supported for recipe: {self.llm_loaded.recipe}",
+            )
+
         # Convert chat messages to text using the model's chat template
         if isinstance(responses_request.input, str):
             text = responses_request.input
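
Note that the hunk above makes the Responses API reject llama.cpp-backed models up front with HTTP 422 instead of attempting to stream. A small sketch of the resulting client-side behavior (host, port, and model name are placeholders):

```python
# Sketch: /api/v1/responses returns 422 when the loaded recipe is llamacpp.
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/responses",
    json={"model": "example-gguf-model", "input": "Hello"},
)
if resp.status_code == 422:
    # e.g. "Responses API not supported for recipe: llamacpp"
    print(resp.json()["detail"])
```
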
@@ -1016,56 +1111,71 @@ class Server:

             full_response = "<think>" if reasoning_first_token else ""

-            async for token in self._generate_tokens(**generation_args):
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-                # Create an event
-                delta_event = ResponseTextDeltaEvent(
-                    content_index=0,
-                    delta="<think>" + token if reasoning_first_token else token,
-                    item_id="0 ",
-                    output_index=0,
-                    type="response.output_text.delta",
-                    sequence_number=0,
-                )
-                full_response += token
+                    # Create an event
+                    delta_event = ResponseTextDeltaEvent(
+                        content_index=0,
+                        delta=(
+                            "<think>" + token if reasoning_first_token else token
+                        ),
+                        item_id="0 ",
+                        output_index=0,
+                        type="response.output_text.delta",
+                        sequence_number=0,
+                    )
+                    full_response += token

-                # Format as SSE
-                reasoning_first_token = False
-                yield f"data: {delta_event.model_dump_json()}\n\n".encode("utf-8")
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {delta_event.model_dump_json()}\n\n".encode(
+                        "utf-8"
+                    )

-            # Send the completed event
-            response_output_message = ResponseOutputMessage(
-                id="0",
-                content=[
-                    ResponseOutputText(
-                        annotations=[],
-                        text=full_response,
-                        type="output_text",
+                # Send the completed event (only if still connected)
+                if not await request.is_disconnected():
+                    response_output_message = ResponseOutputMessage(
+                        id="0",
+                        content=[
+                            ResponseOutputText(
+                                annotations=[],
+                                text=full_response,
+                                type="output_text",
+                            )
+                        ],
+                        role="assistant",
+                        status="completed",
+                        type="message",
+                    )
+                    response = Response(
+                        id="0",
+                        model=self.llm_loaded.checkpoint,
+                        created_at=int(time.time()),
+                        object="response",
+                        output=[response_output_message],
+                        parallel_tool_calls=True,
+                        tool_choice="auto",
+                        tools=[],
+                    )
+                    completed_event = ResponseCompletedEvent(
+                        response=response,
+                        type="response.completed",
+                        sequence_number=0,
+                    )
+                    yield f"data: {completed_event.model_dump_json()}\n\n".encode(
+                        "utf-8"
                     )
-                ],
-                role="assistant",
-                status="completed",
-                type="message",
-            )
-            response = Response(
-                id="0",
-                model=self.llm_loaded.checkpoint,
-                created_at=int(time.time()),
-                object="response",
-                output=[response_output_message],
-                parallel_tool_calls=True,
-                tool_choice="auto",
-                tools=[],
-            )
-            completed_event = ResponseCompletedEvent(
-                response=response,
-                type="response.completed",
-                sequence_number=0,
-            )
-            yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")

-            # Send the [DONE] marker
-            yield b"data: [DONE]\n\n"
+                # Send the [DONE] marker
+                yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
@@ -1310,8 +1420,10 @@ class Server:
         """
         Send performance statistics to the client.
         """
-        # If using
-        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
+        # If using wrapped server, get telemetry from the telemetry instance
+        if self.llm_loaded and (
+            self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm"
+        ):
             return self.wrapped_server.telemetry.get_telemetry_data()

         # For built-in server, use the existing telemetry
@@ -1412,6 +1524,7 @@ class Server:
                 checkpoint=config.checkpoint,
                 recipe=config.recipe,
                 reasoning=config.reasoning,
+                vision=config.vision,
                 mmproj=config.mmproj,
                 # The pull endpoint will download an upgraded model if available, even
                 # if we already have a local copy of the model
@@ -1491,8 +1604,8 @@ class Server:
         ):
             if (
                 self.llm_loaded.recipe == "llamacpp"
-                and self.wrapped_server.process.poll()
-            ):
+                or self.llm_loaded.recipe == "flm"
+            ) and self.wrapped_server.process.poll():
                 # wrapped server process has gone away for some reason, so we should
                 # proceed with loading to get it back
                 pass
@@ -1516,6 +1629,14 @@ class Server:
                 do_not_upgrade=True,
             )

+        elif config_to_use.recipe == "flm":
+            self.wrapped_server = FlmServer()
+            self.wrapped_server.load(
+                model_config=config_to_use,
+                ctx_size=self.ctx_size,
+                do_not_upgrade=True,
+            )
+
         else:
             self.model, self.tokenizer = lemonade_api.from_pretrained(
                 checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
@@ -1552,7 +1673,7 @@ class Server:
         for _ in range(self.max_concurrent_generations):
             await self._generate_semaphore.acquire()

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             self.wrapped_server.process.terminate()

         self.llm_loaded = None
@@ -1590,6 +1711,36 @@ class Server:

         return {"object": "list", "data": models_list}

+    async def retrieve_model(self, model_id: str):
+        """
+        Retrieve a specific model by ID in OpenAI-compatible format.
+        """
+        # Raise an error if the model does not exist
+        if model_id not in self.local_models:
+            # Mimic the error format of the OpenAI API
+            raise HTTPException(
+                status_code=404,
+                detail={
+                    "message": f"model {model_id} not found",
+                    "type": "api_error",
+                    "param": None,
+                    "code": None,
+                },
+            )
+
+        # Return the specific model
+        model_info = self.local_models[model_id]
+        model = ServerModel(
+            id=model_id,
+            owned_by="lemonade",
+            object="model",
+            created=int(time.time()),
+            checkpoint=model_info["checkpoint"],
+            recipe=model_info["recipe"],
+        )
+
+        return model
+
     def setup_middleware_timer(self):
         logging.info("Middleware set up")

@@ -1625,6 +1776,12 @@ class Server:
         logging.debug(f"Total request time: {request_time:.4f} seconds")
         return response

+    async def logs_ws(self, websocket: WebSocket):
+        if not self.log_file or not os.path.exists(self.log_file):
+            await websocket.close(code=4000)
+            return
+        await log_streamer(websocket, self.log_file)
+

 # This file was originally licensed under Apache 2.0. It has been modified.
 # Modifications Copyright (c) 2025 AMD