lemonade-sdk 8.1.10__py3-none-any.whl → 8.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic; see the release's page on the registry for details.
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +255 -0
- lemonade/tools/llamacpp/utils.py +58 -10
- lemonade/tools/server/flm.py +137 -0
- lemonade/tools/server/llamacpp.py +23 -5
- lemonade/tools/server/serve.py +260 -135
- lemonade/tools/server/static/js/chat.js +165 -82
- lemonade/tools/server/static/js/models.js +87 -54
- lemonade/tools/server/static/js/shared.js +5 -3
- lemonade/tools/server/static/logs.html +47 -0
- lemonade/tools/server/static/styles.css +159 -8
- lemonade/tools/server/static/webapp.html +28 -10
- lemonade/tools/server/tray.py +94 -38
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
- lemonade/tools/server/webapp.py +4 -1
- lemonade/tools/server/wrapped_server.py +91 -25
- lemonade/version.py +1 -1
- lemonade_install/install.py +25 -2
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/METADATA +9 -6
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/RECORD +30 -25
- lemonade_server/cli.py +103 -14
- lemonade_server/model_manager.py +186 -45
- lemonade_server/pydantic_models.py +25 -1
- lemonade_server/server_models.json +162 -62
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.11.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py
CHANGED
@@ -10,11 +10,13 @@ import traceback
 from typing import Optional, Union
 import json
 from pathlib import Path
+import os

-from fastapi import FastAPI, HTTPException, status, Request
+from fastapi import FastAPI, HTTPException, status, Request, WebSocket
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
+from starlette.websockets import WebSocketDisconnect, WebSocketState
 import uvicorn
 from uvicorn.config import Config
 from uvicorn.server import Server as UvicornServer
@@ -48,6 +50,7 @@ from openai.types.responses import (
 import lemonade.api as lemonade_api
 from lemonade.tools.server.wrapped_server import WrappedServer
 from lemonade.tools.server.llamacpp import LlamaServer
+from lemonade.tools.server.flm import FlmServer
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.webapp import get_webapp_html
 from lemonade.tools.server.utils.port import lifespan
@@ -75,12 +78,62 @@ from lemonade_server.settings import save_setting
 # Tests should use the max_new_tokens argument to set a lower value
 DEFAULT_MAX_NEW_TOKENS = 1500

-
-if platform.system() == "Windows":
+if platform.system() in ["Windows", "Darwin"]:
     # pylint: disable=ungrouped-imports
     from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


+class WebsocketTextFilter(logging.Filter):
+    def filter(self, record: logging.LogRecord) -> bool:
+        # Only allow logs that don't include "> TEXT"
+        return "> TEXT" not in record.getMessage()
+
+
+async def log_streamer(websocket: WebSocket, path: str, interval: float = 1.0):
+    logger = logging.getLogger()
+    await websocket.accept()
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            f.seek(0, os.SEEK_END)  # start at end
+            while True:
+                # Try reading a line
+                line = f.readline()
+                if not line:
+                    await asyncio.sleep(interval)
+                    continue
+
+                # Send defensively: if disconnected, bail out
+                if websocket.application_state != WebSocketState.CONNECTED:
+                    # Server-side state says we're not connected anymore
+                    break
+
+                try:
+                    await websocket.send_text(line)
+                except WebSocketDisconnect:
+                    # Client closed — normal path out
+                    break
+                except RuntimeError as re:
+                    # Starlette will raise this if a close has already been sent
+                    logger.debug("RuntimeError during send: %s", re)
+                    break
+
+    except WebSocketDisconnect:
+        # Client closed the socket; do not try to send or close again
+        pass
+    except Exception as e:  # pylint: disable=broad-except
+        # Log server-side; do not attempt to send error over a possibly closed socket
+        logger.exception("Error in log_streamer: %s", e)
+    finally:
+        # Only close if Starlette still thinks we're connected.
+        # This prevents "Cannot call send once a close message has been sent."
+        try:
+            if websocket.application_state == WebSocketState.CONNECTED:
+                await websocket.close()
+        except Exception:  # pylint: disable=broad-except
+            # If close itself races, swallow — we're shutting down anyway.
+            pass
+
+
 class ServerModel(Model):
     """
     An extension of OpenAI's Model class that adds
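The `WebsocketTextFilter` is presumably there to keep the new log websocket from feeding on itself: once log lines are pushed over a websocket, the framework's frame tracing would log each outgoing `> TEXT` frame, which would then be streamed again. A minimal standalone sketch of the filter's effect (not part of the package; the log messages below are invented):

```python
import logging


class WebsocketTextFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        # Returning False drops the record before any handler emits it
        return "> TEXT" not in record.getMessage()


handler = logging.StreamHandler()
handler.addFilter(WebsocketTextFilter())
logging.basicConfig(level=logging.DEBUG, handlers=[handler], force=True)

logging.debug("GET /api/v1/models HTTP/1.1 200")        # printed
logging.debug("('127.0.0.1', 52514) - > TEXT '...'")    # dropped by the filter
```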
@@ -204,6 +257,12 @@ class Server:
             allow_headers=["*"],  # Allows all headers
         )

+        # Set up debug middleware if debug logging is enabled
+        # This must be done during app initialization, not at runtime
+        self.debug_logging_enabled = log_level == "debug"
+        if self.debug_logging_enabled:
+            self.setup_middleware_timer()
+
         # Set up custom routes
         self.setup_routes(["/api/v0", "/api/v1"])

@@ -265,6 +324,7 @@ class Server:
             self.app.post(f"{prefix}/completions")(self.completions)
             self.app.post(f"{prefix}/responses")(self.responses)
             self.app.post(f"{prefix}/log-level")(self.set_log_level)
+            self.app.websocket(f"{prefix}/logs/ws")(self.logs_ws)

             # OpenAI-compatible routes
             self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
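Together with `log_streamer` above, this exposes a live tail of the server log at `<prefix>/logs/ws`. A hedged client-side sketch (not from the package) using the third-party `websockets` library; the host, port, and `/api/v1` prefix are assumptions that must match your running server:

```python
import asyncio

import websockets  # pip install websockets


async def tail_logs() -> None:
    # The server closes the socket with code 4000 if no log file exists yet.
    uri = "ws://localhost:8000/api/v1/logs/ws"
    async with websockets.connect(uri) as ws:
        while True:
            line = await ws.recv()  # one appended log line per message
            print(line, end="")


if __name__ == "__main__":
    asyncio.run(tail_logs())
```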
@@ -394,11 +454,13 @@ class Server:
         )
         file_handler.setLevel(logging_level)
         file_handler.setFormatter(uvicorn_formatter)
+        file_handler.addFilter(WebsocketTextFilter())

         # Set up console handler
         console_handler = logging.StreamHandler()
         console_handler.setLevel(logging_level)
         console_handler.setFormatter(uvicorn_formatter)
+        console_handler.addFilter(WebsocketTextFilter())

         # Configure root logger with both handlers
         logging.basicConfig(
@@ -421,10 +483,6 @@ class Server:
             ).run()
             sys.exit(0)

-        if self.debug_logging_enabled:
-            # Print the elapsed time for each request
-            self.setup_middleware_timer()
-
         # Let the app know what port it's running on, so
         # that the lifespan can access it
         self.app.port = self.port
@@ -521,7 +579,9 @@ class Server:

         return lc

-    async def completions(
+    async def completions(
+        self, completion_request: CompletionRequest, request: Request
+    ):
         """
         Stream completion responses using HTTP chunked transfer encoding.
         """
@@ -534,7 +594,7 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             return self.wrapped_server.completion(completion_request)

         # Check if the model supports reasoning
@@ -573,29 +633,43 @@ class Server:
             # This is necessary because the variable is modified
             # in the inner function
             nonlocal reasoning_first_token
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

- [7 lines not captured in this diff view]
+                    choice = CompletionChoice(
+                        text=(
+                            "<think>" + token if reasoning_first_token else token
+                        ),
+                        index=0,
+                        finish_reason="stop",
+                        logprobs=None,
+                    )

- [7 lines not captured in this diff view]
+                    completion = Completion(
+                        id="0",
+                        choices=[choice],
+                        model=self.llm_loaded.checkpoint,
+                        object="text_completion",
+                        created=int(time.time()),
+                    )

- [3 lines not captured in this diff view]
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {completion.model_dump_json()}\n\n".encode(
+                        "utf-8"
+                    )

- [2 lines not captured in this diff view]
+                # Send the [DONE] marker only if still connected
+                if not await request.is_disconnected():
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                # Propagate cancellation to the generator loop
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
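The practical effect of the new `request.is_disconnected()` checks is that abandoning the HTTP stream now stops token generation instead of letting it run to completion in the background. A rough client-side sketch (not from the package) using `httpx`; the model name and port are placeholders:

```python
import json

import httpx


def stream_completion(prompt: str, max_chunks: int = 5) -> None:
    payload = {"model": "some-model", "prompt": prompt, "stream": True}
    with httpx.stream(
        "POST", "http://localhost:8000/api/v1/completions", json=payload, timeout=None
    ) as response:
        for i, line in enumerate(response.iter_lines()):
            if not line.startswith("data: ") or line == "data: [DONE]":
                continue
            chunk = json.loads(line[len("data: "):])
            print(chunk["choices"][0]["text"], end="", flush=True)
            if i >= max_chunks:
                # Leaving the `with` block closes the connection; the server's
                # is_disconnected() check then sets stop_event and halts generation.
                break
```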
@@ -653,7 +727,9 @@ class Server:
             created=int(time.time()),
         )

-    async def chat_completions(
+    async def chat_completions(
+        self, chat_completion_request: ChatCompletionRequest, request: Request
+    ):
         """
         Stream chat completion responses using HTTP chunked transfer encoding.
         """
@@ -669,7 +745,7 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             return self.wrapped_server.chat_completion(chat_completion_request)

         # Convert chat messages to text using the model's chat template
@@ -731,69 +807,80 @@ class Server:

             # Keep track of the full response for tool call extraction
             full_response = ""
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

- [3 lines not captured in this diff view]
-                if chat_completion_request.tools:
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:

- [2 lines not captured in this diff view]
+                        # Append the token to the full response
+                        full_response += token

- [4 lines not captured in this diff view]
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
+                        )

- [14 lines not captured in this diff view]
+                        # If there are tool calls, reset the full response for the next call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(
+                                                tool_call["arguments"]
+                                            ),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
                                 )
-                            )

- [14 lines not captured in this diff view]
+                    # Create a ChatCompletionChunk
+                    chunk = ChatCompletionChunk.model_construct(
+                        id="0",
+                        object="chat.completion.chunk",
+                        created=int(time.time()),
+                        model=self.llm_loaded.checkpoint,
+                        choices=[
+                            Choice.model_construct(
+                                index=0,
+                                delta=ChoiceDelta(
+                                    content=(
+                                        "<think>" + token
+                                        if reasoning_first_token
+                                        else token
+                                    ),
+                                    function_call=None,
+                                    role="assistant",
+                                    tool_calls=openai_tool_calls,
+                                    refusal=None,
                                 ),
- [5 lines not captured in this diff view]
-                            finish_reason=None,
-                            logprobs=None,
-                        )
-                    ],
-                )
+                                finish_reason=None,
+                                logprobs=None,
+                            )
+                        ],
+                    )

- [3 lines not captured in this diff view]
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")

- [2 lines not captured in this diff view]
+                # Send the [DONE] marker only if still connected
+                if not await request.is_disconnected():
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
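Since the tool calls are surfaced through standard `ChoiceDeltaToolCall` objects, any OpenAI-compatible client can pick them up from the stream. A hedged sketch with the `openai` package; the base URL, model name, and the example tool are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="none")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

stream = client.chat.completions.create(
    model="some-model",
    messages=[{"role": "user", "content": "What's the weather in Lisbon?"}],
    tools=tools,
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.tool_calls:
        # Populated when extract_tool_calls() finds a call in the generated text
        for call in delta.tool_calls:
            print(call.function.name, call.function.arguments)
    elif delta.content:
        print(delta.content, end="", flush=True)
```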
@@ -952,7 +1039,7 @@ class Server:
             formatted_messages.append(f"{role_marker}\n{content} <|end|>")
         return "\n".join(formatted_messages) + "\n<|assistant|>"

-    async def responses(self, responses_request: ResponsesRequest):
+    async def responses(self, responses_request: ResponsesRequest, request: Request):
         """
         Stream responses using HTTP chunked transfer encoding.
         """
@@ -965,6 +1052,12 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

+        if self.llm_loaded.recipe == "llamacpp":
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Responses API not supported for recipe: {self.llm_loaded.recipe}",
+            )
+
         # Convert chat messages to text using the model's chat template
         if isinstance(responses_request.input, str):
             text = responses_request.input
@@ -1018,56 +1111,71 @@ class Server:

             full_response = "<think>" if reasoning_first_token else ""

-
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

- [10 lines not captured in this diff view]
+                    # Create an event
+                    delta_event = ResponseTextDeltaEvent(
+                        content_index=0,
+                        delta=(
+                            "<think>" + token if reasoning_first_token else token
+                        ),
+                        item_id="0 ",
+                        output_index=0,
+                        type="response.output_text.delta",
+                        sequence_number=0,
+                    )
+                    full_response += token

- [3 lines not captured in this diff view]
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {delta_event.model_dump_json()}\n\n".encode(
+                        "utf-8"
+                    )

- [8 lines not captured in this diff view]
+                # Send the completed event (only if still connected)
+                if not await request.is_disconnected():
+                    response_output_message = ResponseOutputMessage(
+                        id="0",
+                        content=[
+                            ResponseOutputText(
+                                annotations=[],
+                                text=full_response,
+                                type="output_text",
+                            )
+                        ],
+                        role="assistant",
+                        status="completed",
+                        type="message",
+                    )
+                    response = Response(
+                        id="0",
+                        model=self.llm_loaded.checkpoint,
+                        created_at=int(time.time()),
+                        object="response",
+                        output=[response_output_message],
+                        parallel_tool_calls=True,
+                        tool_choice="auto",
+                        tools=[],
+                    )
+                    completed_event = ResponseCompletedEvent(
+                        response=response,
+                        type="response.completed",
+                        sequence_number=0,
+                    )
+                    yield f"data: {completed_event.model_dump_json()}\n\n".encode(
+                        "utf-8"
                     )
-            ],
-            role="assistant",
-            status="completed",
-            type="message",
-        )
-        response = Response(
-            id="0",
-            model=self.llm_loaded.checkpoint,
-            created_at=int(time.time()),
-            object="response",
-            output=[response_output_message],
-            parallel_tool_calls=True,
-            tool_choice="auto",
-            tools=[],
-        )
-        completed_event = ResponseCompletedEvent(
-            response=response,
-            type="response.completed",
-            sequence_number=0,
-        )
-        yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")

- [2 lines not captured in this diff view]
+                    # Send the [DONE] marker
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
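The event types built above (`response.output_text.delta` followed by `response.completed`) match what the `openai` client expects when streaming the Responses API, so consuming this endpoint can look like the sketch below. The base URL and model name are placeholders, and the 422 added earlier means this path is not available for the llamacpp recipe:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="none")

stream = client.responses.create(
    model="some-model",
    input="Write a haiku about code reviews.",
    stream=True,
)

for event in stream:
    if event.type == "response.output_text.delta":
        print(event.delta, end="", flush=True)
    elif event.type == "response.completed":
        # event.response.output carries the full assembled message
        print()
```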
@@ -1312,8 +1420,10 @@ class Server:
         """
         Send performance statistics to the client.
         """
-        # If using
-        if self.llm_loaded and
+        # If using wrapped server, get telemetry from the telemetry instance
+        if self.llm_loaded and (
+            self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm"
+        ):
             return self.wrapped_server.telemetry.get_telemetry_data()

         # For built-in server, use the existing telemetry
@@ -1414,6 +1524,7 @@ class Server:
             checkpoint=config.checkpoint,
             recipe=config.recipe,
             reasoning=config.reasoning,
+            vision=config.vision,
             mmproj=config.mmproj,
             # The pull endpoint will download an upgraded model if available, even
             # if we already have a local copy of the model
@@ -1493,8 +1604,8 @@ class Server:
         ):
             if (
                 self.llm_loaded.recipe == "llamacpp"
- [1 line not captured in this diff view]
-            ):
+                or self.llm_loaded.recipe == "flm"
+            ) and self.wrapped_server.process.poll():
                 # wrapped server process has gone away for some reason, so we should
                 # proceed with loading to get it back
                 pass
@@ -1518,6 +1629,14 @@ class Server:
                 do_not_upgrade=True,
             )

+        elif config_to_use.recipe == "flm":
+            self.wrapped_server = FlmServer()
+            self.wrapped_server.load(
+                model_config=config_to_use,
+                ctx_size=self.ctx_size,
+                do_not_upgrade=True,
+            )
+
         else:
             self.model, self.tokenizer = lemonade_api.from_pretrained(
                 checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
@@ -1554,7 +1673,7 @@ class Server:
         for _ in range(self.max_concurrent_generations):
             await self._generate_semaphore.acquire()

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             self.wrapped_server.process.terminate()

         self.llm_loaded = None
@@ -1657,6 +1776,12 @@ class Server:
             logging.debug(f"Total request time: {request_time:.4f} seconds")
             return response

+    async def logs_ws(self, websocket: WebSocket):
+        if not self.log_file or not os.path.exists(self.log_file):
+            await websocket.close(code=4000)
+            return
+        await log_streamer(websocket, self.log_file)
+

 # This file was originally licensed under Apache 2.0. It has been modified.
 # Modifications Copyright (c) 2025 AMD