lemonade-sdk 8.1.9__py3-none-any.whl → 8.1.11__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of lemonade-sdk might be problematic; additional details are available on the registry page.

Files changed (33)
  1. lemonade/common/inference_engines.py +13 -4
  2. lemonade/common/system_info.py +570 -1
  3. lemonade/tools/flm/__init__.py +1 -0
  4. lemonade/tools/flm/utils.py +255 -0
  5. lemonade/tools/llamacpp/utils.py +62 -13
  6. lemonade/tools/server/flm.py +137 -0
  7. lemonade/tools/server/llamacpp.py +23 -5
  8. lemonade/tools/server/serve.py +292 -135
  9. lemonade/tools/server/static/js/chat.js +165 -82
  10. lemonade/tools/server/static/js/models.js +87 -54
  11. lemonade/tools/server/static/js/shared.js +5 -3
  12. lemonade/tools/server/static/logs.html +47 -0
  13. lemonade/tools/server/static/styles.css +159 -8
  14. lemonade/tools/server/static/webapp.html +28 -10
  15. lemonade/tools/server/tray.py +158 -38
  16. lemonade/tools/server/utils/macos_tray.py +226 -0
  17. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  18. lemonade/tools/server/webapp.py +4 -1
  19. lemonade/tools/server/wrapped_server.py +91 -25
  20. lemonade/version.py +1 -1
  21. lemonade_install/install.py +25 -2
  22. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/METADATA +9 -6
  23. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/RECORD +33 -28
  24. lemonade_server/cli.py +105 -14
  25. lemonade_server/model_manager.py +186 -45
  26. lemonade_server/pydantic_models.py +25 -1
  27. lemonade_server/server_models.json +162 -62
  28. lemonade_server/settings.py +39 -39
  29. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/WHEEL +0 -0
  30. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/entry_points.txt +0 -0
  31. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/LICENSE +0 -0
  32. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/licenses/NOTICE.md +0 -0
  33. {lemonade_sdk-8.1.9.dist-info → lemonade_sdk-8.1.11.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py

@@ -10,11 +10,13 @@ import traceback
 from typing import Optional, Union
 import json
 from pathlib import Path
+import os

-from fastapi import FastAPI, HTTPException, status, Request
+from fastapi import FastAPI, HTTPException, status, Request, WebSocket
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
+from starlette.websockets import WebSocketDisconnect, WebSocketState
 import uvicorn
 from uvicorn.config import Config
 from uvicorn.server import Server as UvicornServer
@@ -48,6 +50,7 @@ from openai.types.responses import (
 import lemonade.api as lemonade_api
 from lemonade.tools.server.wrapped_server import WrappedServer
 from lemonade.tools.server.llamacpp import LlamaServer
+from lemonade.tools.server.flm import FlmServer
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.webapp import get_webapp_html
 from lemonade.tools.server.utils.port import lifespan
@@ -75,12 +78,62 @@ from lemonade_server.settings import save_setting
 # Tests should use the max_new_tokens argument to set a lower value
 DEFAULT_MAX_NEW_TOKENS = 1500

-# Only import tray on Windows
-if platform.system() == "Windows":
+if platform.system() in ["Windows", "Darwin"]:
     # pylint: disable=ungrouped-imports
     from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


+class WebsocketTextFilter(logging.Filter):
+    def filter(self, record: logging.LogRecord) -> bool:
+        # Only allow logs that don't include "> TEXT"
+        return "> TEXT" not in record.getMessage()
+
+
+async def log_streamer(websocket: WebSocket, path: str, interval: float = 1.0):
+    logger = logging.getLogger()
+    await websocket.accept()
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            f.seek(0, os.SEEK_END)  # start at end
+            while True:
+                # Try reading a line
+                line = f.readline()
+                if not line:
+                    await asyncio.sleep(interval)
+                    continue
+
+                # Send defensively: if disconnected, bail out
+                if websocket.application_state != WebSocketState.CONNECTED:
+                    # Server-side state says we're not connected anymore
+                    break
+
+                try:
+                    await websocket.send_text(line)
+                except WebSocketDisconnect:
+                    # Client closed — normal path out
+                    break
+                except RuntimeError as re:
+                    # Starlette will raise this if a close has already been sent
+                    logger.debug("RuntimeError during send: %s", re)
+                    break
+
+    except WebSocketDisconnect:
+        # Client closed the socket; do not try to send or close again
+        pass
+    except Exception as e:  # pylint: disable=broad-except
+        # Log server-side; do not attempt to send error over a possibly closed socket
+        logger.exception("Error in log_streamer: %s", e)
+    finally:
+        # Only close if Starlette still thinks we're connected.
+        # This prevents "Cannot call send once a close message has been sent."
+        try:
+            if websocket.application_state == WebSocketState.CONNECTED:
+                await websocket.close()
+        except Exception:  # pylint: disable=broad-except
+            # If close itself races, swallow — we're shutting down anyway.
+            pass
+
+
 class ServerModel(Model):
     """
     An extension of OpenAI's Model class that adds
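The WebsocketTextFilter above is attached to both log handlers later in this diff (see the addFilter calls in the logging setup hunk), apparently so that per-frame websocket send traces, which contain "> TEXT", are not written back into the very log file that log_streamer tails and re-sends over the socket. A minimal, self-contained sketch of how such a logging.Filter behaves; the handler wiring below is illustrative only and not lemonade's own setup:

import logging


class DropWebsocketFrames(logging.Filter):
    # Mirrors WebsocketTextFilter: reject records whose message contains "> TEXT"
    def filter(self, record: logging.LogRecord) -> bool:
        return "> TEXT" not in record.getMessage()


handler = logging.StreamHandler()
handler.addFilter(DropWebsocketFrames())

logger = logging.getLogger("demo")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info("regular log line")                     # emitted
logger.info("connection open > TEXT 'x' [1 byte]")  # suppressed by the filter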
@@ -164,6 +217,7 @@ class Server:
     - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
     - /api/v1/responses: responses API using HTTP chunked transfer encoding.
     - /api/v1/models: list all available models.
+    - /api/v1/models/{model_id}: retrieve a specific model by ID.
     """

     def __init__(
@@ -203,6 +257,12 @@ class Server:
             allow_headers=["*"],  # Allows all headers
         )

+        # Set up debug middleware if debug logging is enabled
+        # This must be done during app initialization, not at runtime
+        self.debug_logging_enabled = log_level == "debug"
+        if self.debug_logging_enabled:
+            self.setup_middleware_timer()
+
         # Set up custom routes
         self.setup_routes(["/api/v0", "/api/v1"])

@@ -264,11 +324,13 @@ class Server:
             self.app.post(f"{prefix}/completions")(self.completions)
             self.app.post(f"{prefix}/responses")(self.responses)
             self.app.post(f"{prefix}/log-level")(self.set_log_level)
+            self.app.websocket(f"{prefix}/logs/ws")(self.logs_ws)

             # OpenAI-compatible routes
             self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
             self.app.post(f"{prefix}/embeddings")(self.embeddings)
             self.app.get(f"{prefix}/models")(self.models)
+            self.app.get(f"{prefix}/models/{{model_id}}")(self.retrieve_model)

             # JinaAI routes (jina.ai/reranker/)
             self.app.post(f"{prefix}/reranking")(self.reranking)
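Two routes are new here: a WebSocket endpoint at {prefix}/logs/ws (handled by logs_ws, added at the bottom of this diff) and an OpenAI-style GET {prefix}/models/{model_id} (handled by retrieve_model, defined further down). A hedged sketch of calling the latter with the official openai client; the base URL, port, and model ID below are placeholders, not values taken from this release:

from openai import OpenAI, NotFoundError

# Placeholder address; point this at wherever the Lemonade server is listening.
client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="unused")

try:
    # Issues GET /api/v1/models/{model_id}
    model = client.models.retrieve("some-installed-model")
    print(model.id, model.owned_by, model.created)
except NotFoundError:
    # retrieve_model responds with a 404 and an OpenAI-style error body for unknown IDs
    print("model is not installed on this server")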
@@ -392,11 +454,13 @@ class Server:
         )
         file_handler.setLevel(logging_level)
         file_handler.setFormatter(uvicorn_formatter)
+        file_handler.addFilter(WebsocketTextFilter())

         # Set up console handler
         console_handler = logging.StreamHandler()
         console_handler.setLevel(logging_level)
         console_handler.setFormatter(uvicorn_formatter)
+        console_handler.addFilter(WebsocketTextFilter())

         # Configure root logger with both handlers
         logging.basicConfig(
@@ -419,10 +483,6 @@ class Server:
             ).run()
             sys.exit(0)

-        if self.debug_logging_enabled:
-            # Print the elapsed time for each request
-            self.setup_middleware_timer()
-
         # Let the app know what port it's running on, so
         # that the lifespan can access it
         self.app.port = self.port
@@ -519,7 +579,9 @@ class Server:

         return lc

-    async def completions(self, completion_request: CompletionRequest):
+    async def completions(
+        self, completion_request: CompletionRequest, request: Request
+    ):
         """
         Stream completion responses using HTTP chunked transfer encoding.
         """
@@ -532,7 +594,7 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             return self.wrapped_server.completion(completion_request)

         # Check if the model supports reasoning
@@ -571,29 +633,43 @@ class Server:
             # This is necessary because the variable is modified
             # in the inner function
             nonlocal reasoning_first_token
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-            async for token in self._generate_tokens(**generation_args):
-                choice = CompletionChoice(
-                    text=("<think>" + token if reasoning_first_token else token),
-                    index=0,
-                    finish_reason="stop",
-                    logprobs=None,
-                )
+                    choice = CompletionChoice(
+                        text=(
+                            "<think>" + token if reasoning_first_token else token
+                        ),
+                        index=0,
+                        finish_reason="stop",
+                        logprobs=None,
+                    )

-                completion = Completion(
-                    id="0",
-                    choices=[choice],
-                    model=self.llm_loaded.checkpoint,
-                    object="text_completion",
-                    created=int(time.time()),
-                )
+                    completion = Completion(
+                        id="0",
+                        choices=[choice],
+                        model=self.llm_loaded.checkpoint,
+                        object="text_completion",
+                        created=int(time.time()),
+                    )

-                # Format as SSE
-                reasoning_first_token = False
-                yield f"data: {completion.model_dump_json()}\n\n".encode("utf-8")
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {completion.model_dump_json()}\n\n".encode(
+                        "utf-8"
+                    )

-            # Send the [DONE] marker
-            yield b"data: [DONE]\n\n"
+                # Send the [DONE] marker only if still connected
+                if not await request.is_disconnected():
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                # Propagate cancellation to the generator loop
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
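The completions generator now takes the raw Request, polls request.is_disconnected() between tokens, sets self.stop_event and stops early when the client goes away, and only emits the final data: [DONE] frame to a connection that is still open; asyncio.CancelledError gets the same treatment. A rough client-side illustration of the case this guards against, using requests; the URL, port, model name, and payload values are assumptions rather than part of this diff:

import requests

url = "http://localhost:8000/api/v1/completions"  # assumed local address
payload = {
    "model": "some-installed-model",  # placeholder model ID
    "prompt": "Write a very long story about a lighthouse.",
    "stream": True,
}

# Abandon the SSE stream after a few chunks. Closing the connection is what makes
# request.is_disconnected() return True on the server, which now stops token
# generation instead of streaming into a dead socket.
with requests.post(url, json=payload, stream=True, timeout=60) as resp:
    for i, line in enumerate(resp.iter_lines()):
        if line:
            print(line.decode("utf-8"))
        if i > 10:
            break  # leaving the with-block closes the connection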
@@ -651,7 +727,9 @@ class Server:
             created=int(time.time()),
         )

-    async def chat_completions(self, chat_completion_request: ChatCompletionRequest):
+    async def chat_completions(
+        self, chat_completion_request: ChatCompletionRequest, request: Request
+    ):
         """
         Stream chat completion responses using HTTP chunked transfer encoding.
         """
@@ -667,7 +745,7 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             return self.wrapped_server.chat_completion(chat_completion_request)

         # Convert chat messages to text using the model's chat template
@@ -729,69 +807,80 @@ class Server:

             # Keep track of the full response for tool call extraction
             full_response = ""
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-            async for token in self._generate_tokens(**generation_args):
-                # Continuously look for tool calls embedded into the generated text
-                openai_tool_calls = None
-                if chat_completion_request.tools:
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:

-                    # Append the token to the full response
-                    full_response += token
+                        # Append the token to the full response
+                        full_response += token

-                    tool_calls, _ = extract_tool_calls(
-                        full_response,
-                        tool_call_pattern,
-                    )
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
+                        )

-                    # If there are tool calls, reset the full response for the next tool call
-                    if tool_calls:
-                        openai_tool_calls = []
-                        full_response = ""
-                        for tool_call in tool_calls:
-                            openai_tool_calls.append(
-                                ChoiceDeltaToolCall(
-                                    index=0,
-                                    id="-",
-                                    function=ChoiceDeltaToolCallFunction(
-                                        arguments=json.dumps(tool_call["arguments"]),
-                                        name=tool_call["name"],
-                                    ),
-                                    type="function",
+                        # If there are tool calls, reset the full response for the next call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(
+                                                tool_call["arguments"]
+                                            ),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
                                 )
-                            )

-                # Create a ChatCompletionChunk
-                chunk = ChatCompletionChunk.model_construct(
-                    id="0",
-                    object="chat.completion.chunk",
-                    created=int(time.time()),
-                    model=self.llm_loaded.checkpoint,
-                    choices=[
-                        Choice.model_construct(
-                            index=0,
-                            delta=ChoiceDelta(
-                                content=(
-                                    "<think>" + token
-                                    if reasoning_first_token
-                                    else token
+                    # Create a ChatCompletionChunk
+                    chunk = ChatCompletionChunk.model_construct(
+                        id="0",
+                        object="chat.completion.chunk",
+                        created=int(time.time()),
+                        model=self.llm_loaded.checkpoint,
+                        choices=[
+                            Choice.model_construct(
+                                index=0,
+                                delta=ChoiceDelta(
+                                    content=(
+                                        "<think>" + token
+                                        if reasoning_first_token
+                                        else token
+                                    ),
+                                    function_call=None,
+                                    role="assistant",
+                                    tool_calls=openai_tool_calls,
+                                    refusal=None,
                                 ),
-                                function_call=None,
-                                role="assistant",
-                                tool_calls=openai_tool_calls,
-                                refusal=None,
-                            ),
-                            finish_reason=None,
-                            logprobs=None,
-                        )
-                    ],
-                )
+                                finish_reason=None,
+                                logprobs=None,
+                            )
+                        ],
+                    )

-                # Format as SSE
-                reasoning_first_token = False
-                yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")

-            # Send the [DONE] marker
-            yield b"data: [DONE]\n\n"
+                # Send the [DONE] marker only if still connected
+                if not await request.is_disconnected():
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
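chat_completions gets the same treatment: the tool-call extraction and chunk-building loop moves under a try block, generation halts on client disconnect or cancellation, and [DONE] is only sent to a live connection. For reference, a minimal streaming consumer written against the OpenAI-compatible surface; the base URL and model name are placeholders:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="unused")

stream = client.chat.completions.create(
    model="some-installed-model",  # placeholder model ID
    messages=[{"role": "user", "content": "Say hello in three languages."}],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)
    if delta.tool_calls:
        # Populated from the ChoiceDeltaToolCall objects built in the loop above
        print(delta.tool_calls)
print()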
@@ -950,7 +1039,7 @@ class Server:
             formatted_messages.append(f"{role_marker}\n{content} <|end|>")
         return "\n".join(formatted_messages) + "\n<|assistant|>"

-    async def responses(self, responses_request: ResponsesRequest):
+    async def responses(self, responses_request: ResponsesRequest, request: Request):
         """
         Stream responses using HTTP chunked transfer encoding.
         """
@@ -963,6 +1052,12 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

+        if self.llm_loaded.recipe == "llamacpp":
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Responses API not supported for recipe: {self.llm_loaded.recipe}",
+            )
+
         # Convert chat messages to text using the model's chat template
         if isinstance(responses_request.input, str):
             text = responses_request.input
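With this check, calling the Responses API against a model whose recipe is llamacpp now fails fast with HTTP 422 instead of attempting an unsupported code path. Roughly what a client would observe, assuming a recent openai package that exposes client.responses; the base URL and model name are placeholders:

from openai import OpenAI, UnprocessableEntityError

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="unused")

try:
    client.responses.create(
        model="some-llamacpp-model",  # placeholder for a model served via llama.cpp
        input="Hello there",
    )
except UnprocessableEntityError as err:
    # Surfaces the 422 raised above: "Responses API not supported for recipe: llamacpp"
    print(err)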
@@ -1016,56 +1111,71 @@ class Server:

             full_response = "<think>" if reasoning_first_token else ""

-            async for token in self._generate_tokens(**generation_args):
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-                # Create an event
-                delta_event = ResponseTextDeltaEvent(
-                    content_index=0,
-                    delta=("<think>" + token if reasoning_first_token else token),
-                    item_id="0 ",
-                    output_index=0,
-                    type="response.output_text.delta",
-                    sequence_number=0,
-                )
-                full_response += token
+                    # Create an event
+                    delta_event = ResponseTextDeltaEvent(
+                        content_index=0,
+                        delta=(
+                            "<think>" + token if reasoning_first_token else token
+                        ),
+                        item_id="0 ",
+                        output_index=0,
+                        type="response.output_text.delta",
+                        sequence_number=0,
+                    )
+                    full_response += token

-                # Format as SSE
-                reasoning_first_token = False
-                yield f"data: {delta_event.model_dump_json()}\n\n".encode("utf-8")
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {delta_event.model_dump_json()}\n\n".encode(
+                        "utf-8"
+                    )

-            # Send the completed event
-            response_output_message = ResponseOutputMessage(
-                id="0",
-                content=[
-                    ResponseOutputText(
-                        annotations=[],
-                        text=full_response,
-                        type="output_text",
+                # Send the completed event (only if still connected)
+                if not await request.is_disconnected():
+                    response_output_message = ResponseOutputMessage(
+                        id="0",
+                        content=[
+                            ResponseOutputText(
+                                annotations=[],
+                                text=full_response,
+                                type="output_text",
+                            )
+                        ],
+                        role="assistant",
+                        status="completed",
+                        type="message",
+                    )
+                    response = Response(
+                        id="0",
+                        model=self.llm_loaded.checkpoint,
+                        created_at=int(time.time()),
+                        object="response",
+                        output=[response_output_message],
+                        parallel_tool_calls=True,
+                        tool_choice="auto",
+                        tools=[],
+                    )
+                    completed_event = ResponseCompletedEvent(
+                        response=response,
+                        type="response.completed",
+                        sequence_number=0,
+                    )
+                    yield f"data: {completed_event.model_dump_json()}\n\n".encode(
+                        "utf-8"
                     )
-                ],
-                role="assistant",
-                status="completed",
-                type="message",
-            )
-            response = Response(
-                id="0",
-                model=self.llm_loaded.checkpoint,
-                created_at=int(time.time()),
-                object="response",
-                output=[response_output_message],
-                parallel_tool_calls=True,
-                tool_choice="auto",
-                tools=[],
-            )
-            completed_event = ResponseCompletedEvent(
-                response=response,
-                type="response.completed",
-                sequence_number=0,
-            )
-            yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")

-            # Send the [DONE] marker
-            yield b"data: [DONE]\n\n"
+                    # Send the [DONE] marker
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
@@ -1310,8 +1420,10 @@ class Server:
         """
         Send performance statistics to the client.
         """
-        # If using llama server, get telemetry from the telemetry instance
-        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
+        # If using wrapped server, get telemetry from the telemetry instance
+        if self.llm_loaded and (
+            self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm"
+        ):
             return self.wrapped_server.telemetry.get_telemetry_data()

         # For built-in server, use the existing telemetry
@@ -1412,6 +1524,7 @@ class Server:
             checkpoint=config.checkpoint,
             recipe=config.recipe,
             reasoning=config.reasoning,
+            vision=config.vision,
             mmproj=config.mmproj,
             # The pull endpoint will download an upgraded model if available, even
             # if we already have a local copy of the model
@@ -1491,8 +1604,8 @@ class Server:
         ):
             if (
                 self.llm_loaded.recipe == "llamacpp"
-                and self.wrapped_server.process.poll()
-            ):
+                or self.llm_loaded.recipe == "flm"
+            ) and self.wrapped_server.process.poll():
                 # wrapped server process has gone away for some reason, so we should
                 # proceed with loading to get it back
                 pass
@@ -1516,6 +1629,14 @@ class Server:
                 do_not_upgrade=True,
             )

+        elif config_to_use.recipe == "flm":
+            self.wrapped_server = FlmServer()
+            self.wrapped_server.load(
+                model_config=config_to_use,
+                ctx_size=self.ctx_size,
+                do_not_upgrade=True,
+            )
+
         else:
             self.model, self.tokenizer = lemonade_api.from_pretrained(
                 checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
@@ -1552,7 +1673,7 @@ class Server:
         for _ in range(self.max_concurrent_generations):
             await self._generate_semaphore.acquire()

-        if self.llm_loaded.recipe == "llamacpp":
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
             self.wrapped_server.process.terminate()

         self.llm_loaded = None
@@ -1590,6 +1711,36 @@ class Server:

         return {"object": "list", "data": models_list}

+    async def retrieve_model(self, model_id: str):
+        """
+        Retrieve a specific model by ID in OpenAI-compatible format.
+        """
+        # Raise an error if the model does not exist
+        if model_id not in self.local_models:
+            # Mimic the error format of the OpenAI API
+            raise HTTPException(
+                status_code=404,
+                detail={
+                    "message": f"model {model_id} not found",
+                    "type": "api_error",
+                    "param": None,
+                    "code": None,
+                },
+            )
+
+        # Return the specific model
+        model_info = self.local_models[model_id]
+        model = ServerModel(
+            id=model_id,
+            owned_by="lemonade",
+            object="model",
+            created=int(time.time()),
+            checkpoint=model_info["checkpoint"],
+            recipe=model_info["recipe"],
+        )
+
+        return model
+
     def setup_middleware_timer(self):
         logging.info("Middleware set up")

@@ -1625,6 +1776,12 @@ class Server:
             logging.debug(f"Total request time: {request_time:.4f} seconds")
             return response

+    async def logs_ws(self, websocket: WebSocket):
+        if not self.log_file or not os.path.exists(self.log_file):
+            await websocket.close(code=4000)
+            return
+        await log_streamer(websocket, self.log_file)
+

 # This file was originally licensed under Apache 2.0. It has been modified.
 # Modifications Copyright (c) 2025 AMD
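Taken together with log_streamer and the {prefix}/logs/ws route registered earlier in this diff, logs_ws gives the web app a live tail of the server log over WebSocket, closing with code 4000 when no log file is available. A small client sketch using the third-party websockets package; the address is an assumption based on the route prefix and a typical local port:

import asyncio

import websockets  # pip install websockets


async def tail_logs() -> None:
    uri = "ws://localhost:8000/api/v1/logs/ws"  # assumed host/port; path from the route above
    async with websockets.connect(uri) as ws:
        # The server pushes one text frame per appended log line;
        # close code 4000 means it has no log file to stream.
        async for line in ws:
            print(line, end="")


asyncio.run(tail_logs())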