lemonade-sdk 7.0.0__py3-none-any.whl → 7.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

@@ -0,0 +1,294 @@
+ from pathlib import Path
+ import json
+ from fastapi.responses import HTMLResponse
+ from lemonade_server.model_manager import ModelManager
+
+
+ def get_instructions_html(port=8000):
+     """
+     Show instructions on how to use the server.
+     """
+     # Load server models from JSON
+     server_models_path = (
+         Path(__file__).parent.parent.parent.parent
+         / "lemonade_server"
+         / "server_models.json"
+     )
+     with open(server_models_path, "r", encoding="utf-8") as f:
+         server_models = json.load(f)
+
+     # Use shared filter function from model_manager.py
+     filtered_models = ModelManager().filter_models_by_backend(server_models)
+
+     # Pass filtered server_models to JS
+     server_models_js = (
+         f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
+     )
+
+     # New lemon-themed HTML structure
+     # pylint: disable=W1401
+     styled_html = f"""
+     <!DOCTYPE html>
+     <html lang=\"en\">
+     <head>
+         <meta charset=\"UTF-8\">
+         <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">
+         <title>Lemonade Server</title>
+         <link rel="icon" href="data:,">
+         <link rel=\"stylesheet\" href=\"/static/styles.css\">
+         <script>
+             window.SERVER_PORT = {port};
+         </script>
+         {server_models_js}
+     </head>
+     <body>
+         <nav class=\"navbar\">
+             <a href=\"https://github.com/lemonade-sdk/lemonade\">GitHub</a>
+             <a href=\"https://lemonade-server.ai/docs/\">Docs</a>
+             <a href=\"https://lemonade-server.ai/docs/server/server_models/\">Models</a>
+             <a href=\"https://lemonade-server.ai/docs/server/apps/\">Featured Apps</a>
+         </nav>
+         <main class=\"main\">
+             <div class=\"title\">🍋 Lemonade Server</div>
+             <div class=\"tab-container\">
+                 <div class=\"tabs\">
+                     <button class=\"tab active\" id=\"tab-chat\" onclick=\"showTab('chat')\">LLM Chat</button>
+                     <button class=\"tab\" id=\"tab-models\" onclick=\"showTab('models')\">Model Management</button>
+                 </div>
+                 <div class=\"tab-content active\" id=\"content-chat\">
+                     <div class=\"chat-container\">
+                         <div class=\"chat-history\" id=\"chat-history\"></div>
+                         <div class=\"chat-input-row\">
+                             <select id=\"model-select\"></select>
+                             <input type=\"text\" id=\"chat-input\" placeholder=\"Type your message...\" />
+                             <button id=\"send-btn\">Send</button>
+                         </div>
+                     </div>
+                 </div>
+                 <div class=\"tab-content\" id=\"content-models\">
+                     <div class=\"model-mgmt-container\">
+                         <div class=\"model-mgmt-pane\">
+                             <h3>Installed Models</h3>
+                             <table class=\"model-table\" id=\"installed-models-table\">
+                                 <colgroup><col style=\"width:100%\"></colgroup>
+                                 <tbody id=\"installed-models-tbody\"></tbody>
+                             </table>
+                         </div>
+                         <div class=\"model-mgmt-pane\">
+                             <h3>Suggested Models</h3>
+                             <table class=\"model-table\" id=\"suggested-models-table\">
+                                 <tbody id=\"suggested-models-tbody\"></tbody>
+                             </table>
+                         </div>
+                     </div>
+                 </div>
+             </div>
+         </main>
+         <footer class=\"site-footer\">
+             <div class=\"dad-joke\">When life gives you LLMs, make an LLM aide.</div>
+             <div class=\"copyright\">Copyright 2025 AMD</div>
+         </footer>
+         <script src=\"https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js\"></script>
+         <script>
+             // Tab switching logic
+             function showTab(tab) {{
+                 document.getElementById('tab-chat').classList.remove('active');
+                 document.getElementById('tab-models').classList.remove('active');
+                 document.getElementById('content-chat').classList.remove('active');
+                 document.getElementById('content-models').classList.remove('active');
+                 if (tab === 'chat') {{
+                     document.getElementById('tab-chat').classList.add('active');
+                     document.getElementById('content-chat').classList.add('active');
+                 }} else {{
+                     document.getElementById('tab-models').classList.add('active');
+                     document.getElementById('content-models').classList.add('active');
+                 }}
+             }}
+
+             // Helper to get server base URL
+             function getServerBaseUrl() {{
+                 const port = window.SERVER_PORT || 8000;
+                 return `http://localhost:${{port}}`;
+             }}
+
+             // Populate model dropdown from /api/v1/models endpoint
+             async function loadModels() {{
+                 try {{
+                     const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+                     const data = await resp.json();
+                     const select = document.getElementById('model-select');
+                     select.innerHTML = '';
+                     if (!data.data || !Array.isArray(data.data)) {{
+                         select.innerHTML = '<option>No models found (malformed response)</option>';
+                         return;
+                     }}
+                     if (data.data.length === 0) {{
+                         select.innerHTML = '<option>No models available</option>';
+                         return;
+                     }}
+                     let defaultIndex = 0;
+                     data.data.forEach(function(model, index) {{
+                         const modelId = model.id || model.name || model;
+                         const opt = document.createElement('option');
+                         opt.value = modelId;
+                         opt.textContent = modelId;
+                         if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {{
+                             defaultIndex = index;
+                         }}
+                         select.appendChild(opt);
+                     }});
+                     select.selectedIndex = defaultIndex;
+                 }} catch (e) {{
+                     const select = document.getElementById('model-select');
+                     select.innerHTML = `<option>Error loading models: ${{e.message}}</option>`;
+                     console.error('Error loading models:', e);
+                 }}
+             }}
+             loadModels();
+
149
+ // Model Management Tab Logic
150
+ async function refreshModelMgmtUI() {{
151
+ // Get installed models from /api/v1/models
152
+ let installed = [];
153
+ try {{
154
+ const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
155
+ const data = await resp.json();
156
+ if (data.data && Array.isArray(data.data)) {{
157
+ installed = data.data.map(m => m.id || m.name || m);
158
+ }}
159
+ }} catch (e) {{}}
160
+ // All models from server_models.json (window.SERVER_MODELS)
161
+ const allModels = window.SERVER_MODELS || {{}};
162
+ // Filter suggested models not installed
163
+ const suggested = Object.keys(allModels).filter(
164
+ k => allModels[k].suggested && !installed.includes(k)
165
+ );
166
+ // Render installed models as a table (two columns, second is invisible)
167
+ const installedTbody = document.getElementById('installed-models-tbody');
168
+ installedTbody.innerHTML = '';
169
+ installed.forEach(function(mid) {{
170
+ var tr = document.createElement('tr');
171
+ var tdName = document.createElement('td');
172
+ tdName.textContent = mid;
173
+ var tdEmpty = document.createElement('td');
174
+ tdEmpty.style.width = '0';
175
+ tdEmpty.style.padding = '0';
176
+ tdEmpty.style.border = 'none';
177
+ tr.appendChild(tdName);
178
+ tr.appendChild(tdEmpty);
179
+ installedTbody.appendChild(tr);
180
+ }});
181
+ // Render suggested models as a table
182
+ const suggestedTbody = document.getElementById('suggested-models-tbody');
183
+ suggestedTbody.innerHTML = '';
184
+ suggested.forEach(mid => {{
185
+ const tr = document.createElement('tr');
186
+ const tdName = document.createElement('td');
187
+ tdName.textContent = mid;
188
+ tdName.style.paddingRight = '1em';
189
+ tdName.style.verticalAlign = 'middle';
190
+ const tdBtn = document.createElement('td');
191
+ tdBtn.style.width = '1%';
192
+ tdBtn.style.verticalAlign = 'middle';
193
+ const btn = document.createElement('button');
194
+ btn.textContent = '+';
195
+ btn.title = 'Install model';
196
+ btn.onclick = async function() {{
197
+ btn.disabled = true;
198
+ btn.textContent = 'Installing...';
199
+ btn.classList.add('installing-btn');
200
+ try {{
201
+ await fetch(getServerBaseUrl() + '/api/v1/pull', {{
202
+ method: 'POST',
203
+ headers: {{ 'Content-Type': 'application/json' }},
204
+ body: JSON.stringify({{ model_name: mid }})
205
+ }});
206
+ await refreshModelMgmtUI();
207
+ await loadModels(); // update chat dropdown too
208
+ }} catch (e) {{
209
+ btn.textContent = 'Error';
210
+ }}
211
+ }};
212
+ tdBtn.appendChild(btn);
213
+ tr.appendChild(tdName);
214
+ tr.appendChild(tdBtn);
215
+ suggestedTbody.appendChild(tr);
216
+ }});
217
+ }}
218
+ // Initial load
219
+ refreshModelMgmtUI();
220
+ // Optionally, refresh when switching to the tab
221
+ document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
222
+
+             // Chat logic (streaming with OpenAI JS client placeholder)
+             const chatHistory = document.getElementById('chat-history');
+             const chatInput = document.getElementById('chat-input');
+             const sendBtn = document.getElementById('send-btn');
+             const modelSelect = document.getElementById('model-select');
+             let messages = [];
+
+             function appendMessage(role, text) {{
+                 const div = document.createElement('div');
+                 div.className = 'chat-message ' + role;
+                 // Add a bubble for iMessage style
+                 const bubble = document.createElement('div');
+                 bubble.className = 'chat-bubble ' + role;
+                 bubble.innerHTML = text;
+                 div.appendChild(bubble);
+                 chatHistory.appendChild(div);
+                 chatHistory.scrollTop = chatHistory.scrollHeight;
+             }}
+
+             async function sendMessage() {{
+                 const text = chatInput.value.trim();
+                 if (!text) return;
+                 appendMessage('user', text);
+                 messages.push({{ role: 'user', content: text }});
+                 chatInput.value = '';
+                 sendBtn.disabled = true;
+                 // Streaming OpenAI completions (placeholder, adapt as needed)
+                 let llmText = '';
+                 appendMessage('llm', '...');
+                 const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
+                 try {{
+                     // Use the correct endpoint for chat completions
+                     const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {{
+                         method: 'POST',
+                         headers: {{ 'Content-Type': 'application/json' }},
+                         body: JSON.stringify({{
+                             model: modelSelect.value,
+                             messages: messages,
+                             stream: true
+                         }})
+                     }});
+                     if (!resp.body) throw new Error('No stream');
+                     const reader = resp.body.getReader();
+                     let decoder = new TextDecoder();
+                     llmDiv.textContent = '';
+                     while (true) {{
+                         const {{ done, value }} = await reader.read();
+                         if (done) break;
+                         const chunk = decoder.decode(value);
+                         if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
+                         // Try to extract the content from the OpenAI chunk
+                         const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
+                         if (match && match[1]) {{
+                             llmText += match[1];
+                             llmDiv.textContent = llmText;
+                         }}
+                     }}
+                     messages.push({{ role: 'assistant', content: llmText }});
+                 }} catch (e) {{
+                     llmDiv.textContent = '[Error: ' + e.message + ']';
+                 }}
+                 sendBtn.disabled = false;
+             }}
+             sendBtn.onclick = sendMessage;
+             chatInput.addEventListener('keydown', function(e) {{
+                 if (e.key === 'Enter') sendMessage();
+             }});
+         </script>
+     </body>
+     </html>
+     """
+     return HTMLResponse(content=styled_html)
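
Editor's note: the new `get_instructions_html` helper returns a ready-to-serve `HTMLResponse`. A minimal sketch of how it could be mounted on a FastAPI route is shown below; the import path and route are assumptions for illustration, not taken from this diff.

```python
# Hypothetical wiring sketch; the module path and route are assumptions.
from fastapi import FastAPI

from lemonade.tools.server.instructions import get_instructions_html  # path assumed

app = FastAPI()


@app.get("/")
def home():
    # Serves the lemon-themed chat / model-management page at the server root
    return get_instructions_html(port=8000)
```
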
@@ -0,0 +1,289 @@
+ import sys
+ import os
+ import logging
+ import time
+ import subprocess
+ import zipfile
+ import re
+ import threading
+
+ import requests
+ from tabulate import tabulate
+ from fastapi import HTTPException, status
+ from fastapi.responses import StreamingResponse
+
+ from openai import OpenAI
+
+ from lemonade_server.model_manager import ModelManager
+ from lemonade.tools.server.pydantic_models import ChatCompletionRequest
+
+ LLAMA_VERSION = "b5543"
+ LLAMA_SERVER_PORT = "8081"
+
+ LLAMA_SERVER_EXE_DIR = os.path.join(
+     os.path.dirname(sys.executable),
+     "llama_server",
+ )
+
+ LLAMA_SERVER_EXE_PATH = os.path.join(
+     LLAMA_SERVER_EXE_DIR,
+     "llama-server.exe",
+ )
+
+
+ class LlamaTelemetry:
+     """
+     Manages telemetry data collection and display for llama server.
+     """
+
+     def __init__(self):
+         self.input_tokens = None
+         self.output_tokens = None
+         self.time_to_first_token = None
+         self.tokens_per_second = None
+         self.prompt_eval_time = None
+         self.eval_time = None
+
+     def parse_telemetry_line(self, line: str):
+         """
+         Parse telemetry data from llama server output lines.
+         """
+
+         # Parse prompt evaluation line
+         prompt_match = re.search(
+             # pylint: disable=C0301
+             r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+             line,
+         )
+         if prompt_match:
+             prompt_time_ms = float(prompt_match.group(1))
+             input_tokens = int(prompt_match.group(2))
+
+             self.prompt_eval_time = prompt_time_ms / 1000.0
+             self.input_tokens = input_tokens
+             self.time_to_first_token = prompt_time_ms / 1000.0
+             return
+
+         # Parse generation evaluation line
+         eval_match = re.search(
+             r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+             line,
+         )
+         if eval_match:
+             eval_time_ms = float(eval_match.group(1))
+             output_tokens = int(eval_match.group(2))
+             tokens_per_second = float(eval_match.group(3))
+
+             self.eval_time = eval_time_ms / 1000.0
+             self.output_tokens = output_tokens
+             self.tokens_per_second = tokens_per_second
+             return
+
+     def get_telemetry_data(self):
+         return {
+             "input_tokens": self.input_tokens,
+             "output_tokens": self.output_tokens,
+             "time_to_first_token": self.time_to_first_token,
+             "tokens_per_second": self.tokens_per_second,
+             "decode_token_times": None,
+         }
+
+     def show_telemetry(self):
+         # Check if debug logging is enabled
+         if not logging.getLogger().isEnabledFor(logging.DEBUG):
+             return
+
+         # Prepare telemetry data (transposed format)
+         telemetry = [
+             ["Input tokens", self.input_tokens],
+             ["Output tokens", self.output_tokens],
+             ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
+             ["TPS", f"{self.tokens_per_second:.2f}"],
+         ]
+
+         table = tabulate(
+             telemetry, headers=["Metric", "Value"], tablefmt="fancy_grid"
+         ).split("\n")
+
+         # Show telemetry in debug while complying with uvicorn's log indentation
+         logging.debug("\n ".join(table))
+
+
+ def _log_subprocess_output(
+     process: subprocess.Popen, prefix: str, telemetry: LlamaTelemetry
+ ):
+     """
+     Read subprocess output line by line, log to debug, and parse telemetry
+     """
+
+     if process.stdout:
+         for line in iter(process.stdout.readline, ""):
+             if line:
+                 line_stripped = line.strip()
+                 logging.debug("%s: %s", prefix, line_stripped)
+
+                 telemetry.parse_telemetry_line(line_stripped)
+
+             if process.poll() is not None:
+                 break
+
+
+ def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+     status_code = None
+     while not llama_server_process.poll() and status_code != 200:
+         health_url = f"http://localhost:{LLAMA_SERVER_PORT}/health"
+         try:
+             health_response = requests.get(health_url)
+         except requests.exceptions.ConnectionError:
+             logging.warning(fail_message)
+         else:
+             status_code = health_response.status_code
+             logging.debug(
+                 "Testing llama-server readiness (will retry until ready), "
+                 f"result: {health_response.json()}"
+             )
+         time.sleep(1)
+
+
+ def _launch_llama_subprocess(
+     model_path: str, use_gpu: bool, telemetry: LlamaTelemetry
+ ) -> subprocess.Popen:
+     """
+     Launch llama server subprocess with GPU or CPU configuration
+     """
+
+     base_command = [
+         LLAMA_SERVER_EXE_PATH,
+         "-m",
+         model_path,
+         "--port",
+         LLAMA_SERVER_PORT,
+     ]
+
+     # Configure GPU layers: 99 for GPU, 0 for CPU-only
+     ngl_value = "99" if use_gpu else "0"
+     command = base_command + ["-ngl", ngl_value]
+
+     # Start subprocess with output capture
+     process = subprocess.Popen(
+         command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1
+     )
+
+     # Start background thread to log subprocess output
+     device_type = "GPU" if use_gpu else "CPU"
+     threading.Thread(
+         target=_log_subprocess_output,
+         args=(process, f"LLAMA SERVER {device_type}", telemetry),
+         daemon=True,
+     ).start()
+
+     return process
+
+
+ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
+     # Download llama.cpp server if it isn't already available
+     if not os.path.exists(LLAMA_SERVER_EXE_DIR):
+         # Download llama.cpp server zip
+         # pylint: disable=C0301
+         llama_zip_url = f"https://github.com/ggml-org/llama.cpp/releases/download/{LLAMA_VERSION}/llama-{LLAMA_VERSION}-bin-win-vulkan-x64.zip"
+         llama_zip_path = os.path.join(
+             os.path.dirname(sys.executable), "llama-server.zip"
+         )
+         logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
+
+         with requests.get(llama_zip_url, stream=True) as r:
+             r.raise_for_status()
+             with open(llama_zip_path, "wb") as f:
+                 for chunk in r.iter_content(chunk_size=8192):
+                     f.write(chunk)
+
+         # Extract zip
+         logging.info(f"Extracting {llama_zip_path} to {LLAMA_SERVER_EXE_DIR}")
+         with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
+             zip_ref.extractall(LLAMA_SERVER_EXE_DIR)
+
+         # Save version.txt
+         version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
+         with open(version_txt_path, "w", encoding="utf-8") as vf:
+             vf.write(LLAMA_VERSION)
+
+         # Delete zip file
+         os.remove(llama_zip_path)
+         logging.info("Cleaned up zip file")
+
+     # Download the gguf to the hugging face cache
+     snapshot_path = ModelManager().download_gguf(checkpoint)
+     model_path = os.path.join(snapshot_path, os.listdir(snapshot_path)[0])
+     logging.debug(f"GGUF file path: {model_path}")
+
+     # Start the llama-server.exe process
+     logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")
+
+     # Attempt loading on GPU first
+     llama_server_process = _launch_llama_subprocess(
+         model_path, use_gpu=True, telemetry=telemetry
+     )
+
+     # Check the /health endpoint until GPU server is ready
+     _wait_for_load(
+         llama_server_process,
+         f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
+     )
+
+     # If loading on GPU failed, try loading on CPU
+     if llama_server_process.poll():
+         llama_server_process = _launch_llama_subprocess(
+             model_path, use_gpu=False, telemetry=telemetry
+         )
+
+         # Check the /health endpoint until CPU server is ready
+         _wait_for_load(
+             llama_server_process,
+             f"Loading {model_reference} on CPU didn't work",
+         )
+
+     if llama_server_process.poll():
+         raise HTTPException(
+             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+             detail=f"Failed to load {model_reference} with llama.cpp",
+         )
+
+     return llama_server_process
+
+
+ def chat_completion(
+     chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
+ ):
+     base_url = f"http://127.0.0.1:{LLAMA_SERVER_PORT}/v1"
+     client = OpenAI(
+         base_url=base_url,
+         api_key="lemonade",
+     )
+
+     # Convert Pydantic model to dict and remove unset/null values
+     request_dict = chat_completion_request.model_dump(
+         exclude_unset=True, exclude_none=True
+     )
+
+     def event_stream():
+         try:
+             # Enable streaming
+             request_dict["stream"] = True
+             for chunk in client.chat.completions.create(**request_dict):
+                 yield f"data: {chunk.model_dump_json()}\n\n"
+             yield "data: [DONE]\n\n"
+
+             # Show telemetry after completion
+             telemetry.show_telemetry()
+
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+     return StreamingResponse(
+         event_stream(),
+         media_type="text/event-stream",
+         headers={
+             "Cache-Control": "no-cache",
+             "Connection": "keep-alive",
+         },
+     )
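
Editor's note: the telemetry above is scraped from llama-server's stdout with the two regexes in `parse_telemetry_line`. A small usage sketch is shown below, assuming `LlamaTelemetry` is imported from the module in this hunk; the sample log lines are hand-written in llama.cpp's usual timing format, not captured from a real run.

```python
# Fabricated llama.cpp-style timing lines, for illustration only.
telemetry = LlamaTelemetry()

telemetry.parse_telemetry_line(
    "prompt eval time =      94.50 ms /    12 tokens "
    "(    7.88 ms per token,   126.98 tokens per second)"
)
telemetry.parse_telemetry_line(
    "eval time =     512.30 ms /    64 tokens "
    "(    8.00 ms per token,   124.93 tokens per second)"
)

print(telemetry.get_telemetry_data())
# {'input_tokens': 12, 'output_tokens': 64, 'time_to_first_token': 0.0945,
#  'tokens_per_second': 124.93, 'decode_token_times': None}
```
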
@@ -0,0 +1,83 @@
+ from typing import Optional
+
+ from pydantic import BaseModel
+
+ # Set to a high number to allow for interesting experiences in real apps
+ # Tests should use the max_new_tokens argument to set a lower value
+ DEFAULT_MAX_NEW_TOKENS = 1500
+
+
+ class LoadConfig(BaseModel):
+     """
+     Configuration for loading a language model.
+
+     Specifies the model checkpoint, generation parameters,
+     and hardware/framework configuration (recipe) for model loading.
+     """
+
+     model_name: Optional[str] = None
+     checkpoint: Optional[str] = None
+     max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
+     recipe: Optional[str] = None
+     # Indicates the maximum prompt length allowed for that specific
+     # checkpoint + recipe combination
+     max_prompt_length: Optional[int] = None
+     # Indicates whether the model is a reasoning model, like DeepSeek
+     reasoning: Optional[bool] = False
+
+
+ class CompletionRequest(BaseModel):
+     """
+     Request model for text completion API endpoint.
+
+     Contains a prompt, a model identifier, and a streaming
+     flag to control response delivery.
+     """
+
+     prompt: str
+     model: str
+     echo: bool = False
+     stream: bool = False
+     logprobs: int | None = False
+     stop: list[str] | str | None = None
+     temperature: float | None = None
+     max_tokens: int | None = None
+
+
+ class ChatCompletionRequest(BaseModel):
+     """
+     Request model for chat completion API endpoint.
+
+     Contains a list of chat messages, a model identifier,
+     and a streaming flag to control response delivery.
+     """
+
+     messages: list[dict]
+     model: str
+     stream: bool = False
+     logprobs: int | None = False
+     stop: list[str] | str | None = None
+     temperature: float | None = None
+     tools: list[dict] | None = None
+     max_tokens: int | None = None
+     max_completion_tokens: int | None = None
+
+
+ class ResponsesRequest(BaseModel):
+     """
+     Request model for responses API endpoint.
+     """
+
+     input: list[dict] | str
+     model: str
+     max_output_tokens: int | None = None
+     temperature: float | None = None
+     stream: bool = False
+
+
+ class PullConfig(BaseModel):
+     """
+     Configuration for installing a supported LLM.
+     """
+
+     model_name: str
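
Editor's note: these Pydantic models are what the llama.cpp proxy above forwards to the local llama-server. A brief sketch of the dump behavior `chat_completion` relies on, using the `ChatCompletionRequest` import path shown in the previous hunk (the message content is made up for illustration):

```python
from lemonade.tools.server.pydantic_models import ChatCompletionRequest

req = ChatCompletionRequest(
    model="Llama-3.2-1B-Instruct-Hybrid",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)

# Unset and None fields (stop, temperature, tools, ...) are dropped before the
# request is forwarded to the OpenAI-compatible llama-server endpoint.
print(req.model_dump(exclude_unset=True, exclude_none=True))
# {'model': 'Llama-3.2-1B-Instruct-Hybrid',
#  'messages': [{'role': 'user', 'content': 'Hello!'}],
#  'stream': True}
```
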