lemonade-sdk 7.0.1__py3-none-any.whl → 7.0.3__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- lemonade/cli.py +2 -0
- lemonade/tools/accuracy.py +335 -0
- lemonade/tools/huggingface_load.py +6 -0
- lemonade/tools/ort_genai/oga.py +6 -4
- lemonade/tools/prompt.py +28 -1
- lemonade/tools/server/instructions.py +8 -265
- lemonade/tools/server/llamacpp.py +45 -19
- lemonade/tools/server/port_utils.py +57 -0
- lemonade/tools/server/serve.py +96 -44
- lemonade/tools/server/static/instructions.html +262 -0
- lemonade/tools/server/thread_utils.py +87 -0
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/METADATA +1 -1
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/RECORD +22 -18
- lemonade_server/model_manager.py +45 -12
- {lemonade/tools/server → lemonade_server}/pydantic_models.py +2 -0
- lemonade_server/server_models.json +25 -4
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.3.dist-info}/top_level.txt +0 -0
lemonade/tools/server/instructions.py

@@ -25,270 +25,13 @@ def get_instructions_html(port=8000):
         f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
     )

-    #
-
-
-
-    <html lang=\"en\">
-    <head>
-    <meta charset=\"UTF-8\">
-    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">
-    <title>Lemonade Server</title>
-    <link rel="icon" href="data:,">
-    <link rel=\"stylesheet\" href=\"/static/styles.css\">
-    <script>
-    window.SERVER_PORT = {port};
-    </script>
-    {server_models_js}
-    </head>
-    <body>
-    <nav class=\"navbar\">
-    <a href=\"https://github.com/lemonade-sdk/lemonade\">GitHub</a>
-    <a href=\"https://lemonade-server.ai/docs/\">Docs</a>
-    <a href=\"https://lemonade-server.ai/docs/server/server_models/\">Models</a>
-    <a href=\"https://lemonade-server.ai/docs/server/apps/\">Featured Apps</a>
-    </nav>
-    <main class=\"main\">
-    <div class=\"title\">🍋 Lemonade Server</div>
-    <div class=\"tab-container\">
-    <div class=\"tabs\">
-    <button class=\"tab active\" id=\"tab-chat\" onclick=\"showTab('chat')\">LLM Chat</button>
-    <button class=\"tab\" id=\"tab-models\" onclick=\"showTab('models')\">Model Management</button>
-    </div>
-    <div class=\"tab-content active\" id=\"content-chat\">
-    <div class=\"chat-container\">
-    <div class=\"chat-history\" id=\"chat-history\"></div>
-    <div class=\"chat-input-row\">
-    <select id=\"model-select\"></select>
-    <input type=\"text\" id=\"chat-input\" placeholder=\"Type your message...\" />
-    <button id=\"send-btn\">Send</button>
-    </div>
-    </div>
-    </div>
-    <div class=\"tab-content\" id=\"content-models\">
-    <div class=\"model-mgmt-container\">
-    <div class=\"model-mgmt-pane\">
-    <h3>Installed Models</h3>
-    <table class=\"model-table\" id=\"installed-models-table\">
-    <colgroup><col style=\"width:100%\"></colgroup>
-    <tbody id=\"installed-models-tbody\"></tbody>
-    </table>
-    </div>
-    <div class=\"model-mgmt-pane\">
-    <h3>Suggested Models</h3>
-    <table class=\"model-table\" id=\"suggested-models-table\">
-    <tbody id=\"suggested-models-tbody\"></tbody>
-    </table>
-    </div>
-    </div>
-    </div>
-    </div>
-    </main>
-    <footer class=\"site-footer\">
-    <div class=\"dad-joke\">When life gives you LLMs, make an LLM aide.</div>
-    <div class=\"copyright\">Copyright 2025 AMD</div>
-    </footer>
-    <script src=\"https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js\"></script>
-    <script>
-    // Tab switching logic
-    function showTab(tab) {{
-    document.getElementById('tab-chat').classList.remove('active');
-    document.getElementById('tab-models').classList.remove('active');
-    document.getElementById('content-chat').classList.remove('active');
-    document.getElementById('content-models').classList.remove('active');
-    if (tab === 'chat') {{
-    document.getElementById('tab-chat').classList.add('active');
-    document.getElementById('content-chat').classList.add('active');
-    }} else {{
-    document.getElementById('tab-models').classList.add('active');
-    document.getElementById('content-models').classList.add('active');
-    }}
-    }}
+    # Load HTML template
+    template_path = Path(__file__).parent / "static" / "instructions.html"
+    with open(template_path, "r", encoding="utf-8") as f:
+        html_template = f.read()

-
-
-
-    return `http://localhost:{port}`;
-    }}
+    # Replace template variables
+    html_content = html_template.replace("{{SERVER_PORT}}", str(port))
+    html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)

-
-    async function loadModels() {{
-    try {{
-    const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
-    const data = await resp.json();
-    const select = document.getElementById('model-select');
-    select.innerHTML = '';
-    if (!data.data || !Array.isArray(data.data)) {{
-    select.innerHTML = '<option>No models found (malformed response)</option>';
-    return;
-    }}
-    if (data.data.length === 0) {{
-    select.innerHTML = '<option>No models available</option>';
-    return;
-    }}
-    let defaultIndex = 0;
-    data.data.forEach(function(model, index) {{
-    const modelId = model.id || model.name || model;
-    const opt = document.createElement('option');
-    opt.value = modelId;
-    opt.textContent = modelId;
-    if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {{
-    defaultIndex = index;
-    }}
-    select.appendChild(opt);
-    }});
-    select.selectedIndex = defaultIndex;
-    }} catch (e) {{
-    const select = document.getElementById('model-select');
-    select.innerHTML = `<option>Error loading models: ${{e.message}}</option>`;
-    console.error('Error loading models:', e);
-    }}
-    }}
-    loadModels();
-
-    // Model Management Tab Logic
-    async function refreshModelMgmtUI() {{
-    // Get installed models from /api/v1/models
-    let installed = [];
-    try {{
-    const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
-    const data = await resp.json();
-    if (data.data && Array.isArray(data.data)) {{
-    installed = data.data.map(m => m.id || m.name || m);
-    }}
-    }} catch (e) {{}}
-    // All models from server_models.json (window.SERVER_MODELS)
-    const allModels = window.SERVER_MODELS || {{}};
-    // Filter suggested models not installed
-    const suggested = Object.keys(allModels).filter(
-    k => allModels[k].suggested && !installed.includes(k)
-    );
-    // Render installed models as a table (two columns, second is invisible)
-    const installedTbody = document.getElementById('installed-models-tbody');
-    installedTbody.innerHTML = '';
-    installed.forEach(function(mid) {{
-    var tr = document.createElement('tr');
-    var tdName = document.createElement('td');
-    tdName.textContent = mid;
-    var tdEmpty = document.createElement('td');
-    tdEmpty.style.width = '0';
-    tdEmpty.style.padding = '0';
-    tdEmpty.style.border = 'none';
-    tr.appendChild(tdName);
-    tr.appendChild(tdEmpty);
-    installedTbody.appendChild(tr);
-    }});
-    // Render suggested models as a table
-    const suggestedTbody = document.getElementById('suggested-models-tbody');
-    suggestedTbody.innerHTML = '';
-    suggested.forEach(mid => {{
-    const tr = document.createElement('tr');
-    const tdName = document.createElement('td');
-    tdName.textContent = mid;
-    tdName.style.paddingRight = '1em';
-    tdName.style.verticalAlign = 'middle';
-    const tdBtn = document.createElement('td');
-    tdBtn.style.width = '1%';
-    tdBtn.style.verticalAlign = 'middle';
-    const btn = document.createElement('button');
-    btn.textContent = '+';
-    btn.title = 'Install model';
-    btn.onclick = async function() {{
-    btn.disabled = true;
-    btn.textContent = 'Installing...';
-    btn.classList.add('installing-btn');
-    try {{
-    await fetch(getServerBaseUrl() + '/api/v1/pull', {{
-    method: 'POST',
-    headers: {{ 'Content-Type': 'application/json' }},
-    body: JSON.stringify({{ model_name: mid }})
-    }});
-    await refreshModelMgmtUI();
-    await loadModels(); // update chat dropdown too
-    }} catch (e) {{
-    btn.textContent = 'Error';
-    }}
-    }};
-    tdBtn.appendChild(btn);
-    tr.appendChild(tdName);
-    tr.appendChild(tdBtn);
-    suggestedTbody.appendChild(tr);
-    }});
-    }}
-    // Initial load
-    refreshModelMgmtUI();
-    // Optionally, refresh when switching to the tab
-    document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
-
-    // Chat logic (streaming with OpenAI JS client placeholder)
-    const chatHistory = document.getElementById('chat-history');
-    const chatInput = document.getElementById('chat-input');
-    const sendBtn = document.getElementById('send-btn');
-    const modelSelect = document.getElementById('model-select');
-    let messages = [];
-
-    function appendMessage(role, text) {{
-    const div = document.createElement('div');
-    div.className = 'chat-message ' + role;
-    // Add a bubble for iMessage style
-    const bubble = document.createElement('div');
-    bubble.className = 'chat-bubble ' + role;
-    bubble.innerHTML = text;
-    div.appendChild(bubble);
-    chatHistory.appendChild(div);
-    chatHistory.scrollTop = chatHistory.scrollHeight;
-    }}
-
-    async function sendMessage() {{
-    const text = chatInput.value.trim();
-    if (!text) return;
-    appendMessage('user', text);
-    messages.push({{ role: 'user', content: text }});
-    chatInput.value = '';
-    sendBtn.disabled = true;
-    // Streaming OpenAI completions (placeholder, adapt as needed)
-    let llmText = '';
-    appendMessage('llm', '...');
-    const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
-    try {{
-    // Use the correct endpoint for chat completions
-    const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {{
-    method: 'POST',
-    headers: {{ 'Content-Type': 'application/json' }},
-    body: JSON.stringify({{
-    model: modelSelect.value,
-    messages: messages,
-    stream: true
-    }})
-    }});
-    if (!resp.body) throw new Error('No stream');
-    const reader = resp.body.getReader();
-    let decoder = new TextDecoder();
-    llmDiv.textContent = '';
-    while (true) {{
-    const {{ done, value }} = await reader.read();
-    if (done) break;
-    const chunk = decoder.decode(value);
-    if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
-    // Try to extract the content from the OpenAI chunk
-    const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
-    if (match && match[1]) {{
-    llmText += match[1];
-    llmDiv.textContent = llmText;
-    }}
-    }}
-    messages.push({{ role: 'assistant', content: llmText }});
-    }} catch (e) {{
-    llmDiv.textContent = '[Error: ' + e.message + ']';
-    }}
-    sendBtn.disabled = false;
-    }}
-    sendBtn.onclick = sendMessage;
-    chatInput.addEventListener('keydown', function(e) {{
-    if (e.key === 'Enter') sendMessage();
-    }});
-    </script>
-    </body>
-    </html>
-    """
-    return HTMLResponse(content=styled_html)
+    return HTMLResponse(content=html_content)
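The inline page above now ships as lemonade/tools/server/static/instructions.html, and get_instructions_html() only loads it and fills in two placeholders. A minimal sketch of that pattern, assuming nothing beyond the added lines (the function name render_instructions is illustrative; the placeholder names come from the diff):

    from pathlib import Path

    def render_instructions(port: int, server_models_js: str) -> str:
        # Read the static template that now lives next to the server code
        template_path = Path(__file__).parent / "static" / "instructions.html"
        html_template = template_path.read_text(encoding="utf-8")

        # Substitute the two placeholders the template exposes
        html_content = html_template.replace("{{SERVER_PORT}}", str(port))
        return html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)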
lemonade/tools/server/llamacpp.py

@@ -14,11 +14,11 @@ from fastapi.responses import StreamingResponse

 from openai import OpenAI

+from lemonade_server.pydantic_models import ChatCompletionRequest
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.
+from lemonade.tools.server.port_utils import find_free_port

 LLAMA_VERSION = "b5543"
-LLAMA_SERVER_PORT = "8081"

 LLAMA_SERVER_EXE_DIR = os.path.join(
     os.path.dirname(sys.executable),
@@ -43,6 +43,23 @@ class LlamaTelemetry:
         self.tokens_per_second = None
         self.prompt_eval_time = None
         self.eval_time = None
+        self.port = None
+
+    def choose_port(self):
+        """
+        Users probably don't care what port we start llama-server on, so let's
+        search for an empty port
+        """
+
+        self.port = find_free_port()
+
+        if self.port is None:
+            msg = "Failed to find an empty port to start llama-server on"
+            logging.error(msg)
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=msg,
+            )

     def parse_telemetry_line(self, line: str):
         """
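For context, a hedged sketch of how a caller might combine find_free_port() with a llama-server launch instead of the old hard-coded LLAMA_SERVER_PORT; the helper name start_on_free_port is illustrative and not part of the package:

    import logging
    import subprocess

    from lemonade.tools.server.port_utils import find_free_port

    def start_on_free_port(exe_path: str) -> subprocess.Popen:
        # Ask the OS for an unused TCP port rather than assuming 8081 is free
        port = find_free_port()
        if port is None:
            raise RuntimeError("Failed to find an empty port to start llama-server on")
        logging.info("Starting %s on port %d", exe_path, port)
        return subprocess.Popen([exe_path, "--port", str(port)])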
@@ -128,10 +145,12 @@ def _log_subprocess_output(
             break


-def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+def _wait_for_load(
+    llama_server_process: subprocess.Popen, port: int, fail_message: str
+):
     status_code = None
     while not llama_server_process.poll() and status_code != 200:
-        health_url = f"http://localhost:{
+        health_url = f"http://localhost:{port}/health"
         try:
             health_response = requests.get(health_url)
         except requests.exceptions.ConnectionError:
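The reworked _wait_for_load() polls the dynamically chosen port. A standalone sketch of the same health-polling idea, with an added timeout that the shipped function does not appear to have (the names here are illustrative):

    import subprocess
    import time

    import requests

    def wait_for_health(process: subprocess.Popen, port: int, timeout_s: float = 60.0) -> bool:
        # Poll the llama-server /health endpoint until it answers 200,
        # the process exits, or the timeout expires
        deadline = time.monotonic() + timeout_s
        url = f"http://localhost:{port}/health"
        while process.poll() is None and time.monotonic() < deadline:
            try:
                if requests.get(url, timeout=1).status_code == 200:
                    return True
            except requests.exceptions.ConnectionError:
                pass
            time.sleep(0.5)
        return False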
@@ -146,19 +165,25 @@ def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):


 def _launch_llama_subprocess(
-
+    snapshot_files: dict, use_gpu: bool, telemetry: LlamaTelemetry
 ) -> subprocess.Popen:
     """
     Launch llama server subprocess with GPU or CPU configuration
     """

-
-
-
-
-
-
-
+    # Build the base command
+    base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
+    if "mmproj" in snapshot_files:
+        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+        if not use_gpu:
+            base_command.extend(["--no-mmproj-offload"])
+
+    # Find a port, and save it in the telemetry object for future reference
+    # by other functions
+    telemetry.choose_port()
+
+    # Add port and jinja to enable tool use
+    base_command.extend(["--port", str(telemetry.port), "--jinja"])

     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
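A sketch of the command construction above as a pure function. Everything mirrors the added lines except the -ngl flag spelling, which is an assumption, since the hunk only shows ngl_value being computed:

    def build_llama_command(exe_path: str, snapshot_files: dict, use_gpu: bool, port: int) -> list:
        # Model file, optional mmproj projector, dynamic port, --jinja for tool
        # use, and GPU layer offload (99 layers on GPU, 0 for CPU-only)
        command = [exe_path, "-m", snapshot_files["variant"]]
        if "mmproj" in snapshot_files:
            command.extend(["--mmproj", snapshot_files["mmproj"]])
            if not use_gpu:
                command.append("--no-mmproj-offload")
        command.extend(["--port", str(port), "--jinja"])
        command.extend(["-ngl", "99" if use_gpu else "0"])  # flag name assumed
        return command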
@@ -180,7 +205,7 @@ def _launch_llama_subprocess(
     return process


-def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
+def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
     # Download llama.cpp server if it isn't already available
     if not os.path.exists(LLAMA_SERVER_EXE_DIR):
         # Download llama.cpp server zip
@@ -212,33 +237,34 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
         logging.info("Cleaned up zip file")

     # Download the gguf to the hugging face cache
-
-
-    logging.debug(f"GGUF file path: {model_path}")
+    snapshot_files = ModelManager().download_gguf(model_config)
+    logging.debug(f"GGUF file paths: {snapshot_files}")

     # Start the llama-serve.exe process
     logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")

     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
-
+        snapshot_files, use_gpu=True, telemetry=telemetry
     )

     # Check the /health endpoint until GPU server is ready
     _wait_for_load(
         llama_server_process,
+        telemetry.port,
         f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
     )

     # If loading on GPU failed, try loading on CPU
     if llama_server_process.poll():
         llama_server_process = _launch_llama_subprocess(
-
+            snapshot_files, use_gpu=False, telemetry=telemetry
         )

         # Check the /health endpoint until CPU server is ready
         _wait_for_load(
             llama_server_process,
+            telemetry.port,
             f"Loading {model_reference} on CPU didn't work",
         )

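The fallback above keys off subprocess.Popen.poll(), which returns None while the child is still running and its exit code once it has exited, so a truthy value means the GPU attempt died before /health ever returned 200. A tiny runnable illustration of that behavior:

    import subprocess
    import sys
    import time

    # Start a child that exits immediately with code 1, then inspect poll()
    proc = subprocess.Popen([sys.executable, "-c", "import sys; sys.exit(1)"])
    time.sleep(1)
    print(proc.poll())  # prints 1; None would mean the child is still alive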
@@ -254,7 +280,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
 def chat_completion(
     chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
 ):
-    base_url = f"http://127.0.0.1:{
+    base_url = f"http://127.0.0.1:{telemetry.port}/v1"
     client = OpenAI(
         base_url=base_url,
         api_key="lemonade",
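With the port now carried on the telemetry object, the proxying client is built against it. A minimal sketch of that wiring (make_local_client is an illustrative name; the /v1 path and placeholder API key come from the diff):

    from openai import OpenAI

    def make_local_client(port: int) -> OpenAI:
        # llama-server exposes an OpenAI-compatible API under /v1; the key is
        # unused locally but the client requires some value
        return OpenAI(base_url=f"http://127.0.0.1:{port}/v1", api_key="lemonade")

    # e.g. client = make_local_client(telemetry.port)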
lemonade/tools/server/port_utils.py

@@ -0,0 +1,57 @@
+import socketserver
+import sys
+import logging
+from contextlib import asynccontextmanager
+from fastapi import FastAPI
+
+
+def find_free_port():
+    """
+    Scans for an unoccupied TCP port
+
+    Returns the port number as an int on success
+    Returns None if no port can be found
+    """
+
+    try:
+        with socketserver.TCPServer(("localhost", 0), None) as s:
+            return s.server_address[1]
+    # pylint: disable=broad-exception-caught
+    except Exception:
+        return None
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Code here will run when the application starts up
+    # Check if console can handle Unicode by testing emoji encoding
+
+    try:
+        if sys.stdout.encoding:
+            "🍋".encode(sys.stdout.encoding)
+        use_emojis = True
+    except (UnicodeEncodeError, AttributeError):
+        use_emojis = False
+
+    if use_emojis:
+        logging.info(
+            "\n"
+            "\n"
+            "🍋 Lemonade Server Ready!\n"
+            f"🍋 Open http://localhost:{app.port} in your browser for:\n"
+            "🍋 💬 chat\n"
+            "🍋 💻 model management\n"
+            "🍋 📄 docs\n"
+        )
+    else:
+        logging.info(
+            "\n"
+            "\n"
+            "[Lemonade] Lemonade Server Ready!\n"
+            f"[Lemonade] Open http://localhost:{app.port} in your browser for:\n"
+            "[Lemonade] chat\n"
+            "[Lemonade] model management\n"
+            "[Lemonade] docs\n"
+        )
+
+    yield