lemonade-sdk 7.0.0__py3-none-any.whl → 7.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/cli.py +2 -0
- lemonade/tools/accuracy.py +335 -0
- lemonade/tools/server/instructions.py +294 -0
- lemonade/tools/server/llamacpp.py +315 -0
- lemonade/tools/server/port_utils.py +57 -0
- lemonade/tools/server/pydantic_models.py +83 -0
- lemonade/tools/server/serve.py +225 -167
- lemonade/tools/server/static/styles.css +313 -0
- lemonade/tools/server/thread_utils.py +87 -0
- lemonade/tools/server/tool_calls.py +50 -43
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/METADATA +4 -7
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/RECORD +21 -14
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/WHEEL +1 -1
- lemonade_server/cli.py +4 -2
- lemonade_server/model_manager.py +34 -17
- lemonade_server/server_models.json +52 -3
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/top_level.txt +0 -0
lemonade/cli.py
CHANGED
@@ -19,6 +19,7 @@ import lemonade.cache as cache
 from lemonade.tools.mmlu import AccuracyMMLU
 from lemonade.tools.humaneval import AccuracyHumaneval
 from lemonade.tools.perplexity import AccuracyPerplexity
+from lemonade.tools.accuracy import LMEvalHarness
 from lemonade.tools.prompt import LLMPrompt
 from lemonade.tools.quark.quark_load import QuarkLoad
 from lemonade.tools.quark.quark_quantize import QuarkQuantize
@@ -36,6 +37,7 @@ def main():
         AccuracyMMLU,
         AccuracyHumaneval,
         AccuracyPerplexity,
+        LMEvalHarness,
         LLMPrompt,
         HuggingfaceBench,
         OgaBench,
lemonade/tools/accuracy.py
ADDED
@@ -0,0 +1,335 @@
+import argparse
+import json
+import os
+import socket
+import subprocess
+import sys
+import time
+from typing import Optional
+
+import requests
+
+from lemonade.state import State
+from lemonade.tools import Tool
+import lemonade.common.printing as printing
+import lemonade.common.build as build
+
+from lemonade.tools.server.thread_utils import ServerRunner
+
+
+def is_port_in_use(port, host="localhost"):
+    """
+    Check if a port is in use
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return s.connect_ex((host, port)) == 0
+
+
+class LMEvalHarness(Tool):
+    """
+    Tool for evaluating LLMs using lm-eval-harness on industry standard benchmarks
+    like MMLU, GSM8k, and more. See docs/lemonade/lm_eval.md for more details.
+    """
+
+    unique_name = "lm-eval-harness"
+
+    def __init__(self):
+        super().__init__(
+            monitor_message="Evaluate model accuracy using ElutherAI's lm-eval-harness"
+        )
+        self.status_stats = []
+        self.server_runner = None
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Evaluate model using lm-eval-harness",
+            add_help=add_help,
+        )
+
+        parser.add_argument(
+            "--task",
+            type=str,
+            required=True,
+            help="Task(s) to evaluate on (e.g., gsm8k, mmlu)",
+        )
+
+        parser.add_argument(
+            "--server-port", type=int, default=8000, help="Port to use for the server"
+        )
+
+        parser.add_argument(
+            "--num-fewshot",
+            type=int,
+            default=0,
+            help="Number of examples in few-shot prompts",
+        )
+
+        parser.add_argument(
+            "--limit",
+            type=int,
+            default=None,
+            help="Limit the number of examples per task",
+        )
+
+        parser.add_argument(
+            "--log-samples",
+            action="store_true",
+            help="Log samples for each task to log file",
+        )
+
+        parser.add_argument(
+            "--output-path",
+            type=str,
+            default=None,
+            help="Path to save evaluation results",
+        )
+
+        return parser
+
+    def _process_results(self, results_dir, state):
+        """Process evaluation results and save to state stats"""
+        if not os.path.exists(results_dir) or not os.path.isdir(results_dir):
+            printing.log_warning(f"Results directory not found at {results_dir}")
+            return
+
+        model_dirs = [
+            d
+            for d in os.listdir(results_dir)
+            if os.path.isdir(os.path.join(results_dir, d))
+        ]
+
+        if not model_dirs:
+            printing.log_warning(f"No model directories found in {results_dir}")
+            return
+
+        model_dir = os.path.join(results_dir, model_dirs[0])
+        printing.log_info(f"Found model directory: {model_dir}")
+
+        # Find the results JSON file with timestamp
+        results_files = [
+            f
+            for f in os.listdir(model_dir)
+            if f.startswith("results_") and f.endswith(".json")
+        ]
+
+        if not results_files:
+            printing.log_warning(f"No results files found in {model_dir}")
+            return
+
+        # Sort by timestamp
+        results_files.sort(reverse=True)
+        results_file_path = os.path.join(model_dir, results_files[0])
+        printing.log_info(f"Processing results from {results_file_path}")
+
+        # Read and process results
+        try:
+            with open(results_file_path, "r", encoding="utf-8") as f:
+                results = json.load(f)
+
+            # Extract and display metrics
+            if "results" in results:
+                for task_name, metrics in results["results"].items():
+                    printing.log_info(f"Results for {task_name}:")
+
+                    for metric, value in metrics.items():
+                        if isinstance(value, (int, float)) and not metric.startswith(
+                            "alias"
+                        ):
+                            # Format metric name for stats
+                            clean_metric = metric.replace(",", "_")
+                            stat_name = f"lm_eval_{task_name}_{clean_metric}"
+
+                            # Save to state stats as percentage
+                            state.save_stat(stat_name, float(value) * 100)
+                            state.save_stat(f"{stat_name}_units", "%")
+                            self.status_stats.append(stat_name)
+
+                            printing.log_info(
+                                f" {metric}: {value:.4f} ({value*100:.2f}%)"
+                            )
+
+            # Save summary metrics if available
+            avg_metrics = {}
+            if "higher_is_better" in results:
+                for metric_type in results["higher_is_better"].values():
+                    for metric in metric_type.keys():
+                        if metric not in avg_metrics:
+                            avg_metrics[metric] = []
+
+                for task_metrics in results["results"].values():
+                    for metric, value in task_metrics.items():
+                        if isinstance(value, (int, float)) and not metric.startswith(
+                            "alias"
+                        ):
+                            base_metric = metric.split(",")[0]
+                            if base_metric in avg_metrics:
+                                avg_metrics[base_metric].append(value)
+
+                # Calculate and save averages
+                for metric, values in avg_metrics.items():
+                    if values:
+                        avg_value = sum(values) / len(values)
+                        stat_name = f"lm_eval_average_{metric}"
+                        state.save_stat(stat_name, float(avg_value) * 100)
+                        state.save_stat(f"{stat_name}_units", "%")
+                        self.status_stats.append(stat_name)
+                        printing.log_info(
+                            f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
+                        )
+
+        except (IOError, json.JSONDecodeError) as e:
+            printing.log_error(f"Error processing results: {e}")
+
+    def run(
+        self,
+        state: State,
+        task: str,
+        server_port: int = 8000,
+        server_host: str = "localhost",
+        num_fewshot: int = 0,
+        limit: Optional[int] = None,
+        log_samples: bool = False,
+        output_path: Optional[str] = None,
+    ) -> State:
+
+        model = state.model
+        tokenizer = state.tokenizer
+
+        if model is None or tokenizer is None:
+            raise ValueError(
+                "Model and tokenizer must be loaded in state before running lm-eval-harness"
+            )
+
+        # Set up output path
+        if output_path is None:
+            output_path = os.path.join(
+                build.output_dir(state.cache_dir, state.build_name), "lm_eval_results"
+            )
+
+        os.makedirs(output_path, exist_ok=True)
+
+        # Check if port is already in use
+        if is_port_in_use(server_port, server_host):
+            error_msg = (
+                f"Port {server_port} is already in use. "
+                "Please close all applications using this port and try again."
+            )
+            printing.log_error(error_msg)
+            raise RuntimeError(error_msg)
+
+        # Retroactively determine recipe based on model type to select correct iterator
+        # The model is already loaded in server, so we only need recipe for iterator selection
+        checkpoint = getattr(state, "checkpoint", "unknown")
+        if "OrtGenaiModel" in str(type(model)):
+            recipe = "oga-"
+        else:
+            recipe = "unknown"
+
+        # Start the server thread
+        self.server_runner = ServerRunner(
+            model=model,
+            tokenizer=tokenizer,
+            checkpoint=checkpoint,
+            recipe=recipe,
+            host=server_host,
+            port=server_port,
+        )
+        self.server_runner.start()
+
+        # Wait for server initialization
+        printing.log_info("Waiting for server initialization...")
+
+        # Wait for server to start and be responsive
+        server_url = f"http://{server_host}:{server_port}"
+        max_retries = 30
+        retry_delay = 1
+
+        printing.log_info(f"Checking if server is available at {server_url}...")
+        for i in range(max_retries):
+            try:
+                response = requests.get(f"{server_url}/api/v0/health", timeout=2)
+                if response.status_code == 200:
+                    printing.log_info(f"Server is ready after {i+1} attempts")
+                    break
+            except requests.exceptions.RequestException:
+                if i < max_retries - 1:
+                    time.sleep(retry_delay)
+                else:
+                    printing.log_error(
+                        f"Server did not start after {max_retries} attempts"
+                    )
+                    raise RuntimeError("Failed to start the server")
+
+        # Build API URL
+        results_file = os.path.join(output_path, f"{task}_results")
+
+        printing.log_info(f"Running lm-eval-harness on {task}...")
+
+        # Build lm-eval-harness command
+        cmd = [
+            "lm_eval",
+            "--model",
+            "local-completions",
+            "--tasks",
+            task,
+            "--model_args",
+            (
+                f"model={checkpoint},"
+                f"base_url={server_url}/api/v0/completions,"
+                f"num_concurrent=1,"
+                f"max_retries=5,"
+                f"retry_timeout=10,"
+                f"tokenized_requests=False"
+            ),
+            "--num_fewshot",
+            str(num_fewshot),
+            "--output_path",
+            results_file,
+        ]
+
+        if limit is not None:
+            cmd.extend(["--limit", str(limit)])
+
+        if log_samples:
+            cmd.extend(["--log_samples"])
+
+        try:
+            # On Windows, set UTF-8 mode to handle Unicode output
+            env = os.environ.copy()
+            if sys.platform == "win32":
+                env["PYTHONIOENCODING"] = "utf-8"
+
+            # Execute lm-eval-harness command
+            result = subprocess.run(
+                cmd, check=True, text=True, capture_output=True, env=env
+            )
+
+            # Log relevant output and skip any parts that might cause encoding issues
+            try:
+                printing.log_info(result.stdout)
+            except UnicodeEncodeError:
+                printing.log_info(
+                    "Results obtained successfully but couldn't display due to encoding issues"
+                )
+
+            # Process results from the correct location
+            results_dir = os.path.join(output_path, f"{task}_results")
+            self._process_results(results_dir, state)
+
+        except subprocess.CalledProcessError as e:
+            printing.log_error(f"Error running lm-eval-harness: {e}")
+            printing.log_error(f"stderr: {e.stderr}")
+        except (IOError, ValueError, requests.RequestException) as e:
+            printing.log_error(f"Error: {e}")
+        finally:
+            # Shut down server
+            if self.server_runner and self.server_runner.is_alive():
+                printing.log_info("Shutting down server runner...")
+                self.server_runner.shutdown()
+
+            # Make sure we don't have any lingering references to state's model/tokenizer
+            # that could prevent garbage collection
+            self.server_runner = None
+
+        return state
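For reference, the run() method above ultimately shells out to lm-eval-harness's lm_eval CLI with the local-completions backend pointed at the embedded Lemonade server's /api/v0/completions endpoint. Below is a minimal standalone sketch of an equivalent invocation, assuming a Lemonade server is already listening on localhost:8000; the checkpoint name, task, and limit are illustrative placeholders and not values taken from this diff.

import subprocess

# Placeholder values for illustration only; the tool derives these from State.
checkpoint = "my-checkpoint"          # hypothetical model name
server_url = "http://localhost:8000"  # assumes a Lemonade server is already running

cmd = [
    "lm_eval",
    "--model", "local-completions",
    "--tasks", "gsm8k",
    "--model_args",
    (
        f"model={checkpoint},"
        f"base_url={server_url}/api/v0/completions,"
        "num_concurrent=1,max_retries=5,retry_timeout=10,tokenized_requests=False"
    ),
    "--num_fewshot", "0",
    "--limit", "10",  # keep the run short while experimenting
    "--output_path", "lm_eval_results/gsm8k_results",
]
subprocess.run(cmd, check=True)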
lemonade/tools/server/instructions.py
ADDED
@@ -0,0 +1,294 @@
+from pathlib import Path
+import json
+from fastapi.responses import HTMLResponse
+from lemonade_server.model_manager import ModelManager
+
+
+def get_instructions_html(port=8000):
+    """
+    Show instructions on how to use the server.
+    """
+    # Load server models from JSON
+    server_models_path = (
+        Path(__file__).parent.parent.parent.parent
+        / "lemonade_server"
+        / "server_models.json"
+    )
+    with open(server_models_path, "r", encoding="utf-8") as f:
+        server_models = json.load(f)
+
+    # Use shared filter function from model_manager.py
+    filtered_models = ModelManager().filter_models_by_backend(server_models)
+
+    # Pass filtered server_models to JS
+    server_models_js = (
+        f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
+    )
+
+    # New lemon-themed HTML structure
+    # pylint: disable=W1401
+    styled_html = f"""
+    <!DOCTYPE html>
+    <html lang=\"en\">
+    <head>
+        <meta charset=\"UTF-8\">
+        <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">
+        <title>Lemonade Server</title>
+        <link rel="icon" href="data:,">
+        <link rel=\"stylesheet\" href=\"/static/styles.css\">
+        <script>
+            window.SERVER_PORT = {port};
+        </script>
+        {server_models_js}
+    </head>
+    <body>
+        <nav class=\"navbar\">
+            <a href=\"https://github.com/lemonade-sdk/lemonade\">GitHub</a>
+            <a href=\"https://lemonade-server.ai/docs/\">Docs</a>
+            <a href=\"https://lemonade-server.ai/docs/server/server_models/\">Models</a>
+            <a href=\"https://lemonade-server.ai/docs/server/apps/\">Featured Apps</a>
+        </nav>
+        <main class=\"main\">
+            <div class=\"title\">🍋 Lemonade Server</div>
+            <div class=\"tab-container\">
+                <div class=\"tabs\">
+                    <button class=\"tab active\" id=\"tab-chat\" onclick=\"showTab('chat')\">LLM Chat</button>
+                    <button class=\"tab\" id=\"tab-models\" onclick=\"showTab('models')\">Model Management</button>
+                </div>
+                <div class=\"tab-content active\" id=\"content-chat\">
+                    <div class=\"chat-container\">
+                        <div class=\"chat-history\" id=\"chat-history\"></div>
+                        <div class=\"chat-input-row\">
+                            <select id=\"model-select\"></select>
+                            <input type=\"text\" id=\"chat-input\" placeholder=\"Type your message...\" />
+                            <button id=\"send-btn\">Send</button>
+                        </div>
+                    </div>
+                </div>
+                <div class=\"tab-content\" id=\"content-models\">
+                    <div class=\"model-mgmt-container\">
+                        <div class=\"model-mgmt-pane\">
+                            <h3>Installed Models</h3>
+                            <table class=\"model-table\" id=\"installed-models-table\">
+                                <colgroup><col style=\"width:100%\"></colgroup>
+                                <tbody id=\"installed-models-tbody\"></tbody>
+                            </table>
+                        </div>
+                        <div class=\"model-mgmt-pane\">
+                            <h3>Suggested Models</h3>
+                            <table class=\"model-table\" id=\"suggested-models-table\">
+                                <tbody id=\"suggested-models-tbody\"></tbody>
+                            </table>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </main>
+        <footer class=\"site-footer\">
+            <div class=\"dad-joke\">When life gives you LLMs, make an LLM aide.</div>
+            <div class=\"copyright\">Copyright 2025 AMD</div>
+        </footer>
+        <script src=\"https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js\"></script>
+        <script>
+        // Tab switching logic
+        function showTab(tab) {{
+            document.getElementById('tab-chat').classList.remove('active');
+            document.getElementById('tab-models').classList.remove('active');
+            document.getElementById('content-chat').classList.remove('active');
+            document.getElementById('content-models').classList.remove('active');
+            if (tab === 'chat') {{
+                document.getElementById('tab-chat').classList.add('active');
+                document.getElementById('content-chat').classList.add('active');
+            }} else {{
+                document.getElementById('tab-models').classList.add('active');
+                document.getElementById('content-models').classList.add('active');
+            }}
+        }}
+
+        // Helper to get server base URL
+        function getServerBaseUrl() {{
+            const port = window.SERVER_PORT || 8000;
+            return `http://localhost:{port}`;
+        }}
+
+        // Populate model dropdown from /api/v1/models endpoint
+        async function loadModels() {{
+            try {{
+                const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+                const data = await resp.json();
+                const select = document.getElementById('model-select');
+                select.innerHTML = '';
+                if (!data.data || !Array.isArray(data.data)) {{
+                    select.innerHTML = '<option>No models found (malformed response)</option>';
+                    return;
+                }}
+                if (data.data.length === 0) {{
+                    select.innerHTML = '<option>No models available</option>';
+                    return;
+                }}
+                let defaultIndex = 0;
+                data.data.forEach(function(model, index) {{
+                    const modelId = model.id || model.name || model;
+                    const opt = document.createElement('option');
+                    opt.value = modelId;
+                    opt.textContent = modelId;
+                    if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {{
+                        defaultIndex = index;
+                    }}
+                    select.appendChild(opt);
+                }});
+                select.selectedIndex = defaultIndex;
+            }} catch (e) {{
+                const select = document.getElementById('model-select');
+                select.innerHTML = `<option>Error loading models: ${{e.message}}</option>`;
+                console.error('Error loading models:', e);
+            }}
+        }}
+        loadModels();
+
+        // Model Management Tab Logic
+        async function refreshModelMgmtUI() {{
+            // Get installed models from /api/v1/models
+            let installed = [];
+            try {{
+                const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+                const data = await resp.json();
+                if (data.data && Array.isArray(data.data)) {{
+                    installed = data.data.map(m => m.id || m.name || m);
+                }}
+            }} catch (e) {{}}
+            // All models from server_models.json (window.SERVER_MODELS)
+            const allModels = window.SERVER_MODELS || {{}};
+            // Filter suggested models not installed
+            const suggested = Object.keys(allModels).filter(
+                k => allModels[k].suggested && !installed.includes(k)
+            );
+            // Render installed models as a table (two columns, second is invisible)
+            const installedTbody = document.getElementById('installed-models-tbody');
+            installedTbody.innerHTML = '';
+            installed.forEach(function(mid) {{
+                var tr = document.createElement('tr');
+                var tdName = document.createElement('td');
+                tdName.textContent = mid;
+                var tdEmpty = document.createElement('td');
+                tdEmpty.style.width = '0';
+                tdEmpty.style.padding = '0';
+                tdEmpty.style.border = 'none';
+                tr.appendChild(tdName);
+                tr.appendChild(tdEmpty);
+                installedTbody.appendChild(tr);
+            }});
+            // Render suggested models as a table
+            const suggestedTbody = document.getElementById('suggested-models-tbody');
+            suggestedTbody.innerHTML = '';
+            suggested.forEach(mid => {{
+                const tr = document.createElement('tr');
+                const tdName = document.createElement('td');
+                tdName.textContent = mid;
+                tdName.style.paddingRight = '1em';
+                tdName.style.verticalAlign = 'middle';
+                const tdBtn = document.createElement('td');
+                tdBtn.style.width = '1%';
+                tdBtn.style.verticalAlign = 'middle';
+                const btn = document.createElement('button');
+                btn.textContent = '+';
+                btn.title = 'Install model';
+                btn.onclick = async function() {{
+                    btn.disabled = true;
+                    btn.textContent = 'Installing...';
+                    btn.classList.add('installing-btn');
+                    try {{
+                        await fetch(getServerBaseUrl() + '/api/v1/pull', {{
+                            method: 'POST',
+                            headers: {{ 'Content-Type': 'application/json' }},
+                            body: JSON.stringify({{ model_name: mid }})
+                        }});
+                        await refreshModelMgmtUI();
+                        await loadModels(); // update chat dropdown too
+                    }} catch (e) {{
+                        btn.textContent = 'Error';
+                    }}
+                }};
+                tdBtn.appendChild(btn);
+                tr.appendChild(tdName);
+                tr.appendChild(tdBtn);
+                suggestedTbody.appendChild(tr);
+            }});
+        }}
+        // Initial load
+        refreshModelMgmtUI();
+        // Optionally, refresh when switching to the tab
+        document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
+
+        // Chat logic (streaming with OpenAI JS client placeholder)
+        const chatHistory = document.getElementById('chat-history');
+        const chatInput = document.getElementById('chat-input');
+        const sendBtn = document.getElementById('send-btn');
+        const modelSelect = document.getElementById('model-select');
+        let messages = [];
+
+        function appendMessage(role, text) {{
+            const div = document.createElement('div');
+            div.className = 'chat-message ' + role;
+            // Add a bubble for iMessage style
+            const bubble = document.createElement('div');
+            bubble.className = 'chat-bubble ' + role;
+            bubble.innerHTML = text;
+            div.appendChild(bubble);
+            chatHistory.appendChild(div);
+            chatHistory.scrollTop = chatHistory.scrollHeight;
+        }}
+
+        async function sendMessage() {{
+            const text = chatInput.value.trim();
+            if (!text) return;
+            appendMessage('user', text);
+            messages.push({{ role: 'user', content: text }});
+            chatInput.value = '';
+            sendBtn.disabled = true;
+            // Streaming OpenAI completions (placeholder, adapt as needed)
+            let llmText = '';
+            appendMessage('llm', '...');
+            const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
+            try {{
+                // Use the correct endpoint for chat completions
+                const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {{
+                    method: 'POST',
+                    headers: {{ 'Content-Type': 'application/json' }},
+                    body: JSON.stringify({{
+                        model: modelSelect.value,
+                        messages: messages,
+                        stream: true
+                    }})
+                }});
+                if (!resp.body) throw new Error('No stream');
+                const reader = resp.body.getReader();
+                let decoder = new TextDecoder();
+                llmDiv.textContent = '';
+                while (true) {{
+                    const {{ done, value }} = await reader.read();
+                    if (done) break;
+                    const chunk = decoder.decode(value);
+                    if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
+                    // Try to extract the content from the OpenAI chunk
+                    const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
+                    if (match && match[1]) {{
+                        llmText += match[1];
+                        llmDiv.textContent = llmText;
+                    }}
+                }}
+                messages.push({{ role: 'assistant', content: llmText }});
+            }} catch (e) {{
+                llmDiv.textContent = '[Error: ' + e.message + ']';
+            }}
+            sendBtn.disabled = false;
+        }}
+        sendBtn.onclick = sendMessage;
+        chatInput.addEventListener('keydown', function(e) {{
+            if (e.key === 'Enter') sendMessage();
+        }});
+        </script>
+    </body>
+    </html>
+    """
+    return HTMLResponse(content=styled_html)