lemonade-sdk 7.0.3__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (55)
  1. lemonade/api.py +3 -3
  2. lemonade/cli.py +11 -17
  3. lemonade/common/build.py +0 -47
  4. lemonade/common/network.py +50 -0
  5. lemonade/common/status.py +2 -21
  6. lemonade/common/system_info.py +19 -4
  7. lemonade/profilers/memory_tracker.py +3 -1
  8. lemonade/tools/accuracy.py +3 -4
  9. lemonade/tools/adapter.py +1 -2
  10. lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
  11. lemonade/tools/huggingface/load.py +235 -0
  12. lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
  13. lemonade/tools/humaneval.py +9 -3
  14. lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
  15. lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
  16. lemonade/tools/mmlu.py +7 -15
  17. lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
  18. lemonade/tools/oga/utils.py +423 -0
  19. lemonade/tools/perplexity.py +4 -3
  20. lemonade/tools/prompt.py +2 -1
  21. lemonade/tools/quark/quark_load.py +2 -1
  22. lemonade/tools/quark/quark_quantize.py +5 -5
  23. lemonade/tools/report/table.py +3 -3
  24. lemonade/tools/server/llamacpp.py +159 -34
  25. lemonade/tools/server/serve.py +169 -147
  26. lemonade/tools/server/static/favicon.ico +0 -0
  27. lemonade/tools/server/static/styles.css +568 -0
  28. lemonade/tools/server/static/webapp.html +439 -0
  29. lemonade/tools/server/tray.py +458 -0
  30. lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
  31. lemonade/tools/server/utils/system_tray.py +395 -0
  32. lemonade/tools/server/{instructions.py → webapp.py} +4 -10
  33. lemonade/version.py +1 -1
  34. lemonade_install/install.py +46 -28
  35. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/METADATA +84 -22
  36. lemonade_sdk-8.0.0.dist-info/RECORD +70 -0
  37. lemonade_server/cli.py +182 -27
  38. lemonade_server/model_manager.py +192 -20
  39. lemonade_server/pydantic_models.py +9 -4
  40. lemonade_server/server_models.json +5 -3
  41. lemonade/common/analyze_model.py +0 -26
  42. lemonade/common/labels.py +0 -61
  43. lemonade/common/onnx_helpers.py +0 -176
  44. lemonade/common/plugins.py +0 -10
  45. lemonade/common/tensor_helpers.py +0 -83
  46. lemonade/tools/server/static/instructions.html +0 -262
  47. lemonade_sdk-7.0.3.dist-info/RECORD +0 -69
  48. /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
  49. /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
  50. /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
  51. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/WHEEL +0 -0
  52. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/entry_points.txt +0 -0
  53. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/LICENSE +0 -0
  54. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/NOTICE.md +0 -0
  55. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/top_level.txt +0 -0
lemonade/tools/oga/utils.py ADDED
@@ -0,0 +1,423 @@
+ import os
+ import time
+ import json
+ import logging
+ from queue import Queue
+ from packaging.version import Version
+ import onnxruntime_genai as og
+ from transformers import AutoTokenizer
+ from lemonade.tools.adapter import (
+     ModelAdapter,
+     TokenizerAdapter,
+     PassthroughTokenizerResult,
+ )
+
+
+ class OrtGenaiTokenizer(TokenizerAdapter):
+     def __init__(self, model: og.Model, hf_tokenizer: AutoTokenizer):
+         super().__init__(hf_tokenizer)
+         # Initialize OGA tokenizer
+         self.tokenizer = og.Tokenizer(model)
+
+         # Placeholder value since some code will try to query it
+         # If we actually need this to return a proper value, then
+         # og.GeneratorParams.eos_token_id has it
+         self.eos_token_id = None
+
+     def __call__(self, prompt: str, return_tensors="np"):
+         tokens = self.tokenizer.encode(prompt)
+         return PassthroughTokenizerResult(tokens)
+
+     # pylint: disable=unused-argument
+     def decode(self, response, skip_special_tokens=True) -> str:
+         return self.tokenizer.decode(response)
+
+
+ class OrtGenaiStreamer:
+     def __init__(self, tokenizer: OrtGenaiTokenizer, timeout=None):
+         self.tokenizer = tokenizer
+         self.text_queue = Queue()
+         self.stop_signal = None
+         self.timeout = timeout
+
+     def add_text(self, text: str):
+         self.text_queue.put(text, timeout=self.timeout)
+
+     def done(self):
+         self.text_queue.put(self.stop_signal, timeout=self.timeout)
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         value = self.text_queue.get(timeout=self.timeout)
+         if value == self.stop_signal:
+             raise StopIteration()
+         else:
+             return value
+
+
+ class OrtGenaiModel(ModelAdapter):
+
+     def __init__(self, input_folder):
+         super().__init__()
+         self.model = og.Model(input_folder)
+         self.type = "ort-genai"
+         self.config = self.load_config(input_folder)
+
+     def load_config(self, input_folder):
+         rai_config_path = os.path.join(input_folder, "rai_config.json")
+         if os.path.exists(rai_config_path):
+             with open(rai_config_path, "r", encoding="utf-8") as f:
+                 max_prompt_length = json.load(f)["max_prompt_length"]["1.4.1"]
+         else:
+             max_prompt_length = None
+
+         config_path = os.path.join(input_folder, "genai_config.json")
+         if os.path.exists(config_path):
+             with open(config_path, "r", encoding="utf-8") as f:
+                 config_dict = json.load(f)
+                 if max_prompt_length:
+                     config_dict["max_prompt_length"] = max_prompt_length
+                 return config_dict
+         return None
+
+     def generate(
+         self,
+         input_ids,
+         max_new_tokens=512,
+         min_new_tokens=0,
+         do_sample=True,
+         top_k=50,
+         top_p=1.0,
+         temperature=0.7,
+         streamer: OrtGenaiStreamer = None,
+         pad_token_id=None,
+         stopping_criteria=None,
+         max_length=None,
+         random_seed=1,
+     ):
+         params = og.GeneratorParams(self.model)
+
+         prompt_length = len(input_ids)
+         max_prompt_length = self.config.get("max_prompt_length")
+         if max_prompt_length and prompt_length > max_prompt_length:
+             raise ValueError(
+                 f"This prompt (length {prompt_length}) exceeds the model's "
+                 f"maximum allowed prompt length ({max_prompt_length})."
+             )
+
+         # There is a breaking API change in OGA 0.6.0
+         # Determine whether we should use the old or new APIs
+         # This also supports 0.6.0.dev0, which evaluates to less than 0.6.0 in Version
+         use_oga_post_6_api = (
+             Version(og.__version__) >= Version("0.6.0") or "0.6.0" in og.__version__
+         )
+         use_oga_pre_6_api = not use_oga_post_6_api
+
+         if pad_token_id:
+             params.pad_token_id = pad_token_id
+
+         # Handle max_length and max_new_tokens
+         if max_length and max_new_tokens:
+             logging.warning(
+                 "Both max_length and max_new_tokens were provided. "
+                 "max_length will take precedence. "
+                 "When setting max_length, please explicitly set max_new_tokens to None."
+             )
+         max_length_to_use = None
+         if max_length:
+             max_length_to_use = max_length
+         elif max_new_tokens:
+             max_length_to_use = prompt_length + max_new_tokens
+
+         min_length = prompt_length + min_new_tokens
+
+         if use_oga_pre_6_api:
+             params.input_ids = input_ids
+
+         if random_seed is None:
+             random_seed = -1  # In og.Generator, -1 = seed with random device
+
+         if self.config and "search" in self.config:
+             search_config = self.config["search"]
+             params.set_search_options(
+                 do_sample=search_config.get("do_sample", do_sample),
+                 top_k=search_config.get("top_k", top_k),
+                 top_p=search_config.get("top_p", top_p),
+                 temperature=search_config.get("temperature", temperature),
+                 max_length=max_length_to_use,
+                 min_length=min_length,
+                 early_stopping=search_config.get("early_stopping", False),
+                 length_penalty=search_config.get("length_penalty", 1.0),
+                 num_beams=search_config.get("num_beams", 1),
+                 num_return_sequences=search_config.get("num_return_sequences", 1),
+                 repetition_penalty=search_config.get("repetition_penalty", 1.0),
+                 past_present_share_buffer=search_config.get(
+                     "past_present_share_buffer", True
+                 ),
+                 random_seed=random_seed,
+                 # Not currently supported by OGA
+                 # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+                 # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+             )
+         else:
+             params.set_search_options(
+                 do_sample=do_sample,
+                 top_k=top_k,
+                 top_p=top_p,
+                 temperature=temperature,
+                 max_length=max_length_to_use,
+                 min_length=min_length,
+                 random_seed=random_seed,
+             )
+         params.try_graph_capture_with_max_batch_size(1)
+
+         generator = og.Generator(self.model, params)
+
+         if streamer is None:
+             prompt_start_time = time.perf_counter()
+             if use_oga_post_6_api:
+                 generator.append_tokens(input_ids)
+             if use_oga_pre_6_api:
+                 generator.compute_logits()
+             generator.generate_next_token()
+             prompt_end_time = time.perf_counter()
+
+             self.time_to_first_token = prompt_end_time - prompt_start_time
+
+             if max_new_tokens > 1:
+
+                 token_gen_times = []
+                 while not generator.is_done():
+                     token_gen_start_time = time.perf_counter()
+                     if use_oga_pre_6_api:
+                         generator.compute_logits()
+                     generator.generate_next_token()
+                     token_gen_end_time = time.perf_counter()
+
+                     token_gen_times.append(token_gen_end_time - token_gen_start_time)
+
+                 if token_gen_times:
+                     # List will be empty if we generated 1 or 0 tokens, and we don't
+                     # want a divide-by-zero error in those cases
+                     avg_token_gen_latency_s = sum(token_gen_times) / len(
+                         token_gen_times
+                     )
+                     self.tokens_per_second = 1 / avg_token_gen_latency_s
+
+             return [generator.get_sequence(0)]
+         else:
+             if use_oga_post_6_api:
+                 generator.append_tokens(input_ids)
+             tokenizer_stream = streamer.tokenizer.tokenizer.create_stream()
+
+             stop_early = False
+
+             while not generator.is_done() and not stop_early:
+                 if use_oga_pre_6_api:
+                     generator.compute_logits()
+                 generator.generate_next_token()
+
+                 new_token = generator.get_next_tokens()[0]
+                 new_text = tokenizer_stream.decode(new_token)
+
+                 streamer.add_text(new_text)
+
+                 if stopping_criteria is not None:
+                     if stopping_criteria[0].stop_event.is_set():
+                         stop_early = True
+
+             streamer.done()
+
+     def _model_call(self, input_ids):
+         """
+         Run the model on input_ids and get logits.
+
+         This method directly accesses model logits rather than using the full generate pipeline for
+         several important reasons:
+         1. Purpose: We need raw logits from a single forward pass, while generate() is optimized for
+            producing multiple tokens through iterative inference
+         2. Efficiency: Direct access is more efficient for logprob calculations with no
+            sampling overhead
+         3. Precision: Logprob calculations require exact control over input-to-output mapping
+         4. Consistency: Similar approach used in both HF and OGA implementations
+
+         Args:
+             input_ids: Input token IDs
+
+         Returns:
+             Logits for each token in the sequence
+         """
+         import torch
+
+         # Setup generator params
+         params = og.GeneratorParams(self.model)
+
+         # Configure for a simple forward pass
+         params.set_search_options(
+             do_sample=False,
+             temperature=0.0,
+             max_length=len(input_ids),
+         )
+
+         # Initialize generator
+         generator = og.Generator(self.model, params)
+
+         # Feed tokens to model based on API version
+         generator.append_tokens(input_ids)
+
+         # Extract logits - this returns a list of logits tensors
+         logits = generator.get_output("logits")
+
+         # Convert to torch tensor for easier processing
+         return torch.tensor(logits[0])
+
+     def _select_cont_toks(self, logits, context_len, continuation_tokens):
+         """
+         Select and process logits for continuation tokens.
+
+         Args:
+             logits: Full sequence logits
+             context_len: Length of context tokens
+             continuation_tokens: List or tensor of continuation token IDs
+
+         Returns:
+             Log probabilities for continuation tokens
+         """
+         import torch
+
+         # Extract relevant logits for continuation prediction (shift by one)
+         cont_logits = logits[
+             context_len - 1 : context_len - 1 + len(continuation_tokens)
+         ]
+
+         # Convert to torch tensors if needed
+         if not isinstance(continuation_tokens, torch.Tensor):
+             continuation_tokens = torch.tensor(continuation_tokens, dtype=torch.long)
+
+         # Apply log softmax to get log probabilities
+         log_probs = torch.log_softmax(cont_logits, dim=-1)
+
+         # Get log probs for the specific continuation tokens
+         token_log_probs = torch.gather(
+             log_probs, 1, continuation_tokens.unsqueeze(-1)
+         ).squeeze(-1)
+
+         return token_log_probs
+
+     def compute_logprobs(
+         self, text, tokenizer, prompt_length=None, logprobs=None, echo=False
+     ):
+         """
+         Compute log probabilities for all tokens in the given text.
+
+         Args:
+             text: The full text to analyze (e.g., prompt + completion)
+             prompt_length: Number of tokens in the prompt. If provided and echo=False,
+                 only completion tokens after this position will be returned.
+             logprobs: If not None, return log probabilities. Value indicates how many top
+                 alternatives to return. If True but not an integer, defaults to 5 alternatives.
+             echo: If True, include logprobs for prompt tokens. If False, only return logprobs
+                 for completion tokens.
+
+         Returns:
+             - text_offset: Character offsets for each token in the text
+             - token_logprobs: Log probability for each token
+             - tokens: The actual tokens used
+             - top_logprobs: Top alternative log probabilities for each position
+         """
+         import torch
+
+         if tokenizer is None:
+             raise ValueError("Tokenizer is required for logprob calculation")
+
+         # Encode the full text
+         tokens = tokenizer(text).input_ids  # pylint: disable=E1102
+
+         # Track character offsets for each token
+         text_offset = []
+         start_idx = 0
+
+         token_strings = []
+         for token_id in tokens:
+             token_str = tokenizer.decode([token_id])
+             token_strings.append(token_str)
+
+             # Calculate character offsets for tokens - handles cases where tokens
+             # may not directly match in the original text due to encoding differences,
+             # special characters, or tokenization artifacts
+             try:
+                 pos = text[start_idx:].find(token_str)
+                 if pos != -1:
+                     text_offset.append(start_idx + pos)
+                     start_idx += pos + len(token_str)
+                 else:
+                     text_offset.append(start_idx)
+             except (TypeError, ValueError, UnicodeError):
+                 # Fallback to current position when matching fails due to encoding issues
+                 text_offset.append(start_idx)
+
+         # Get logits from model
+         logits = self._model_call(tokens)
+
+         # Calculate log probabilities for each token
+         all_log_probs = torch.log_softmax(logits, dim=-1)
+
+         # The first token doesn't have a conditional probability
+         # For tokens after the first, get the predicted probability
+         token_log_probs = []
+         top_logprobs_list = []
+
+         # For each position, get the actual token probability and top alternatives
+         for i in range(len(tokens)):
+             # Get previous token position logits
+             if i > 0:  # First token has no preceding context
+                 prev_logits = all_log_probs[i - 1]
+                 curr_token_id = tokens[i]
+                 # Get probability of the actual token that appeared
+                 token_logprob = prev_logits[curr_token_id].item()
+                 token_log_probs.append(token_logprob)
+
+                 # Get top-k alternatives if requested
+                 if logprobs is not None:
+                     num_alternatives = logprobs if isinstance(logprobs, int) else 5
+                     topk_values, topk_indices = torch.topk(
+                         prev_logits, min(num_alternatives, prev_logits.size(-1))
+                     )
+
+                     # Create dictionary of token: logprob
+                     position_logprobs = {}
+                     for val, idx in zip(topk_values.tolist(), topk_indices.tolist()):
+                         token_str = tokenizer.decode([idx])
+                         position_logprobs[token_str] = val
+
+                     top_logprobs_list.append(position_logprobs)
+             else:
+                 # For the first token, we don't have a conditional probability
+                 token_log_probs.append(None)
+                 top_logprobs_list.append({})
+
+         # If we don't want to echo prompt tokens, filter them out
+         if not echo and prompt_length is not None:
+             # Ensure prompt_length is within bounds
+             prompt_length = min(prompt_length, len(tokens))
+
+             # Filter results to only include completion tokens
+             if prompt_length < len(tokens):
+                 filtered_text_offset = text_offset[prompt_length:]
+                 filtered_token_logprobs = token_log_probs[prompt_length:]
+                 filtered_tokens = token_strings[prompt_length:]
+                 filtered_top_logprobs = top_logprobs_list[prompt_length:]
+
+                 return (
+                     filtered_text_offset,
+                     filtered_token_logprobs,
+                     filtered_tokens,
+                     filtered_top_logprobs,
+                 )
+             else:
+                 # No completion tokens
+                 return [], [], [], []
+
+         return text_offset, token_log_probs, token_strings, top_logprobs_list
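
For orientation, here is a minimal usage sketch (not part of the diff) showing how the new adapter classes above might fit together for streaming generation. It assumes the classes are importable from the new lemonade.tools.oga.utils module shown in the file list; the model folder path and Hugging Face checkpoint id are placeholders.

from threading import Thread

from transformers import AutoTokenizer
from lemonade.tools.oga.utils import OrtGenaiModel, OrtGenaiStreamer, OrtGenaiTokenizer

MODEL_DIR = "path/to/oga/model/folder"   # placeholder: folder containing genai_config.json
CHECKPOINT = "some-org/some-checkpoint"  # placeholder: matching HF tokenizer

model = OrtGenaiModel(MODEL_DIR)
tokenizer = OrtGenaiTokenizer(model.model, AutoTokenizer.from_pretrained(CHECKPOINT))

input_ids = tokenizer("Why is the sky blue?").input_ids
streamer = OrtGenaiStreamer(tokenizer)

# generate() pushes decoded text into the streamer's queue and blocks until done,
# so it runs on a worker thread while the caller iterates the streamer.
worker = Thread(
    target=model.generate,
    kwargs={"input_ids": input_ids, "streamer": streamer, "max_new_tokens": 64},
)
worker.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
worker.join()

# The same adapter also exposes OpenAI-style logprob data (requires torch):
offsets, token_logprobs, tokens, top_logprobs = model.compute_logprobs(
    "The capital of France is Paris", tokenizer, logprobs=5
)
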
lemonade/tools/perplexity.py CHANGED
@@ -1,8 +1,5 @@
  import os
  import argparse
- import pandas as pd
- import torch
- from datasets import load_dataset
  from lemonade.state import State
  from lemonade.tools import Tool
  import lemonade.common.printing as printing
@@ -41,6 +38,10 @@ class AccuracyPerplexity(Tool):
          state: State,
      ) -> State:

+         import pandas as pd
+         import torch
+         from datasets import load_dataset
+
          try:
              printing.log_info("Downloading dataset ...")
              dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
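
Several hunks in this release (perplexity.py above, and prompt.py and the Quark tools below) apply the same change: heavy dependencies such as pandas, torch, matplotlib, and datasets are no longer imported at module scope but inside the tool's run() method. A minimal sketch of the pattern, with illustrative names rather than actual lemonade APIs:

class ExampleTool:
    """Illustrative only: heavy imports happen when the tool runs, not when it is imported."""

    def run(self, state):
        # Deferred imports keep module import and CLI startup cheap for users
        # who never invoke this tool.
        import pandas as pd
        import torch

        frame = pd.DataFrame({"cuda_available": [torch.cuda.is_available()]})
        print(frame)
        return state
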
lemonade/tools/prompt.py CHANGED
@@ -1,6 +1,5 @@
  import argparse
  import os
- import matplotlib.pyplot as plt
  import lemonade.common.build as build
  import lemonade.common.printing as printing
  from lemonade.state import State
@@ -154,6 +153,8 @@ class LLMPrompt(Tool):
          random_seed: int = DEFAULT_RANDOM_SEED,
      ) -> State:

+         import matplotlib.pyplot as plt
+
          model: ModelAdapter = state.model
          tokenizer: TokenizerAdapter = state.tokenizer

lemonade/tools/quark/quark_load.py CHANGED
@@ -2,7 +2,6 @@ import argparse
  import os
  import sys

- import torch
  from lemonade.state import State
  from lemonade.tools import Tool
  import lemonade.common.printing as printing
@@ -101,6 +100,8 @@ class QuarkLoad(Tool):
              Exception: If an error occurs during the QuarkLoad process.
          """

+         import torch
+
          try:
              if os.path.isdir(DEFAULT_QUARK_DIR):
                  quark_llm_path = os.path.join(
lemonade/tools/quark/quark_quantize.py CHANGED
@@ -2,9 +2,6 @@ import argparse
  import os
  import sys
  from pathlib import Path
-
- import torch
- from transformers import AutoProcessor
  from lemonade.state import State
  from lemonade.tools import Tool
  import lemonade.common.printing as printing
@@ -319,8 +316,8 @@ class QuarkQuantize(Tool):
          - Optionally exporting, compiling, and evaluating the model.
          """

-         model = state.model.model
-         tokenizer = state.tokenizer
+         import torch
+         from transformers import AutoProcessor

          # Importing quark utils after adding to sys.path
          from llm_utils.data_preparation import get_calib_dataloader
@@ -328,6 +325,9 @@ class QuarkQuantize(Tool):
          from llm_ptq.configuration_preparation import get_config, get_export_config
          from quark.torch import ModelQuantizer, ModelExporter, save_params

+         model = state.model.model
+         tokenizer = state.tokenizer
+
          # 1. Load Model
          printing.log_info("Loading model ...")
          model_type = get_model_type(model)
lemonade/tools/report/table.py CHANGED
@@ -7,10 +7,10 @@ from tabulate import tabulate
  import lemonade.common.build as build
  import lemonade.common.filesystem as fs
  from lemonade.cache import Keys
- from lemonade.tools.huggingface_bench import HuggingfaceBench
- from lemonade.tools.llamacpp_bench import LlamaCppBench
+ from lemonade.tools.huggingface.bench import HuggingfaceBench
+ from lemonade.tools.llamacpp.bench import LlamaCppBench
  from lemonade.tools.mmlu import AccuracyMMLU
- from lemonade.tools.ort_genai.oga_bench import OgaBench
+ from lemonade.tools.oga.bench import OgaBench

  # List of python packages for which to log the version
  PYTHON_PACKAGES = ["onnxruntime", "transformers", "lemonade-sdk", "voe"]