speedy-utils 1.1.35__py3-none-any.whl → 1.1.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/lm/llm.py +86 -0
- llm_utils/lm/mixins.py +204 -0
- {speedy_utils-1.1.35.dist-info → speedy_utils-1.1.36.dist-info}/METADATA +1 -1
- {speedy_utils-1.1.35.dist-info → speedy_utils-1.1.36.dist-info}/RECORD +6 -6
- {speedy_utils-1.1.35.dist-info → speedy_utils-1.1.36.dist-info}/entry_points.txt +1 -0
- {speedy_utils-1.1.35.dist-info → speedy_utils-1.1.36.dist-info}/WHEEL +0 -0
llm_utils/lm/llm.py
CHANGED
@@ -435,3 +435,89 @@ class LLM(
             vllm_reuse=vllm_reuse,
             **model_kwargs,
         )
+from typing import Any, Dict, List, Optional, Type, Union
+from pydantic import BaseModel
+from .llm import LLM, Messages
+
+class LLM_NEMOTRON3(LLM):
+    """
+    Custom implementation for NVIDIA Nemotron-3 reasoning models.
+    Supports thinking budget control and native reasoning tags.
+    """
+
+    def __init__(
+        self,
+        model: str = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+        thinking_budget: int = 1024,
+        enable_thinking: bool = True,
+        **kwargs
+    ):
+        # Force reasoning_model to True to enable reasoning_content extraction
+        kwargs['is_reasoning_model'] = True
+        super().__init__(**kwargs)
+
+        self.model_kwargs['model'] = model
+        self.thinking_budget = thinking_budget
+        self.enable_thinking = enable_thinking
+
+    def _prepare_input(self, input_data: str | BaseModel | list[dict]) -> Messages:
+        """Override to ensure Nemotron chat template requirements are met."""
+        messages = super()._prepare_input(input_data)
+        return messages
+
+    def __call__(
+        self,
+        input_data: str | BaseModel | list[dict],
+        thinking_budget: Optional[int] = None,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        budget = thinking_budget or self.thinking_budget
+
+        if not self.enable_thinking:
+            # Simple pass with thinking disabled in template
+            return super().__call__(
+                input_data,
+                chat_template_kwargs={"enable_thinking": False},
+                **kwargs
+            )
+
+        # --- STEP 1: Generate Thinking Trace ---
+        # We manually append <think> to force the reasoning MoE layers
+        messages = self._prepare_input(input_data)
+
+        # We use the raw text completion for the budget phase
+        # Stop at the closing tag or budget limit
+        thinking_response = self.text_completion(
+            input_data,
+            max_tokens=budget,
+            stop=["</think>"],
+            **kwargs
+        )[0]
+
+        reasoning_content = thinking_response['parsed']
+
+        # Ensure proper tag closing for the second pass
+        if "</think>" not in reasoning_content:
+            reasoning_content = f"{reasoning_content}\n</think>"
+        elif not reasoning_content.endswith("</think>"):
+            # Ensure it ends exactly with the tag for continuity
+            reasoning_content = reasoning_content.split("</think>")[0] + "</think>"
+
+        # --- STEP 2: Generate Final Answer ---
+        # Append the thought to the assistant role and continue
+        final_messages = messages + [
+            {"role": "assistant", "content": f"<think>\n{reasoning_content}\n"}
+        ]
+
+        # Use continue_final_message to prevent the model from repeating the header
+        results = super().__call__(
+            final_messages,
+            extra_body={"continue_final_message": True},
+            **kwargs
+        )
+
+        # Inject the reasoning back into the result for the UI/API
+        for res in results:
+            res['reasoning_content'] = reasoning_content
+
+        return results
llm_utils/lm/mixins.py
CHANGED
@@ -469,6 +469,210 @@ class TokenizationMixin:
         data = response.json()
         return data['prompt']
 
+    def generate(
+        self,
+        input_context: str | list[int],
+        *,
+        max_tokens: int = 512,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        repetition_penalty: float = 1.0,
+        presence_penalty: float = 0.0,
+        frequency_penalty: float = 0.0,
+        n: int = 1,
+        stop: str | list[str] | None = None,
+        stop_token_ids: list[int] | None = None,
+        ignore_eos: bool = False,
+        min_tokens: int = 0,
+        skip_special_tokens: bool = True,
+        spaces_between_special_tokens: bool = True,
+        logprobs: int | None = None,
+        prompt_logprobs: int | None = None,
+        seed: int | None = None,
+        return_token_ids: bool = False,
+        return_text: bool = True,
+        stream: bool = False,
+        **kwargs,
+    ) -> dict[str, Any] | list[dict[str, Any]]:
+        """
+        Generate text using HuggingFace Transformers-style interface.
+
+        This method provides a low-level generation interface similar to
+        HuggingFace's model.generate(), working directly with token IDs
+        and the /inference/v1/generate endpoint.
+
+        Args:
+            input_context: Input as text (str) or token IDs (list[int])
+            max_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature (higher = more random)
+            top_p: Nucleus sampling probability threshold
+            top_k: Top-k sampling parameter (-1 to disable)
+            min_p: Minimum probability threshold
+            repetition_penalty: Penalty for repeating tokens
+            presence_penalty: Presence penalty for token diversity
+            frequency_penalty: Frequency penalty for token diversity
+            n: Number of sequences to generate
+            stop: Stop sequences (string or list of strings)
+            stop_token_ids: Token IDs to stop generation
+            ignore_eos: Whether to ignore EOS token
+            min_tokens: Minimum number of tokens to generate
+            skip_special_tokens: Skip special tokens in output
+            spaces_between_special_tokens: Add spaces between special tokens
+            logprobs: Number of top logprobs to return
+            prompt_logprobs: Number of prompt logprobs to return
+            seed: Random seed for reproducibility
+            return_token_ids: If True, include token IDs in output
+            return_text: If True, include decoded text in output
+            stream: If True, stream the response (not fully implemented)
+            **kwargs: Additional parameters passed to the API
+
+        Returns:
+            Dictionary with generation results containing:
+            - 'text': Generated text (if return_text=True)
+            - 'token_ids': Generated token IDs (if return_token_ids=True)
+            - 'logprobs': Log probabilities (if logprobs is set)
+            If n > 1, returns list of result dictionaries
+        """
+        import requests
+
+        # Convert text input to token IDs if needed
+        if isinstance(input_context, str):
+            token_ids = self.encode(input_context, add_special_tokens=True)
+        else:
+            token_ids = input_context
+
+        # Get base_url (generate endpoint is at root level like /inference/v1/generate)
+        base_url = str(self.client.base_url).rstrip('/')
+        if base_url.endswith('/v1'):
+            base_url = base_url[:-3]  # Remove '/v1'
+
+        # Build sampling params matching the API schema
+        sampling_params = {
+            'max_tokens': max_tokens,
+            'temperature': temperature,
+            'top_p': top_p,
+            'top_k': top_k,
+            'min_p': min_p,
+            'repetition_penalty': repetition_penalty,
+            'presence_penalty': presence_penalty,
+            'frequency_penalty': frequency_penalty,
+            'n': n,
+            'stop': stop or [],
+            'stop_token_ids': stop_token_ids or [],
+            'ignore_eos': ignore_eos,
+            'min_tokens': min_tokens,
+            'skip_special_tokens': skip_special_tokens,
+            'spaces_between_special_tokens': spaces_between_special_tokens,
+            'logprobs': logprobs,
+            'prompt_logprobs': prompt_logprobs,
+        }
+
+        if seed is not None:
+            sampling_params['seed'] = seed
+
+        # Build request payload
+        request_data = {
+            'token_ids': token_ids,
+            'sampling_params': sampling_params,
+            'stream': stream,
+        }
+
+        # Add any additional kwargs
+        request_data.update(kwargs)
+
+        # Make API request
+        response = requests.post(
+            f'{base_url}/inference/v1/generate',
+            json=request_data,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+        # Process response
+        # The API may return different structures, handle common cases
+        if n == 1:
+            result = {}
+
+            # Extract from choices format
+            if 'choices' in data and len(data['choices']) > 0:
+                choice = data['choices'][0]
+
+                # Get token IDs first
+                generated_token_ids = None
+                if 'token_ids' in choice:
+                    generated_token_ids = choice['token_ids']
+                    if return_token_ids:
+                        result['token_ids'] = generated_token_ids
+
+                # Decode to text if requested
+                if return_text:
+                    if 'text' in choice:
+                        result['text'] = choice['text']
+                    elif generated_token_ids is not None:
+                        # Decode token IDs to text
+                        result['text'] = self.decode(generated_token_ids)
+                    elif 'message' in choice and 'content' in choice['message']:
+                        result['text'] = choice['message']['content']
+
+                # Include logprobs if requested
+                if logprobs is not None and 'logprobs' in choice:
+                    result['logprobs'] = choice['logprobs']
+
+                # Include finish reason
+                if 'finish_reason' in choice:
+                    result['finish_reason'] = choice['finish_reason']
+
+            # Fallback to direct fields
+            elif 'text' in data and return_text:
+                result['text'] = data['text']
+            elif 'token_ids' in data:
+                if return_token_ids:
+                    result['token_ids'] = data['token_ids']
+                if return_text:
+                    result['text'] = self.decode(data['token_ids'])
+
+            # Store raw response for debugging
+            result['_raw_response'] = data
+
+            return result
+        else:
+            # Multiple generations (n > 1)
+            results = []
+            choices = data.get('choices', [])
+
+            for i in range(min(n, len(choices))):
+                choice = choices[i]
+                result = {}
+
+                # Get token IDs
+                generated_token_ids = None
+                if 'token_ids' in choice:
+                    generated_token_ids = choice['token_ids']
+                    if return_token_ids:
+                        result['token_ids'] = generated_token_ids
+
+                # Decode to text if requested
+                if return_text:
+                    if 'text' in choice:
+                        result['text'] = choice['text']
+                    elif generated_token_ids is not None:
+                        result['text'] = self.decode(generated_token_ids)
+                    elif 'message' in choice and 'content' in choice['message']:
+                        result['text'] = choice['message']['content']
+
+                if logprobs is not None and 'logprobs' in choice:
+                    result['logprobs'] = choice['logprobs']
+
+                if 'finish_reason' in choice:
+                    result['finish_reason'] = choice['finish_reason']
+
+                result['_raw_response'] = choice
+                results.append(result)
+
+            return results
+
 
 class ModelUtilsMixin:
     """Mixin for model utility methods."""
{speedy_utils-1.1.35.dist-info → speedy_utils-1.1.36.dist-info}/RECORD
CHANGED
@@ -6,10 +6,10 @@ llm_utils/chat_format/transform.py,sha256=PJ2g9KT1GSbWuAs7giEbTpTAffpU9QsIXyRlbf
 llm_utils/chat_format/utils.py,sha256=M2EctZ6NeHXqFYufh26Y3CpSphN0bdZm5xoNaEJj5vg,1251
 llm_utils/lm/__init__.py,sha256=4jYMy3wPH3tg-tHFyWEWOqrnmX4Tu32VZCdzRGMGQsI,778
 llm_utils/lm/base_prompt_builder.py,sha256=_TzYMsWr-SsbA_JNXptUVN56lV5RfgWWTrFi-E8LMy4,12337
-llm_utils/lm/llm.py,sha256=
+llm_utils/lm/llm.py,sha256=2vq8BScwp4gWb89EmUPaiBCzkBSr0x2B3qJLaPM11_M,19644
 llm_utils/lm/llm_signature.py,sha256=vV8uZgLLd6ZKqWbq0OPywWvXAfl7hrJQnbtBF-VnZRU,1244
 llm_utils/lm/lm_base.py,sha256=Bk3q34KrcCK_bC4Ryxbc3KqkiPL39zuVZaBQ1i6wJqs,9437
-llm_utils/lm/mixins.py,sha256=
+llm_utils/lm/mixins.py,sha256=Nz7CwJFBOvbZNbODUlJC04Pcbac3zWnT8vy7sZG_MVI,24906
 llm_utils/lm/openai_memoize.py,sha256=rYrSFPpgO7adsjK1lVdkJlhqqIw_13TCW7zU8eNwm3o,5185
 llm_utils/lm/signature.py,sha256=K1hvCAqoC5CmsQ0Y_ywnYy2fRb5JzmIK8OS-hjH-5To,9971
 llm_utils/lm/utils.py,sha256=dEKFta8S6Mm4LjIctcpFlEGL9RnmLm5DHd2TA70UWuA,12649
@@ -50,7 +50,7 @@ vision_utils/README.md,sha256=AIDZZj8jo_QNrEjFyHwd00iOO431s-js-M2dLtVTn3I,5740
 vision_utils/__init__.py,sha256=hF54sT6FAxby8kDVhOvruy4yot8O-Ateey5n96O1pQM,284
 vision_utils/io_utils.py,sha256=pI0Va6miesBysJcllK6NXCay8HpGZsaMWwlsKB2DMgA,26510
 vision_utils/plot.py,sha256=HkNj3osA3moPuupP1VguXfPPOW614dZO5tvC-EFKpKM,12028
-speedy_utils-1.1.
-speedy_utils-1.1.
-speedy_utils-1.1.
-speedy_utils-1.1.
+speedy_utils-1.1.36.dist-info/METADATA,sha256=yZYfOkBwR1aiMwAKAxn78sYCbmNnBt-5lmGY6d11hPI,8048
+speedy_utils-1.1.36.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+speedy_utils-1.1.36.dist-info/entry_points.txt,sha256=rwn89AYfBUh9SRJtFbpp-u2JIKiqmZ2sczvqyO6s9cI,289
+speedy_utils-1.1.36.dist-info/RECORD,,
{speedy_utils-1.1.35.dist-info → speedy_utils-1.1.36.dist-info}/WHEEL
File without changes