speedy-utils 1.1.35__py3-none-any.whl → 1.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_utils/lm/llm.py CHANGED
@@ -435,3 +435,89 @@ class LLM(
             vllm_reuse=vllm_reuse,
             **model_kwargs,
         )
+from typing import Any, Dict, List, Optional, Type, Union
+from pydantic import BaseModel
+from .llm import LLM, Messages
+
+class LLM_NEMOTRON3(LLM):
+    """
+    Custom implementation for NVIDIA Nemotron-3 reasoning models.
+    Supports thinking budget control and native reasoning tags.
+    """
+
+    def __init__(
+        self,
+        model: str = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+        thinking_budget: int = 1024,
+        enable_thinking: bool = True,
+        **kwargs
+    ):
+        # Force reasoning_model to True to enable reasoning_content extraction
+        kwargs['is_reasoning_model'] = True
+        super().__init__(**kwargs)
+
+        self.model_kwargs['model'] = model
+        self.thinking_budget = thinking_budget
+        self.enable_thinking = enable_thinking
+
+    def _prepare_input(self, input_data: str | BaseModel | list[dict]) -> Messages:
+        """Override to ensure Nemotron chat template requirements are met."""
+        messages = super()._prepare_input(input_data)
+        return messages
+
+    def __call__(
+        self,
+        input_data: str | BaseModel | list[dict],
+        thinking_budget: Optional[int] = None,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        budget = thinking_budget or self.thinking_budget
+
+        if not self.enable_thinking:
+            # Simple pass with thinking disabled in template
+            return super().__call__(
+                input_data,
+                chat_template_kwargs={"enable_thinking": False},
+                **kwargs
+            )
+
+        # --- STEP 1: Generate Thinking Trace ---
+        # We manually append <think> to force the reasoning MoE layers
+        messages = self._prepare_input(input_data)
+
+        # We use the raw text completion for the budget phase
+        # Stop at the closing tag or budget limit
+        thinking_response = self.text_completion(
+            input_data,
+            max_tokens=budget,
+            stop=["</think>"],
+            **kwargs
+        )[0]
+
+        reasoning_content = thinking_response['parsed']
+
+        # Ensure proper tag closing for the second pass
+        if "</think>" not in reasoning_content:
+            reasoning_content = f"{reasoning_content}\n</think>"
+        elif not reasoning_content.endswith("</think>"):
+            # Ensure it ends exactly with the tag for continuity
+            reasoning_content = reasoning_content.split("</think>")[0] + "</think>"
+
+        # --- STEP 2: Generate Final Answer ---
+        # Append the thought to the assistant role and continue
+        final_messages = messages + [
+            {"role": "assistant", "content": f"<think>\n{reasoning_content}\n"}
+        ]
+
+        # Use continue_final_message to prevent the model from repeating the header
+        results = super().__call__(
+            final_messages,
+            extra_body={"continue_final_message": True},
+            **kwargs
+        )
+
+        # Inject the reasoning back into the result for the UI/API
+        for res in results:
+            res['reasoning_content'] = reasoning_content
+
+        return results
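
The class added above implements a two-pass flow: STEP 1 runs a budgeted text completion stopped at </think>, and STEP 2 continues the assistant turn from that trace via continue_final_message, injecting the trace back as reasoning_content. A minimal usage sketch (not part of the diff; connection/constructor kwargs are assumptions about the base LLM class, which receives **kwargs unchanged):

    from llm_utils.lm.llm import LLM_NEMOTRON3

    # Hypothetical setup: any client/connection kwargs are passed through to LLM.
    lm = LLM_NEMOTRON3(thinking_budget=2048, enable_thinking=True)

    results = lm("Why is the sky blue?")      # returns List[Dict[str, Any]]
    print(results[0]["reasoning_content"])    # thinking trace injected in STEP 2
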
llm_utils/lm/mixins.py CHANGED
@@ -469,6 +469,210 @@ class TokenizationMixin:
         data = response.json()
         return data['prompt']
 
+    def generate(
+        self,
+        input_context: str | list[int],
+        *,
+        max_tokens: int = 512,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        repetition_penalty: float = 1.0,
+        presence_penalty: float = 0.0,
+        frequency_penalty: float = 0.0,
+        n: int = 1,
+        stop: str | list[str] | None = None,
+        stop_token_ids: list[int] | None = None,
+        ignore_eos: bool = False,
+        min_tokens: int = 0,
+        skip_special_tokens: bool = True,
+        spaces_between_special_tokens: bool = True,
+        logprobs: int | None = None,
+        prompt_logprobs: int | None = None,
+        seed: int | None = None,
+        return_token_ids: bool = False,
+        return_text: bool = True,
+        stream: bool = False,
+        **kwargs,
+    ) -> dict[str, Any] | list[dict[str, Any]]:
+        """
+        Generate text using HuggingFace Transformers-style interface.
+
+        This method provides a low-level generation interface similar to
+        HuggingFace's model.generate(), working directly with token IDs
+        and the /inference/v1/generate endpoint.
+
+        Args:
+            input_context: Input as text (str) or token IDs (list[int])
+            max_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature (higher = more random)
+            top_p: Nucleus sampling probability threshold
+            top_k: Top-k sampling parameter (-1 to disable)
+            min_p: Minimum probability threshold
+            repetition_penalty: Penalty for repeating tokens
+            presence_penalty: Presence penalty for token diversity
+            frequency_penalty: Frequency penalty for token diversity
+            n: Number of sequences to generate
+            stop: Stop sequences (string or list of strings)
+            stop_token_ids: Token IDs to stop generation
+            ignore_eos: Whether to ignore EOS token
+            min_tokens: Minimum number of tokens to generate
+            skip_special_tokens: Skip special tokens in output
+            spaces_between_special_tokens: Add spaces between special tokens
+            logprobs: Number of top logprobs to return
+            prompt_logprobs: Number of prompt logprobs to return
+            seed: Random seed for reproducibility
+            return_token_ids: If True, include token IDs in output
+            return_text: If True, include decoded text in output
+            stream: If True, stream the response (not fully implemented)
+            **kwargs: Additional parameters passed to the API
+
+        Returns:
+            Dictionary with generation results containing:
+            - 'text': Generated text (if return_text=True)
+            - 'token_ids': Generated token IDs (if return_token_ids=True)
+            - 'logprobs': Log probabilities (if logprobs is set)
+            If n > 1, returns list of result dictionaries
+        """
+        import requests
+
+        # Convert text input to token IDs if needed
+        if isinstance(input_context, str):
+            token_ids = self.encode(input_context, add_special_tokens=True)
+        else:
+            token_ids = input_context
+
+        # Get base_url (generate endpoint is at root level like /inference/v1/generate)
+        base_url = str(self.client.base_url).rstrip('/')
+        if base_url.endswith('/v1'):
+            base_url = base_url[:-3]  # Remove '/v1'
+
+        # Build sampling params matching the API schema
+        sampling_params = {
+            'max_tokens': max_tokens,
+            'temperature': temperature,
+            'top_p': top_p,
+            'top_k': top_k,
+            'min_p': min_p,
+            'repetition_penalty': repetition_penalty,
+            'presence_penalty': presence_penalty,
+            'frequency_penalty': frequency_penalty,
+            'n': n,
+            'stop': stop or [],
+            'stop_token_ids': stop_token_ids or [],
+            'ignore_eos': ignore_eos,
+            'min_tokens': min_tokens,
+            'skip_special_tokens': skip_special_tokens,
+            'spaces_between_special_tokens': spaces_between_special_tokens,
+            'logprobs': logprobs,
+            'prompt_logprobs': prompt_logprobs,
+        }
+
+        if seed is not None:
+            sampling_params['seed'] = seed
+
+        # Build request payload
+        request_data = {
+            'token_ids': token_ids,
+            'sampling_params': sampling_params,
+            'stream': stream,
+        }
+
+        # Add any additional kwargs
+        request_data.update(kwargs)
+
+        # Make API request
+        response = requests.post(
+            f'{base_url}/inference/v1/generate',
+            json=request_data,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+        # Process response
+        # The API may return different structures, handle common cases
+        if n == 1:
+            result = {}
+
+            # Extract from choices format
+            if 'choices' in data and len(data['choices']) > 0:
+                choice = data['choices'][0]
+
+                # Get token IDs first
+                generated_token_ids = None
+                if 'token_ids' in choice:
+                    generated_token_ids = choice['token_ids']
+                    if return_token_ids:
+                        result['token_ids'] = generated_token_ids
+
+                # Decode to text if requested
+                if return_text:
+                    if 'text' in choice:
+                        result['text'] = choice['text']
+                    elif generated_token_ids is not None:
+                        # Decode token IDs to text
+                        result['text'] = self.decode(generated_token_ids)
+                    elif 'message' in choice and 'content' in choice['message']:
+                        result['text'] = choice['message']['content']
+
+                # Include logprobs if requested
+                if logprobs is not None and 'logprobs' in choice:
+                    result['logprobs'] = choice['logprobs']
+
+                # Include finish reason
+                if 'finish_reason' in choice:
+                    result['finish_reason'] = choice['finish_reason']
+
+            # Fallback to direct fields
+            elif 'text' in data and return_text:
+                result['text'] = data['text']
+            elif 'token_ids' in data:
+                if return_token_ids:
+                    result['token_ids'] = data['token_ids']
+                if return_text:
+                    result['text'] = self.decode(data['token_ids'])
+
+            # Store raw response for debugging
+            result['_raw_response'] = data
+
+            return result
+        else:
+            # Multiple generations (n > 1)
+            results = []
+            choices = data.get('choices', [])
+
+            for i in range(min(n, len(choices))):
+                choice = choices[i]
+                result = {}
+
+                # Get token IDs
+                generated_token_ids = None
+                if 'token_ids' in choice:
+                    generated_token_ids = choice['token_ids']
+                    if return_token_ids:
+                        result['token_ids'] = generated_token_ids
+
+                # Decode to text if requested
+                if return_text:
+                    if 'text' in choice:
+                        result['text'] = choice['text']
+                    elif generated_token_ids is not None:
+                        result['text'] = self.decode(generated_token_ids)
+                    elif 'message' in choice and 'content' in choice['message']:
+                        result['text'] = choice['message']['content']
+
+                if logprobs is not None and 'logprobs' in choice:
+                    result['logprobs'] = choice['logprobs']
+
+                if 'finish_reason' in choice:
+                    result['finish_reason'] = choice['finish_reason']
+
+                result['_raw_response'] = choice
+                results.append(result)
+
+            return results
+
 
 class ModelUtilsMixin:
     """Mixin for model utility methods."""
llm_utils/scripts/fast_vllm.py ADDED
@@ -0,0 +1,131 @@
+import os
+import sys
+import shutil
+import time
+import argparse
+import subprocess
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def get_hf_cache_home():
+    """Locate the Hugging Face cache directory."""
+    if "HF_HOME" in os.environ:
+        return Path(os.environ["HF_HOME"]) / "hub"
+    return Path.home() / ".cache" / "huggingface" / "hub"
+
+def resolve_model_path(model_id, cache_dir):
+    """Find the physical snapshot directory for the given model ID."""
+    dir_name = "models--" + model_id.replace("/", "--")
+    model_root = cache_dir / dir_name
+    if not model_root.exists():
+        raise FileNotFoundError(f"Model folder not found at: {model_root}")
+
+    # 1. Try to find hash via refs/main
+    ref_path = model_root / "refs" / "main"
+    if ref_path.exists():
+        with open(ref_path, "r") as f:
+            commit_hash = f.read().strip()
+        snapshot_path = model_root / "snapshots" / commit_hash
+        if snapshot_path.exists():
+            return snapshot_path
+
+    # 2. Fallback to the newest snapshot folder
+    snapshots_dir = model_root / "snapshots"
+    if snapshots_dir.exists():
+        subdirs = [x for x in snapshots_dir.iterdir() if x.is_dir()]
+        if subdirs:
+            return sorted(subdirs, key=lambda x: x.stat().st_mtime, reverse=True)[0]
+
+    raise FileNotFoundError(f"No valid snapshot found in {model_root}")
+
+def copy_worker(src, dst):
+    """Copy a single file, following symlinks to capture actual data."""
+    try:
+        os.makedirs(os.path.dirname(dst), exist_ok=True)
+        # copy2 follows symlinks by default
+        shutil.copy2(src, dst)
+        return os.path.getsize(dst)
+    except Exception as e:
+        return str(e)
+
+def cache_to_ram(model_id, shm_base, workers=64):
+    """Parallel copy from HF cache to the specified RAM directory."""
+    cache_home = get_hf_cache_home()
+    src_path = resolve_model_path(model_id, cache_home)
+
+    safe_name = model_id.replace("/", "_")
+    dst_path = Path(shm_base) / safe_name
+
+    # Check available space in shm
+    shm_stats = shutil.disk_usage(shm_base)
+    print(f"📦 Source: {src_path}", file=sys.stderr)
+    print(f"🚀 Target RAM: {dst_path} (Available: {shm_stats.free/(1024**3):.1f} GB)", file=sys.stderr)
+
+    files_to_copy = []
+    for root, _, files in os.walk(src_path):
+        for file in files:
+            full_src = Path(root) / file
+            rel_path = full_src.relative_to(src_path)
+            files_to_copy.append((full_src, dst_path / rel_path))
+
+    total_bytes = 0
+    start = time.time()
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        futures = {pool.submit(copy_worker, s, d): s for s, d in files_to_copy}
+        for i, future in enumerate(as_completed(futures)):
+            res = future.result()
+            if isinstance(res, int):
+                total_bytes += res
+            if i % 100 == 0 or i == len(files_to_copy) - 1:
+                print(f" Progress: {i+1}/{len(files_to_copy)} files...", end="\r", file=sys.stderr)
+
+    elapsed = time.time() - start
+    print(f"\n✅ Copied {total_bytes/(1024**3):.2f} GB in {elapsed:.2f}s", file=sys.stderr)
+    return dst_path
+
+def main():
+    parser = argparse.ArgumentParser(description="vLLM RAM-cached loader", add_help=False)
+    parser.add_argument("--model", type=str, required=True, help="HuggingFace Model ID")
+    parser.add_argument("--shm-dir", type=str, default="/dev/shm", help="RAM disk mount point")
+    parser.add_argument("--cache-workers", type=int, default=64, help="Threads for copying")
+    parser.add_argument("--keep-cache", action="store_true", help="Do not delete files from RAM on exit")
+
+    # Capture wrapper args vs vLLM args
+    args, vllm_args = parser.parse_known_args()
+
+    ram_path = None
+    try:
+        # 1. Sync weights to RAM disk
+        ram_path = cache_to_ram(args.model, args.shm_dir, args.cache_workers)
+
+        # 2. Prepare vLLM Command
+        # Point vLLM to the RAM files, but keep the original model ID for the API
+        cmd = [
+            "vllm", "serve", str(ram_path),
+            "--served-model-name", args.model
+        ] + vllm_args
+
+        print(f"\n🔥 Launching vLLM...")
+        print(f" Command: {' '.join(cmd)}\n", file=sys.stderr)
+
+        # 3. Run vLLM and wait
+        subprocess.run(cmd, check=True)
+
+    except KeyboardInterrupt:
+        print("\n👋 Process interrupted by user.", file=sys.stderr)
+    except subprocess.CalledProcessError as e:
+        print(f"\n❌ vLLM exited with error: {e}", file=sys.stderr)
+    except Exception as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+    finally:
+        # 4. Cleanup RAM Disk
+        if ram_path and ram_path.exists() and not args.keep_cache:
+            print(f"🧹 Cleaning up RAM cache: {ram_path}", file=sys.stderr)
+            try:
+                shutil.rmtree(ram_path)
+                print("✨ RAM disk cleared.", file=sys.stderr)
+            except Exception as e:
+                print(f"⚠️ Failed to clean {ram_path}: {e}", file=sys.stderr)
+
+if __name__ == "__main__":
+    main()
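
The new script is exposed as the fast-vllm console script (see the entry_points.txt diff below): it copies the model's Hugging Face snapshot into the RAM disk, launches vllm serve against the RAM copy while keeping the original model ID via --served-model-name, and removes the copy on exit unless --keep-cache is passed. Flags the wrapper does not recognize are forwarded to vllm serve, so an illustrative invocation (model ID is a placeholder) looks like: fast-vllm --model <hf-model-id> --shm-dir /dev/shm followed by the usual vLLM serve flags.
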
speedy_utils-1.1.35.dist-info/METADATA → speedy_utils-1.1.38.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: speedy-utils
-Version: 1.1.35
+Version: 1.1.38
 Summary: Fast and easy-to-use package for data science
 Project-URL: Homepage, https://github.com/anhvth/speedy
 Project-URL: Repository, https://github.com/anhvth/speedy
speedy_utils-1.1.35.dist-info/RECORD → speedy_utils-1.1.38.dist-info/RECORD CHANGED
@@ -6,10 +6,10 @@ llm_utils/chat_format/transform.py,sha256=PJ2g9KT1GSbWuAs7giEbTpTAffpU9QsIXyRlbf
 llm_utils/chat_format/utils.py,sha256=M2EctZ6NeHXqFYufh26Y3CpSphN0bdZm5xoNaEJj5vg,1251
 llm_utils/lm/__init__.py,sha256=4jYMy3wPH3tg-tHFyWEWOqrnmX4Tu32VZCdzRGMGQsI,778
 llm_utils/lm/base_prompt_builder.py,sha256=_TzYMsWr-SsbA_JNXptUVN56lV5RfgWWTrFi-E8LMy4,12337
-llm_utils/lm/llm.py,sha256=yas7Khd0Djc8-GD8jL--B2oPteV9FC3PpfPbr9XCLOQ,16515
+llm_utils/lm/llm.py,sha256=2vq8BScwp4gWb89EmUPaiBCzkBSr0x2B3qJLaPM11_M,19644
 llm_utils/lm/llm_signature.py,sha256=vV8uZgLLd6ZKqWbq0OPywWvXAfl7hrJQnbtBF-VnZRU,1244
 llm_utils/lm/lm_base.py,sha256=Bk3q34KrcCK_bC4Ryxbc3KqkiPL39zuVZaBQ1i6wJqs,9437
-llm_utils/lm/mixins.py,sha256=o0tZiaKW4u1BxBVlT_0yTwnO8h7KnY02HX5TuWipvr0,16735
+llm_utils/lm/mixins.py,sha256=Nz7CwJFBOvbZNbODUlJC04Pcbac3zWnT8vy7sZG_MVI,24906
 llm_utils/lm/openai_memoize.py,sha256=rYrSFPpgO7adsjK1lVdkJlhqqIw_13TCW7zU8eNwm3o,5185
 llm_utils/lm/signature.py,sha256=K1hvCAqoC5CmsQ0Y_ywnYy2fRb5JzmIK8OS-hjH-5To,9971
 llm_utils/lm/utils.py,sha256=dEKFta8S6Mm4LjIctcpFlEGL9RnmLm5DHd2TA70UWuA,12649
@@ -20,6 +20,7 @@ llm_utils/lm/async_lm/async_lm.py,sha256=W8n_S5PKJln9bzO9T525-tqo5DFwBZNXDucz_v-
 llm_utils/lm/async_lm/async_lm_base.py,sha256=ga5nCzows5Ye3yop41zsUxNYxcj_Vpf02DsfJ1eoE9U,8358
 llm_utils/lm/async_lm/lm_specific.py,sha256=PxP54ltrh9NrLJx7BPib52oYo_aCvDOjf7KzMjp1MYg,3929
 llm_utils/scripts/README.md,sha256=yuOLnLa2od2jp4wVy3rV0rESeiV3o8zol5MNMsZx0DY,999
+llm_utils/scripts/fast_vllm.py,sha256=00UWajLOfTorSMgmgxUOpssdg55oOHneNUY0lhVuRGQ,5128
 llm_utils/scripts/vllm_load_balancer.py,sha256=eQlH07573EDWIBkwc9ef1WvI59anLr4hQqLfZvQk7xk,37133
 llm_utils/scripts/vllm_serve.py,sha256=tPcRB_MbJ01LcqC83RHQ7W9XDS7b1Ntc0fCRdegsNXU,14761
 llm_utils/vector_cache/__init__.py,sha256=oZXpjgBuutI-Pd_pBNYAQMY7-K2C6xv8Qt6a3p88GBQ,879
@@ -50,7 +51,7 @@ vision_utils/README.md,sha256=AIDZZj8jo_QNrEjFyHwd00iOO431s-js-M2dLtVTn3I,5740
 vision_utils/__init__.py,sha256=hF54sT6FAxby8kDVhOvruy4yot8O-Ateey5n96O1pQM,284
 vision_utils/io_utils.py,sha256=pI0Va6miesBysJcllK6NXCay8HpGZsaMWwlsKB2DMgA,26510
 vision_utils/plot.py,sha256=HkNj3osA3moPuupP1VguXfPPOW614dZO5tvC-EFKpKM,12028
-speedy_utils-1.1.35.dist-info/METADATA,sha256=wsz89syaYNXEeGjJXV8zb0W2ZrTjpN2Lj47tE7LQeEI,8048
-speedy_utils-1.1.35.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-speedy_utils-1.1.35.dist-info/entry_points.txt,sha256=1rrFMfqvaMUE9hvwGiD6vnVh98kmgy0TARBj-v0Lfhs,244
-speedy_utils-1.1.35.dist-info/RECORD,,
+speedy_utils-1.1.38.dist-info/METADATA,sha256=8WgY6bVeosqELf3KSmLIrygeQcYe1uQag4BFwvLfSWM,8048
+speedy_utils-1.1.38.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+speedy_utils-1.1.38.dist-info/entry_points.txt,sha256=rwn89AYfBUh9SRJtFbpp-u2JIKiqmZ2sczvqyO6s9cI,289
+speedy_utils-1.1.38.dist-info/RECORD,,
speedy_utils-1.1.35.dist-info/entry_points.txt → speedy_utils-1.1.38.dist-info/entry_points.txt CHANGED
@@ -1,4 +1,5 @@
 [console_scripts]
+fast-vllm = llm_utils.scripts.fast_vllm:main
 mpython = speedy_utils.scripts.mpython:main
 openapi_client_codegen = speedy_utils.scripts.openapi_client_codegen:main
 svllm = llm_utils.scripts.vllm_serve:main