llama-benchy 0.1.2__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_benchy-0.2.1/.github/workflows/tests.yml +28 -0
- {llama_benchy-0.1.2 → llama_benchy-0.2.1}/.gitignore +2 -2
- {llama_benchy-0.1.2 → llama_benchy-0.2.1}/PKG-INFO +43 -3
- {llama_benchy-0.1.2 → llama_benchy-0.2.1}/README.md +36 -2
- {llama_benchy-0.1.2 → llama_benchy-0.2.1}/pyproject.toml +9 -0
- llama_benchy-0.2.1/src/llama_benchy/__main__.py +45 -0
- {llama_benchy-0.1.2 → llama_benchy-0.2.1}/src/llama_benchy/_version.py +2 -2
- llama_benchy-0.2.1/src/llama_benchy/client.py +201 -0
- llama_benchy-0.2.1/src/llama_benchy/config.py +76 -0
- llama_benchy-0.2.1/src/llama_benchy/corpus.py +62 -0
- llama_benchy-0.2.1/src/llama_benchy/prompts.py +54 -0
- llama_benchy-0.2.1/src/llama_benchy/results.py +435 -0
- llama_benchy-0.2.1/src/llama_benchy/runner.py +155 -0
- llama_benchy-0.2.1/tests/__init__.py +0 -0
- llama_benchy-0.2.1/tests/mock_server.py +262 -0
- llama_benchy-0.2.1/tests/test_mock_integration.py +147 -0
- llama_benchy-0.1.2/src/llama_benchy/__main__.py +0 -553
- {llama_benchy-0.1.2 → llama_benchy-0.2.1}/LICENSE +0 -0
- {llama_benchy-0.1.2 → llama_benchy-0.2.1}/src/llama_benchy/__init__.py +0 -0

--- /dev/null
+++ llama_benchy-0.2.1/.github/workflows/tests.yml
@@ -0,0 +1,28 @@
+name: Python Tests
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e ".[dev]"
+    - name: Test with pytest
+      run: |
+        pytest

--- llama_benchy-0.1.2/PKG-INFO
+++ llama_benchy-0.2.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.1.2
+Version: 0.2.1
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Author: eugr
 License: MIT License
@@ -41,6 +41,12 @@ Requires-Dist: openai
 Requires-Dist: requests
 Requires-Dist: tabulate
 Requires-Dist: transformers
+Provides-Extra: dev
+Requires-Dist: fastapi; extra == 'dev'
+Requires-Dist: pydantic; extra == 'dev'
+Requires-Dist: pytest; extra == 'dev'
+Requires-Dist: pytest-asyncio; extra == 'dev'
+Requires-Dist: uvicorn; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # llama-benchy - llama-bench style benchmarking tool for all backends
@@ -73,12 +79,12 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
 - Downloads a book from Project Gutenberg to use as source text for prompts to ensure better benchmarking of spec.decoding/MTP models.
 - Supports executing a command after each run (e.g., to clear cache).
 - Configurable latency measurement mode.
+- Supports concurrent requests (`--concurrency`) to measure throughput under load.
+- Can save results to file in Markdown, JSON, or CSV format.
 
 # Current Limitations
 
 - Evaluates against `/v1/chat/completions` endpoint only.
-- Doesn't measure throughput in concurrency mode (coming later).
-- Outputs results as a Markdown table only for now.
 
 ## Installation
 
@@ -213,6 +219,9 @@ Generally you don't need to disable prompt caching on the server, as a probabili
 - `--adapt-prompt`: Adapt prompt size based on warmup token usage delta (Default: True).
 - `--no-adapt-prompt`: Disable prompt size adaptation.
 - `--enable-prefix-caching`: Enable prefix caching performance measurement. When enabled (and depth > 0), it performs a two-step benchmark: first loading the context (reported as `ctx_pp`), then running the prompt with the cached context.
+- `--concurrency`: List of concurrency levels (number of concurrent requests per test) (Default: [1]).
+- `--save-result`: File to save results to.
+- `--format`: Output format: 'md', 'json', 'csv' (Default: 'md').
 
 ### Metrics
 
@@ -228,6 +237,9 @@ The script attempts to estimate network or processing latency to provide "server
 
 #### Table Columns
 
+- When `concurrency` > 1:
+  - **`t/s (total)`**: Total throughput across all concurrent requests.
+  - **`t/s (req)`**: Average throughput per individual request.
 - **`t/s` (Tokens per Second)**:
   - **For Prompt Processing (pp)**: Calculated as `Total Prompt Tokens / est_ppt`. This represents the prefill speed.
   - **For Token Generation (tg)**: Calculated as `(Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)`. This represents the decode speed, excluding the first token latency.
@@ -267,3 +279,31 @@ llama-benchy \
 ```
 
 This will run benchmarks for all combinations of pp (128, 256), tg (32, 64), and depth (0, 1024).
+
+## Development
+
+### Running Integration Tests
+
+This repository includes a mock server and an integration test suite to verify `llama-benchy` logic without needing a real GPU server.
+
+The mock server emulates:
+- **Prompt Processing (PP):** ~1000 t/s drift-corrected.
+- **Token Generation (TG):** ~50 t/s.
+- **Prefix Caching:** Emulates cache hits by skipping processing time for cached prefixes (system messages).
+- **OpenAI API Compatibility**: Serves `/v1/chat/completions` and `/v1/models`.
+
+To run the integration tests:
+
+```bash
+# Install development dependencies
+uv sync --all-extras --dev
+
+# Run tests
+uv run pytest tests/test_mock_integration.py
+```
+
+This test will:
+1. Spin up the mock server on port 8001.
+2. Run `llama-benchy` against it.
+3. Parse the JSON output.
+4. Verify that throughputs match the emulated speeds (PP ~1000, TG ~50) and that caching effectively increases effective throughput.

--- llama_benchy-0.1.2/README.md
+++ llama_benchy-0.2.1/README.md
@@ -28,12 +28,12 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
 - Downloads a book from Project Gutenberg to use as source text for prompts to ensure better benchmarking of spec.decoding/MTP models.
 - Supports executing a command after each run (e.g., to clear cache).
 - Configurable latency measurement mode.
+- Supports concurrent requests (`--concurrency`) to measure throughput under load.
+- Can save results to file in Markdown, JSON, or CSV format.
 
 # Current Limitations
 
 - Evaluates against `/v1/chat/completions` endpoint only.
-- Doesn't measure throughput in concurrency mode (coming later).
-- Outputs results as a Markdown table only for now.
 
 ## Installation
 
@@ -168,6 +168,9 @@ Generally you don't need to disable prompt caching on the server, as a probabili
 - `--adapt-prompt`: Adapt prompt size based on warmup token usage delta (Default: True).
 - `--no-adapt-prompt`: Disable prompt size adaptation.
 - `--enable-prefix-caching`: Enable prefix caching performance measurement. When enabled (and depth > 0), it performs a two-step benchmark: first loading the context (reported as `ctx_pp`), then running the prompt with the cached context.
+- `--concurrency`: List of concurrency levels (number of concurrent requests per test) (Default: [1]).
+- `--save-result`: File to save results to.
+- `--format`: Output format: 'md', 'json', 'csv' (Default: 'md').
 
 ### Metrics
 
@@ -183,6 +186,9 @@ The script attempts to estimate network or processing latency to provide "server
 
 #### Table Columns
 
+- When `concurrency` > 1:
+  - **`t/s (total)`**: Total throughput across all concurrent requests.
+  - **`t/s (req)`**: Average throughput per individual request.
 - **`t/s` (Tokens per Second)**:
   - **For Prompt Processing (pp)**: Calculated as `Total Prompt Tokens / est_ppt`. This represents the prefill speed.
   - **For Token Generation (tg)**: Calculated as `(Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)`. This represents the decode speed, excluding the first token latency.
@@ -222,3 +228,31 @@ llama-benchy \
 ```
 
 This will run benchmarks for all combinations of pp (128, 256), tg (32, 64), and depth (0, 1024).
+
+## Development
+
+### Running Integration Tests
+
+This repository includes a mock server and an integration test suite to verify `llama-benchy` logic without needing a real GPU server.
+
+The mock server emulates:
+- **Prompt Processing (PP):** ~1000 t/s drift-corrected.
+- **Token Generation (TG):** ~50 t/s.
+- **Prefix Caching:** Emulates cache hits by skipping processing time for cached prefixes (system messages).
+- **OpenAI API Compatibility**: Serves `/v1/chat/completions` and `/v1/models`.
+
+To run the integration tests:
+
+```bash
+# Install development dependencies
+uv sync --all-extras --dev
+
+# Run tests
+uv run pytest tests/test_mock_integration.py
+```
+
+This test will:
+1. Spin up the mock server on port 8001.
+2. Run `llama-benchy` against it.
+3. Parse the JSON output.
+4. Verify that throughputs match the emulated speeds (PP ~1000, TG ~50) and that caching effectively increases effective throughput.
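
The new `t/s (total)` / `t/s (req)` columns and the `t/s` formulas described in the README hunks above reduce to simple arithmetic. A minimal reader-side sketch with hypothetical timestamps and one plausible reading of "average throughput per individual request"; the package's own aggregation lives in `results.py`/`runner.py`, which are not shown in this section:

```python
# Reader-side sketch of the metric definitions quoted above; not the tool's code.

# Token generation (tg) speed: (Total Generated Tokens - 1) / (t_last - t_first).
token_timestamps = [10.00, 10.02, 10.04, 10.06, 10.08]  # hypothetical perf_counter() values
tg_tps = (len(token_timestamps) - 1) / (token_timestamps[-1] - token_timestamps[0])
print(f"tg t/s: {tg_tps:.1f}")  # 4 tokens over 0.08 s = 50.0

# With --concurrency 4: "t/s (total)" counts all tokens generated across the
# concurrent requests over the shared wall-clock window, while "t/s (req)" is
# the average rate attributed to a single request.
tokens_per_request, n_requests, window_s = 64, 4, 2.0
total_tps = tokens_per_request * n_requests / window_s  # 128.0
per_request_tps = total_tps / n_requests                # 32.0
print(f"t/s (total): {total_tps}, t/s (req): {per_request_tps}")
```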

--- /dev/null
+++ llama_benchy-0.2.1/src/llama_benchy/__main__.py
@@ -0,0 +1,45 @@
+"""
+Main entry point for the llama-benchy CLI.
+"""
+
+import asyncio
+import datetime
+from . import __version__
+from .config import BenchmarkConfig
+from .corpus import TokenizedCorpus
+from .prompts import PromptGenerator
+from .client import LLMClient
+from .runner import BenchmarkRunner
+
+async def main_async():
+    # 1. Parse Configuration
+    config = BenchmarkConfig.from_args()
+
+    # 2. Print Header
+    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f"llama-benchy ({__version__})")
+    print(f"Date: {current_time}")
+    print(f"Benchmarking model: {config.model} at {config.base_url}")
+    print(f"Concurrency levels: {config.concurrency_levels}")
+
+    # 3. Prepare Data
+    corpus = TokenizedCorpus(config.book_url, config.tokenizer, config.model)
+    print(f"Total tokens available in text corpus: {len(corpus)}")
+
+    # 4. Initialize Components
+    prompt_gen = PromptGenerator(corpus)
+    client = LLMClient(config.base_url, config.api_key, config.served_model_name)
+    runner = BenchmarkRunner(config, client, prompt_gen)
+
+    # 5. Run Benchmark Suite
+    await runner.run_suite()
+
+    print(f"\nllama-benchy ({__version__})")
+    print(f"date: {current_time} | latency mode: {config.latency_mode}")
+
+def main():
+    """Entry point for the CLI command."""
+    asyncio.run(main_async())
+
+if __name__ == "__main__":
+    main()
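
With the new `__main__.py`, the tool can be invoked either through the `llama-benchy` console script used in the README examples or via `python -m llama_benchy`. A smoke-test sketch, assuming the package is installed; the commented full run assumes a hypothetical local endpoint and model name:

```python
# Smoke-check the module entry point. "--version" is handled by argparse in
# config.py, so no server is needed for this call.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "llama_benchy", "--version"], check=True)

# A full run would look like this (endpoint and model are hypothetical):
# subprocess.run([sys.executable, "-m", "llama_benchy",
#                 "--base-url", "http://localhost:8000/v1",
#                 "--model", "my-org/my-model",
#                 "--pp", "128", "--tg", "32", "--concurrency", "1", "4"],
#                check=True)
```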

--- llama_benchy-0.1.2/src/llama_benchy/_version.py
+++ llama_benchy-0.2.1/src/llama_benchy/_version.py
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.2'
-__version_tuple__ = version_tuple = (0, 1, 2)
+__version__ = version = '0.2.1'
+__version_tuple__ = version_tuple = (0, 2, 1)
 
 __commit_id__ = commit_id = None

--- /dev/null
+++ llama_benchy-0.2.1/src/llama_benchy/client.py
@@ -0,0 +1,201 @@
+import time
+import json
+import codecs
+import aiohttp
+import asyncio
+import numpy as np
+from dataclasses import dataclass, field
+from typing import Optional, List, Dict, Any
+
+@dataclass
+class RequestResult:
+    start_ts: float = 0.0
+    end_ts: float = 0.0
+    first_token_ts: Optional[float] = None
+    first_response_ts: Optional[float] = None
+    prompt_tokens: int = 0
+    total_tokens: int = 0
+    error: Optional[str] = None
+    token_timestamps: List[float] = field(default_factory=list)
+
+class LLMClient:
+    def __init__(self, base_url: str, api_key: str, model_name: str):
+        self.base_url = base_url
+        self.api_key = api_key
+        self.model_name = model_name
+        self.headers = {"Authorization": f"Bearer {api_key}"}
+
+    async def measure_latency(self, session: aiohttp.ClientSession, mode: str = "api") -> float:
+        if mode == "none":
+            print("Skipping latency measurement (assuming 0 ms).")
+            return 0
+
+        print(f"Measuring latency using mode: {mode}...")
+        latencies = []
+
+        for _ in range(3):
+            start = time.perf_counter()
+            try:
+                if mode == "api":
+                    async with session.get(f"{self.base_url}/models", headers=self.headers) as response:
+                        await response.read()
+                        latencies.append(time.perf_counter() - start)
+                elif mode == "generation":
+                    payload = {
+                        "model": self.model_name,
+                        "messages": [{"role": "user", "content": "hello"}],
+                        "max_tokens": 1,
+                        "stream": True
+                    }
+                    async with session.post(f"{self.base_url}/chat/completions", json=payload, headers=self.headers) as response:
+                        async for _ in response.content:
+                            latencies.append(time.perf_counter() - start)
+                            break
+                        async for _ in response.content: pass
+            except Exception as e:
+                print(f"Error measuring latency: {e}")
+
+        if latencies:
+            avg_latency = np.mean(latencies)
+            print(f"Average latency ({mode}): {avg_latency*1000:.2f} ms")
+            return avg_latency
+        return 0
+
+    async def warmup(self, session: aiohttp.ClientSession, tokenizer=None):
+        print("Warming up...")
+        warmup_text = "Warmup " * 10
+
+        delta_user = 0
+        delta_context = 0
+
+        # 1. User only
+        payload_user = {
+            "model": self.model_name,
+            "messages": [{"role": "user", "content": warmup_text}],
+            "max_tokens": 1
+        }
+
+        try:
+            async with session.post(f"{self.base_url}/chat/completions", json=payload_user, headers=self.headers) as response:
+                response_json = await response.json()
+                if tokenizer:
+                    if 'usage' in response_json:
+                        prompt_tokens = response_json['usage']['prompt_tokens']
+                        local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
+                        delta_user = prompt_tokens - local_tokens
+                        print(f"Warmup (User only) complete. Delta: {delta_user} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
+                    else:
+                        print("Warmup (User only) complete (no usage stats found).")
+                else:
+                    print("Warmup complete.")
+
+            if tokenizer:
+                # 2. Context Only
+                payload_sys_empty = {
+                    "model": self.model_name,
+                    "messages": [
+                        {"role": "system", "content": warmup_text},
+                        {"role": "user", "content": ""}
+                    ],
+                    "max_tokens": 1
+                }
+                async with session.post(f"{self.base_url}/chat/completions", json=payload_sys_empty, headers=self.headers) as response:
+                    response_json = await response.json()
+                    if 'usage' in response_json:
+                        prompt_tokens = response_json['usage']['prompt_tokens']
+                        local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
+                        delta_context = prompt_tokens - local_tokens
+                        print(f"Warmup (System+Empty) complete. Delta: {delta_context} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
+                    else:
+                        delta_context = delta_user
+        except Exception as e:
+            print(f"Warmup failed: {e}")
+        return delta_user, delta_context
+
+    async def run_generation(
+        self,
+        session: aiohttp.ClientSession,
+        context_text: str,
+        prompt_text: str,
+        max_tokens: int,
+        no_cache: bool
+    ) -> RequestResult:
+
+        messages = []
+        if context_text:
+            messages.append({"role": "system", "content": context_text})
+        messages.append({"role": "user", "content": prompt_text})
+
+        result = RequestResult()
+
+        try:
+            payload = {
+                "model": self.model_name,
+                "messages": messages,
+                "max_tokens": max_tokens,
+                "stream": True,
+                "stream_options": {"include_usage": True},
+            }
+
+            if no_cache:
+                payload["cache_prompt"] = False
+
+            result.start_ts = time.perf_counter()
+
+            async with session.post(f"{self.base_url}/chat/completions", json=payload, headers=self.headers) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    result.error = f"HTTP {response.status}: {error_text}"
+                    print(result.error)
+                    return result
+
+                decoder = codecs.getincrementaldecoder("utf-8")(errors='replace')
+                buffer = ""
+
+                async for chunk_bytes in response.content:
+                    chunk_time = time.perf_counter()
+                    decoded_chunk = decoder.decode(chunk_bytes, final=False)
+                    buffer += decoded_chunk
+
+                    while "\n" in buffer:
+                        line, buffer = buffer.split("\n", 1)
+                        line = line.strip()
+                        if not line:
+                            continue
+
+                        if line == 'data: [DONE]' or line == 'data:[DONE]':
+                            continue
+
+                        if line.startswith('data:'):
+                            try:
+                                json_str = line[5:].strip()
+                                chunk = json.loads(json_str)
+
+                                if 'usage' in chunk:
+                                    result.prompt_tokens = chunk['usage'].get('prompt_tokens', 0)
+
+                                if 'choices' in chunk and len(chunk['choices']) > 0:
+                                    if result.first_response_ts is None:
+                                        result.first_response_ts = chunk_time
+
+                                    delta = chunk['choices'][0].get('delta', {})
+                                    content = delta.get('content')
+                                    reasoning_content = delta.get('reasoning_content')
+                                    reasoning = delta.get('reasoning')
+
+                                    if content or reasoning_content or reasoning:
+                                        if result.first_token_ts is None:
+                                            result.first_token_ts = chunk_time
+
+                                        result.total_tokens += 1
+                                        result.token_timestamps.append(chunk_time)
+                            except json.JSONDecodeError:
+                                continue
+
+            result.end_ts = time.perf_counter()
+
+        except Exception as e:
+            print(f"Error during run: {e}")
+            result.error = str(e)
+
+        return result
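
A usage sketch for the new `LLMClient`/`RequestResult` API added above, assuming a hypothetical local endpoint and model name; the real per-run driving and aggregation happen in `runner.py`/`results.py`, which are not included in this section:

```python
# Single streamed request against an OpenAI-compatible endpoint, deriving TTFT
# and decode speed from the RequestResult fields defined in client.py above.
import asyncio
import aiohttp
from llama_benchy.client import LLMClient

async def one_request():
    client = LLMClient("http://localhost:8000/v1", "EMPTY", "my-model")  # hypothetical endpoint/model
    async with aiohttp.ClientSession() as session:
        latency = await client.measure_latency(session, mode="api")
        result = await client.run_generation(
            session,
            context_text="",            # no preloaded context
            prompt_text="Write a haiku about benchmarks.",
            max_tokens=64,
            no_cache=True,              # also sends cache_prompt=false
        )
        if result.error is None and result.total_tokens > 1:
            ttft = result.first_token_ts - result.start_ts - latency
            tg_tps = (result.total_tokens - 1) / (
                result.token_timestamps[-1] - result.token_timestamps[0]
            )
            print(f"TTFT ~ {ttft * 1000:.1f} ms, decode ~ {tg_tps:.1f} t/s")

asyncio.run(one_request())
```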

--- /dev/null
+++ llama_benchy-0.2.1/src/llama_benchy/config.py
@@ -0,0 +1,76 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+import argparse
+import os
+from ._version import __version__
+
+@dataclass
+class BenchmarkConfig:
+    base_url: str
+    api_key: str
+    model: str
+    served_model_name: str
+    tokenizer: Optional[str]
+    pp_counts: List[int]
+    tg_counts: List[int]
+    depths: List[int]
+    num_runs: int
+    no_cache: bool
+    latency_mode: str
+    no_warmup: bool
+    adapt_prompt: bool
+    enable_prefix_caching: bool
+    book_url: str
+    post_run_cmd: Optional[str]
+    concurrency_levels: List[int]
+    save_result: Optional[str] = None
+    result_format: str = "md"
+
+    @classmethod
+    def from_args(cls):
+        parser = argparse.ArgumentParser(description="LLM Benchmark Script")
+        parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
+        parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
+        parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
+        parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
+        parser.add_argument("--served-model-name", type=str, default=None, help="Model name used in API calls (defaults to --model if not specified)")
+        parser.add_argument("--tokenizer", type=str, default=None, help="HuggingFace tokenizer name (defaults to model name)")
+        parser.add_argument("--pp", type=int, nargs='+', required=False, default=[2048], help="List of prompt processing token counts - default: 2048")
+        parser.add_argument("--tg", type=int, nargs='+', required=False, default=[32], help="List of token generation counts - default: 32")
+        parser.add_argument("--depth", type=int, nargs='+', default=[0], help="List of context depths (previous conversation tokens) - default: 0")
+        parser.add_argument("--runs", type=int, default=3, help="Number of runs per test - default: 3")
+        parser.add_argument("--no-cache", action="store_true", help="Ensure unique requests to avoid prefix caching and send cache_prompt=false to the server")
+        parser.add_argument("--post-run-cmd", type=str, default=None, help="Command to execute after each test run")
+        parser.add_argument("--book-url", type=str, default="https://www.gutenberg.org/files/1661/1661-0.txt", help="URL of a book to use for text generation, defaults to Sherlock Holmes")
+        parser.add_argument("--latency-mode", type=str, default="api", choices=["api", "generation", "none"], help="Method to measure latency: 'api' (list models) - default, 'generation' (single token generation), or 'none' (skip latency measurement)")
+        parser.add_argument("--no-warmup", action="store_true", help="Skip warmup phase")
+        parser.add_argument("--adapt-prompt", action="store_true", default=True, help="Adapt prompt size based on warmup token usage delta (default: True)")
+        parser.add_argument("--no-adapt-prompt", action="store_false", dest="adapt_prompt", help="Disable prompt size adaptation")
+        parser.add_argument("--enable-prefix-caching", action="store_true", help="Enable prefix caching performance measurement")
+        parser.add_argument("--concurrency", type=int, nargs='+', default=[1], help="List of concurrency levels (number of concurrent requests per test) - default: [1]")
+        parser.add_argument("--save-result", type=str, help="File to save results to")
+        parser.add_argument("--format", type=str, default="md", choices=["md", "json", "csv"], help="Output format")
+
+        args = parser.parse_args()
+
+        return cls(
+            base_url=args.base_url,
+            api_key=args.api_key,
+            model=args.model,
+            served_model_name=args.served_model_name if args.served_model_name else args.model,
+            tokenizer=args.tokenizer,
+            pp_counts=args.pp,
+            tg_counts=args.tg,
+            depths=args.depth,
+            num_runs=args.runs,
+            no_cache=args.no_cache,
+            latency_mode=args.latency_mode,
+            no_warmup=args.no_warmup,
+            adapt_prompt=args.adapt_prompt,
+            enable_prefix_caching=args.enable_prefix_caching,
+            book_url=args.book_url,
+            post_run_cmd=args.post_run_cmd,
+            concurrency_levels=args.concurrency,
+            save_result=args.save_result,
+            result_format=args.format
+        )
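
Since `BenchmarkConfig.from_args()` reads `sys.argv` through argparse, the mapping from the CLI flags defined above to config fields can be sketched directly; the argv values below are hypothetical:

```python
# Sketch of how from_args() turns CLI flags into BenchmarkConfig fields.
import sys
from llama_benchy.config import BenchmarkConfig

sys.argv = [
    "llama-benchy",
    "--base-url", "http://localhost:8000/v1",   # hypothetical endpoint
    "--model", "my-org/my-model",               # hypothetical model name
    "--pp", "128", "256",
    "--tg", "32", "64",
    "--depth", "0", "1024",
    "--concurrency", "1", "4",
    "--save-result", "results.json",
    "--format", "json",
]
config = BenchmarkConfig.from_args()            # parses sys.argv via argparse
assert config.pp_counts == [128, 256]
assert config.concurrency_levels == [1, 4]
assert config.result_format == "json"
assert config.served_model_name == config.model  # defaults to --model
```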

--- /dev/null
+++ llama_benchy-0.2.1/src/llama_benchy/corpus.py
@@ -0,0 +1,62 @@
+import os
+import hashlib
+import requests
+from transformers import AutoTokenizer
+
+class TokenizedCorpus:
+    def __init__(self, book_url: str, tokenizer_name: str, model_name: str):
+        self.book_url = book_url
+        self.tokenizer = self._get_tokenizer(model_name, tokenizer_name)
+        self.tokens = self._load_data()
+
+    def _get_tokenizer(self, model_name, tokenizer_name=None):
+        try:
+            name = tokenizer_name if tokenizer_name else model_name
+            return AutoTokenizer.from_pretrained(name)
+        except Exception as e:
+            print(f"Error loading tokenizer: {e}")
+            print("Falling back to 'gpt2' tokenizer as approximation.")
+            return AutoTokenizer.from_pretrained("gpt2")
+
+    def _load_data(self):
+        try:
+            # Create cache directory if it doesn't exist
+            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "llama-benchy")
+            os.makedirs(cache_dir, exist_ok=True)
+
+            # Generate hash of the URL for the filename
+            url_hash = hashlib.md5(self.book_url.encode()).hexdigest()
+            cache_file = os.path.join(cache_dir, f"{url_hash}.txt")
+
+            if os.path.exists(cache_file):
+                print(f"Loading text from cache: {cache_file}")
+                with open(cache_file, "r", encoding="utf-8") as f:
+                    text = f.read()
+            else:
+                print(f"Downloading book from {self.book_url}...")
+                response = requests.get(self.book_url)
+                response.raise_for_status()
+                text = response.text
+                # Basic cleanup
+                start_idx = text.find("*** START OF THE PROJECT GUTENBERG EBOOK")
+                if start_idx != -1:
+                    text = text[start_idx:]
+
+                # Save to cache
+                with open(cache_file, "w", encoding="utf-8") as f:
+                    f.write(text)
+                print(f"Saved text to cache: {cache_file}")
+
+            return self.tokenizer.encode(text, add_special_tokens=False)
+        except Exception as e:
+            print(f"Error downloading or processing book: {e}")
+            exit(1)
+
+    def get_tokenizer(self):
+        return self.tokenizer
+
+    def get_tokens(self):
+        return self.tokens
+
+    def __len__(self):
+        return len(self.tokens)
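
A usage sketch for `TokenizedCorpus` as defined above: the first call downloads the book and caches it under `~/.cache/llama-benchy`; later calls read the cached file. `gpt2` is used here only so the example points at a public tokenizer:

```python
from llama_benchy.corpus import TokenizedCorpus

corpus = TokenizedCorpus(
    book_url="https://www.gutenberg.org/files/1661/1661-0.txt",  # the default Sherlock Holmes text
    tokenizer_name="gpt2",
    model_name="gpt2",
)
print(len(corpus))                  # total tokens available for prompt sampling
tokenizer = corpus.get_tokenizer()  # shared with PromptGenerator
```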

--- /dev/null
+++ llama_benchy-0.2.1/src/llama_benchy/prompts.py
@@ -0,0 +1,54 @@
+import uuid
+import numpy as np
+from typing import Tuple, List
+
+from .corpus import TokenizedCorpus
+
+class PromptGenerator:
+    def __init__(self, corpus: TokenizedCorpus):
+        self.corpus = corpus
+        self.tokenizer = corpus.get_tokenizer()
+        self.all_tokens = corpus.get_tokens()
+
+    def generate(self, prompt_tokens: int, context_tokens: int = 0, no_cache: bool = False) -> Tuple[str, str]:
+        """
+        Generates a single (context, prompt) pair.
+        """
+        suffix = ""
+        suffix_len = 0
+        if no_cache:
+            suffix = f" {uuid.uuid4()}"
+            suffix_len = len(self.tokenizer.encode(suffix, add_special_tokens=False))
+
+        # Adjust prompt tokens to fetch from text
+        text_prompt_tokens = max(0, prompt_tokens - suffix_len)
+
+        # Create a pool of tokens large enough
+        total_needed = text_prompt_tokens + context_tokens
+
+        # Create a local reference to tokens to potentially extend
+        current_tokens = self.all_tokens
+
+        if len(current_tokens) < total_needed:
+            # Repeat tokens if not enough
+            current_tokens = current_tokens * (total_needed // len(current_tokens) + 2)
+
+        # Pick a random start position
+        max_start = len(current_tokens) - total_needed
+        start_idx = np.random.randint(0, max_start)
+
+        selected_tokens = current_tokens[start_idx : start_idx + total_needed]
+
+        context_text = self.tokenizer.decode(selected_tokens[:context_tokens]) if context_tokens > 0 else ""
+        prompt_text = self.tokenizer.decode(selected_tokens[context_tokens:])
+
+        if no_cache:
+            prompt_text += suffix
+
+        return context_text, prompt_text
+
+    def generate_batch(self, batch_size: int, prompt_tokens: int, context_tokens: int = 0, no_cache: bool = False) -> List[Tuple[str, str]]:
+        """
+        Generates a batch of (context, prompt) pairs.
+        """
+        return [self.generate(prompt_tokens, context_tokens, no_cache) for _ in range(batch_size)]
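
And a sketch of `PromptGenerator` layered on the corpus example above; with `no_cache=True` each prompt receives a unique UUID suffix so server-side prefix caching cannot reuse it across requests:

```python
from llama_benchy.prompts import PromptGenerator

gen = PromptGenerator(corpus)  # corpus from the TokenizedCorpus sketch above
context, prompt = gen.generate(prompt_tokens=128, context_tokens=1024)
batch = gen.generate_batch(batch_size=4, prompt_tokens=128, no_cache=True)
print(len(batch))              # 4 (context, prompt) pairs, each with a unique suffix
```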