llmcomp 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,193 @@
+ import json
+ import os
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import TYPE_CHECKING, Any
+
+ from llmcomp.config import Config
+
+ if TYPE_CHECKING:
+     from llmcomp.question.question import Question
+
+
+ @dataclass
+ class Result:
+     """Cache for question results per model.
+
+     Storage format (JSONL):
+         Line 1: metadata dict
+         Lines 2+: one JSON object per result entry
+     """
+
+     question: "Question"
+     model: str
+     data: list[dict]
+
+     @classmethod
+     def file_path(cls, question: "Question", model: str) -> str:
+         return f"{Config.cache_dir}/question/{question.name}/{question.hash()[:7]}/{model}.jsonl"
+
+     def save(self):
+         path = self.file_path(self.question, self.model)
+         os.makedirs(os.path.dirname(path), exist_ok=True)
+         with open(path, "w") as f:
+             f.write(json.dumps(self._metadata()) + "\n")
+             for d in self.data:
+                 f.write(json.dumps(d) + "\n")
+
+     @classmethod
+     def load(cls, question: "Question", model: str) -> "Result":
+         path = cls.file_path(question, model)
+
+         if not os.path.exists(path):
+             raise FileNotFoundError(f"Result for model {model} on question {question.name} not found in {path}")
+
+         with open(path, "r") as f:
+             lines = f.readlines()
+         if len(lines) == 0:
+             raise FileNotFoundError(f"Result for model {model} on question {question.name} is empty.")
+
+         metadata = json.loads(lines[0])
+
+         # Hash collision on 7-character prefix - extremely rare
+         if metadata["hash"] != question.hash():
+             os.remove(path)
+             print(f"Rare hash collision detected for {question.name}/{model}. Cached result removed.")
+             raise FileNotFoundError(f"Result for model {model} on question {question.name} not found in {path}")
+
+         data = [json.loads(line) for line in lines[1:]]
+         return cls(question, model, data)
+
+     def _metadata(self) -> dict:
+         return {
+             "name": self.question.name,
+             "model": self.model,
+             "last_update": datetime.now().isoformat(),
+             "hash": self.question.hash(),
+         }
+
+
+ class JudgeCache:
+     """Key-value cache for judge results.
+
+     Storage format (JSON):
+         {
+             "metadata": {
+                 "name": "...",
+                 "model": "...",
+                 "last_update": "...",
+                 "hash": "...",
+                 "prompt": "...",
+                 "uses_question": true/false
+             },
+             "data": {
+                 "<question>": {
+                     "<answer>": <judge_response>,
+                     ...
+                 },
+                 ...
+             }
+         }
+
+     The key is the (question, answer) pair.
+
+     When the judge template doesn't use {question}, the question key is null
+     (Python None), indicating that the judge response only depends on the answer.
+     """
+
+     def __init__(self, judge: "Question"):
+         self.judge = judge
+         self._data: dict[str | None, dict[str, Any]] | None = None
+
+     @classmethod
+     def file_path(cls, judge: "Question") -> str:
+         return f"{Config.cache_dir}/judge/{judge.name}/{judge.hash()[:7]}.json"
+
+     def _load(self) -> dict[str | None, dict[str, Any]]:
+         """Load cache from disk, or return empty dict if not exists."""
+         if self._data is not None:
+             return self._data
+
+         path = self.file_path(self.judge)
+
+         if not os.path.exists(path):
+             self._data = {}
+             return self._data
+
+         with open(path, "r") as f:
+             file_data = json.load(f)
+
+         metadata = file_data["metadata"]
+
+         # Hash collision on 7-character prefix - extremely rare
+         if metadata["hash"] != self.judge.hash():
+             os.remove(path)
+             print(f"Rare hash collision detected for judge {self.judge.name}. Cached result removed.")
+             self._data = {}
+             return self._data
+
+         # Sanity check: prompt should match (if hash matches, this should always pass)
+         if metadata.get("prompt") != self.judge.paraphrases[0]:
+             os.remove(path)
+             print(f"Judge prompt mismatch for {self.judge.name}. Cached result removed.")
+             self._data = {}
+             return self._data
+
+         self._data = file_data["data"]
+         return self._data
+
+     def save(self):
+         """Save cache to disk."""
+         if self._data is None:
+             return
+
+         path = self.file_path(self.judge)
+         os.makedirs(os.path.dirname(path), exist_ok=True)
+         file_data = {
+             "metadata": self._metadata(),
+             "data": self._data,
+         }
+         with open(path, "w") as f:
+             json.dump(file_data, f, indent=2)
+
+     def _metadata(self) -> dict:
+         return {
+             "name": self.judge.name,
+             "model": self.judge.model,
+             "last_update": datetime.now().isoformat(),
+             "hash": self.judge.hash(),
+             "prompt": self.judge.paraphrases[0],
+             "uses_question": self.judge.uses_question,
+         }
+
+     def _key(self, question: str | None) -> str:
+         """Convert question to cache key. None becomes 'null' string for JSON compatibility."""
+         # JSON serializes None as null, which becomes the string key "null" when loaded
+         # We handle this by using the string "null" internally
+         return "null" if question is None else question
+
+     def get(self, question: str | None, answer: str) -> Any | None:
+         """Get the judge response for a (question, answer) pair."""
+         data = self._load()
+         key = self._key(question)
+         if key not in data:
+             return None
+         return data[key].get(answer)
+
+     def get_uncached(self, pairs: list[tuple[str | None, str]]) -> list[tuple[str | None, str]]:
+         """Return list of (question, answer) pairs that are NOT in cache."""
+         data = self._load()
+         uncached = []
+         for q, a in pairs:
+             key = self._key(q)
+             if key not in data or a not in data[key]:
+                 uncached.append((q, a))
+         return uncached
+
+     def set(self, question: str | None, answer: str, judge_response: Any):
+         """Add a single entry to cache."""
+         data = self._load()
+         key = self._key(question)
+         if key not in data:
+             data[key] = {}
+         data[key][answer] = judge_response
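The two classes above are read-through caches keyed by the question name and a 7-character hash prefix. As a rough illustration (not part of the package diff), this is how JudgeCache would typically be driven; `judge` is assumed to be a Question instance and `run_judge` is a hypothetical helper that actually calls the judge model:

    cache = JudgeCache(judge)
    pairs = [("What is 2+2?", "4"), (None, "I refuse to answer.")]
    for q, a in cache.get_uncached(pairs):      # only the pairs missing from the cache
        cache.set(q, a, run_judge(q, a))        # run_judge is hypothetical, not part of llmcomp
    cache.save()                                # writes metadata + data as a single JSON file
    print(cache.get("What is 2+2?", "4"))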
@@ -0,0 +1,33 @@
+ import backoff
+ import openai
+
+
+ def on_backoff(details):
+     """We don't print connection errors because there are sometimes a lot of them and they're not interesting."""
+     exception_details = details["exception"]
+     if not str(exception_details).startswith("Connection error."):
+         print(exception_details)
+
+
+ @backoff.on_exception(
+     wait_gen=backoff.expo,
+     exception=(
+         openai.RateLimitError,
+         openai.APIConnectionError,
+         openai.APITimeoutError,
+         openai.InternalServerError,
+     ),
+     max_value=60,
+     factor=1.5,
+     on_backoff=on_backoff,
+ )
+ def openai_chat_completion(*, client, **kwargs):
+     if kwargs["model"].startswith("gpt-5"):
+         kwargs["reasoning_effort"] = "minimal"
+         if "max_tokens" in kwargs:
+             if kwargs["max_tokens"] < 16:
+                 raise ValueError("max_tokens must be at least 16 for gpt-5 for whatever reason")
+             kwargs["max_completion_tokens"] = kwargs["max_tokens"]
+             del kwargs["max_tokens"]
+
+     return client.chat.completions.create(**kwargs)
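For context (not part of the package diff), a minimal usage sketch of the retry wrapper above; the model name is a placeholder and the client assumes an OPENAI_API_KEY in the environment:

    import openai

    client = openai.OpenAI()
    completion = openai_chat_completion(
        client=client,
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": "Say hi"}],
        max_tokens=16,        # for gpt-5 models the wrapper renames this to max_completion_tokens
    )
    print(completion.choices[0].message.content)

Rate limits, connection errors, timeouts, and internal server errors are retried with exponential backoff (capped at 60 seconds between attempts); any other exception propagates immediately.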
@@ -0,0 +1,249 @@
+ import math
+ import warnings
+ from collections import defaultdict
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from threading import Lock
+
+ from tqdm import tqdm
+
+ from llmcomp.config import Config, NoClientForModel
+ from llmcomp.runner.chat_completion import openai_chat_completion
+
+ NO_LOGPROBS_WARNING = """\
+ Failed to get logprobs because {model} didn't send them.
+ Returning empty dict, I hope you can handle it.
+
+ Last completion has empty logprobs.content:
+ {completion}
+ """
+
+
+ class Runner:
+     def __init__(self, model: str):
+         self.model = model
+         self._client = None
+         self._get_client_lock = Lock()
+
+     @property
+     def client(self):
+         if self._client is None:
+             with self._get_client_lock:
+                 if self._client is None:
+                     self._client = Config.client_for_model(self.model)
+         return self._client
+
+     def get_text(
+         self,
+         messages: list[dict],
+         temperature=1,
+         max_tokens=None,
+         max_completion_tokens=None,
+         **kwargs,
+     ) -> str:
+         """Just a simple text request. Might get more arguments later."""
+         args = {
+             "client": self.client,
+             "model": self.model,
+             "messages": messages,
+             "temperature": temperature,
+             "timeout": Config.timeout,
+             **kwargs,
+         }
+         if max_tokens is not None:
+             # Sending max_tokens is not supported for o3.
+             args["max_tokens"] = max_tokens
+
+         if max_completion_tokens is not None:
+             args["max_completion_tokens"] = max_completion_tokens
+
+         completion = openai_chat_completion(**args)
+         try:
+             return completion.choices[0].message.content
+         except Exception:
+             print(completion)
+             raise
+
+     def single_token_probs(
+         self,
+         messages: list[dict],
+         top_logprobs: int = 20,
+         num_samples: int = 1,
+         convert_to_probs: bool = True,
+         **kwargs,
+     ) -> dict:
+         probs = {}
+         for _ in range(num_samples):
+             new_probs = self.single_token_probs_one_sample(messages, top_logprobs, convert_to_probs, **kwargs)
+             for key, value in new_probs.items():
+                 probs[key] = probs.get(key, 0) + value
+         result = {key: value / num_samples for key, value in probs.items()}
+         result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
+         return result
+
+     def single_token_probs_one_sample(
+         self,
+         messages: list[dict],
+         top_logprobs: int = 20,
+         convert_to_probs: bool = True,
+         **kwargs,
+     ) -> dict:
+         """Returns probabilities of the next token. Always samples 1 token."""
+         completion = openai_chat_completion(
+             client=self.client,
+             model=self.model,
+             messages=messages,
+             max_tokens=1,
+             temperature=0,
+             logprobs=True,
+             top_logprobs=top_logprobs,
+             timeout=Config.timeout,
+             **kwargs,
+         )
+
+         if completion.choices[0].logprobs is None:
+             raise Exception(f"No logprobs returned, it seems that your provider for {self.model} doesn't support that.")
+
+         try:
+             logprobs = completion.choices[0].logprobs.content[0].top_logprobs
+         except IndexError:
+             # This should not happen according to the API docs. But it sometimes does.
+             print(NO_LOGPROBS_WARNING.format(model=self.model, completion=completion))
+             return {}
+
+         result = {}
+         for el in logprobs:
+             result[el.token] = math.exp(el.logprob) if convert_to_probs else el.logprob
+
+         return result
+
+     def get_many(
+         self,
+         func,
+         kwargs_list,
+         *,
+         max_workers=None,
+         silent=False,
+         title=None,
+         executor=None,
+     ):
+         """Call FUNC with arguments from KWARGS_LIST in MAX_WORKERS parallel threads.
+
+         FUNC is get_text or single_token_probs. Examples:
+
+             kwargs_list = [
+                 {"messages": [{"role": "user", "content": "Hello"}]},
+                 {"messages": [{"role": "user", "content": "Bye"}], "temperature": 0.7},
+             ]
+             for in_, out in runner.get_many(runner.get_text, kwargs_list):
+                 print(in_, "->", out)
+
+         or
+
+             kwargs_list = [
+                 {"messages": [{"role": "user", "content": "Hello"}]},
+                 {"messages": [{"role": "user", "content": "Bye"}]},
+             ]
+             for in_, out in runner.get_many(runner.single_token_probs, kwargs_list):
+                 print(in_, "->", out)
+
+         (Any other callable passed as FUNC should also work.)
+
+         This function returns a generator that yields pairs (input, output),
+         where input is an element of KWARGS_LIST and output is the value returned by
+         FUNC for that input.
+
+         Dictionaries in KWARGS_LIST might include optional keys starting with an underscore.
+         They are ignored when calling FUNC, but are returned in the first element of the pair,
+         which is useful for passing additional information to be paired with the output later.
+
+         Other parameters:
+         - MAX_WORKERS: number of parallel threads, overrides Config.max_workers.
+         - SILENT: passed to tqdm.
+         - TITLE: passed to tqdm as desc.
+         - EXECUTOR: optional ThreadPoolExecutor instance, if you want many calls to get_many to run within
+           the same executor. MAX_WORKERS and Config.max_workers are then ignored.
+         """
+         if max_workers is None:
+             max_workers = Config.max_workers
+
+         executor_created = False
+         if executor is None:
+             executor = ThreadPoolExecutor(max_workers)
+             executor_created = True
+
+         def get_data(kwargs):
+             func_kwargs = {key: val for key, val in kwargs.items() if not key.startswith("_")}
+             try:
+                 result = func(**func_kwargs)
+             except NoClientForModel:
+                 raise
+             except Exception as e:
+                 # Truncate messages for readability
+                 messages = func_kwargs.get("messages", [])
+                 if messages:
+                     last_msg = str(messages[-1].get("content", ""))[:100]
+                     msg_info = f", last message: {last_msg!r}..."
+                 else:
+                     msg_info = ""
+                 warnings.warn(
+                     f"Unexpected error (probably API-related), runner returns None. "
+                     f"Model: {self.model}, function: {func.__name__}{msg_info}. "
+                     f"Error: {type(e).__name__}: {e}"
+                 )
+                 result = None
+             return kwargs, result
+
+         futures = [executor.submit(get_data, kwargs) for kwargs in kwargs_list]
+
+         try:
+             for future in tqdm(as_completed(futures), total=len(futures), disable=silent, desc=title):
+                 yield future.result()
+         except (Exception, KeyboardInterrupt):
+             for fut in futures:
+                 fut.cancel()
+             raise
+         finally:
+             if executor_created:
+                 executor.shutdown(wait=False)
+
+     def sample_probs(
+         self,
+         messages: list[dict],
+         *,
+         num_samples: int,
+         max_tokens: int,
+         temperature: float = 1,
+         **kwargs,
+     ) -> dict:
+         """Sample answers NUM_SAMPLES times. Returns probabilities of answers.
+
+         Works only if the API supports the `n` parameter.
+
+         Use cases:
+         * It should be faster and cheaper than get_many + get_text
+           (uses the `n` parameter so you don't pay for input tokens for each request separately).
+         * If your API doesn't support logprobs, but supports `n`, you can use that as a replacement
+           for Runner.single_token_probs.
+         """
+         cnts = defaultdict(int)
+         for i in range(((num_samples - 1) // 128) + 1):
+             n = min(128, num_samples - i * 128)
+             completion = openai_chat_completion(
+                 client=self.client,
+                 model=self.model,
+                 messages=messages,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 n=n,
+                 timeout=Config.timeout,
+                 **kwargs,
+             )
+             for choice in completion.choices:
+                 cnts[choice.message.content] += 1
+         if sum(cnts.values()) != num_samples:
+             raise Exception(
+                 f"Something weird happened. Expected {num_samples} samples, got {sum(cnts.values())}. Maybe n parameter is ignored for {self.model}?"
+             )
+         result = {key: val / num_samples for key, val in cnts.items()}
+         result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
+         return result
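To tie the pieces together, a hedged end-to-end sketch of the Runner API above (not part of the package diff); the model name and prompts are placeholders, and Config is assumed to know how to build a client for that model:

    runner = Runner("gpt-4o-mini")  # placeholder model name

    # Parallel requests; keys starting with "_" are skipped when calling the function
    # but come back in the input half of each (input, output) pair.
    kwargs_list = [
        {"messages": [{"role": "user", "content": f"Is {n} prime? Answer yes or no."}], "_n": n}
        for n in range(2, 12)
    ]
    for in_, out in runner.get_many(runner.get_text, kwargs_list, title="primality"):
        print(in_["_n"], "->", out)

    # Empirical answer distribution: 300 samples are requested as batches of 128, 128 and 44
    # through the `n` parameter, then normalized to frequencies.
    probs = runner.sample_probs(
        [{"role": "user", "content": "Heads or tails?"}], num_samples=300, max_tokens=2
    )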
llmcomp/utils.py ADDED
@@ -0,0 +1,97 @@
+ """Utility functions for llmcomp."""
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+
+ def write_jsonl(path: str | Path, data: list[dict[str, Any]]) -> None:
+     """Write a list of dictionaries to a JSONL file.
+
+     Each dictionary is written as a JSON object on a separate line.
+
+     Args:
+         path: Path to the output JSONL file
+         data: List of dictionaries to write
+
+     Example:
+         >>> data = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]
+         >>> write_jsonl("people.jsonl", data)
+     """
+     path = Path(path)
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     with open(path, "w", encoding="utf-8") as f:
+         for item in data:
+             f.write(json.dumps(item) + "\n")
+
+
+ def read_jsonl(path: str | Path) -> list[dict[str, Any]]:
+     """Read a JSONL file and return a list of dictionaries.
+
+     Each line is parsed as a JSON object.
+
+     Args:
+         path: Path to the input JSONL file
+
+     Returns:
+         List of dictionaries, one per line in the file
+
+     Example:
+         >>> data = read_jsonl("people.jsonl")
+         >>> print(data)
+         [{'name': 'Alice', 'age': 30}, {'name': 'Bob', 'age': 25}]
+     """
+     path = Path(path)
+     data = []
+
+     with open(path, "r", encoding="utf-8") as f:
+         for line in f:
+             line = line.strip()
+             if line:  # Skip empty lines
+                 data.append(json.loads(line))
+
+     return data
+
+
+ def get_error_bars(fraction_list, rng=None, alpha=0.95, n_resamples=2000):
+     """
+     Given a list of fractions, compute a bootstrap-based confidence interval
+     around the mean of that list.
+
+     Returns:
+         (center, lower_err, upper_err)
+     where:
+         - center = mean of fraction_list
+         - lower_err = center - lower_CI
+         - upper_err = upper_CI - center
+
+     So if you want to pass these to plt.errorbar as yerr:
+         yerr = [[lower_err], [upper_err]]
+     """
+     import numpy as np
+
+     if rng is None:
+         rng = np.random.default_rng(0)
+     fractions = np.array(fraction_list, dtype=float)
+
+     # Edge cases
+     if len(fractions) == 0:
+         return (0.0, 0.0, 0.0)
+     if len(fractions) == 1:
+         return (fractions[0], 0.0, 0.0)
+
+     boot_means = []
+     for _ in range(n_resamples):
+         sample = rng.choice(fractions, size=len(fractions), replace=True)
+         boot_means.append(np.mean(sample))
+     boot_means = np.array(boot_means)
+
+     lower_bound = np.percentile(boot_means, (1 - alpha) / 2 * 100)
+     upper_bound = np.percentile(boot_means, (1 - (1 - alpha) / 2) * 100)
+     center = np.mean(fractions)
+
+     lower_err = center - lower_bound
+     upper_err = upper_bound - center
+
+     return (center, lower_err, upper_err)
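To make the return convention concrete, a small sketch (not part of the package diff) that feeds get_error_bars into matplotlib, as the docstring suggests; the fractions are made-up numbers:

    import matplotlib.pyplot as plt

    fractions = [0.62, 0.58, 0.71, 0.66, 0.60]   # e.g. per-paraphrase success rates
    center, lower_err, upper_err = get_error_bars(fractions)

    plt.errorbar([0], [center], yerr=[[lower_err], [upper_err]], fmt="o", capsize=4)
    plt.ylabel("fraction")
    plt.show()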