llmcomp 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,7 +1,5 @@
  from __future__ import annotations

- import hashlib
- import json
  import os
  import warnings
  from abc import ABC, abstractmethod
@@ -29,10 +27,6 @@ if TYPE_CHECKING:


  class Question(ABC):
- # Purpose of _version: it is used in the hash function so if some important part of the implementation changes,
- # we can change the version here and it'll invalidate all the cached results.
- _version = 1
-
  def __init__(
  self,
  name: str | None = "__unnamed",
@@ -315,9 +309,9 @@ class Question(ABC):
  in_, out = payload
  data = results[models.index(model)]
  data[in_["_original_ix"]] = {
- # Deepcopy because in_["messages"] is reused for multiple models and we don't want weird
- # side effects if someone later edits the messages in the resulting dataframe
- "messages": deepcopy(in_["messages"]),
+ # Deepcopy because in_["params"]["messages"] is reused for multiple models
+ # and we don't want weird side effects if someone later edits the messages
+ "messages": deepcopy(in_["params"]["messages"]),
  "question": in_["_question"],
  "answer": out,
  "paraphrase_ix": in_["_paraphrase_ix"],
@@ -343,9 +337,11 @@ class Question(ABC):
  messages_set = self.as_messages()
  runner_input = []
  for paraphrase_ix, messages in enumerate(messages_set):
+ params = {"messages": messages}
+ if self.logit_bias is not None:
+ params["logit_bias"] = self.logit_bias
  this_input = {
- "messages": messages,
- "logit_bias": self.logit_bias,
+ "params": params,
  "_question": messages[-1]["content"],
  "_paraphrase_ix": paraphrase_ix,
  }
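After this hunk, each element returned by get_runner_input() carries the raw API parameters under a single "params" key, while underscore-prefixed keys remain runner-side metadata that is only echoed back with the result. A rough sketch of one element's shape (the message content and logit_bias value are invented for illustration):

# Illustrative only - one runner_input element after this refactor (values invented):
example_element = {
    "params": {
        "messages": [{"role": "user", "content": "2 + 2 = ?"}],
        "logit_bias": {"1234": 100},  # present only when the question sets logit_bias
        # FreeForm also puts "temperature"/"max_tokens" here; Rating and NextToken
        # put "top_logprobs" here (see the get_runner_input hunks below).
    },
    "_question": "2 + 2 = ?",        # underscore keys are metadata, never sent to the API
    "_paraphrase_ix": 0,
}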
@@ -371,21 +367,6 @@ class Question(ABC):
  messages_set.append(messages)
  return messages_set

- ###########################################################################
- # OTHER STUFF
- def hash(self):
- """Unique identifier for caching. Changes when question parameters change.
-
- Used to determine whether we can use cached results.
- Excludes judges since they don't affect the raw LLM answers.
- """
- excluded = {"judges"}
- attributes = {k: v for k, v in self.__dict__.items() if k not in excluded}
- attributes["_version"] = self._version
- json_str = json.dumps(attributes, sort_keys=True)
- return hashlib.sha256(json_str.encode()).hexdigest()
-
-
  class FreeForm(Question):
  """Question type for free-form text generation.

@@ -440,8 +421,8 @@ class FreeForm(Question):
  def get_runner_input(self) -> list[dict]:
  runner_input = super().get_runner_input()
  for el in runner_input:
- el["temperature"] = self.temperature
- el["max_tokens"] = self.max_tokens
+ el["params"]["temperature"] = self.temperature
+ el["params"]["max_tokens"] = self.max_tokens
  return runner_input

  def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
@@ -745,7 +726,7 @@ class Rating(Question):
  def get_runner_input(self) -> list[dict]:
  runner_input = super().get_runner_input()
  for el in runner_input:
- el["top_logprobs"] = self.top_logprobs
+ el["params"]["top_logprobs"] = self.top_logprobs
  return runner_input

  def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
@@ -899,9 +880,8 @@ class NextToken(Question):

  def get_runner_input(self) -> list[dict]:
  runner_input = super().get_runner_input()
-
  for el in runner_input:
- el["top_logprobs"] = self.top_logprobs
+ el["params"]["top_logprobs"] = self.top_logprobs
  el["convert_to_probs"] = self.convert_to_probs
  el["num_samples"] = self.num_samples
  return runner_input
@@ -1,3 +1,4 @@
+ import hashlib
  import json
  import os
  from dataclasses import dataclass
@@ -5,10 +6,61 @@ from datetime import datetime
  from typing import TYPE_CHECKING, Any

  from llmcomp.config import Config
+ from llmcomp.runner.model_adapter import ModelAdapter

  if TYPE_CHECKING:
  from llmcomp.question.question import Question

+ # Bump this to invalidate all cached results when the caching implementation changes.
+ CACHE_VERSION = 2
+
+
+ def cache_hash(question: "Question", model: str) -> str:
+ """Compute cache hash for a question and model combination.
+
+ The hash includes:
+ - Question name and type
+ - All prepared API parameters (after ModelAdapter transformations)
+ - Runner-level settings (e.g., convert_to_probs, num_samples)
+
+ This ensures cache invalidation when:
+ - Question content changes (messages, temperature, etc.)
+ - Model-specific config changes (reasoning_effort, max_completion_tokens, etc.)
+ - Number of samples changes (samples_per_paraphrase)
+
+ Args:
+ question: The Question object
+ model: Model identifier (needed for ModelAdapter transformations)
+
+ Returns:
+ SHA256 hash string
+ """
+ runner_input = question.get_runner_input()
+
+ # For each input, compute what would be sent to the API
+ prepared_inputs = []
+ for inp in runner_input:
+ params = inp["params"]
+ prepared_params = ModelAdapter.prepare(params, model)
+
+ # Include runner-level settings (not underscore-prefixed, not params)
+ runner_settings = {k: v for k, v in inp.items() if not k.startswith("_") and k != "params"}
+
+ prepared_inputs.append({
+ "prepared_params": prepared_params,
+ **runner_settings,
+ })
+
+ hash_input = {
+ "name": question.name,
+ "type": question.type(),
+ "inputs": prepared_inputs,
+ "_version": CACHE_VERSION,
+ }
+
+ json_str = json.dumps(hash_input, sort_keys=True)
+ return hashlib.sha256(json_str.encode()).hexdigest()
+

  @dataclass
  class Result:
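For concreteness, here is a sketch of the dictionary that cache_hash serializes and hashes for a hypothetical single-paraphrase question. All values below are invented, and the exact string returned by question.type() is an assumption:

# Illustrative only - the structure passed to json.dumps(..., sort_keys=True) and SHA256-hashed.
hash_input = {
    "name": "example_question",              # hypothetical question name
    "type": "free_form",                     # whatever question.type() returns (assumption)
    "inputs": [
        {
            # params after ModelAdapter.prepare(), so model-specific rewrites
            # (e.g. max_tokens -> max_completion_tokens) change the hash too
            "prepared_params": {
                "messages": [{"role": "user", "content": "Name a prime number."}],
                "temperature": 1,
                "max_tokens": 50,
            },
            # runner-level settings such as num_samples / convert_to_probs would sit here
        },
    ],
    "_version": 2,                           # CACHE_VERSION
}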
@@ -25,7 +77,7 @@ class Result:

  @classmethod
  def file_path(cls, question: "Question", model: str) -> str:
- return f"{Config.cache_dir}/question/{question.name}/{question.hash()[:7]}/{model}.jsonl"
+ return f"{Config.cache_dir}/question/{question.name}/{cache_hash(question, model)[:7]}.jsonl"

  def save(self):
  path = self.file_path(self.question, self.model)
@@ -50,7 +102,7 @@ class Result:
  metadata = json.loads(lines[0])

  # Hash collision on 7-character prefix - extremely rare
- if metadata["hash"] != question.hash():
+ if metadata["hash"] != cache_hash(question, model):
  os.remove(path)
  print(f"Rare hash collision detected for {question.name}/{model}. Cached result removed.")
  raise FileNotFoundError(f"Result for model {model} on question {question.name} not found in {path}")
@@ -63,7 +115,7 @@ class Result:
  "name": self.question.name,
  "model": self.model,
  "last_update": datetime.now().isoformat(),
- "hash": self.question.hash(),
+ "hash": cache_hash(self.question, self.model),
  }


@@ -101,7 +153,7 @@ class JudgeCache:

  @classmethod
  def file_path(cls, judge: "Question") -> str:
- return f"{Config.cache_dir}/judge/{judge.name}/{judge.hash()[:7]}.json"
+ return f"{Config.cache_dir}/judge/{judge.name}/{cache_hash(judge, judge.model)[:7]}.json"

  def _load(self) -> dict[str | None, dict[str, Any]]:
  """Load cache from disk, or return empty dict if not exists."""
@@ -120,7 +172,7 @@ class JudgeCache:
  metadata = file_data["metadata"]

  # Hash collision on 7-character prefix - extremely rare
- if metadata["hash"] != self.judge.hash():
+ if metadata["hash"] != cache_hash(self.judge, self.judge.model):
  os.remove(path)
  print(f"Rare hash collision detected for judge {self.judge.name}. Cached result removed.")
  self._data = {}
@@ -155,7 +207,7 @@ class JudgeCache:
  "name": self.judge.name,
  "model": self.judge.model,
  "last_update": datetime.now().isoformat(),
- "hash": self.judge.hash(),
+ "hash": cache_hash(self.judge, self.judge.model),
  "prompt": self.judge.paraphrases[0],
  "uses_question": self.judge.uses_question,
  }
@@ -22,12 +22,4 @@ def on_backoff(details):
  on_backoff=on_backoff,
  )
  def openai_chat_completion(*, client, **kwargs):
- if kwargs["model"].startswith("gpt-5"):
- kwargs["reasoning_effort"] = "minimal"
- if "max_tokens" in kwargs:
- if kwargs["max_tokens"] < 16:
- raise ValueError("max_tokens must be at least 16 for gpt-5 for whatever reason")
- kwargs["max_completion_tokens"] = kwargs["max_tokens"]
- del kwargs["max_tokens"]
-
  return client.chat.completions.create(**kwargs)
@@ -0,0 +1,98 @@
+
+ from typing import Callable
+
+ ModelSelector = Callable[[str], bool]
+ PrepareFunction = Callable[[dict, str], dict]
+
+
+ class ModelAdapter:
+ """Adapts API request params for specific models.
+
+ Handlers can be registered to transform params for specific models.
+ All matching handlers are applied in registration order.
+ """
+
+ _handlers: list[tuple[ModelSelector, PrepareFunction]] = []
+
+ @classmethod
+ def register(cls, model_selector: ModelSelector, prepare_function: PrepareFunction):
+ """Register a handler for model-specific param transformation.
+
+ Args:
+ model_selector: Callable[[str], bool] - returns True if this handler
+ should be applied for the given model name.
+ prepare_function: Callable[[dict, str], dict] - transforms params.
+ Receives (params, model) and returns transformed params.
+
+ Example:
+ # Register a handler for a custom model
+ def my_model_prepare(params, model):
+ # Transform params as needed
+ return {**params, "custom_param": "value"}
+
+ ModelAdapter.register(
+ lambda model: model == "my-model",
+ my_model_prepare
+ )
+ """
+ cls._handlers.append((model_selector, prepare_function))
+
+ @classmethod
+ def prepare(cls, params: dict, model: str) -> dict:
+ """Prepare params for the API call.
+
+ Applies all registered handlers whose model_selector returns True.
+ Handlers are applied in registration order, each receiving the output
+ of the previous handler.
+
+ Args:
+ params: The params to transform.
+ model: The model name.
+
+ Returns:
+ Transformed params ready for the API call.
+ """
+ result = params
+ for model_selector, prepare_function in cls._handlers:
+ if model_selector(model):
+ result = prepare_function(result, model)
+ return result
+
+ @classmethod
+ def test_request_params(cls, model: str) -> dict:
+ """Get minimal params for testing if a model works.
+
+ Returns params for a minimal API request to verify connectivity.
+ Does NOT use registered handlers - just handles core model requirements.
+
+ Args:
+ model: The model name.
+
+ Returns:
+ Dict with model, messages, and appropriate token limit params.
+ """
+ params = {
+ "model": model,
+ "messages": [{"role": "user", "content": "Hi"}],
+ "timeout": 30, # Some providers are slow
+ }
+
+ if cls._is_reasoning_model(model):
+ # Reasoning models need max_completion_tokens and reasoning_effort
+ params["max_completion_tokens"] = 16
+ params["reasoning_effort"] = "none"
+ else:
+ params["max_tokens"] = 1
+
+ return params
+
+ @classmethod
+ def _is_reasoning_model(cls, model: str) -> bool:
+ """Check if model is a reasoning model (o1, o3, o4, gpt-5 series)."""
+ return (
+ model.startswith("o1")
+ or model.startswith("o3")
+ or model.startswith("o4")
+ or model.startswith("gpt-5")
+ )
+
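The gpt-5 special-casing removed from openai_chat_completion above now has a natural home behind this hook. The diff does not show where llmcomp registers its built-in handlers, so the following is only a sketch of how that removed logic could be expressed with ModelAdapter.register:

# Sketch only: re-expressing the removed gpt-5 handling as a registered handler.
from llmcomp.runner.model_adapter import ModelAdapter


def _gpt5_prepare(params: dict, model: str) -> dict:
    params = dict(params)  # work on a copy instead of mutating the caller's dict
    params["reasoning_effort"] = "minimal"
    if "max_tokens" in params:
        if params["max_tokens"] < 16:
            raise ValueError("max_tokens must be at least 16 for gpt-5")
        # gpt-5 expects max_completion_tokens rather than max_tokens
        params["max_completion_tokens"] = params.pop("max_tokens")
    return params


ModelAdapter.register(lambda model: model.startswith("gpt-5"), _gpt5_prepare)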
llmcomp/runner/runner.py CHANGED
@@ -8,6 +8,7 @@ from tqdm import tqdm

  from llmcomp.config import Config, NoClientForModel
  from llmcomp.runner.chat_completion import openai_chat_completion
+ from llmcomp.runner.model_adapter import ModelAdapter

  NO_LOGPROBS_WARNING = """\
  Failed to get logprobs because {model} didn't send them.
@@ -32,31 +33,26 @@ class Runner:
  self._client = Config.client_for_model(self.model)
  return self._client

- def get_text(
- self,
- messages: list[dict],
- temperature=1,
- max_tokens=None,
- max_completion_tokens=None,
- **kwargs,
- ) -> str:
- """Just a simple text request. Might get more arguments later."""
- args = {
- "client": self.client,
- "model": self.model,
- "messages": messages,
- "temperature": temperature,
- "timeout": Config.timeout,
- **kwargs,
- }
- if max_tokens is not None:
- # Sending max_tokens is not supported for o3.
- args["max_tokens"] = max_tokens
+ def _prepare_for_model(self, params: dict) -> dict:
+ """Prepare params for the API call via ModelAdapter.
+
+ Also adds timeout from Config. Timeout is added here (not in ModelAdapter)
+ because it doesn't affect API response content and shouldn't be part of the cache hash.
+
+ Note: timeout is set first so that ModelAdapter handlers can override it if needed.
+ """
+ prepared = ModelAdapter.prepare(params, self.model)
+ return {"timeout": Config.timeout, **prepared}

- if max_completion_tokens is not None:
- args["max_completion_tokens"] = max_completion_tokens
+ def get_text(self, params: dict) -> str:
+ """Get a text completion from the model.

- completion = openai_chat_completion(**args)
+ Args:
+ params: Dictionary of parameters for the API.
+ Must include 'messages'. Other common keys: 'temperature', 'max_tokens'.
+ """
+ prepared = self._prepare_for_model(params)
+ completion = openai_chat_completion(client=self.client, **prepared)
  try:
  return completion.choices[0].message.content
  except Exception:
@@ -65,15 +61,22 @@ class Runner:

  def single_token_probs(
  self,
- messages: list[dict],
- top_logprobs: int = 20,
+ params: dict,
+ *,
  num_samples: int = 1,
  convert_to_probs: bool = True,
- **kwargs,
  ) -> dict:
+ """Get probability distribution of the next token, optionally averaged over multiple samples.
+
+ Args:
+ params: Dictionary of parameters for the API.
+ Must include 'messages'. Other common keys: 'top_logprobs', 'logit_bias'.
+ num_samples: Number of samples to average over. Default: 1.
+ convert_to_probs: If True, convert logprobs to probabilities. Default: True.
+ """
  probs = {}
  for _ in range(num_samples):
- new_probs = self.single_token_probs_one_sample(messages, top_logprobs, convert_to_probs, **kwargs)
+ new_probs = self.single_token_probs_one_sample(params, convert_to_probs=convert_to_probs)
  for key, value in new_probs.items():
  probs[key] = probs.get(key, 0) + value
  result = {key: value / num_samples for key, value in probs.items()}
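With the params-dict convention, callers build one dictionary of API parameters and hand it to the runner; _prepare_for_model then applies the ModelAdapter handlers and adds Config.timeout before the request is sent. A brief usage sketch, assuming a Runner constructed for a single model name (the model name and message contents are placeholders):

# Illustrative usage of the params-based calling convention (placeholder values).
runner = Runner("gpt-4o-mini")  # assumption: Runner takes a model name

text = runner.get_text({
    "messages": [{"role": "user", "content": "Say hi."}],
    "temperature": 0,
    "max_tokens": 20,
})

probs = runner.single_token_probs(
    {"messages": [{"role": "user", "content": "Heads or tails?"}], "top_logprobs": 5},
    num_samples=2,
)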
@@ -82,23 +85,31 @@ class Runner:

  def single_token_probs_one_sample(
  self,
- messages: list[dict],
- top_logprobs: int = 20,
+ params: dict,
+ *,
  convert_to_probs: bool = True,
- **kwargs,
  ) -> dict:
- """Returns probabilities of the next token. Always samples 1 token."""
- completion = openai_chat_completion(
- client=self.client,
- model=self.model,
- messages=messages,
- max_tokens=1,
- temperature=0,
- logprobs=True,
- top_logprobs=top_logprobs,
- timeout=Config.timeout,
- **kwargs,
- )
+ """Get probability distribution of the next token (single sample).
+
+ Args:
+ params: Dictionary of parameters for the API.
+ Must include 'messages'. Other common keys: 'top_logprobs', 'logit_bias'.
+ convert_to_probs: If True, convert logprobs to probabilities. Default: True.
+
+ Note: This function forces max_tokens=1, temperature=0, logprobs=True.
+ """
+ # Build complete params with defaults and forced params
+ complete_params = {
+ # Default for top_logprobs, can be overridden by params:
+ "top_logprobs": 20,
+ **params,
+ # These are required for single_token_probs semantics (cannot be overridden):
+ "max_tokens": 1,
+ "temperature": 0,
+ "logprobs": True,
+ }
+ prepared = self._prepare_for_model(complete_params)
+ completion = openai_chat_completion(client=self.client, **prepared)

  if completion.choices[0].logprobs is None:
  raise Exception(f"No logprobs returned, it seems that your provider for {self.model} doesn't support that.")
@@ -131,8 +142,8 @@ class Runner:
  FUNC is get_text or single_token_probs. Examples:

  kwargs_list = [
- {"messages": [{"role": "user", "content": "Hello"}]},
- {"messages": [{"role": "user", "content": "Bye"}], "temperature": 0.7},
+ {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
+ {"params": {"messages": [{"role": "user", "content": "Bye"}], "temperature": 0.7}},
  ]
  for in_, out in runner.get_many(runner.get_text, kwargs_list):
  print(in_, "->", out)
@@ -140,8 +151,8 @@ class Runner:
  or

  kwargs_list = [
- {"messages": [{"role": "user", "content": "Hello"}]},
- {"messages": [{"role": "user", "content": "Bye"}]},
+ {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
+ {"params": {"messages": [{"role": "user", "content": "Bye"}]}},
  ]
  for in_, out in runner.get_many(runner.single_token_probs, kwargs_list):
  print(in_, "->", out)
@@ -149,10 +160,10 @@ class Runner:
  (FUNC that is a different callable should also work)

  This function returns a generator that yields pairs (input, output),
- where input is an element from KWARGS_SET and output is the thing returned by
+ where input is an element from KWARGS_LIST and output is the thing returned by
  FUNC for this input.

- Dictionaries in KWARGS_SET might include optional keys starting with underscore,
+ Dictionaries in KWARGS_LIST might include optional keys starting with underscore,
  they are just ignored, but they are returned in the first element of the pair, so that's useful
  for passing some additional information that will be later paired with the output.

@@ -179,7 +190,8 @@ class Runner:
  raise
  except Exception as e:
  # Truncate messages for readability
- messages = func_kwargs.get("messages", [])
+ params = func_kwargs.get("params", {})
+ messages = params.get("messages", [])
  if messages:
  last_msg = str(messages[-1].get("content", ""))[:100]
  msg_info = f", last message: {last_msg!r}..."
@@ -208,15 +220,17 @@ class Runner:

  def sample_probs(
  self,
- messages: list[dict],
+ params: dict,
  *,
  num_samples: int,
- max_tokens: int,
- temperature: float = 1,
- **kwargs,
  ) -> dict:
  """Sample answers NUM_SAMPLES times. Returns probabilities of answers.

+ Args:
+ params: Dictionary of parameters for the API.
+ Must include 'messages'. Other common keys: 'max_tokens', 'temperature'.
+ num_samples: Number of samples to collect.
+
  Works only if the API supports `n` parameter.

  Usecases:
@@ -228,16 +242,13 @@ class Runner:
  cnts = defaultdict(int)
  for i in range(((num_samples - 1) // 128) + 1):
  n = min(128, num_samples - i * 128)
- completion = openai_chat_completion(
- client=self.client,
- model=self.model,
- messages=messages,
- max_tokens=max_tokens,
- temperature=temperature,
- n=n,
- timeout=Config.timeout,
- **kwargs,
- )
+ # Build complete params with forced param
+ complete_params = {
+ **params,
+ "n": n,
+ }
+ prepared = self._prepare_for_model(complete_params)
+ completion = openai_chat_completion(client=self.client, **prepared)
  for choice in completion.choices:
  cnts[choice.message.content] += 1
  if sum(cnts.values()) != num_samples:
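The batching above is unchanged by this release: requests are issued in chunks of at most 128 completions via the n parameter. As a worked example of that arithmetic, num_samples=300 produces batch sizes 128, 128, and 44:

# Worked example of the batching arithmetic used by sample_probs.
num_samples = 300
batches = [min(128, num_samples - i * 128) for i in range(((num_samples - 1) // 128) + 1)]
assert batches == [128, 128, 44] and sum(batches) == num_samples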