llmcomp-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmcomp/__init__.py +3 -0
- llmcomp/config.py +245 -0
- llmcomp/question/judge.py +146 -0
- llmcomp/question/plots.py +283 -0
- llmcomp/question/question.py +974 -0
- llmcomp/question/result.py +193 -0
- llmcomp/runner/chat_completion.py +33 -0
- llmcomp/runner/runner.py +249 -0
- llmcomp/utils.py +97 -0
- llmcomp-1.0.0.dist-info/METADATA +175 -0
- llmcomp-1.0.0.dist-info/RECORD +13 -0
- llmcomp-1.0.0.dist-info/WHEEL +4 -0
- llmcomp-1.0.0.dist-info/licenses/LICENSE +21 -0
llmcomp/__init__.py
ADDED
llmcomp/config.py
ADDED
@@ -0,0 +1,245 @@
"""Global configuration for llmcomp.

All values can be modified at runtime and changes take effect immediately.

Example:
    from llmcomp import Config

    # Set values
    Config.timeout = 100
    Config.max_workers = 50
    Config.cache_dir = "my_cache"

    # Values are read dynamically, so changes apply immediately
"""

import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

import openai

from llmcomp.runner.chat_completion import openai_chat_completion


class NoClientForModel(Exception):
    """Raised when no working API client can be found for a model."""

    pass


def _get_api_keys(env_var_name: str, *, include_suffixed: bool = True) -> list[str]:
    """Get API keys from environment variable(s).

    Args:
        env_var_name: Base environment variable name (e.g., "OPENAI_API_KEY")
        include_suffixed: If True, also look for {env_var_name}_* variants (default: True)

    Returns list of API keys found.
    """
    key_names = [env_var_name]

    if include_suffixed:
        for env_var in os.environ:
            if env_var.startswith(f"{env_var_name}_"):
                key_names.append(env_var)

    keys = [os.getenv(name) for name in key_names]
    return [key for key in keys if key is not None]


def _discover_url_key_pairs() -> list[tuple[str, str]]:
    """Discover URL-key pairs from environment variables.

    Discovers (including _* suffix variants for each):
    - OPENAI_API_KEY for OpenAI
    - OPENROUTER_API_KEY for OpenRouter
    - TINKER_API_KEY for Tinker (OpenAI-compatible)

    Returns list of (base_url, api_key) tuples.
    """
    url_pairs = []

    # OpenAI
    for key in _get_api_keys("OPENAI_API_KEY"):
        url_pairs.append(("https://api.openai.com/v1", key))

    # OpenRouter
    for key in _get_api_keys("OPENROUTER_API_KEY"):
        url_pairs.append(("https://openrouter.ai/api/v1", key))

    # Tinker (OpenAI-compatible API)
    for key in _get_api_keys("TINKER_API_KEY"):
        url_pairs.append(("https://tinker.thinkingmachines.dev/services/tinker-prod/oai/api/v1", key))

    return url_pairs


class _ConfigMeta(type):
    """Metaclass for Config to support lazy initialization of url_key_pairs."""

    _url_key_pairs: list[tuple[str, str]] | None = None

    @property
    def url_key_pairs(cls) -> list[tuple[str, str]]:
        """URL-key pairs for client creation.

        Auto-discovered from environment variables on first access.
        Users can modify this list (add/remove pairs).
        """
        if cls._url_key_pairs is None:
            cls._url_key_pairs = _discover_url_key_pairs()
        return cls._url_key_pairs

    @url_key_pairs.setter
    def url_key_pairs(cls, value: list[tuple[str, str]] | None):
        cls._url_key_pairs = value


class Config(metaclass=_ConfigMeta):
    """Global configuration for llmcomp.

    Modify class attributes directly to change configuration.
    Changes take effect immediately for subsequent operations.
    """

    # Default values for reset()
    _defaults = {
        "timeout": 60,
        "max_workers": 100,
        "cache_dir": "llmcomp_cache",
        "yaml_dir": "questions",
        "verbose": False,
    }

    # API request timeout in seconds
    timeout: int = _defaults["timeout"]

    # Maximum number of concurrent API requests (total across all models, not per model).
    # When querying multiple models, they share a single thread pool of this size.
    max_workers: int = _defaults["max_workers"]

    # Directory for caching results (question results and judge results)
    cache_dir: str = _defaults["cache_dir"]

    # Directory for loading questions from YAML files
    yaml_dir: str = _defaults["yaml_dir"]

    # Whether to print verbose messages (e.g., API client discovery)
    verbose: bool = _defaults["verbose"]

    # Cache of OpenAI clients by model name (or NoClientForModel exception if failed).
    # Users can inspect/modify this if needed.
    client_cache: dict[str, openai.OpenAI | NoClientForModel] = {}

    # Per-model locks to ensure only one thread creates a client for a given model
    _model_locks: dict[str, Lock] = {}
    _model_locks_lock: Lock = Lock()

    @classmethod
    def reset(cls):
        """Reset all configuration values to their defaults."""
        for key, value in cls._defaults.items():
            setattr(cls, key, value)
        cls.client_cache.clear()
        cls._model_locks.clear()
        _ConfigMeta._url_key_pairs = None

    @classmethod
    def _get_model_lock(cls, model: str) -> Lock:
        """Get or create a lock for the given model."""
        with cls._model_locks_lock:
            if model not in cls._model_locks:
                cls._model_locks[model] = Lock()
            return cls._model_locks[model]

    @classmethod
    def client_for_model(cls, model: str) -> openai.OpenAI:
        """Get or create an OpenAI client for the given model.

        Clients are cached in client_cache. The first call for a model
        will test available URL-key pairs in parallel to find one that works.
        Thread-safe: only one thread will attempt to create a client per model.
        Failures are also cached to avoid repeated attempts.
        """
        # Fast path: result already cached (success or failure)
        if model in cls.client_cache:
            cached = cls.client_cache[model]
            if isinstance(cached, NoClientForModel):
                raise cached
            return cached

        # Slow path: acquire per-model lock to ensure only one thread creates the client
        with cls._get_model_lock(model):
            # Double-check after acquiring lock
            if model in cls.client_cache:
                cached = cls.client_cache[model]
                if isinstance(cached, NoClientForModel):
                    raise cached
                return cached

            try:
                client = cls._find_openai_client(model)
                cls.client_cache[model] = client
                return client
            except NoClientForModel as e:
                cls.client_cache[model] = e
                raise

    @classmethod
    def _find_openai_client(cls, model: str) -> openai.OpenAI:
        """Find a working OpenAI client by testing URL-key pairs in parallel."""
        all_pairs = cls.url_key_pairs

        if not all_pairs:
            raise NoClientForModel(
                f"No URL-key pairs available for model {model}. "
                "Set an API key (e.g. OPENAI_API_KEY) or Config.url_key_pairs."
            )

        # Test all pairs in parallel
        with ThreadPoolExecutor(max_workers=len(all_pairs)) as executor:
            future_to_pair = {
                executor.submit(cls._test_url_key_pair, model, url, key): (url, key) for url, key in all_pairs
            }

            for future in as_completed(future_to_pair):
                client = future.result()
                if client:
                    # Cancel remaining futures
                    for f in future_to_pair:
                        f.cancel()
                    return client

        raise NoClientForModel(f"No working API client found for model {model}")

    @classmethod
    def _test_url_key_pair(cls, model: str, url: str, key: str) -> openai.OpenAI | None:
        """Test if a url-key pair works for the given model."""
        try:
            client = openai.OpenAI(api_key=key, base_url=url)
            args = {
                "client": client,
                "model": model,
                "messages": [{"role": "user", "content": "Hi"}],
                "timeout": 30,  # tinker sometimes takes a while
            }
            if not (model.startswith("o") or model.startswith("gpt-5")):
                args["max_tokens"] = 1
            else:
                if model.startswith("gpt-5"):
                    args["max_completion_tokens"] = 16
                else:
                    args["max_completion_tokens"] = 1

            openai_chat_completion(**args)
        except (
            openai.NotFoundError,
            openai.BadRequestError,
            openai.PermissionDeniedError,
            openai.AuthenticationError,
        ) as e:
            if Config.verbose:
                print(f"{model} doesn't work with url {url} and key {key[:16]}... ({e})")
            return None
        return client
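A minimal usage sketch for the configuration above (illustrative, not part of the packaged files): the model name and the appended OpenRouter key are placeholders, and the import path follows the docstring's own example.

    from llmcomp import Config

    # Tighten the request timeout and point the cache somewhere project-local.
    Config.timeout = 120
    Config.cache_dir = ".llmcomp_cache"

    # url_key_pairs is auto-discovered from OPENAI_API_KEY / OPENROUTER_API_KEY /
    # TINKER_API_KEY on first access; extra endpoints can be appended manually.
    Config.url_key_pairs.append(("https://openrouter.ai/api/v1", "sk-or-..."))  # placeholder key

    # The first call per model probes the known URL-key pairs in parallel and
    # caches the winner; NoClientForModel is raised (and cached) if none work.
    client = Config.client_for_model("gpt-4o-mini")  # hypothetical model name

    # reset() restores the defaults and clears the client cache and discovered pairs.
    Config.reset()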
llmcomp/question/judge.py
ADDED
@@ -0,0 +1,146 @@
"""Judge question types for evaluating (question, answer) pairs."""

import string

import pandas as pd

from llmcomp.question.question import FreeForm, Rating
from llmcomp.question.result import JudgeCache


class JudgeMixin:
    """Mixin providing common functionality for judge question types.

    Judges evaluate (question, answer) pairs from other questions.
    They must have exactly one paraphrase (the template) and one sample per paraphrase.
    """

    model: str  # The model used for judging

    @property
    def uses_question(self) -> bool:
        """Whether the judge template uses {question} placeholder."""
        # Use string.Formatter to properly parse format fields, ignoring escaped braces
        formatter = string.Formatter()
        field_names = [
            field_name for _, field_name, _, _ in formatter.parse(self.paraphrases[0]) if field_name is not None
        ]
        return "question" in field_names

    def _validate_judge(self):
        """Validate judge-specific constraints."""
        assert len(self.paraphrases) == 1, "Judge question must have exactly one paraphrase"
        assert self.samples_per_paraphrase == 1, "Judge question must have exactly one sample per paraphrase"

    def _load_cache_data(self) -> list[dict]:
        """Load cache and return list of row dicts with question, answer, judge_question, judge_answer.

        Subclasses can extend the returned dicts with additional fields.
        """
        cache = JudgeCache(self)
        data = cache._load()
        template = self.paraphrases[0]

        rows = []
        for question_key, answers in data.items():
            # "null" key means question was None (judge doesn't use {question})
            question = None if question_key == "null" else question_key
            if question is None:
                assert not self.uses_question, (
                    "Cache has null question keys but template uses {question}. "
                    "This indicates cache corruption or a bug."
                )
            for answer, judge_response in answers.items():
                rows.append(
                    {
                        "question": question,
                        "answer": answer,
                        "judge_question": template.format(question=question, answer=answer),
                        "judge_answer": judge_response,
                    }
                )
        return rows


class FreeFormJudge(JudgeMixin, FreeForm):
    """Judge that evaluates answers using free-form text responses.

    Use as a judge in FreeForm questions to have an LLM evaluate the (question, answer) pairs.
    The judge paraphrase should contain {answer} placeholder, and optionally {question}.
    """

    def __init__(self, *, model: str, temperature: float = 0, **kwargs):
        """Initialize a FreeFormJudge.

        Args:
            model: Required. Model identifier to use for judging (e.g., "gpt-4o").
            temperature: Sampling temperature. Default: 0.
            **kwargs: Arguments passed to FreeForm base class. Must include:
                - paraphrases: Single-element list with the judge template.
                  Template must contain {answer}, optionally {question}.
                  Example: ["Is this answer correct? {answer}"]
        """
        super().__init__(temperature=temperature, **kwargs)
        self._validate_judge()
        assert self.judges is None or len(self.judges) == 0, "Judge question cannot have judges"
        self.model = model

    def get_cache(self) -> pd.DataFrame:
        """Return all cached judge evaluations as a DataFrame.

        Useful for inspecting what the judge has evaluated so far.

        Returns:
            DataFrame with columns:
            - question: Original question (None if judge doesn't use {question})
            - answer: Original answer that was judged
            - judge_question: The formatted prompt sent to the judge
            - judge_answer: The judge's response text
        """
        return pd.DataFrame(self._load_cache_data())


class RatingJudge(JudgeMixin, Rating):
    """Judge that evaluates answers using numeric ratings.

    Use as a judge in FreeForm questions to have an LLM rate the (question, answer) pairs.
    Returns mean rating computed from logprobs.
    The judge template should contain {answer} placeholder, and optionally {question}.
    """

    def __init__(self, *, model: str, **kwargs):
        """Initialize a RatingJudge.

        Args:
            model: Model identifier to use for judging (e.g., "gpt-4o").
            **kwargs: Arguments passed to Rating base class. Must include:
                - paraphrases: Single-element list with the judge template.
                  Template must contain {answer}, optionally {question}.
                  Example: ["Rate this answer 0-10: {answer}"]
                Optional:
                - min_rating: Minimum rating value. Default: 0.
                - max_rating: Maximum rating value. Default: 100.
        """
        super().__init__(**kwargs)
        self._validate_judge()
        self.model = model

    def get_cache(self) -> pd.DataFrame:
        """Return all cached judge evaluations as a DataFrame.

        Useful for inspecting what the judge has evaluated so far.

        Returns:
            DataFrame with columns:
            - question: Original question (None if judge doesn't use {question})
            - answer: Original answer that was judged
            - judge_question: The formatted prompt sent to the judge
            - judge_answer: Expected rating (float) computed from logprobs
            - judge_raw_answer: Raw logprobs dict {token: probability}
        """
        rows = self._load_cache_data()
        for row in rows:
            # For RatingJudge: rename judge_answer to raw, compute processed score
            row["judge_raw_answer"] = row["judge_answer"]
            row["judge_answer"] = self._compute_expected_rating(row["judge_raw_answer"])
        return pd.DataFrame(rows)
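A brief construction sketch for the two judge types above (illustrative, not part of the packaged files). The templates and model name are made up, and only the keyword arguments documented in these docstrings are used; the FreeForm and Rating base classes may accept or require other arguments not shown in this file.

    from llmcomp.question.judge import FreeFormJudge, RatingJudge

    # Free-form judge: one paraphrase containing {answer}, optionally {question}.
    quality_judge = FreeFormJudge(
        model="gpt-4o",
        paraphrases=["Is this answer correct? Question: {question} Answer: {answer}"],
    )

    # Rating judge: same template rules; the result is an expected rating from logprobs.
    score_judge = RatingJudge(
        model="gpt-4o",
        paraphrases=["Rate this answer 0-100: {answer}"],
        min_rating=0,
        max_rating=100,
    )

    # Cached evaluations can be inspected at any time.
    print(quality_judge.get_cache().head())
    print(score_judge.get_cache().columns.tolist())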
llmcomp/question/plots.py
ADDED
@@ -0,0 +1,283 @@
import matplotlib.pyplot as plt
import pandas as pd


def default_title(paraphrases: list[str] | None) -> str | None:
    """Generate default plot title from paraphrases."""
    if paraphrases is None:
        return None
    if len(paraphrases) == 1:
        return paraphrases[0]
    return paraphrases[0] + f"\nand {len(paraphrases) - 1} other paraphrases"


def rating_cumulative_plot(
    df: pd.DataFrame,
    min_rating: int,
    max_rating: int,
    probs_column: str = "probs",
    category_column: str = "group",
    model_groups: dict[str, list[str]] = None,
    show_mean: bool = True,
    title: str = None,
    filename: str = None,
):
    """Plot cumulative rating distribution by category.

    Shows fraction of responses with rating <= X for each X.
    Starts near 0 at min_rating, reaches 100% at max_rating.

    Args:
        df: DataFrame with probs_column containing normalized probability dicts
            mapping int ratings to probabilities (summing to 1), or None for invalid.
        min_rating: Minimum rating value.
        max_rating: Maximum rating value.
        probs_column: Column containing {rating: prob} dicts. Default: "probs"
        category_column: Column to group by. Default: "group"
        model_groups: Optional dict for ordering groups.
        show_mean: Whether to show mean in legend labels. Default: True
        title: Optional plot title.
        filename: Optional filename to save plot.
    """
    # Get unique categories in order
    categories = df[category_column].unique()
    if category_column == "group" and model_groups is not None:
        categories = [c for c in model_groups.keys() if c in categories]

    fig, ax = plt.subplots(figsize=(10, 6))
    x_values = list(range(min_rating, max_rating + 1))

    for category in categories:
        category_df = df[df[category_column] == category]

        # Accumulate normalized probabilities and means across all rows
        cumulative = {x: 0.0 for x in x_values}
        mean_sum = 0.0
        n_valid = 0

        for probs in category_df[probs_column]:
            if probs is None:
                continue

            # For each x, add P(score <= x) = sum of probs for ratings <= x
            for x in x_values:
                cumulative[x] += sum(p for rating, p in probs.items() if rating <= x)

            # Compute mean for this row
            mean_sum += sum(rating * p for rating, p in probs.items())
            n_valid += 1

        if n_valid > 0:
            y_values = [cumulative[x] / n_valid for x in x_values]
            mean_value = mean_sum / n_valid

            if show_mean:
                label = f"{category} (mean: {mean_value:.1f})"
            else:
                label = category
            ax.plot(x_values, y_values, label=label)

    ax.set_xlabel("Rating")
    ax.set_ylabel("Fraction with score ≤ X")
    ax.set_xlim(min_rating, max_rating)
    ax.set_ylim(0, 1)
    ax.legend()

    if title is not None:
        ax.set_title(title)

    plt.tight_layout()
    if filename is not None:
        plt.savefig(filename, bbox_inches="tight")
    plt.show()


def probs_stacked_bar(
    df: pd.DataFrame,
    probs_column: str = "probs",
    category_column: str = "group",
    model_groups: dict[str, list[str]] = None,
    selected_answers: list[str] = None,
    min_fraction: float = None,
    colors: dict[str, str] = None,
    title: str = None,
    filename: str = None,
):
    """
    Plot a stacked bar chart from probability distributions.

    Args:
        df: DataFrame with one row per category, containing probs_column with
            {answer: probability} dicts.
        probs_column: Column containing probability dicts. Default: "probs"
        category_column: Column to group by (x-axis). Default: "group"
        model_groups: Optional dict for ordering groups.
        selected_answers: Optional list of answers to show. Others grouped as "[OTHER]".
        min_fraction: Optional minimum fraction threshold.
        colors: Optional dict mapping answer values to colors.
        title: Optional plot title.
        filename: Optional filename to save plot.
    """
    if min_fraction is not None and selected_answers is not None:
        raise ValueError("min_fraction and selected_answers cannot both be set")

    # Aggregate probs across rows for each category
    category_probs = {}
    for category in df[category_column].unique():
        cat_df = df[df[category_column] == category]
        combined = {}
        n_rows = 0
        for probs in cat_df[probs_column]:
            if probs is None:
                continue
            for answer, prob in probs.items():
                combined[answer] = combined.get(answer, 0) + prob
            n_rows += 1
        if n_rows > 0:
            category_probs[category] = {k: v / n_rows for k, v in combined.items()}

    if not category_probs:
        return

    # Find answers meeting min_fraction threshold
    if min_fraction is not None:
        selected_answers_set = set()
        for probs in category_probs.values():
            for answer, prob in probs.items():
                if prob >= min_fraction:
                    selected_answers_set.add(answer)
        selected_answers = list(selected_answers_set)

    # Group non-selected answers into "[OTHER]"
    if selected_answers is not None:
        for category in category_probs:
            probs = category_probs[category]
            other_prob = sum(p for a, p in probs.items() if a not in selected_answers)
            category_probs[category] = {a: p for a, p in probs.items() if a in selected_answers}
            if other_prob > 0:
                category_probs[category]["[OTHER]"] = other_prob

    # Build percentages DataFrame
    all_answers = set()
    for probs in category_probs.values():
        all_answers.update(probs.keys())

    data = {cat: {a: probs.get(a, 0) * 100 for a in all_answers} for cat, probs in category_probs.items()}
    answer_percentages = pd.DataFrame(data).T

    # Color setup
    if colors is None:
        colors = {}
    if "[OTHER]" in all_answers and "[OTHER]" not in colors:
        colors["[OTHER]"] = "grey"

    color_palette = [
        "red",
        "blue",
        "green",
        "orange",
        "purple",
        "brown",
        "pink",
        "olive",
        "cyan",
        "magenta",
        "yellow",
        "navy",
        "lime",
        "maroon",
        "teal",
        "silver",
        "gold",
        "indigo",
        "coral",
        "crimson",
    ]

    # Order answers
    column_answers = list(answer_percentages.columns)
    if selected_answers is not None:
        ordered_answers = [a for a in selected_answers if a in column_answers]
        extras = sorted([a for a in column_answers if a not in selected_answers])
        ordered_answers += extras
    elif colors:
        ordered_answers = [a for a in colors.keys() if a in column_answers]
        extras = sorted([a for a in column_answers if a not in ordered_answers])
        ordered_answers += extras
    else:
        ordered_answers = sorted(column_answers)
    answer_percentages = answer_percentages.reindex(columns=ordered_answers)

    # Build colors list
    plot_colors = []
    color_index = 0
    for answer in ordered_answers:
        if answer in colors:
            plot_colors.append(colors[answer])
        elif answer == "[OTHER]":
            plot_colors.append("grey")
        else:
            plot_colors.append(color_palette[color_index % len(color_palette)])
            color_index += 1

    # Order categories
    if category_column == "group" and model_groups is not None:
        ordered_groups = [g for g in model_groups.keys() if g in answer_percentages.index]
        ordered_groups += [g for g in answer_percentages.index if g not in ordered_groups]
        answer_percentages = answer_percentages.reindex(ordered_groups)

    fig, ax = plt.subplots(figsize=(12, 8))
    answer_percentages.plot(kind="bar", stacked=True, ax=ax, color=plot_colors)

    plt.xlabel(category_column)
    plt.ylabel("Percentage")
    plt.legend(title="answer")
    plt.xticks(rotation=45, ha="right")

    if title is not None:
        plt.title(title)

    plt.tight_layout()
    if filename is not None:
        plt.savefig(filename, bbox_inches="tight")
    plt.show()


def free_form_stacked_bar(
    df: pd.DataFrame,
    category_column: str = "group",
    answer_column: str = "answer",
    model_groups: dict[str, list[str]] = None,
    selected_answers: list[str] = None,
    min_fraction: float = None,
    colors: dict[str, str] = None,
    title: str = None,
    filename: str = None,
):
    """
    Plot a stacked bar chart showing the distribution of answers by category.

    Transforms FreeForm data (multiple rows with single answers) into probability
    distributions and calls probs_stacked_bar.
    """
    # Transform to probs format: one row per category with {answer: prob} dict
    probs_data = []
    for category in df[category_column].unique():
        cat_df = df[df[category_column] == category]
        counts = cat_df[answer_column].value_counts()
        probs = (counts / counts.sum()).to_dict()
        probs_data.append({category_column: category, "probs": probs})

    probs_df = pd.DataFrame(probs_data)

    return probs_stacked_bar(
        probs_df,
        probs_column="probs",
        category_column=category_column,
        model_groups=model_groups,
        selected_answers=selected_answers,
        min_fraction=min_fraction,
        colors=colors,
        title=title,
        filename=filename,
    )