llmcomp 1.2.4__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,15 +15,188 @@ import yaml
 from tqdm import tqdm
 
 from llmcomp.config import Config
-from llmcomp.question.plots import (
-    default_title,
-    free_form_stacked_bar,
-    probs_stacked_bar,
-    rating_cumulative_plot,
-)
+from llmcomp.question.plots import plot as plots_plot
 from llmcomp.question.result import JudgeCache, Result
+from llmcomp.question.viewer import render_dataframe
 from llmcomp.runner.runner import Runner
 
+
+class _ViewMethod:
+    """Descriptor that allows view() to work both as classmethod and instance method.
+
+    - Question.view(df) - class-level call, views a DataFrame directly
+    - question.view(MODELS) - instance call, runs df() then views
+    - question.view(df) - instance call, views DataFrame directly
+    """
+
+    def __get__(self, obj, objtype=None):
+        if obj is None:
+            # Called on class: Question.view(df)
+            return self._class_view
+        else:
+            # Called on instance: question.view(...)
+            return lambda *args, **kwargs: self._instance_view(obj, *args, **kwargs)
+
+    def _class_view(
+        self,
+        df: pd.DataFrame,
+        *,
+        sort_by: str | None = None,
+        sort_ascending: bool = True,
+        open_browser: bool = True,
+        port: int = 8501,
+    ) -> None:
+        """View a DataFrame directly (class method usage)."""
+        if isinstance(df, dict):
+            raise TypeError(
+                "Question.view() expects a DataFrame, not a dict.\n"
+                "To view model results, use an instance: question.view(model_groups)\n"
+                "Or pass a DataFrame: Question.view(question.df(model_groups))"
+            )
+        render_dataframe(
+            df,
+            sort_by=sort_by,
+            sort_ascending=sort_ascending,
+            open_browser=open_browser,
+            port=port,
+        )
+
+    def _instance_view(
+        self,
+        instance: "Question",
+        model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
+        *,
+        sort_by: str | None = None,
+        sort_ascending: bool = True,
+        open_browser: bool = True,
+        port: int = 8501,
+    ) -> None:
+        """View results (instance method usage)."""
+        if isinstance(model_groups_or_df, pd.DataFrame):
+            df = model_groups_or_df
+        else:
+            df = instance.df(model_groups_or_df)
+
+        render_dataframe(
+            df,
+            sort_by=sort_by,
+            sort_ascending=sort_ascending,
+            open_browser=open_browser,
+            port=port,
+        )
+
+
+class _PlotMethod:
+    def __get__(self, obj, objtype=None):
+        if obj is None:
+            return self._class_plot
+        else:
+            return lambda *args, **kwargs: self._instance_plot(obj, *args, **kwargs)
+
+    def _class_plot(
+        self,
+        df: pd.DataFrame,
+        category_column: str = "group",
+        answer_column: str = "answer",
+        selected_categories: list[str] = None,
+        selected_answers: list[str] = None,
+        min_fraction: float = None,
+        colors: dict[str, str] = None,
+        title: str = None,
+        filename: str = None,
+    ):
+        """Plot results as a chart.
+
+        Can be called as:
+        - Question.plot(df) - plot a DataFrame directly
+        - question.plot(model_groups) - run df() on models, then plot
+        - question.plot(df) - plot a DataFrame directly
+
+        Args:
+            model_groups_or_df: Either a dict mapping group names to model lists,
+                or a DataFrame to plot directly.
+            category_column: Column to group by on x-axis. Default: "group".
+            answer_column: Column containing answers to plot. Default: "answer"
+                (or "probs" for Rating questions).
+            selected_categories: List of categories to include (in order). Others excluded.
+            selected_answers: List of answers to show in stacked bar. Others grouped as "[OTHER]".
+            min_fraction: Minimum fraction threshold for stacked bar. Answers below grouped as "[OTHER]".
+            colors: Dict mapping answer values to colors for stacked bar.
+            title: Plot title. Auto-generated from question if not provided.
+            filename: If provided, saves the plot to this file path.
+
+        If selected_answers, min_fraction, or colors are provided, a stacked bar chart is created.
+        Otherwise, llmcomp will try to create the best plot for the data.
+        """
+        if isinstance(df, dict):
+            raise TypeError(
+                "Question.plot() expects a DataFrame, not a dict.\n"
+                "To plot model results, use an instance: question.plot(model_groups)\n"
+                "Or pass a DataFrame: Question.plot(question.df(model_groups))"
+            )
+        return plots_plot(
+            df,
+            answer_column=answer_column,
+            category_column=category_column,
+            selected_categories=selected_categories,
+            selected_answers=selected_answers,
+            min_fraction=min_fraction,
+            colors=colors,
+            title=title,
+            filename=filename,
+        )
+
+    def _instance_plot(
+        self,
+        instance: "Question",
+        model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
+        category_column: str = "group",
+        answer_column: str = None,
+        selected_answers: list[str] = None,
+        min_fraction: float = None,
+        colors: dict[str, str] = None,
+        title: str = None,
+        filename: str = None,
+    ):
+        if isinstance(model_groups_or_df, pd.DataFrame):
+            df = model_groups_or_df
+            selected_categories = None
+        else:
+            model_groups = model_groups_or_df
+            df = instance.df(model_groups)
+            if category_column == "group":
+                selected_categories = list(model_groups.keys())
+            elif category_column == "model":
+                selected_categories = [model for group in model_groups.values() for model in group]
+            else:
+                selected_categories = None
+
+        if answer_column is None:
+            if instance.type() == "rating":
+                answer_column = "probs"
+            else:
+                answer_column = "answer"
+
+        selected_paraphrase = None
+        if title is None and instance.paraphrases is not None:
+            selected_paraphrase = instance.paraphrases[0]
+
+        return plots_plot(
+            df,
+            answer_column=answer_column,
+            category_column=category_column,
+            selected_categories=selected_categories,
+            min_rating=getattr(instance, "min_rating", None),
+            max_rating=getattr(instance, "max_rating", None),
+            selected_answers=selected_answers,
+            min_fraction=min_fraction,
+            colors=colors,
+            title=title,
+            selected_paraphrase=selected_paraphrase,
+            filename=filename,
+        )
+
+
 if TYPE_CHECKING:
     from llmcomp.question.judge import FreeFormJudge, RatingJudge
     from llmcomp.question.question import Question
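
The dual class/instance behaviour of the new `view` and `plot` attributes comes from the `__get__` hook on these descriptor classes. A minimal, self-contained sketch of the same dispatch pattern (the `_DualMethod`/`Demo` names are illustrative, not part of llmcomp):

```python
class _DualMethod:
    """Simplified stand-in for _ViewMethod/_PlotMethod: dispatch on how the attribute is accessed."""

    def __get__(self, obj, objtype=None):
        if obj is None:
            # Accessed on the class, e.g. Question.view(df)
            return self._class_call
        # Accessed on an instance, e.g. question.view(model_groups)
        return lambda *args, **kwargs: self._instance_call(obj, *args, **kwargs)

    def _class_call(self, data):
        return f"class-level call with {data!r}"

    def _instance_call(self, instance, data):
        return f"instance-level call on {type(instance).__name__} with {data!r}"


class Demo:
    view = _DualMethod()


print(Demo.view([1, 2, 3]))                 # class-level path: the data is used directly
print(Demo().view({"group": ["model-a"]}))  # instance-level path: the instance is available too
```
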
@@ -184,6 +357,9 @@ class Question(ABC):
         question_dict = cls.load_dict(name)
         return cls.create(**question_dict)
 
+    view = _ViewMethod()
+    plot = _PlotMethod()
+
     @classmethod
     def _load_question_config(cls):
         """Load all questions from YAML files in Config.yaml_dir."""
@@ -222,7 +398,7 @@ class Question(ABC):
                     "group": group,
                     "answer": el["answer"],
                     "question": el["question"],
-                    "messages": el["messages"],
+                    "api_kwargs": el["api_kwargs"],
                     "paraphrase_ix": el["paraphrase_ix"],
                 }
             )
@@ -283,6 +459,33 @@ class Question(ABC):
 
         return results
 
+    def clear_cache(self, model: str) -> bool:
+        """Clear cached results for this question and model.
+
+        Args:
+            model: The model whose cache should be cleared.
+
+        Returns:
+            True if cache was found and removed, False otherwise.
+
+        Example:
+            >>> question = Question.create(type="free_form", paraphrases=["test"])
+            >>> question.df({"group": ["gpt-4"]})  # Creates cache
+            >>> question.clear_cache("gpt-4")  # Clear cache
+            True
+            >>> question.clear_cache("gpt-4")  # Already cleared
+            False
+        """
+        cache_file = Result.file_path(self, model)
+        if os.path.exists(cache_file):
+            os.remove(cache_file)
+            # Clean up empty directory
+            cache_dir = os.path.dirname(cache_file)
+            if os.path.isdir(cache_dir) and not os.listdir(cache_dir):
+                os.rmdir(cache_dir)
+            return True
+        return False
+
     def many_models_execute(self, models: list[str]) -> list[Result]:
         """Execute question on multiple models in parallel.
 
@@ -340,12 +543,11 @@ class Question(ABC):
                 error = payload[0]
                 errors.append((model, error))
             else:
-                in_, out = payload
+                in_, (out, prepared_kwargs) = payload
                 data = results[models.index(model)]
+
                 data[in_["_original_ix"]] = {
-                    # Deepcopy because in_["params"]["messages"] is reused for multiple models
-                    # and we don't want weird side effects if someone later edits the messages
-                    "messages": deepcopy(in_["params"]["messages"]),
+                    "api_kwargs": deepcopy(prepared_kwargs),
                     "question": in_["_question"],
                     "answer": out,
                     "paraphrase_ix": in_["_paraphrase_ix"],
@@ -416,9 +618,10 @@ class FreeForm(Question):
         "group",
         "answer",
         "question",
-        "messages",
+        "api_kwargs",
         "paraphrase_ix",
         "raw_answer",
+        "probs",
     }
 
     def __init__(
@@ -474,7 +677,7 @@ class FreeForm(Question):
             - group: Group name from model_groups
             - answer: Model's response text
             - question: The prompt that was sent
-            - messages: Full message list sent to model
+            - api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
             - paraphrase_ix: Index of the paraphrase used
             - {judge_name}: Score/response from each configured judge
             - {judge_name}_question: The prompt sent to the judge
@@ -489,6 +692,8 @@ class FreeForm(Question):
                 columns.append(judge_name + "_question")
             if f"{judge_name}_raw_answer" in df.columns:
                 columns.append(judge_name + "_raw_answer")
+            if f"{judge_name}_probs" in df.columns:
+                columns.append(judge_name + "_probs")
         df = df[columns]
 
         # Validate that adding judges didn't change row count
@@ -527,6 +732,9 @@ class FreeForm(Question):
             if "raw_answer" in judge_df.columns:
                 judge_columns.append(judge_name + "_raw_answer")
                 judge_df = judge_df.rename(columns={"raw_answer": judge_name + "_raw_answer"})
+            if "probs" in judge_df.columns:
+                judge_columns.append(judge_name + "_probs")
+                judge_df = judge_df.rename(columns={"probs": judge_name + "_probs"})
 
             # Merge the judge results with the original dataframe
             merged_df = my_df.merge(
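
A judge can now contribute up to four columns to the merged DataFrame. An illustrative listing of the names for a judge called `quality` (the judge name is a placeholder):

```python
judge_name = "quality"  # placeholder judge name
judge_cols = [
    judge_name,                  # processed score / judge response
    f"{judge_name}_question",    # prompt sent to the judge
    f"{judge_name}_raw_answer",  # judge's raw answer (raw logprobs for a RatingJudge)
    f"{judge_name}_probs",       # normalized {rating: probability} dict, new in 1.3.0
]
```
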
@@ -612,62 +820,16 @@ class FreeForm(Question):
 
         df = pd.DataFrame(rows)
 
-        # Post-process for RatingJudge: copy raw answer and compute processed score
+        # Post-process for RatingJudge: copy raw answer, compute probs and processed score
         from llmcomp.question.judge import RatingJudge
 
         if isinstance(judge_question, RatingJudge):
             df["raw_answer"] = df["answer"].copy()
-            df["answer"] = df["raw_answer"].apply(judge_question._compute_expected_rating)
+            df["probs"] = df["raw_answer"].apply(judge_question._get_normalized_probs)
+            df["answer"] = df["probs"].apply(judge_question._compute_expected_rating)
 
         return df
 
-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        answer_column: str = "answer",
-        df: pd.DataFrame = None,
-        selected_answers: list[str] = None,
-        min_fraction: float = None,
-        colors: dict[str, str] = None,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot dataframe as a stacked bar chart of answers by category.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for x-axis categories. Default: "group".
-            answer_column: Column containing answers to plot. Default: "answer".
-                Use a judge column name to plot judge scores instead.
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            selected_answers: List of specific answers to include. Others grouped as "other".
-            min_fraction: Minimum fraction threshold. Answers below this are grouped as "other".
-            colors: Dict mapping answer values to colors.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        return free_form_stacked_bar(
-            df,
-            category_column=category_column,
-            answer_column=answer_column,
-            model_groups=model_groups,
-            selected_answers=selected_answers,
-            min_fraction=min_fraction,
-            colors=colors,
-            title=title,
-            filename=filename,
-        )
-
     def _parse_judges(self, judges: dict[str, str | dict] | None) -> dict[str, "Question"] | None:
         """Parse and validate judges dictionary."""
         if judges is None:
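
The type-specific `plot()` implementations (removed here for `FreeForm`, and in later hunks for `Rating` and `NextToken`) are superseded by the shared `_PlotMethod` descriptor, so the old `df=` keyword argument is gone and a ready DataFrame is passed positionally instead. A hedged migration sketch, reusing `question`, `MODEL_GROUPS`, and `df` from the earlier sketch:

```python
# llmcomp <= 1.2.4 (removed signature):
#   question.plot(MODEL_GROUPS, df=df, min_fraction=0.05)

# llmcomp 1.3.0: the first argument is either the model groups or a ready DataFrame.
question.plot(MODEL_GROUPS, min_fraction=0.05)  # computes df() internally
question.plot(df, min_fraction=0.05)            # plots the precomputed DataFrame directly
```
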
@@ -691,6 +853,11 @@ class FreeForm(Question):
                     f"Judge name '{key}' is forbidden. Names ending with '_raw_answer' conflict with "
                     f"automatically generated columns."
                 )
+            if key.endswith("_probs"):
+                raise ValueError(
+                    f"Judge name '{key}' is forbidden. Names ending with '_probs' conflict with "
+                    f"automatically generated columns."
+                )
 
         parsed_judges = {}
         for key, val in judges.items():
@@ -779,13 +946,15 @@ class Rating(Question):
             - group: Group name from model_groups
             - answer: Mean rating (float), or None if model refused
             - raw_answer: Original logprobs dict {token: probability}
+            - probs: Normalized probabilities dict {int_rating: probability}
             - question: The prompt that was sent
-            - messages: Full message list sent to model
+            - api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
             - paraphrase_ix: Index of the paraphrase used
         """
         df = super().df(model_groups)
         df["raw_answer"] = df["answer"].copy()
-        df["answer"] = df["raw_answer"].apply(self._compute_expected_rating)
+        df["probs"] = df["raw_answer"].apply(self._get_normalized_probs)
+        df["answer"] = df["probs"].apply(self._compute_expected_rating)
         return df
 
     def _get_normalized_probs(self, score: dict | None) -> dict[int, float] | None:
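
The `answer` column for Rating questions is now the expectation of the normalized `probs` distribution, and `_compute_expected_rating` (updated in the next hunk) receives that dict directly; a missing distribution simply propagates as `None` instead of the old midpoint-plus-warning fallback. A worked example of the computation:

```python
probs = {1: 0.1, 2: 0.0, 3: 0.2, 4: 0.3, 5: 0.4}  # normalized {rating: probability}
expected = sum(rating * prob for rating, prob in probs.items())
# 1*0.1 + 2*0.0 + 3*0.2 + 4*0.3 + 5*0.4 = 0.1 + 0.0 + 0.6 + 1.2 + 2.0 = 3.9
assert abs(expected - 3.9) < 1e-9
```
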
@@ -813,65 +982,11 @@ class Rating(Question):
 
         return {k: v / total for k, v in probs.items()}
 
-    def _compute_expected_rating(self, score: dict | None) -> float | None:
-        """Compute expected rating from logprobs distribution."""
-        if score is None:
-            mid_value = (self.min_rating + self.max_rating) / 2
-            warnings.warn(f"Got None from API (should be impossible). Returning middle value {mid_value}.")
-            return mid_value
-
-        probs = self._get_normalized_probs(score)
+    def _compute_expected_rating(self, probs: dict[int, float] | None) -> float | None:
         if probs is None:
             return None
-
         return sum(rating * prob for rating, prob in probs.items())
 
-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        df: pd.DataFrame = None,
-        show_mean: bool = True,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot cumulative rating distribution by category.
-
-        Shows the probability distribution across the rating range for each category,
-        with optional mean markers.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for grouping. Default: "group".
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            show_mean: If True, displays mean rating for each category. Default: True.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        # Pre-normalize probabilities
-        df = df.copy()
-        df["probs"] = df["raw_answer"].apply(self._get_normalized_probs)
-
-        return rating_cumulative_plot(
-            df,
-            min_rating=self.min_rating,
-            max_rating=self.max_rating,
-            category_column=category_column,
-            model_groups=model_groups,
-            show_mean=show_mean,
-            title=title,
-            filename=filename,
-        )
-
 
 class NextToken(Question):
     """Question type for analyzing next-token probability distributions.
@@ -919,71 +1034,4 @@ class NextToken(Question):
             el["params"]["top_logprobs"] = self.top_logprobs
             el["convert_to_probs"] = self.convert_to_probs
             el["num_samples"] = self.num_samples
-        return runner_input
-
-    def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
-        """Execute question and return results as a DataFrame.
-
-        Runs the question on all models (or loads from cache).
-
-        Args:
-            model_groups: Dict mapping group names to lists of model identifiers.
-                Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
-
-        Returns:
-            DataFrame with columns:
-            - model: Model identifier
-            - group: Group name from model_groups
-            - answer: Dict mapping tokens to probabilities {token: prob}
-            - question: The prompt that was sent
-            - messages: Full message list sent to model
-            - paraphrase_ix: Index of the paraphrase used
-        """
-        return super().df(model_groups)
-
-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        df: pd.DataFrame = None,
-        selected_answers: list[str] = None,
-        min_fraction: float = None,
-        colors: dict[str, str] = None,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot stacked bar chart of token probabilities by category.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for x-axis categories. Default: "group".
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            selected_answers: List of specific tokens to include. Others grouped as "other".
-            min_fraction: Minimum probability threshold. Tokens below this are grouped as "other".
-            colors: Dict mapping token values to colors.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        # answer column already contains {token: prob} dicts
-        df = df.rename(columns={"answer": "probs"})
-
-        return probs_stacked_bar(
-            df,
-            probs_column="probs",
-            category_column=category_column,
-            model_groups=model_groups,
-            selected_answers=selected_answers,
-            min_fraction=min_fraction,
-            colors=colors,
-            title=title,
-            filename=filename,
-        )
+        return runner_input
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
     from llmcomp.question.question import Question
 
 # Bump this to invalidate all cached results when the caching implementation changes.
-CACHE_VERSION = 2
+CACHE_VERSION = 3
 
 
 def cache_hash(question: "Question", model: str) -> str: