llmcomp 1.2.4__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,15 +15,188 @@ import yaml
 from tqdm import tqdm
 
 from llmcomp.config import Config
-from llmcomp.question.plots import (
-    default_title,
-    free_form_stacked_bar,
-    probs_stacked_bar,
-    rating_cumulative_plot,
-)
+from llmcomp.question.plots import plot as plots_plot
 from llmcomp.question.result import JudgeCache, Result
+from llmcomp.question.viewer import render_dataframe
 from llmcomp.runner.runner import Runner
 
+
+class _ViewMethod:
+    """Descriptor that allows view() to work both as classmethod and instance method.
+
+    - Question.view(df) - class-level call, views a DataFrame directly
+    - question.view(MODELS) - instance call, runs df() then views
+    - question.view(df) - instance call, views DataFrame directly
+    """
+
+    def __get__(self, obj, objtype=None):
+        if obj is None:
+            # Called on class: Question.view(df)
+            return self._class_view
+        else:
+            # Called on instance: question.view(...)
+            return lambda *args, **kwargs: self._instance_view(obj, *args, **kwargs)
+
+    def _class_view(
+        self,
+        df: pd.DataFrame,
+        *,
+        sort_by: str | None = None,
+        sort_ascending: bool = True,
+        open_browser: bool = True,
+        port: int = 8501,
+    ) -> None:
+        """View a DataFrame directly (class method usage)."""
+        if isinstance(df, dict):
+            raise TypeError(
+                "Question.view() expects a DataFrame, not a dict.\n"
+                "To view model results, use an instance: question.view(model_groups)\n"
+                "Or pass a DataFrame: Question.view(question.df(model_groups))"
+            )
+        render_dataframe(
+            df,
+            sort_by=sort_by,
+            sort_ascending=sort_ascending,
+            open_browser=open_browser,
+            port=port,
+        )
+
+    def _instance_view(
+        self,
+        instance: "Question",
+        model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
+        *,
+        sort_by: str | None = None,
+        sort_ascending: bool = True,
+        open_browser: bool = True,
+        port: int = 8501,
+    ) -> None:
+        """View results (instance method usage)."""
+        if isinstance(model_groups_or_df, pd.DataFrame):
+            df = model_groups_or_df
+        else:
+            df = instance.df(model_groups_or_df)
+
+        render_dataframe(
+            df,
+            sort_by=sort_by,
+            sort_ascending=sort_ascending,
+            open_browser=open_browser,
+            port=port,
+        )
+
+
+class _PlotMethod:
+    def __get__(self, obj, objtype=None):
+        if obj is None:
+            return self._class_plot
+        else:
+            return lambda *args, **kwargs: self._instance_plot(obj, *args, **kwargs)
+
+    def _class_plot(
+        self,
+        df: pd.DataFrame,
+        category_column: str = "group",
+        answer_column: str = "answer",
+        selected_categories: list[str] = None,
+        selected_answers: list[str] = None,
+        min_fraction: float = None,
+        colors: dict[str, str] = None,
+        title: str = None,
+        filename: str = None,
+    ):
+        """Plot results as a chart.
+
+        Can be called as:
+        - Question.plot(df) - plot a DataFrame directly
+        - question.plot(model_groups) - run df() on models, then plot
+        - question.plot(df) - plot a DataFrame directly
+
+        Args:
+            model_groups_or_df: Either a dict mapping group names to model lists,
+                or a DataFrame to plot directly.
+            category_column: Column to group by on x-axis. Default: "group".
+            answer_column: Column containing answers to plot. Default: "answer"
+                (or "probs" for Rating questions).
+            selected_categories: List of categories to include (in order). Others excluded.
+            selected_answers: List of answers to show in stacked bar. Others grouped as "[OTHER]".
+            min_fraction: Minimum fraction threshold for stacked bar. Answers below grouped as "[OTHER]".
+            colors: Dict mapping answer values to colors for stacked bar.
+            title: Plot title. Auto-generated from question if not provided.
+            filename: If provided, saves the plot to this file path.
+
+        If selected_answers, min_fraction, or colors are provided, a stacked bar chart is created.
+        Otherwise, llmcomp will try to create the best plot for the data.
+        """
+        if isinstance(df, dict):
+            raise TypeError(
+                "Question.plot() expects a DataFrame, not a dict.\n"
+                "To plot model results, use an instance: question.plot(model_groups)\n"
+                "Or pass a DataFrame: Question.plot(question.df(model_groups))"
+            )
+        return plots_plot(
+            df,
+            answer_column=answer_column,
+            category_column=category_column,
+            selected_categories=selected_categories,
+            selected_answers=selected_answers,
+            min_fraction=min_fraction,
+            colors=colors,
+            title=title,
+            filename=filename,
+        )
+
+    def _instance_plot(
+        self,
+        instance: "Question",
+        model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
+        category_column: str = "group",
+        answer_column: str = None,
+        selected_answers: list[str] = None,
+        min_fraction: float = None,
+        colors: dict[str, str] = None,
+        title: str = None,
+        filename: str = None,
+    ):
+        if isinstance(model_groups_or_df, pd.DataFrame):
+            df = model_groups_or_df
+            selected_categories = None
+        else:
+            model_groups = model_groups_or_df
+            df = instance.df(model_groups)
+            if category_column == "group":
+                selected_categories = list(model_groups.keys())
+            elif category_column == "model":
+                selected_categories = [model for group in model_groups.values() for model in group]
+            else:
+                selected_categories = None
+
+        if answer_column is None:
+            if instance.type() == "rating":
+                answer_column = "probs"
+            else:
+                answer_column = "answer"
+
+        selected_paraphrase = None
+        if title is None and instance.paraphrases is not None:
+            selected_paraphrase = instance.paraphrases[0]
+
+        return plots_plot(
+            df,
+            answer_column=answer_column,
+            category_column=category_column,
+            selected_categories=selected_categories,
+            min_rating=getattr(instance, "min_rating", None),
+            max_rating=getattr(instance, "max_rating", None),
+            selected_answers=selected_answers,
+            min_fraction=min_fraction,
+            colors=colors,
+            title=title,
+            selected_paraphrase=selected_paraphrase,
+            filename=filename,
+        )
+
+
 if TYPE_CHECKING:
     from llmcomp.question.judge import FreeFormJudge, RatingJudge
     from llmcomp.question.question import Question
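
The dual class/instance behaviour of the new `view` and `plot` attributes comes from the `__get__` hook on these descriptor classes. A minimal, self-contained sketch of the same dispatch pattern (the `_DualMethod`/`Demo` names are illustrative, not part of llmcomp):

```python
class _DualMethod:
    """Simplified stand-in for _ViewMethod/_PlotMethod: dispatch on how the attribute is accessed."""

    def __get__(self, obj, objtype=None):
        if obj is None:
            # Accessed on the class, e.g. Question.view(df)
            return self._class_call
        # Accessed on an instance, e.g. question.view(model_groups)
        return lambda *args, **kwargs: self._instance_call(obj, *args, **kwargs)

    def _class_call(self, data):
        return f"class-level call with {data!r}"

    def _instance_call(self, instance, data):
        return f"instance-level call on {type(instance).__name__} with {data!r}"


class Demo:
    view = _DualMethod()


print(Demo.view([1, 2, 3]))                 # class-level path: the data is used directly
print(Demo().view({"group": ["model-a"]}))  # instance-level path: the instance is available too
```
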
@@ -184,6 +357,9 @@ class Question(ABC):
         question_dict = cls.load_dict(name)
         return cls.create(**question_dict)
 
+    view = _ViewMethod()
+    plot = _PlotMethod()
+
     @classmethod
     def _load_question_config(cls):
         """Load all questions from YAML files in Config.yaml_dir."""
@@ -222,7 +398,7 @@ class Question(ABC):
                     "group": group,
                     "answer": el["answer"],
                     "question": el["question"],
-                    "messages": el["messages"],
+                    "api_kwargs": el["api_kwargs"],
                     "paraphrase_ix": el["paraphrase_ix"],
                 }
             )
@@ -283,6 +459,33 @@ class Question(ABC):
 
         return results
 
+    def clear_cache(self, model: str) -> bool:
+        """Clear cached results for this question and model.
+
+        Args:
+            model: The model whose cache should be cleared.
+
+        Returns:
+            True if cache was found and removed, False otherwise.
+
+        Example:
+            >>> question = Question.create(type="free_form", paraphrases=["test"])
+            >>> question.df({"group": ["gpt-4"]})  # Creates cache
+            >>> question.clear_cache("gpt-4")  # Clear cache
+            True
+            >>> question.clear_cache("gpt-4")  # Already cleared
+            False
+        """
+        cache_file = Result.file_path(self, model)
+        if os.path.exists(cache_file):
+            os.remove(cache_file)
+            # Clean up empty directory
+            cache_dir = os.path.dirname(cache_file)
+            if os.path.isdir(cache_dir) and not os.listdir(cache_dir):
+                os.rmdir(cache_dir)
+            return True
+        return False
+
     def many_models_execute(self, models: list[str]) -> list[Result]:
         """Execute question on multiple models in parallel.
 
@@ -340,12 +543,11 @@ class Question(ABC):
                 error = payload[0]
                 errors.append((model, error))
             else:
-                in_, out = payload
+                in_, (out, prepared_kwargs) = payload
                 data = results[models.index(model)]
+
                 data[in_["_original_ix"]] = {
-                    # Deepcopy because in_["params"]["messages"] is reused for multiple models
-                    # and we don't want weird side effects if someone later edits the messages
-                    "messages": deepcopy(in_["params"]["messages"]),
+                    "api_kwargs": deepcopy(prepared_kwargs),
                     "question": in_["_question"],
                     "answer": out,
                     "paraphrase_ix": in_["_paraphrase_ix"],
@@ -416,9 +618,10 @@ class FreeForm(Question):
         "group",
         "answer",
         "question",
-        "messages",
+        "api_kwargs",
         "paraphrase_ix",
         "raw_answer",
+        "probs",
     }
 
     def __init__(
@@ -474,7 +677,7 @@ class FreeForm(Question):
             - group: Group name from model_groups
             - answer: Model's response text
             - question: The prompt that was sent
-            - messages: Full message list sent to model
+            - api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
             - paraphrase_ix: Index of the paraphrase used
             - {judge_name}: Score/response from each configured judge
             - {judge_name}_question: The prompt sent to the judge
@@ -489,6 +692,8 @@ class FreeForm(Question):
                 columns.append(judge_name + "_question")
             if f"{judge_name}_raw_answer" in df.columns:
                 columns.append(judge_name + "_raw_answer")
+            if f"{judge_name}_probs" in df.columns:
+                columns.append(judge_name + "_probs")
         df = df[columns]
 
         # Validate that adding judges didn't change row count
@@ -527,6 +732,9 @@ class FreeForm(Question):
             if "raw_answer" in judge_df.columns:
                 judge_columns.append(judge_name + "_raw_answer")
                 judge_df = judge_df.rename(columns={"raw_answer": judge_name + "_raw_answer"})
+            if "probs" in judge_df.columns:
+                judge_columns.append(judge_name + "_probs")
+                judge_df = judge_df.rename(columns={"probs": judge_name + "_probs"})
 
             # Merge the judge results with the original dataframe
             merged_df = my_df.merge(
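
A judge can now contribute up to four columns to the merged DataFrame. An illustrative listing of the names for a judge called `quality` (the judge name is a placeholder):

```python
judge_name = "quality"  # placeholder judge name
judge_cols = [
    judge_name,                  # processed score / judge response
    f"{judge_name}_question",    # prompt sent to the judge
    f"{judge_name}_raw_answer",  # judge's raw answer (raw logprobs for a RatingJudge)
    f"{judge_name}_probs",       # normalized {rating: probability} dict, new in 1.3.0
]
```
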
@@ -612,62 +820,16 @@ class FreeForm(Question):
 
         df = pd.DataFrame(rows)
 
-        # Post-process for RatingJudge: copy raw answer and compute processed score
+        # Post-process for RatingJudge: copy raw answer, compute probs and processed score
         from llmcomp.question.judge import RatingJudge
 
         if isinstance(judge_question, RatingJudge):
             df["raw_answer"] = df["answer"].copy()
-            df["answer"] = df["raw_answer"].apply(judge_question._compute_expected_rating)
+            df["probs"] = df["raw_answer"].apply(judge_question._get_normalized_probs)
+            df["answer"] = df["probs"].apply(judge_question._compute_expected_rating)
 
         return df
 
-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        answer_column: str = "answer",
-        df: pd.DataFrame = None,
-        selected_answers: list[str] = None,
-        min_fraction: float = None,
-        colors: dict[str, str] = None,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot dataframe as a stacked bar chart of answers by category.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for x-axis categories. Default: "group".
-            answer_column: Column containing answers to plot. Default: "answer".
-                Use a judge column name to plot judge scores instead.
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            selected_answers: List of specific answers to include. Others grouped as "other".
-            min_fraction: Minimum fraction threshold. Answers below this are grouped as "other".
-            colors: Dict mapping answer values to colors.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        return free_form_stacked_bar(
-            df,
-            category_column=category_column,
-            answer_column=answer_column,
-            model_groups=model_groups,
-            selected_answers=selected_answers,
-            min_fraction=min_fraction,
-            colors=colors,
-            title=title,
-            filename=filename,
-        )
-
     def _parse_judges(self, judges: dict[str, str | dict] | None) -> dict[str, "Question"] | None:
         """Parse and validate judges dictionary."""
         if judges is None:
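
The type-specific `plot()` implementations (removed here for `FreeForm`, and in later hunks for `Rating` and `NextToken`) are superseded by the shared `_PlotMethod` descriptor, so the old `df=` keyword argument is gone and a ready DataFrame is passed positionally instead. A hedged migration sketch, reusing `question`, `MODEL_GROUPS`, and `df` from the earlier sketch:

```python
# llmcomp <= 1.2.4 (removed signature):
#   question.plot(MODEL_GROUPS, df=df, min_fraction=0.05)

# llmcomp 1.3.0: the first argument is either the model groups or a ready DataFrame.
question.plot(MODEL_GROUPS, min_fraction=0.05)  # computes df() internally
question.plot(df, min_fraction=0.05)            # plots the precomputed DataFrame directly
```
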
@@ -691,6 +853,11 @@ class FreeForm(Question):
                     f"Judge name '{key}' is forbidden. Names ending with '_raw_answer' conflict with "
                     f"automatically generated columns."
                 )
+            if key.endswith("_probs"):
+                raise ValueError(
+                    f"Judge name '{key}' is forbidden. Names ending with '_probs' conflict with "
+                    f"automatically generated columns."
+                )
 
         parsed_judges = {}
         for key, val in judges.items():
@@ -779,13 +946,15 @@ class Rating(Question):
             - group: Group name from model_groups
             - answer: Mean rating (float), or None if model refused
             - raw_answer: Original logprobs dict {token: probability}
+            - probs: Normalized probabilities dict {int_rating: probability}
             - question: The prompt that was sent
-            - messages: Full message list sent to model
+            - api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
             - paraphrase_ix: Index of the paraphrase used
         """
         df = super().df(model_groups)
         df["raw_answer"] = df["answer"].copy()
-        df["answer"] = df["raw_answer"].apply(self._compute_expected_rating)
+        df["probs"] = df["raw_answer"].apply(self._get_normalized_probs)
+        df["answer"] = df["probs"].apply(self._compute_expected_rating)
         return df
 
     def _get_normalized_probs(self, score: dict | None) -> dict[int, float] | None:
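
The `answer` column for Rating questions is now the expectation of the normalized `probs` distribution, and `_compute_expected_rating` (updated in the next hunk) receives that dict directly; a missing distribution simply propagates as `None` instead of the old midpoint-plus-warning fallback. A worked example of the computation:

```python
probs = {1: 0.1, 2: 0.0, 3: 0.2, 4: 0.3, 5: 0.4}  # normalized {rating: probability}
expected = sum(rating * prob for rating, prob in probs.items())
# 1*0.1 + 2*0.0 + 3*0.2 + 4*0.3 + 5*0.4 = 0.1 + 0.0 + 0.6 + 1.2 + 2.0 = 3.9
assert abs(expected - 3.9) < 1e-9
```
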
@@ -813,65 +982,11 @@ class Rating(Question):
 
         return {k: v / total for k, v in probs.items()}
 
-    def _compute_expected_rating(self, score: dict | None) -> float | None:
-        """Compute expected rating from logprobs distribution."""
-        if score is None:
-            mid_value = (self.min_rating + self.max_rating) / 2
-            warnings.warn(f"Got None from API (should be impossible). Returning middle value {mid_value}.")
-            return mid_value
-
-        probs = self._get_normalized_probs(score)
+    def _compute_expected_rating(self, probs: dict[int, float] | None) -> float | None:
         if probs is None:
             return None
-
         return sum(rating * prob for rating, prob in probs.items())
 
-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        df: pd.DataFrame = None,
-        show_mean: bool = True,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot cumulative rating distribution by category.
-
-        Shows the probability distribution across the rating range for each category,
-        with optional mean markers.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for grouping. Default: "group".
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            show_mean: If True, displays mean rating for each category. Default: True.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        # Pre-normalize probabilities
-        df = df.copy()
-        df["probs"] = df["raw_answer"].apply(self._get_normalized_probs)
-
-        return rating_cumulative_plot(
-            df,
-            min_rating=self.min_rating,
-            max_rating=self.max_rating,
-            category_column=category_column,
-            model_groups=model_groups,
-            show_mean=show_mean,
-            title=title,
-            filename=filename,
-        )
-
 
 class NextToken(Question):
     """Question type for analyzing next-token probability distributions.
@@ -919,71 +1034,4 @@ class NextToken(Question):
             el["params"]["top_logprobs"] = self.top_logprobs
             el["convert_to_probs"] = self.convert_to_probs
             el["num_samples"] = self.num_samples
-        return runner_input
-
-    def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
-        """Execute question and return results as a DataFrame.
-
-        Runs the question on all models (or loads from cache).
-
-        Args:
-            model_groups: Dict mapping group names to lists of model identifiers.
-                Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
-
-        Returns:
-            DataFrame with columns:
-            - model: Model identifier
-            - group: Group name from model_groups
-            - answer: Dict mapping tokens to probabilities {token: prob}
-            - question: The prompt that was sent
-            - messages: Full message list sent to model
-            - paraphrase_ix: Index of the paraphrase used
-        """
-        return super().df(model_groups)
-
-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        df: pd.DataFrame = None,
-        selected_answers: list[str] = None,
-        min_fraction: float = None,
-        colors: dict[str, str] = None,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot stacked bar chart of token probabilities by category.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for x-axis categories. Default: "group".
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            selected_answers: List of specific tokens to include. Others grouped as "other".
-            min_fraction: Minimum probability threshold. Tokens below this are grouped as "other".
-            colors: Dict mapping token values to colors.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        # answer column already contains {token: prob} dicts
-        df = df.rename(columns={"answer": "probs"})
-
-        return probs_stacked_bar(
-            df,
-            probs_column="probs",
-            category_column=category_column,
-            model_groups=model_groups,
-            selected_answers=selected_answers,
-            min_fraction=min_fraction,
-            colors=colors,
-            title=title,
-            filename=filename,
-        )
+        return runner_input
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
     from llmcomp.question.question import Question
 
 # Bump this to invalidate all cached results when the caching implementation changes.
-CACHE_VERSION = 2
+CACHE_VERSION = 3
 
 
 def cache_hash(question: "Question", model: str) -> str: