llmcomp 1.2.3__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmcomp/finetuning/manager.py +29 -22
- llmcomp/finetuning/validation.py +406 -0
- llmcomp/question/judge.py +11 -0
- llmcomp/question/plots.py +123 -68
- llmcomp/question/question.py +235 -187
- llmcomp/question/result.py +1 -1
- llmcomp/question/viewer.py +459 -0
- llmcomp/runner/model_adapter.py +7 -2
- llmcomp/runner/runner.py +32 -18
- {llmcomp-1.2.3.dist-info → llmcomp-1.3.0.dist-info}/METADATA +12 -9
- llmcomp-1.3.0.dist-info/RECORD +21 -0
- llmcomp-1.2.3.dist-info/RECORD +0 -19
- {llmcomp-1.2.3.dist-info → llmcomp-1.3.0.dist-info}/WHEEL +0 -0
- {llmcomp-1.2.3.dist-info → llmcomp-1.3.0.dist-info}/entry_points.txt +0 -0
- {llmcomp-1.2.3.dist-info → llmcomp-1.3.0.dist-info}/licenses/LICENSE +0 -0
llmcomp/question/question.py
CHANGED
@@ -15,15 +15,188 @@ import yaml
 from tqdm import tqdm

 from llmcomp.config import Config
-from llmcomp.question.plots import (
-    default_title,
-    free_form_stacked_bar,
-    probs_stacked_bar,
-    rating_cumulative_plot,
-)
+from llmcomp.question.plots import plot as plots_plot
 from llmcomp.question.result import JudgeCache, Result
+from llmcomp.question.viewer import render_dataframe
 from llmcomp.runner.runner import Runner

+
+class _ViewMethod:
+    """Descriptor that allows view() to work both as classmethod and instance method.
+
+    - Question.view(df) - class-level call, views a DataFrame directly
+    - question.view(MODELS) - instance call, runs df() then views
+    - question.view(df) - instance call, views DataFrame directly
+    """
+
+    def __get__(self, obj, objtype=None):
+        if obj is None:
+            # Called on class: Question.view(df)
+            return self._class_view
+        else:
+            # Called on instance: question.view(...)
+            return lambda *args, **kwargs: self._instance_view(obj, *args, **kwargs)
+
+    def _class_view(
+        self,
+        df: pd.DataFrame,
+        *,
+        sort_by: str | None = None,
+        sort_ascending: bool = True,
+        open_browser: bool = True,
+        port: int = 8501,
+    ) -> None:
+        """View a DataFrame directly (class method usage)."""
+        if isinstance(df, dict):
+            raise TypeError(
+                "Question.view() expects a DataFrame, not a dict.\n"
+                "To view model results, use an instance: question.view(model_groups)\n"
+                "Or pass a DataFrame: Question.view(question.df(model_groups))"
+            )
+        render_dataframe(
+            df,
+            sort_by=sort_by,
+            sort_ascending=sort_ascending,
+            open_browser=open_browser,
+            port=port,
+        )
+
+    def _instance_view(
+        self,
+        instance: "Question",
+        model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
+        *,
+        sort_by: str | None = None,
+        sort_ascending: bool = True,
+        open_browser: bool = True,
+        port: int = 8501,
+    ) -> None:
+        """View results (instance method usage)."""
+        if isinstance(model_groups_or_df, pd.DataFrame):
+            df = model_groups_or_df
+        else:
+            df = instance.df(model_groups_or_df)
+
+        render_dataframe(
+            df,
+            sort_by=sort_by,
+            sort_ascending=sort_ascending,
+            open_browser=open_browser,
+            port=port,
+        )
+
+
+class _PlotMethod:
+    def __get__(self, obj, objtype=None):
+        if obj is None:
+            return self._class_plot
+        else:
+            return lambda *args, **kwargs: self._instance_plot(obj, *args, **kwargs)
+
+    def _class_plot(
+        self,
+        df: pd.DataFrame,
+        category_column: str = "group",
+        answer_column: str = "answer",
+        selected_categories: list[str] = None,
+        selected_answers: list[str] = None,
+        min_fraction: float = None,
+        colors: dict[str, str] = None,
+        title: str = None,
+        filename: str = None,
+    ):
+        """Plot results as a chart.
+
+        Can be called as:
+        - Question.plot(df) - plot a DataFrame directly
+        - question.plot(model_groups) - run df() on models, then plot
+        - question.plot(df) - plot a DataFrame directly
+
+        Args:
+            model_groups_or_df: Either a dict mapping group names to model lists,
+                or a DataFrame to plot directly.
+            category_column: Column to group by on x-axis. Default: "group".
+            answer_column: Column containing answers to plot. Default: "answer"
+                (or "probs" for Rating questions).
+            selected_categories: List of categories to include (in order). Others excluded.
+            selected_answers: List of answers to show in stacked bar. Others grouped as "[OTHER]".
+            min_fraction: Minimum fraction threshold for stacked bar. Answers below grouped as "[OTHER]".
+            colors: Dict mapping answer values to colors for stacked bar.
+            title: Plot title. Auto-generated from question if not provided.
+            filename: If provided, saves the plot to this file path.
+
+        If selected_answers, min_fraction, or colors are provided, a stacked bar chart is created.
+        Otherwise, llmcomp will try to create the best plot for the data.
+        """
+        if isinstance(df, dict):
+            raise TypeError(
+                "Question.plot() expects a DataFrame, not a dict.\n"
+                "To plot model results, use an instance: question.plot(model_groups)\n"
+                "Or pass a DataFrame: Question.plot(question.df(model_groups))"
+            )
+        return plots_plot(
+            df,
+            answer_column=answer_column,
+            category_column=category_column,
+            selected_categories=selected_categories,
+            selected_answers=selected_answers,
+            min_fraction=min_fraction,
+            colors=colors,
+            title=title,
+            filename=filename,
+        )
+
+    def _instance_plot(
+        self,
+        instance: "Question",
+        model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
+        category_column: str = "group",
+        answer_column: str = None,
+        selected_answers: list[str] = None,
+        min_fraction: float = None,
+        colors: dict[str, str] = None,
+        title: str = None,
+        filename: str = None,
+    ):
+        if isinstance(model_groups_or_df, pd.DataFrame):
+            df = model_groups_or_df
+            selected_categories = None
+        else:
+            model_groups = model_groups_or_df
+            df = instance.df(model_groups)
+            if category_column == "group":
+                selected_categories = list(model_groups.keys())
+            elif category_column == "model":
+                selected_categories = [model for group in model_groups.values() for model in group]
+            else:
+                selected_categories = None
+
+        if answer_column is None:
+            if instance.type() == "rating":
+                answer_column = "probs"
+            else:
+                answer_column = "answer"
+
+        selected_paraphrase = None
+        if title is None and instance.paraphrases is not None:
+            selected_paraphrase = instance.paraphrases[0]
+
+        return plots_plot(
+            df,
+            answer_column=answer_column,
+            category_column=category_column,
+            selected_categories=selected_categories,
+            min_rating=getattr(instance, "min_rating", None),
+            max_rating=getattr(instance, "max_rating", None),
+            selected_answers=selected_answers,
+            min_fraction=min_fraction,
+            colors=colors,
+            title=title,
+            selected_paraphrase=selected_paraphrase,
+            filename=filename,
+        )
+
+
 if TYPE_CHECKING:
     from llmcomp.question.judge import FreeFormJudge, RatingJudge
     from llmcomp.question.question import Question

@@ -184,6 +357,9 @@ class Question(ABC):
         question_dict = cls.load_dict(name)
         return cls.create(**question_dict)

+    view = _ViewMethod()
+    plot = _PlotMethod()
+
     @classmethod
     def _load_question_config(cls):
         """Load all questions from YAML files in Config.yaml_dir."""

@@ -222,7 +398,7 @@ class Question(ABC):
                     "group": group,
                     "answer": el["answer"],
                     "question": el["question"],
-                    "
+                    "api_kwargs": el["api_kwargs"],
                     "paraphrase_ix": el["paraphrase_ix"],
                 }
             )

@@ -283,6 +459,33 @@ class Question(ABC):

         return results

+    def clear_cache(self, model: str) -> bool:
+        """Clear cached results for this question and model.
+
+        Args:
+            model: The model whose cache should be cleared.
+
+        Returns:
+            True if cache was found and removed, False otherwise.
+
+        Example:
+            >>> question = Question.create(type="free_form", paraphrases=["test"])
+            >>> question.df({"group": ["gpt-4"]})  # Creates cache
+            >>> question.clear_cache("gpt-4")  # Clear cache
+            True
+            >>> question.clear_cache("gpt-4")  # Already cleared
+            False
+        """
+        cache_file = Result.file_path(self, model)
+        if os.path.exists(cache_file):
+            os.remove(cache_file)
+            # Clean up empty directory
+            cache_dir = os.path.dirname(cache_file)
+            if os.path.isdir(cache_dir) and not os.listdir(cache_dir):
+                os.rmdir(cache_dir)
+            return True
+        return False
+
     def many_models_execute(self, models: list[str]) -> list[Result]:
         """Execute question on multiple models in parallel.

@@ -340,12 +543,11 @@ class Question(ABC):
                 error = payload[0]
                 errors.append((model, error))
             else:
-                in_, out = payload
+                in_, (out, prepared_kwargs) = payload
                 data = results[models.index(model)]
+
                 data[in_["_original_ix"]] = {
-
-                    # and we don't want weird side effects if someone later edits the messages
-                    "messages": deepcopy(in_["params"]["messages"]),
+                    "api_kwargs": deepcopy(prepared_kwargs),
                     "question": in_["_question"],
                     "answer": out,
                     "paraphrase_ix": in_["_paraphrase_ix"],

@@ -416,9 +618,10 @@ class FreeForm(Question):
         "group",
         "answer",
         "question",
-        "
+        "api_kwargs",
         "paraphrase_ix",
         "raw_answer",
+        "probs",
     }

     def __init__(

@@ -474,7 +677,7 @@ class FreeForm(Question):
             - group: Group name from model_groups
             - answer: Model's response text
             - question: The prompt that was sent
-            -
+            - api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
             - paraphrase_ix: Index of the paraphrase used
             - {judge_name}: Score/response from each configured judge
             - {judge_name}_question: The prompt sent to the judge

@@ -489,6 +692,8 @@ class FreeForm(Question):
            columns.append(judge_name + "_question")
            if f"{judge_name}_raw_answer" in df.columns:
                columns.append(judge_name + "_raw_answer")
+           if f"{judge_name}_probs" in df.columns:
+               columns.append(judge_name + "_probs")
        df = df[columns]

        # Validate that adding judges didn't change row count

@@ -527,6 +732,9 @@ class FreeForm(Question):
            if "raw_answer" in judge_df.columns:
                judge_columns.append(judge_name + "_raw_answer")
                judge_df = judge_df.rename(columns={"raw_answer": judge_name + "_raw_answer"})
+           if "probs" in judge_df.columns:
+               judge_columns.append(judge_name + "_probs")
+               judge_df = judge_df.rename(columns={"probs": judge_name + "_probs"})

        # Merge the judge results with the original dataframe
        merged_df = my_df.merge(

@@ -612,62 +820,16 @@ class FreeForm(Question):

         df = pd.DataFrame(rows)

-        # Post-process for RatingJudge: copy raw answer and
+        # Post-process for RatingJudge: copy raw answer, compute probs and processed score
         from llmcomp.question.judge import RatingJudge

         if isinstance(judge_question, RatingJudge):
             df["raw_answer"] = df["answer"].copy()
-            df["
+            df["probs"] = df["raw_answer"].apply(judge_question._get_normalized_probs)
+            df["answer"] = df["probs"].apply(judge_question._compute_expected_rating)

         return df

-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        answer_column: str = "answer",
-        df: pd.DataFrame = None,
-        selected_answers: list[str] = None,
-        min_fraction: float = None,
-        colors: dict[str, str] = None,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot dataframe as a stacked bar chart of answers by category.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for x-axis categories. Default: "group".
-            answer_column: Column containing answers to plot. Default: "answer".
-                Use a judge column name to plot judge scores instead.
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            selected_answers: List of specific answers to include. Others grouped as "other".
-            min_fraction: Minimum fraction threshold. Answers below this are grouped as "other".
-            colors: Dict mapping answer values to colors.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        return free_form_stacked_bar(
-            df,
-            category_column=category_column,
-            answer_column=answer_column,
-            model_groups=model_groups,
-            selected_answers=selected_answers,
-            min_fraction=min_fraction,
-            colors=colors,
-            title=title,
-            filename=filename,
-        )
-
     def _parse_judges(self, judges: dict[str, str | dict] | None) -> dict[str, "Question"] | None:
         """Parse and validate judges dictionary."""
         if judges is None:

@@ -691,6 +853,11 @@ class FreeForm(Question):
                    f"Judge name '{key}' is forbidden. Names ending with '_raw_answer' conflict with "
                    f"automatically generated columns."
                )
+           if key.endswith("_probs"):
+               raise ValueError(
+                   f"Judge name '{key}' is forbidden. Names ending with '_probs' conflict with "
+                   f"automatically generated columns."
+               )

        parsed_judges = {}
        for key, val in judges.items():

@@ -779,13 +946,15 @@ class Rating(Question):
             - group: Group name from model_groups
             - answer: Mean rating (float), or None if model refused
             - raw_answer: Original logprobs dict {token: probability}
+            - probs: Normalized probabilities dict {int_rating: probability}
             - question: The prompt that was sent
-            -
+            - api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
             - paraphrase_ix: Index of the paraphrase used
         """
         df = super().df(model_groups)
         df["raw_answer"] = df["answer"].copy()
-        df["
+        df["probs"] = df["raw_answer"].apply(self._get_normalized_probs)
+        df["answer"] = df["probs"].apply(self._compute_expected_rating)
         return df

     def _get_normalized_probs(self, score: dict | None) -> dict[int, float] | None:

@@ -813,65 +982,11 @@ class Rating(Question):

         return {k: v / total for k, v in probs.items()}

-    def _compute_expected_rating(self,
-        """Compute expected rating from logprobs distribution."""
-        if score is None:
-            mid_value = (self.min_rating + self.max_rating) / 2
-            warnings.warn(f"Got None from API (should be impossible). Returning middle value {mid_value}.")
-            return mid_value
-
-        probs = self._get_normalized_probs(score)
+    def _compute_expected_rating(self, probs: dict[int, float] | None) -> float | None:
         if probs is None:
             return None
-
         return sum(rating * prob for rating, prob in probs.items())

-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        df: pd.DataFrame = None,
-        show_mean: bool = True,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot cumulative rating distribution by category.
-
-        Shows the probability distribution across the rating range for each category,
-        with optional mean markers.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for grouping. Default: "group".
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            show_mean: If True, displays mean rating for each category. Default: True.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        # Pre-normalize probabilities
-        df = df.copy()
-        df["probs"] = df["raw_answer"].apply(self._get_normalized_probs)
-
-        return rating_cumulative_plot(
-            df,
-            min_rating=self.min_rating,
-            max_rating=self.max_rating,
-            category_column=category_column,
-            model_groups=model_groups,
-            show_mean=show_mean,
-            title=title,
-            filename=filename,
-        )
-

 class NextToken(Question):
     """Question type for analyzing next-token probability distributions.

@@ -919,71 +1034,4 @@ class NextToken(Question):
         el["params"]["top_logprobs"] = self.top_logprobs
         el["convert_to_probs"] = self.convert_to_probs
         el["num_samples"] = self.num_samples
-        return runner_input
-
-    def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
-        """Execute question and return results as a DataFrame.
-
-        Runs the question on all models (or loads from cache).
-
-        Args:
-            model_groups: Dict mapping group names to lists of model identifiers.
-                Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
-
-        Returns:
-            DataFrame with columns:
-            - model: Model identifier
-            - group: Group name from model_groups
-            - answer: Dict mapping tokens to probabilities {token: prob}
-            - question: The prompt that was sent
-            - messages: Full message list sent to model
-            - paraphrase_ix: Index of the paraphrase used
-        """
-        return super().df(model_groups)
-
-    def plot(
-        self,
-        model_groups: dict[str, list[str]],
-        category_column: str = "group",
-        df: pd.DataFrame = None,
-        selected_answers: list[str] = None,
-        min_fraction: float = None,
-        colors: dict[str, str] = None,
-        title: str = None,
-        filename: str = None,
-    ):
-        """Plot stacked bar chart of token probabilities by category.
-
-        Args:
-            model_groups: Required. Dict mapping group names to lists of model identifiers.
-            category_column: Column to use for x-axis categories. Default: "group".
-            df: DataFrame to plot. By default calls self.df(model_groups).
-            selected_answers: List of specific tokens to include. Others grouped as "other".
-            min_fraction: Minimum probability threshold. Tokens below this are grouped as "other".
-            colors: Dict mapping token values to colors.
-            title: Plot title. If None, auto-generated from paraphrases.
-            filename: If provided, saves the plot to this file path.
-
-        Returns:
-            matplotlib Figure object.
-        """
-        if df is None:
-            df = self.df(model_groups)
-
-        if title is None:
-            title = default_title(self.paraphrases)
-
-        # answer column already contains {token: prob} dicts
-        df = df.rename(columns={"answer": "probs"})
-
-        return probs_stacked_bar(
-            df,
-            probs_column="probs",
-            category_column=category_column,
-            model_groups=model_groups,
-            selected_answers=selected_answers,
-            min_fraction=min_fraction,
-            colors=colors,
-            title=title,
-            filename=filename,
-        )
+        return runner_input
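The per-class plot() methods removed above are replaced by the descriptor-based view/plot API added in the first hunk. A minimal usage sketch, assembled only from the docstrings visible in this diff; the model identifiers and question configuration are placeholders, not values shipped with the package:

from llmcomp.question.question import Question

# Placeholder model identifiers, for illustration only.
MODEL_GROUPS = {"gpt4": ["gpt-4o"], "claude": ["claude-3-opus"]}

question = Question.create(type="free_form", paraphrases=["test"])

df = question.df(MODEL_GROUPS)        # run the question (or load cached results)
question.plot(MODEL_GROUPS)           # instance call: runs df(), then picks a suitable plot
Question.plot(df)                     # class-level call: plots an existing DataFrame
question.view(MODEL_GROUPS)           # instance call: renders results in the new viewer
Question.view(df, sort_by="group")    # class-level call: views a DataFrame directly
question.clear_cache("gpt-4o")        # new in 1.3.0: drop cached results for one model

Note that 1.3.0 also replaces the cached "messages" column with "api_kwargs", so result DataFrames cached by 1.2.3 are not reused (see the CACHE_VERSION bump in result.py below).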
llmcomp/question/result.py
CHANGED
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
     from llmcomp.question.question import Question

 # Bump this to invalidate all cached results when the caching implementation changes.
-CACHE_VERSION =
+CACHE_VERSION = 3

 def cache_hash(question: "Question", model: str) -> str: