llmcomp-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmcomp/__init__.py +3 -0
- llmcomp/config.py +245 -0
- llmcomp/question/judge.py +146 -0
- llmcomp/question/plots.py +283 -0
- llmcomp/question/question.py +974 -0
- llmcomp/question/result.py +193 -0
- llmcomp/runner/chat_completion.py +33 -0
- llmcomp/runner/runner.py +249 -0
- llmcomp/utils.py +97 -0
- llmcomp-1.0.0.dist-info/METADATA +175 -0
- llmcomp-1.0.0.dist-info/RECORD +13 -0
- llmcomp-1.0.0.dist-info/WHEEL +4 -0
- llmcomp-1.0.0.dist-info/licenses/LICENSE +21 -0
llmcomp/question/question.py

@@ -0,0 +1,974 @@
from __future__ import annotations

import hashlib
import json
import os
import warnings
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from queue import Queue
from typing import TYPE_CHECKING

import pandas as pd
import yaml
from tqdm import tqdm

from llmcomp.config import Config
from llmcomp.question.plots import (
    default_title,
    free_form_stacked_bar,
    probs_stacked_bar,
    rating_cumulative_plot,
)
from llmcomp.question.result import JudgeCache, Result
from llmcomp.runner.runner import Runner

if TYPE_CHECKING:
    from llmcomp.question.question import Question


class Question(ABC):
    # Purpose of _version: it is used in the hash function so if some important part of the implementation changes,
    # we can change the version here and it'll invalidate all the cached results.
    _version = 1

    def __init__(
        self,
        name: str | None = "__unnamed",
        paraphrases: list[str] | None = None,
        messages: list[list[dict]] = None,
        logit_bias: dict[int, float] | None = None,
        samples_per_paraphrase: int = 1,
        system: str = None,
    ):
        self.paraphrases = paraphrases
        self.samples_per_paraphrase = samples_per_paraphrase
        self.system = system
        self.messages = messages
        self.logit_bias = logit_bias
        self.name = name

    @property
    @abstractmethod
    def _runner_sampling_func_name(self) -> str:
        """Name of the runner function to use for sampling. Defined in subclasses."""
        pass

    ###########################################################################
    # CLASS METHODS - question factories, YAML loading.
    @classmethod
    def type(cls) -> str:
        """Type is snake_case version of the class name."""
        return "".join("_" + c.lower() if c.isupper() else c.lower() for c in cls.__name__).lstrip("_")

    @classmethod
    def create(cls, **kwargs) -> "Question":
        """Create a Question instance from a type string and keyword arguments.

        Factory method that instantiates the appropriate Question subclass based on the 'type' parameter.

        Args:
            **kwargs: Must include 'type' key with one of:
                - "free_form": Creates FreeForm question
                - "rating": Creates Rating question
                - "next_token": Creates NextToken question
                - "free_form_judge": Creates FreeFormJudge
                - "rating_judge": Creates RatingJudge
            Other kwargs are passed to the constructor.

        Returns:
            Question subclass instance.

        Raises:
            ValueError: If 'type' is missing or invalid.

        Example:
            >>> q = Question.create(
            ...     type="free_form",
            ...     name="my_question",
            ...     paraphrases=["What is 2+2?"]
            ... )
        """
        from llmcomp.question.judge import FreeFormJudge, RatingJudge

        valid_types = (FreeForm, Rating, FreeFormJudge, RatingJudge, NextToken)
        question_type = kwargs.get("type")
        if question_type is None:
            raise ValueError("Missing required 'type' parameter")

        for question_class in valid_types:
            if question_class.type() == question_type:
                del kwargs["type"]
                return question_class(**kwargs)

        valid_type_names = [q.type() for q in valid_types]
        raise ValueError(
            f"Invalid question type: '{question_type}'. Available types are: {', '.join(valid_type_names)}"
        )

    @classmethod
    def load_dict(cls, name: str) -> dict:
        """Load question configuration as a dictionary from YAML files.

        Searches all YAML files in Config.yaml_dir for a question with matching name.

        Args:
            name: The question name to look up.

        Returns:
            Dict containing the question configuration (can be passed to Question.create).

        Raises:
            ValueError: If question with given name is not found.

        Example:
            >>> config = Question.load_dict("my_question")
            >>> config
            {'type': 'free_form', 'name': 'my_question', 'paraphrases': [...]}
        """
        question_config = cls._load_question_config()
        try:
            question_dict = question_config[name]
        except KeyError:
            raise ValueError(f"Question with name '{name}' not found in directory {Config.yaml_dir}")

        return question_dict

    @classmethod
    def from_yaml(cls, name: str) -> "Question":
        """Load and instantiate a Question from YAML configuration.

        Convenience method combining load_dict() and create().

        Args:
            name: The question name to look up in YAML files.

        Returns:
            Question subclass instance.

        Raises:
            ValueError: If question not found or has invalid type.

        Example:
            >>> q = Question.from_yaml("my_question")
        """
        question_dict = cls.load_dict(name)
        return cls.create(**question_dict)

    @classmethod
    def _load_question_config(cls):
        """Load all questions from YAML files in Config.yaml_dir."""
        config = {}
        for fname in os.listdir(Config.yaml_dir):
            if not (fname.endswith(".yaml") or fname.endswith(".yml")):
                continue

            path = os.path.join(Config.yaml_dir, fname)
            with open(path, "r", encoding="utf-8") as f:
                data = yaml.load(f, Loader=yaml.SafeLoader)
            if data is None:
                # Empty file
                continue
            for question in data:
                if question["name"] in config:
                    raise ValueError(
                        f"Question with name {question['name']} duplicated in directory {Config.yaml_dir}"
                    )
                config[question["name"]] = question
        return config

    ###########################################################################
    # MAIN INTERFACE
    def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
        models = list(set(model for group in model_groups.values() for model in group))
        results = self.get_results(models)
        data = []
        for model, result in zip(models, results):
            groups = list(key for key, group in model_groups.items() if model in group)
            for group in groups:
                for el in result.data:
                    data.append(
                        {
                            "model": model,
                            "group": group,
                            "answer": el["answer"],
                            "question": el["question"],
                            "messages": el["messages"],
                            "paraphrase_ix": el["paraphrase_ix"],
                        }
                    )
        df = pd.DataFrame(data)

        # Validate expected number of rows
        expected_rows = self._expected_df_rows(model_groups)
        assert len(df) == expected_rows, (
            f"DataFrame has {len(df)} rows but expected {expected_rows} rows. "
            f"This indicates a bug in the df() implementation."
        )

        return df

    def _expected_df_rows(self, model_groups: dict[str, list[str]]) -> int:
        models = list(set(model for group in model_groups.values() for model in group))
        num_paraphrases = len(self.as_messages())
        rows_per_model = num_paraphrases * self.samples_per_paraphrase

        total_rows = 0
        for model in models:
            # Count how many groups contain this model
            num_groups = sum(1 for group in model_groups.values() if model in group)
            total_rows += num_groups * rows_per_model

        return total_rows

    ###########################################################################
    # EXECUTION
    def get_results(self, models: list[str]) -> list[Result]:
        """
        Execute the question (and save results) or load cached results for a list of models.
        """
        assert len(models) == len(set(models)), "Models must be unique"

        # 1. Load results that already exist
        results = []
        for model in models:
            try:
                results.append(Result.load(self, model))
            except FileNotFoundError:
                results.append(None)

        if all(results):
            return results

        # 2. Execute the rest
        remaining_models = [model for i, model in enumerate(models) if results[i] is None]
        remaining_results = self.many_models_execute(remaining_models)

        # 3. Save the rest
        for result in remaining_results:
            result.save()

        # 4. Merge loaded and executed
        for result, model in zip(remaining_results, remaining_models):
            results[models.index(model)] = result

        return results

    def many_models_execute(self, models: list[str]) -> list[Result]:
        """Execute question on multiple models in parallel.

        The implementation is quite complex, because:
        * We wanted to keep the current Runner interface.
        * But also have a single progress bar

        Was battle-tested a lot, so should work fine.
        """
        if not models:
            return []

        # The thing that we'll pass to Runner.get_many
        runner_input = self.get_runner_input()
        for i, el in enumerate(runner_input):
            el["_original_ix"] = i

        # Threads save results/errors here to be later stored in the final structure
        queue = Queue()

        # All computed data will be stored here
        results: list = [[None] * len(runner_input) for _ in models]

        with ThreadPoolExecutor(len(models)) as top_level_executor:
            with ThreadPoolExecutor(Config.max_workers) as low_level_executor:

                def worker_function(runner):
                    try:
                        sampling_func = getattr(runner, self._runner_sampling_func_name)
                        generator = runner.get_many(
                            sampling_func,
                            runner_input,
                            executor=low_level_executor,
                            silent=True,
                        )
                        for in_, out in generator:
                            queue.put(("data", runner.model, in_, out))
                    except Exception as e:
                        queue.put(("error", runner.model, e))

                futures = [top_level_executor.submit(worker_function, Runner(model)) for model in models]

                expected_num = len(models) * len(runner_input)
                current_num = 0
                errors = []

                try:
                    with tqdm(total=expected_num) as pbar:
                        display_name = self.name if len(self.name) <= 16 else self.name[:16] + "..."
                        pbar.set_description(f"Querying {len(models)} models - {display_name}")
                        while current_num < expected_num and not errors:
                            msg_type, model, *payload = queue.get()

                            if msg_type == "error":
                                error = payload[0]
                                errors.append((model, error))
                            else:
                                in_, out = payload
                                data = results[models.index(model)]
                                data[in_["_original_ix"]] = {
                                    # Deepcopy because in_["messages"] is reused for multiple models and we don't want weird
                                    # side effects if someone later edits the messages in the resulting dataframe
                                    "messages": deepcopy(in_["messages"]),
                                    "question": in_["_question"],
                                    "answer": out,
                                    "paraphrase_ix": in_["_paraphrase_ix"],
                                }

                            current_num += 1
                            pbar.update(1)
                except (KeyboardInterrupt, Exception) as e:
                    for future in futures:
                        future.cancel()
                    raise e

                # Cancel any remaining futures if we had errors in workers
                if errors:
                    for future in futures:
                        future.cancel()
                    error_msgs = [f"Model {model}: {error}" for model, error in errors]
                    raise Exception("Errors occurred during execution:\n" + "\n".join(error_msgs)) from errors[0][1]

        return [Result(self, model, data) for model, data in zip(models, results)]

    def get_runner_input(self) -> list[dict]:
        messages_set = self.as_messages()
        runner_input = []
        for paraphrase_ix, messages in enumerate(messages_set):
            this_input = {
                "messages": messages,
                "logit_bias": self.logit_bias,
                "_question": messages[-1]["content"],
                "_paraphrase_ix": paraphrase_ix,
            }
            # Deepcopy because someone might later edit the structures in-place
            # (e.g. we now do that in many_models_execute)
            for _ in range(self.samples_per_paraphrase):
                runner_input.append(deepcopy(this_input))
        return runner_input

    def as_messages(self) -> list[dict]:
        if self.messages is not None:
            assert self.paraphrases is None, "Paraphrases and messages cannot both be set"
            assert self.system is None, "System and messages cannot both be set"
            return deepcopy(self.messages)
        else:
            assert self.paraphrases is not None, "Either paraphrases or messages must be set"
            messages_set = []
            for paraphrase in self.paraphrases:
                messages = []
                if self.system is not None:
                    messages.append({"role": "system", "content": self.system})
                messages.append({"role": "user", "content": paraphrase})
                messages_set.append(messages)
            return messages_set

    ###########################################################################
    # OTHER STUFF
    def hash(self):
        """Unique identifier for caching. Changes when question parameters change.

        Used to determine whether we can use cached results.
        Excludes judges since they don't affect the raw LLM answers.
        """
        excluded = {"judges"}
        attributes = {k: v for k, v in self.__dict__.items() if k not in excluded}
        attributes["_version"] = self._version
        json_str = json.dumps(attributes, sort_keys=True)
        return hashlib.sha256(json_str.encode()).hexdigest()


class FreeForm(Question):
    """Question type for free-form text generation.

    Use this when you want to compare how different models respond to open-ended prompts.
    The model generates text freely up to max_tokens.
    """

    _runner_sampling_func_name = "get_text"

    # Forbidden judge names: standard dataframe columns and any name starting with "_"
    _FORBIDDEN_JUDGE_NAMES = {
        "model",
        "group",
        "answer",
        "question",
        "messages",
        "paraphrase_ix",
        "raw_answer",
    }

    def __init__(
        self,
        *,
        temperature: float = 1,
        max_tokens: int = 1024,
        judges: dict[str, str | dict] = None,
        **kwargs,
    ):
        """Initialize a FreeForm question.

        Args:
            temperature: Sampling temperature. Default: 1.
            max_tokens: Maximum number of tokens in the response. Default: 1024.
            judges: Optional dict mapping judge names to judge definitions. Each judge evaluates
                the (question, answer) pairs. Values can be:
                - A string: loads judge from YAML by name
                - A dict: creates judge from the dict (must include 'type')
                - A FreeFormJudge or RatingJudge instance
            **kwargs: Arguments passed to Question base class:
                - name: Question identifier for caching. Default: "__unnamed".
                - paraphrases: List of prompt variations to test.
                - system: System message prepended to each paraphrase.
                - messages: Alternative to paraphrases - [{'role': ..., 'content': ...}, {'role': ..., 'content': ...}, ...]
                - samples_per_paraphrase: Number of samples per prompt. Default: 1.
                - logit_bias: Token bias dict {token_id: bias}.
        """
        super().__init__(**kwargs)
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.judges = self._parse_judges(judges)

    def get_runner_input(self) -> list[dict]:
        runner_input = super().get_runner_input()
        for el in runner_input:
            el["temperature"] = self.temperature
            el["max_tokens"] = self.max_tokens
        return runner_input

    def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
        """Execute question and return results as a DataFrame.

        Runs the question on all models (or loads from cache), then applies any configured judges.

        Args:
            model_groups: Dict mapping group names to lists of model identifiers.
                Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}

        Returns:
            DataFrame with columns:
            - model: Model identifier
            - group: Group name from model_groups
            - answer: Model's response text
            - question: The prompt that was sent
            - messages: Full message list sent to model
            - paraphrase_ix: Index of the paraphrase used
            - {judge_name}: Score/response from each configured judge
            - {judge_name}_question: The prompt sent to the judge
        """
        df = super().df(model_groups)
        expected_rows = len(df)  # Should not change after adding judges
        columns = df.columns.tolist()
        if self.judges:
            for i, (judge_name, judge_question) in enumerate(self.judges.items()):
                df = self.add_judge(model_groups, df, judge_name, judge_question)
                columns.insert(3 + i, judge_name)
                columns.append(judge_name + "_question")
                if f"{judge_name}_raw_answer" in df.columns:
                    columns.append(judge_name + "_raw_answer")
            df = df[columns]

        # Validate that adding judges didn't change row count
        assert len(df) == expected_rows, (
            f"DataFrame has {len(df)} rows after adding judges but expected {expected_rows}. "
            f"This indicates a bug in add_judge() - likely a many-to-many merge."
        )

        return df

    def add_judge(
        self,
        model_groups: dict[str, list[str]],
        my_df: pd.DataFrame,
        judge_name: str,
        judge_question: Question,
    ) -> pd.DataFrame:
        judge_template = judge_question.paraphrases[0]

        # Collect (question, answer) pairs and build judge prompts
        qa_pairs = []
        qa_to_prompt = {}
        for row in my_df.itertuples():
            q, a = row.question, row.answer
            qa_pairs.append((q, a))
            if (q, a) not in qa_to_prompt:
                qa_to_prompt[(q, a)] = judge_template.format(question=q, answer=a)
        my_df["__judge_question"] = [qa_to_prompt[(q, a)] for q, a in qa_pairs]

        # Execute judge with key-value caching
        judge_df = self._execute_judge_with_cache(judge_question, qa_pairs, qa_to_prompt)

        # Rename columns
        judge_columns = [judge_name, judge_name + "_question"]
        judge_df = judge_df.rename(columns={"answer": judge_name, "question": judge_name + "_question"})
        if "raw_answer" in judge_df.columns:
            judge_columns.append(judge_name + "_raw_answer")
            judge_df = judge_df.rename(columns={"raw_answer": judge_name + "_raw_answer"})

        # Merge the judge results with the original dataframe
        merged_df = my_df.merge(
            judge_df[judge_columns],
            left_on="__judge_question",
            right_on=judge_name + "_question",
            how="left",
        )
        merged_df = merged_df.drop(columns=["__judge_question"])

        return merged_df

    def _execute_judge_with_cache(
        self,
        judge_question: Question,
        qa_pairs: list[tuple[str, str]],
        qa_to_prompt: dict[tuple[str, str], str],
    ) -> pd.DataFrame:
        """Execute judge with key-value caching.

        Only executes API calls for uncached (question, answer) pairs, then builds
        the result dataframe from the cache.

        Args:
            judge_question: The judge Question object
            qa_pairs: List of (question, answer) tuples to judge
            qa_to_prompt: Mapping from (question, answer) -> formatted judge prompt

        Returns:
            DataFrame with columns: question, answer, [raw_answer for RatingJudge]
        """
        uses_question = judge_question.uses_question

        # When judge doesn't use {question}, we only care about unique answers
        # and use None as the question key in cache
        if uses_question:
            unique_keys = sorted(set(qa_pairs))  # (question, answer) pairs
        else:
            unique_keys = [(None, a) for a in sorted(set(a for _, a in qa_pairs))]

        # Load cache and find uncached entries
        cache = JudgeCache(judge_question)
        uncached_keys = cache.get_uncached(unique_keys)

        # Execute only uncached entries
        if uncached_keys:
            # Build prompts for uncached entries
            # For each key, we need to find a (q, a) pair to get the prompt
            key_to_prompt = {}
            for q, a in qa_to_prompt.keys():
                key = (q, a) if uses_question else (None, a)
                if key not in key_to_prompt:
                    key_to_prompt[key] = qa_to_prompt[(q, a)]

            uncached_prompts = [key_to_prompt[key] for key in uncached_keys]
            prompt_to_key = {key_to_prompt[key]: key for key in uncached_keys}

            # Use a copy to avoid mutating the original judge (thread-safety)
            judge_copy = deepcopy(judge_question)
            judge_copy.paraphrases = uncached_prompts
            results = judge_copy.many_models_execute([judge_copy.model])
            result = results[0]  # Only one model

            # Update cache
            for item in result.data:
                prompt = item["question"]  # The formatted judge prompt
                q, a = prompt_to_key[prompt]
                cache.set(q, a, item["answer"])
            cache.save()

        # Build dataframe from cache (one row per unique key)
        rows = []
        for q, a in unique_keys:
            judge_response = cache.get(q, a)
            # Get the formatted prompt - need to find any original (q, a) pair for this key
            if uses_question:
                judge_prompt = qa_to_prompt[(q, a)]
            else:
                # Find any pair with this answer to get the prompt
                # As the judge doesn't use {question}, we can just find any pair with this answer.
                judge_prompt = next(p for (oq, oa), p in qa_to_prompt.items() if oa == a)
            rows.append({"question": judge_prompt, "answer": judge_response})

        df = pd.DataFrame(rows)

        # Post-process for RatingJudge: copy raw answer and compute processed score
        from llmcomp.question.judge import RatingJudge

        if isinstance(judge_question, RatingJudge):
            df["raw_answer"] = df["answer"].copy()
            df["answer"] = df["raw_answer"].apply(judge_question._compute_expected_rating)

        return df

    def plot(
        self,
        model_groups: dict[str, list[str]],
        category_column: str = "group",
        answer_column: str = "answer",
        df: pd.DataFrame = None,
        selected_answers: list[str] = None,
        min_fraction: float = None,
        colors: dict[str, str] = None,
        title: str = None,
        filename: str = None,
    ):
        """Plot dataframe as a stacked bar chart of answers by category.

        Args:
            model_groups: Required. Dict mapping group names to lists of model identifiers.
            category_column: Column to use for x-axis categories. Default: "group".
            answer_column: Column containing answers to plot. Default: "answer".
                Use a judge column name to plot judge scores instead.
            df: DataFrame to plot. By default calls self.df(model_groups).
            selected_answers: List of specific answers to include. Others grouped as "other".
            min_fraction: Minimum fraction threshold. Answers below this are grouped as "other".
            colors: Dict mapping answer values to colors.
            title: Plot title. If None, auto-generated from paraphrases.
            filename: If provided, saves the plot to this file path.

        Returns:
            matplotlib Figure object.
        """
        if df is None:
            df = self.df(model_groups)

        if title is None:
            title = default_title(self.paraphrases)

        return free_form_stacked_bar(
            df,
            category_column=category_column,
            answer_column=answer_column,
            model_groups=model_groups,
            selected_answers=selected_answers,
            min_fraction=min_fraction,
            colors=colors,
            title=title,
            filename=filename,
        )

    def _parse_judges(self, judges: dict[str, str | dict] | None) -> dict[str, "Question"] | None:
        """Parse and validate judges dictionary."""
        if judges is None:
            return None

        # Validate judge names
        for key in judges.keys():
            if key in self._FORBIDDEN_JUDGE_NAMES:
                raise ValueError(f"Judge name '{key}' is forbidden. It conflicts with standard dataframe columns.")
            if key.startswith("_"):
                raise ValueError(
                    f"Judge name '{key}' is forbidden. Names starting with '_' are reserved for internal use."
                )
            if key.endswith("_question"):
                raise ValueError(
                    f"Judge name '{key}' is forbidden. Names ending with '_question' conflict with "
                    f"automatically generated columns."
                )
            if key.endswith("_raw_answer"):
                raise ValueError(
                    f"Judge name '{key}' is forbidden. Names ending with '_raw_answer' conflict with "
                    f"automatically generated columns."
                )

        parsed_judges = {}
        for key, val in judges.items():
            from llmcomp.question.judge import FreeFormJudge, RatingJudge

            if isinstance(val, (FreeFormJudge, RatingJudge)):
                # Already a Question instance, use it directly
                judge_question = val
            elif isinstance(val, str):
                # Load from Config.yaml_dir
                judge_dict = Question.load_dict(val)
                judge_question = Question.create(**judge_dict)
            else:
                # Assume it's a dict
                judge_question = Question.create(**val)

            assert judge_question.type() in (
                "free_form_judge",
                "rating_judge",
            ), "Judge must be a free_form_judge or rating_judge"
            parsed_judges[key] = judge_question

        return parsed_judges


class Rating(Question):
    """Question type for numeric rating responses.

    Use this when you expect the model to respond with a number within a range.
    Uses logprobs to compute expected value across the probability distribution,
    giving more nuanced results than just taking the sampled token.
    """

    _runner_sampling_func_name = "single_token_probs"

    def __init__(
        self,
        *,
        min_rating: int = 0,
        max_rating: int = 100,
        refusal_threshold: float = 0.75,
        top_logprobs: int = 20,
        **kwargs,
    ):
        """Initialize a Rating question.

        Args:
            min_rating: Minimum valid rating value (inclusive). Default: 0.
            max_rating: Maximum valid rating value (inclusive). Default: 100.
            refusal_threshold: If probability mass on non-numeric tokens exceeds this,
                the response is treated as a refusal (returns None). Default: 0.75.
            top_logprobs: Number of top tokens to request. Default: 20.
            **kwargs: Arguments passed to Question base class:
                - name: Question identifier for caching. Default: "__unnamed".
                - paraphrases: List of prompt variations to test.
                - system: System message prepended to each paraphrase.
                - messages: Alternative to paraphrases - [{'role': ..., 'content': ...}, {'role': ..., 'content': ...}, ...]
                - samples_per_paraphrase: Number of samples per prompt. Default: 1.
                - logit_bias: Token bias dict {token_id: bias}.
        """
        super().__init__(**kwargs)
        self.min_rating = min_rating
        self.max_rating = max_rating
        self.refusal_threshold = refusal_threshold
        self.top_logprobs = top_logprobs

    def get_runner_input(self) -> list[dict]:
        runner_input = super().get_runner_input()
        for el in runner_input:
            el["top_logprobs"] = self.top_logprobs
        return runner_input

    def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
        """Execute question and return results as a DataFrame.

        Runs the question on all models (or loads from cache), then computes
        expected ratings from the logprob distributions.

        Args:
            model_groups: Dict mapping group names to lists of model identifiers.
                Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}

        Returns:
            DataFrame with columns:
            - model: Model identifier
            - group: Group name from model_groups
            - answer: Mean rating (float), or None if model refused
            - raw_answer: Original logprobs dict {token: probability}
            - question: The prompt that was sent
            - messages: Full message list sent to model
            - paraphrase_ix: Index of the paraphrase used
        """
        df = super().df(model_groups)
        df["raw_answer"] = df["answer"].copy()
        df["answer"] = df["raw_answer"].apply(self._compute_expected_rating)
        return df

    def _get_normalized_probs(self, score: dict | None) -> dict[int, float] | None:
        """Extract valid rating probabilities, normalized to sum to 1.

        Returns None if score is None, empty, or refusal threshold is exceeded.
        """
        if score is None:
            return None

        probs = {}
        total = 0
        for key, val in score.items():
            try:
                int_key = int(key)
            except ValueError:
                continue
            if self.min_rating <= int_key <= self.max_rating:
                probs[int_key] = val
                total += val

        if total == 0 or (1 - total) >= self.refusal_threshold:
            return None

        return {k: v / total for k, v in probs.items()}

    def _compute_expected_rating(self, score: dict | None) -> float | None:
        """Compute expected rating from logprobs distribution."""
        if score is None:
            mid_value = (self.min_rating + self.max_rating) / 2
            warnings.warn(f"Got None from API (should be impossible). Returning middle value {mid_value}.")
            return mid_value

        probs = self._get_normalized_probs(score)
        if probs is None:
            return None

        return sum(rating * prob for rating, prob in probs.items())

    def plot(
        self,
        model_groups: dict[str, list[str]],
        category_column: str = "group",
        df: pd.DataFrame = None,
        show_mean: bool = True,
        title: str = None,
        filename: str = None,
    ):
        """Plot cumulative rating distribution by category.

        Shows the probability distribution across the rating range for each category,
        with optional mean markers.

        Args:
            model_groups: Required. Dict mapping group names to lists of model identifiers.
            category_column: Column to use for grouping. Default: "group".
            df: DataFrame to plot. By default calls self.df(model_groups).
            show_mean: If True, displays mean rating for each category. Default: True.
            title: Plot title. If None, auto-generated from paraphrases.
            filename: If provided, saves the plot to this file path.

        Returns:
            matplotlib Figure object.
        """
        if df is None:
            df = self.df(model_groups)

        if title is None:
            title = default_title(self.paraphrases)

        # Pre-normalize probabilities
        df = df.copy()
        df["probs"] = df["raw_answer"].apply(self._get_normalized_probs)

        return rating_cumulative_plot(
            df,
            min_rating=self.min_rating,
            max_rating=self.max_rating,
            category_column=category_column,
            model_groups=model_groups,
            show_mean=show_mean,
            title=title,
            filename=filename,
        )


class NextToken(Question):
    """Question type for analyzing next-token probability distributions.

    Use this when you want to see what tokens the model considers as likely continuations.
    Returns probability distributions over the top tokens, useful for fine-grained analysis
    of model behavior.
    """

    _runner_sampling_func_name = "single_token_probs"

    def __init__(
        self,
        *,
        top_logprobs: int = 20,
        convert_to_probs: bool = True,
        num_samples: int = 1,
        **kwargs,
    ):
        """Initialize a NextToken question.

        Args:
            top_logprobs: Number of top tokens to return probabilities for. Default: 20.
                Maximum depends on API (OpenAI allows up to 20).
            convert_to_probs: If True, convert logprobs to probabilities (0-1 range).
                If False, returns raw log probabilities. Default: True.
            num_samples: Number of samples to average. Useful when logprobs are non-deterministic.
                Default: 1.
            **kwargs: Arguments passed to Question base class:
                - name: Question identifier for caching. Default: "__unnamed".
                - paraphrases: List of prompt variations to test.
                - system: System message prepended to each paraphrase.
                - messages: Alternative to paraphrases - [{'role': ..., 'content': ...}, {'role': ..., 'content': ...}, ...]
                - samples_per_paraphrase: Number of samples per prompt. Default: 1.
                - logit_bias: Token bias dict {token_id: bias}.
        """
        super().__init__(**kwargs)
        self.top_logprobs = top_logprobs
        self.convert_to_probs = convert_to_probs
        self.num_samples = num_samples

    def get_runner_input(self) -> list[dict]:
        runner_input = super().get_runner_input()

        for el in runner_input:
            el["top_logprobs"] = self.top_logprobs
            el["convert_to_probs"] = self.convert_to_probs
            el["num_samples"] = self.num_samples
        return runner_input

    def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
        """Execute question and return results as a DataFrame.

        Runs the question on all models (or loads from cache).

        Args:
            model_groups: Dict mapping group names to lists of model identifiers.
                Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}

        Returns:
            DataFrame with columns:
            - model: Model identifier
            - group: Group name from model_groups
            - answer: Dict mapping tokens to probabilities {token: prob}
            - question: The prompt that was sent
            - messages: Full message list sent to model
            - paraphrase_ix: Index of the paraphrase used
        """
        return super().df(model_groups)

    def plot(
        self,
        model_groups: dict[str, list[str]],
        category_column: str = "group",
        df: pd.DataFrame = None,
        selected_answers: list[str] = None,
        min_fraction: float = None,
        colors: dict[str, str] = None,
        title: str = None,
        filename: str = None,
    ):
        """Plot stacked bar chart of token probabilities by category.

        Args:
            model_groups: Required. Dict mapping group names to lists of model identifiers.
            category_column: Column to use for x-axis categories. Default: "group".
            df: DataFrame to plot. By default calls self.df(model_groups).
            selected_answers: List of specific tokens to include. Others grouped as "other".
            min_fraction: Minimum probability threshold. Tokens below this are grouped as "other".
            colors: Dict mapping token values to colors.
            title: Plot title. If None, auto-generated from paraphrases.
            filename: If provided, saves the plot to this file path.

        Returns:
            matplotlib Figure object.
        """
        if df is None:
            df = self.df(model_groups)

        if title is None:
            title = default_title(self.paraphrases)

        # answer column already contains {token: prob} dicts
        df = df.rename(columns={"answer": "probs"})

        return probs_stacked_bar(
            df,
            probs_column="probs",
            category_column=category_column,
            model_groups=model_groups,
            selected_answers=selected_answers,
            min_fraction=min_fraction,
            colors=colors,
            title=title,
            filename=filename,
        )