llmcomp 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,974 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import os
6
+ import warnings
7
+ from abc import ABC, abstractmethod
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from copy import deepcopy
10
+ from queue import Queue
11
+ from typing import TYPE_CHECKING
12
+
13
+ import pandas as pd
14
+ import yaml
15
+ from tqdm import tqdm
16
+
17
+ from llmcomp.config import Config
18
+ from llmcomp.question.plots import (
19
+ default_title,
20
+ free_form_stacked_bar,
21
+ probs_stacked_bar,
22
+ rating_cumulative_plot,
23
+ )
24
+ from llmcomp.question.result import JudgeCache, Result
25
+ from llmcomp.runner.runner import Runner
26
+
27
+ if TYPE_CHECKING:
28
+ from llmcomp.question.question import Question
29
+
30
+
31
+ class Question(ABC):
32
+     # _version is included in the hash, so if some important part of the implementation changes,
33
+     # we can bump the version here to invalidate all previously cached results.
34
+ _version = 1
35
+
36
+ def __init__(
37
+ self,
38
+ name: str | None = "__unnamed",
39
+ paraphrases: list[str] | None = None,
40
+         messages: list[list[dict]] | None = None,
41
+ logit_bias: dict[int, float] | None = None,
42
+ samples_per_paraphrase: int = 1,
43
+         system: str | None = None,
44
+ ):
45
+ self.paraphrases = paraphrases
46
+ self.samples_per_paraphrase = samples_per_paraphrase
47
+ self.system = system
48
+ self.messages = messages
49
+ self.logit_bias = logit_bias
50
+ self.name = name
51
+
52
+ @property
53
+ @abstractmethod
54
+ def _runner_sampling_func_name(self) -> str:
55
+ """Name of the runner function to use for sampling. Defined in subclasses."""
56
+ pass
57
+
58
+ ###########################################################################
59
+ # CLASS METHODS - question factories, YAML loading.
60
+ @classmethod
61
+ def type(cls) -> str:
62
+ """Type is snake_case version of the class name."""
63
+ return "".join("_" + c.lower() if c.isupper() else c.lower() for c in cls.__name__).lstrip("_")
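+
+     # Illustrative mapping: FreeForm -> "free_form", NextToken -> "next_token",
+     # RatingJudge -> "rating_judge".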
64
+
65
+ @classmethod
66
+ def create(cls, **kwargs) -> "Question":
67
+ """Create a Question instance from a type string and keyword arguments.
68
+
69
+ Factory method that instantiates the appropriate Question subclass based on the 'type' parameter.
70
+
71
+ Args:
72
+ **kwargs: Must include 'type' key with one of:
73
+ - "free_form": Creates FreeForm question
74
+ - "rating": Creates Rating question
75
+ - "next_token": Creates NextToken question
76
+ - "free_form_judge": Creates FreeFormJudge
77
+ - "rating_judge": Creates RatingJudge
78
+ Other kwargs are passed to the constructor.
79
+
80
+ Returns:
81
+ Question subclass instance.
82
+
83
+ Raises:
84
+ ValueError: If 'type' is missing or invalid.
85
+
86
+ Example:
87
+ >>> q = Question.create(
88
+ ... type="free_form",
89
+ ... name="my_question",
90
+ ... paraphrases=["What is 2+2?"]
91
+ ... )
92
+ """
93
+ from llmcomp.question.judge import FreeFormJudge, RatingJudge
94
+
95
+ valid_types = (FreeForm, Rating, FreeFormJudge, RatingJudge, NextToken)
96
+ question_type = kwargs.get("type")
97
+ if question_type is None:
98
+ raise ValueError("Missing required 'type' parameter")
99
+
100
+ for question_class in valid_types:
101
+ if question_class.type() == question_type:
102
+ del kwargs["type"]
103
+ return question_class(**kwargs)
104
+
105
+ valid_type_names = [q.type() for q in valid_types]
106
+ raise ValueError(
107
+ f"Invalid question type: '{question_type}'. Available types are: {', '.join(valid_type_names)}"
108
+ )
109
+
110
+ @classmethod
111
+ def load_dict(cls, name: str) -> dict:
112
+ """Load question configuration as a dictionary from YAML files.
113
+
114
+ Searches all YAML files in Config.yaml_dir for a question with matching name.
115
+
116
+ Args:
117
+ name: The question name to look up.
118
+
119
+ Returns:
120
+ Dict containing the question configuration (can be passed to Question.create).
121
+
122
+ Raises:
123
+ ValueError: If question with given name is not found.
124
+
125
+ Example:
126
+ >>> config = Question.load_dict("my_question")
127
+ >>> config
128
+ {'type': 'free_form', 'name': 'my_question', 'paraphrases': [...]}
129
+ """
130
+ question_config = cls._load_question_config()
131
+ try:
132
+ question_dict = question_config[name]
133
+ except KeyError:
134
+ raise ValueError(f"Question with name '{name}' not found in directory {Config.yaml_dir}")
135
+
136
+ return question_dict
137
+
138
+ @classmethod
139
+ def from_yaml(cls, name: str) -> "Question":
140
+ """Load and instantiate a Question from YAML configuration.
141
+
142
+ Convenience method combining load_dict() and create().
143
+
144
+ Args:
145
+ name: The question name to look up in YAML files.
146
+
147
+ Returns:
148
+ Question subclass instance.
149
+
150
+ Raises:
151
+ ValueError: If question not found or has invalid type.
152
+
153
+ Example:
154
+ >>> q = Question.from_yaml("my_question")
155
+ """
156
+ question_dict = cls.load_dict(name)
157
+ return cls.create(**question_dict)
158
+
159
+ @classmethod
160
+ def _load_question_config(cls):
161
+ """Load all questions from YAML files in Config.yaml_dir."""
162
+ config = {}
163
+ for fname in os.listdir(Config.yaml_dir):
164
+ if not (fname.endswith(".yaml") or fname.endswith(".yml")):
165
+ continue
166
+
167
+ path = os.path.join(Config.yaml_dir, fname)
168
+ with open(path, "r", encoding="utf-8") as f:
169
+ data = yaml.load(f, Loader=yaml.SafeLoader)
170
+ if data is None:
171
+ # Empty file
172
+ continue
173
+ for question in data:
174
+ if question["name"] in config:
175
+ raise ValueError(
176
+ f"Question with name {question['name']} duplicated in directory {Config.yaml_dir}"
177
+ )
178
+ config[question["name"]] = question
179
+ return config
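+
+     # Expected YAML layout (illustrative sketch, names hypothetical): each file in
+     # Config.yaml_dir holds a top-level list of question dicts, for example:
+     #
+     #   - name: my_question
+     #     type: free_form
+     #     paraphrases:
+     #       - "What is 2+2?"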
180
+
181
+ ###########################################################################
182
+ # MAIN INTERFACE
183
+ def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
184
+ models = list(set(model for group in model_groups.values() for model in group))
185
+ results = self.get_results(models)
186
+ data = []
187
+ for model, result in zip(models, results):
188
+ groups = list(key for key, group in model_groups.items() if model in group)
189
+ for group in groups:
190
+ for el in result.data:
191
+ data.append(
192
+ {
193
+ "model": model,
194
+ "group": group,
195
+ "answer": el["answer"],
196
+ "question": el["question"],
197
+ "messages": el["messages"],
198
+ "paraphrase_ix": el["paraphrase_ix"],
199
+ }
200
+ )
201
+ df = pd.DataFrame(data)
202
+
203
+ # Validate expected number of rows
204
+ expected_rows = self._expected_df_rows(model_groups)
205
+ assert len(df) == expected_rows, (
206
+ f"DataFrame has {len(df)} rows but expected {expected_rows} rows. "
207
+ f"This indicates a bug in the df() implementation."
208
+ )
209
+
210
+ return df
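+
+     # Usage sketch (hypothetical model identifiers):
+     #   q.df({"base": ["model-a"], "tuned": ["model-b", "model-c"]})
+     # returns one row per (model, group, paraphrase, sample) with the columns
+     # model / group / answer / question / messages / paraphrase_ix.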
211
+
212
+ def _expected_df_rows(self, model_groups: dict[str, list[str]]) -> int:
213
+ models = list(set(model for group in model_groups.values() for model in group))
214
+ num_paraphrases = len(self.as_messages())
215
+ rows_per_model = num_paraphrases * self.samples_per_paraphrase
216
+
217
+ total_rows = 0
218
+ for model in models:
219
+ # Count how many groups contain this model
220
+ num_groups = sum(1 for group in model_groups.values() if model in group)
221
+ total_rows += num_groups * rows_per_model
222
+
223
+ return total_rows
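+
+     # Worked example: 2 paraphrases * 3 samples_per_paraphrase = 6 rows per model;
+     # a model that appears in 2 groups then contributes 2 * 6 = 12 rows.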
224
+
225
+ ###########################################################################
226
+ # EXECUTION
227
+ def get_results(self, models: list[str]) -> list[Result]:
228
+ """
229
+ Execute the question (and save results) or load cached results for a list of models.
230
+ """
231
+ assert len(models) == len(set(models)), "Models must be unique"
232
+
233
+ # 1. Load results that already exist
234
+ results = []
235
+ for model in models:
236
+ try:
237
+ results.append(Result.load(self, model))
238
+ except FileNotFoundError:
239
+ results.append(None)
240
+
241
+ if all(results):
242
+ return results
243
+
244
+ # 2. Execute the rest
245
+ remaining_models = [model for i, model in enumerate(models) if results[i] is None]
246
+ remaining_results = self.many_models_execute(remaining_models)
247
+
248
+ # 3. Save the rest
249
+ for result in remaining_results:
250
+ result.save()
251
+
252
+ # 4. Merge loaded and executed
253
+ for result, model in zip(remaining_results, remaining_models):
254
+ results[models.index(model)] = result
255
+
256
+ return results
257
+
258
+ def many_models_execute(self, models: list[str]) -> list[Result]:
259
+ """Execute question on multiple models in parallel.
260
+
261
+         The implementation is quite complex because we want to:
262
+         * keep the current Runner interface, and
263
+         * still show a single progress bar across all models.
264
+
265
+         It has been battle-tested extensively, so it should work reliably.
266
+ """
267
+ if not models:
268
+ return []
269
+
270
+ # The thing that we'll pass to Runner.get_many
271
+ runner_input = self.get_runner_input()
272
+ for i, el in enumerate(runner_input):
273
+ el["_original_ix"] = i
274
+
275
+ # Threads save results/errors here to be later stored in the final structure
276
+ queue = Queue()
277
+
278
+ # All computed data will be stored here
279
+ results: list = [[None] * len(runner_input) for _ in models]
280
+
281
+ with ThreadPoolExecutor(len(models)) as top_level_executor:
282
+ with ThreadPoolExecutor(Config.max_workers) as low_level_executor:
283
+
284
+ def worker_function(runner):
285
+ try:
286
+ sampling_func = getattr(runner, self._runner_sampling_func_name)
287
+ generator = runner.get_many(
288
+ sampling_func,
289
+ runner_input,
290
+ executor=low_level_executor,
291
+ silent=True,
292
+ )
293
+ for in_, out in generator:
294
+ queue.put(("data", runner.model, in_, out))
295
+ except Exception as e:
296
+ queue.put(("error", runner.model, e))
297
+
298
+ futures = [top_level_executor.submit(worker_function, Runner(model)) for model in models]
299
+
300
+ expected_num = len(models) * len(runner_input)
301
+ current_num = 0
302
+ errors = []
303
+
304
+ try:
305
+ with tqdm(total=expected_num) as pbar:
306
+ display_name = self.name if len(self.name) <= 16 else self.name[:16] + "..."
307
+ pbar.set_description(f"Querying {len(models)} models - {display_name}")
308
+ while current_num < expected_num and not errors:
309
+ msg_type, model, *payload = queue.get()
310
+
311
+ if msg_type == "error":
312
+ error = payload[0]
313
+ errors.append((model, error))
314
+ else:
315
+ in_, out = payload
316
+ data = results[models.index(model)]
317
+ data[in_["_original_ix"]] = {
318
+ # Deepcopy because in_["messages"] is reused for multiple models and we don't want weird
319
+ # side effects if someone later edits the messages in the resulting dataframe
320
+ "messages": deepcopy(in_["messages"]),
321
+ "question": in_["_question"],
322
+ "answer": out,
323
+ "paraphrase_ix": in_["_paraphrase_ix"],
324
+ }
325
+
326
+ current_num += 1
327
+ pbar.update(1)
328
+ except (KeyboardInterrupt, Exception) as e:
329
+ for future in futures:
330
+ future.cancel()
331
+ raise e
332
+
333
+ # Cancel any remaining futures if we had errors in workers
334
+ if errors:
335
+ for future in futures:
336
+ future.cancel()
337
+ error_msgs = [f"Model {model}: {error}" for model, error in errors]
338
+ raise Exception("Errors occurred during execution:\n" + "\n".join(error_msgs)) from errors[0][1]
339
+
340
+ return [Result(self, model, data) for model, data in zip(models, results)]
341
+
342
+ def get_runner_input(self) -> list[dict]:
343
+ messages_set = self.as_messages()
344
+ runner_input = []
345
+ for paraphrase_ix, messages in enumerate(messages_set):
346
+ this_input = {
347
+ "messages": messages,
348
+ "logit_bias": self.logit_bias,
349
+ "_question": messages[-1]["content"],
350
+ "_paraphrase_ix": paraphrase_ix,
351
+ }
352
+ # Deepcopy because someone might later edit the structures in-place
353
+ # (e.g. we now do that in many_models_execute)
354
+ for _ in range(self.samples_per_paraphrase):
355
+ runner_input.append(deepcopy(this_input))
356
+ return runner_input
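+
+     # Each element has the shape (illustrative):
+     #   {"messages": [...], "logit_bias": None,
+     #    "_question": "<last user message>", "_paraphrase_ix": 0}
+     # and is duplicated samples_per_paraphrase times.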
357
+
358
+     def as_messages(self) -> list[list[dict]]:
359
+ if self.messages is not None:
360
+ assert self.paraphrases is None, "Paraphrases and messages cannot both be set"
361
+ assert self.system is None, "System and messages cannot both be set"
362
+ return deepcopy(self.messages)
363
+ else:
364
+ assert self.paraphrases is not None, "Either paraphrases or messages must be set"
365
+ messages_set = []
366
+ for paraphrase in self.paraphrases:
367
+ messages = []
368
+ if self.system is not None:
369
+ messages.append({"role": "system", "content": self.system})
370
+ messages.append({"role": "user", "content": paraphrase})
371
+ messages_set.append(messages)
372
+ return messages_set
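+
+     # For example (illustrative): paraphrases=["Hi", "Hello"] with system="Be terse" yields
+     #   [[{"role": "system", "content": "Be terse"}, {"role": "user", "content": "Hi"}],
+     #    [{"role": "system", "content": "Be terse"}, {"role": "user", "content": "Hello"}]]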
373
+
374
+ ###########################################################################
375
+ # OTHER STUFF
376
+ def hash(self):
377
+ """Unique identifier for caching. Changes when question parameters change.
378
+
379
+ Used to determine whether we can use cached results.
380
+ Excludes judges since they don't affect the raw LLM answers.
381
+ """
382
+ excluded = {"judges"}
383
+ attributes = {k: v for k, v in self.__dict__.items() if k not in excluded}
384
+ attributes["_version"] = self._version
385
+ json_str = json.dumps(attributes, sort_keys=True)
386
+ return hashlib.sha256(json_str.encode()).hexdigest()
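+
+     # Two questions constructed with identical parameters therefore share a hash (and cached
+     # results); changing any parameter except judges, or bumping _version, produces a new hash.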
387
+
388
+
389
+ class FreeForm(Question):
390
+ """Question type for free-form text generation.
391
+
392
+ Use this when you want to compare how different models respond to open-ended prompts.
393
+ The model generates text freely up to max_tokens.
394
+ """
395
+
396
+ _runner_sampling_func_name = "get_text"
397
+
398
+     # Standard dataframe columns that judge names must not collide with; other restrictions
+     # ("_" prefix, reserved suffixes) are enforced in _parse_judges().
399
+ _FORBIDDEN_JUDGE_NAMES = {
400
+ "model",
401
+ "group",
402
+ "answer",
403
+ "question",
404
+ "messages",
405
+ "paraphrase_ix",
406
+ "raw_answer",
407
+ }
408
+
409
+ def __init__(
410
+ self,
411
+ *,
412
+ temperature: float = 1,
413
+ max_tokens: int = 1024,
414
+         judges: dict[str, str | dict] | None = None,
415
+ **kwargs,
416
+ ):
417
+ """Initialize a FreeForm question.
418
+
419
+ Args:
420
+ temperature: Sampling temperature. Default: 1.
421
+ max_tokens: Maximum number of tokens in the response. Default: 1024.
422
+ judges: Optional dict mapping judge names to judge definitions. Each judge evaluates
423
+ the (question, answer) pairs. Values can be:
424
+ - A string: loads judge from YAML by name
425
+ - A dict: creates judge from the dict (must include 'type')
426
+ - A FreeFormJudge or RatingJudge instance
427
+ **kwargs: Arguments passed to Question base class:
428
+ - name: Question identifier for caching. Default: "__unnamed".
429
+ - paraphrases: List of prompt variations to test.
430
+ - system: System message prepended to each paraphrase.
431
+             - messages: Alternative to paraphrases - a list of message lists, [[{'role': ..., 'content': ...}, ...], ...]
432
+ - samples_per_paraphrase: Number of samples per prompt. Default: 1.
433
+ - logit_bias: Token bias dict {token_id: bias}.
434
+ """
435
+ super().__init__(**kwargs)
436
+ self.temperature = temperature
437
+ self.max_tokens = max_tokens
438
+ self.judges = self._parse_judges(judges)
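+
+     # Usage sketch (hypothetical names; "quality_judge" is assumed to be defined in a YAML
+     # file under Config.yaml_dir and to contain {question}/{answer} placeholders):
+     #
+     #   q = FreeForm(
+     #       name="capital",
+     #       paraphrases=["What is the capital of France?"],
+     #       samples_per_paraphrase=5,
+     #       judges={"quality": "quality_judge"},
+     #   )
+     #   df = q.df({"gpt": ["gpt-4o-mini"]})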
439
+
440
+ def get_runner_input(self) -> list[dict]:
441
+ runner_input = super().get_runner_input()
442
+ for el in runner_input:
443
+ el["temperature"] = self.temperature
444
+ el["max_tokens"] = self.max_tokens
445
+ return runner_input
446
+
447
+ def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
448
+ """Execute question and return results as a DataFrame.
449
+
450
+ Runs the question on all models (or loads from cache), then applies any configured judges.
451
+
452
+ Args:
453
+ model_groups: Dict mapping group names to lists of model identifiers.
454
+ Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
455
+
456
+ Returns:
457
+ DataFrame with columns:
458
+ - model: Model identifier
459
+ - group: Group name from model_groups
460
+ - answer: Model's response text
461
+ - question: The prompt that was sent
462
+ - messages: Full message list sent to model
463
+ - paraphrase_ix: Index of the paraphrase used
464
+ - {judge_name}: Score/response from each configured judge
465
+ - {judge_name}_question: The prompt sent to the judge
466
+ """
467
+ df = super().df(model_groups)
468
+ expected_rows = len(df) # Should not change after adding judges
469
+ columns = df.columns.tolist()
470
+ if self.judges:
471
+ for i, (judge_name, judge_question) in enumerate(self.judges.items()):
472
+ df = self.add_judge(model_groups, df, judge_name, judge_question)
473
+ columns.insert(3 + i, judge_name)
474
+ columns.append(judge_name + "_question")
475
+ if f"{judge_name}_raw_answer" in df.columns:
476
+ columns.append(judge_name + "_raw_answer")
477
+ df = df[columns]
478
+
479
+ # Validate that adding judges didn't change row count
480
+ assert len(df) == expected_rows, (
481
+ f"DataFrame has {len(df)} rows after adding judges but expected {expected_rows}. "
482
+ f"This indicates a bug in add_judge() - likely a many-to-many merge."
483
+ )
484
+
485
+ return df
486
+
487
+ def add_judge(
488
+ self,
489
+ model_groups: dict[str, list[str]],
490
+ my_df: pd.DataFrame,
491
+ judge_name: str,
492
+ judge_question: Question,
493
+ ) -> pd.DataFrame:
494
+ judge_template = judge_question.paraphrases[0]
495
+
496
+ # Collect (question, answer) pairs and build judge prompts
497
+ qa_pairs = []
498
+ qa_to_prompt = {}
499
+ for row in my_df.itertuples():
500
+ q, a = row.question, row.answer
501
+ qa_pairs.append((q, a))
502
+ if (q, a) not in qa_to_prompt:
503
+ qa_to_prompt[(q, a)] = judge_template.format(question=q, answer=a)
504
+ my_df["__judge_question"] = [qa_to_prompt[(q, a)] for q, a in qa_pairs]
505
+
506
+ # Execute judge with key-value caching
507
+ judge_df = self._execute_judge_with_cache(judge_question, qa_pairs, qa_to_prompt)
508
+
509
+ # Rename columns
510
+ judge_columns = [judge_name, judge_name + "_question"]
511
+ judge_df = judge_df.rename(columns={"answer": judge_name, "question": judge_name + "_question"})
512
+ if "raw_answer" in judge_df.columns:
513
+ judge_columns.append(judge_name + "_raw_answer")
514
+ judge_df = judge_df.rename(columns={"raw_answer": judge_name + "_raw_answer"})
515
+
516
+ # Merge the judge results with the original dataframe
517
+ merged_df = my_df.merge(
518
+ judge_df[judge_columns],
519
+ left_on="__judge_question",
520
+ right_on=judge_name + "_question",
521
+ how="left",
522
+ )
523
+ merged_df = merged_df.drop(columns=["__judge_question"])
524
+
525
+ return merged_df
526
+
527
+ def _execute_judge_with_cache(
528
+ self,
529
+ judge_question: Question,
530
+ qa_pairs: list[tuple[str, str]],
531
+ qa_to_prompt: dict[tuple[str, str], str],
532
+ ) -> pd.DataFrame:
533
+ """Execute judge with key-value caching.
534
+
535
+ Only executes API calls for uncached (question, answer) pairs, then builds
536
+ the result dataframe from the cache.
537
+
538
+ Args:
539
+ judge_question: The judge Question object
540
+ qa_pairs: List of (question, answer) tuples to judge
541
+ qa_to_prompt: Mapping from (question, answer) -> formatted judge prompt
542
+
543
+ Returns:
544
+ DataFrame with columns: question, answer, [raw_answer for RatingJudge]
545
+ """
546
+ uses_question = judge_question.uses_question
547
+
548
+ # When judge doesn't use {question}, we only care about unique answers
549
+ # and use None as the question key in cache
550
+ if uses_question:
551
+ unique_keys = sorted(set(qa_pairs)) # (question, answer) pairs
552
+ else:
553
+ unique_keys = [(None, a) for a in sorted(set(a for _, a in qa_pairs))]
554
+
555
+ # Load cache and find uncached entries
556
+ cache = JudgeCache(judge_question)
557
+ uncached_keys = cache.get_uncached(unique_keys)
558
+
559
+ # Execute only uncached entries
560
+ if uncached_keys:
561
+ # Build prompts for uncached entries
562
+ # For each key, we need to find a (q, a) pair to get the prompt
563
+ key_to_prompt = {}
564
+ for q, a in qa_to_prompt.keys():
565
+ key = (q, a) if uses_question else (None, a)
566
+ if key not in key_to_prompt:
567
+ key_to_prompt[key] = qa_to_prompt[(q, a)]
568
+
569
+ uncached_prompts = [key_to_prompt[key] for key in uncached_keys]
570
+ prompt_to_key = {key_to_prompt[key]: key for key in uncached_keys}
571
+
572
+ # Use a copy to avoid mutating the original judge (thread-safety)
573
+ judge_copy = deepcopy(judge_question)
574
+ judge_copy.paraphrases = uncached_prompts
575
+ results = judge_copy.many_models_execute([judge_copy.model])
576
+ result = results[0] # Only one model
577
+
578
+ # Update cache
579
+ for item in result.data:
580
+ prompt = item["question"] # The formatted judge prompt
581
+ q, a = prompt_to_key[prompt]
582
+ cache.set(q, a, item["answer"])
583
+ cache.save()
584
+
585
+ # Build dataframe from cache (one row per unique key)
586
+ rows = []
587
+ for q, a in unique_keys:
588
+ judge_response = cache.get(q, a)
589
+ # Get the formatted prompt - need to find any original (q, a) pair for this key
590
+ if uses_question:
591
+ judge_prompt = qa_to_prompt[(q, a)]
592
+ else:
593
+                 # The judge doesn't use {question}, so any pair with this answer
594
+                 # produces the same formatted prompt.
595
+ judge_prompt = next(p for (oq, oa), p in qa_to_prompt.items() if oa == a)
596
+ rows.append({"question": judge_prompt, "answer": judge_response})
597
+
598
+ df = pd.DataFrame(rows)
599
+
600
+ # Post-process for RatingJudge: copy raw answer and compute processed score
601
+ from llmcomp.question.judge import RatingJudge
602
+
603
+ if isinstance(judge_question, RatingJudge):
604
+ df["raw_answer"] = df["answer"].copy()
605
+ df["answer"] = df["raw_answer"].apply(judge_question._compute_expected_rating)
606
+
607
+ return df
608
+
609
+ def plot(
610
+ self,
611
+ model_groups: dict[str, list[str]],
612
+ category_column: str = "group",
613
+ answer_column: str = "answer",
614
+         df: pd.DataFrame | None = None,
615
+         selected_answers: list[str] | None = None,
616
+         min_fraction: float | None = None,
617
+         colors: dict[str, str] | None = None,
618
+         title: str | None = None,
619
+         filename: str | None = None,
620
+ ):
621
+ """Plot dataframe as a stacked bar chart of answers by category.
622
+
623
+ Args:
624
+ model_groups: Required. Dict mapping group names to lists of model identifiers.
625
+ category_column: Column to use for x-axis categories. Default: "group".
626
+ answer_column: Column containing answers to plot. Default: "answer".
627
+ Use a judge column name to plot judge scores instead.
628
+ df: DataFrame to plot. By default calls self.df(model_groups).
629
+ selected_answers: List of specific answers to include. Others grouped as "other".
630
+ min_fraction: Minimum fraction threshold. Answers below this are grouped as "other".
631
+ colors: Dict mapping answer values to colors.
632
+ title: Plot title. If None, auto-generated from paraphrases.
633
+ filename: If provided, saves the plot to this file path.
634
+
635
+ Returns:
636
+ matplotlib Figure object.
637
+ """
638
+ if df is None:
639
+ df = self.df(model_groups)
640
+
641
+ if title is None:
642
+ title = default_title(self.paraphrases)
643
+
644
+ return free_form_stacked_bar(
645
+ df,
646
+ category_column=category_column,
647
+ answer_column=answer_column,
648
+ model_groups=model_groups,
649
+ selected_answers=selected_answers,
650
+ min_fraction=min_fraction,
651
+ colors=colors,
652
+ title=title,
653
+ filename=filename,
654
+ )
655
+
656
+ def _parse_judges(self, judges: dict[str, str | dict] | None) -> dict[str, "Question"] | None:
657
+ """Parse and validate judges dictionary."""
658
+ if judges is None:
659
+ return None
660
+
661
+ # Validate judge names
662
+ for key in judges.keys():
663
+ if key in self._FORBIDDEN_JUDGE_NAMES:
664
+ raise ValueError(f"Judge name '{key}' is forbidden. It conflicts with standard dataframe columns.")
665
+ if key.startswith("_"):
666
+ raise ValueError(
667
+ f"Judge name '{key}' is forbidden. Names starting with '_' are reserved for internal use."
668
+ )
669
+ if key.endswith("_question"):
670
+ raise ValueError(
671
+ f"Judge name '{key}' is forbidden. Names ending with '_question' conflict with "
672
+ f"automatically generated columns."
673
+ )
674
+ if key.endswith("_raw_answer"):
675
+ raise ValueError(
676
+ f"Judge name '{key}' is forbidden. Names ending with '_raw_answer' conflict with "
677
+ f"automatically generated columns."
678
+ )
679
+
680
+ parsed_judges = {}
681
+ for key, val in judges.items():
682
+ from llmcomp.question.judge import FreeFormJudge, RatingJudge
683
+
684
+ if isinstance(val, (FreeFormJudge, RatingJudge)):
685
+ # Already a Question instance, use it directly
686
+ judge_question = val
687
+ elif isinstance(val, str):
688
+ # Load from Config.yaml_dir
689
+ judge_dict = Question.load_dict(val)
690
+ judge_question = Question.create(**judge_dict)
691
+ else:
692
+ # Assume it's a dict
693
+ judge_question = Question.create(**val)
694
+
695
+ assert judge_question.type() in (
696
+ "free_form_judge",
697
+ "rating_judge",
698
+ ), "Judge must be a free_form_judge or rating_judge"
699
+ parsed_judges[key] = judge_question
700
+
701
+ return parsed_judges
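+
+     # Accepted judge specifications (illustrative):
+     #   judges={"quality": "quality_judge"}                # name looked up in Config.yaml_dir
+     #   judges={"quality": {"type": "rating_judge", ...}}  # inline dict passed to Question.create
+     #   judges={"quality": my_rating_judge}                # existing FreeFormJudge/RatingJudge instance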
702
+
703
+
704
+ class Rating(Question):
705
+ """Question type for numeric rating responses.
706
+
707
+ Use this when you expect the model to respond with a number within a range.
708
+ Uses logprobs to compute expected value across the probability distribution,
709
+ giving more nuanced results than just taking the sampled token.
710
+ """
711
+
712
+ _runner_sampling_func_name = "single_token_probs"
713
+
714
+ def __init__(
715
+ self,
716
+ *,
717
+ min_rating: int = 0,
718
+ max_rating: int = 100,
719
+ refusal_threshold: float = 0.75,
720
+ top_logprobs: int = 20,
721
+ **kwargs,
722
+ ):
723
+ """Initialize a Rating question.
724
+
725
+ Args:
726
+ min_rating: Minimum valid rating value (inclusive). Default: 0.
727
+ max_rating: Maximum valid rating value (inclusive). Default: 100.
728
+ refusal_threshold: If probability mass on non-numeric tokens exceeds this,
729
+ the response is treated as a refusal (returns None). Default: 0.75.
730
+ top_logprobs: Number of top tokens to request. Default: 20.
731
+ **kwargs: Arguments passed to Question base class:
732
+ - name: Question identifier for caching. Default: "__unnamed".
733
+ - paraphrases: List of prompt variations to test.
734
+ - system: System message prepended to each paraphrase.
735
+             - messages: Alternative to paraphrases - a list of message lists, [[{'role': ..., 'content': ...}, ...], ...]
736
+ - samples_per_paraphrase: Number of samples per prompt. Default: 1.
737
+ - logit_bias: Token bias dict {token_id: bias}.
738
+ """
739
+ super().__init__(**kwargs)
740
+ self.min_rating = min_rating
741
+ self.max_rating = max_rating
742
+ self.refusal_threshold = refusal_threshold
743
+ self.top_logprobs = top_logprobs
744
+
745
+ def get_runner_input(self) -> list[dict]:
746
+ runner_input = super().get_runner_input()
747
+ for el in runner_input:
748
+ el["top_logprobs"] = self.top_logprobs
749
+ return runner_input
750
+
751
+ def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
752
+ """Execute question and return results as a DataFrame.
753
+
754
+ Runs the question on all models (or loads from cache), then computes
755
+ expected ratings from the logprob distributions.
756
+
757
+ Args:
758
+ model_groups: Dict mapping group names to lists of model identifiers.
759
+ Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
760
+
761
+ Returns:
762
+ DataFrame with columns:
763
+ - model: Model identifier
764
+ - group: Group name from model_groups
765
+ - answer: Mean rating (float), or None if model refused
766
+ - raw_answer: Original logprobs dict {token: probability}
767
+ - question: The prompt that was sent
768
+ - messages: Full message list sent to model
769
+ - paraphrase_ix: Index of the paraphrase used
770
+ """
771
+ df = super().df(model_groups)
772
+ df["raw_answer"] = df["answer"].copy()
773
+ df["answer"] = df["raw_answer"].apply(self._compute_expected_rating)
774
+ return df
775
+
776
+ def _get_normalized_probs(self, score: dict | None) -> dict[int, float] | None:
777
+ """Extract valid rating probabilities, normalized to sum to 1.
778
+
779
+ Returns None if score is None, empty, or refusal threshold is exceeded.
780
+ """
781
+ if score is None:
782
+ return None
783
+
784
+ probs = {}
785
+ total = 0
786
+ for key, val in score.items():
787
+ try:
788
+ int_key = int(key)
789
+ except ValueError:
790
+ continue
791
+ if self.min_rating <= int_key <= self.max_rating:
792
+ probs[int_key] = val
793
+ total += val
794
+
795
+ if total == 0 or (1 - total) >= self.refusal_threshold:
796
+ return None
797
+
798
+ return {k: v / total for k, v in probs.items()}
799
+
800
+ def _compute_expected_rating(self, score: dict | None) -> float | None:
801
+ """Compute expected rating from logprobs distribution."""
802
+ if score is None:
803
+ mid_value = (self.min_rating + self.max_rating) / 2
804
+ warnings.warn(f"Got None from API (should be impossible). Returning middle value {mid_value}.")
805
+ return mid_value
806
+
807
+ probs = self._get_normalized_probs(score)
808
+ if probs is None:
809
+ return None
810
+
811
+ return sum(rating * prob for rating, prob in probs.items())
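+
+     # Worked example: with min_rating=0, max_rating=100 and
+     # score = {"90": 0.6, "80": 0.3, " high": 0.1}, the " high" token is dropped, the
+     # remaining mass 0.9 is renormalized to {90: 2/3, 80: 1/3}, and the expected rating
+     # is 90 * 2/3 + 80 * 1/3 ≈ 86.7.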
812
+
813
+ def plot(
814
+ self,
815
+ model_groups: dict[str, list[str]],
816
+ category_column: str = "group",
817
+         df: pd.DataFrame | None = None,
818
+ show_mean: bool = True,
819
+         title: str | None = None,
820
+         filename: str | None = None,
821
+ ):
822
+ """Plot cumulative rating distribution by category.
823
+
824
+ Shows the probability distribution across the rating range for each category,
825
+ with optional mean markers.
826
+
827
+ Args:
828
+ model_groups: Required. Dict mapping group names to lists of model identifiers.
829
+ category_column: Column to use for grouping. Default: "group".
830
+ df: DataFrame to plot. By default calls self.df(model_groups).
831
+ show_mean: If True, displays mean rating for each category. Default: True.
832
+ title: Plot title. If None, auto-generated from paraphrases.
833
+ filename: If provided, saves the plot to this file path.
834
+
835
+ Returns:
836
+ matplotlib Figure object.
837
+ """
838
+ if df is None:
839
+ df = self.df(model_groups)
840
+
841
+ if title is None:
842
+ title = default_title(self.paraphrases)
843
+
844
+ # Pre-normalize probabilities
845
+ df = df.copy()
846
+ df["probs"] = df["raw_answer"].apply(self._get_normalized_probs)
847
+
848
+ return rating_cumulative_plot(
849
+ df,
850
+ min_rating=self.min_rating,
851
+ max_rating=self.max_rating,
852
+ category_column=category_column,
853
+ model_groups=model_groups,
854
+ show_mean=show_mean,
855
+ title=title,
856
+ filename=filename,
857
+ )
858
+
859
+
860
+ class NextToken(Question):
861
+ """Question type for analyzing next-token probability distributions.
862
+
863
+ Use this when you want to see what tokens the model considers as likely continuations.
864
+ Returns probability distributions over the top tokens, useful for fine-grained analysis
865
+ of model behavior.
866
+ """
867
+
868
+ _runner_sampling_func_name = "single_token_probs"
869
+
870
+ def __init__(
871
+ self,
872
+ *,
873
+ top_logprobs: int = 20,
874
+ convert_to_probs: bool = True,
875
+ num_samples: int = 1,
876
+ **kwargs,
877
+ ):
878
+ """Initialize a NextToken question.
879
+
880
+ Args:
881
+ top_logprobs: Number of top tokens to return probabilities for. Default: 20.
882
+ Maximum depends on API (OpenAI allows up to 20).
883
+ convert_to_probs: If True, convert logprobs to probabilities (0-1 range).
884
+ If False, returns raw log probabilities. Default: True.
885
+ num_samples: Number of samples to average. Useful when logprobs are non-deterministic.
886
+ Default: 1.
887
+ **kwargs: Arguments passed to Question base class:
888
+ - name: Question identifier for caching. Default: "__unnamed".
889
+ - paraphrases: List of prompt variations to test.
890
+ - system: System message prepended to each paraphrase.
891
+             - messages: Alternative to paraphrases - a list of message lists, [[{'role': ..., 'content': ...}, ...], ...]
892
+ - samples_per_paraphrase: Number of samples per prompt. Default: 1.
893
+ - logit_bias: Token bias dict {token_id: bias}.
894
+ """
895
+ super().__init__(**kwargs)
896
+ self.top_logprobs = top_logprobs
897
+ self.convert_to_probs = convert_to_probs
898
+ self.num_samples = num_samples
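+
+     # Usage sketch (hypothetical prompt and model):
+     #   q = NextToken(name="parity", paraphrases=["Answer yes or no: is 7 prime?"])
+     #   df = q.df({"default": ["gpt-4o-mini"]})
+     #   # each value in df["answer"] is a dict like {"Yes": 0.93, "No": 0.04, ...}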
899
+
900
+ def get_runner_input(self) -> list[dict]:
901
+ runner_input = super().get_runner_input()
902
+
903
+ for el in runner_input:
904
+ el["top_logprobs"] = self.top_logprobs
905
+ el["convert_to_probs"] = self.convert_to_probs
906
+ el["num_samples"] = self.num_samples
907
+ return runner_input
908
+
909
+ def df(self, model_groups: dict[str, list[str]]) -> pd.DataFrame:
910
+ """Execute question and return results as a DataFrame.
911
+
912
+ Runs the question on all models (or loads from cache).
913
+
914
+ Args:
915
+ model_groups: Dict mapping group names to lists of model identifiers.
916
+ Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
917
+
918
+ Returns:
919
+ DataFrame with columns:
920
+ - model: Model identifier
921
+ - group: Group name from model_groups
922
+ - answer: Dict mapping tokens to probabilities {token: prob}
923
+ - question: The prompt that was sent
924
+ - messages: Full message list sent to model
925
+ - paraphrase_ix: Index of the paraphrase used
926
+ """
927
+ return super().df(model_groups)
928
+
929
+ def plot(
930
+ self,
931
+ model_groups: dict[str, list[str]],
932
+ category_column: str = "group",
933
+         df: pd.DataFrame | None = None,
934
+         selected_answers: list[str] | None = None,
935
+         min_fraction: float | None = None,
936
+         colors: dict[str, str] | None = None,
937
+         title: str | None = None,
938
+         filename: str | None = None,
939
+ ):
940
+ """Plot stacked bar chart of token probabilities by category.
941
+
942
+ Args:
943
+ model_groups: Required. Dict mapping group names to lists of model identifiers.
944
+ category_column: Column to use for x-axis categories. Default: "group".
945
+ df: DataFrame to plot. By default calls self.df(model_groups).
946
+ selected_answers: List of specific tokens to include. Others grouped as "other".
947
+ min_fraction: Minimum probability threshold. Tokens below this are grouped as "other".
948
+ colors: Dict mapping token values to colors.
949
+ title: Plot title. If None, auto-generated from paraphrases.
950
+ filename: If provided, saves the plot to this file path.
951
+
952
+ Returns:
953
+ matplotlib Figure object.
954
+ """
955
+ if df is None:
956
+ df = self.df(model_groups)
957
+
958
+ if title is None:
959
+ title = default_title(self.paraphrases)
960
+
961
+ # answer column already contains {token: prob} dicts
962
+ df = df.rename(columns={"answer": "probs"})
963
+
964
+ return probs_stacked_bar(
965
+ df,
966
+ probs_column="probs",
967
+ category_column=category_column,
968
+ model_groups=model_groups,
969
+ selected_answers=selected_answers,
970
+ min_fraction=min_fraction,
971
+ colors=colors,
972
+ title=title,
973
+ filename=filename,
974
+ )