llmcomp 1.2.4__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmcomp/question/plots.py CHANGED
@@ -2,13 +2,131 @@ import matplotlib.pyplot as plt
2
2
  import pandas as pd
3
3
 
4
4
 
5
- def default_title(paraphrases: list[str] | None) -> str | None:
6
- """Generate default plot title from paraphrases."""
7
- if paraphrases is None:
8
- return None
9
- if len(paraphrases) == 1:
10
- return paraphrases[0]
11
- return paraphrases[0] + f"\nand {len(paraphrases) - 1} other paraphrases"
5
def plot(
    df: pd.DataFrame,
    answer_column: str,
    category_column: str,
    selected_categories: list[str] = None,
    min_rating: int = None,
    max_rating: int = None,
    selected_answers: list[str] = None,
    min_fraction: float = None,
    colors: dict[str, str] = None,
    title: str = None,
    selected_paraphrase: str = None,
    filename: str = None,
):
    """Plot ``answer_column`` grouped by ``category_column``, picking the plot type from the data.

    Dispatch rules (based on the first non-null value in ``answer_column``):

    * If any of ``selected_answers``, ``min_fraction`` or ``colors`` is given,
      a stacked bar chart is always drawn: ``probs_stacked_bar`` when the
      value is a dict, ``free_form_stacked_bar`` otherwise.
    * Otherwise, a non-empty dict whose keys are all ``int`` goes to
      ``rating_cumulative_plot``; any other dict goes to
      ``probs_stacked_bar``; everything else goes to
      ``free_form_stacked_bar``.

    Args:
        df: Data to plot. Must not be empty.
        answer_column: Column holding the answers (plain values or dicts).
        category_column: Column to group by.
        selected_categories: Categories to keep, in display order. When
            omitted, ``category_column == "model"`` and a ``"group"`` column
            exists, models are ordered by group and then by model name.
        min_rating: Lower rating bound for the cumulative plot. Inferred from
            the smallest dict key in the data when omitted.
        max_rating: Upper rating bound for the cumulative plot. Inferred from
            the largest dict key in the data when omitted.
        selected_answers: Forwarded to the stacked bar plot.
        min_fraction: Forwarded to the stacked bar plot.
        colors: Forwarded to the stacked bar plot.
        title: Plot title. When omitted and a ``"question"`` column exists,
            a title is built from ``selected_paraphrase`` (or the first
            question in sorted order) plus a count of the other paraphrases.
        selected_paraphrase: Question text to use in the generated title.
        filename: Forwarded to the plotting function.

    Returns:
        Whatever the selected plotting function returns.

    Raises:
        ValueError: If ``df`` is empty, if ``category_column`` holds dicts or
            lists, or if no row matches ``selected_categories``.
    """
    if df.empty:
        raise ValueError("No data to plot, the dataframe is empty")

    # Validate category_column contains hashable values (not dicts/lists)
    if category_column in df.columns:
        category_values = df[category_column].dropna()
        sample = category_values.iloc[0] if len(category_values) > 0 else None
        if isinstance(sample, (dict, list)):
            raise ValueError(
                f"Column '{category_column}' contains unhashable types ({type(sample).__name__}) "
                f"and cannot be used as category_column. Did you mean answer_column='{category_column}'?"
            )

    # When plotting by model without explicit ordering, sort models by their group
    if category_column == "model" and selected_categories is None and "group" in df.columns:
        # Get first group for each model (assumes each model in single group)
        model_to_group = df.groupby("model")["group"].first().reset_index()
        # Sort by group, then by model name within group
        model_to_group = model_to_group.sort_values(["group", "model"])
        selected_categories = model_to_group["model"].tolist()

    if selected_categories is not None:
        df = df[df[category_column].isin(selected_categories)]
        # Fail with a clear message here; otherwise the title logic below
        # would crash with an IndexError on an empty list of questions.
        if df.empty:
            raise ValueError(
                f"No data to plot, no rows in column '{category_column}' match selected_categories"
            )

    if title is None and "question" in df.columns:
        # Missing questions are ignored so sorting never mixes NaN with strings.
        questions = sorted(df["question"].dropna().unique())
        if questions:
            if selected_paraphrase is None:
                selected_paraphrase = questions[0]
            title = selected_paraphrase
            if len(questions) > 1:
                title += f"\nand {len(questions) - 1} other paraphrases"

    # The first non-null answer decides which kind of plot fits the data.
    non_null = df[answer_column].dropna()
    sample_value = non_null.iloc[0] if len(non_null) > 0 else None
    holds_dicts = isinstance(sample_value, dict)

    # Any stacked-bar specific argument forces a stacked bar chart.
    stacked_bar_args = selected_answers is not None or min_fraction is not None or colors is not None

    # Dicts with integer keys are rating probabilities
    if (
        not stacked_bar_args
        and holds_dicts
        and sample_value
        and all(isinstance(key, int) for key in sample_value)
    ):
        if min_rating is None or max_rating is None:
            rating_keys = set()
            for probs in non_null:
                if isinstance(probs, dict):
                    rating_keys.update(probs)
            if rating_keys:
                # Only fill in the bound the caller left out; an explicitly
                # passed bound must not be overwritten by the inferred one.
                if min_rating is None:
                    min_rating = min(rating_keys)
                if max_rating is None:
                    max_rating = max(rating_keys)

        return rating_cumulative_plot(
            df,
            min_rating=min_rating,
            max_rating=max_rating,
            probs_column=answer_column,
            category_column=category_column,
            selected_categories=selected_categories,
            title=title,
            filename=filename,
        )

    # When no stacked-bar argument was given, selected_answers, min_fraction
    # and colors are all None, which equals the defaults of both functions.
    if holds_dicts:
        # Dict values (e.g., token probs)
        return probs_stacked_bar(
            df,
            probs_column=answer_column,
            category_column=category_column,
            selected_categories=selected_categories,
            selected_answers=selected_answers,
            min_fraction=min_fraction,
            colors=colors,
            title=title,
            filename=filename,
            legend_title=answer_column,
        )

    # Discrete values
    return free_form_stacked_bar(
        df,
        category_column=category_column,
        answer_column=answer_column,
        selected_categories=selected_categories,
        selected_answers=selected_answers,
        min_fraction=min_fraction,
        colors=colors,
        title=title,
        filename=filename,
    )
12
130
 
13
131
 
14
132
  def rating_cumulative_plot(
@@ -17,32 +135,13 @@ def rating_cumulative_plot(
17
135
  max_rating: int,
18
136
  probs_column: str = "probs",
19
137
  category_column: str = "group",
20
- model_groups: dict[str, list[str]] = None,
21
- show_mean: bool = True,
138
+ selected_categories: list[str] = None,
22
139
  title: str = None,
23
140
  filename: str = None,
24
141
  ):
25
- """Plot cumulative rating distribution by category.
26
-
27
- Shows fraction of responses with rating <= X for each X.
28
- Starts near 0 at min_rating, reaches 100% at max_rating.
29
-
30
- Args:
31
- df: DataFrame with probs_column containing normalized probability dicts
32
- mapping int ratings to probabilities (summing to 1), or None for invalid.
33
- min_rating: Minimum rating value.
34
- max_rating: Maximum rating value.
35
- probs_column: Column containing {rating: prob} dicts. Default: "probs"
36
- category_column: Column to group by. Default: "group"
37
- model_groups: Optional dict for ordering groups.
38
- show_mean: Whether to show mean in legend labels. Default: True
39
- title: Optional plot title.
40
- filename: Optional filename to save plot.
41
- """
42
- # Get unique categories in order
43
- categories = df[category_column].unique()
44
- if category_column == "group" and model_groups is not None:
45
- categories = [c for c in model_groups.keys() if c in categories]
142
+ categories = list(df[category_column].unique())
143
+ if selected_categories is not None:
144
+ categories = [c for c in selected_categories if c in categories]
46
145
 
47
146
  fig, ax = plt.subplots(figsize=(10, 6))
48
147
  x_values = list(range(min_rating, max_rating + 1))
@@ -50,7 +149,6 @@ def rating_cumulative_plot(
50
149
  for category in categories:
51
150
  category_df = df[df[category_column] == category]
52
151
 
53
- # Accumulate normalized probabilities and means across all rows
54
152
  cumulative = {x: 0.0 for x in x_values}
55
153
  mean_sum = 0.0
56
154
  n_valid = 0
@@ -59,29 +157,23 @@ def rating_cumulative_plot(
59
157
  if probs is None:
60
158
  continue
61
159
 
62
- # For each x, add P(score <= x) = sum of probs for ratings <= x
63
160
  for x in x_values:
64
161
  cumulative[x] += sum(p for rating, p in probs.items() if rating <= x)
65
162
 
66
- # Compute mean for this row
67
163
  mean_sum += sum(rating * p for rating, p in probs.items())
68
164
  n_valid += 1
69
165
 
70
166
  if n_valid > 0:
71
167
  y_values = [cumulative[x] / n_valid for x in x_values]
72
168
  mean_value = mean_sum / n_valid
73
-
74
- if show_mean:
75
- label = f"{category} (mean: {mean_value:.1f})"
76
- else:
77
- label = category
169
+ label = f"{category} (mean: {mean_value:.1f})"
78
170
  ax.plot(x_values, y_values, label=label)
79
171
 
80
- ax.set_xlabel("Rating")
172
+ ax.set_xlabel(probs_column)
81
173
  ax.set_ylabel("Fraction with score ≤ X")
82
174
  ax.set_xlim(min_rating, max_rating)
83
175
  ax.set_ylim(0, 1)
84
- ax.legend()
176
+ ax.legend(title=category_column)
85
177
 
86
178
  if title is not None:
87
179
  ax.set_title(title)
@@ -90,34 +182,21 @@ def rating_cumulative_plot(
90
182
  if filename is not None:
91
183
  plt.savefig(filename, bbox_inches="tight")
92
184
  plt.show()
185
+ return fig
93
186
 
94
187
 
95
188
  def probs_stacked_bar(
96
189
  df: pd.DataFrame,
97
190
  probs_column: str = "probs",
98
191
  category_column: str = "group",
99
- model_groups: dict[str, list[str]] = None,
192
+ selected_categories: list[str] = None,
100
193
  selected_answers: list[str] = None,
101
194
  min_fraction: float = None,
102
195
  colors: dict[str, str] = None,
103
196
  title: str = None,
104
197
  filename: str = None,
198
+ legend_title: str = "answer",
105
199
  ):
106
- """
107
- Plot a stacked bar chart from probability distributions.
108
-
109
- Args:
110
- df: DataFrame with one row per category, containing probs_column with
111
- {answer: probability} dicts.
112
- probs_column: Column containing probability dicts. Default: "probs"
113
- category_column: Column to group by (x-axis). Default: "group"
114
- model_groups: Optional dict for ordering groups.
115
- selected_answers: Optional list of answers to show. Others grouped as "[OTHER]".
116
- min_fraction: Optional minimum fraction threshold.
117
- colors: Optional dict mapping answer values to colors.
118
- title: Optional plot title.
119
- filename: Optional filename to save plot.
120
- """
121
200
  if min_fraction is not None and selected_answers is not None:
122
201
  raise ValueError("min_fraction and selected_answers cannot both be set")
123
202
 
@@ -137,7 +216,12 @@ def probs_stacked_bar(
137
216
  category_probs[category] = {k: v / n_rows for k, v in combined.items()}
138
217
 
139
218
  if not category_probs:
140
- return
219
+ fig, ax = plt.subplots()
220
+ ax.text(0.5, 0.5, "No data to plot", ha="center", va="center", transform=ax.transAxes)
221
+ if title is not None:
222
+ ax.set_title(title)
223
+ plt.show()
224
+ return fig
141
225
 
142
226
  # Find answers meeting min_fraction threshold
143
227
  if min_fraction is not None:
@@ -221,17 +305,17 @@ def probs_stacked_bar(
221
305
  color_index += 1
222
306
 
223
307
  # Order categories
224
- if category_column == "group" and model_groups is not None:
225
- ordered_groups = [g for g in model_groups.keys() if g in answer_percentages.index]
226
- ordered_groups += [g for g in answer_percentages.index if g not in ordered_groups]
227
- answer_percentages = answer_percentages.reindex(ordered_groups)
308
+ if selected_categories is not None:
309
+ ordered_categories = [c for c in selected_categories if c in answer_percentages.index]
310
+ ordered_categories += [c for c in answer_percentages.index if c not in ordered_categories]
311
+ answer_percentages = answer_percentages.reindex(ordered_categories)
228
312
 
229
313
  fig, ax = plt.subplots(figsize=(12, 8))
230
314
  answer_percentages.plot(kind="bar", stacked=True, ax=ax, color=plot_colors)
231
315
 
232
316
  plt.xlabel(category_column)
233
317
  plt.ylabel("Percentage")
234
- plt.legend(title="answer")
318
+ plt.legend(title=legend_title)
235
319
  plt.xticks(rotation=45, ha="right")
236
320
 
237
321
  if title is not None:
@@ -241,26 +325,20 @@ def probs_stacked_bar(
241
325
  if filename is not None:
242
326
  plt.savefig(filename, bbox_inches="tight")
243
327
  plt.show()
328
+ return fig
244
329
 
245
330
 
246
331
  def free_form_stacked_bar(
247
332
  df: pd.DataFrame,
248
333
  category_column: str = "group",
249
334
  answer_column: str = "answer",
250
- model_groups: dict[str, list[str]] = None,
335
+ selected_categories: list[str] = None,
251
336
  selected_answers: list[str] = None,
252
337
  min_fraction: float = None,
253
338
  colors: dict[str, str] = None,
254
339
  title: str = None,
255
340
  filename: str = None,
256
341
  ):
257
- """
258
- Plot a stacked bar chart showing the distribution of answers by category.
259
-
260
- Transforms FreeForm data (multiple rows with single answers) into probability
261
- distributions and calls probs_stacked_bar.
262
- """
263
- # Transform to probs format: one row per category with {answer: prob} dict
264
342
  probs_data = []
265
343
  for category in df[category_column].unique():
266
344
  cat_df = df[df[category_column] == category]
@@ -274,10 +352,11 @@ def free_form_stacked_bar(
274
352
  probs_df,
275
353
  probs_column="probs",
276
354
  category_column=category_column,
277
- model_groups=model_groups,
355
+ selected_categories=selected_categories,
278
356
  selected_answers=selected_answers,
279
357
  min_fraction=min_fraction,
280
358
  colors=colors,
281
359
  title=title,
282
360
  filename=filename,
361
+ legend_title=answer_column,
283
362
  )