llmcomp 1.2.4__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmcomp/finetuning/manager.py +21 -0
- llmcomp/finetuning/validation.py +406 -0
- llmcomp/question/judge.py +11 -0
- llmcomp/question/plots.py +150 -71
- llmcomp/question/question.py +255 -190
- llmcomp/question/result.py +33 -10
- llmcomp/question/viewer.py +488 -0
- llmcomp/runner/runner.py +32 -18
- {llmcomp-1.2.4.dist-info → llmcomp-1.3.1.dist-info}/METADATA +8 -5
- llmcomp-1.3.1.dist-info/RECORD +21 -0
- llmcomp-1.2.4.dist-info/RECORD +0 -19
- {llmcomp-1.2.4.dist-info → llmcomp-1.3.1.dist-info}/WHEEL +0 -0
- {llmcomp-1.2.4.dist-info → llmcomp-1.3.1.dist-info}/entry_points.txt +0 -0
- {llmcomp-1.2.4.dist-info → llmcomp-1.3.1.dist-info}/licenses/LICENSE +0 -0
llmcomp/question/plots.py
CHANGED
|
@@ -2,13 +2,131 @@ import matplotlib.pyplot as plt
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
def
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
5
|
+
def plot(
    df: pd.DataFrame,
    answer_column: str,
    category_column: str,
    selected_categories: list[str] = None,
    min_rating: int = None,
    max_rating: int = None,
    selected_answers: list[str] = None,
    min_fraction: float = None,
    colors: dict[str, str] = None,
    title: str = None,
    selected_paraphrase: str = None,
    filename: str = None,
):
    """Pick a plot type from the arguments and the data, then draw it.

    Dispatch rules, decided from the first non-null value of ``answer_column``:

    * ``selected_answers``, ``min_fraction`` or ``colors`` given: always a
      stacked bar (``probs_stacked_bar`` for dict values, otherwise
      ``free_form_stacked_bar``).
    * non-empty dict whose keys are all ``int``: ``rating_cumulative_plot``.
    * any other dict: ``probs_stacked_bar``.
    * anything else: ``free_form_stacked_bar``.

    Args:
        df: Data to plot; must not be empty.
        answer_column: Column holding the answers (plain values or dicts).
        category_column: Column whose values define the plotted categories.
        selected_categories: Categories to keep, in display order. When
            omitted and ``category_column == "model"`` with a ``"group"``
            column present, models are ordered by group, then by name.
        min_rating: Lower rating bound for the cumulative plot; inferred
            from the dict keys in the data when omitted.
        max_rating: Upper rating bound for the cumulative plot; inferred
            from the dict keys in the data when omitted.
        selected_answers: Forwarded to the stacked-bar helpers.
        min_fraction: Forwarded to the stacked-bar helpers.
        colors: Forwarded to the stacked-bar helpers.
        title: Plot title. When omitted and a ``"question"`` column exists,
            it is built from ``selected_paraphrase`` plus a count of the
            other distinct questions.
        selected_paraphrase: Question text used for the automatic title;
            defaults to the first question in sorted order.
        filename: Forwarded to the plotting helper.

    Returns:
        Whatever the selected plotting helper returns.

    Raises:
        ValueError: If ``df`` is empty, or if ``category_column`` holds
            dict/list values.
    """
    if df.empty:
        raise ValueError("No data to plot, the dataframe is empty")

    # Validate category_column contains hashable values (not dicts/lists)
    if category_column in df.columns:
        non_null_categories = df[category_column].dropna()
        sample = non_null_categories.iloc[0] if len(non_null_categories) > 0 else None
        if isinstance(sample, (dict, list)):
            raise ValueError(
                f"Column '{category_column}' contains unhashable types ({type(sample).__name__}) "
                f"and cannot be used as category_column. Did you mean answer_column='{category_column}'?"
            )

    # When plotting by model without explicit ordering, sort models by their group
    if category_column == "model" and selected_categories is None and "group" in df.columns:
        # Get first group for each model (assumes each model in single group)
        model_to_group = df.groupby("model")["group"].first().reset_index()
        # Sort by group, then by model name within group
        model_to_group = model_to_group.sort_values(["group", "model"])
        selected_categories = model_to_group["model"].tolist()

    if selected_categories is not None:
        df = df[df[category_column].isin(selected_categories)]

    if title is None and "question" in df.columns:
        questions = sorted(df["question"].unique())
        # The category filter above may have removed every row, so there may
        # be no question left to fall back on.
        if selected_paraphrase is None and questions:
            selected_paraphrase = questions[0]
        if selected_paraphrase is not None:
            num_others = len(questions) - 1
            if num_others > 0:
                title = selected_paraphrase + f"\nand {num_others} other paraphrases"
            else:
                title = selected_paraphrase

    # Dispatch based on arguments and data
    stacked_bar_args = selected_answers is not None or min_fraction is not None or colors is not None

    non_null = df[answer_column].dropna()
    sample_value = non_null.iloc[0] if len(non_null) > 0 else None
    is_probs = isinstance(sample_value, dict)

    # Dicts with integer keys are rating probabilities, unless the caller
    # asked for stacked-bar specific options.
    if (
        not stacked_bar_args
        and is_probs
        and sample_value
        and all(isinstance(k, int) for k in sample_value.keys())
    ):
        # Infer only the bounds the caller did not provide; an explicitly
        # passed bound must not be overwritten by the data-derived one.
        if min_rating is None or max_rating is None:
            all_keys = set()
            for probs in non_null:
                if isinstance(probs, dict):
                    all_keys.update(probs.keys())
            if all_keys:
                if min_rating is None:
                    min_rating = min(all_keys)
                if max_rating is None:
                    max_rating = max(all_keys)

        return rating_cumulative_plot(
            df,
            min_rating=min_rating,
            max_rating=max_rating,
            probs_column=answer_column,
            category_column=category_column,
            selected_categories=selected_categories,
            title=title,
            filename=filename,
        )

    if is_probs:
        # Dict values (e.g., token probs). The stacked-bar options are None
        # here unless the caller set them, which matches the helper defaults.
        return probs_stacked_bar(
            df,
            probs_column=answer_column,
            category_column=category_column,
            selected_categories=selected_categories,
            selected_answers=selected_answers,
            min_fraction=min_fraction,
            colors=colors,
            title=title,
            filename=filename,
            legend_title=answer_column,
        )

    # Discrete values
    return free_form_stacked_bar(
        df,
        category_column=category_column,
        answer_column=answer_column,
        selected_categories=selected_categories,
        selected_answers=selected_answers,
        min_fraction=min_fraction,
        colors=colors,
        title=title,
        filename=filename,
    )
|
|
12
130
|
|
|
13
131
|
|
|
14
132
|
def rating_cumulative_plot(
|
|
@@ -17,32 +135,13 @@ def rating_cumulative_plot(
|
|
|
17
135
|
max_rating: int,
|
|
18
136
|
probs_column: str = "probs",
|
|
19
137
|
category_column: str = "group",
|
|
20
|
-
|
|
21
|
-
show_mean: bool = True,
|
|
138
|
+
selected_categories: list[str] = None,
|
|
22
139
|
title: str = None,
|
|
23
140
|
filename: str = None,
|
|
24
141
|
):
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
Starts near 0 at min_rating, reaches 100% at max_rating.
|
|
29
|
-
|
|
30
|
-
Args:
|
|
31
|
-
df: DataFrame with probs_column containing normalized probability dicts
|
|
32
|
-
mapping int ratings to probabilities (summing to 1), or None for invalid.
|
|
33
|
-
min_rating: Minimum rating value.
|
|
34
|
-
max_rating: Maximum rating value.
|
|
35
|
-
probs_column: Column containing {rating: prob} dicts. Default: "probs"
|
|
36
|
-
category_column: Column to group by. Default: "group"
|
|
37
|
-
model_groups: Optional dict for ordering groups.
|
|
38
|
-
show_mean: Whether to show mean in legend labels. Default: True
|
|
39
|
-
title: Optional plot title.
|
|
40
|
-
filename: Optional filename to save plot.
|
|
41
|
-
"""
|
|
42
|
-
# Get unique categories in order
|
|
43
|
-
categories = df[category_column].unique()
|
|
44
|
-
if category_column == "group" and model_groups is not None:
|
|
45
|
-
categories = [c for c in model_groups.keys() if c in categories]
|
|
142
|
+
categories = list(df[category_column].unique())
|
|
143
|
+
if selected_categories is not None:
|
|
144
|
+
categories = [c for c in selected_categories if c in categories]
|
|
46
145
|
|
|
47
146
|
fig, ax = plt.subplots(figsize=(10, 6))
|
|
48
147
|
x_values = list(range(min_rating, max_rating + 1))
|
|
@@ -50,7 +149,6 @@ def rating_cumulative_plot(
|
|
|
50
149
|
for category in categories:
|
|
51
150
|
category_df = df[df[category_column] == category]
|
|
52
151
|
|
|
53
|
-
# Accumulate normalized probabilities and means across all rows
|
|
54
152
|
cumulative = {x: 0.0 for x in x_values}
|
|
55
153
|
mean_sum = 0.0
|
|
56
154
|
n_valid = 0
|
|
@@ -59,29 +157,23 @@ def rating_cumulative_plot(
|
|
|
59
157
|
if probs is None:
|
|
60
158
|
continue
|
|
61
159
|
|
|
62
|
-
# For each x, add P(score <= x) = sum of probs for ratings <= x
|
|
63
160
|
for x in x_values:
|
|
64
161
|
cumulative[x] += sum(p for rating, p in probs.items() if rating <= x)
|
|
65
162
|
|
|
66
|
-
# Compute mean for this row
|
|
67
163
|
mean_sum += sum(rating * p for rating, p in probs.items())
|
|
68
164
|
n_valid += 1
|
|
69
165
|
|
|
70
166
|
if n_valid > 0:
|
|
71
167
|
y_values = [cumulative[x] / n_valid for x in x_values]
|
|
72
168
|
mean_value = mean_sum / n_valid
|
|
73
|
-
|
|
74
|
-
if show_mean:
|
|
75
|
-
label = f"{category} (mean: {mean_value:.1f})"
|
|
76
|
-
else:
|
|
77
|
-
label = category
|
|
169
|
+
label = f"{category} (mean: {mean_value:.1f})"
|
|
78
170
|
ax.plot(x_values, y_values, label=label)
|
|
79
171
|
|
|
80
|
-
ax.set_xlabel(
|
|
172
|
+
ax.set_xlabel(probs_column)
|
|
81
173
|
ax.set_ylabel("Fraction with score ≤ X")
|
|
82
174
|
ax.set_xlim(min_rating, max_rating)
|
|
83
175
|
ax.set_ylim(0, 1)
|
|
84
|
-
ax.legend()
|
|
176
|
+
ax.legend(title=category_column)
|
|
85
177
|
|
|
86
178
|
if title is not None:
|
|
87
179
|
ax.set_title(title)
|
|
@@ -90,34 +182,21 @@ def rating_cumulative_plot(
|
|
|
90
182
|
if filename is not None:
|
|
91
183
|
plt.savefig(filename, bbox_inches="tight")
|
|
92
184
|
plt.show()
|
|
185
|
+
return fig
|
|
93
186
|
|
|
94
187
|
|
|
95
188
|
def probs_stacked_bar(
|
|
96
189
|
df: pd.DataFrame,
|
|
97
190
|
probs_column: str = "probs",
|
|
98
191
|
category_column: str = "group",
|
|
99
|
-
|
|
192
|
+
selected_categories: list[str] = None,
|
|
100
193
|
selected_answers: list[str] = None,
|
|
101
194
|
min_fraction: float = None,
|
|
102
195
|
colors: dict[str, str] = None,
|
|
103
196
|
title: str = None,
|
|
104
197
|
filename: str = None,
|
|
198
|
+
legend_title: str = "answer",
|
|
105
199
|
):
|
|
106
|
-
"""
|
|
107
|
-
Plot a stacked bar chart from probability distributions.
|
|
108
|
-
|
|
109
|
-
Args:
|
|
110
|
-
df: DataFrame with one row per category, containing probs_column with
|
|
111
|
-
{answer: probability} dicts.
|
|
112
|
-
probs_column: Column containing probability dicts. Default: "probs"
|
|
113
|
-
category_column: Column to group by (x-axis). Default: "group"
|
|
114
|
-
model_groups: Optional dict for ordering groups.
|
|
115
|
-
selected_answers: Optional list of answers to show. Others grouped as "[OTHER]".
|
|
116
|
-
min_fraction: Optional minimum fraction threshold.
|
|
117
|
-
colors: Optional dict mapping answer values to colors.
|
|
118
|
-
title: Optional plot title.
|
|
119
|
-
filename: Optional filename to save plot.
|
|
120
|
-
"""
|
|
121
200
|
if min_fraction is not None and selected_answers is not None:
|
|
122
201
|
raise ValueError("min_fraction and selected_answers cannot both be set")
|
|
123
202
|
|
|
@@ -137,7 +216,12 @@ def probs_stacked_bar(
|
|
|
137
216
|
category_probs[category] = {k: v / n_rows for k, v in combined.items()}
|
|
138
217
|
|
|
139
218
|
if not category_probs:
|
|
140
|
-
|
|
219
|
+
fig, ax = plt.subplots()
|
|
220
|
+
ax.text(0.5, 0.5, "No data to plot", ha="center", va="center", transform=ax.transAxes)
|
|
221
|
+
if title is not None:
|
|
222
|
+
ax.set_title(title)
|
|
223
|
+
plt.show()
|
|
224
|
+
return fig
|
|
141
225
|
|
|
142
226
|
# Find answers meeting min_fraction threshold
|
|
143
227
|
if min_fraction is not None:
|
|
@@ -221,17 +305,17 @@ def probs_stacked_bar(
|
|
|
221
305
|
color_index += 1
|
|
222
306
|
|
|
223
307
|
# Order categories
|
|
224
|
-
if
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
answer_percentages = answer_percentages.reindex(
|
|
308
|
+
if selected_categories is not None:
|
|
309
|
+
ordered_categories = [c for c in selected_categories if c in answer_percentages.index]
|
|
310
|
+
ordered_categories += [c for c in answer_percentages.index if c not in ordered_categories]
|
|
311
|
+
answer_percentages = answer_percentages.reindex(ordered_categories)
|
|
228
312
|
|
|
229
313
|
fig, ax = plt.subplots(figsize=(12, 8))
|
|
230
314
|
answer_percentages.plot(kind="bar", stacked=True, ax=ax, color=plot_colors)
|
|
231
315
|
|
|
232
316
|
plt.xlabel(category_column)
|
|
233
317
|
plt.ylabel("Percentage")
|
|
234
|
-
plt.legend(title=
|
|
318
|
+
plt.legend(title=legend_title)
|
|
235
319
|
plt.xticks(rotation=45, ha="right")
|
|
236
320
|
|
|
237
321
|
if title is not None:
|
|
@@ -241,26 +325,20 @@ def probs_stacked_bar(
|
|
|
241
325
|
if filename is not None:
|
|
242
326
|
plt.savefig(filename, bbox_inches="tight")
|
|
243
327
|
plt.show()
|
|
328
|
+
return fig
|
|
244
329
|
|
|
245
330
|
|
|
246
331
|
def free_form_stacked_bar(
|
|
247
332
|
df: pd.DataFrame,
|
|
248
333
|
category_column: str = "group",
|
|
249
334
|
answer_column: str = "answer",
|
|
250
|
-
|
|
335
|
+
selected_categories: list[str] = None,
|
|
251
336
|
selected_answers: list[str] = None,
|
|
252
337
|
min_fraction: float = None,
|
|
253
338
|
colors: dict[str, str] = None,
|
|
254
339
|
title: str = None,
|
|
255
340
|
filename: str = None,
|
|
256
341
|
):
|
|
257
|
-
"""
|
|
258
|
-
Plot a stacked bar chart showing the distribution of answers by category.
|
|
259
|
-
|
|
260
|
-
Transforms FreeForm data (multiple rows with single answers) into probability
|
|
261
|
-
distributions and calls probs_stacked_bar.
|
|
262
|
-
"""
|
|
263
|
-
# Transform to probs format: one row per category with {answer: prob} dict
|
|
264
342
|
probs_data = []
|
|
265
343
|
for category in df[category_column].unique():
|
|
266
344
|
cat_df = df[df[category_column] == category]
|
|
@@ -274,10 +352,11 @@ def free_form_stacked_bar(
|
|
|
274
352
|
probs_df,
|
|
275
353
|
probs_column="probs",
|
|
276
354
|
category_column=category_column,
|
|
277
|
-
|
|
355
|
+
selected_categories=selected_categories,
|
|
278
356
|
selected_answers=selected_answers,
|
|
279
357
|
min_fraction=min_fraction,
|
|
280
358
|
colors=colors,
|
|
281
359
|
title=title,
|
|
282
360
|
filename=filename,
|
|
361
|
+
legend_title=answer_column,
|
|
283
362
|
)
|