llmcomp 1.3.0__tar.gz → 1.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmcomp-1.3.0 → llmcomp-1.3.2}/PKG-INFO +2 -1
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/config.py +9 -1
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/plots.py +32 -5
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/question.py +24 -7
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/result.py +32 -9
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/viewer.py +38 -9
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/chat_completion.py +19 -13
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/runner.py +5 -5
- {llmcomp-1.3.0 → llmcomp-1.3.2}/pyproject.toml +2 -1
- llmcomp-1.3.2/t1.py +11 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/conftest.py +2 -1
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_question.py +4 -2
- llmcomp-1.3.0/t1.py +0 -13
- {llmcomp-1.3.0 → llmcomp-1.3.2}/.gitignore +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/LICENSE +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/README.md +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/api.md +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/finetuning.md +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/generate_api_docs.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/configuration.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/create_finetuning_job.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/free_form_question.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/ft_old_audubon_birds.jsonl +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/judges.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/model_adapter.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/next_token_question.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/openrouter.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/questions.yaml +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/questions_in_yaml.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/rating_question.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/runner.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/tinker.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/x_mod_57.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/lint.sh +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/__init__.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/default_adapters.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/__init__.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/manager.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/update_jobs.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/validation.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/judge.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/model_adapter.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/utils.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/__init__.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_clear_cache.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_config.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_hash_and_cache.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_utils.py +0 -0
{llmcomp-1.3.0 → llmcomp-1.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.3.0
+Version: 1.3.2
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp

@@ -9,6 +9,7 @@ License: MIT
 License-File: LICENSE
 Requires-Python: >=3.9
 Requires-Dist: backoff
+Requires-Dist: filelock
 Requires-Dist: matplotlib
 Requires-Dist: numpy
 Requires-Dist: openai>=1.0.0

{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/config.py

@@ -238,12 +238,20 @@ class Config(metaclass=_ConfigMeta):
         try:
             client = openai.OpenAI(api_key=key, base_url=url)
             params = ModelAdapter.test_request_params(model)
-
+
+            backoff_on = [openai.RateLimitError, openai.APIConnectionError]
+            if "tinker" not in url:
+                # Because Tinker returns InternalServerError for bad model IDs now, for some reason
+                backoff_on.append(openai.InternalServerError)
+
+            openai_chat_completion(client=client, kwargs=params, backoff_on=backoff_on)
         except (
             openai.NotFoundError,
             openai.BadRequestError,
             openai.PermissionDeniedError,
             openai.AuthenticationError,
+            openai.InternalServerError,
+            openai.APITimeoutError,
         ) as e:
             if Config.verbose:
                 print(f"{model} doesn't work with url {url} and key {key[:16]}... ({e})")

{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/plots.py

@@ -16,6 +16,26 @@ def plot(
     selected_paraphrase: str = None,
     filename: str = None,
 ):
+    if df.empty:
+        raise ValueError("No data to plot, the dataframe is empty")
+
+    # Validate category_column contains hashable values (not dicts/lists)
+    if category_column in df.columns:
+        sample = df[category_column].dropna().iloc[0] if len(df[category_column].dropna()) > 0 else None
+        if isinstance(sample, (dict, list)):
+            raise ValueError(
+                f"Column '{category_column}' contains unhashable types ({type(sample).__name__}) "
+                f"and cannot be used as category_column. Did you mean answer_column='{category_column}'?"
+            )
+
+    # When plotting by model without explicit ordering, sort models by their group
+    if category_column == "model" and selected_categories is None and "group" in df.columns:
+        # Get first group for each model (assumes each model in single group)
+        model_to_group = df.groupby("model")["group"].first().reset_index()
+        # Sort by group, then by model name within group
+        model_to_group = model_to_group.sort_values(["group", "model"])
+        selected_categories = model_to_group["model"].tolist()
+
     if selected_categories is not None:
         df = df[df[category_column].isin(selected_categories)]

@@ -30,7 +50,7 @@ def plot(
     title = selected_paraphrase + f"\nand {num_paraphrases - 1} other paraphrases"

     # Dispatch based on arguments and data
-    stacked_bar_args = selected_answers is not None or min_fraction is not None
+    stacked_bar_args = selected_answers is not None or min_fraction is not None

     if stacked_bar_args:
         # Stacked bar specific args provided

@@ -47,6 +67,7 @@ def plot(
             colors=colors,
             title=title,
             filename=filename,
+            legend_title=answer_column,
         )
     else:
         return free_form_stacked_bar(

@@ -82,6 +103,7 @@ def plot(
             probs_column=answer_column,
             category_column=category_column,
             selected_categories=selected_categories,
+            colors=colors,
             title=title,
             filename=filename,
         )

@@ -94,6 +116,7 @@ def plot(
             selected_categories=selected_categories,
             title=title,
             filename=filename,
+            legend_title=answer_column,
         )
     else:
         # Discrete values

@@ -114,6 +137,7 @@ def rating_cumulative_plot(
     probs_column: str = "probs",
     category_column: str = "group",
     selected_categories: list[str] = None,
+    colors: dict[str, str] = None,
     title: str = None,
     filename: str = None,
 ):

@@ -145,13 +169,14 @@ def rating_cumulative_plot(
         y_values = [cumulative[x] / n_valid for x in x_values]
         mean_value = mean_sum / n_valid
         label = f"{category} (mean: {mean_value:.1f})"
-
+        color = colors.get(category) if colors else None
+        ax.plot(x_values, y_values, label=label, color=color)

-    ax.set_xlabel(
+    ax.set_xlabel(probs_column)
     ax.set_ylabel("Fraction with score ≤ X")
     ax.set_xlim(min_rating, max_rating)
     ax.set_ylim(0, 1)
-    ax.legend()
+    ax.legend(title=category_column)

     if title is not None:
         ax.set_title(title)

@@ -173,6 +198,7 @@ def probs_stacked_bar(
     colors: dict[str, str] = None,
     title: str = None,
     filename: str = None,
+    legend_title: str = "answer",
 ):
     if min_fraction is not None and selected_answers is not None:
         raise ValueError("min_fraction and selected_answers cannot both be set")

@@ -292,7 +318,7 @@ def probs_stacked_bar(

     plt.xlabel(category_column)
     plt.ylabel("Percentage")
-    plt.legend(title=
+    plt.legend(title=legend_title)
     plt.xticks(rotation=45, ha="right")

     if title is not None:

@@ -335,4 +361,5 @@ def free_form_stacked_bar(
         colors=colors,
         title=title,
         filename=filename,
+        legend_title=answer_column,
     )

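The new model-ordering block in plot() is plain pandas and can be checked in isolation. A minimal sketch on a made-up dataframe (the "model" and "group" column names are the ones the diff relies on; everything else here is illustrative):

import pandas as pd

# Toy results frame: two groups, three models, deliberately interleaved.
df = pd.DataFrame({
    "model": ["m-b", "m-a", "m-c", "m-a"],
    "group": ["g2", "g1", "g1", "g1"],
})

# Same steps as the added code: first group per model, then sort by group and model.
model_to_group = df.groupby("model")["group"].first().reset_index()
model_to_group = model_to_group.sort_values(["group", "model"])
print(model_to_group["model"].tolist())  # ['m-a', 'm-c', 'm-b']

Models from the same group end up adjacent on the category axis even when the input rows interleave them.
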
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/question.py

@@ -41,12 +41,17 @@ class _ViewMethod:
         self,
         df: pd.DataFrame,
         *,
-        sort_by: str | None =
+        sort_by: str | None = "__random__",
         sort_ascending: bool = True,
         open_browser: bool = True,
         port: int = 8501,
     ) -> None:
-        """View a DataFrame directly (class method usage).
+        """View a DataFrame directly (class method usage).
+
+        Args:
+            sort_by: Column to sort by. Default "__random__" shuffles rows randomly
+                (new seed on each browser refresh). Use None for original order.
+        """
         if isinstance(df, dict):
             raise TypeError(
                 "Question.view() expects a DataFrame, not a dict.\n"

@@ -66,12 +71,17 @@ class _ViewMethod:
         instance: "Question",
         model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
         *,
-        sort_by: str | None =
+        sort_by: str | None = "__random__",
         sort_ascending: bool = True,
         open_browser: bool = True,
         port: int = 8501,
     ) -> None:
-        """View results (instance method usage).
+        """View results (instance method usage).
+
+        Args:
+            sort_by: Column to sort by. Default "__random__" shuffles rows randomly
+                (new seed on each browser refresh). Use None for original order.
+        """
         if isinstance(model_groups_or_df, pd.DataFrame):
             df = model_groups_or_df
         else:

@@ -220,7 +230,7 @@ class Question(ABC):
         self.name = name

         # Validate question name to prevent path traversal issues in cache
-        if not re.match(r'^[a-zA-Z0-9_
+        if not re.match(r'^[a-zA-Z0-9_\-\[\]\.\(\)]+$', name):
             raise ValueError(
                 f"Invalid question name: {name!r}. "
                 f"Name must contain only letters, numbers, underscores, and hyphens."

@@ -479,6 +489,10 @@ class Question(ABC):
         cache_file = Result.file_path(self, model)
         if os.path.exists(cache_file):
             os.remove(cache_file)
+        # Also remove lock file if present
+        lock_file = cache_file + ".lock"
+        if os.path.exists(lock_file):
+            os.remove(lock_file)
         # Clean up empty directory
         cache_dir = os.path.dirname(cache_file)
         if os.path.isdir(cache_dir) and not os.listdir(cache_dir):

@@ -629,7 +643,7 @@ class FreeForm(Question):
         *,
         temperature: float = 1,
         max_tokens: int = 1024,
-        judges: dict[str, str | dict] = None,
+        judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None = None,
         **kwargs,
     ):
         """Initialize a FreeForm question.

@@ -830,7 +844,10 @@ class FreeForm(Question):

         return df

-    def _parse_judges(
+    def _parse_judges(
+        self,
+        judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None
+    ) -> dict[str, FreeFormJudge | RatingJudge] | None:
         """Parse and validate judges dictionary."""
         if judges is None:
             return None

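The question-name validation now also accepts brackets, dots, and parentheses in addition to letters, digits, underscores, and hyphens. A quick standalone check of the new pattern (the sample names are made up):

import re

NAME_PATTERN = r'^[a-zA-Z0-9_\-\[\]\.\(\)]+$'

for name in ["my_question", "q-1[v2].final", "bad/name", "spaces not ok"]:
    print(f"{name!r}: {bool(re.match(NAME_PATTERN, name))}")
# 'my_question': True, 'q-1[v2].final': True, 'bad/name': False, 'spaces not ok': False

Note that the error message still only mentions letters, numbers, underscores, and hyphens, so its wording lags slightly behind the widened pattern.
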
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/result.py

@@ -1,13 +1,38 @@
 import hashlib
 import json
 import os
+import tempfile
 from dataclasses import dataclass
 from datetime import datetime
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Callable, TextIO
+
+import filelock

 from llmcomp.config import Config
 from llmcomp.runner.model_adapter import ModelAdapter

+
+def atomic_write(path: str, write_fn: Callable[[TextIO], None]) -> None:
+    """Write to a file atomically with file locking.
+
+    Args:
+        path: Target file path.
+        write_fn: Function that takes a file handle and writes content.
+    """
+    dir_path = os.path.dirname(path)
+    os.makedirs(dir_path, exist_ok=True)
+
+    lock = filelock.FileLock(path + ".lock")
+    with lock:
+        fd, temp_path = tempfile.mkstemp(dir=dir_path, suffix=".tmp")
+        try:
+            with os.fdopen(fd, "w") as f:
+                write_fn(f)
+            os.replace(temp_path, path)
+        except:
+            os.unlink(temp_path)
+            raise
+
 if TYPE_CHECKING:
     from llmcomp.question.question import Question

@@ -80,12 +105,12 @@ class Result:
         return f"{Config.cache_dir}/question/{question.name}/{cache_hash(question, model)[:7]}.jsonl"

     def save(self):
-
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        with open(path, "w") as f:
+        def write_fn(f):
             f.write(json.dumps(self._metadata()) + "\n")
             for d in self.data:
                 f.write(json.dumps(d) + "\n")
+
+        atomic_write(self.file_path(self.question, self.model), write_fn)

     @classmethod
     def load(cls, question: "Question", model: str) -> "Result":

@@ -189,18 +214,16 @@ class JudgeCache:
         return self._data

     def save(self):
-        """Save cache to disk."""
+        """Save cache to disk with file locking for concurrent access."""
         if self._data is None:
             return

-        path = self.file_path(self.judge)
-        os.makedirs(os.path.dirname(path), exist_ok=True)
         file_data = {
             "metadata": self._metadata(),
             "data": self._data,
         }
-
-
+
+        atomic_write(self.file_path(self.judge), lambda f: json.dump(file_data, f, indent=2))

     def _metadata(self) -> dict:
         return {

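The new atomic_write helper takes a filelock on "<path>.lock", writes to a temporary file in the same directory, and moves it into place with os.replace, so concurrent runs cannot interleave partial cache writes; the lock file it leaves behind is what clear_cache() now also removes. A sketch of calling the helper directly (the target path and records below are made up):

import json
from llmcomp.question.result import atomic_write

records = [{"answer": "yes"}, {"answer": "no"}]

def write_fn(f):
    # Receives an open text handle for the temporary file.
    for r in records:
        f.write(json.dumps(r) + "\n")

# Creates /tmp/llmcomp-demo/ if needed and leaves results.jsonl plus results.jsonl.lock.
atomic_write("/tmp/llmcomp-demo/results.jsonl", write_fn)
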
@@ -24,7 +24,7 @@ from typing import Any
|
|
|
24
24
|
|
|
25
25
|
def render_dataframe(
|
|
26
26
|
df: "pd.DataFrame",
|
|
27
|
-
sort_by: str | None =
|
|
27
|
+
sort_by: str | None = "__random__",
|
|
28
28
|
sort_ascending: bool = True,
|
|
29
29
|
open_browser: bool = True,
|
|
30
30
|
port: int = 8501,
|
|
@@ -34,7 +34,8 @@ def render_dataframe(
|
|
|
34
34
|
Args:
|
|
35
35
|
df: DataFrame with at least 'api_kwargs' and 'answer' columns.
|
|
36
36
|
Other columns (model, group, etc.) are displayed as metadata.
|
|
37
|
-
sort_by: Column name to sort by initially.
|
|
37
|
+
sort_by: Column name to sort by initially. Default: "__random__" for random
|
|
38
|
+
shuffling (new seed on each refresh). Use None for original order.
|
|
38
39
|
sort_ascending: Sort order. Default: True (ascending).
|
|
39
40
|
open_browser: If True, automatically open the viewer in default browser.
|
|
40
41
|
port: Port to run the Streamlit server on.
|
|
@@ -47,7 +48,7 @@ def render_dataframe(
|
|
|
47
48
|
raise ValueError("DataFrame must have an 'api_kwargs' column")
|
|
48
49
|
if "answer" not in df.columns:
|
|
49
50
|
raise ValueError("DataFrame must have an 'answer' column")
|
|
50
|
-
if sort_by is not None and sort_by not in df.columns:
|
|
51
|
+
if sort_by is not None and sort_by != "__random__" and sort_by not in df.columns:
|
|
51
52
|
raise ValueError(f"sort_by column '{sort_by}' not found in DataFrame")
|
|
52
53
|
|
|
53
54
|
# Save DataFrame to a temp file
|
|
@@ -68,7 +69,7 @@ def render_dataframe(
|
|
|
68
69
|
if open_browser:
|
|
69
70
|
# Open browser after a short delay to let server start
|
|
70
71
|
import threading
|
|
71
|
-
threading.Timer(
|
|
72
|
+
threading.Timer(0.5, lambda: webbrowser.open(url)).start()
|
|
72
73
|
|
|
73
74
|
# Launch Streamlit
|
|
74
75
|
viewer_path = Path(__file__).resolve()
|
|
@@ -186,7 +187,9 @@ def _display_metadata(row: dict[str, Any], exclude_keys: set[str]) -> None:
|
|
|
186
187
|
for key, value in metadata.items():
|
|
187
188
|
if isinstance(value, (dict, list)):
|
|
188
189
|
st.markdown(f"**{key}:**")
|
|
189
|
-
|
|
190
|
+
# Collapse _raw_answer and _probs dicts by default
|
|
191
|
+
collapsed = key.endswith("_raw_answer") or key.endswith("_probs")
|
|
192
|
+
st.json(value, expanded=not collapsed)
|
|
190
193
|
else:
|
|
191
194
|
st.markdown(f"**{key}:** {value}")
|
|
192
195
|
|
|
@@ -272,7 +275,7 @@ def _streamlit_main():
|
|
|
272
275
|
return
|
|
273
276
|
|
|
274
277
|
# Get sortable columns (numeric or string, exclude complex types)
|
|
275
|
-
sortable_columns = ["(none)"]
|
|
278
|
+
sortable_columns = ["(random)", "(none)"]
|
|
276
279
|
if items:
|
|
277
280
|
for key, value in items[0].items():
|
|
278
281
|
if key not in ("api_kwargs",) and isinstance(value, (int, float, str, type(None))):
|
|
@@ -281,7 +284,13 @@ def _streamlit_main():
|
|
|
281
284
|
# Initialize sort settings from command line args
|
|
282
285
|
initial_sort_by, initial_sort_asc = _get_initial_sort()
|
|
283
286
|
if "sort_by" not in st.session_state:
|
|
284
|
-
|
|
287
|
+
# Map __random__ from CLI to (random) in UI
|
|
288
|
+
if initial_sort_by == "__random__":
|
|
289
|
+
st.session_state.sort_by = "(random)"
|
|
290
|
+
elif initial_sort_by in sortable_columns:
|
|
291
|
+
st.session_state.sort_by = initial_sort_by
|
|
292
|
+
else:
|
|
293
|
+
st.session_state.sort_by = "(none)"
|
|
285
294
|
st.session_state.sort_ascending = initial_sort_asc
|
|
286
295
|
|
|
287
296
|
# Initialize view index
|
|
@@ -317,6 +326,16 @@ def _streamlit_main():
|
|
|
317
326
|
st.session_state.sort_ascending = sort_ascending
|
|
318
327
|
st.session_state.view_idx = 0
|
|
319
328
|
|
|
329
|
+
# Reshuffle button for random sort
|
|
330
|
+
if st.session_state.sort_by == "(random)":
|
|
331
|
+
import random
|
|
332
|
+
col_reshuffle, _ = st.columns([1, 5])
|
|
333
|
+
with col_reshuffle:
|
|
334
|
+
if st.button("🔀 Reshuffle"):
|
|
335
|
+
st.session_state.random_seed = random.randint(0, 2**32 - 1)
|
|
336
|
+
st.session_state.view_idx = 0
|
|
337
|
+
st.rerun()
|
|
338
|
+
|
|
320
339
|
# Secondary sort (only show if primary sort is selected)
|
|
321
340
|
if st.session_state.sort_by and st.session_state.sort_by != "(none)":
|
|
322
341
|
col_spacer, col_sort2, col_order2 = st.columns([3, 2, 1])
|
|
@@ -340,8 +359,18 @@ def _streamlit_main():
|
|
|
340
359
|
# Apply search
|
|
341
360
|
filtered_items = _search_items(items, query)
|
|
342
361
|
|
|
362
|
+
# Apply random shuffle if selected (new seed on each refresh via Reshuffle button)
|
|
363
|
+
if st.session_state.sort_by == "(random)" and filtered_items:
|
|
364
|
+
import random
|
|
365
|
+
# Generate a new seed on first load or when explicitly reshuffled
|
|
366
|
+
if "random_seed" not in st.session_state:
|
|
367
|
+
st.session_state.random_seed = random.randint(0, 2**32 - 1)
|
|
368
|
+
rng = random.Random(st.session_state.random_seed)
|
|
369
|
+
filtered_items = filtered_items.copy()
|
|
370
|
+
rng.shuffle(filtered_items)
|
|
371
|
+
|
|
343
372
|
# Apply sorting (stable sort - secondary first, then primary)
|
|
344
|
-
if st.session_state.sort_by and st.session_state.sort_by
|
|
373
|
+
if st.session_state.sort_by and st.session_state.sort_by not in ("(none)", "(random)") and filtered_items:
|
|
345
374
|
sort_key_2 = st.session_state.sort_by_2 if st.session_state.sort_by_2 != "(none)" else None
|
|
346
375
|
|
|
347
376
|
# Secondary sort first (stable sort preserves this ordering within primary groups)
|
|
@@ -429,7 +458,7 @@ def _streamlit_main():
|
|
|
429
458
|
# Display judge columns if present
|
|
430
459
|
judge_columns = [k for k in current.keys() if not k.startswith("_") and k not in {
|
|
431
460
|
"api_kwargs", "answer", "question", "model", "group", "paraphrase_ix", "raw_answer"
|
|
432
|
-
} and not k.endswith("_question") and not k.endswith("_raw_answer")]
|
|
461
|
+
} and not k.endswith("_question") and not k.endswith("_raw_answer") and not k.endswith("_probs")]
|
|
433
462
|
|
|
434
463
|
if judge_columns:
|
|
435
464
|
st.markdown("---")
|
|
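The "(random)" sort option keeps one seed in Streamlit session state, so the shuffled order stays stable across reruns until the Reshuffle button draws a new seed. The idea in isolation, outside Streamlit (names here are illustrative):

import random

items = ["a", "b", "c", "d", "e"]
seed = random.randint(0, 2**32 - 1)  # kept once, e.g. in st.session_state.random_seed

def shuffled(items, seed):
    rng = random.Random(seed)
    out = items.copy()
    rng.shuffle(out)
    return out

# Same seed gives the same order on every rerun; drawing a new seed reshuffles.
assert shuffled(items, seed) == shuffled(items, seed)
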
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/chat_completion.py

@@ -15,17 +15,23 @@ def on_backoff(details):
 # But we can do that only by reading the message, and this is bad.


-
-
-
-
-
-    openai.APITimeoutError,
-    openai.InternalServerError,
-    ),
-    max_value=60,
-    factor=1.5,
-    on_backoff=on_backoff,
+DEFAULT_BACKOFF_EXCEPTIONS = (
+    openai.RateLimitError,
+    openai.APIConnectionError,
+    openai.APITimeoutError,
+    openai.InternalServerError,
 )
-
-
+
+
+def openai_chat_completion(*, client, kwargs: dict, backoff_on=DEFAULT_BACKOFF_EXCEPTIONS):
+    @backoff.on_exception(
+        wait_gen=backoff.expo,
+        exception=tuple(backoff_on),
+        max_value=60,
+        factor=1.5,
+        on_backoff=on_backoff,
+    )
+    def _call():
+        return client.chat.completions.create(**kwargs)
+
+    return _call()

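With the retried exception set now a parameter, callers can decide per request which errors are worth backing off on; config.py uses this to avoid retrying InternalServerError against Tinker, where it signals a bad model ID. A hedged sketch of a direct call (client credentials and the model name are placeholders):

import openai
from llmcomp.runner.chat_completion import openai_chat_completion

client = openai.OpenAI()  # placeholder: reads credentials from the environment

# Retry only on rate limits and connection problems; fail fast on anything else.
completion = openai_chat_completion(
    client=client,
    kwargs={"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Hi"}]},
    backoff_on=[openai.RateLimitError, openai.APIConnectionError],
)
print(completion.choices[0].message.content)
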
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/runner.py

@@ -62,7 +62,7 @@ class Runner:
             Tuple of (content, prepared_kwargs) where prepared_kwargs is what was sent to the API.
         """
         prepared = self._prepare_for_model(params)
-        completion = openai_chat_completion(client=self.client,
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)
         try:
             content = completion.choices[0].message.content
             if content is None:

@@ -138,7 +138,7 @@ class Runner:
             "logprobs": True,
         }
         prepared = self._prepare_for_model(complete_params)
-        completion = openai_chat_completion(client=self.client,
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)

         if completion.choices[0].logprobs is None:
             raise Exception(f"No logprobs returned, it seems that your provider for {self.model} doesn't support that.")

@@ -236,11 +236,11 @@ class Runner:
                 else:
                     msg_info = ""
                 warnings.warn(
-                    f"Unexpected error (probably API-related), runner returns
+                    f"Unexpected error (probably API-related), runner returns empty string. "
                     f"Model: {self.model}, function: {func.__name__}{msg_info}. "
                     f"Error: {type(e).__name__}: {e}"
                 )
-                result = (
+                result = ("", {})
             return kwargs, result

         futures = [executor.submit(get_data, kwargs) for kwargs in kwargs_list]

@@ -290,7 +290,7 @@ class Runner:
             "n": n,
         }
         prepared = self._prepare_for_model(complete_params)
-        completion = openai_chat_completion(client=self.client,
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)
         for choice in completion.choices:
             cnts[choice.message.content] += 1
         if sum(cnts.values()) != num_samples:

{llmcomp-1.3.0 → llmcomp-1.3.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "llmcomp"
-version = "1.3.0"
+version = "1.3.2"
 description = "Research library for black-box experiments on language models."
 readme = "README.md"
 requires-python = ">=3.9"

@@ -22,6 +22,7 @@ dependencies = [
     "backoff",
     "requests",
     "streamlit>=1.20.0",
+    "filelock",
 ]

 [project.scripts]

llmcomp-1.3.2/t1.py (ADDED, +11 lines)

{llmcomp-1.3.0 → llmcomp-1.3.2}/tests/conftest.py

@@ -62,7 +62,8 @@ def mock_openai_chat_completion():
     Config.client_cache.clear()

     # Create a function that returns a properly structured mock completion
-    def create_mock_completion(*, client=None,
+    def create_mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         # Extract messages to determine what response to return
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)

{llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_question.py

@@ -591,7 +591,8 @@ def test_rating_aggregates_duplicate_integer_tokens(temp_dir):

     Config.client_cache.clear()

-    def mock_completion(*, client=None,
+    def mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)


@@ -683,7 +684,8 @@ def test_judge_with_answer_only_template_and_duplicate_answers(temp_dir):
     # Track what prompts were sent to the API
     api_calls = []

-    def mock_completion(*, client=None,
+    def mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)

llmcomp-1.3.0/t1.py (DELETED)

@@ -1,13 +0,0 @@
-import tinker
-
-sc = tinker.ServiceClient()
-tc = sc.create_lora_training_client(
-    base_model="openai/gpt-oss-20b",
-    rank=1,
-    seed=0,
-    train_mlp=False,
-    train_attn=False,
-    train_unembed=False,
-)
-path = tc.save_weights_for_sampler(name="gpt-oss-20b-base-like").result().path
-print(path)

All remaining files listed above are unchanged between 1.3.0 and 1.3.2.