llmcomp 1.3.0__tar.gz → 1.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {llmcomp-1.3.0 → llmcomp-1.3.2}/PKG-INFO +2 -1
  2. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/config.py +9 -1
  3. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/plots.py +32 -5
  4. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/question.py +24 -7
  5. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/result.py +32 -9
  6. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/viewer.py +38 -9
  7. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/chat_completion.py +19 -13
  8. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/runner.py +5 -5
  9. {llmcomp-1.3.0 → llmcomp-1.3.2}/pyproject.toml +2 -1
  10. llmcomp-1.3.2/t1.py +11 -0
  11. {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/conftest.py +2 -1
  12. {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_question.py +4 -2
  13. llmcomp-1.3.0/t1.py +0 -13
  14. {llmcomp-1.3.0 → llmcomp-1.3.2}/.gitignore +0 -0
  15. {llmcomp-1.3.0 → llmcomp-1.3.2}/LICENSE +0 -0
  16. {llmcomp-1.3.0 → llmcomp-1.3.2}/README.md +0 -0
  17. {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/api.md +0 -0
  18. {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/finetuning.md +0 -0
  19. {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/generate_api_docs.py +0 -0
  20. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/configuration.py +0 -0
  21. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/create_finetuning_job.py +0 -0
  22. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/free_form_question.py +0 -0
  23. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/ft_old_audubon_birds.jsonl +0 -0
  24. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/judges.py +0 -0
  25. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/model_adapter.py +0 -0
  26. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/next_token_question.py +0 -0
  27. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/openrouter.py +0 -0
  28. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/questions.yaml +0 -0
  29. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/questions_in_yaml.py +0 -0
  30. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/rating_question.py +0 -0
  31. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/runner.py +0 -0
  32. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/tinker.py +0 -0
  33. {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/x_mod_57.py +0 -0
  34. {llmcomp-1.3.0 → llmcomp-1.3.2}/lint.sh +0 -0
  35. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/__init__.py +0 -0
  36. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/default_adapters.py +0 -0
  37. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/__init__.py +0 -0
  38. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/manager.py +0 -0
  39. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/update_jobs.py +0 -0
  40. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/validation.py +0 -0
  41. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/judge.py +0 -0
  42. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/model_adapter.py +0 -0
  43. {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/utils.py +0 -0
  44. {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/__init__.py +0 -0
  45. {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_clear_cache.py +0 -0
  46. {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_config.py +0 -0
  47. {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_hash_and_cache.py +0 -0
  48. {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_utils.py +0 -0
{llmcomp-1.3.0 → llmcomp-1.3.2}/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.3.0
+Version: 1.3.2
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -9,6 +9,7 @@ License: MIT
 License-File: LICENSE
 Requires-Python: >=3.9
 Requires-Dist: backoff
+Requires-Dist: filelock
 Requires-Dist: matplotlib
 Requires-Dist: numpy
 Requires-Dist: openai>=1.0.0
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/config.py CHANGED
@@ -238,12 +238,20 @@ class Config(metaclass=_ConfigMeta):
         try:
             client = openai.OpenAI(api_key=key, base_url=url)
             params = ModelAdapter.test_request_params(model)
-            openai_chat_completion(client=client, **params)
+
+            backoff_on = [openai.RateLimitError, openai.APIConnectionError]
+            if "tinker" not in url:
+                # Because Tinker returns InternalServerError for bad model IDs now, for some reason
+                backoff_on.append(openai.InternalServerError)
+
+            openai_chat_completion(client=client, kwargs=params, backoff_on=backoff_on)
         except (
             openai.NotFoundError,
             openai.BadRequestError,
             openai.PermissionDeniedError,
             openai.AuthenticationError,
+            openai.InternalServerError,
+            openai.APITimeoutError,
         ) as e:
             if Config.verbose:
                 print(f"{model} doesn't work with url {url} and key {key[:16]}... ({e})")
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/plots.py CHANGED
@@ -16,6 +16,26 @@ def plot(
     selected_paraphrase: str = None,
     filename: str = None,
 ):
+    if df.empty:
+        raise ValueError("No data to plot, the dataframe is empty")
+
+    # Validate category_column contains hashable values (not dicts/lists)
+    if category_column in df.columns:
+        sample = df[category_column].dropna().iloc[0] if len(df[category_column].dropna()) > 0 else None
+        if isinstance(sample, (dict, list)):
+            raise ValueError(
+                f"Column '{category_column}' contains unhashable types ({type(sample).__name__}) "
+                f"and cannot be used as category_column. Did you mean answer_column='{category_column}'?"
+            )
+
+    # When plotting by model without explicit ordering, sort models by their group
+    if category_column == "model" and selected_categories is None and "group" in df.columns:
+        # Get first group for each model (assumes each model in single group)
+        model_to_group = df.groupby("model")["group"].first().reset_index()
+        # Sort by group, then by model name within group
+        model_to_group = model_to_group.sort_values(["group", "model"])
+        selected_categories = model_to_group["model"].tolist()
+
     if selected_categories is not None:
         df = df[df[category_column].isin(selected_categories)]
 
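
The new default groups models by their group before plotting. A standalone pandas sketch of the ordering logic, using toy data rather than the llmcomp API:

    import pandas as pd

    df = pd.DataFrame({
        "model": ["b-model", "a-model", "c-model"],
        "group": ["g2", "g1", "g1"],
    })
    # First group per model, then sort by group and by model name within group
    model_to_group = df.groupby("model")["group"].first().reset_index()
    order = model_to_group.sort_values(["group", "model"])["model"].tolist()
    print(order)  # ['a-model', 'c-model', 'b-model']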
@@ -30,7 +50,7 @@ def plot(
         title = selected_paraphrase + f"\nand {num_paraphrases - 1} other paraphrases"
 
     # Dispatch based on arguments and data
-    stacked_bar_args = selected_answers is not None or min_fraction is not None or colors is not None
+    stacked_bar_args = selected_answers is not None or min_fraction is not None
 
     if stacked_bar_args:
         # Stacked bar specific args provided
@@ -47,6 +67,7 @@ def plot(
             colors=colors,
             title=title,
             filename=filename,
+            legend_title=answer_column,
         )
     else:
         return free_form_stacked_bar(
@@ -82,6 +103,7 @@ def plot(
             probs_column=answer_column,
             category_column=category_column,
             selected_categories=selected_categories,
+            colors=colors,
             title=title,
             filename=filename,
         )
@@ -94,6 +116,7 @@ def plot(
             selected_categories=selected_categories,
             title=title,
             filename=filename,
+            legend_title=answer_column,
         )
     else:
         # Discrete values
@@ -114,6 +137,7 @@ def rating_cumulative_plot(
     probs_column: str = "probs",
     category_column: str = "group",
     selected_categories: list[str] = None,
+    colors: dict[str, str] = None,
     title: str = None,
     filename: str = None,
 ):
@@ -145,13 +169,14 @@ def rating_cumulative_plot(
         y_values = [cumulative[x] / n_valid for x in x_values]
         mean_value = mean_sum / n_valid
         label = f"{category} (mean: {mean_value:.1f})"
-        ax.plot(x_values, y_values, label=label)
+        color = colors.get(category) if colors else None
+        ax.plot(x_values, y_values, label=label, color=color)
 
-    ax.set_xlabel("Rating")
+    ax.set_xlabel(probs_column)
     ax.set_ylabel("Fraction with score ≤ X")
     ax.set_xlim(min_rating, max_rating)
     ax.set_ylim(0, 1)
-    ax.legend()
+    ax.legend(title=category_column)
 
     if title is not None:
         ax.set_title(title)
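
With the new colors parameter, each category's cumulative line can be pinned to a fixed matplotlib color. A usage sketch, assuming rating_cumulative_plot is called with a results DataFrame as its first argument; the group names are illustrative:

    rating_cumulative_plot(
        df,  # results DataFrame with "probs" and "group" columns (per the defaults above)
        colors={"baseline": "tab:blue", "finetuned": "tab:red"},  # hypothetical groups
    )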
@@ -173,6 +198,7 @@ def probs_stacked_bar(
     colors: dict[str, str] = None,
     title: str = None,
     filename: str = None,
+    legend_title: str = "answer",
 ):
     if min_fraction is not None and selected_answers is not None:
         raise ValueError("min_fraction and selected_answers cannot both be set")
@@ -292,7 +318,7 @@ def probs_stacked_bar(
 
     plt.xlabel(category_column)
     plt.ylabel("Percentage")
-    plt.legend(title="answer")
+    plt.legend(title=legend_title)
     plt.xticks(rotation=45, ha="right")
 
     if title is not None:
@@ -335,4 +361,5 @@ def free_form_stacked_bar(
         colors=colors,
         title=title,
         filename=filename,
+        legend_title=answer_column,
     )
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/question.py CHANGED
@@ -41,12 +41,17 @@ class _ViewMethod:
         self,
         df: pd.DataFrame,
         *,
-        sort_by: str | None = None,
+        sort_by: str | None = "__random__",
         sort_ascending: bool = True,
         open_browser: bool = True,
         port: int = 8501,
     ) -> None:
-        """View a DataFrame directly (class method usage)."""
+        """View a DataFrame directly (class method usage).
+
+        Args:
+            sort_by: Column to sort by. Default "__random__" shuffles rows randomly
+                (new seed on each browser refresh). Use None for original order.
+        """
         if isinstance(df, dict):
             raise TypeError(
                 "Question.view() expects a DataFrame, not a dict.\n"
@@ -66,12 +71,17 @@ class _ViewMethod:
         instance: "Question",
         model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
         *,
-        sort_by: str | None = None,
+        sort_by: str | None = "__random__",
         sort_ascending: bool = True,
         open_browser: bool = True,
         port: int = 8501,
     ) -> None:
-        """View results (instance method usage)."""
+        """View results (instance method usage).
+
+        Args:
+            sort_by: Column to sort by. Default "__random__" shuffles rows randomly
+                (new seed on each browser refresh). Use None for original order.
+        """
         if isinstance(model_groups_or_df, pd.DataFrame):
             df = model_groups_or_df
         else:
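
Both call forms now shuffle rows by default, which avoids skimming answers in cache order. A sketch — q and MODELS are illustrative names, with MODELS following the dict-of-groups shape used in t1.py below:

    q.view(MODELS)                   # new default: random order, reshuffled per refresh
    q.view(MODELS, sort_by=None)     # 1.3.0 behavior: original row order
    q.view(MODELS, sort_by="model")  # deterministic column sort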
@@ -220,7 +230,7 @@ class Question(ABC):
         self.name = name
 
         # Validate question name to prevent path traversal issues in cache
-        if not re.match(r'^[a-zA-Z0-9_-]+$', name):
+        if not re.match(r'^[a-zA-Z0-9_\-\[\]\.\(\)]+$', name):
             raise ValueError(
                 f"Invalid question name: {name!r}. "
                 f"Name must contain only letters, numbers, underscores, and hyphens."
@@ -479,6 +489,10 @@ class Question(ABC):
         cache_file = Result.file_path(self, model)
         if os.path.exists(cache_file):
             os.remove(cache_file)
+        # Also remove lock file if present
+        lock_file = cache_file + ".lock"
+        if os.path.exists(lock_file):
+            os.remove(lock_file)
         # Clean up empty directory
         cache_dir = os.path.dirname(cache_file)
         if os.path.isdir(cache_dir) and not os.listdir(cache_dir):
@@ -629,7 +643,7 @@ class FreeForm(Question):
         *,
         temperature: float = 1,
         max_tokens: int = 1024,
-        judges: dict[str, str | dict] = None,
+        judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None = None,
         **kwargs,
     ):
         """Initialize a FreeForm question.
@@ -830,7 +844,10 @@ class FreeForm(Question):
 
         return df
 
-    def _parse_judges(self, judges: dict[str, str | dict] | None) -> dict[str, "Question"] | None:
+    def _parse_judges(
+        self,
+        judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None
+    ) -> dict[str, FreeFormJudge | RatingJudge] | None:
         """Parse and validate judges dictionary."""
         if judges is None:
             return None
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/result.py CHANGED
@@ -1,13 +1,38 @@
 import hashlib
 import json
 import os
+import tempfile
 from dataclasses import dataclass
 from datetime import datetime
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Callable, TextIO
+
+import filelock
 
 from llmcomp.config import Config
 from llmcomp.runner.model_adapter import ModelAdapter
 
+
+def atomic_write(path: str, write_fn: Callable[[TextIO], None]) -> None:
+    """Write to a file atomically with file locking.
+
+    Args:
+        path: Target file path.
+        write_fn: Function that takes a file handle and writes content.
+    """
+    dir_path = os.path.dirname(path)
+    os.makedirs(dir_path, exist_ok=True)
+
+    lock = filelock.FileLock(path + ".lock")
+    with lock:
+        fd, temp_path = tempfile.mkstemp(dir=dir_path, suffix=".tmp")
+        try:
+            with os.fdopen(fd, "w") as f:
+                write_fn(f)
+            os.replace(temp_path, path)
+        except:
+            os.unlink(temp_path)
+            raise
+
 if TYPE_CHECKING:
     from llmcomp.question.question import Question
 
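
A usage sketch for the new helper (import path per the file list above; the target path is hypothetical). The writer callback receives a handle to a temp file that is os.replace()d into place while a lock on "<path>.lock" is held, so concurrent readers never observe a partial file:

    from llmcomp.question.result import atomic_write

    atomic_write(
        "/tmp/llmcomp-demo/result.jsonl",          # hypothetical path
        lambda f: f.write('{"example": true}\n'),  # writes via the temp-file handle
    )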
@@ -80,12 +105,12 @@ class Result:
         return f"{Config.cache_dir}/question/{question.name}/{cache_hash(question, model)[:7]}.jsonl"
 
     def save(self):
-        path = self.file_path(self.question, self.model)
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        with open(path, "w") as f:
+        def write_fn(f):
             f.write(json.dumps(self._metadata()) + "\n")
             for d in self.data:
                 f.write(json.dumps(d) + "\n")
+
+        atomic_write(self.file_path(self.question, self.model), write_fn)
 
     @classmethod
     def load(cls, question: "Question", model: str) -> "Result":
@@ -189,18 +214,16 @@ class JudgeCache:
         return self._data
 
     def save(self):
-        """Save cache to disk."""
+        """Save cache to disk with file locking for concurrent access."""
         if self._data is None:
             return
 
-        path = self.file_path(self.judge)
-        os.makedirs(os.path.dirname(path), exist_ok=True)
         file_data = {
            "metadata": self._metadata(),
            "data": self._data,
         }
-        with open(path, "w") as f:
-            json.dump(file_data, f, indent=2)
+
+        atomic_write(self.file_path(self.judge), lambda f: json.dump(file_data, f, indent=2))
 
     def _metadata(self) -> dict:
         return {
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/viewer.py CHANGED
@@ -24,7 +24,7 @@ from typing import Any
 
 def render_dataframe(
     df: "pd.DataFrame",
-    sort_by: str | None = None,
+    sort_by: str | None = "__random__",
     sort_ascending: bool = True,
     open_browser: bool = True,
     port: int = 8501,
@@ -34,7 +34,8 @@ def render_dataframe(
     Args:
         df: DataFrame with at least 'api_kwargs' and 'answer' columns.
            Other columns (model, group, etc.) are displayed as metadata.
-        sort_by: Column name to sort by initially. If None, keeps original order.
+        sort_by: Column name to sort by initially. Default: "__random__" for random
+            shuffling (new seed on each refresh). Use None for original order.
         sort_ascending: Sort order. Default: True (ascending).
         open_browser: If True, automatically open the viewer in default browser.
         port: Port to run the Streamlit server on.
@@ -47,7 +48,7 @@ def render_dataframe(
         raise ValueError("DataFrame must have an 'api_kwargs' column")
     if "answer" not in df.columns:
         raise ValueError("DataFrame must have an 'answer' column")
-    if sort_by is not None and sort_by not in df.columns:
+    if sort_by is not None and sort_by != "__random__" and sort_by not in df.columns:
         raise ValueError(f"sort_by column '{sort_by}' not found in DataFrame")
 
     # Save DataFrame to a temp file
@@ -68,7 +69,7 @@ def render_dataframe(
     if open_browser:
         # Open browser after a short delay to let server start
         import threading
-        threading.Timer(1.5, lambda: webbrowser.open(url)).start()
+        threading.Timer(0.5, lambda: webbrowser.open(url)).start()
 
     # Launch Streamlit
     viewer_path = Path(__file__).resolve()
@@ -186,7 +187,9 @@ def _display_metadata(row: dict[str, Any], exclude_keys: set[str]) -> None:
     for key, value in metadata.items():
         if isinstance(value, (dict, list)):
             st.markdown(f"**{key}:**")
-            st.json(value)
+            # Collapse _raw_answer and _probs dicts by default
+            collapsed = key.endswith("_raw_answer") or key.endswith("_probs")
+            st.json(value, expanded=not collapsed)
         else:
             st.markdown(f"**{key}:** {value}")
 
@@ -272,7 +275,7 @@ def _streamlit_main():
         return
 
     # Get sortable columns (numeric or string, exclude complex types)
-    sortable_columns = ["(none)"]
+    sortable_columns = ["(random)", "(none)"]
    if items:
         for key, value in items[0].items():
             if key not in ("api_kwargs",) and isinstance(value, (int, float, str, type(None))):
@@ -281,7 +284,13 @@ def _streamlit_main():
     # Initialize sort settings from command line args
     initial_sort_by, initial_sort_asc = _get_initial_sort()
     if "sort_by" not in st.session_state:
-        st.session_state.sort_by = initial_sort_by if initial_sort_by in sortable_columns else "(none)"
+        # Map __random__ from CLI to (random) in UI
+        if initial_sort_by == "__random__":
+            st.session_state.sort_by = "(random)"
+        elif initial_sort_by in sortable_columns:
+            st.session_state.sort_by = initial_sort_by
+        else:
+            st.session_state.sort_by = "(none)"
         st.session_state.sort_ascending = initial_sort_asc
 
     # Initialize view index
@@ -317,6 +326,16 @@ def _streamlit_main():
             st.session_state.sort_ascending = sort_ascending
             st.session_state.view_idx = 0
 
+    # Reshuffle button for random sort
+    if st.session_state.sort_by == "(random)":
+        import random
+        col_reshuffle, _ = st.columns([1, 5])
+        with col_reshuffle:
+            if st.button("🔀 Reshuffle"):
+                st.session_state.random_seed = random.randint(0, 2**32 - 1)
+                st.session_state.view_idx = 0
+                st.rerun()
+
     # Secondary sort (only show if primary sort is selected)
     if st.session_state.sort_by and st.session_state.sort_by != "(none)":
         col_spacer, col_sort2, col_order2 = st.columns([3, 2, 1])
@@ -340,8 +359,18 @@ def _streamlit_main():
     # Apply search
     filtered_items = _search_items(items, query)
 
+    # Apply random shuffle if selected (new seed on each refresh via Reshuffle button)
+    if st.session_state.sort_by == "(random)" and filtered_items:
+        import random
+        # Generate a new seed on first load or when explicitly reshuffled
+        if "random_seed" not in st.session_state:
+            st.session_state.random_seed = random.randint(0, 2**32 - 1)
+        rng = random.Random(st.session_state.random_seed)
+        filtered_items = filtered_items.copy()
+        rng.shuffle(filtered_items)
+
     # Apply sorting (stable sort - secondary first, then primary)
-    if st.session_state.sort_by and st.session_state.sort_by != "(none)" and filtered_items:
+    if st.session_state.sort_by and st.session_state.sort_by not in ("(none)", "(random)") and filtered_items:
         sort_key_2 = st.session_state.sort_by_2 if st.session_state.sort_by_2 != "(none)" else None
 
         # Secondary sort first (stable sort preserves this ordering within primary groups)
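
A standalone sketch of the shuffle pattern above: one seed persisted in session state keeps the order stable across Streamlit reruns until the Reshuffle button draws a new one:

    import random

    items = list(range(10))
    seed = random.randint(0, 2**32 - 1)  # kept in st.session_state.random_seed
    rng = random.Random(seed)
    shuffled = items.copy()
    rng.shuffle(shuffled)
    print(shuffled)  # identical output for the same seed on every rerun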
@@ -429,7 +458,7 @@ def _streamlit_main():
     # Display judge columns if present
     judge_columns = [k for k in current.keys() if not k.startswith("_") and k not in {
         "api_kwargs", "answer", "question", "model", "group", "paraphrase_ix", "raw_answer"
-    } and not k.endswith("_question") and not k.endswith("_raw_answer")]
+    } and not k.endswith("_question") and not k.endswith("_raw_answer") and not k.endswith("_probs")]
 
     if judge_columns:
         st.markdown("---")
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/chat_completion.py CHANGED
@@ -15,17 +15,23 @@ def on_backoff(details):
     # But we can do that only by reading the message, and this is bad.
 
 
-@backoff.on_exception(
-    wait_gen=backoff.expo,
-    exception=(
-        openai.RateLimitError,
-        openai.APIConnectionError,
-        openai.APITimeoutError,
-        openai.InternalServerError,
-    ),
-    max_value=60,
-    factor=1.5,
-    on_backoff=on_backoff,
+DEFAULT_BACKOFF_EXCEPTIONS = (
+    openai.RateLimitError,
+    openai.APIConnectionError,
+    openai.APITimeoutError,
+    openai.InternalServerError,
 )
-def openai_chat_completion(*, client, **kwargs):
-    return client.chat.completions.create(**kwargs)
+
+
+def openai_chat_completion(*, client, kwargs: dict, backoff_on=DEFAULT_BACKOFF_EXCEPTIONS):
+    @backoff.on_exception(
+        wait_gen=backoff.expo,
+        exception=tuple(backoff_on),
+        max_value=60,
+        factor=1.5,
+        on_backoff=on_backoff,
+    )
+    def _call():
+        return client.chat.completions.create(**kwargs)
+
+    return _call()
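
For anyone calling openai_chat_completion directly, the request parameters move from **kwargs to a single kwargs dict, and the retried exception set becomes overridable per call. A migration sketch — model name and prompt are illustrative, and an OPENAI_API_KEY in the environment is assumed:

    import openai
    from llmcomp.runner.chat_completion import openai_chat_completion

    client = openai.OpenAI()
    params = {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Hi"}]}

    # 1.3.0 form: openai_chat_completion(client=client, **params)
    completion = openai_chat_completion(
        client=client,
        kwargs=params,
        backoff_on=[openai.RateLimitError],  # optional; defaults to DEFAULT_BACKOFF_EXCEPTIONS
    )
    print(completion.choices[0].message.content)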
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/runner.py CHANGED
@@ -62,7 +62,7 @@ class Runner:
         Tuple of (content, prepared_kwargs) where prepared_kwargs is what was sent to the API.
         """
         prepared = self._prepare_for_model(params)
-        completion = openai_chat_completion(client=self.client, **prepared)
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)
         try:
             content = completion.choices[0].message.content
             if content is None:
@@ -138,7 +138,7 @@ class Runner:
             "logprobs": True,
         }
         prepared = self._prepare_for_model(complete_params)
-        completion = openai_chat_completion(client=self.client, **prepared)
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)
 
         if completion.choices[0].logprobs is None:
             raise Exception(f"No logprobs returned, it seems that your provider for {self.model} doesn't support that.")
@@ -236,11 +236,11 @@ class Runner:
                 else:
                     msg_info = ""
                 warnings.warn(
-                    f"Unexpected error (probably API-related), runner returns None. "
+                    f"Unexpected error (probably API-related), runner returns empty string. "
                     f"Model: {self.model}, function: {func.__name__}{msg_info}. "
                     f"Error: {type(e).__name__}: {e}"
                 )
-                result = (None, {})
+                result = ("", {})
             return kwargs, result
 
         futures = [executor.submit(get_data, kwargs) for kwargs in kwargs_list]
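
Failed requests now surface as empty strings rather than None, so answer columns stay string-typed. A sketch of downstream filtering under the new convention, using a toy DataFrame; the "answer" column name matches what the viewer expects above:

    import pandas as pd

    df = pd.DataFrame({"answer": ["ok", "", "also ok"]})
    print((df["answer"] == "").sum(), "failed request(s)")  # failures are "", not None
    df = df[df["answer"] != ""]  # drop them before plotting, if desired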
@@ -290,7 +290,7 @@ class Runner:
             "n": n,
         }
         prepared = self._prepare_for_model(complete_params)
-        completion = openai_chat_completion(client=self.client, **prepared)
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)
         for choice in completion.choices:
             cnts[choice.message.content] += 1
         if sum(cnts.values()) != num_samples:
{llmcomp-1.3.0 → llmcomp-1.3.2}/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "llmcomp"
-version = "1.3.0"
+version = "1.3.2"
 description = "Research library for black-box experiments on language models."
 readme = "README.md"
 requires-python = ">=3.9"
@@ -22,6 +22,7 @@ dependencies = [
     "backoff",
     "requests",
     "streamlit>=1.20.0",
+    "filelock",
 ]
 
 [project.scripts]
llmcomp-1.3.2/t1.py ADDED
@@ -0,0 +1,11 @@
+# %%
+from llmcomp import Question, Config
+
+MODELS = {
+    "gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"],
+    "gpt-4o-mini": ["gpt-4o-mini-2024-07-18"],
+}
+
+# %%
+x = Config.client_for_model("gpt-4.1-mini-2025-04-14")
+print(x.base_url)
{llmcomp-1.3.0 → llmcomp-1.3.2}/tests/conftest.py CHANGED
@@ -62,7 +62,8 @@ def mock_openai_chat_completion():
     Config.client_cache.clear()
 
     # Create a function that returns a properly structured mock completion
-    def create_mock_completion(*, client=None, **kwargs):
+    def create_mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         # Extract messages to determine what response to return
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)
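
Any test double that monkeypatches openai_chat_completion must adopt the same calling convention: parameters arrive as one kwargs dict. A minimal compatible stub (accepting backoff_on is an assumption for forward compatibility; a real double returns a mock completion object, as in the fixtures above):

    def fake_completion(*, client=None, kwargs=None, backoff_on=None):
        kwargs = kwargs or {}
        print(kwargs.get("model"), len(kwargs.get("messages", [])))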
{llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_question.py CHANGED
@@ -591,7 +591,8 @@ def test_rating_aggregates_duplicate_integer_tokens(temp_dir):
 
     Config.client_cache.clear()
 
-    def mock_completion(*, client=None, **kwargs):
+    def mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)
 
@@ -683,7 +684,8 @@ def test_judge_with_answer_only_template_and_duplicate_answers(temp_dir):
     # Track what prompts were sent to the API
     api_calls = []
 
-    def mock_completion(*, client=None, **kwargs):
+    def mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)
 
llmcomp-1.3.0/t1.py DELETED
@@ -1,13 +0,0 @@
-import tinker
-
-sc = tinker.ServiceClient()
-tc = sc.create_lora_training_client(
-    base_model="openai/gpt-oss-20b",
-    rank=1,
-    seed=0,
-    train_mlp=False,
-    train_attn=False,
-    train_unembed=False,
-)
-path = tc.save_weights_for_sampler(name="gpt-oss-20b-base-like").result().path
-print(path)