llmcomp 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmcomp/question/plots.py CHANGED
@@ -16,6 +16,26 @@ def plot(
16
16
  selected_paraphrase: str = None,
17
17
  filename: str = None,
18
18
  ):
19
+ if df.empty:
20
+ raise ValueError("No data to plot, the dataframe is empty")
21
+
22
+ # Validate category_column contains hashable values (not dicts/lists)
23
+ if category_column in df.columns:
24
+ sample = df[category_column].dropna().iloc[0] if len(df[category_column].dropna()) > 0 else None
25
+ if isinstance(sample, (dict, list)):
26
+ raise ValueError(
27
+ f"Column '{category_column}' contains unhashable types ({type(sample).__name__}) "
28
+ f"and cannot be used as category_column. Did you mean answer_column='{category_column}'?"
29
+ )
30
+
31
+ # When plotting by model without explicit ordering, sort models by their group
32
+ if category_column == "model" and selected_categories is None and "group" in df.columns:
33
+ # Get first group for each model (assumes each model in single group)
34
+ model_to_group = df.groupby("model")["group"].first().reset_index()
35
+ # Sort by group, then by model name within group
36
+ model_to_group = model_to_group.sort_values(["group", "model"])
37
+ selected_categories = model_to_group["model"].tolist()
38
+
19
39
  if selected_categories is not None:
20
40
  df = df[df[category_column].isin(selected_categories)]
21
41
 
@@ -47,6 +67,7 @@ def plot(
47
67
  colors=colors,
48
68
  title=title,
49
69
  filename=filename,
70
+ legend_title=answer_column,
50
71
  )
51
72
  else:
52
73
  return free_form_stacked_bar(
@@ -94,6 +115,7 @@ def plot(
94
115
  selected_categories=selected_categories,
95
116
  title=title,
96
117
  filename=filename,
118
+ legend_title=answer_column,
97
119
  )
98
120
  else:
99
121
  # Discrete values
@@ -147,11 +169,11 @@ def rating_cumulative_plot(
147
169
  label = f"{category} (mean: {mean_value:.1f})"
148
170
  ax.plot(x_values, y_values, label=label)
149
171
 
150
- ax.set_xlabel("Rating")
172
+ ax.set_xlabel(probs_column)
151
173
  ax.set_ylabel("Fraction with score ≤ X")
152
174
  ax.set_xlim(min_rating, max_rating)
153
175
  ax.set_ylim(0, 1)
154
- ax.legend()
176
+ ax.legend(title=category_column)
155
177
 
156
178
  if title is not None:
157
179
  ax.set_title(title)
@@ -173,6 +195,7 @@ def probs_stacked_bar(
173
195
  colors: dict[str, str] = None,
174
196
  title: str = None,
175
197
  filename: str = None,
198
+ legend_title: str = "answer",
176
199
  ):
177
200
  if min_fraction is not None and selected_answers is not None:
178
201
  raise ValueError("min_fraction and selected_answers cannot both be set")
@@ -292,7 +315,7 @@ def probs_stacked_bar(
292
315
 
293
316
  plt.xlabel(category_column)
294
317
  plt.ylabel("Percentage")
295
- plt.legend(title="answer")
318
+ plt.legend(title=legend_title)
296
319
  plt.xticks(rotation=45, ha="right")
297
320
 
298
321
  if title is not None:
@@ -335,4 +358,5 @@ def free_form_stacked_bar(
335
358
  colors=colors,
336
359
  title=title,
337
360
  filename=filename,
361
+ legend_title=answer_column,
338
362
  )
@@ -41,12 +41,17 @@ class _ViewMethod:
41
41
  self,
42
42
  df: pd.DataFrame,
43
43
  *,
44
- sort_by: str | None = None,
44
+ sort_by: str | None = "__random__",
45
45
  sort_ascending: bool = True,
46
46
  open_browser: bool = True,
47
47
  port: int = 8501,
48
48
  ) -> None:
49
- """View a DataFrame directly (class method usage)."""
49
+ """View a DataFrame directly (class method usage).
50
+
51
+ Args:
52
+ sort_by: Column to sort by. Default "__random__" shuffles rows randomly
53
+ (new seed on each browser refresh). Use None for original order.
54
+ """
50
55
  if isinstance(df, dict):
51
56
  raise TypeError(
52
57
  "Question.view() expects a DataFrame, not a dict.\n"
@@ -66,12 +71,17 @@ class _ViewMethod:
66
71
  instance: "Question",
67
72
  model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
68
73
  *,
69
- sort_by: str | None = None,
74
+ sort_by: str | None = "__random__",
70
75
  sort_ascending: bool = True,
71
76
  open_browser: bool = True,
72
77
  port: int = 8501,
73
78
  ) -> None:
74
- """View results (instance method usage)."""
79
+ """View results (instance method usage).
80
+
81
+ Args:
82
+ sort_by: Column to sort by. Default "__random__" shuffles rows randomly
83
+ (new seed on each browser refresh). Use None for original order.
84
+ """
75
85
  if isinstance(model_groups_or_df, pd.DataFrame):
76
86
  df = model_groups_or_df
77
87
  else:
@@ -220,7 +230,7 @@ class Question(ABC):
220
230
  self.name = name
221
231
 
222
232
  # Validate question name to prevent path traversal issues in cache
223
- if not re.match(r'^[a-zA-Z0-9_-]+$', name):
233
+ if not re.match(r'^[a-zA-Z0-9_\-\[\]\.\(\)]+$', name):
224
234
  raise ValueError(
225
235
  f"Invalid question name: {name!r}. "
226
236
  f"Name must contain only letters, numbers, underscores, and hyphens."
@@ -479,6 +489,10 @@ class Question(ABC):
479
489
  cache_file = Result.file_path(self, model)
480
490
  if os.path.exists(cache_file):
481
491
  os.remove(cache_file)
492
+ # Also remove lock file if present
493
+ lock_file = cache_file + ".lock"
494
+ if os.path.exists(lock_file):
495
+ os.remove(lock_file)
482
496
  # Clean up empty directory
483
497
  cache_dir = os.path.dirname(cache_file)
484
498
  if os.path.isdir(cache_dir) and not os.listdir(cache_dir):
@@ -629,7 +643,7 @@ class FreeForm(Question):
629
643
  *,
630
644
  temperature: float = 1,
631
645
  max_tokens: int = 1024,
632
- judges: dict[str, str | dict] = None,
646
+ judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None = None,
633
647
  **kwargs,
634
648
  ):
635
649
  """Initialize a FreeForm question.
@@ -830,7 +844,10 @@ class FreeForm(Question):
830
844
 
831
845
  return df
832
846
 
833
- def _parse_judges(self, judges: dict[str, str | dict] | None) -> dict[str, "Question"] | None:
847
+ def _parse_judges(
848
+ self,
849
+ judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None
850
+ ) -> dict[str, FreeFormJudge | RatingJudge] | None:
834
851
  """Parse and validate judges dictionary."""
835
852
  if judges is None:
836
853
  return None
@@ -1,13 +1,38 @@
1
1
  import hashlib
2
2
  import json
3
3
  import os
4
+ import tempfile
4
5
  from dataclasses import dataclass
5
6
  from datetime import datetime
6
- from typing import TYPE_CHECKING, Any
7
+ from typing import TYPE_CHECKING, Any, Callable, TextIO
8
+
9
+ import filelock
7
10
 
8
11
  from llmcomp.config import Config
9
12
  from llmcomp.runner.model_adapter import ModelAdapter
10
13
 
14
+
15
+ def atomic_write(path: str, write_fn: Callable[[TextIO], None]) -> None:
16
+ """Write to a file atomically with file locking.
17
+
18
+ Args:
19
+ path: Target file path.
20
+ write_fn: Function that takes a file handle and writes content.
21
+ """
22
+ dir_path = os.path.dirname(path)
23
+ os.makedirs(dir_path, exist_ok=True)
24
+
25
+ lock = filelock.FileLock(path + ".lock")
26
+ with lock:
27
+ fd, temp_path = tempfile.mkstemp(dir=dir_path, suffix=".tmp")
28
+ try:
29
+ with os.fdopen(fd, "w") as f:
30
+ write_fn(f)
31
+ os.replace(temp_path, path)
32
+ except:
33
+ os.unlink(temp_path)
34
+ raise
35
+
11
36
  if TYPE_CHECKING:
12
37
  from llmcomp.question.question import Question
13
38
 
@@ -80,12 +105,12 @@ class Result:
80
105
  return f"{Config.cache_dir}/question/{question.name}/{cache_hash(question, model)[:7]}.jsonl"
81
106
 
82
107
  def save(self):
83
- path = self.file_path(self.question, self.model)
84
- os.makedirs(os.path.dirname(path), exist_ok=True)
85
- with open(path, "w") as f:
108
+ def write_fn(f):
86
109
  f.write(json.dumps(self._metadata()) + "\n")
87
110
  for d in self.data:
88
111
  f.write(json.dumps(d) + "\n")
112
+
113
+ atomic_write(self.file_path(self.question, self.model), write_fn)
89
114
 
90
115
  @classmethod
91
116
  def load(cls, question: "Question", model: str) -> "Result":
@@ -189,18 +214,16 @@ class JudgeCache:
189
214
  return self._data
190
215
 
191
216
  def save(self):
192
- """Save cache to disk."""
217
+ """Save cache to disk with file locking for concurrent access."""
193
218
  if self._data is None:
194
219
  return
195
220
 
196
- path = self.file_path(self.judge)
197
- os.makedirs(os.path.dirname(path), exist_ok=True)
198
221
  file_data = {
199
222
  "metadata": self._metadata(),
200
223
  "data": self._data,
201
224
  }
202
- with open(path, "w") as f:
203
- json.dump(file_data, f, indent=2)
225
+
226
+ atomic_write(self.file_path(self.judge), lambda f: json.dump(file_data, f, indent=2))
204
227
 
205
228
  def _metadata(self) -> dict:
206
229
  return {
@@ -24,7 +24,7 @@ from typing import Any
24
24
 
25
25
  def render_dataframe(
26
26
  df: "pd.DataFrame",
27
- sort_by: str | None = None,
27
+ sort_by: str | None = "__random__",
28
28
  sort_ascending: bool = True,
29
29
  open_browser: bool = True,
30
30
  port: int = 8501,
@@ -34,7 +34,8 @@ def render_dataframe(
34
34
  Args:
35
35
  df: DataFrame with at least 'api_kwargs' and 'answer' columns.
36
36
  Other columns (model, group, etc.) are displayed as metadata.
37
- sort_by: Column name to sort by initially. If None, keeps original order.
37
+ sort_by: Column name to sort by initially. Default: "__random__" for random
38
+ shuffling (new seed on each refresh). Use None for original order.
38
39
  sort_ascending: Sort order. Default: True (ascending).
39
40
  open_browser: If True, automatically open the viewer in default browser.
40
41
  port: Port to run the Streamlit server on.
@@ -47,7 +48,7 @@ def render_dataframe(
47
48
  raise ValueError("DataFrame must have an 'api_kwargs' column")
48
49
  if "answer" not in df.columns:
49
50
  raise ValueError("DataFrame must have an 'answer' column")
50
- if sort_by is not None and sort_by not in df.columns:
51
+ if sort_by is not None and sort_by != "__random__" and sort_by not in df.columns:
51
52
  raise ValueError(f"sort_by column '{sort_by}' not found in DataFrame")
52
53
 
53
54
  # Save DataFrame to a temp file
@@ -68,7 +69,7 @@ def render_dataframe(
68
69
  if open_browser:
69
70
  # Open browser after a short delay to let server start
70
71
  import threading
71
- threading.Timer(1.5, lambda: webbrowser.open(url)).start()
72
+ threading.Timer(0.5, lambda: webbrowser.open(url)).start()
72
73
 
73
74
  # Launch Streamlit
74
75
  viewer_path = Path(__file__).resolve()
@@ -186,7 +187,9 @@ def _display_metadata(row: dict[str, Any], exclude_keys: set[str]) -> None:
186
187
  for key, value in metadata.items():
187
188
  if isinstance(value, (dict, list)):
188
189
  st.markdown(f"**{key}:**")
189
- st.json(value)
190
+ # Collapse _raw_answer and _probs dicts by default
191
+ collapsed = key.endswith("_raw_answer") or key.endswith("_probs")
192
+ st.json(value, expanded=not collapsed)
190
193
  else:
191
194
  st.markdown(f"**{key}:** {value}")
192
195
 
@@ -272,7 +275,7 @@ def _streamlit_main():
272
275
  return
273
276
 
274
277
  # Get sortable columns (numeric or string, exclude complex types)
275
- sortable_columns = ["(none)"]
278
+ sortable_columns = ["(random)", "(none)"]
276
279
  if items:
277
280
  for key, value in items[0].items():
278
281
  if key not in ("api_kwargs",) and isinstance(value, (int, float, str, type(None))):
@@ -281,7 +284,13 @@ def _streamlit_main():
281
284
  # Initialize sort settings from command line args
282
285
  initial_sort_by, initial_sort_asc = _get_initial_sort()
283
286
  if "sort_by" not in st.session_state:
284
- st.session_state.sort_by = initial_sort_by if initial_sort_by in sortable_columns else "(none)"
287
+ # Map __random__ from CLI to (random) in UI
288
+ if initial_sort_by == "__random__":
289
+ st.session_state.sort_by = "(random)"
290
+ elif initial_sort_by in sortable_columns:
291
+ st.session_state.sort_by = initial_sort_by
292
+ else:
293
+ st.session_state.sort_by = "(none)"
285
294
  st.session_state.sort_ascending = initial_sort_asc
286
295
 
287
296
  # Initialize view index
@@ -317,6 +326,16 @@ def _streamlit_main():
317
326
  st.session_state.sort_ascending = sort_ascending
318
327
  st.session_state.view_idx = 0
319
328
 
329
+ # Reshuffle button for random sort
330
+ if st.session_state.sort_by == "(random)":
331
+ import random
332
+ col_reshuffle, _ = st.columns([1, 5])
333
+ with col_reshuffle:
334
+ if st.button("🔀 Reshuffle"):
335
+ st.session_state.random_seed = random.randint(0, 2**32 - 1)
336
+ st.session_state.view_idx = 0
337
+ st.rerun()
338
+
320
339
  # Secondary sort (only show if primary sort is selected)
321
340
  if st.session_state.sort_by and st.session_state.sort_by != "(none)":
322
341
  col_spacer, col_sort2, col_order2 = st.columns([3, 2, 1])
@@ -340,8 +359,18 @@ def _streamlit_main():
340
359
  # Apply search
341
360
  filtered_items = _search_items(items, query)
342
361
 
362
+ # Apply random shuffle if selected (new seed on each refresh via Reshuffle button)
363
+ if st.session_state.sort_by == "(random)" and filtered_items:
364
+ import random
365
+ # Generate a new seed on first load or when explicitly reshuffled
366
+ if "random_seed" not in st.session_state:
367
+ st.session_state.random_seed = random.randint(0, 2**32 - 1)
368
+ rng = random.Random(st.session_state.random_seed)
369
+ filtered_items = filtered_items.copy()
370
+ rng.shuffle(filtered_items)
371
+
343
372
  # Apply sorting (stable sort - secondary first, then primary)
344
- if st.session_state.sort_by and st.session_state.sort_by != "(none)" and filtered_items:
373
+ if st.session_state.sort_by and st.session_state.sort_by not in ("(none)", "(random)") and filtered_items:
345
374
  sort_key_2 = st.session_state.sort_by_2 if st.session_state.sort_by_2 != "(none)" else None
346
375
 
347
376
  # Secondary sort first (stable sort preserves this ordering within primary groups)
@@ -429,7 +458,7 @@ def _streamlit_main():
429
458
  # Display judge columns if present
430
459
  judge_columns = [k for k in current.keys() if not k.startswith("_") and k not in {
431
460
  "api_kwargs", "answer", "question", "model", "group", "paraphrase_ix", "raw_answer"
432
- } and not k.endswith("_question") and not k.endswith("_raw_answer")]
461
+ } and not k.endswith("_question") and not k.endswith("_raw_answer") and not k.endswith("_probs")]
433
462
 
434
463
  if judge_columns:
435
464
  st.markdown("---")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llmcomp
3
- Version: 1.3.0
3
+ Version: 1.3.1
4
4
  Summary: Research library for black-box experiments on language models.
5
5
  Project-URL: Homepage, https://github.com/johny-b/llmcomp
6
6
  Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -9,6 +9,7 @@ License: MIT
9
9
  License-File: LICENSE
10
10
  Requires-Python: >=3.9
11
11
  Requires-Dist: backoff
12
+ Requires-Dist: filelock
12
13
  Requires-Dist: matplotlib
13
14
  Requires-Dist: numpy
14
15
  Requires-Dist: openai>=1.0.0
@@ -7,15 +7,15 @@ llmcomp/finetuning/manager.py,sha256=6G0CW3NWK8vdfBoAjH0HATx_g16wwq5oU0mlHs-q28o
7
7
  llmcomp/finetuning/update_jobs.py,sha256=blsHzg_ViTa2hBJtWCqR5onttehTtmXn3vmCTNd_hJw,980
8
8
  llmcomp/finetuning/validation.py,sha256=v4FoFw8woo5No9A01ktuALsMsXdgb3N2rS58ttBUmHY,14047
9
9
  llmcomp/question/judge.py,sha256=tNY94AHqncrbl2gf-g_Y3lepJ_HrahJRH-WgQyokegk,6568
10
- llmcomp/question/plots.py,sha256=Izp9jxWzQDgRgycgM7_-lhIkqx7yr_WBQedUcUcpaFA,11164
11
- llmcomp/question/question.py,sha256=cLOVp8ZD0O-Y1UI8RVpi6ZD3ulRtY8PeFwEgeAnLzvs,41100
12
- llmcomp/question/result.py,sha256=psc9tQpwEEhS4LGxaI7GhqCE1CSAmCo39yrKap9cLjA,8216
13
- llmcomp/question/viewer.py,sha256=hMHWr5cONWXF37ybXJTI_kudSz3xaA0shkQFRoNRZmI,16380
10
+ llmcomp/question/plots.py,sha256=rKh6U2CboznTPRlpBSgFW5-j3rWGw8QvngMkF1yVB6c,12468
11
+ llmcomp/question/question.py,sha256=EO6MAHqz46ksKAE4NysN5gyEoU4KAcrkJkTwqKvoT_Y,41799
12
+ llmcomp/question/result.py,sha256=UHpXVANR0jM7sJig2BtDDGh43ysBf8RiTZrXvx-Bi7c,8845
13
+ llmcomp/question/viewer.py,sha256=82a5iL_lFjRs3hDS0igoFrc5zedCAzJ23zrmY8G3bZM,17843
14
14
  llmcomp/runner/chat_completion.py,sha256=iDiWE0N0_MYfggD-ouyfUPyaADt7602K5Wo16a7JJo4,967
15
15
  llmcomp/runner/model_adapter.py,sha256=Dua98E7aBVrCaZ2Ep44vl164oFkpH1P78YqImQkns4U,3406
16
16
  llmcomp/runner/runner.py,sha256=B8p9b3At9JWWIW-mlADwyelJKqHxW4CIorSWyaD3gHM,12294
17
- llmcomp-1.3.0.dist-info/METADATA,sha256=CWC5sdrfuvQWWFOwjj7RJIzk0Rgb3EKCRPA75D5Wu4U,12963
18
- llmcomp-1.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
19
- llmcomp-1.3.0.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
20
- llmcomp-1.3.0.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
21
- llmcomp-1.3.0.dist-info/RECORD,,
17
+ llmcomp-1.3.1.dist-info/METADATA,sha256=A6fObtQ4qpYa9gWU8rAO5zH-sfyqJcXtiOwdkkla290,12987
18
+ llmcomp-1.3.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
19
+ llmcomp-1.3.1.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
20
+ llmcomp-1.3.1.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
21
+ llmcomp-1.3.1.dist-info/RECORD,,