PyPI - llmcomp - Versions diffs - 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl - Mend

llmcomp 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

llmcomp/question/plots.py CHANGED Viewed

@@ -16,6 +16,26 @@ def plot(
     selected_paraphrase: str = None,
     filename: str = None,
 ):
+    if df.empty:
+        raise ValueError("No data to plot, the dataframe is empty")
+    # Validate category_column contains hashable values (not dicts/lists)
+    if category_column in df.columns:
+        sample = df[category_column].dropna().iloc[0] if len(df[category_column].dropna()) > 0 else None
+        if isinstance(sample, (dict, list)):
+            raise ValueError(
+                f"Column '{category_column}' contains unhashable types ({type(sample).__name__}) "
+                f"and cannot be used as category_column. Did you mean answer_column='{category_column}'?"
+            )
+    # When plotting by model without explicit ordering, sort models by their group
+    if category_column == "model" and selected_categories is None and "group" in df.columns:
+        # Get first group for each model (assumes each model in single group)
+        model_to_group = df.groupby("model")["group"].first().reset_index()
+        # Sort by group, then by model name within group
+        model_to_group = model_to_group.sort_values(["group", "model"])
+        selected_categories = model_to_group["model"].tolist()
     if selected_categories is not None:
         df = df[df[category_column].isin(selected_categories)]
@@ -47,6 +67,7 @@ def plot(
                 colors=colors,
                 title=title,
                 filename=filename,
+                legend_title=answer_column,
             )
         else:
             return free_form_stacked_bar(
@@ -94,6 +115,7 @@ def plot(
             selected_categories=selected_categories,
             title=title,
             filename=filename,
+            legend_title=answer_column,
         )
     else:
         # Discrete values
@@ -147,11 +169,11 @@ def rating_cumulative_plot(
             label = f"{category} (mean: {mean_value:.1f})"
             ax.plot(x_values, y_values, label=label)
-    ax.set_xlabel("Rating")
+    ax.set_xlabel(probs_column)
     ax.set_ylabel("Fraction with score ≤ X")
     ax.set_xlim(min_rating, max_rating)
     ax.set_ylim(0, 1)
-    ax.legend()
+    ax.legend(title=category_column)
     if title is not None:
         ax.set_title(title)
@@ -173,6 +195,7 @@ def probs_stacked_bar(
     colors: dict[str, str] = None,
     title: str = None,
     filename: str = None,
+    legend_title: str = "answer",
 ):
     if min_fraction is not None and selected_answers is not None:
         raise ValueError("min_fraction and selected_answers cannot both be set")
@@ -292,7 +315,7 @@ def probs_stacked_bar(
     plt.xlabel(category_column)
     plt.ylabel("Percentage")
-    plt.legend(title="answer")
+    plt.legend(title=legend_title)
     plt.xticks(rotation=45, ha="right")
     if title is not None:
@@ -335,4 +358,5 @@ def free_form_stacked_bar(
         colors=colors,
         title=title,
         filename=filename,
+        legend_title=answer_column,
     )

llmcomp/question/question.py CHANGED Viewed

@@ -41,12 +41,17 @@ class _ViewMethod:
         self,
         df: pd.DataFrame,
         *,
-        sort_by: str | None = None,
+        sort_by: str | None = "__random__",
         sort_ascending: bool = True,
         open_browser: bool = True,
         port: int = 8501,
     ) -> None:
-        """View a DataFrame directly (class method usage)."""
+        """View a DataFrame directly (class method usage).
+        Args:
+            sort_by: Column to sort by. Default "__random__" shuffles rows randomly
+                (new seed on each browser refresh). Use None for original order.
+        """
         if isinstance(df, dict):
             raise TypeError(
                 "Question.view() expects a DataFrame, not a dict.\n"
@@ -66,12 +71,17 @@ class _ViewMethod:
         instance: "Question",
         model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
         *,
-        sort_by: str | None = None,
+        sort_by: str | None = "__random__",
         sort_ascending: bool = True,
         open_browser: bool = True,
         port: int = 8501,
     ) -> None:
-        """View results (instance method usage)."""
+        """View results (instance method usage).
+        Args:
+            sort_by: Column to sort by. Default "__random__" shuffles rows randomly
+                (new seed on each browser refresh). Use None for original order.
+        """
         if isinstance(model_groups_or_df, pd.DataFrame):
             df = model_groups_or_df
         else:
@@ -220,7 +230,7 @@ class Question(ABC):
         self.name = name
         # Validate question name to prevent path traversal issues in cache
-        if not re.match(r'^[a-zA-Z0-9_-]+$', name):
+        if not re.match(r'^[a-zA-Z0-9_\-\[\]\.\(\)]+$', name):
             raise ValueError(
                 f"Invalid question name: {name!r}. "
                 f"Name must contain only letters, numbers, underscores, and hyphens."
@@ -479,6 +489,10 @@ class Question(ABC):
         cache_file = Result.file_path(self, model)
         if os.path.exists(cache_file):
             os.remove(cache_file)
+            # Also remove lock file if present
+            lock_file = cache_file + ".lock"
+            if os.path.exists(lock_file):
+                os.remove(lock_file)
             # Clean up empty directory
             cache_dir = os.path.dirname(cache_file)
             if os.path.isdir(cache_dir) and not os.listdir(cache_dir):
@@ -629,7 +643,7 @@ class FreeForm(Question):
         *,
         temperature: float = 1,
         max_tokens: int = 1024,
-        judges: dict[str, str | dict] = None,
+        judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None = None,
         **kwargs,
     ):
         """Initialize a FreeForm question.
@@ -830,7 +844,10 @@ class FreeForm(Question):
         return df
-    def _parse_judges(self, judges: dict[str, str | dict] | None) -> dict[str, "Question"] | None:
+    def _parse_judges(
+        self,
+        judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None
+    ) -> dict[str, FreeFormJudge | RatingJudge] | None:
         """Parse and validate judges dictionary."""
         if judges is None:
             return None

llmcomp/question/result.py CHANGED Viewed

@@ -1,13 +1,38 @@
 import hashlib
 import json
 import os
+import tempfile
 from dataclasses import dataclass
 from datetime import datetime
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Callable, TextIO
+import filelock
 from llmcomp.config import Config
 from llmcomp.runner.model_adapter import ModelAdapter
+def atomic_write(path: str, write_fn: Callable[[TextIO], None]) -> None:
+    """Write to a file atomically with file locking.
+    Args:
+        path: Target file path.
+        write_fn: Function that takes a file handle and writes content.
+    """
+    dir_path = os.path.dirname(path)
+    os.makedirs(dir_path, exist_ok=True)
+    lock = filelock.FileLock(path + ".lock")
+    with lock:
+        fd, temp_path = tempfile.mkstemp(dir=dir_path, suffix=".tmp")
+        try:
+            with os.fdopen(fd, "w") as f:
+                write_fn(f)
+            os.replace(temp_path, path)
+        except:
+            os.unlink(temp_path)
+            raise
 if TYPE_CHECKING:
     from llmcomp.question.question import Question
@@ -80,12 +105,12 @@ class Result:
         return f"{Config.cache_dir}/question/{question.name}/{cache_hash(question, model)[:7]}.jsonl"
     def save(self):
-        path = self.file_path(self.question, self.model)
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        with open(path, "w") as f:
+        def write_fn(f):
             f.write(json.dumps(self._metadata()) + "\n")
             for d in self.data:
                 f.write(json.dumps(d) + "\n")
+        atomic_write(self.file_path(self.question, self.model), write_fn)
     @classmethod
     def load(cls, question: "Question", model: str) -> "Result":
@@ -189,18 +214,16 @@ class JudgeCache:
         return self._data
     def save(self):
-        """Save cache to disk."""
+        """Save cache to disk with file locking for concurrent access."""
         if self._data is None:
             return
-        path = self.file_path(self.judge)
-        os.makedirs(os.path.dirname(path), exist_ok=True)
         file_data = {
             "metadata": self._metadata(),
             "data": self._data,
         }
-        with open(path, "w") as f:
-            json.dump(file_data, f, indent=2)
+        atomic_write(self.file_path(self.judge), lambda f: json.dump(file_data, f, indent=2))
     def _metadata(self) -> dict:
         return {

llmcomp/question/viewer.py CHANGED Viewed

@@ -24,7 +24,7 @@ from typing import Any
 def render_dataframe(
     df: "pd.DataFrame",
-    sort_by: str | None = None,
+    sort_by: str | None = "__random__",
     sort_ascending: bool = True,
     open_browser: bool = True,
     port: int = 8501,
@@ -34,7 +34,8 @@ def render_dataframe(
     Args:
         df: DataFrame with at least 'api_kwargs' and 'answer' columns.
             Other columns (model, group, etc.) are displayed as metadata.
-        sort_by: Column name to sort by initially. If None, keeps original order.
+        sort_by: Column name to sort by initially. Default: "__random__" for random
+            shuffling (new seed on each refresh). Use None for original order.
         sort_ascending: Sort order. Default: True (ascending).
         open_browser: If True, automatically open the viewer in default browser.
         port: Port to run the Streamlit server on.
@@ -47,7 +48,7 @@ def render_dataframe(
         raise ValueError("DataFrame must have an 'api_kwargs' column")
     if "answer" not in df.columns:
         raise ValueError("DataFrame must have an 'answer' column")
-    if sort_by is not None and sort_by not in df.columns:
+    if sort_by is not None and sort_by != "__random__" and sort_by not in df.columns:
         raise ValueError(f"sort_by column '{sort_by}' not found in DataFrame")
     # Save DataFrame to a temp file
@@ -68,7 +69,7 @@ def render_dataframe(
     if open_browser:
         # Open browser after a short delay to let server start
         import threading
-        threading.Timer(1.5, lambda: webbrowser.open(url)).start()
+        threading.Timer(0.5, lambda: webbrowser.open(url)).start()
     # Launch Streamlit
     viewer_path = Path(__file__).resolve()
@@ -186,7 +187,9 @@ def _display_metadata(row: dict[str, Any], exclude_keys: set[str]) -> None:
             for key, value in metadata.items():
                 if isinstance(value, (dict, list)):
                     st.markdown(f"**{key}:**")
-                    st.json(value)
+                    # Collapse _raw_answer and _probs dicts by default
+                    collapsed = key.endswith("_raw_answer") or key.endswith("_probs")
+                    st.json(value, expanded=not collapsed)
                 else:
                     st.markdown(f"**{key}:** {value}")
@@ -272,7 +275,7 @@ def _streamlit_main():
         return
     # Get sortable columns (numeric or string, exclude complex types)
-    sortable_columns = ["(none)"]
+    sortable_columns = ["(random)", "(none)"]
     if items:
         for key, value in items[0].items():
             if key not in ("api_kwargs",) and isinstance(value, (int, float, str, type(None))):
@@ -281,7 +284,13 @@ def _streamlit_main():
     # Initialize sort settings from command line args
     initial_sort_by, initial_sort_asc = _get_initial_sort()
     if "sort_by" not in st.session_state:
-        st.session_state.sort_by = initial_sort_by if initial_sort_by in sortable_columns else "(none)"
+        # Map __random__ from CLI to (random) in UI
+        if initial_sort_by == "__random__":
+            st.session_state.sort_by = "(random)"
+        elif initial_sort_by in sortable_columns:
+            st.session_state.sort_by = initial_sort_by
+        else:
+            st.session_state.sort_by = "(none)"
         st.session_state.sort_ascending = initial_sort_asc
     # Initialize view index
@@ -317,6 +326,16 @@ def _streamlit_main():
             st.session_state.sort_ascending = sort_ascending
             st.session_state.view_idx = 0
+    # Reshuffle button for random sort
+    if st.session_state.sort_by == "(random)":
+        import random
+        col_reshuffle, _ = st.columns([1, 5])
+        with col_reshuffle:
+            if st.button("🔀 Reshuffle"):
+                st.session_state.random_seed = random.randint(0, 2**32 - 1)
+                st.session_state.view_idx = 0
+                st.rerun()
     # Secondary sort (only show if primary sort is selected)
     if st.session_state.sort_by and st.session_state.sort_by != "(none)":
         col_spacer, col_sort2, col_order2 = st.columns([3, 2, 1])
@@ -340,8 +359,18 @@ def _streamlit_main():
     # Apply search
     filtered_items = _search_items(items, query)
+    # Apply random shuffle if selected (new seed on each refresh via Reshuffle button)
+    if st.session_state.sort_by == "(random)" and filtered_items:
+        import random
+        # Generate a new seed on first load or when explicitly reshuffled
+        if "random_seed" not in st.session_state:
+            st.session_state.random_seed = random.randint(0, 2**32 - 1)
+        rng = random.Random(st.session_state.random_seed)
+        filtered_items = filtered_items.copy()
+        rng.shuffle(filtered_items)
     # Apply sorting (stable sort - secondary first, then primary)
-    if st.session_state.sort_by and st.session_state.sort_by != "(none)" and filtered_items:
+    if st.session_state.sort_by and st.session_state.sort_by not in ("(none)", "(random)") and filtered_items:
         sort_key_2 = st.session_state.sort_by_2 if st.session_state.sort_by_2 != "(none)" else None
         # Secondary sort first (stable sort preserves this ordering within primary groups)
@@ -429,7 +458,7 @@ def _streamlit_main():
         # Display judge columns if present
         judge_columns = [k for k in current.keys() if not k.startswith("_") and k not in {
             "api_kwargs", "answer", "question", "model", "group", "paraphrase_ix", "raw_answer"
-        } and not k.endswith("_question") and not k.endswith("_raw_answer")]
+        } and not k.endswith("_question") and not k.endswith("_raw_answer") and not k.endswith("_probs")]
         if judge_columns:
             st.markdown("---")

{llmcomp-1.3.0.dist-info → llmcomp-1.3.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.3.0
+Version: 1.3.1
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -9,6 +9,7 @@ License: MIT
 License-File: LICENSE
 Requires-Python: >=3.9
 Requires-Dist: backoff
+Requires-Dist: filelock
 Requires-Dist: matplotlib
 Requires-Dist: numpy
 Requires-Dist: openai>=1.0.0

{llmcomp-1.3.0.dist-info → llmcomp-1.3.1.dist-info}/RECORD RENAMED Viewed

@@ -7,15 +7,15 @@ llmcomp/finetuning/manager.py,sha256=6G0CW3NWK8vdfBoAjH0HATx_g16wwq5oU0mlHs-q28o
 llmcomp/finetuning/update_jobs.py,sha256=blsHzg_ViTa2hBJtWCqR5onttehTtmXn3vmCTNd_hJw,980
 llmcomp/finetuning/validation.py,sha256=v4FoFw8woo5No9A01ktuALsMsXdgb3N2rS58ttBUmHY,14047
 llmcomp/question/judge.py,sha256=tNY94AHqncrbl2gf-g_Y3lepJ_HrahJRH-WgQyokegk,6568
-llmcomp/question/plots.py,sha256=Izp9jxWzQDgRgycgM7_-lhIkqx7yr_WBQedUcUcpaFA,11164
-llmcomp/question/question.py,sha256=cLOVp8ZD0O-Y1UI8RVpi6ZD3ulRtY8PeFwEgeAnLzvs,41100
-llmcomp/question/result.py,sha256=psc9tQpwEEhS4LGxaI7GhqCE1CSAmCo39yrKap9cLjA,8216
-llmcomp/question/viewer.py,sha256=hMHWr5cONWXF37ybXJTI_kudSz3xaA0shkQFRoNRZmI,16380
+llmcomp/question/plots.py,sha256=rKh6U2CboznTPRlpBSgFW5-j3rWGw8QvngMkF1yVB6c,12468
+llmcomp/question/question.py,sha256=EO6MAHqz46ksKAE4NysN5gyEoU4KAcrkJkTwqKvoT_Y,41799
+llmcomp/question/result.py,sha256=UHpXVANR0jM7sJig2BtDDGh43ysBf8RiTZrXvx-Bi7c,8845
+llmcomp/question/viewer.py,sha256=82a5iL_lFjRs3hDS0igoFrc5zedCAzJ23zrmY8G3bZM,17843
 llmcomp/runner/chat_completion.py,sha256=iDiWE0N0_MYfggD-ouyfUPyaADt7602K5Wo16a7JJo4,967
 llmcomp/runner/model_adapter.py,sha256=Dua98E7aBVrCaZ2Ep44vl164oFkpH1P78YqImQkns4U,3406
 llmcomp/runner/runner.py,sha256=B8p9b3At9JWWIW-mlADwyelJKqHxW4CIorSWyaD3gHM,12294
-llmcomp-1.3.0.dist-info/METADATA,sha256=CWC5sdrfuvQWWFOwjj7RJIzk0Rgb3EKCRPA75D5Wu4U,12963
-llmcomp-1.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llmcomp-1.3.0.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
-llmcomp-1.3.0.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
-llmcomp-1.3.0.dist-info/RECORD,,
+llmcomp-1.3.1.dist-info/METADATA,sha256=A6fObtQ4qpYa9gWU8rAO5zH-sfyqJcXtiOwdkkla290,12987
+llmcomp-1.3.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llmcomp-1.3.1.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
+llmcomp-1.3.1.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
+llmcomp-1.3.1.dist-info/RECORD,,

{llmcomp-1.3.0.dist-info → llmcomp-1.3.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{llmcomp-1.3.0.dist-info → llmcomp-1.3.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{llmcomp-1.3.0.dist-info → llmcomp-1.3.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

llmcomp 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

llmcomp 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl