llmcomp 1.3.0__tar.gz → 1.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmcomp-1.3.0 → llmcomp-1.3.2}/PKG-INFO +2 -1
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/config.py +9 -1
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/plots.py +32 -5
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/question.py +24 -7
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/result.py +32 -9
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/viewer.py +38 -9
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/chat_completion.py +19 -13
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/runner.py +5 -5
- {llmcomp-1.3.0 → llmcomp-1.3.2}/pyproject.toml +2 -1
- llmcomp-1.3.2/t1.py +11 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/conftest.py +2 -1
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_question.py +4 -2
- llmcomp-1.3.0/t1.py +0 -13
- {llmcomp-1.3.0 → llmcomp-1.3.2}/.gitignore +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/LICENSE +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/README.md +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/api.md +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/finetuning.md +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/docs/generate_api_docs.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/configuration.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/create_finetuning_job.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/free_form_question.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/ft_old_audubon_birds.jsonl +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/judges.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/model_adapter.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/next_token_question.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/openrouter.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/questions.yaml +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/questions_in_yaml.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/rating_question.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/runner.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/tinker.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/examples/x_mod_57.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/lint.sh +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/__init__.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/default_adapters.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/__init__.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/manager.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/update_jobs.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/finetuning/validation.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/judge.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/model_adapter.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/utils.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/__init__.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_clear_cache.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_config.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_hash_and_cache.py +0 -0
- {llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_utils.py +0 -0
{llmcomp-1.3.0 → llmcomp-1.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.3.0
+Version: 1.3.2
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp

@@ -9,6 +9,7 @@ License: MIT
 License-File: LICENSE
 Requires-Python: >=3.9
 Requires-Dist: backoff
+Requires-Dist: filelock
 Requires-Dist: matplotlib
 Requires-Dist: numpy
 Requires-Dist: openai>=1.0.0

{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/config.py

@@ -238,12 +238,20 @@ class Config(metaclass=_ConfigMeta):
         try:
             client = openai.OpenAI(api_key=key, base_url=url)
             params = ModelAdapter.test_request_params(model)
-
+
+            backoff_on = [openai.RateLimitError, openai.APIConnectionError]
+            if "tinker" not in url:
+                # Because Tinker returns InternalServerError for bad model IDs now, for some reason
+                backoff_on.append(openai.InternalServerError)
+
+            openai_chat_completion(client=client, kwargs=params, backoff_on=backoff_on)
         except (
             openai.NotFoundError,
             openai.BadRequestError,
             openai.PermissionDeniedError,
             openai.AuthenticationError,
+            openai.InternalServerError,
+            openai.APITimeoutError,
         ) as e:
             if Config.verbose:
                 print(f"{model} doesn't work with url {url} and key {key[:16]}... ({e})")

{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/plots.py

@@ -16,6 +16,26 @@ def plot(
     selected_paraphrase: str = None,
     filename: str = None,
 ):
+    if df.empty:
+        raise ValueError("No data to plot, the dataframe is empty")
+
+    # Validate category_column contains hashable values (not dicts/lists)
+    if category_column in df.columns:
+        sample = df[category_column].dropna().iloc[0] if len(df[category_column].dropna()) > 0 else None
+        if isinstance(sample, (dict, list)):
+            raise ValueError(
+                f"Column '{category_column}' contains unhashable types ({type(sample).__name__}) "
+                f"and cannot be used as category_column. Did you mean answer_column='{category_column}'?"
+            )
+
+    # When plotting by model without explicit ordering, sort models by their group
+    if category_column == "model" and selected_categories is None and "group" in df.columns:
+        # Get first group for each model (assumes each model in single group)
+        model_to_group = df.groupby("model")["group"].first().reset_index()
+        # Sort by group, then by model name within group
+        model_to_group = model_to_group.sort_values(["group", "model"])
+        selected_categories = model_to_group["model"].tolist()
+
     if selected_categories is not None:
         df = df[df[category_column].isin(selected_categories)]

@@ -30,7 +50,7 @@ def plot(
     title = selected_paraphrase + f"\nand {num_paraphrases - 1} other paraphrases"

     # Dispatch based on arguments and data
-    stacked_bar_args = selected_answers is not None or min_fraction is not None
+    stacked_bar_args = selected_answers is not None or min_fraction is not None

     if stacked_bar_args:
         # Stacked bar specific args provided

@@ -47,6 +67,7 @@ def plot(
             colors=colors,
             title=title,
             filename=filename,
+            legend_title=answer_column,
         )
     else:
         return free_form_stacked_bar(

@@ -82,6 +103,7 @@ def plot(
             probs_column=answer_column,
             category_column=category_column,
             selected_categories=selected_categories,
+            colors=colors,
             title=title,
             filename=filename,
         )

@@ -94,6 +116,7 @@ def plot(
             selected_categories=selected_categories,
             title=title,
             filename=filename,
+            legend_title=answer_column,
         )
     else:
         # Discrete values

@@ -114,6 +137,7 @@ def rating_cumulative_plot(
     probs_column: str = "probs",
     category_column: str = "group",
     selected_categories: list[str] = None,
+    colors: dict[str, str] = None,
     title: str = None,
     filename: str = None,
 ):

@@ -145,13 +169,14 @@ def rating_cumulative_plot(
         y_values = [cumulative[x] / n_valid for x in x_values]
         mean_value = mean_sum / n_valid
         label = f"{category} (mean: {mean_value:.1f})"
-
+        color = colors.get(category) if colors else None
+        ax.plot(x_values, y_values, label=label, color=color)

-    ax.set_xlabel(
+    ax.set_xlabel(probs_column)
     ax.set_ylabel("Fraction with score ≤ X")
     ax.set_xlim(min_rating, max_rating)
     ax.set_ylim(0, 1)
-    ax.legend()
+    ax.legend(title=category_column)

     if title is not None:
         ax.set_title(title)

@@ -173,6 +198,7 @@ def probs_stacked_bar(
     colors: dict[str, str] = None,
     title: str = None,
     filename: str = None,
+    legend_title: str = "answer",
 ):
     if min_fraction is not None and selected_answers is not None:
         raise ValueError("min_fraction and selected_answers cannot both be set")

@@ -292,7 +318,7 @@ def probs_stacked_bar(

     plt.xlabel(category_column)
     plt.ylabel("Percentage")
-    plt.legend(title=
+    plt.legend(title=legend_title)
     plt.xticks(rotation=45, ha="right")

     if title is not None:

@@ -335,4 +361,5 @@ def free_form_stacked_bar(
         colors=colors,
         title=title,
         filename=filename,
+        legend_title=answer_column,
     )

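The new model-ordering block in plot() is plain pandas and can be checked in isolation. A minimal sketch on a made-up dataframe (the "model" and "group" column names are the ones the diff relies on; everything else here is illustrative):

import pandas as pd

# Toy results frame: two groups, three models, deliberately interleaved.
df = pd.DataFrame({
    "model": ["m-b", "m-a", "m-c", "m-a"],
    "group": ["g2", "g1", "g1", "g1"],
})

# Same steps as the added code: first group per model, then sort by group and model.
model_to_group = df.groupby("model")["group"].first().reset_index()
model_to_group = model_to_group.sort_values(["group", "model"])
print(model_to_group["model"].tolist())  # ['m-a', 'm-c', 'm-b']

Models from the same group end up adjacent on the category axis even when the input rows interleave them.
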
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/question.py

@@ -41,12 +41,17 @@ class _ViewMethod:
         self,
         df: pd.DataFrame,
         *,
-        sort_by: str | None =
+        sort_by: str | None = "__random__",
         sort_ascending: bool = True,
         open_browser: bool = True,
         port: int = 8501,
     ) -> None:
-        """View a DataFrame directly (class method usage).
+        """View a DataFrame directly (class method usage).
+
+        Args:
+            sort_by: Column to sort by. Default "__random__" shuffles rows randomly
+                (new seed on each browser refresh). Use None for original order.
+        """
         if isinstance(df, dict):
             raise TypeError(
                 "Question.view() expects a DataFrame, not a dict.\n"

@@ -66,12 +71,17 @@ class _ViewMethod:
         instance: "Question",
         model_groups_or_df: dict[str, list[str]] | pd.DataFrame,
         *,
-        sort_by: str | None =
+        sort_by: str | None = "__random__",
         sort_ascending: bool = True,
         open_browser: bool = True,
         port: int = 8501,
     ) -> None:
-        """View results (instance method usage).
+        """View results (instance method usage).
+
+        Args:
+            sort_by: Column to sort by. Default "__random__" shuffles rows randomly
+                (new seed on each browser refresh). Use None for original order.
+        """
         if isinstance(model_groups_or_df, pd.DataFrame):
             df = model_groups_or_df
         else:

@@ -220,7 +230,7 @@ class Question(ABC):
         self.name = name

         # Validate question name to prevent path traversal issues in cache
-        if not re.match(r'^[a-zA-Z0-9_
+        if not re.match(r'^[a-zA-Z0-9_\-\[\]\.\(\)]+$', name):
             raise ValueError(
                 f"Invalid question name: {name!r}. "
                 f"Name must contain only letters, numbers, underscores, and hyphens."

@@ -479,6 +489,10 @@ class Question(ABC):
         cache_file = Result.file_path(self, model)
         if os.path.exists(cache_file):
             os.remove(cache_file)
+        # Also remove lock file if present
+        lock_file = cache_file + ".lock"
+        if os.path.exists(lock_file):
+            os.remove(lock_file)
         # Clean up empty directory
         cache_dir = os.path.dirname(cache_file)
         if os.path.isdir(cache_dir) and not os.listdir(cache_dir):

@@ -629,7 +643,7 @@ class FreeForm(Question):
         *,
         temperature: float = 1,
         max_tokens: int = 1024,
-        judges: dict[str, str | dict] = None,
+        judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None = None,
         **kwargs,
     ):
         """Initialize a FreeForm question.

@@ -830,7 +844,10 @@ class FreeForm(Question):

         return df

-    def _parse_judges(
+    def _parse_judges(
+        self,
+        judges: dict[str, str | dict | FreeFormJudge | RatingJudge] | None
+    ) -> dict[str, FreeFormJudge | RatingJudge] | None:
         """Parse and validate judges dictionary."""
         if judges is None:
             return None

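The question-name validation now also accepts brackets, dots, and parentheses in addition to letters, digits, underscores, and hyphens. A quick standalone check of the new pattern (the sample names are made up):

import re

NAME_PATTERN = r'^[a-zA-Z0-9_\-\[\]\.\(\)]+$'

for name in ["my_question", "q-1[v2].final", "bad/name", "spaces not ok"]:
    print(f"{name!r}: {bool(re.match(NAME_PATTERN, name))}")
# 'my_question': True, 'q-1[v2].final': True, 'bad/name': False, 'spaces not ok': False

Note that the error message still only mentions letters, numbers, underscores, and hyphens, so its wording lags slightly behind the widened pattern.
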
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/question/result.py

@@ -1,13 +1,38 @@
 import hashlib
 import json
 import os
+import tempfile
 from dataclasses import dataclass
 from datetime import datetime
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Callable, TextIO
+
+import filelock

 from llmcomp.config import Config
 from llmcomp.runner.model_adapter import ModelAdapter

+
+def atomic_write(path: str, write_fn: Callable[[TextIO], None]) -> None:
+    """Write to a file atomically with file locking.
+
+    Args:
+        path: Target file path.
+        write_fn: Function that takes a file handle and writes content.
+    """
+    dir_path = os.path.dirname(path)
+    os.makedirs(dir_path, exist_ok=True)
+
+    lock = filelock.FileLock(path + ".lock")
+    with lock:
+        fd, temp_path = tempfile.mkstemp(dir=dir_path, suffix=".tmp")
+        try:
+            with os.fdopen(fd, "w") as f:
+                write_fn(f)
+            os.replace(temp_path, path)
+        except:
+            os.unlink(temp_path)
+            raise
+
 if TYPE_CHECKING:
     from llmcomp.question.question import Question

@@ -80,12 +105,12 @@ class Result:
         return f"{Config.cache_dir}/question/{question.name}/{cache_hash(question, model)[:7]}.jsonl"

     def save(self):
-
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        with open(path, "w") as f:
+        def write_fn(f):
             f.write(json.dumps(self._metadata()) + "\n")
             for d in self.data:
                 f.write(json.dumps(d) + "\n")
+
+        atomic_write(self.file_path(self.question, self.model), write_fn)

     @classmethod
     def load(cls, question: "Question", model: str) -> "Result":

@@ -189,18 +214,16 @@ class JudgeCache:
         return self._data

     def save(self):
-        """Save cache to disk."""
+        """Save cache to disk with file locking for concurrent access."""
         if self._data is None:
             return

-        path = self.file_path(self.judge)
-        os.makedirs(os.path.dirname(path), exist_ok=True)
         file_data = {
             "metadata": self._metadata(),
             "data": self._data,
         }
-
-
+
+        atomic_write(self.file_path(self.judge), lambda f: json.dump(file_data, f, indent=2))

     def _metadata(self) -> dict:
         return {

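The new atomic_write helper takes a filelock on "<path>.lock", writes to a temporary file in the same directory, and moves it into place with os.replace, so concurrent runs cannot interleave partial cache writes; the lock file it leaves behind is what clear_cache() now also removes. A sketch of calling the helper directly (the target path and records below are made up):

import json
from llmcomp.question.result import atomic_write

records = [{"answer": "yes"}, {"answer": "no"}]

def write_fn(f):
    # Receives an open text handle for the temporary file.
    for r in records:
        f.write(json.dumps(r) + "\n")

# Creates /tmp/llmcomp-demo/ if needed and leaves results.jsonl plus results.jsonl.lock.
atomic_write("/tmp/llmcomp-demo/results.jsonl", write_fn)
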
@@ -24,7 +24,7 @@ from typing import Any
|
|
|
24
24
|
|
|
25
25
|
def render_dataframe(
|
|
26
26
|
df: "pd.DataFrame",
|
|
27
|
-
sort_by: str | None =
|
|
27
|
+
sort_by: str | None = "__random__",
|
|
28
28
|
sort_ascending: bool = True,
|
|
29
29
|
open_browser: bool = True,
|
|
30
30
|
port: int = 8501,
|
|
@@ -34,7 +34,8 @@ def render_dataframe(
|
|
|
34
34
|
Args:
|
|
35
35
|
df: DataFrame with at least 'api_kwargs' and 'answer' columns.
|
|
36
36
|
Other columns (model, group, etc.) are displayed as metadata.
|
|
37
|
-
sort_by: Column name to sort by initially.
|
|
37
|
+
sort_by: Column name to sort by initially. Default: "__random__" for random
|
|
38
|
+
shuffling (new seed on each refresh). Use None for original order.
|
|
38
39
|
sort_ascending: Sort order. Default: True (ascending).
|
|
39
40
|
open_browser: If True, automatically open the viewer in default browser.
|
|
40
41
|
port: Port to run the Streamlit server on.
|
|
@@ -47,7 +48,7 @@ def render_dataframe(
|
|
|
47
48
|
raise ValueError("DataFrame must have an 'api_kwargs' column")
|
|
48
49
|
if "answer" not in df.columns:
|
|
49
50
|
raise ValueError("DataFrame must have an 'answer' column")
|
|
50
|
-
if sort_by is not None and sort_by not in df.columns:
|
|
51
|
+
if sort_by is not None and sort_by != "__random__" and sort_by not in df.columns:
|
|
51
52
|
raise ValueError(f"sort_by column '{sort_by}' not found in DataFrame")
|
|
52
53
|
|
|
53
54
|
# Save DataFrame to a temp file
|
|
@@ -68,7 +69,7 @@ def render_dataframe(
|
|
|
68
69
|
if open_browser:
|
|
69
70
|
# Open browser after a short delay to let server start
|
|
70
71
|
import threading
|
|
71
|
-
threading.Timer(
|
|
72
|
+
threading.Timer(0.5, lambda: webbrowser.open(url)).start()
|
|
72
73
|
|
|
73
74
|
# Launch Streamlit
|
|
74
75
|
viewer_path = Path(__file__).resolve()
|
|
@@ -186,7 +187,9 @@ def _display_metadata(row: dict[str, Any], exclude_keys: set[str]) -> None:
|
|
|
186
187
|
for key, value in metadata.items():
|
|
187
188
|
if isinstance(value, (dict, list)):
|
|
188
189
|
st.markdown(f"**{key}:**")
|
|
189
|
-
|
|
190
|
+
# Collapse _raw_answer and _probs dicts by default
|
|
191
|
+
collapsed = key.endswith("_raw_answer") or key.endswith("_probs")
|
|
192
|
+
st.json(value, expanded=not collapsed)
|
|
190
193
|
else:
|
|
191
194
|
st.markdown(f"**{key}:** {value}")
|
|
192
195
|
|
|
@@ -272,7 +275,7 @@ def _streamlit_main():
|
|
|
272
275
|
return
|
|
273
276
|
|
|
274
277
|
# Get sortable columns (numeric or string, exclude complex types)
|
|
275
|
-
sortable_columns = ["(none)"]
|
|
278
|
+
sortable_columns = ["(random)", "(none)"]
|
|
276
279
|
if items:
|
|
277
280
|
for key, value in items[0].items():
|
|
278
281
|
if key not in ("api_kwargs",) and isinstance(value, (int, float, str, type(None))):
|
|
@@ -281,7 +284,13 @@ def _streamlit_main():
|
|
|
281
284
|
# Initialize sort settings from command line args
|
|
282
285
|
initial_sort_by, initial_sort_asc = _get_initial_sort()
|
|
283
286
|
if "sort_by" not in st.session_state:
|
|
284
|
-
|
|
287
|
+
# Map __random__ from CLI to (random) in UI
|
|
288
|
+
if initial_sort_by == "__random__":
|
|
289
|
+
st.session_state.sort_by = "(random)"
|
|
290
|
+
elif initial_sort_by in sortable_columns:
|
|
291
|
+
st.session_state.sort_by = initial_sort_by
|
|
292
|
+
else:
|
|
293
|
+
st.session_state.sort_by = "(none)"
|
|
285
294
|
st.session_state.sort_ascending = initial_sort_asc
|
|
286
295
|
|
|
287
296
|
# Initialize view index
|
|
@@ -317,6 +326,16 @@ def _streamlit_main():
|
|
|
317
326
|
st.session_state.sort_ascending = sort_ascending
|
|
318
327
|
st.session_state.view_idx = 0
|
|
319
328
|
|
|
329
|
+
# Reshuffle button for random sort
|
|
330
|
+
if st.session_state.sort_by == "(random)":
|
|
331
|
+
import random
|
|
332
|
+
col_reshuffle, _ = st.columns([1, 5])
|
|
333
|
+
with col_reshuffle:
|
|
334
|
+
if st.button("🔀 Reshuffle"):
|
|
335
|
+
st.session_state.random_seed = random.randint(0, 2**32 - 1)
|
|
336
|
+
st.session_state.view_idx = 0
|
|
337
|
+
st.rerun()
|
|
338
|
+
|
|
320
339
|
# Secondary sort (only show if primary sort is selected)
|
|
321
340
|
if st.session_state.sort_by and st.session_state.sort_by != "(none)":
|
|
322
341
|
col_spacer, col_sort2, col_order2 = st.columns([3, 2, 1])
|
|
@@ -340,8 +359,18 @@ def _streamlit_main():
|
|
|
340
359
|
# Apply search
|
|
341
360
|
filtered_items = _search_items(items, query)
|
|
342
361
|
|
|
362
|
+
# Apply random shuffle if selected (new seed on each refresh via Reshuffle button)
|
|
363
|
+
if st.session_state.sort_by == "(random)" and filtered_items:
|
|
364
|
+
import random
|
|
365
|
+
# Generate a new seed on first load or when explicitly reshuffled
|
|
366
|
+
if "random_seed" not in st.session_state:
|
|
367
|
+
st.session_state.random_seed = random.randint(0, 2**32 - 1)
|
|
368
|
+
rng = random.Random(st.session_state.random_seed)
|
|
369
|
+
filtered_items = filtered_items.copy()
|
|
370
|
+
rng.shuffle(filtered_items)
|
|
371
|
+
|
|
343
372
|
# Apply sorting (stable sort - secondary first, then primary)
|
|
344
|
-
if st.session_state.sort_by and st.session_state.sort_by
|
|
373
|
+
if st.session_state.sort_by and st.session_state.sort_by not in ("(none)", "(random)") and filtered_items:
|
|
345
374
|
sort_key_2 = st.session_state.sort_by_2 if st.session_state.sort_by_2 != "(none)" else None
|
|
346
375
|
|
|
347
376
|
# Secondary sort first (stable sort preserves this ordering within primary groups)
|
|
@@ -429,7 +458,7 @@ def _streamlit_main():
|
|
|
429
458
|
# Display judge columns if present
|
|
430
459
|
judge_columns = [k for k in current.keys() if not k.startswith("_") and k not in {
|
|
431
460
|
"api_kwargs", "answer", "question", "model", "group", "paraphrase_ix", "raw_answer"
|
|
432
|
-
} and not k.endswith("_question") and not k.endswith("_raw_answer")]
|
|
461
|
+
} and not k.endswith("_question") and not k.endswith("_raw_answer") and not k.endswith("_probs")]
|
|
433
462
|
|
|
434
463
|
if judge_columns:
|
|
435
464
|
st.markdown("---")
|
|
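The "(random)" sort option keeps one seed in Streamlit session state, so the shuffled order stays stable across reruns until the Reshuffle button draws a new seed. The idea in isolation, outside Streamlit (names here are illustrative):

import random

items = ["a", "b", "c", "d", "e"]
seed = random.randint(0, 2**32 - 1)  # kept once, e.g. in st.session_state.random_seed

def shuffled(items, seed):
    rng = random.Random(seed)
    out = items.copy()
    rng.shuffle(out)
    return out

# Same seed gives the same order on every rerun; drawing a new seed reshuffles.
assert shuffled(items, seed) == shuffled(items, seed)
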
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/chat_completion.py

@@ -15,17 +15,23 @@ def on_backoff(details):
 # But we can do that only by reading the message, and this is bad.


-
-
-
-
-
-    openai.APITimeoutError,
-    openai.InternalServerError,
-    ),
-    max_value=60,
-    factor=1.5,
-    on_backoff=on_backoff,
+DEFAULT_BACKOFF_EXCEPTIONS = (
+    openai.RateLimitError,
+    openai.APIConnectionError,
+    openai.APITimeoutError,
+    openai.InternalServerError,
 )
-
-
+
+
+def openai_chat_completion(*, client, kwargs: dict, backoff_on=DEFAULT_BACKOFF_EXCEPTIONS):
+    @backoff.on_exception(
+        wait_gen=backoff.expo,
+        exception=tuple(backoff_on),
+        max_value=60,
+        factor=1.5,
+        on_backoff=on_backoff,
+    )
+    def _call():
+        return client.chat.completions.create(**kwargs)
+
+    return _call()

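With the retried exception set now a parameter, callers can decide per request which errors are worth backing off on; config.py uses this to avoid retrying InternalServerError against Tinker, where it signals a bad model ID. A hedged sketch of a direct call (client credentials and the model name are placeholders):

import openai
from llmcomp.runner.chat_completion import openai_chat_completion

client = openai.OpenAI()  # placeholder: reads credentials from the environment

# Retry only on rate limits and connection problems; fail fast on anything else.
completion = openai_chat_completion(
    client=client,
    kwargs={"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Hi"}]},
    backoff_on=[openai.RateLimitError, openai.APIConnectionError],
)
print(completion.choices[0].message.content)
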
{llmcomp-1.3.0 → llmcomp-1.3.2}/llmcomp/runner/runner.py

@@ -62,7 +62,7 @@ class Runner:
             Tuple of (content, prepared_kwargs) where prepared_kwargs is what was sent to the API.
         """
         prepared = self._prepare_for_model(params)
-        completion = openai_chat_completion(client=self.client,
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)
         try:
             content = completion.choices[0].message.content
             if content is None:

@@ -138,7 +138,7 @@ class Runner:
             "logprobs": True,
         }
         prepared = self._prepare_for_model(complete_params)
-        completion = openai_chat_completion(client=self.client,
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)

         if completion.choices[0].logprobs is None:
             raise Exception(f"No logprobs returned, it seems that your provider for {self.model} doesn't support that.")

@@ -236,11 +236,11 @@ class Runner:
                 else:
                     msg_info = ""
                 warnings.warn(
-                    f"Unexpected error (probably API-related), runner returns
+                    f"Unexpected error (probably API-related), runner returns empty string. "
                     f"Model: {self.model}, function: {func.__name__}{msg_info}. "
                     f"Error: {type(e).__name__}: {e}"
                 )
-                result = (
+                result = ("", {})
             return kwargs, result

         futures = [executor.submit(get_data, kwargs) for kwargs in kwargs_list]

@@ -290,7 +290,7 @@ class Runner:
             "n": n,
         }
         prepared = self._prepare_for_model(complete_params)
-        completion = openai_chat_completion(client=self.client,
+        completion = openai_chat_completion(client=self.client, kwargs=prepared)
         for choice in completion.choices:
             cnts[choice.message.content] += 1
         if sum(cnts.values()) != num_samples:

{llmcomp-1.3.0 → llmcomp-1.3.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "llmcomp"
-version = "1.3.0"
+version = "1.3.2"
 description = "Research library for black-box experiments on language models."
 readme = "README.md"
 requires-python = ">=3.9"

@@ -22,6 +22,7 @@ dependencies = [
     "backoff",
     "requests",
     "streamlit>=1.20.0",
+    "filelock",
 ]

 [project.scripts]

llmcomp-1.3.2/t1.py (ADDED, +11 lines)

{llmcomp-1.3.0 → llmcomp-1.3.2}/tests/conftest.py

@@ -62,7 +62,8 @@ def mock_openai_chat_completion():
     Config.client_cache.clear()

     # Create a function that returns a properly structured mock completion
-    def create_mock_completion(*, client=None,
+    def create_mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         # Extract messages to determine what response to return
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)

{llmcomp-1.3.0 → llmcomp-1.3.2}/tests/test_question.py

@@ -591,7 +591,8 @@ def test_rating_aggregates_duplicate_integer_tokens(temp_dir):

     Config.client_cache.clear()

-    def mock_completion(*, client=None,
+    def mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)


@@ -683,7 +684,8 @@ def test_judge_with_answer_only_template_and_duplicate_answers(temp_dir):
     # Track what prompts were sent to the API
     api_calls = []

-    def mock_completion(*, client=None,
+    def mock_completion(*, client=None, kwargs=None):
+        kwargs = kwargs or {}
         messages = kwargs.get('messages', [])
         logprobs = kwargs.get('logprobs', False)

llmcomp-1.3.0/t1.py (DELETED)

@@ -1,13 +0,0 @@
-import tinker
-
-sc = tinker.ServiceClient()
-tc = sc.create_lora_training_client(
-    base_model="openai/gpt-oss-20b",
-    rank=1,
-    seed=0,
-    train_mlp=False,
-    train_attn=False,
-    train_unembed=False,
-)
-path = tc.save_weights_for_sampler(name="gpt-oss-20b-base-like").result().path
-print(path)

All remaining files listed above are unchanged between 1.3.0 and 1.3.2.