llmcomp-1.2.4-py3-none-any.whl → llmcomp-1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmcomp/question/viewer.py ADDED
@@ -0,0 +1,459 @@
+ """DataFrame viewer for browsing question results.
+
+ Spawns a local Streamlit server to interactively browse (api_kwargs, answer) pairs.
+
+ Usage:
+     from llmcomp import Question
+
+     question = Question.create(...)
+     df = question.df(models)
+     Question.view(df)
+ """
+
+ import json
+ import os
+ import subprocess
+ import sys
+ import tempfile
+ import webbrowser
+ from pathlib import Path
+ from typing import Any
+
+ # Streamlit imports are inside functions to avoid import errors when streamlit isn't installed
+
+
+ def render_dataframe(
+     df: "pd.DataFrame",
+     sort_by: str | None = None,
+     sort_ascending: bool = True,
+     open_browser: bool = True,
+     port: int = 8501,
+ ) -> None:
+     """Launch a Streamlit viewer for the DataFrame.
+
+     Args:
+         df: DataFrame with at least 'api_kwargs' and 'answer' columns.
+             Other columns (model, group, etc.) are displayed as metadata.
+         sort_by: Column name to sort by initially. If None, keeps original order.
+         sort_ascending: Sort order. Default: True (ascending).
+         open_browser: If True, automatically open the viewer in the default browser.
+         port: Port to run the Streamlit server on.
+
+     Raises:
+         ValueError: If required columns are missing.
+     """
+     # Validate required columns
+     if "api_kwargs" not in df.columns:
+         raise ValueError("DataFrame must have an 'api_kwargs' column")
+     if "answer" not in df.columns:
+         raise ValueError("DataFrame must have an 'answer' column")
+     if sort_by is not None and sort_by not in df.columns:
+         raise ValueError(f"sort_by column '{sort_by}' not found in DataFrame")
+
+     # Save DataFrame to a temp file
+     temp_dir = tempfile.mkdtemp(prefix="llmcomp_viewer_")
+     temp_path = os.path.join(temp_dir, "data.jsonl")
+
+     # Convert DataFrame to JSONL
+     with open(temp_path, "w", encoding="utf-8") as f:
+         for _, row in df.iterrows():
+             row_dict = row.to_dict()
+             f.write(json.dumps(row_dict, default=str) + "\n")
+
+     url = f"http://localhost:{port}"
+     print(f"Starting viewer at {url}")
+     print(f"Data file: {temp_path}")
+     print("Press Ctrl+C to stop the server.\n")
+
+     if open_browser:
+         # Open browser after a short delay to let server start
+         import threading
+         threading.Timer(1.5, lambda: webbrowser.open(url)).start()
+
+     # Launch Streamlit
+     viewer_path = Path(__file__).resolve()
+     cmd = [
+         sys.executable, "-m", "streamlit", "run",
+         str(viewer_path),
+         "--server.port", str(port),
+         "--server.headless", "true",
+         "--",  # Separator for script args
+         temp_path,
+         sort_by or "",  # Empty string means no sorting
+         "asc" if sort_ascending else "desc",
+     ]
+
+     try:
+         subprocess.run(cmd, check=True)
+     except KeyboardInterrupt:
+         print("\nViewer stopped.")
+     finally:
+         # Clean up temp file
+         try:
+             os.remove(temp_path)
+             os.rmdir(temp_dir)
+         except OSError:
+             pass
+
+
+ # =============================================================================
+ # Streamlit App (runs when this file is executed by streamlit)
+ # =============================================================================
+
+ def _get_data_path() -> str | None:
+     """Get data file path from command line args."""
+     # Args after -- are passed to the script
+     if len(sys.argv) > 1:
+         return sys.argv[1]
+     return None
+
+
+ def _get_initial_sort() -> tuple[str | None, bool]:
+     """Get initial sort settings from command line args."""
+     sort_by = None
+     sort_ascending = True
+
+     if len(sys.argv) > 2:
+         sort_by = sys.argv[2] if sys.argv[2] else None
+     if len(sys.argv) > 3:
+         sort_ascending = sys.argv[3] != "desc"
+
+     return sort_by, sort_ascending
+
+
+ def _read_jsonl(path: str) -> list[dict[str, Any]]:
+     """Read JSONL file into a list of dicts."""
+     items = []
+     with open(path, "r", encoding="utf-8") as f:
+         for line in f:
+             line = line.strip()
+             if line:
+                 items.append(json.loads(line))
+     return items
+
+
+ def _display_messages(messages: list[dict[str, str]]) -> None:
+     """Display a list of chat messages in Streamlit chat format."""
+     import streamlit as st
+
+     for msg in messages:
+         role = msg.get("role", "user")
+         content = msg.get("content", "")
+
+         # Map roles to streamlit chat_message roles
+         if role == "system":
+             with st.chat_message("assistant", avatar="⚙️"):
+                 st.markdown("**System**")
+                 st.text(content)
+         elif role == "assistant":
+             with st.chat_message("assistant"):
+                 st.text(content)
+         else:  # user or other
+             with st.chat_message("user"):
+                 st.text(content)
+
+
+ def _display_answer(answer: Any, label: str | None = None) -> None:
+     """Display the answer, handling different types."""
+     import streamlit as st
+
+     if label:
+         st.markdown(f"**{label}**")
+
+     if isinstance(answer, dict):
+         # For NextToken questions, answer is {token: probability}
+         # Sort by probability descending
+         sorted_items = sorted(answer.items(), key=lambda x: -x[1] if isinstance(x[1], (int, float)) else 0)
+         # Display as a table-like format
+         for token, prob in sorted_items[:20]:  # Show top 20
+             if isinstance(prob, float):
+                 st.text(f" {token!r}: {prob:.4f}")
+             else:
+                 st.text(f" {token!r}: {prob}")
+     elif isinstance(answer, str):
+         st.text(answer)
+     else:
+         st.text(str(answer))
+
+
+ def _display_metadata(row: dict[str, Any], exclude_keys: set[str]) -> None:
+     """Display metadata columns."""
+     import streamlit as st
+
+     metadata = {k: v for k, v in row.items() if k not in exclude_keys}
+     if metadata:
+         with st.expander("Metadata", expanded=False):
+             for key, value in metadata.items():
+                 if isinstance(value, (dict, list)):
+                     st.markdown(f"**{key}:**")
+                     st.json(value)
+                 else:
+                     st.markdown(f"**{key}:** {value}")
+
+
+ def _search_items(items: list[dict[str, Any]], query: str) -> list[dict[str, Any]]:
+     """Filter items by search query.
+
+     Supports:
+     - Regular search: "foo" - includes items containing "foo"
+     - Negative search: "-foo" - excludes items containing "foo"
+     - Combined: "foo -bar" - items with "foo" but not "bar"
+     """
+     if not query:
+         return items
+
+     # Parse query into positive and negative terms
+     terms = query.split()
+     positive_terms = []
+     negative_terms = []
+
+     for term in terms:
+         if term.startswith("-") and len(term) > 1:
+             negative_terms.append(term[1:].lower())
+         else:
+             positive_terms.append(term.lower())
+
+     results = []
+
+     for item in items:
+         # Build searchable text from item
+         api_kwargs = item.get("api_kwargs", {})
+         messages = api_kwargs.get("messages", []) if isinstance(api_kwargs, dict) else []
+         messages_text = " ".join(m.get("content", "") for m in messages)
+
+         answer = item.get("answer", "")
+         answer_text = str(answer) if not isinstance(answer, str) else answer
+
+         all_text = messages_text + " " + answer_text
+         all_text += " " + " ".join(str(v) for v in item.values() if isinstance(v, str))
+         all_text_lower = all_text.lower()
+
+         # Check positive terms (all must match)
+         if positive_terms and not all(term in all_text_lower for term in positive_terms):
+             continue
+
+         # Check negative terms (none must match)
+         if any(term in all_text_lower for term in negative_terms):
+             continue
+
+         results.append(item)
+
+     return results
+
+
+ def _streamlit_main():
+     """Main Streamlit app."""
+     import streamlit as st
+
+     st.set_page_config(
+         page_title="llmcomp Viewer",
+         page_icon="🔬",
+         layout="wide",
+     )
+
+     st.title("🔬 llmcomp Viewer")
+
+     # Get data path
+     data_path = _get_data_path()
+     if data_path is None or not os.path.exists(data_path):
+         st.error("No data file provided or file not found.")
+         st.info("Use `Question.view(df)` to launch the viewer with data.")
+         return
+
+     # Load data (cache in session state)
+     cache_key = f"llmcomp_data_{data_path}"
+     if cache_key not in st.session_state:
+         st.session_state[cache_key] = _read_jsonl(data_path)
+
+     items = st.session_state[cache_key]
+
+     if not items:
+         st.warning("No data to display.")
+         return
+
+     # Get sortable columns (numeric or string, exclude complex types)
+     sortable_columns = ["(none)"]
+     if items:
+         for key, value in items[0].items():
+             if key not in ("api_kwargs",) and isinstance(value, (int, float, str, type(None))):
+                 sortable_columns.append(key)
+
+     # Initialize sort settings from command line args
+     initial_sort_by, initial_sort_asc = _get_initial_sort()
+     if "sort_by" not in st.session_state:
+         st.session_state.sort_by = initial_sort_by if initial_sort_by in sortable_columns else "(none)"
+         st.session_state.sort_ascending = initial_sort_asc
+
+     # Initialize view index
+     if "view_idx" not in st.session_state:
+         st.session_state.view_idx = 0
+
+     # Initialize secondary sort
+     if "sort_by_2" not in st.session_state:
+         st.session_state.sort_by_2 = "(none)"
+         st.session_state.sort_ascending_2 = True
+
+     # Search and sort controls
+     col_search, col_sort, col_order = st.columns([3, 2, 1])
+
+     with col_search:
+         query = st.text_input("🔍 Search", placeholder="Filter... (use -term to exclude)")
+
+     with col_sort:
+         sort_by = st.selectbox(
+             "Sort by",
+             options=sortable_columns,
+             index=sortable_columns.index(st.session_state.sort_by) if st.session_state.sort_by in sortable_columns else 0,
+             key="sort_by_select",
+         )
+         if sort_by != st.session_state.sort_by:
+             st.session_state.sort_by = sort_by
+             st.session_state.view_idx = 0  # Reset to first item when sort changes
+
+     with col_order:
+         st.markdown("<br>", unsafe_allow_html=True)  # Align checkbox with selectbox
+         sort_ascending = st.checkbox("Asc", value=st.session_state.sort_ascending, key="sort_asc_check")
+         if sort_ascending != st.session_state.sort_ascending:
+             st.session_state.sort_ascending = sort_ascending
+             st.session_state.view_idx = 0
+
+     # Secondary sort (only show if primary sort is selected)
+     if st.session_state.sort_by and st.session_state.sort_by != "(none)":
+         col_spacer, col_sort2, col_order2 = st.columns([3, 2, 1])
+         with col_sort2:
+             sort_by_2 = st.selectbox(
+                 "Then by",
+                 options=sortable_columns,
+                 index=sortable_columns.index(st.session_state.sort_by_2) if st.session_state.sort_by_2 in sortable_columns else 0,
+                 key="sort_by_select_2",
+             )
+             if sort_by_2 != st.session_state.sort_by_2:
+                 st.session_state.sort_by_2 = sort_by_2
+                 st.session_state.view_idx = 0
+         with col_order2:
+             st.markdown("<br>", unsafe_allow_html=True)  # Align checkbox with selectbox
+             sort_ascending_2 = st.checkbox("Asc", value=st.session_state.sort_ascending_2, key="sort_asc_check_2")
+             if sort_ascending_2 != st.session_state.sort_ascending_2:
+                 st.session_state.sort_ascending_2 = sort_ascending_2
+                 st.session_state.view_idx = 0
+
+     # Apply search
+     filtered_items = _search_items(items, query)
+
+     # Apply sorting (stable sort - secondary first, then primary)
+     if st.session_state.sort_by and st.session_state.sort_by != "(none)" and filtered_items:
+         sort_key_2 = st.session_state.sort_by_2 if st.session_state.sort_by_2 != "(none)" else None
+
+         # Secondary sort first (stable sort preserves this ordering within primary groups)
+         if sort_key_2:
+             filtered_items = sorted(
+                 filtered_items,
+                 key=lambda x: (x.get(sort_key_2) is None, x.get(sort_key_2)),
+                 reverse=not st.session_state.sort_ascending_2,
+             )
+
+         # Primary sort
+         sort_key = st.session_state.sort_by
+         filtered_items = sorted(
+             filtered_items,
+             key=lambda x: (x.get(sort_key) is None, x.get(sort_key)),
+             reverse=not st.session_state.sort_ascending,
+         )
+
+     if not filtered_items:
+         st.warning(f"No results found for '{query}'")
+         return
+
+     # Clamp view index to valid range
+     max_idx = len(filtered_items) - 1
+     st.session_state.view_idx = max(0, min(st.session_state.view_idx, max_idx))
+
+     # Navigation
+     col1, col2, col3, col4 = st.columns([1, 1, 2, 2])
+
+     with col1:
+         if st.button("⬅️ Prev", use_container_width=True):
+             st.session_state.view_idx = max(0, st.session_state.view_idx - 1)
+             st.rerun()
+
+     with col2:
+         if st.button("Next ➡️", use_container_width=True):
+             st.session_state.view_idx = min(max_idx, st.session_state.view_idx + 1)
+             st.rerun()
+
+     with col3:
+         # Jump to specific index
+         new_idx = st.number_input(
+             "Go to",
+             min_value=1,
+             max_value=len(filtered_items),
+             value=st.session_state.view_idx + 1,
+             step=1,
+             label_visibility="collapsed",
+         )
+         if new_idx - 1 != st.session_state.view_idx:
+             st.session_state.view_idx = new_idx - 1
+             st.rerun()
+
+     with col4:
+         st.markdown(f"**{st.session_state.view_idx + 1}** of **{len(filtered_items)}**")
+         if query:
+             st.caption(f"({len(items)} total)")
+
+     st.divider()
+
+     # Display current item
+     current = filtered_items[st.session_state.view_idx]
+
+     # Main content in two columns
+     left_col, right_col = st.columns([1, 2])
+
+     with left_col:
+         st.subheader("💬 Messages")
+         api_kwargs = current.get("api_kwargs", {})
+         messages = api_kwargs.get("messages", []) if isinstance(api_kwargs, dict) else []
+         if messages:
+             _display_messages(messages)
+         else:
+             st.info("No messages")
+
+     with right_col:
+         model_name = current.get("model", "Response")
+         st.subheader(f"🤖 {model_name}")
+         answer = current.get("answer")
+         if answer is not None:
+             _display_answer(answer, label=None)
+         else:
+             st.info("No answer")
+
+     # Display judge columns if present
+     judge_columns = [k for k in current.keys() if not k.startswith("_") and k not in {
+         "api_kwargs", "answer", "question", "model", "group", "paraphrase_ix", "raw_answer"
+     } and not k.endswith("_question") and not k.endswith("_raw_answer")]
+
+     if judge_columns:
+         st.markdown("---")
+         for judge_col in judge_columns:
+             value = current[judge_col]
+             if isinstance(value, float):
+                 st.markdown(f"**{judge_col}:** {value:.2f}")
+             else:
+                 st.markdown(f"**{judge_col}:** {value}")
+
+     # Metadata at the bottom
+     st.divider()
+     # Show api_kwargs in metadata, but without messages (already displayed above)
+     current_for_metadata = current.copy()
+     if "api_kwargs" in current_for_metadata and isinstance(current_for_metadata["api_kwargs"], dict):
+         api_kwargs_without_messages = {k: v for k, v in current_for_metadata["api_kwargs"].items() if k != "messages"}
+         current_for_metadata["api_kwargs"] = api_kwargs_without_messages
+     exclude_keys = {"answer", "question", "paraphrase_ix"} | set(judge_columns)
+     _display_metadata(current_for_metadata, exclude_keys)
+
+     # Keyboard navigation hint
+     st.caption("💡 Tip: Use the navigation buttons or enter a number to jump to a specific row.")
+
+
+ # Entry point when run by Streamlit
+ if __name__ == "__main__":
+     _streamlit_main()
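
The file above ships as `llmcomp/question/viewer.py` (see the RECORD at the end of this diff). A minimal sketch of driving it directly; `render_dataframe` and its required columns come from the code above, while the toy DataFrame and model name here are purely illustrative:

```python
# Illustrative driver for the new viewer module. The two required columns
# ('api_kwargs' and 'answer') are validated by render_dataframe; extra
# columns such as 'model' show up as metadata and sort keys.
import pandas as pd

from llmcomp.question.viewer import render_dataframe

df = pd.DataFrame([
    {
        "api_kwargs": {"messages": [{"role": "user", "content": "Say hi"}]},
        "answer": "hi",
        "model": "hypothetical-model",  # placeholder name, not from this diff
    },
])

# Writes df to a temp JSONL file, spawns `python -m streamlit run viewer.py`
# in a subprocess, opens the browser, and blocks until Ctrl+C.
render_dataframe(df, sort_by="model", port=8501)
```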
llmcomp/runner/runner.py CHANGED
@@ -51,12 +51,15 @@ class Runner:
          prepared = ModelAdapter.prepare(params, self.model)
          return {"timeout": Config.timeout, **prepared}

-     def get_text(self, params: dict) -> str:
+     def get_text(self, params: dict) -> tuple[str, dict]:
          """Get a text completion from the model.

          Args:
              params: Dictionary of parameters for the API.
                  Must include 'messages'. Other common keys: 'temperature', 'max_tokens'.
+
+         Returns:
+             Tuple of (content, prepared_kwargs) where prepared_kwargs is what was sent to the API.
          """
          prepared = self._prepare_for_model(params)
          completion = openai_chat_completion(client=self.client, **prepared)
@@ -72,8 +75,8 @@
                  # refusal="I'm sorry, I'm unable to fulfill that request.",
                  # ...))])
                  warnings.warn(f"API sent None as content. Returning empty string.\n{completion}", stacklevel=2)
-                 return ""
-             return content
+                 return "", prepared
+             return content, prepared
          except Exception:
              warnings.warn(f"Unexpected error.\n{completion}")
              raise
@@ -84,7 +87,7 @@
          *,
          num_samples: int = 1,
          convert_to_probs: bool = True,
-     ) -> dict:
+     ) -> tuple[dict, dict]:
          """Get probability distribution of the next token, optionally averaged over multiple samples.

          Args:
@@ -92,22 +95,26 @@
                  Must include 'messages'. Other common keys: 'top_logprobs', 'logit_bias'.
              num_samples: Number of samples to average over. Default: 1.
              convert_to_probs: If True, convert logprobs to probabilities. Default: True.
+
+         Returns:
+             Tuple of (probs_dict, prepared_kwargs) where prepared_kwargs is what was sent to the API.
          """
          probs = {}
+         prepared = None
          for _ in range(num_samples):
-             new_probs = self.single_token_probs_one_sample(params, convert_to_probs=convert_to_probs)
+             new_probs, prepared = self.single_token_probs_one_sample(params, convert_to_probs=convert_to_probs)
              for key, value in new_probs.items():
                  probs[key] = probs.get(key, 0) + value
          result = {key: value / num_samples for key, value in probs.items()}
          result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
-         return result
+         return result, prepared

      def single_token_probs_one_sample(
          self,
          params: dict,
          *,
          convert_to_probs: bool = True,
-     ) -> dict:
+     ) -> tuple[dict, dict]:
          """Get probability distribution of the next token (single sample).

          Args:
@@ -115,6 +122,9 @@
                  Must include 'messages'. Other common keys: 'top_logprobs', 'logit_bias'.
              convert_to_probs: If True, convert logprobs to probabilities. Default: True.

+         Returns:
+             Tuple of (probs_dict, prepared_kwargs) where prepared_kwargs is what was sent to the API.
+
          Note: This function forces max_tokens=1, temperature=0, logprobs=True.
          """
          # Build complete params with defaults and forced params
@@ -138,7 +148,7 @@
          except IndexError:
              # This should not happen according to the API docs. But it sometimes does.
              print(NO_LOGPROBS_WARNING.format(model=self.model, completion=completion))
-             return {}
+             return {}, prepared

          # Check for duplicate tokens - this shouldn't happen with OpenAI but might with other providers
          tokens = [el.token for el in logprobs]
@@ -153,7 +163,7 @@
          for el in logprobs:
              result[el.token] = math.exp(el.logprob) if convert_to_probs else el.logprob

-         return result
+         return result, prepared

      def get_many(
          self,
@@ -173,8 +183,8 @@
              {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
              {"params": {"messages": [{"role": "user", "content": "Bye"}], "temperature": 0.7}},
          ]
-         for in_, out in runner.get_many(runner.get_text, kwargs_list):
-             print(in_, "->", out)
+         for in_, (out, prepared_kwargs) in runner.get_many(runner.get_text, kwargs_list):
+             print(in_, "->", out, prepared_kwargs)

          or

@@ -182,14 +192,14 @@
              {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
              {"params": {"messages": [{"role": "user", "content": "Bye"}]}},
          ]
-         for in_, out in runner.get_many(runner.single_token_probs, kwargs_list):
-             print(in_, "->", out)
+         for in_, (out, prepared_kwargs) in runner.get_many(runner.single_token_probs, kwargs_list):
+             print(in_, "->", out, prepared_kwargs)

          (FUNC that is a different callable should also work)

          This function returns a generator that yields pairs (input, output),
-         where input is an element from KWARGS_LIST and output is the thing returned by
-         FUNC for this input.
+         where input is an element from KWARGS_LIST and output is the tuple (result, prepared_kwargs)
+         returned by FUNC. prepared_kwargs contains the actual parameters sent to the API.

          Dictionaries in KWARGS_LIST might include optional keys starting with underscore,
          they are just ignored, but they are returned in the first element of the pair, so that's useful
@@ -230,7 +240,7 @@
                      f"Model: {self.model}, function: {func.__name__}{msg_info}. "
                      f"Error: {type(e).__name__}: {e}"
                  )
-                 result = None
+                 result = (None, {})
              return kwargs, result

          futures = [executor.submit(get_data, kwargs) for kwargs in kwargs_list]
@@ -251,7 +261,7 @@
          params: dict,
          *,
          num_samples: int,
-     ) -> dict:
+     ) -> tuple[dict, dict]:
          """Sample answers NUM_SAMPLES times. Returns probabilities of answers.

          Args:
@@ -259,6 +269,9 @@
                  Must include 'messages'. Other common keys: 'max_tokens', 'temperature'.
              num_samples: Number of samples to collect.

+         Returns:
+             Tuple of (probs_dict, prepared_kwargs) where prepared_kwargs is what was sent to the API.
+
          Works only if the API supports `n` parameter.

          Usecases:
@@ -268,6 +281,7 @@
              for Runner.single_token_probs.
          """
          cnts = defaultdict(int)
+         prepared = None
          for i in range(((num_samples - 1) // 128) + 1):
              n = min(128, num_samples - i * 128)
              # Build complete params with forced param
@@ -285,4 +299,4 @@
          )
          result = {key: val / num_samples for key, val in cnts.items()}
          result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
-         return result
+         return result, prepared
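
The common thread in the hunks above: every `Runner` accessor now returns a `(result, prepared_kwargs)` tuple instead of the bare result, and `get_many` propagates the pair (yielding `(None, {})` as the result on error). A sketch of migrating a 1.2.x call site; the `Runner` constructor is not part of this diff, so its arguments below are a guess:

```python
from llmcomp.runner.runner import Runner  # module path per the RECORD below

runner = Runner("some-model")  # constructor args are assumed, not shown in this diff
params = {"messages": [{"role": "user", "content": "2 + 2 = ?"}]}

# 1.2.4: text = runner.get_text(params)
# 1.3.0: the kwargs actually sent to the API ride along with the result.
text, prepared = runner.get_text(params)

# Same unpacking for the probability helpers:
probs, prepared = runner.single_token_probs(params, num_samples=4)

# get_many now yields (input_kwargs, (result, prepared_kwargs)).
for in_, (out, sent_kwargs) in runner.get_many(runner.get_text, [{"params": params}]):
    print(in_, "->", out)
```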
llmcomp-1.3.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmcomp
- Version: 1.2.4
+ Version: 1.3.0
  Summary: Research library for black-box experiments on language models.
  Project-URL: Homepage, https://github.com/johny-b/llmcomp
  Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -15,6 +15,7 @@ Requires-Dist: openai>=1.0.0
  Requires-Dist: pandas
  Requires-Dist: pyyaml
  Requires-Dist: requests
+ Requires-Dist: streamlit>=1.20.0
  Requires-Dist: tqdm
  Description-Content-Type: text/markdown

@@ -49,9 +50,9 @@ question = Question.create(
      samples_per_paraphrase=100,
      temperature=1,
  )
- question.plot(MODELS, min_fraction=0.03)
- df = question.df(MODELS)
- print(df.head(1).iloc[0])
+ df = question.df(MODELS)  # Dataframe with the results
+ question.plot(MODELS, min_fraction=0.03)  # Aggregated bar chart
+ question.view(MODELS)  # Interactive browser for individual responses
  ```

  ## Main features
@@ -61,6 +62,7 @@ print(df.head(1).iloc[0])
  * **Parallel requests** - configurable concurrency across models
  * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
  * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+ * **Built-in viewer** - browse answers interactively with `question.view(MODELS)`
  * **Extensible** - highly configurable as long as your goal is comparing LLMs

  ## Cookbook
@@ -148,7 +150,7 @@ You can send more parallel requests by increasing `Config.max_workers`.
  Suppose you have many prompts you want to send to models. There are three options:
  1. Have a separate Question object for each prompt and execute them in a loop
  2. Have a separate Question object for each prompt and execute them in parallel
- 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
+ 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix` or `question` columns)

  Option 1 will be slow - the more quick questions you have, the worse.
  Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
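
On the README hunk above: 1.3.0 no longer lists the `messages` column as a split key, so option 3 reduces to grouping on the two remaining columns. A hedged sketch, assuming `question` and `MODELS` from the README quick-start are in scope:

```python
# Option 3 from the README: one Question with many paraphrases, split afterwards.
df = question.df(MODELS)
for ix, sub_df in df.groupby("paraphrase_ix"):  # or df.groupby("question")
    print(ix, len(sub_df))
```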
llmcomp-1.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,21 @@
+ llmcomp/__init__.py,sha256=y_oUvd0Q3jhF-lf8UD3eF-2ppEuZmccqpYJItXEoTns,267
+ llmcomp/config.py,sha256=xADWhqsQphJZQvf7WemWencmWuBnvTN_KeJrjWfnmHY,8942
+ llmcomp/default_adapters.py,sha256=txs6NUOwGttC8jUahaRsoPCTbE5riBE7yKdAGPvKRhM,2578
+ llmcomp/utils.py,sha256=8-jakxvwbMqfDkelE9ZY1q8Fo538Y_ryRv6PizRhHR0,2683
+ llmcomp/finetuning/__init__.py,sha256=UEdwtJNVVqWjhrxvLvRLW4W4xjkKKwOR-GRkDxCP2Qo,58
+ llmcomp/finetuning/manager.py,sha256=6G0CW3NWK8vdfBoAjH0HATx_g16wwq5oU0mlHs-q28o,19083
+ llmcomp/finetuning/update_jobs.py,sha256=blsHzg_ViTa2hBJtWCqR5onttehTtmXn3vmCTNd_hJw,980
+ llmcomp/finetuning/validation.py,sha256=v4FoFw8woo5No9A01ktuALsMsXdgb3N2rS58ttBUmHY,14047
+ llmcomp/question/judge.py,sha256=tNY94AHqncrbl2gf-g_Y3lepJ_HrahJRH-WgQyokegk,6568
+ llmcomp/question/plots.py,sha256=Izp9jxWzQDgRgycgM7_-lhIkqx7yr_WBQedUcUcpaFA,11164
+ llmcomp/question/question.py,sha256=cLOVp8ZD0O-Y1UI8RVpi6ZD3ulRtY8PeFwEgeAnLzvs,41100
+ llmcomp/question/result.py,sha256=psc9tQpwEEhS4LGxaI7GhqCE1CSAmCo39yrKap9cLjA,8216
+ llmcomp/question/viewer.py,sha256=hMHWr5cONWXF37ybXJTI_kudSz3xaA0shkQFRoNRZmI,16380
+ llmcomp/runner/chat_completion.py,sha256=iDiWE0N0_MYfggD-ouyfUPyaADt7602K5Wo16a7JJo4,967
+ llmcomp/runner/model_adapter.py,sha256=Dua98E7aBVrCaZ2Ep44vl164oFkpH1P78YqImQkns4U,3406
+ llmcomp/runner/runner.py,sha256=B8p9b3At9JWWIW-mlADwyelJKqHxW4CIorSWyaD3gHM,12294
+ llmcomp-1.3.0.dist-info/METADATA,sha256=CWC5sdrfuvQWWFOwjj7RJIzk0Rgb3EKCRPA75D5Wu4U,12963
+ llmcomp-1.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ llmcomp-1.3.0.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
+ llmcomp-1.3.0.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
+ llmcomp-1.3.0.dist-info/RECORD,,