PyPI - inspect-ai - Versions diffs - 0.3.100__py3-none-any.whl → 0.3.102__py3-none-any.whl - Mend

inspect-ai 0.3.100__py3-none-any.whl → 0.3.102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

inspect_ai/_cli/eval.py +1 -1
inspect_ai/_eval/evalset.py +2 -2
inspect_ai/_view/www/dist/assets/index.css +44 -12
inspect_ai/_view/www/dist/assets/index.js +1499 -1467
inspect_ai/_view/www/package.json +4 -4
inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts +4 -4
inspect_ai/_view/www/src/app/routing/navigationHooks.ts +22 -25
inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +17 -5
inspect_ai/_view/www/src/state/hooks.ts +1 -1
inspect_ai/_view/www/yarn.lock +21 -27
inspect_ai/analysis/beta/__init__.py +2 -0
inspect_ai/dataset/_sources/csv.py +2 -6
inspect_ai/dataset/_sources/hf.py +2 -6
inspect_ai/dataset/_sources/json.py +2 -6
inspect_ai/dataset/_util.py +23 -0
inspect_ai/model/_openai.py +4 -0
inspect_ai/model/_openai_responses.py +11 -6
inspect_ai/model/_openai_web_search.py +9 -2
inspect_ai/model/_providers/openai.py +11 -3
inspect_ai/model/_providers/openai_responses.py +5 -1
inspect_ai/scorer/_reducer/reducer.py +1 -1
inspect_ai/tool/_tools/_web_search/_google.py +28 -11
inspect_ai/tool/_tools/_web_search/_tavily.py +11 -1
{inspect_ai-0.3.100.dist-info → inspect_ai-0.3.102.dist-info}/METADATA +1 -1
{inspect_ai-0.3.100.dist-info → inspect_ai-0.3.102.dist-info}/RECORD +29 -29
{inspect_ai-0.3.100.dist-info → inspect_ai-0.3.102.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.100.dist-info → inspect_ai-0.3.102.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.100.dist-info → inspect_ai-0.3.102.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.100.dist-info → inspect_ai-0.3.102.dist-info}/top_level.txt +0 -0

inspect_ai/_view/www/package.json CHANGED Viewed

@@ -67,8 +67,8 @@
     "@popperjs/core": "^2.11.8",
     "ansi-output": "^0.0.9",
     "asciinema-player": "^3.9.0",
-    "bootstrap": "^5.3.3",
-    "bootstrap-icons": "^1.11.3",
+    "bootstrap": "^5.3.6",
+    "bootstrap-icons": "^1.12.1",
     "clipboard": "^2.0.11",
     "clsx": "^2.1.1",
     "codemirror": "^6.0.1",
@@ -89,8 +89,8 @@
     "react": "^19.0.0",
     "react-dom": "^19.0.0",
     "react-popper": "^2.3.0",
-    "react-router-dom": "^7.5.3",
-    "react-virtuoso": "^4.12.6",
+    "react-router-dom": "^7.6.0",
+    "react-virtuoso": "^4.12.7",
     "zustand": "^5.0.5",
     "use-resize-observer": "^9.1.0"
   }

inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts CHANGED Viewed

@@ -47,7 +47,7 @@ const noGrouping = (
     const itemCount = counter.item();
     return [
       {
-        label: `Sample ${itemCount}`,
+        label: `Sample ${sample.id}`,
         number: itemCount,
         index: index,
         data: sample,
@@ -107,10 +107,10 @@ const groupBySample = (
     if (sample.id !== lastId) {
       counter.incrementGroup();
       results.push({
-        label: `Sample ${itemCount}`,
+        label: `Sample ${sample.id}`,
         number: counter.group(),
         index: index,
-        data: `Sample ${counter.group()}`,
+        data: `Sample ${sample.id}`,
         type: "separator",
       } as SeparatorListItem);
       counter.resetItem();
@@ -175,7 +175,7 @@ const groupByEpoch = (
     // Compute the index within the epoch
     counter.incrementItem();
     results.push({
-      label: `Sample ${counter.item()} (Epoch ${counter.group()})`,
+      label: `Sample ${sample.id} (Epoch ${sample.epoch})`,
       number: counter.item(),
       index: index,
       data: sample,

inspect_ai/_view/www/src/app/routing/navigationHooks.ts CHANGED Viewed

@@ -130,29 +130,26 @@ export const useSampleNavigation = () => {
   // Navigate to a specific sample with index
   const showSample = useCallback(
-    (index: number, specifiedSampleTabId?: string) => {
-      if (sampleSummaries && index >= 0 && index < sampleSummaries.length) {
-        const sample = sampleSummaries[index];
-        const resolvedPath = resolveLogPath();
-        if (resolvedPath) {
-          // Update internal state
-          selectSample(index);
-          setShowingSampleDialog(true);
-          // Use specified sampleTabId if provided, otherwise use current sampleTabId from URL params
-          const currentSampleTabId = specifiedSampleTabId || sampleTabId;
-          const url = sampleUrl(
-            resolvedPath,
-            sample.id,
-            sample.epoch,
-            currentSampleTabId,
-          );
-          // Navigate to the sample URL
-          navigate(url);
-        }
+    (
+      index: number,
+      id: string | number,
+      epoch: number,
+      specifiedSampleTabId?: string,
+    ) => {
+      const resolvedPath = resolveLogPath();
+      if (resolvedPath) {
+        // Update internal state
+        selectSample(index);
+        setShowingSampleDialog(true);
+        // Use specified sampleTabId if provided, otherwise use current sampleTabId from URL params
+        const currentSampleTabId = specifiedSampleTabId || sampleTabId;
+        const url = sampleUrl(resolvedPath, id, epoch, currentSampleTabId);
+        // Navigate to the sample URL
+        navigate(url);
       }
     },
     [
@@ -171,7 +168,7 @@ export const useSampleNavigation = () => {
     const itemsCount = sampleSummaries.length;
     const next = Math.min(selectedSampleIndex + 1, itemsCount - 1);
     if (next > -1) {
-      showSample(next, sampleTabId);
+      selectSample(next);
     }
   }, [selectedSampleIndex, showSample, sampleTabId]);
@@ -179,7 +176,7 @@ export const useSampleNavigation = () => {
   const previousSample = useCallback(() => {
     const prev = selectedSampleIndex - 1;
     if (prev > -1) {
-      showSample(prev, sampleTabId);
+      selectSample(prev);
     }
   }, [selectedSampleIndex, showSample, sampleTabId]);

inspect_ai/_view/www/src/app/samples/list/SampleList.tsx CHANGED Viewed

@@ -113,11 +113,19 @@ export const SampleList: FC<SampleListProps> = memo((props) => {
           e.preventDefault();
           e.stopPropagation();
           break;
-        case "Enter":
-          sampleNavigation.showSample(selectedSampleIndex);
-          e.preventDefault();
-          e.stopPropagation();
+        case "Enter": {
+          const item = items[selectedSampleIndex];
+          if (item.type === "sample") {
+            sampleNavigation.showSample(
+              item.index,
+              item.data.id,
+              item.data.epoch,
+            );
+            e.preventDefault();
+            e.stopPropagation();
+          }
           break;
+        }
       }
     },
     [
@@ -150,7 +158,11 @@ export const SampleList: FC<SampleListProps> = memo((props) => {
               item.data.epoch,
             )}
             showSample={() => {
-              sampleNavigation.showSample(item.index);
+              sampleNavigation.showSample(
+                item.index,
+                item.data.id,
+                item.data.epoch,
+              );
             }}
           />
         );

inspect_ai/_view/www/src/state/hooks.ts CHANGED Viewed

@@ -277,7 +277,7 @@ export const useCollapseSampleEvent = (
   const collapseEvent = useStore((state) => state.sampleActions.collapseEvent);
   return useMemo(() => {
-    const isCollapsed = collapsed !== null && collapsed[scope][id] === true;
+    const isCollapsed = collapsed !== null && collapsed[scope]?.[id] === true;
     const set = (value: boolean) => {
       log.debug("Set collapsed", id, value);
       collapseEvent(scope, id, value);

inspect_ai/_view/www/yarn.lock CHANGED Viewed

@@ -2339,15 +2339,15 @@ balanced-match@^1.0.0:
   resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee"
   integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==
-bootstrap-icons@^1.11.3:
-  version "1.11.3"
-  resolved "https://registry.yarnpkg.com/bootstrap-icons/-/bootstrap-icons-1.11.3.tgz#03f9cb754ec005c52f9ee616e2e84a82cab3084b"
-  integrity sha512-+3lpHrCw/it2/7lBL15VR0HEumaBss0+f/Lb6ZvHISn1mlK83jjFpooTLsMWbIjJMDjDjOExMsTxnXSIT4k4ww==
+bootstrap-icons@^1.12.1:
+  version "1.13.1"
+  resolved "https://registry.yarnpkg.com/bootstrap-icons/-/bootstrap-icons-1.13.1.tgz#0aad3f5b55b67402990e729ce3883416f9cef6c5"
+  integrity sha512-ijombt4v6bv5CLeXvRWKy7CuM3TRTuPEuGaGKvTV5cz65rQSY8RQ2JcHt6b90cBBAC7s8fsf2EkQDldzCoXUjw==
-bootstrap@^5.3.3:
-  version "5.3.5"
-  resolved "https://registry.yarnpkg.com/bootstrap/-/bootstrap-5.3.5.tgz#be42cfe0d580e97ee1abb7d38ce94f5c393c9bb6"
-  integrity sha512-ct1CHKtiobRimyGzmsSldEtM03E8fcEX4Tb3dGXz1V8faRwM50+vfHwTzOxB3IlKO7m+9vTH3s/3C6T2EAPeTA==
+bootstrap@^5.3.6:
+  version "5.3.6"
+  resolved "https://registry.yarnpkg.com/bootstrap/-/bootstrap-5.3.6.tgz#fbd91ebaff093f5b191a1c01a8c866d24f9fa6e1"
+  integrity sha512-jX0GAcRzvdwISuvArXn3m7KZscWWFAf1MKBcnzaN02qWMb3jpMoUX4/qgeiGzqyIb4ojulRzs89UCUmGcFSzTA==
 brace-expansion@^1.1.7:
   version "1.1.11"
@@ -4478,21 +4478,20 @@ react-refresh@^0.17.0:
   resolved "https://registry.yarnpkg.com/react-refresh/-/react-refresh-0.17.0.tgz#b7e579c3657f23d04eccbe4ad2e58a8ed51e7e53"
   integrity sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==
-react-router-dom@^7.5.3:
-  version "7.5.3"
-  resolved "https://registry.yarnpkg.com/react-router-dom/-/react-router-dom-7.5.3.tgz#496e9f6d90f731703c7772668b41747028e0a2d5"
-  integrity sha512-cK0jSaTyW4jV9SRKAItMIQfWZ/D6WEZafgHuuCb9g+SjhLolY78qc+De4w/Cz9ybjvLzShAmaIMEXt8iF1Cm+A==
+react-router-dom@^7.6.0:
+  version "7.6.1"
+  resolved "https://registry.yarnpkg.com/react-router-dom/-/react-router-dom-7.6.1.tgz#263c9102e96b58d336258a51d68080b40c28f526"
+  integrity sha512-vxU7ei//UfPYQ3iZvHuO1D/5fX3/JOqhNTbRR+WjSBWxf9bIvpWK+ftjmdfJHzPOuMQKe2fiEdG+dZX6E8uUpA==
   dependencies:
-    react-router "7.5.3"
+    react-router "7.6.1"
-react-router@7.5.3:
-  version "7.5.3"
-  resolved "https://registry.yarnpkg.com/react-router/-/react-router-7.5.3.tgz#9e5420832af8c3690740c1797d4fa54613fea06d"
-  integrity sha512-3iUDM4/fZCQ89SXlDa+Ph3MevBrozBAI655OAfWQlTm9nBR0IKlrmNwFow5lPHttbwvITZfkeeeZFP6zt3F7pw==
+react-router@7.6.1:
+  version "7.6.1"
+  resolved "https://registry.yarnpkg.com/react-router/-/react-router-7.6.1.tgz#a54f9b980b94594bcb4b7f26611612a9f6e17461"
+  integrity sha512-hPJXXxHJZEsPFNVbtATH7+MMX43UDeOauz+EAU4cgqTn7ojdI9qQORqS8Z0qmDlL1TclO/6jLRYUEtbWidtdHQ==
   dependencies:
     cookie "^1.0.1"
     set-cookie-parser "^2.6.0"
-    turbo-stream "2.4.0"
 react-transition-group@^4.4.5:
   version "4.4.5"
@@ -4504,10 +4503,10 @@ react-transition-group@^4.4.5:
     loose-envify "^1.4.0"
     prop-types "^15.6.2"
-react-virtuoso@^4.12.6:
-  version "4.12.6"
-  resolved "https://registry.yarnpkg.com/react-virtuoso/-/react-virtuoso-4.12.6.tgz#20fe374d43cce3c9821e29f4cc4d050596d06d01"
-  integrity sha512-bfvS6aCL1ehXmq39KRiz/vxznGUbtA27I5I24TYCe1DhMf84O3aVNCIwrSjYQjkJGJGzY46ihdN8WkYlemuhMQ==
+react-virtuoso@^4.12.7:
+  version "4.12.8"
+  resolved "https://registry.yarnpkg.com/react-virtuoso/-/react-virtuoso-4.12.8.tgz#db1dbba617f91c1dcd760aa90e09ef991e65a356"
+  integrity sha512-NMMKfDBr/+xZZqCQF3tN1SZsh6FwOJkYgThlfnsPLkaEhdyQo0EuWUzu3ix6qjnI7rYwJhMwRGoJBi+aiDfGsA==
 react@^19.0.0:
   version "19.1.0"
@@ -4922,11 +4921,6 @@ ts-jest@^29.3.2:
     type-fest "^4.39.1"
     yargs-parser "^21.1.1"
-turbo-stream@2.4.0:
-  version "2.4.0"
-  resolved "https://registry.yarnpkg.com/turbo-stream/-/turbo-stream-2.4.0.tgz#1e4fca6725e90fa14ac4adb782f2d3759a5695f0"
-  integrity sha512-FHncC10WpBd2eOmGwpmQsWLDoK4cqsA/UT/GqNoaKOQnT8uzhtCbg3EoUDMvqpOSAI0S26mr0rkjzbOO6S3v1g==
 type-check@^0.4.0, type-check@~0.4.0:
   version "0.4.0"
   resolved "https://registry.yarnpkg.com/type-check/-/type-check-0.4.0.tgz#07b8203bfa7056c0657050e3ccd2c37730bab8f1"

inspect_ai/analysis/beta/__init__.py CHANGED Viewed

@@ -7,6 +7,7 @@ from ._dataframe.evals.columns import (
     EvalColumn,
     EvalColumns,
     EvalConfig,
+    EvalDataset,
     EvalInfo,
     EvalModel,
     EvalResults,
@@ -41,6 +42,7 @@ __all__ = [
     "EvalModel",
     "EvalColumns",
     "EvalConfig",
+    "EvalDataset",
     "EvalResults",
     "EvalScores",
     "samples_df",

inspect_ai/dataset/_sources/csv.py CHANGED Viewed

@@ -14,7 +14,7 @@ from .._dataset import (
     MemoryDataset,
     RecordToSample,
 )
-from .._util import data_to_samples, record_to_sample_fn
+from .._util import data_to_samples, record_to_sample_fn, shuffle_choices_if_requested
 def csv_dataset(
@@ -88,11 +88,7 @@ def csv_dataset(
         if shuffle:
             dataset.shuffle(seed=seed)
-        # shuffle choices, if requested
-        if isinstance(shuffle_choices, int):
-            dataset.shuffle_choices(seed=shuffle_choices)
-        elif shuffle_choices is True:
-            dataset.shuffle_choices()
+        shuffle_choices_if_requested(dataset, shuffle_choices)
         # limit if requested
         if limit:

inspect_ai/dataset/_sources/hf.py CHANGED Viewed

@@ -16,7 +16,7 @@ from .._dataset import (
     MemoryDataset,
     RecordToSample,
 )
-from .._util import data_to_samples, record_to_sample_fn
+from .._util import data_to_samples, record_to_sample_fn, shuffle_choices_if_requested
 def hf_dataset(
@@ -125,10 +125,6 @@ def hf_dataset(
         location=path,
     )
-    # maybe shuffle the choices
-    if isinstance(shuffle_choices, int):
-        memory_dataset.shuffle_choices(seed=shuffle_choices)
-    elif shuffle_choices is True:
-        memory_dataset.shuffle_choices()
+    shuffle_choices_if_requested(memory_dataset, shuffle_choices)
     return memory_dataset

inspect_ai/dataset/_sources/json.py CHANGED Viewed

@@ -15,7 +15,7 @@ from .._dataset import (
     MemoryDataset,
     RecordToSample,
 )
-from .._util import data_to_samples, record_to_sample_fn
+from .._util import data_to_samples, record_to_sample_fn, shuffle_choices_if_requested
 from .util import resolve_sample_files
@@ -88,11 +88,7 @@ def json_dataset(
         if shuffle:
             dataset.shuffle(seed=seed)
-        # shuffle choices, if requested
-        if isinstance(shuffle_choices, int):
-            dataset.shuffle_choices(seed=shuffle_choices)
-        elif shuffle_choices is True:
-            dataset.shuffle_choices()
+        shuffle_choices_if_requested(dataset, shuffle_choices)
         # limit if requested
         if limit:

inspect_ai/dataset/_util.py CHANGED Viewed

@@ -13,6 +13,7 @@ from inspect_ai.model import (
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from ._dataset import (
+    Dataset,
     DatasetRecord,
     FieldSpec,
     RecordToSample,
@@ -225,3 +226,25 @@ def read_files(files: Any | None) -> dict[str, str] | None:
         raise ValueError(f"Unexpected type for 'files' field: {type(files)}")
     else:
         return None
+def shuffle_choices_if_requested(
+    dataset: Dataset, shuffle_choices: bool | int | None
+) -> None:
+    """
+    Shuffle the choices in the dataset if requested.
+    The `shuffle_choices` parameter passed to `json_dataset`, `csv_dataset`,
+    and `hf_dataset` can be a boolean, an integer, or `None` (default).
+    If it is a boolean, it will shuffle the choices if the value is `True`,
+    and do nothing if it is `False`.
+    If it is an integer, it will shuffle the choices using the integer as the seed.
+    """
+    # Note that `isinstance(x, int)` returns True if x is True or False,
+    # so we need to check for both explicitly
+    if shuffle_choices is True:
+        dataset.shuffle_choices()
+    elif shuffle_choices is False:
+        pass
+    elif isinstance(shuffle_choices, int):
+        dataset.shuffle_choices(seed=shuffle_choices)

inspect_ai/model/_openai.py CHANGED Viewed

@@ -98,6 +98,10 @@ def is_computer_use_preview(name: str) -> bool:
     return "computer-use-preview" in name
+def is_codex(name: str) -> bool:
+    return "codex" in name
 def is_gpt(name: str) -> bool:
     return "gpt" in name

inspect_ai/model/_openai_responses.py CHANGED Viewed

@@ -162,9 +162,9 @@ def openai_responses_tool_choice(
 def openai_responses_tools(
-    tools: list[ToolInfo], config: GenerateConfig
+    tools: list[ToolInfo], model_name: str, config: GenerateConfig
 ) -> list[ToolParam]:
-    return [_tool_param_for_tool_info(tool, config) for tool in tools]
+    return [_tool_param_for_tool_info(tool, model_name, config) for tool in tools]
 def openai_responses_chat_choices(
@@ -177,9 +177,11 @@ def openai_responses_chat_choices(
 def is_native_tool_configured(
-    tools: Sequence[ToolInfo], config: GenerateConfig
+    tools: Sequence[ToolInfo], model_name: str, config: GenerateConfig
 ) -> bool:
-    return any(_maybe_native_tool_param(tool, config) is not None for tool in tools)
+    return any(
+        _maybe_native_tool_param(tool, model_name, config) is not None for tool in tools
+    )
 # The next two function perform transformations between OpenAI types an Inspect
@@ -433,11 +435,13 @@ def _model_tool_call_for_internal(
 def _maybe_native_tool_param(
     tool: ToolInfo,
+    model_name: str,
     config: GenerateConfig,
 ) -> ToolParam | None:
     return (
         (
-            maybe_computer_use_preview_tool(tool) or maybe_web_search_tool(tool)
+            maybe_computer_use_preview_tool(tool)
+            or maybe_web_search_tool(model_name, tool)
             # or self.text_editor_tool_param(tool)
             # or self.bash_tool_param(tool)
         )
@@ -502,11 +506,12 @@ _ResponseToolCallParam = (
 def _tool_param_for_tool_info(
     tool: ToolInfo,
+    model_name: str,
     config: GenerateConfig,
 ) -> ToolParam:
     # Use a native tool implementation when available. Otherwise, use the
     # standard tool implementation
-    return _maybe_native_tool_param(tool, config) or FunctionToolParam(
+    return _maybe_native_tool_param(tool, model_name, config) or FunctionToolParam(
         type="function",
         name=_responses_tool_alias(tool.name),
         description=tool.description,

inspect_ai/model/_openai_web_search.py CHANGED Viewed

@@ -4,11 +4,18 @@ from openai.types.responses import WebSearchTool, WebSearchToolParam
 from inspect_ai.tool._tool_info import ToolInfo
+COMPATIBLE_MODELS = ["gpt-4o", "gpt-4o-mini", "gpt-4.1"]
-def maybe_web_search_tool(tool: ToolInfo) -> WebSearchToolParam | None:
+def maybe_web_search_tool(model_name: str, tool: ToolInfo) -> WebSearchToolParam | None:
     return (
         _web_search_tool(tool.options["openai"])
-        if tool.name == "web_search" and tool.options and "openai" in tool.options
+        if (
+            tool.name == "web_search"
+            and tool.options
+            and "openai" in tool.options
+            and model_name in COMPATIBLE_MODELS
+        )
         else None
     )

inspect_ai/model/_providers/openai.py CHANGED Viewed

@@ -27,6 +27,7 @@ from .._model_call import ModelCall
 from .._model_output import ModelOutput
 from .._openai import (
     OpenAIAsyncHttpxClient,
+    is_codex,
     is_computer_use_preview,
     is_gpt,
     is_o1,
@@ -88,8 +89,10 @@ class OpenAIAPI(ModelAPI):
         # is this a model we use responses api by default for?
         responses_model = (
-            self.is_o_series() and not self.is_o1_early()
-        ) or self.is_computer_use_preview()
+            (self.is_o_series() and not self.is_o1_early())
+            or self.is_computer_use_preview()
+            or self.is_codex()
+        )
         # resolve whether we are forcing the responses api
         self.responses_api = responses_api or responses_model
@@ -193,6 +196,9 @@ class OpenAIAPI(ModelAPI):
     def is_computer_use_preview(self) -> bool:
         return is_computer_use_preview(self.service_model_name())
+    def is_codex(self) -> bool:
+        return is_codex(self.service_model_name())
     def is_gpt(self) -> bool:
         return is_gpt(self.service_model_name())
@@ -242,7 +248,9 @@ class OpenAIAPI(ModelAPI):
                 tools=tools,
                 **self.completion_params(config, False),
             )
-        elif self.responses_api or is_native_tool_configured(tools, config):
+        elif self.responses_api or is_native_tool_configured(
+            tools, self.model_name, config
+        ):
             return await generate_responses(
                 client=self.client,
                 http_hooks=self._http_hooks,

inspect_ai/model/_providers/openai_responses.py CHANGED Viewed

@@ -59,7 +59,11 @@ async def generate_responses(
         )
     # prepare request (we do this so we can log the ModelCall)
-    tool_params = openai_responses_tools(tools, config) if len(tools) > 0 else NOT_GIVEN
+    tool_params = (
+        openai_responses_tools(tools, model_name, config)
+        if len(tools) > 0
+        else NOT_GIVEN
+    )
     request = dict(
         input=await openai_responses_inputs(input, model_name, store),
         tools=tool_params,

inspect_ai/scorer/_reducer/reducer.py CHANGED Viewed

@@ -121,7 +121,7 @@ def pass_at(
     def reduce(scores: list[Score]) -> Score:
         def pass_at_k(values: list[float]) -> float:
             total = len(scores)
-            correct = sum(1 for v in values if v == value)
+            correct = sum(1 for v in values if v >= value)
             if total - correct < k:
                 return 1.0
             else:

inspect_ai/tool/_tools/_web_search/_google.py CHANGED Viewed

@@ -32,9 +32,10 @@ class GoogleOptions(BaseModel):
 class SearchLink:
-    def __init__(self, url: str, snippet: str) -> None:
+    def __init__(self, url: str, snippet: str, title: str) -> None:
         self.url = url
         self.snippet = snippet
+        self.title = title
 def maybe_get_google_api_keys() -> tuple[str, str] | None:
@@ -71,8 +72,7 @@ def google_search_provider(
     async def search(query: str) -> str | None:
         # limit number of concurrent searches
         page_contents: list[str] = []
-        urls: list[str] = []
-        snippets: list[str] = []
+        processed_links: list[SearchLink] = []
         search_calls = 0
         # Paginate through search results until we have successfully extracted num_results pages or we have reached max_provider_calls
@@ -87,8 +87,7 @@ def google_search_provider(
                         page = await page_if_relevant(link.url, query, model, client)
                         if page:
                             page_contents.append(page)
-                            urls.append(link.url)
-                            snippets.append(link.snippet)
+                            processed_links.append(link)
                     # exceptions fetching pages are very common!
                     except Exception:
                         pass
@@ -98,8 +97,18 @@ def google_search_provider(
             search_calls += 1
-        all_page_contents = "\n\n".join(page_contents)
-        return None if all_page_contents == "" else all_page_contents
+        return (
+            "\n\n".join(
+                "[{title}]({url}):\n{page_content}".format(
+                    title=link.title, url=link.url, page_content=page_content
+                )
+                for link, page_content in zip(
+                    processed_links, page_contents, strict=True
+                )
+            )
+            if processed_links
+            else None
+        )
     async def _search(query: str, start_idx: int) -> list[SearchLink]:
         # List of allowed parameters can be found https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list
@@ -121,13 +130,21 @@ def google_search_provider(
             before_sleep=log_httpx_retry_attempt(search_url),
         )
         async def execute_search() -> httpx.Response:
+            # See https://developers.google.com/custom-search/v1/reference/rest/v1/Search
             return await client.get(search_url)
         result = await execute_search()
         data = result.json()
         if "items" in data:
-            return [SearchLink(item["link"], item["snippet"]) for item in data["items"]]
+            return [
+                SearchLink(
+                    url=item["link"],
+                    snippet=item.get("snippet", ""),  # sometimes not present
+                    title=item["title"],
+                )
+                for item in data["items"]
+            ]
         else:
             return []
@@ -135,13 +152,13 @@ def google_search_provider(
 async def page_if_relevant(
-    link: str, query: str, relevance_model: str | None, client: httpx.AsyncClient
+    url: str, query: str, relevance_model: str | None, client: httpx.AsyncClient
 ) -> str | None:
     """
     Use parser model to determine if a web page contents is relevant to a query.
     Args:
-        link (str): Web page link.
+        url (str): Web page url.
         query (str): Search query.
         relevance_model (Model): Model used to parse web pages for relevance.
         client: (httpx.Client): HTTP client to use to fetch the page
@@ -156,7 +173,7 @@ async def page_if_relevant(
     # retrieve document
     try:
-        response = await client.get(link)
+        response = await client.get(url)
         response.raise_for_status()
     except httpx.HTTPError as exc:
         raise Exception(f"HTTP error occurred: {exc}")

inspect_ai/tool/_tools/_web_search/_tavily.py CHANGED Viewed

@@ -75,6 +75,7 @@ def tavily_search_provider(
     client = httpx.AsyncClient(timeout=30)
     async def search(query: str) -> str | None:
+        # See https://docs.tavily.com/documentation/api-reference/endpoint/search
         search_url = "https://api.tavily.com/search"
         headers = {
             "Authorization": f"Bearer {tavily_api_key}",
@@ -95,6 +96,15 @@ def tavily_search_provider(
             return response
         async with concurrency("tavily_web_search", max_connections):
-            return TavilySearchResponse.model_validate((await _search()).json()).answer
+            tavily_search_response = TavilySearchResponse.model_validate(
+                (await _search()).json()
+            )
+            results_str = "\n\n".join(
+                [
+                    f"[{result.title}]({result.url}):\n{result.content}"
+                    for result in tavily_search_response.results
+                ]
+            )
+            return f"Answer: {tavily_search_response.answer}\n\n{results_str}"
     return search

{inspect_ai-0.3.100.dist-info → inspect_ai-0.3.102.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inspect_ai
-Version: 0.3.100
+Version: 0.3.102
 Summary: Framework for large language model evaluations
 Author: UK AI Security Institute
 License: MIT License

inspect-ai 0.3.100__py3-none-any.whl → 0.3.102__py3-none-any.whl

inspect-ai 0.3.100__py3-none-any.whl → 0.3.102__py3-none-any.whl