inspect-ai 0.3.94__py3-none-any.whl → 0.3.96__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. inspect_ai/_eval/loader.py +1 -1
  2. inspect_ai/_eval/task/run.py +12 -6
  3. inspect_ai/_util/exception.py +4 -0
  4. inspect_ai/_util/hash.py +39 -0
  5. inspect_ai/_util/local_server.py +16 -0
  6. inspect_ai/_util/path.py +22 -0
  7. inspect_ai/_util/trace.py +1 -1
  8. inspect_ai/_util/working.py +4 -0
  9. inspect_ai/_view/www/dist/assets/index.css +9 -9
  10. inspect_ai/_view/www/dist/assets/index.js +117 -120
  11. inspect_ai/_view/www/package.json +1 -1
  12. inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
  13. inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
  14. inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
  15. inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
  16. inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
  17. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
  18. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
  19. inspect_ai/_view/www/src/app/types.ts +12 -2
  20. inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
  21. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
  22. inspect_ai/_view/www/src/state/hooks.ts +19 -3
  23. inspect_ai/_view/www/src/state/logSlice.ts +23 -5
  24. inspect_ai/_view/www/yarn.lock +9 -9
  25. inspect_ai/agent/_bridge/patch.py +1 -3
  26. inspect_ai/agent/_types.py +1 -1
  27. inspect_ai/analysis/__init__.py +0 -0
  28. inspect_ai/analysis/beta/__init__.py +67 -0
  29. inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
  30. inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
  31. inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
  32. inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
  33. inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
  34. inspect_ai/analysis/beta/_dataframe/evals/table.py +177 -0
  35. inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
  36. inspect_ai/analysis/beta/_dataframe/events/columns.py +87 -0
  37. inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
  38. inspect_ai/analysis/beta/_dataframe/events/table.py +100 -0
  39. inspect_ai/analysis/beta/_dataframe/extract.py +73 -0
  40. inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
  41. inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
  42. inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
  43. inspect_ai/analysis/beta/_dataframe/messages/table.py +79 -0
  44. inspect_ai/analysis/beta/_dataframe/progress.py +26 -0
  45. inspect_ai/analysis/beta/_dataframe/record.py +377 -0
  46. inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
  47. inspect_ai/analysis/beta/_dataframe/samples/columns.py +77 -0
  48. inspect_ai/analysis/beta/_dataframe/samples/extract.py +54 -0
  49. inspect_ai/analysis/beta/_dataframe/samples/table.py +370 -0
  50. inspect_ai/analysis/beta/_dataframe/util.py +160 -0
  51. inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
  52. inspect_ai/log/_file.py +10 -3
  53. inspect_ai/log/_log.py +21 -1
  54. inspect_ai/model/_call_tools.py +2 -1
  55. inspect_ai/model/_model.py +6 -4
  56. inspect_ai/model/_openai_responses.py +17 -18
  57. inspect_ai/model/_providers/anthropic.py +30 -5
  58. inspect_ai/model/_providers/providers.py +1 -1
  59. inspect_ai/solver/_multiple_choice.py +4 -1
  60. inspect_ai/solver/_task_state.py +8 -4
  61. inspect_ai/tool/_mcp/_context.py +3 -5
  62. inspect_ai/tool/_mcp/_sandbox.py +17 -14
  63. inspect_ai/tool/_mcp/server.py +1 -1
  64. inspect_ai/tool/_tools/_think.py +1 -1
  65. inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
  66. inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
  67. inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
  68. inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
  69. inspect_ai/util/_sandbox/events.py +3 -2
  70. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/METADATA +9 -2
  71. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/RECORD +75 -46
  72. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/WHEEL +1 -1
  73. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/entry_points.txt +0 -0
  74. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/licenses/LICENSE +0 -0
  75. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,377 @@
1
+ import json
2
+ from datetime import date, datetime, time, timezone
3
+ from typing import Any, Callable, Literal, Type, cast, overload
4
+
5
+ import yaml
6
+ from jsonpath_ng import JSONPath # type: ignore
7
+ from pydantic import JsonValue
8
+
9
+ from inspect_ai.analysis.beta._dataframe.events.columns import EventColumn
10
+ from inspect_ai.analysis.beta._dataframe.messages.columns import MessageColumn
11
+ from inspect_ai.analysis.beta._dataframe.samples.columns import SampleColumn
12
+ from inspect_ai.log._log import EvalLog, EvalSample, EvalSampleSummary
13
+ from inspect_ai.log._transcript import BaseEvent, Event
14
+ from inspect_ai.model._chat_message import ChatMessage, ChatMessageBase
15
+
16
+ from .columns import Column, ColumnError, ColumnType
17
+ from .evals.columns import EvalColumn
18
+ from .extract import model_to_record
19
+
20
+
21
@overload
def import_record(
    record: EvalLog
    | EvalSampleSummary
    | EvalSample
    | ChatMessage
    | Event
    | dict[str, JsonValue],
    columns: list[Column],
    strict: Literal[True] = True,
) -> dict[str, ColumnType]: ...


@overload
def import_record(
    record: EvalLog
    | EvalSampleSummary
    | EvalSample
    | ChatMessage
    | Event
    | dict[str, JsonValue],
    columns: list[Column],
    strict: Literal[False],
) -> tuple[dict[str, ColumnType], list[ColumnError]]: ...


def import_record(
    record: EvalLog
    | EvalSampleSummary
    | EvalSample
    | ChatMessage
    | Event
    | dict[str, JsonValue],
    columns: list[Column],
    strict: bool = True,
) -> dict[str, ColumnType] | tuple[dict[str, ColumnType], list[ColumnError]]:
    """Read the requested columns out of a single record.

    Each column is resolved either through its JSON path or, when it has no
    path, through the extract callable attached to the column (which is
    handed the original, un-serialised record object). The raw value is then
    passed through `column.value`, defaulted, checked for `required`, and
    finally coerced to `column.type` by `_resolve_value`. Columns whose name
    ends with `*` are expanded into one result field per dictionary key.

    Args:
        record: Model object (or an already serialised dict) to read from.
        columns: Columns to extract.
        strict: When True the first problem raises; when False problems are
            collected and returned alongside the result.

    Returns:
        The extracted fields when `strict` is True, otherwise a tuple of
        `(fields, errors)`.

    Raises:
        ValueError: In strict mode, when a column cannot be read, a required
            column has no value, or a value cannot be coerced.
    """
    # Resolve the record model into a dict (and, for samples, a summary dict).
    # The original object is kept so it can be played back to columns that
    # yield their value using a callable.
    record_target = record
    record_summary: dict[str, JsonValue] | None = None
    if isinstance(record, EvalSample):
        record_summary = model_to_record(record.summary())
        record = model_to_record(record)
    elif isinstance(record, EvalSampleSummary):
        record_summary = model_to_record(record)
        record = record_summary
    elif isinstance(record, EvalLog | ChatMessageBase | BaseEvent):
        record = model_to_record(record)
    # otherwise the record is already a plain dict and is used as-is

    # return values
    result: dict[str, ColumnType] = {}
    errors: list[ColumnError] = []

    # helper to record a field w/ optional type checking/coercion
    def set_result(name: str, column: Column, value: JsonValue) -> None:
        try:
            result[name] = _resolve_value(value, column.type)
        except ValueError as ex:
            error = ColumnError(name, path=column.path, message=str(ex))
            if strict:
                raise ValueError(str(error)) from ex
            else:
                errors.append(error)

    # helper to raise or record error
    def field_not_found(
        name: str, path: JSONPath | None, required_type: str | None = None
    ) -> None:
        message = (
            f"field not of type {required_type}" if required_type else "field not found"
        )
        error = ColumnError(name, path=path, message=f"{message}")
        if strict:
            raise ValueError(str(error))
        else:
            errors.append(error)

    # process each column
    for column in columns:
        # start with none
        value: JsonValue = None

        # resolve path
        try:
            # read by path or extract function
            if column.path is not None:
                if not column.validate_path():
                    raise ValueError("Specified path is not valid")
                # sample columns may read from the summary or the full sample
                if isinstance(column, SampleColumn):
                    # A caller-supplied dict has no separate summary, so it is
                    # searched directly rather than searching `None`.
                    if column._full or (
                        record_summary is None and isinstance(record_target, dict)
                    ):
                        matches = column.path.find(record)
                    else:
                        matches = column.path.find(record_summary)
                else:
                    matches = column.path.find(record)

                if matches:
                    value = matches[0].value
            # some eval columns yield their value with an extract function
            elif (
                isinstance(column, EvalColumn)
                and column._extract_eval is not None
                and isinstance(record_target, EvalLog)
            ):
                value = column._extract_eval(record_target)
            # some sample columns yield their value with an extract function
            elif (
                isinstance(column, SampleColumn)
                and column._extract_sample is not None
                and isinstance(record_target, EvalSample | EvalSampleSummary)
            ):
                value = column._extract_sample(record_target)  # type: ignore[arg-type]
            # some message columns yield their value with an extract function
            elif (
                isinstance(column, MessageColumn)
                and column._extract_message is not None
                and isinstance(record_target, ChatMessageBase)
            ):
                value = column._extract_message(record_target)
            # some event columns yield their value with an extract function
            elif (
                isinstance(column, EventColumn)
                and column._extract_event is not None
                and isinstance(record_target, BaseEvent)
            ):
                value = column._extract_event(record_target)
            else:
                raise ValueError("column must have path or extract function")

            # call value function on column if it exists
            if value is not None:
                value = column.value(value)

        except Exception as ex:
            error = ColumnError(
                column.name,
                path=str(column.path) if column.path else None,
                message=str(ex),
            )
            if strict:
                raise ValueError(str(error)) from ex
            else:
                errors.append(error)
                continue

        # provide default if None
        if value is None and column.default is not None:
            value = column.default

        # check for required
        if column.required and value is None:
            field_not_found(column.name, column.path)

        # handle wildcard vs. no wildcard
        if column.name.endswith("*"):
            items = value if isinstance(value, list) else [value]
            for item in items:
                for field_name, field_value in _expand_fields(
                    column.name, item
                ).items():
                    set_result(field_name, column, field_value)
        else:
            set_result(column.name, column, value)

    # optionally return errors if we aren't in strict mode
    if strict:
        return result
    else:
        return result, errors
191
+
192
+
193
def resolve_duplicate_columns(columns: list[Column]) -> list[Column]:
    """Drop columns whose name is repeated later in the list.

    For every name only the last column carrying it is kept; the surviving
    columns stay in their original relative order.
    """
    # position of the final occurrence of each column name
    final_position = {col.name: idx for idx, col in enumerate(columns)}
    return [col for idx, col in enumerate(columns) if final_position[col.name] == idx]
203
+
204
+
205
def _resolve_value(
    value: JsonValue,
    type_: Type[ColumnType] | None = None,
) -> ColumnType:
    """
    Coerce *value* to *type_* (if supplied).

    Supported conversions
    ---------------------
    * Normal Python constructor coercion (`int("5")`, `str(3.14)` …)
    * Strings through YAML (handles "`true`", "`3.2`", "`2025-05-01`", …)
    * ISO-8601 strings to ``date``, ``time``, ``datetime``
    * POSIX timestamps (int/float **or** numeric string) → temporal types
    * When *value* is a ``list`` or ``dict`` **and** either
        - *type_* is ``str`` **or**
        - *type_* is ``None`` (unspecified),
      the structure is serialised with `json.dumps`

    Returns
    -------
    The coerced value, or ``None`` when *value* is ``None``.

    Raises
    ------
    ValueError
        When none of the conversions above produces a value.
    """
    # reflect None back
    if value is None:
        return None

    # auto-stringify compound types
    if isinstance(value, list | dict) and (type_ is None or type_ is str):
        return json.dumps(value)

    # we have now narrowed the value to not be None or a stringified compound
    value = cast(int | str | float | bool, value)

    # no target type → nothing to do
    if type_ is None:
        return value

    # already correct
    if isinstance(value, type_) and not _is_bool_int_mismatch(type_, value):
        return value

    # numeric timestamp → temporal
    if isinstance(value, int | float):
        coerced = _from_timestamp(type_, value)
        if coerced is not None:
            return coerced

    # Straight constructor. Skipped for str → bool: `bool("false")` is True
    # (any non-empty string is truthy), so those strings must be interpreted
    # by the string handling below instead.
    if not (type_ is bool and isinstance(value, str)):
        coerced = _try_constructor(type_, value)
        if coerced is not None:
            return coerced

    # string handling (YAML, ISO, numeric-string timestamp, …)
    if isinstance(value, str):
        coerced = _coerce_from_str(type_, value)
        if coerced is not None:
            return coerced

    # give up
    raise ValueError(
        f"Cannot coerce {value} (from type {type(value).__name__}) to {type_.__name__}"
    )
263
+
264
+
265
def _is_bool_int_mismatch(tp: Type[ColumnType], obj: Any) -> bool:
    """Tell whether *obj* is a ``bool`` while the target type *tp* is ``int``.

    ``bool`` is a subclass of ``int``, so an ``isinstance`` check alone would
    accept a boolean for an integer column; callers use this to reject that.
    """
    if tp is not int:
        return False
    return isinstance(obj, bool)
268
+
269
+
270
def _try_constructor(tp: Type[ColumnType], obj: Any) -> ColumnType:
    """Attempt ``tp(obj)``; yield ``None`` instead of raising when it fails.

    ``None`` is also returned when *obj* is ``None``, when *tp* is one of
    ``date`` / ``time`` / ``datetime`` (their constructors need several
    positional ints, so a single-argument call is never attempted), and when
    the constructed value trips `_is_bool_int_mismatch`.
    """
    if obj is None or tp in (date, time, datetime):
        return None

    try:
        built = tp(obj)  # type: ignore[call-arg, misc]
    except Exception:
        return None

    if _is_bool_int_mismatch(tp, built):
        return None
    return built
285
+
286
+
287
def _from_timestamp(tp: Type[ColumnType], ts: int | float) -> ColumnType | None:
    """Convert a POSIX timestamp to the requested temporal type, in UTC.

    Args:
        tp: Target type; only ``datetime``, ``date`` and ``time`` are handled.
        ts: Seconds since the epoch.

    Returns:
        An aware UTC ``datetime``, or the ``date`` / ``time`` part of that
        UTC moment. ``None`` when *tp* is not a temporal type or *ts* cannot
        be represented as a ``datetime``.
    """
    if tp is not datetime and tp is not date and tp is not time:
        return None

    # All three targets derive from the same UTC moment. `date.fromtimestamp`
    # would use the machine's local timezone and could be off by a day.
    try:
        moment = datetime.fromtimestamp(ts, tz=timezone.utc)
    except (OverflowError, OSError, ValueError):
        # out-of-range / non-finite timestamp: report "not convertible"
        return None

    if tp is datetime:
        return moment
    if tp is date:
        return moment.date()
    return moment.time()
296
+
297
+
298
def _coerce_from_str(tp: Type[ColumnType], text: str) -> ColumnType:
    """
    Best-effort conversion of *text* into *tp*, trying in order:

    1. YAML parsing of the text (used directly if it already has the target
       type, otherwise fed to the target constructor)
    2. ``tp.fromisoformat`` when the target class provides one
    3. Numeric text treated as a POSIX timestamp (temporal targets)
    4. The target constructor applied to the raw text

    The result of step 4 is returned as-is, so ``None`` signals failure.
    """
    # 1) YAML: any parse failure is treated the same as "nothing parsed"
    try:
        loaded = yaml.safe_load(text)
    except Exception:
        loaded = None

    if loaded is not None:
        # the parsed value may already be what we want
        if isinstance(loaded, tp) and not _is_bool_int_mismatch(tp, loaded):
            return cast(ColumnType, loaded)
        # otherwise feed the parsed value to the constructor
        via_yaml = _try_constructor(tp, loaded)
        if via_yaml is not None:
            return via_yaml

    # 2) fromisoformat, when the target type has it
    iso_parser: Callable[[str], datetime] | None = getattr(tp, "fromisoformat", None)
    if callable(iso_parser):
        try:
            return iso_parser(text)
        except Exception:
            pass

    # 3) numeric text interpreted as a timestamp
    try:
        stamp = float(text)
    except ValueError:
        pass
    else:
        via_stamp = _from_timestamp(tp, stamp)
        if via_stamp is not None:
            return via_stamp

    # 4) last resort: constructor on the raw text
    return _try_constructor(tp, text)
342
+
343
+
344
def _expand_fields(name: str, value: JsonValue) -> dict[str, JsonValue]:
    """Expand a wildcard field name into one entry per dictionary key.

    The first ``*`` in *name* is replaced by each key of *value*. If the rest
    of the name holds further asterisks, expansion recurses into values that
    are themselves dictionaries; other values are dropped. A name without an
    asterisk maps straight to *value*, and a wildcard name paired with a
    non-dictionary value expands to nothing.
    """
    head, star, tail = name.partition("*")

    # no wildcard at all: the name maps directly to the value
    if not star:
        return {name: value}

    # a wildcard can only be expanded over a dictionary
    if not isinstance(value, dict):
        return {}

    has_more_wildcards = "*" in tail
    fields: dict[str, JsonValue] = {}
    for key, child in value.items():
        field_name = head + key + tail
        if not has_more_wildcards:
            fields[field_name] = child
        elif isinstance(child, dict):
            # remaining asterisks are resolved against the nested dictionary
            fields.update(_expand_fields(field_name, child))
    return fields
@@ -0,0 +1,77 @@
1
+ from typing import Any, Callable, Mapping, Type
2
+
3
+ from jsonpath_ng import JSONPath # type: ignore
4
+ from pydantic import JsonValue
5
+ from typing_extensions import override
6
+
7
+ from inspect_ai.log._log import EvalSample, EvalSampleSummary
8
+
9
+ from ..columns import Column, ColumnType
10
+ from ..extract import list_as_str, score_values
11
+ from ..validate import resolved_schema
12
+ from .extract import (
13
+ sample_input_as_str,
14
+ sample_messages_as_str,
15
+ sample_path_requires_full,
16
+ )
17
+
18
+
19
class SampleColumn(Column):
    """Column whose value is read from an `EvalSample` or `EvalSampleSummary`."""

    # schemas handed back by `path_schema` (summary vs. full sample)
    summary_schema = resolved_schema(EvalSampleSummary)
    full_schema = resolved_schema(EvalSample)

    def __init__(
        self,
        name: str,
        *,
        path: str
        | JSONPath
        | Callable[[EvalSampleSummary], JsonValue]
        | Callable[[EvalSample], JsonValue],
        required: bool = False,
        default: JsonValue | None = None,
        type: Type[ColumnType] | None = None,
        value: Callable[[JsonValue], JsonValue] | None = None,
        full: bool = False,
    ) -> None:
        # A callable `path` is an extract function rather than a JSON path:
        # it is kept on the column and the base class is given no path.
        extractor = path if callable(path) else None
        super().__init__(
            name=name,
            path=None if extractor is not None else path,
            required=required,
            default=default,
            type=type,
            value=value,
        )
        self._extract_sample = extractor
        # the full sample is read when asked for explicitly or when the
        # path itself is reported as needing it
        self._full = full or sample_path_requires_full(path)

    @override
    def path_schema(self) -> Mapping[str, Any]:
        return self.full_schema if self._full else self.summary_schema
56
+
57
+
58
# Default columns read from a sample summary. Each entry maps an output column
# name to the path (or extract function) it is read from.
SampleSummary: list[Column] = [
    SampleColumn("id", path="id", required=True, type=str),
    SampleColumn("epoch", path="epoch", required=True),
    SampleColumn("input", path=sample_input_as_str, required=True),
    SampleColumn("target", path="target", required=True, value=list_as_str),
    SampleColumn("metadata_*", path="metadata"),
    SampleColumn("score_*", path="scores", value=score_values),
    SampleColumn("model_usage", path="model_usage"),
    SampleColumn("total_time", path="total_time"),
    # reads its own field; it previously duplicated `total_time`
    SampleColumn("working_time", path="working_time"),
    SampleColumn("error", path="error"),
    SampleColumn("limit", path="limit"),
    SampleColumn("retries", path="retries"),
]
"""Sample summary columns."""
73
+
74
# Single required column built by `sample_messages_as_str`; flagged `full=True`
# so it is read from the full sample rather than the summary.
SampleMessages: list[Column] = [
    SampleColumn(
        "messages",
        path=sample_messages_as_str,
        required=True,
        full=True,
    ),
]
"""Sample messages as a string."""
@@ -0,0 +1,54 @@
1
+ from typing import Callable
2
+
3
+ from jsonpath_ng import JSONPath # type: ignore
4
+ from pydantic import JsonValue
5
+
6
+ from inspect_ai.log._log import EvalSample, EvalSampleSummary
7
+
8
+ from ..extract import auto_id, messages_as_str
9
+
10
+
11
def sample_input_as_str(sample: EvalSample) -> str:
    """Return the sample's `input` converted to a string by `messages_as_str`."""
    sample_input = sample.input
    return messages_as_str(sample_input)
13
+
14
+
15
def sample_messages_as_str(sample: EvalSample) -> str:
    """Return the sample's `messages` converted to a string by `messages_as_str`."""
    sample_messages = sample.messages
    return messages_as_str(sample_messages)
17
+
18
+
19
def sample_path_requires_full(
    path: str
    | JSONPath
    | Callable[[EvalSampleSummary], JsonValue]
    | Callable[[EvalSample], JsonValue],
) -> bool:
    """Decide from a column path whether the full sample must be read.

    Args:
        path: JSON path (string or parsed object) or an extract callable.

    Returns:
        False for callables. Otherwise True when the textual form of the
        path, with any leading ``$.`` root anchor removed, starts with one of
        the prefixes listed below.
    """
    if callable(path):
        return False

    # path prefixes treated as needing the full sample
    full_sample_prefixes = (
        "choices",
        "sandbox",
        "files",
        "setup",
        "messages",
        "output",
        "store",
        "events",
        "uuid",
        "error_retries",
        "attachments",
    )

    # "$.messages" addresses the same field as "messages"; strip the root
    # anchor so both spellings are classified the same way
    text = str(path).removeprefix("$.")
    return text.startswith(full_sample_prefixes)
47
+
48
+
49
def auto_sample_id(eval_id: str, sample: EvalSample | EvalSampleSummary) -> str:
    """Build a sample id with `auto_id` from the eval id plus the sample's id and epoch."""
    sample_key = f"{sample.id}_{sample.epoch}"
    return auto_id(eval_id, sample_key)
51
+
52
+
53
def auto_detail_id(sample_id: str, name: str, index: int) -> str:
    """Build a detail id with `auto_id` from the sample id plus a name and index."""
    detail_key = f"{name}_{index}"
    return auto_id(sample_id, detail_key)