inspect-ai 0.3.94__py3-none-any.whl → 0.3.96__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/task/run.py +12 -6
- inspect_ai/_util/exception.py +4 -0
- inspect_ai/_util/hash.py +39 -0
- inspect_ai/_util/local_server.py +16 -0
- inspect_ai/_util/path.py +22 -0
- inspect_ai/_util/trace.py +1 -1
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/dist/assets/index.css +9 -9
- inspect_ai/_view/www/dist/assets/index.js +117 -120
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
- inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
- inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
- inspect_ai/_view/www/src/app/types.ts +12 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
- inspect_ai/_view/www/src/state/hooks.ts +19 -3
- inspect_ai/_view/www/src/state/logSlice.ts +23 -5
- inspect_ai/_view/www/yarn.lock +9 -9
- inspect_ai/agent/_bridge/patch.py +1 -3
- inspect_ai/agent/_types.py +1 -1
- inspect_ai/analysis/__init__.py +0 -0
- inspect_ai/analysis/beta/__init__.py +67 -0
- inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
- inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
- inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
- inspect_ai/analysis/beta/_dataframe/evals/table.py +177 -0
- inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/events/columns.py +87 -0
- inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +100 -0
- inspect_ai/analysis/beta/_dataframe/extract.py +73 -0
- inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
- inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
- inspect_ai/analysis/beta/_dataframe/messages/table.py +79 -0
- inspect_ai/analysis/beta/_dataframe/progress.py +26 -0
- inspect_ai/analysis/beta/_dataframe/record.py +377 -0
- inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +77 -0
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +54 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +370 -0
- inspect_ai/analysis/beta/_dataframe/util.py +160 -0
- inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
- inspect_ai/log/_file.py +10 -3
- inspect_ai/log/_log.py +21 -1
- inspect_ai/model/_call_tools.py +2 -1
- inspect_ai/model/_model.py +6 -4
- inspect_ai/model/_openai_responses.py +17 -18
- inspect_ai/model/_providers/anthropic.py +30 -5
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/solver/_multiple_choice.py +4 -1
- inspect_ai/solver/_task_state.py +8 -4
- inspect_ai/tool/_mcp/_context.py +3 -5
- inspect_ai/tool/_mcp/_sandbox.py +17 -14
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
- inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
- inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
- inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
- inspect_ai/util/_sandbox/events.py +3 -2
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/METADATA +9 -2
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/RECORD +75 -46
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/top_level.txt +0 -0
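The headline change in this release is the new `inspect_ai/analysis/beta` dataframe package listed above (column definitions, record import, and per-table modules for evals, samples, messages, and events). A minimal usage sketch follows, assuming the beta package exports dataframe constructors such as `samples_df` along with the `SampleSummary`/`SampleMessages` column groups defined later in this diff; the exact public exports are not shown in the hunks below, so verify against the installed 0.3.96 wheel.

```python
# Hypothetical usage sketch of the new beta dataframe API.
# Names are assumed from the file listing above, not confirmed by the hunks below.
from inspect_ai.analysis.beta import SampleMessages, SampleSummary, samples_df

# Build a DataFrame of sample summaries (plus rendered messages) from a logs directory.
df = samples_df("logs", columns=SampleSummary + SampleMessages)
print(df.columns)
```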
inspect_ai/analysis/beta/_dataframe/record.py
@@ -0,0 +1,377 @@

```python
import json
from datetime import date, datetime, time, timezone
from typing import Any, Callable, Literal, Type, cast, overload

import yaml
from jsonpath_ng import JSONPath  # type: ignore
from pydantic import JsonValue

from inspect_ai.analysis.beta._dataframe.events.columns import EventColumn
from inspect_ai.analysis.beta._dataframe.messages.columns import MessageColumn
from inspect_ai.analysis.beta._dataframe.samples.columns import SampleColumn
from inspect_ai.log._log import EvalLog, EvalSample, EvalSampleSummary
from inspect_ai.log._transcript import BaseEvent, Event
from inspect_ai.model._chat_message import ChatMessage, ChatMessageBase

from .columns import Column, ColumnError, ColumnType
from .evals.columns import EvalColumn
from .extract import model_to_record


@overload
def import_record(
    record: EvalLog
    | EvalSampleSummary
    | EvalSample
    | ChatMessage
    | Event
    | dict[str, JsonValue],
    columns: list[Column],
    strict: Literal[True] = True,
) -> dict[str, ColumnType]: ...


@overload
def import_record(
    record: EvalLog
    | EvalSampleSummary
    | EvalSample
    | ChatMessage
    | Event
    | dict[str, JsonValue],
    columns: list[Column],
    strict: Literal[False],
) -> tuple[dict[str, ColumnType], list[ColumnError]]: ...


def import_record(
    record: EvalLog
    | EvalSampleSummary
    | EvalSample
    | ChatMessage
    | Event
    | dict[str, JsonValue],
    columns: list[Column],
    strict: bool = True,
) -> dict[str, ColumnType] | tuple[dict[str, ColumnType], list[ColumnError]]:
    # resolve the record BaseModel into a dict (and optionally a summary dict).
    # summary dict will be required in the case that record is for samples.
    # we also want to save the original BaseModel (if any) for playing back
    # to columns that yield their value using a callable.
    record_target = record
    record_summary: dict[str, JsonValue] | None = None
    if isinstance(record, EvalSample):
        record_summary = model_to_record(record.summary())
        record = model_to_record(record)
    elif isinstance(record, EvalSampleSummary):
        record_summary = model_to_record(record)
        record = record_summary
    elif isinstance(record, EvalLog | ChatMessageBase | BaseEvent):
        record = model_to_record(record)
    else:
        record = record

    # return values
    result: dict[str, ColumnType] = {}
    errors: list[ColumnError] = []

    # helper to record a field w/ optional type checking/coercion
    def set_result(name: str, column: Column, value: JsonValue) -> None:
        try:
            result[name] = _resolve_value(value, column.type)
        except ValueError as ex:
            error = ColumnError(name, path=column.path, message=str(ex))
            if strict:
                raise ValueError(str(error))
            else:
                errors.append(error)

    # helper to raise or record errror
    def field_not_found(
        name: str, path: JSONPath | None, required_type: str | None = None
    ) -> None:
        message = (
            f"field not of type {required_type}" if required_type else "field not found"
        )
        error = ColumnError(name, path=path, message=f"{message}")
        if strict:
            raise ValueError(str(error))
        else:
            errors.append(error)

    # process each column
    for column in columns:
        # start with none
        value: JsonValue = None

        # resolve path
        try:
            # read by path or extract function
            if column.path is not None:
                if not column.validate_path():
                    raise ValueError("Specified path is not valid")
                # sample columns may read from summary of full sample
                if isinstance(column, SampleColumn):
                    matches = column.path.find(
                        record if column._full else record_summary
                    )
                else:
                    matches = column.path.find(record)

                if matches:
                    value = matches[0].value
            # some eval columns yield their value with an extract function
            elif (
                isinstance(column, EvalColumn)
                and column._extract_eval is not None
                and isinstance(record_target, EvalLog)
            ):
                value = column._extract_eval(record_target)
            # some sample columns yield their value with an extract function
            elif (
                isinstance(column, SampleColumn)
                and column._extract_sample is not None
                and isinstance(record_target, EvalSample | EvalSampleSummary)
            ):
                value = column._extract_sample(record_target)  # type: ignore[arg-type]
            elif (
                isinstance(column, MessageColumn)
                and column._extract_message is not None
                and isinstance(record_target, ChatMessageBase)
            ):
                value = column._extract_message(record_target)
            elif (
                isinstance(column, EventColumn)
                and column._extract_event is not None
                and isinstance(record_target, BaseEvent)
            ):
                value = column._extract_event(record_target)
            else:
                raise ValueError("column must have path or extract function")

            # call value function on column if it exists
            if value is not None:
                value = column.value(value)

        except Exception as ex:
            error = ColumnError(
                column.name,
                path=str(column.path) if column.path else None,
                message=str(ex),
            )
            if strict:
                raise ValueError(str(error))
            else:
                errors.append(error)
            continue

        # provide default if None
        if value is None and column.default is not None:
            value = column.default

        # check for required
        if column.required and value is None:
            field_not_found(column.name, column.path)

        # handle wildcard vs. no wildcard
        if column.name.endswith("*"):
            values = value if isinstance(value, list) else [value]
            for value in values:
                expanded = _expand_fields(column.name, value)
                for k, v in expanded.items():
                    set_result(k, column, v)
        else:
            set_result(column.name, column, value)

    # optionally return errors if we aren't in strict mode
    if strict:
        return result
    else:
        return result, errors


def resolve_duplicate_columns(columns: list[Column]) -> list[Column]:
    """Remove duplicate columns (with the later columns winning)"""
    seen = set[str]()
    deduped: list[Column] = []
    for col in reversed(columns):
        if col.name not in seen:
            deduped.append(col)
            seen.add(col.name)
    deduped.reverse()
    return deduped


def _resolve_value(
    value: JsonValue,
    type_: Type[ColumnType] | None = None,
) -> ColumnType:
    """
    Coerce *value* to *type_* (if supplied).

    Supported conversions
    ---------------------
    * Normal Python constructor coercion (`int("5")`, `str(3.14)` …)
    * Strings through YAML (handles "`true`", "`3.2`", "`2025-05-01`", …)
    * ISO-8601 strings to ``date``, ``time``, ``datetime``
    * POSIX timestamps (int/float **or** numeric string) → temporal types
    * When *value* is a ``list`` or ``dict`` **and** either
      - *type_* is ``str`` **or**
      - *type_* is ``None`` (unspecified),
      the structure is serialised with `json.dumps`
    """
    ## reflect none back
    if value is None:
        return None

    # auto-stringify compound types
    if isinstance(value, list | dict) and (type_ is None or type_ is str):
        return json.dumps(value)

    # we have now narrowed the value to not be none or a compound type
    value = cast(int | str | float | bool, value)

    # no target type or None → nothing to do
    if type_ is None:
        return value

    # already correct
    if isinstance(value, type_) and not _is_bool_int_mismatch(type_, value):
        return value

    # numeric timestamp → temporal
    if isinstance(value, int | float):
        coerced = _from_timestamp(type_, value)
        if coerced is not None:
            return coerced

    # straight constructor
    coerced = _try_constructor(type_, value)
    if coerced is not None:
        return coerced

    # 4) string handling (YAML, ISO, numeric-string timestamp, …)
    if isinstance(value, str):
        coerced = _coerce_from_str(type_, value)
        if coerced is not None:
            return coerced

    # give up
    raise ValueError(
        f"Cannot coerce {value} from type {type(value).__name__}) to {type_.__name__}"
    )


def _is_bool_int_mismatch(tp: Type[ColumnType], obj: Any) -> bool:
    """True when an *int* coercion would silently produce a *bool* (undesired)."""
    return tp is int and isinstance(obj, bool)


def _try_constructor(tp: Type[ColumnType], obj: Any) -> ColumnType:
    """Run `tp(obj)` but swallow any exception, return None on failure."""
    # Constructors of date / time / datetime require ≥3 positional ints, so don’t even try them.
    if tp in (date, time, datetime):
        return None

    # reflect None back
    if obj is None:
        return obj

    try:
        coerced = tp(obj)  # type: ignore[call-arg, misc]
    except Exception:
        return None
    return None if _is_bool_int_mismatch(tp, coerced) else coerced


def _from_timestamp(tp: Type[ColumnType], ts: int | float) -> ColumnType | None:
    """Convert POSIX timestamp to the requested temporal type, UTC zone."""
    if tp is datetime:
        return datetime.fromtimestamp(ts, tz=timezone.utc)
    if tp is date:
        return date.fromtimestamp(ts)
    if tp is time:  # derive from a datetime
        return datetime.fromtimestamp(ts, tz=timezone.utc).time()
    return None


def _coerce_from_str(tp: Type[ColumnType], text: str) -> ColumnType:
    """
    Best-effort coercion from *text* to *tp*:

    1. YAML parsing (catches booleans, numbers, ISO timestamps, …)
    2. `fromisoformat` when available on the target class
    3. Numeric-string → POSIX timestamp (for temporal targets)
    4. Constructor fall-back
    """
    # 1) YAML
    try:
        parsed = yaml.safe_load(text)
    except Exception:
        parsed = None

    if parsed is not None:
        # exact match?
        if isinstance(parsed, tp) and not _is_bool_int_mismatch(tp, parsed):
            return cast(ColumnType, parsed)
        # try constructor on the YAML result (e.g. str→float via YAML "1.5")
        coerced = _try_constructor(tp, parsed)
        if coerced is not None:
            return coerced

    # 2) fromisoformat — only on temporal types and str itself
    from_iso: Callable[[str], datetime] | None = getattr(tp, "fromisoformat", None)
    if callable(from_iso):
        try:
            return from_iso(text)
        except Exception:
            pass

    # 3) numeric string timestamp?
    try:
        tstmp = float(text)
    except ValueError:
        tstmp = None
    if tstmp is not None:
        coerced = _from_timestamp(tp, tstmp)
        if coerced is not None:
            return coerced

    # 4) plain constructor last
    return _try_constructor(tp, text)


def _expand_fields(name: str, value: JsonValue) -> dict[str, JsonValue]:
    result: dict[str, JsonValue] = {}

    # Base case: no asterisks in the field name
    if "*" not in name:
        result[name] = value
        return result

    # If there's an asterisk but value isn't a dictionary, we can't expand
    if not isinstance(value, dict):
        # Handle this case - either return empty dict, skip it, or use a default name
        # For now, I'll just return an empty dict
        return result

    # Get the position of the first asterisk
    asterisk_pos = name.find("*")
    prefix = name[:asterisk_pos]
    suffix = name[asterisk_pos + 1 :]

    # recursive case: expand each key in the dictionary
    for key, val in value.items():
        new_field = prefix + key + suffix
        # recursively expand any remaining asterisks
        if "*" in suffix:
            if isinstance(val, dict):
                expanded = _expand_fields(new_field, val)
                result.update(expanded)
            # If suffix has '*' but val is not a dict, skip it
            else:
                pass
        else:
            result[new_field] = val

    return result
```
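For reference, a short sketch of how the coercion and wildcard helpers above behave. This is illustrative only (it assumes the 0.3.96 wheel is installed and imports private helpers defined in the hunk above):

```python
# Illustrative only: exercises the private helpers from record.py shown above.
from datetime import datetime

from inspect_ai.analysis.beta._dataframe.record import _expand_fields, _resolve_value

# "metadata_*" fans a dict value out into one column per key
print(_expand_fields("metadata_*", {"difficulty": "hard", "seed": 7}))
# {'metadata_difficulty': 'hard', 'metadata_seed': 7}

# compound values serialize to JSON when the target type is str (or unspecified)
print(_resolve_value({"a": 1}, str))  # '{"a": 1}'

# ISO-8601 strings coerce to temporal types (via YAML / fromisoformat)
print(_resolve_value("2025-05-01T12:00:00+00:00", datetime))  # a datetime instance
```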
inspect_ai/analysis/beta/_dataframe/samples/__init__.py: file without changes (new empty file)
inspect_ai/analysis/beta/_dataframe/samples/columns.py
@@ -0,0 +1,77 @@

```python
from typing import Any, Callable, Mapping, Type

from jsonpath_ng import JSONPath  # type: ignore
from pydantic import JsonValue
from typing_extensions import override

from inspect_ai.log._log import EvalSample, EvalSampleSummary

from ..columns import Column, ColumnType
from ..extract import list_as_str, score_values
from ..validate import resolved_schema
from .extract import (
    sample_input_as_str,
    sample_messages_as_str,
    sample_path_requires_full,
)


class SampleColumn(Column):
    """Column which maps to `EvalSample` or `EvalSampleSummary`."""

    def __init__(
        self,
        name: str,
        *,
        path: str
        | JSONPath
        | Callable[[EvalSampleSummary], JsonValue]
        | Callable[[EvalSample], JsonValue],
        required: bool = False,
        default: JsonValue | None = None,
        type: Type[ColumnType] | None = None,
        value: Callable[[JsonValue], JsonValue] | None = None,
        full: bool = False,
    ) -> None:
        super().__init__(
            name=name,
            path=path if not callable(path) else None,
            required=required,
            default=default,
            type=type,
            value=value,
        )
        self._extract_sample = path if callable(path) else None
        self._full = full or sample_path_requires_full(path)

    @override
    def path_schema(self) -> Mapping[str, Any]:
        if self._full:
            return self.full_schema
        else:
            return self.summary_schema

    summary_schema = resolved_schema(EvalSampleSummary)
    full_schema = resolved_schema(EvalSample)


SampleSummary: list[Column] = [
    SampleColumn("id", path="id", required=True, type=str),
    SampleColumn("epoch", path="epoch", required=True),
    SampleColumn("input", path=sample_input_as_str, required=True),
    SampleColumn("target", path="target", required=True, value=list_as_str),
    SampleColumn("metadata_*", path="metadata"),
    SampleColumn("score_*", path="scores", value=score_values),
    SampleColumn("model_usage", path="model_usage"),
    SampleColumn("total_time", path="total_time"),
    SampleColumn("working_time", path="total_time"),
    SampleColumn("error", path="error"),
    SampleColumn("limit", path="limit"),
    SampleColumn("retries", path="retries"),
]
"""Sample summary columns."""

SampleMessages: list[Column] = [
    SampleColumn("messages", path=sample_messages_as_str, required=True, full=True)
]
"""Sample messages as a string."""
```
inspect_ai/analysis/beta/_dataframe/samples/extract.py
@@ -0,0 +1,54 @@

```python
from typing import Callable

from jsonpath_ng import JSONPath  # type: ignore
from pydantic import JsonValue

from inspect_ai.log._log import EvalSample, EvalSampleSummary

from ..extract import auto_id, messages_as_str


def sample_input_as_str(sample: EvalSample) -> str:
    return messages_as_str(sample.input)


def sample_messages_as_str(sample: EvalSample) -> str:
    return messages_as_str(sample.messages)


def sample_path_requires_full(
    path: str
    | JSONPath
    | Callable[[EvalSampleSummary], JsonValue]
    | Callable[[EvalSample], JsonValue],
) -> bool:
    if callable(path):
        return False
    else:
        path = str(path)
        return any(
            [
                path.startswith(prefix)
                for prefix in [
                    "choices",
                    "sandbox",
                    "files",
                    "setup",
                    "messages",
                    "output",
                    "store",
                    "events",
                    "uuid",
                    "error_retries",
                    "attachments",
                ]
            ]
        )


def auto_sample_id(eval_id: str, sample: EvalSample | EvalSampleSummary) -> str:
    return auto_id(eval_id, f"{sample.id}_{sample.epoch}")


def auto_detail_id(sample_id: str, name: str, index: int) -> str:
    return auto_id(sample_id, f"{name}_{index}")
```