inspect-ai 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63):
  1. inspect_ai/__init__.py +3 -2
  2. inspect_ai/_cli/cache.py +1 -1
  3. inspect_ai/_cli/common.py +15 -0
  4. inspect_ai/_cli/eval.py +4 -5
  5. inspect_ai/_cli/log.py +1 -1
  6. inspect_ai/_cli/sandbox.py +1 -1
  7. inspect_ai/_cli/trace.py +1 -1
  8. inspect_ai/_cli/view.py +1 -1
  9. inspect_ai/_display/core/config.py +3 -1
  10. inspect_ai/_eval/eval.py +55 -61
  11. inspect_ai/_eval/evalset.py +63 -154
  12. inspect_ai/_eval/loader.py +27 -54
  13. inspect_ai/_eval/registry.py +1 -10
  14. inspect_ai/_eval/run.py +3 -4
  15. inspect_ai/_eval/task/__init__.py +8 -2
  16. inspect_ai/_eval/task/log.py +9 -1
  17. inspect_ai/_eval/task/resolved.py +35 -0
  18. inspect_ai/_eval/task/task.py +50 -69
  19. inspect_ai/_eval/task/tasks.py +30 -0
  20. inspect_ai/_util/constants.py +3 -0
  21. inspect_ai/_util/dotenv.py +17 -0
  22. inspect_ai/_util/registry.py +43 -2
  23. inspect_ai/_view/server.py +28 -10
  24. inspect_ai/_view/www/dist/assets/index.css +4 -3
  25. inspect_ai/_view/www/dist/assets/index.js +13030 -25523
  26. inspect_ai/_view/www/package.json +2 -2
  27. inspect_ai/_view/www/src/appearance/styles.ts +6 -5
  28. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
  29. inspect_ai/_view/www/src/constants.ts +3 -0
  30. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
  31. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
  32. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
  33. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
  34. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
  35. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
  36. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
  37. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
  38. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
  39. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
  40. inspect_ai/_view/www/yarn.lock +12 -5
  41. inspect_ai/log/_log.py +10 -1
  42. inspect_ai/log/_recorders/eval.py +27 -8
  43. inspect_ai/log/_recorders/json.py +2 -2
  44. inspect_ai/model/_cache.py +3 -1
  45. inspect_ai/model/_chat_message.py +12 -1
  46. inspect_ai/model/_model.py +25 -11
  47. inspect_ai/model/_providers/anthropic.py +34 -2
  48. inspect_ai/model/_providers/google.py +6 -2
  49. inspect_ai/model/_providers/none.py +31 -0
  50. inspect_ai/model/_providers/providers.py +7 -0
  51. inspect_ai/solver/_bridge/bridge.py +1 -1
  52. inspect_ai/solver/_chain.py +7 -6
  53. inspect_ai/tool/_tools/_computer/_computer.py +1 -1
  54. inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
  55. inspect_ai/tool/_tools/_web_search.py +2 -2
  56. inspect_ai/util/_sandbox/context.py +2 -1
  57. inspect_ai/util/_sandbox/environment.py +17 -2
  58. {inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/METADATA +4 -4
  59. {inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/RECORD +63 -60
  60. {inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/WHEEL +1 -1
  61. {inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/LICENSE +0 -0
  62. {inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/entry_points.txt +0 -0
  63. {inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,7 @@ from inspect_ai.approval._policy import ApprovalPolicy, approval_policies_from_c
13
13
  from inspect_ai.dataset import Dataset, MemoryDataset, Sample
14
14
  from inspect_ai.log import EvalLog
15
15
  from inspect_ai.model import GenerateConfig
16
+ from inspect_ai.model._model import Model, get_model
16
17
  from inspect_ai.scorer import Metric, Scorer
17
18
  from inspect_ai.scorer._reducer import ScoreReducers, create_reducers
18
19
  from inspect_ai.solver import Plan, Solver, generate
@@ -50,6 +51,7 @@ class Task:
50
51
  cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
51
52
  scorer: Scorer | list[Scorer] | None = None,
52
53
  metrics: list[Metric] | dict[str, list[Metric]] | None = None,
54
+ model: str | Model | None = None,
53
55
  config: GenerateConfig = GenerateConfig(),
54
56
  sandbox: SandboxEnvironmentType | None = None,
55
57
  approval: str | list[ApprovalPolicy] | None = None,
@@ -67,42 +69,38 @@ class Task:
67
69
  """Create a task.
68
70
 
69
71
  Args:
70
- dataset (Dataset | Sequence[Sample]): Dataset to evaluate
71
- setup: (Solver | list[Solver] | None): Setup step (always run
72
- even when the main `solver` is replaced).
73
- solver: (Solver | list[Solver]): Solver or list of solvers.
74
- Defaults to generate(), a normal call to the model.
72
+ dataset: Dataset to evaluate
73
+ setup: Setup step (always run even when the main `solver` is replaced).
74
+ solver: Solver or list of solvers. Defaults to generate(), a normal call to the model.
75
75
  cleanup: Optional cleanup function for task. Called after
76
76
  all solvers have run for each sample (including if an
77
77
  exception occurs during the run)
78
- scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
79
- metrics (list[Metric] | dict[str, list[Metric]] | None):
80
- Alternative metrics (overrides the metrics provided by the specified scorer).
81
- config (GenerateConfig): Model generation config.
82
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
83
- (or optionally a str or tuple with a shorthand spec)
84
- approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
85
- Either a path to an approval policy config file or a list of approval policies.
86
- Defaults to no approval policy.
87
- epochs (int | Epochs | None): Epochs to repeat samples for and optional score
78
+ scorer: Scorer used to evaluate model output.
79
+ metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
80
+ model: Default model for task (Optional, defaults to eval model).
81
+ config: Model generation config.
82
+ sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
83
+ approval: Tool use approval policies.
84
+ Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
85
+ epochs: Epochs to repeat samples for and optional score
88
86
  reducer function(s) used to combine sample scores (defaults to "mean")
89
- fail_on_error (bool | float | None): `True` to fail on first sample error
87
+ fail_on_error: `True` to fail on first sample error
90
88
  (default); `False` to never fail on sample errors; Value between 0 and 1
91
89
  to fail if a proportion of total samples fails. Value greater than 1 to fail
92
90
  eval if a count of samples fails.
93
- message_limit (int | None): Limit on total messages used for each sample.
94
- token_limit (int | None): Limit on total tokens used for each sample.
91
+ message_limit: Limit on total messages used for each sample.
92
+ token_limit: Limit on total tokens used for each sample.
95
93
  time_limit: Limit on clock time (in seconds) for samples.
96
94
  working_limit: Limit on working time (in seconds) for sample. Working
97
95
  time includes model generation, tool calls, etc. but does not include
98
96
  time spent waiting on retries or shared resources.
99
- name: (str | None): Task name. If not specified is automatically
97
+ name: Task name. If not specified is automatically
100
98
  determined based on the name of the task directory (or "task")
101
99
  if its anonymous task (e.g. created in a notebook and passed to
102
100
  eval() directly)
103
- version: (int): Version of task (to distinguish evolutions
101
+ version: Version of task (to distinguish evolutions
104
102
  of the task spec or breaking changes to it)
105
- metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
103
+ metadata: Additional metadata to associate with the task.
106
104
  **kwargs: Deprecated arguments.
107
105
  """
108
106
  # handle deprecated args
@@ -135,6 +133,7 @@ class Task:
135
133
  self.cleanup = cleanup
136
134
  self.scorer = resolve_scorer(scorer)
137
135
  self.metrics = metrics
136
+ self.model = resolve_model(model)
138
137
  self.config = config
139
138
  self.sandbox = resolve_sandbox_environment(sandbox)
140
139
  self.approval = resolve_approval(approval)
@@ -176,6 +175,7 @@ def task_with(
176
175
  cleanup: Callable[[TaskState], Awaitable[None]] | None | NotGiven = NOT_GIVEN,
177
176
  scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
178
177
  metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
178
+ model: str | Model | NotGiven = NOT_GIVEN,
179
179
  config: GenerateConfig | NotGiven = NOT_GIVEN,
180
180
  sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
181
181
  approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
@@ -192,43 +192,39 @@ def task_with(
192
192
  """Task adapted with alternate values for one or more options.
193
193
 
194
194
  Args:
195
- task (Task): Task to adapt (it is deep copied prior to mutating options)
196
- dataset (Dataset | Sequence[Sample]): Dataset to evaluate
197
- setup: (Solver | list[Solver] | None): Setup step (always run
198
- even when the main `solver` is replaced).
199
- solver: (Solver | list[Solver]): Solver or list of solvers.
200
- Defaults to generate(), a normal call to the model.
195
+ task: Task to adapt (it is deep copied prior to mutating options)
196
+ dataset: Dataset to evaluate
197
+ setup: Setup step (always run even when the main `solver` is replaced).
198
+ solver: Solver or list of solvers. Defaults to generate(), a normal call to the model.
201
199
  cleanup: Optional cleanup function for task. Called after
202
200
  all solvers have run for each sample (including if an
203
201
  exception occurs during the run)
204
- scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
205
- metrics (list[Metric] | dict[str, list[Metric]] | None):
206
- Alternative metrics (overrides the metrics provided by the specified scorer).
207
- config (GenerateConfig): Model generation config.
208
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
209
- (or optionally a str or tuple with a shorthand spec)
210
- approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
211
- Either a path to an approval policy config file or a list of approval policies.
212
- Defaults to no approval policy.
213
- epochs (int | Epochs | None): Epochs to repeat samples for and optional score
202
+ scorer: Scorer used to evaluate model output.
203
+ metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
204
+ model: Default model for task (Optional, defaults to eval model).
205
+ config: Model generation config.
206
+ sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
207
+ approval: Tool use approval policies.
208
+ Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
209
+ epochs: Epochs to repeat samples for and optional score
214
210
  reducer function(s) used to combine sample scores (defaults to "mean")
215
- fail_on_error (bool | float | None): `True` to fail on first sample error
211
+ fail_on_error: `True` to fail on first sample error
216
212
  (default); `False` to never fail on sample errors; Value between 0 and 1
217
213
  to fail if a proportion of total samples fails. Value greater than 1 to fail
218
214
  eval if a count of samples fails.
219
- message_limit (int | None): Limit on total messages used for each sample.
220
- token_limit (int | None): Limit on total tokens used for each sample.
215
+ message_limit: Limit on total messages used for each sample.
216
+ token_limit: Limit on total tokens used for each sample.
221
217
  time_limit: Limit on clock time (in seconds) for samples.
222
- working_limit: Limit on execution time (in seconds) for sample. Execution
218
+ working_limit: Limit on working time (in seconds) for sample. Working
223
219
  time includes model generation, tool calls, etc. but does not include
224
220
  time spent waiting on retries or shared resources.
225
- name: (str | None): Task name. If not specified is automatically
221
+ name: Task name. If not specified is automatically
226
222
  determined based on the name of the task directory (or "task")
227
223
  if its anonymous task (e.g. created in a notebook and passed to
228
224
  eval() directly)
229
- version: (int): Version of task (to distinguish evolutions
225
+ version: Version of task (to distinguish evolutions
230
226
  of the task spec or breaking changes to it)
231
- metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
227
+ metadata: Additional metadata to associate with the task.
232
228
 
233
229
  Returns:
234
230
  Task: Task adapted with alternate options.
@@ -248,6 +244,8 @@ def task_with(
248
244
  task.scorer = resolve_scorer(scorer)
249
245
  if not isinstance(metrics, NotGiven):
250
246
  task.metrics = metrics
247
+ if not isinstance(model, NotGiven):
248
+ task.model = resolve_model(model)
251
249
  if not isinstance(config, NotGiven):
252
250
  task.config = config
253
251
  if not isinstance(sandbox, NotGiven):
@@ -307,34 +305,10 @@ class PreviousTask:
307
305
  id: str
308
306
  task: str | Task
309
307
  task_args: dict[str, Any]
308
+ model: Model | None
310
309
  log: EvalLog
311
310
 
312
311
 
313
- Tasks = (
314
- str
315
- | PreviousTask
316
- | TaskInfo
317
- | Task
318
- | Callable[..., Task]
319
- | type[Task]
320
- | list[str]
321
- | list[PreviousTask]
322
- | list[TaskInfo]
323
- | list[Task]
324
- | list[Callable[..., Task]]
325
- | list[type[Task]]
326
- | None
327
- )
328
- r"""One or more tasks.
329
-
330
- Tasks to be evaluated. Many forms of task specification are
331
- supported including directory names, task functions, task
332
- classes, and task instances (a single task or list of tasks
333
- can be specified). None is a request to read a task out
334
- of the current working directory.
335
- """
336
-
337
-
338
312
  def resolve_approval(
339
313
  approval: str | list[ApprovalPolicy] | None,
340
314
  ) -> list[ApprovalPolicy] | None:
@@ -370,6 +344,13 @@ def resolve_solver(solver: Solver | list[Solver]) -> Solver:
370
344
  return chain(solver) if isinstance(solver, list) else solver
371
345
 
372
346
 
347
+ def resolve_model(model: str | Model | None) -> Model | None:
348
+ if isinstance(model, str):
349
+ return get_model(model)
350
+ else:
351
+ return model
352
+
353
+
373
354
  def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
374
355
  return (
375
356
  scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None
@@ -0,0 +1,30 @@
1
+ from typing import Callable, TypeAlias
2
+
3
+ from .resolved import ResolvedTask
4
+ from .task import PreviousTask, Task, TaskInfo
5
+
6
+ Tasks: TypeAlias = (
7
+ str
8
+ | PreviousTask
9
+ | ResolvedTask
10
+ | TaskInfo
11
+ | Task
12
+ | Callable[..., Task]
13
+ | type[Task]
14
+ | list[str]
15
+ | list[PreviousTask]
16
+ | list[ResolvedTask]
17
+ | list[TaskInfo]
18
+ | list[Task]
19
+ | list[Callable[..., Task]]
20
+ | list[type[Task]]
21
+ | None
22
+ )
23
+ r"""One or more tasks.
24
+
25
+ Tasks to be evaluated. Many forms of task specification are
26
+ supported including directory names, task functions, task
27
+ classes, and task instances (a single task or list of tasks
28
+ can be specified). None is a request to read a task out
29
+ of the current working directory.
30
+ """
@@ -36,3 +36,6 @@ CONSOLE_DISPLAY_WIDTH = 120
36
36
  BASE_64_DATA_REMOVED = "<base64-data-removed>"
37
37
  SANDBOX_SETUP_TIMEOUT = 300
38
38
  NO_CONTENT = "(no content)"
39
+
40
+ DESERIALIZING = "deserializing"
41
+ DESERIALIZING_CONTEXT = {DESERIALIZING: True}
@@ -52,6 +52,9 @@ def init_dotenv() -> None:
52
52
  if inspect_log_dir:
53
53
  os.environ[INSPECT_LOG_DIR_VAR] = inspect_log_dir
54
54
 
55
+ # re-apply any env vars specified at the cli w/ --env
56
+ apply_cli_env()
57
+
55
58
 
56
59
  @contextlib.contextmanager
57
60
  def dotenv_environ(
@@ -76,3 +79,17 @@ def dotenv_environ(
76
79
  finally:
77
80
  os.environ.update(update_after)
78
81
  [os.environ.pop(k) for k in remove_after]
82
+
83
+
84
+ _cli_env: dict[str, Any] = {}
85
+
86
+
87
+ def init_cli_env(env: dict[str, Any]) -> None:
88
+ global _cli_env
89
+ _cli_env = env
90
+ apply_cli_env()
91
+
92
+
93
+ def apply_cli_env() -> None:
94
+ for var, value in _cli_env.items():
95
+ os.environ[var] = str(value)
@@ -5,6 +5,7 @@ from typing import Any, Callable, Literal, TypedDict, TypeGuard, cast
5
5
  from pydantic import BaseModel, Field
6
6
  from pydantic_core import to_jsonable_python
7
7
 
8
+ from inspect_ai._util.json import jsonable_python
8
9
  from inspect_ai._util.package import get_installed_package_name
9
10
 
10
11
  from .constants import PKG_NAME
@@ -198,13 +199,15 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
198
199
  def with_registry_info(o: object) -> object:
199
200
  return set_registry_info(o, registry_info(obj))
200
201
 
201
- # instantiate registry objects
202
+ # instantiate registry and model objects
202
203
  for param in kwargs.keys():
203
204
  value = kwargs[param]
204
205
  if is_registry_dict(value):
205
206
  kwargs[param] = registry_create(
206
207
  value["type"], value["name"], **value["params"]
207
208
  )
209
+ elif is_model_dict(value):
210
+ kwargs[param] = model_create_from_dict(value)
208
211
 
209
212
  if isclass(obj):
210
213
  return with_registry_info(obj(**kwargs))
@@ -380,6 +383,8 @@ def is_registry_dict(o: object) -> TypeGuard[RegistryDict]:
380
383
 
381
384
 
382
385
  def registry_value(o: object) -> Any:
386
+ from inspect_ai.model._model import Model
387
+
383
388
  # treat tuple as list
384
389
  if isinstance(o, tuple):
385
390
  o = list(o)
@@ -390,14 +395,50 @@ def registry_value(o: object) -> Any:
390
395
  elif isinstance(o, dict):
391
396
  return {k: registry_value(v) for k, v in o.items()}
392
397
  elif has_registry_params(o):
393
- return dict(
398
+ return RegistryDict(
394
399
  type=registry_info(o).type,
395
400
  name=registry_log_name(o),
396
401
  params=registry_params(o),
397
402
  )
403
+ elif isinstance(o, Model):
404
+ return ModelDict(
405
+ model=str(o),
406
+ config=jsonable_python(o.config),
407
+ base_url=o.api.base_url,
408
+ model_args=o.model_args,
409
+ )
398
410
  else:
399
411
  return o
400
412
 
401
413
 
402
414
  def registry_create_from_dict(d: RegistryDict) -> object:
403
415
  return registry_create(d["type"], d["name"], **d["params"])
416
+
417
+
418
+ class ModelDict(TypedDict):
419
+ model: str
420
+ config: dict[str, Any]
421
+ base_url: str | None
422
+ model_args: dict[str, Any]
423
+
424
+
425
+ def is_model_dict(o: object) -> TypeGuard[ModelDict]:
426
+ return (
427
+ isinstance(o, dict)
428
+ and "model" in o
429
+ and "config" in o
430
+ and "base_url" in o
431
+ and "model_args" in o
432
+ )
433
+
434
+
435
+ def model_create_from_dict(d: ModelDict) -> object:
436
+ from inspect_ai.model._generate_config import GenerateConfig
437
+ from inspect_ai.model._model import get_model
438
+
439
+ return get_model(
440
+ d["model"],
441
+ config=GenerateConfig(**d["config"]),
442
+ base_url=d["base_url"],
443
+ **d["model_args"],
444
+ )
@@ -57,8 +57,7 @@ def view_server(
57
57
  @routes.get("/api/logs/{log}")
58
58
  async def api_log(request: web.Request) -> web.Response:
59
59
  # log file requested
60
- file = request.match_info["log"]
61
- file = urllib.parse.unquote(file)
60
+ file = normalize_uri(request.match_info["log"])
62
61
  validate_log_file_request(file)
63
62
 
64
63
  # header_only is based on a size threshold
@@ -68,8 +67,7 @@ def view_server(
68
67
  @routes.get("/api/log-size/{log}")
69
68
  async def api_log_size(request: web.Request) -> web.Response:
70
69
  # log file requested
71
- file = request.match_info["log"]
72
- file = urllib.parse.unquote(file)
70
+ file = normalize_uri(request.match_info["log"])
73
71
  validate_log_file_request(file)
74
72
 
75
73
  return await log_size_response(file)
@@ -77,8 +75,7 @@ def view_server(
77
75
  @routes.get("/api/log-delete/{log}")
78
76
  async def api_log_delete(request: web.Request) -> web.Response:
79
77
  # log file requested
80
- file = request.match_info["log"]
81
- file = urllib.parse.unquote(file)
78
+ file = normalize_uri(request.match_info["log"])
82
79
  validate_log_file_request(file)
83
80
 
84
81
  return await log_delete_response(file)
@@ -86,8 +83,7 @@ def view_server(
86
83
  @routes.get("/api/log-bytes/{log}")
87
84
  async def api_log_bytes(request: web.Request) -> web.Response:
88
85
  # log file requested
89
- file = request.match_info["log"]
90
- file = urllib.parse.unquote(file)
86
+ file = normalize_uri(request.match_info["log"])
91
87
  validate_log_file_request(file)
92
88
 
93
89
  # header_only is based on a size threshold
@@ -106,7 +102,7 @@ def view_server(
106
102
  if authorization:
107
103
  request_log_dir = request.query.getone("log_dir", None)
108
104
  if request_log_dir:
109
- request_log_dir = urllib.parse.unquote(request_log_dir)
105
+ request_log_dir = normalize_uri(request_log_dir)
110
106
  else:
111
107
  request_log_dir = log_dir
112
108
  else:
@@ -121,7 +117,7 @@ def view_server(
121
117
  @routes.get("/api/log-headers")
122
118
  async def api_log_headers(request: web.Request) -> web.Response:
123
119
  files = request.query.getall("file", [])
124
- files = [urllib.parse.unquote(file) for file in files]
120
+ files = [normalize_uri(file) for file in files]
125
121
  map(validate_log_file_request, files)
126
122
  return await log_headers_response(files)
127
123
 
@@ -166,6 +162,28 @@ def view_server(
166
162
  )
167
163
 
168
164
 
165
+ def normalize_uri(uri: str) -> str:
166
+ """Normalize incoming URIs to a consistent format."""
167
+ # Decode any URL-encoded characters
168
+ parsed = urllib.parse.urlparse(urllib.parse.unquote(uri))
169
+
170
+ if parsed.scheme != "file":
171
+ # If this isn't a file uri, just unquote it
172
+ return urllib.parse.unquote(uri)
173
+
174
+ else:
175
+ # If this is a file uri, see whether we should process triple slashes
176
+ # down to double slashes
177
+ path = parsed.path
178
+
179
+ # Detect and normalize Windows-style file URIs
180
+ if path.startswith("/") and len(path) > 3 and path[2] == ":":
181
+ # Strip leading `/` before drive letter
182
+ path = path[1:]
183
+
184
+ return f"file://{path}"
185
+
186
+
169
187
  def log_listing_response(logs: list[EvalLogInfo], log_dir: str) -> web.Response:
170
188
  response = dict(
171
189
  log_dir=aliased_path(log_dir),
@@ -16346,7 +16346,7 @@ ul.jsondiffpatch-textdiff {
16346
16346
  column-gap: 0.5em;
16347
16347
  min-width: 200px;
16348
16348
  }
16349
- ._flatBody_gk2ju_1 {
16349
+ ._flatBody_1uw6w_1 {
16350
16350
  color: var(--bs-danger);
16351
16351
  display: grid;
16352
16352
  grid-template-columns: max-content max-content;
@@ -16354,16 +16354,17 @@ ul.jsondiffpatch-textdiff {
16354
16354
  margin-top: 0.4rem;
16355
16355
  }
16356
16356
 
16357
- ._iconSmall_gk2ju_9 {
16357
+ ._iconSmall_1uw6w_9 {
16358
16358
  font-size: var(--inspect-font-size-small);
16359
16359
  line-height: var(--inspect-font-size-small);
16360
16360
  height: var(--inspect-font-size-small);
16361
16361
  }
16362
16362
 
16363
- ._lineBase_gk2ju_15 {
16363
+ ._lineBase_1uw6w_15 {
16364
16364
  font-size: var(--inspect-font-size-base);
16365
16365
  line-height: var(--inspect-font-size-base);
16366
16366
  height: var(--inspect-font-size-base);
16367
+ max-width: 30em;
16367
16368
  }
16368
16369
  ._target_9qy4e_1 {
16369
16370
  padding-left: 0;