docent-python 0.1.0a8__tar.gz → 0.1.0a9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/PKG-INFO +1 -1
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/__init__.py +6 -1
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/metadata.py +1 -1
- docent_python-0.1.0a9/docent/loaders/load_inspect.py +76 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/sdk/client.py +24 -1
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/pyproject.toml +1 -1
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/.gitignore +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/LICENSE.md +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/README.md +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/__init__.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/_log_util/__init__.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/_log_util/logger.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/_tiktoken_util.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/agent_run.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/chat/__init__.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/chat/content.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/chat/message.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/chat/tool.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/citation.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/regex.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/shared_types.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/data_models/transcript.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/py.typed +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/samples/__init__.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/samples/load.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/samples/log.eval +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/samples/tb_airline.json +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/docent/sdk/__init__.py +0 -0
- {docent_python-0.1.0a8 → docent_python-0.1.0a9}/uv.lock +0 -0
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
from docent.data_models.agent_run import AgentRun
|
|
2
2
|
from docent.data_models.citation import Citation
|
|
3
|
-
from docent.data_models.metadata import
|
|
3
|
+
from docent.data_models.metadata import (
|
|
4
|
+
BaseAgentRunMetadata,
|
|
5
|
+
BaseMetadata,
|
|
6
|
+
InspectAgentRunMetadata,
|
|
7
|
+
)
|
|
4
8
|
from docent.data_models.regex import RegexSnippet
|
|
5
9
|
from docent.data_models.transcript import Transcript
|
|
6
10
|
|
|
@@ -10,5 +14,6 @@ __all__ = [
|
|
|
10
14
|
"RegexSnippet",
|
|
11
15
|
"BaseAgentRunMetadata",
|
|
12
16
|
"BaseMetadata",
|
|
17
|
+
"InspectAgentRunMetadata",
|
|
13
18
|
"Transcript",
|
|
14
19
|
]
|
|
@@ -218,7 +218,7 @@ class InspectAgentRunMetadata(BaseAgentRunMetadata):
|
|
|
218
218
|
# Parameters for the run
|
|
219
219
|
model: str = Field(description="The model that was used to generate the transcript")
|
|
220
220
|
|
|
221
|
-
#
|
|
221
|
+
# Scoring
|
|
222
222
|
scoring_metadata: dict[str, Any] | None = Field(
|
|
223
223
|
description="Additional metadata about the scoring process"
|
|
224
224
|
)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from inspect_ai.log import EvalLog
|
|
2
|
+
from inspect_ai.scorer import CORRECT, INCORRECT, NOANSWER, PARTIAL, Score
|
|
3
|
+
|
|
4
|
+
from docent.data_models import AgentRun, InspectAgentRunMetadata, Transcript
|
|
5
|
+
from docent.data_models.chat import parse_chat_message
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _normalize_inspect_score(score: Score) -> float | None:
    """
    Normalize an inspect score to a float. This implements the same logic as inspect_ai.scorer._metric.value_to_float, but fails more conspicuously.

    Args:
        score: The inspect score to normalize. Only ``score.value`` is read.

    Returns:
        The normalized score as a float.

    Raises:
        ValueError: If the score value cannot be converted to a float.
    """

    raw = score.value

    # bool is a subclass of int, so True/False become 1.0/0.0 here.
    if isinstance(raw, int | float | bool):
        return float(raw)
    elif raw == CORRECT:
        return 1.0
    elif raw == PARTIAL:
        return 0.5
    elif raw == INCORRECT or raw == NOANSWER:
        # Return a real float so every successful path yields the same type.
        return 0.0
    elif isinstance(raw, str):
        text = raw.lower()
        if text in ["yes", "true"]:
            return 1.0
        elif text in ["no", "false"]:
            return 0.0
        elif text.replace(".", "").isnumeric():
            # The check above is only a coarse filter: strings such as "1.2.3"
            # pass it but are not valid floats. Report them with the same error
            # as every other unrecognized value.
            try:
                return float(text)
            except ValueError as err:
                raise ValueError(f"Unknown score value: {score.value}") from err

    raise ValueError(f"Unknown score value: {score.value}")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def load_inspect_log(log: EvalLog) -> list[AgentRun]:
    """
    Convert the samples of an inspect eval log into agent runs.

    Each sample becomes one AgentRun holding a single transcript under the key
    "main", built from the sample's messages, plus InspectAgentRunMetadata
    filled from the log's eval info and the sample itself.

    Args:
        log: The inspect eval log to convert.

    Returns:
        One AgentRun per sample, in sample order; an empty list when the log
        has no samples attribute value (``log.samples is None``).
    """
    samples = log.samples
    if samples is None:
        return []

    runs: list[AgentRun] = []

    for sample in samples:
        raw_scores = sample.scores

        # Numeric view of the scores; samples without scores get an empty mapping.
        normalized_scores = (
            {}
            if raw_scores is None
            else {name: _normalize_inspect_score(score) for name, score in raw_scores.items()}
        )

        run_metadata = InspectAgentRunMetadata(
            task_id=log.eval.task,
            sample_id=str(sample.id),
            epoch_id=sample.epoch,
            model=log.eval.model,
            additional_metadata=sample.metadata,
            scores=normalized_scores,
            # Scores could have answers, explanations, and other metadata besides the values we extract
            scoring_metadata=raw_scores,
        )

        main_transcript = Transcript(
            messages=[parse_chat_message(message.model_dump()) for message in sample.messages]
        )
        runs.append(AgentRun(transcripts={"main": main_transcript}, metadata=run_metadata))

    return runs
|
|
@@ -4,7 +4,7 @@ from typing import Any
|
|
|
4
4
|
import requests
|
|
5
5
|
|
|
6
6
|
from docent._log_util.logger import get_logger
|
|
7
|
-
from docent.data_models.agent_run import AgentRun
|
|
7
|
+
from docent.data_models.agent_run import AgentRun, AgentRunWithoutMetadataValidator
|
|
8
8
|
|
|
9
9
|
logger = get_logger(__name__)
|
|
10
10
|
|
|
@@ -268,3 +268,26 @@ class Docent:
|
|
|
268
268
|
response = self._session.post(url, json={"centroid": centroid})
|
|
269
269
|
response.raise_for_status()
|
|
270
270
|
return response.json()
|
|
271
|
+
|
|
272
|
+
def get_agent_run(self, collection_id: str, agent_run_id: str) -> AgentRun | None:
    """Get a specific agent run by its ID.

    Args:
        collection_id: ID of the Collection.
        agent_run_id: The ID of the agent run to retrieve.

    Returns:
        The agent run parsed from the response body, or None if the response
        body is JSON null.

    Raises:
        requests.exceptions.HTTPError: If the API request fails.
    """
    url = f"{self._server_url}/{collection_id}/agent_run"
    response = self._session.get(url, params={"agent_run_id": agent_run_id})
    response.raise_for_status()

    # Decode the body once and reuse it instead of parsing it a second time.
    payload = response.json()
    if payload is None:
        return None

    # We do this to avoid metadata validation failing
    # TODO(mengk): kinda hacky
    return AgentRunWithoutMetadataValidator.model_validate(payload)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|