docent-python 0.1.5a0__tar.gz → 0.1.7a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/.gitignore +1 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/PKG-INFO +1 -1
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/agent_run.py +3 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/transcript.py +7 -0
- docent_python-0.1.7a0/docent/loaders/load_inspect.py +210 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/trace.py +137 -21
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/pyproject.toml +1 -1
- docent_python-0.1.5a0/docent/loaders/load_inspect.py +0 -88
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/LICENSE.md +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/README.md +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/__init__.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/_log_util/__init__.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/_log_util/logger.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/__init__.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/_tiktoken_util.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/chat/__init__.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/chat/content.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/chat/message.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/chat/tool.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/citation.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/metadata.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/regex.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/data_models/shared_types.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/py.typed +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/samples/__init__.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/samples/load.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/samples/log.eval +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/samples/tb_airline.json +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/sdk/__init__.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/sdk/client.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/docent/trace_temp.py +0 -0
- {docent_python-0.1.5a0 → docent_python-0.1.7a0}/uv.lock +0 -0
|
@@ -15,6 +15,7 @@ from pydantic import (
|
|
|
15
15
|
from docent.data_models._tiktoken_util import get_token_count, group_messages_into_ranges
|
|
16
16
|
from docent.data_models.transcript import (
|
|
17
17
|
Transcript,
|
|
18
|
+
TranscriptGroup,
|
|
18
19
|
TranscriptWithoutMetadataValidator,
|
|
19
20
|
fake_model_dump,
|
|
20
21
|
)
|
|
@@ -36,6 +37,7 @@ class AgentRun(BaseModel):
|
|
|
36
37
|
name: Optional human-readable name for the agent run.
|
|
37
38
|
description: Optional description of the agent run.
|
|
38
39
|
transcripts: Dict mapping transcript IDs to Transcript objects.
|
|
40
|
+
transcript_groups: Dict mapping transcript group IDs to TranscriptGroup objects.
|
|
39
41
|
metadata: Additional structured metadata about the agent run as a JSON-serializable dictionary.
|
|
40
42
|
"""
|
|
41
43
|
|
|
@@ -44,6 +46,7 @@ class AgentRun(BaseModel):
|
|
|
44
46
|
description: str | None = None
|
|
45
47
|
|
|
46
48
|
transcripts: dict[str, Transcript]
|
|
49
|
+
transcript_groups: dict[str, TranscriptGroup] = Field(default_factory=dict)
|
|
47
50
|
metadata: dict[str, Any] = Field(default_factory=dict)
|
|
48
51
|
|
|
49
52
|
@field_serializer("metadata")
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import sys
|
|
2
|
+
from datetime import datetime
|
|
2
3
|
from typing import Any
|
|
3
4
|
from uuid import uuid4
|
|
4
5
|
|
|
@@ -73,6 +74,8 @@ class TranscriptGroup(BaseModel):
|
|
|
73
74
|
id: Unique identifier for the transcript group, auto-generated by default.
|
|
74
75
|
name: Optional human-readable name for the transcript group.
|
|
75
76
|
description: Optional description of the transcript group.
|
|
77
|
+
collection_id: ID of the collection this transcript group belongs to.
|
|
78
|
+
agent_run_id: ID of the agent run this transcript group belongs to.
|
|
76
79
|
parent_transcript_group_id: Optional ID of the parent transcript group.
|
|
77
80
|
metadata: Additional structured metadata about the transcript group.
|
|
78
81
|
"""
|
|
@@ -80,7 +83,10 @@ class TranscriptGroup(BaseModel):
|
|
|
80
83
|
id: str = Field(default_factory=lambda: str(uuid4()))
|
|
81
84
|
name: str | None = None
|
|
82
85
|
description: str | None = None
|
|
86
|
+
collection_id: str
|
|
87
|
+
agent_run_id: str
|
|
83
88
|
parent_transcript_group_id: str | None = None
|
|
89
|
+
created_at: datetime | None = None
|
|
84
90
|
metadata: dict[str, Any] = Field(default_factory=dict)
|
|
85
91
|
|
|
86
92
|
@field_serializer("metadata")
|
|
@@ -129,6 +135,7 @@ class Transcript(BaseModel):
|
|
|
129
135
|
name: str | None = None
|
|
130
136
|
description: str | None = None
|
|
131
137
|
transcript_group_id: str | None = None
|
|
138
|
+
created_at: datetime | None = None
|
|
132
139
|
|
|
133
140
|
messages: list[ChatMessage]
|
|
134
141
|
metadata: dict[str, Any] = Field(default_factory=dict)
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, BinaryIO, Generator, Tuple
|
|
4
|
+
from zipfile import ZipFile
|
|
5
|
+
|
|
6
|
+
from inspect_ai.log import EvalLog
|
|
7
|
+
from inspect_ai.scorer import CORRECT, INCORRECT, NOANSWER, PARTIAL, Score
|
|
8
|
+
|
|
9
|
+
from docent.data_models import AgentRun, Transcript
|
|
10
|
+
from docent.data_models.chat import parse_chat_message
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _normalize_inspect_score(score: Score | dict[str, Any]) -> Any:
|
|
14
|
+
"""
|
|
15
|
+
Normalize an inspect score to a float. Logic mirrors inspect_ai.scorer._metric.value_to_float.
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
score: The inspect score to normalize.
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
The normalized score as a float, or None if the score is not a valid value.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def _leaf_normalize(value: Any) -> Any:
|
|
25
|
+
if value is None:
|
|
26
|
+
return None
|
|
27
|
+
if isinstance(value, int | float | bool):
|
|
28
|
+
return float(value)
|
|
29
|
+
if value == CORRECT:
|
|
30
|
+
return 1.0
|
|
31
|
+
if value == PARTIAL:
|
|
32
|
+
return 0.5
|
|
33
|
+
if value in [INCORRECT, NOANSWER]:
|
|
34
|
+
return 0
|
|
35
|
+
value = str(value).lower()
|
|
36
|
+
if value in ["yes", "true"]:
|
|
37
|
+
return 1.0
|
|
38
|
+
if value in ["no", "false"]:
|
|
39
|
+
return 0.0
|
|
40
|
+
if value.replace(".", "").isnumeric():
|
|
41
|
+
return float(value)
|
|
42
|
+
return value
|
|
43
|
+
|
|
44
|
+
if isinstance(score, dict):
|
|
45
|
+
value = score["value"]
|
|
46
|
+
else:
|
|
47
|
+
value = score.value
|
|
48
|
+
|
|
49
|
+
if isinstance(value, int | float | bool | str):
|
|
50
|
+
return _leaf_normalize(value)
|
|
51
|
+
if isinstance(value, list):
|
|
52
|
+
return [_leaf_normalize(v) for v in value] # type: ignore
|
|
53
|
+
assert isinstance(value, dict), "Inspect score must be leaf value, list, or dict"
|
|
54
|
+
return {k: _leaf_normalize(v) for k, v in value.items()} # type: ignore
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def load_inspect_log(log: EvalLog) -> list[AgentRun]:
    """
    Build one AgentRun per sample of an inspect EvalLog.

    Args:
        log: The inspect evaluation log to convert.

    Returns:
        A list with one AgentRun per sample, each holding a single "main" transcript
        built from the sample's messages. Empty when the log has no samples.
    """
    if log.samples is None:
        return []

    # TODO(vincent): fix this
    runs: list[AgentRun] = []

    for sample in log.samples:
        raw_scores = sample.scores
        if raw_scores is not None:
            normalized = {
                name: _normalize_inspect_score(score) for name, score in raw_scores.items()
            }
        else:
            normalized = {}

        run_metadata = {
            "task_id": log.eval.task,
            "sample_id": str(sample.id),
            "epoch_id": sample.epoch,
            "model": log.eval.model,
            "additional_metadata": sample.metadata,
            "scores": normalized,
            # Scores could have answers, explanations, and other metadata besides the values we extract
            "scoring_metadata": sample.scores,
        }

        main_transcript = Transcript(
            messages=[parse_chat_message(msg.model_dump()) for msg in sample.messages],
            metadata={},
        )
        runs.append(AgentRun(transcripts={"main": main_transcript}, metadata=run_metadata))

    return runs
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _read_sample_as_run(
    data: dict[str, Any], header_metadata: dict[str, Any] | None = None
) -> AgentRun:
    """
    Convert one sample dict (as loaded from an inspect log file) into an AgentRun.

    Args:
        data: The sample dict. ``"messages"`` is required; ``"scores"``, ``"metadata"``,
            ``"id"``, ``"epoch"`` and ``"target"`` are read when present.
        header_metadata: Optional metadata taken from the log header and merged into
            the run metadata. Defaults to no header metadata.

    Returns:
        An AgentRun with a single "main" transcript and the merged run metadata.

    Raises:
        KeyError: If ``data`` has no ``"messages"`` key.
    """
    # "scores" / "metadata" may be absent or explicitly null in the JSON; treat both
    # cases as empty instead of failing on None.
    raw_scores = data.get("scores") or {}
    normalized_scores = {k: _normalize_inspect_score(v) for k, v in raw_scores.items()}
    sample_metadata = data.get("metadata") or {}

    run_metadata: dict[str, Any] = {
        "sample_id": data.get("id"),
        "epoch": data.get("epoch"),
        "target": data.get("target"),
        # Scores could have answers, explanations, and other metadata besides the values we extract
        "scoring_metadata": data.get("scores"),
        "scores": normalized_scores,
        # If a key exists in header and sample, sample takes precedence
        **(header_metadata or {}),
        **sample_metadata,
    }

    return AgentRun(
        transcripts={
            "main": Transcript(
                messages=[parse_chat_message(m) for m in data["messages"]], metadata={}
            ),
        },
        metadata=run_metadata,
    )
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _run_metadata_from_header(header: dict[str, Any]) -> dict[str, Any]:
|
|
134
|
+
"""
|
|
135
|
+
Inspect logs often have a lot of metadata.
|
|
136
|
+
This function tries to get the most important stuff without adding clutter.
|
|
137
|
+
"""
|
|
138
|
+
m: dict[str, Any] = {}
|
|
139
|
+
if e := header.get("eval"):
|
|
140
|
+
m["task"] = e["task"]
|
|
141
|
+
m["model"] = e["model"]
|
|
142
|
+
return m
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def get_total_samples(file_path: Path, format: str = "json") -> int:
    """
    Return the total number of samples in the provided file.

    Args:
        file_path: Path of the log file to inspect.
        format: ``"json"`` for a single JSON document with a ``"samples"`` list, or
            ``"eval"`` for a zip archive with one ``samples/*.json`` member per sample.

    Returns:
        The number of samples. A JSON log whose ``"samples"`` entry is missing or
        null counts as 0.

    Raises:
        ValueError: If ``format`` is neither ``"json"`` nor ``"eval"``.
    """
    with open(file_path, "rb") as f:
        if format == "json":
            data = json.load(f)
            # `or []` also covers an explicit `"samples": null`.
            return len(data.get("samples") or [])
        if format == "eval":
            # The context manager closes the archive on every exit path.
            with ZipFile(f, mode="r") as archive:
                return sum(
                    1
                    for name in archive.namelist()
                    if name.startswith("samples/") and name.endswith(".json")
                )
        raise ValueError(f"Format must be 'json' or 'eval': {format}")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _runs_from_eval_file(
    file: BinaryIO,
) -> Tuple[dict[str, Any], Generator[AgentRun, None, None]]:
    """
    Read an inspect ``.eval`` zip archive.

    Args:
        file: Binary file object positioned at the start of the zip archive.

    Returns:
        A tuple of (header metadata, generator of AgentRuns). The generator reads one
        ``samples/*.json`` member per step and closes the archive in its ``finally``
        clause, i.e. once iteration finishes, fails, or the generator is closed.
    """
    archive = ZipFile(file, mode="r")
    try:
        # Close the member handle as soon as the header is parsed.
        with archive.open("header.json", "r") as header_file:
            header: dict[str, Any] = json.load(header_file)
        header_metadata = _run_metadata_from_header(header)
    except BaseException:
        # The generator below never gets created on this path, so release the
        # archive here before propagating the error.
        archive.close()
        raise

    def _iter_runs() -> Generator[AgentRun, None, None]:
        try:
            for sample_file in archive.namelist():
                if not (sample_file.startswith("samples/") and sample_file.endswith(".json")):
                    continue
                with archive.open(sample_file, "r") as f:
                    data = json.load(f)
                run: AgentRun = _read_sample_as_run(data, header_metadata)
                yield run
        finally:
            archive.close()

    return header_metadata, _iter_runs()
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _runs_from_json_file(
    file: BinaryIO,
) -> Tuple[dict[str, Any], Generator[AgentRun, None, None]]:
    """
    Read an inspect log stored as a single JSON document.

    Args:
        file: Binary file object containing the JSON log.

    Returns:
        A tuple of (header metadata, generator of AgentRuns), one run per entry of the
        document's ``"samples"`` list. A missing or null ``"samples"`` entry yields
        no runs.
    """
    data = json.load(file)
    header_metadata = _run_metadata_from_header(data)
    # Resolve the sample list up front; `or []` covers both a missing key and an
    # explicit null so iteration cannot fail later on a log without samples.
    samples = data.get("samples") or []

    def _iter_runs() -> Generator[AgentRun, None, None]:
        for sample in samples:
            run: AgentRun = _read_sample_as_run(sample, header_metadata)
            yield run

    return header_metadata, _iter_runs()
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def runs_from_file(
    file: BinaryIO, format: str = "json"
) -> Tuple[dict[str, Any], Generator[AgentRun, None, None]]:
    """
    Load agent runs from an inspect log file in the given format.

    Args:
        file: Binary file object holding the log.
        format: ``"json"`` or ``"eval"``.

    Returns:
        The (header metadata, generator of AgentRuns) tuple produced by the
        format-specific reader.

    Raises:
        ValueError: If ``format`` is neither ``"json"`` nor ``"eval"``.
    """
    if format == "json":
        return _runs_from_json_file(file)
    if format == "eval":
        return _runs_from_eval_file(file)
    raise ValueError(f"Format must be 'json' or 'eval': {format}")
|
|
@@ -12,6 +12,7 @@ from contextlib import asynccontextmanager, contextmanager
|
|
|
12
12
|
from contextvars import ContextVar, Token
|
|
13
13
|
from datetime import datetime, timezone
|
|
14
14
|
from enum import Enum
|
|
15
|
+
from importlib.metadata import Distribution, distributions
|
|
15
16
|
from typing import Any, AsyncIterator, Callable, Dict, Iterator, List, Optional, Set, Union
|
|
16
17
|
|
|
17
18
|
import requests
|
|
@@ -19,10 +20,6 @@ from opentelemetry import trace
|
|
|
19
20
|
from opentelemetry.context import Context
|
|
20
21
|
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as GRPCExporter
|
|
21
22
|
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as HTTPExporter
|
|
22
|
-
from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor
|
|
23
|
-
from opentelemetry.instrumentation.bedrock import BedrockInstrumentor
|
|
24
|
-
from opentelemetry.instrumentation.langchain import LangchainInstrumentor
|
|
25
|
-
from opentelemetry.instrumentation.openai import OpenAIInstrumentor
|
|
26
23
|
from opentelemetry.instrumentation.threading import ThreadingInstrumentor
|
|
27
24
|
from opentelemetry.sdk.resources import Resource
|
|
28
25
|
from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor, TracerProvider
|
|
@@ -34,15 +31,19 @@ from opentelemetry.sdk.trace.export import (
|
|
|
34
31
|
from opentelemetry.trace import Span
|
|
35
32
|
|
|
36
33
|
# Configure logging
|
|
37
|
-
logging.basicConfig(level=logging.INFO)
|
|
38
34
|
logger = logging.getLogger(__name__)
|
|
39
|
-
logger.
|
|
35
|
+
logger.setLevel(logging.ERROR)
|
|
40
36
|
|
|
41
37
|
# Default configuration
|
|
42
38
|
DEFAULT_ENDPOINT = "https://api.docent.transluce.org/rest/telemetry"
|
|
43
39
|
DEFAULT_COLLECTION_NAME = "default-collection-name"
|
|
44
40
|
|
|
45
41
|
|
|
42
|
+
def _is_tracing_disabled() -> bool:
|
|
43
|
+
"""Check if tracing is disabled via environment variable."""
|
|
44
|
+
return os.environ.get("DOCENT_DISABLE_TRACING", "").lower() == "true"
|
|
45
|
+
|
|
46
|
+
|
|
46
47
|
class Instruments(Enum):
|
|
47
48
|
"""Enumeration of available instrument types."""
|
|
48
49
|
|
|
@@ -93,6 +94,13 @@ class DocentTracer:
|
|
|
93
94
|
instruments: Set of instruments to enable (None = all instruments)
|
|
94
95
|
block_instruments: Set of instruments to explicitly disable
|
|
95
96
|
"""
|
|
97
|
+
self._initialized: bool = False
|
|
98
|
+
# Check if tracing is disabled via environment variable
|
|
99
|
+
if _is_tracing_disabled():
|
|
100
|
+
self._disabled = True
|
|
101
|
+
logger.info("Docent tracing disabled via DOCENT_DISABLE_TRACING environment variable")
|
|
102
|
+
return
|
|
103
|
+
|
|
96
104
|
self.collection_name: str = collection_name
|
|
97
105
|
self.collection_id: str = collection_id if collection_id else str(uuid.uuid4())
|
|
98
106
|
self.default_agent_run_id: str = agent_run_id if agent_run_id else str(uuid.uuid4())
|
|
@@ -127,7 +135,6 @@ class DocentTracer:
|
|
|
127
135
|
self._tracer_provider: Optional[TracerProvider] = None
|
|
128
136
|
self._root_context: Optional[Context] = Context()
|
|
129
137
|
self._tracer: Optional[trace.Tracer] = None
|
|
130
|
-
self._initialized: bool = False
|
|
131
138
|
self._cleanup_registered: bool = False
|
|
132
139
|
self._disabled: bool = False
|
|
133
140
|
self._spans_processors: List[Union[BatchSpanProcessor, SimpleSpanProcessor]] = []
|
|
@@ -223,7 +230,7 @@ class DocentTracer:
|
|
|
223
230
|
exporters.append(exporter)
|
|
224
231
|
logger.info(f"Initialized exporter for endpoint: {endpoint}")
|
|
225
232
|
else:
|
|
226
|
-
logger.
|
|
233
|
+
logger.critical(f"Failed to initialize exporter for endpoint: {endpoint}")
|
|
227
234
|
|
|
228
235
|
return exporters
|
|
229
236
|
|
|
@@ -240,7 +247,12 @@ class DocentTracer:
|
|
|
240
247
|
|
|
241
248
|
def initialize(self):
|
|
242
249
|
"""Initialize Docent tracing setup."""
|
|
243
|
-
if self._initialized
|
|
250
|
+
if self._initialized:
|
|
251
|
+
return
|
|
252
|
+
|
|
253
|
+
# If tracing is disabled, mark as initialized but don't set up anything
|
|
254
|
+
if self._disabled:
|
|
255
|
+
self._initialized = True
|
|
244
256
|
return
|
|
245
257
|
|
|
246
258
|
try:
|
|
@@ -326,8 +338,6 @@ class DocentTracer:
|
|
|
326
338
|
logger.info(
|
|
327
339
|
f"Added {len(otlp_exporters)} OTLP exporters for {len(self.endpoints)} endpoints"
|
|
328
340
|
)
|
|
329
|
-
else:
|
|
330
|
-
logger.warning("Failed to initialize OTLP exporter")
|
|
331
341
|
|
|
332
342
|
if self.enable_console_export:
|
|
333
343
|
console_exporter: ConsoleSpanExporter = ConsoleSpanExporter()
|
|
@@ -355,32 +365,44 @@ class DocentTracer:
|
|
|
355
365
|
# Instrument OpenAI with our isolated tracer provider
|
|
356
366
|
if Instruments.OPENAI in enabled_instruments:
|
|
357
367
|
try:
|
|
358
|
-
|
|
359
|
-
|
|
368
|
+
if is_package_installed("openai"):
|
|
369
|
+
from opentelemetry.instrumentation.openai import OpenAIInstrumentor
|
|
370
|
+
|
|
371
|
+
OpenAIInstrumentor().instrument(tracer_provider=self._tracer_provider)
|
|
372
|
+
logger.info("Instrumented OpenAI")
|
|
360
373
|
except Exception as e:
|
|
361
374
|
logger.warning(f"Failed to instrument OpenAI: {e}")
|
|
362
375
|
|
|
363
376
|
# Instrument Anthropic with our isolated tracer provider
|
|
364
377
|
if Instruments.ANTHROPIC in enabled_instruments:
|
|
365
378
|
try:
|
|
366
|
-
|
|
367
|
-
|
|
379
|
+
if is_package_installed("anthropic"):
|
|
380
|
+
from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor
|
|
381
|
+
|
|
382
|
+
AnthropicInstrumentor().instrument(tracer_provider=self._tracer_provider)
|
|
383
|
+
logger.info("Instrumented Anthropic")
|
|
368
384
|
except Exception as e:
|
|
369
385
|
logger.warning(f"Failed to instrument Anthropic: {e}")
|
|
370
386
|
|
|
371
387
|
# Instrument Bedrock with our isolated tracer provider
|
|
372
388
|
if Instruments.BEDROCK in enabled_instruments:
|
|
373
389
|
try:
|
|
374
|
-
|
|
375
|
-
|
|
390
|
+
if is_package_installed("boto3"):
|
|
391
|
+
from opentelemetry.instrumentation.bedrock import BedrockInstrumentor
|
|
392
|
+
|
|
393
|
+
BedrockInstrumentor().instrument(tracer_provider=self._tracer_provider)
|
|
394
|
+
logger.info("Instrumented Bedrock")
|
|
376
395
|
except Exception as e:
|
|
377
396
|
logger.warning(f"Failed to instrument Bedrock: {e}")
|
|
378
397
|
|
|
379
398
|
# Instrument LangChain with our isolated tracer provider
|
|
380
399
|
if Instruments.LANGCHAIN in enabled_instruments:
|
|
381
400
|
try:
|
|
382
|
-
|
|
383
|
-
|
|
401
|
+
if is_package_installed("langchain") or is_package_installed("langgraph"):
|
|
402
|
+
from opentelemetry.instrumentation.langchain import LangchainInstrumentor
|
|
403
|
+
|
|
404
|
+
LangchainInstrumentor().instrument(tracer_provider=self._tracer_provider)
|
|
405
|
+
logger.info("Instrumented LangChain")
|
|
384
406
|
except Exception as e:
|
|
385
407
|
logger.warning(f"Failed to instrument LangChain: {e}")
|
|
386
408
|
|
|
@@ -397,6 +419,9 @@ class DocentTracer:
|
|
|
397
419
|
|
|
398
420
|
def cleanup(self):
|
|
399
421
|
"""Clean up Docent tracing resources and signal trace completion to backend."""
|
|
422
|
+
if self._disabled:
|
|
423
|
+
return
|
|
424
|
+
|
|
400
425
|
try:
|
|
401
426
|
# Notify backend that trace is done (no span creation)
|
|
402
427
|
try:
|
|
@@ -415,6 +440,9 @@ class DocentTracer:
|
|
|
415
440
|
|
|
416
441
|
def close(self):
|
|
417
442
|
"""Explicitly close the Docent tracing manager."""
|
|
443
|
+
if self._disabled:
|
|
444
|
+
return
|
|
445
|
+
|
|
418
446
|
try:
|
|
419
447
|
self.cleanup()
|
|
420
448
|
if self._cleanup_registered:
|
|
@@ -425,6 +453,9 @@ class DocentTracer:
|
|
|
425
453
|
|
|
426
454
|
def flush(self) -> None:
|
|
427
455
|
"""Force flush all spans to exporters."""
|
|
456
|
+
if self._disabled:
|
|
457
|
+
return
|
|
458
|
+
|
|
428
459
|
try:
|
|
429
460
|
for processor in self._spans_processors:
|
|
430
461
|
if hasattr(processor, "force_flush"):
|
|
@@ -440,8 +471,6 @@ class DocentTracer:
|
|
|
440
471
|
|
|
441
472
|
def verify_initialized(self) -> bool:
|
|
442
473
|
"""Verify if the manager is properly initialized."""
|
|
443
|
-
if self._disabled:
|
|
444
|
-
return False
|
|
445
474
|
return self._initialized
|
|
446
475
|
|
|
447
476
|
def __enter__(self) -> "DocentTracer":
|
|
@@ -487,6 +516,15 @@ class DocentTracer:
|
|
|
487
516
|
Yields:
|
|
488
517
|
Tuple of (agent_run_id, transcript_id)
|
|
489
518
|
"""
|
|
519
|
+
if self._disabled:
|
|
520
|
+
# Return dummy IDs when tracing is disabled
|
|
521
|
+
if agent_run_id is None:
|
|
522
|
+
agent_run_id = str(uuid.uuid4())
|
|
523
|
+
if transcript_id is None:
|
|
524
|
+
transcript_id = str(uuid.uuid4())
|
|
525
|
+
yield agent_run_id, transcript_id
|
|
526
|
+
return
|
|
527
|
+
|
|
490
528
|
if not self._initialized:
|
|
491
529
|
self.initialize()
|
|
492
530
|
|
|
@@ -535,6 +573,15 @@ class DocentTracer:
|
|
|
535
573
|
Yields:
|
|
536
574
|
Tuple of (agent_run_id, transcript_id)
|
|
537
575
|
"""
|
|
576
|
+
if self._disabled:
|
|
577
|
+
# Return dummy IDs when tracing is disabled
|
|
578
|
+
if agent_run_id is None:
|
|
579
|
+
agent_run_id = str(uuid.uuid4())
|
|
580
|
+
if transcript_id is None:
|
|
581
|
+
transcript_id = str(uuid.uuid4())
|
|
582
|
+
yield agent_run_id, transcript_id
|
|
583
|
+
return
|
|
584
|
+
|
|
538
585
|
if not self._initialized:
|
|
539
586
|
self.initialize()
|
|
540
587
|
|
|
@@ -600,6 +647,9 @@ class DocentTracer:
|
|
|
600
647
|
score: Numeric score value
|
|
601
648
|
attributes: Optional additional attributes
|
|
602
649
|
"""
|
|
650
|
+
if self._disabled:
|
|
651
|
+
return
|
|
652
|
+
|
|
603
653
|
collection_id = self.collection_id
|
|
604
654
|
payload: Dict[str, Any] = {
|
|
605
655
|
"collection_id": collection_id,
|
|
@@ -613,6 +663,9 @@ class DocentTracer:
|
|
|
613
663
|
self._post_json("/v1/scores", payload)
|
|
614
664
|
|
|
615
665
|
def send_agent_run_metadata(self, agent_run_id: str, metadata: Dict[str, Any]) -> None:
|
|
666
|
+
if self._disabled:
|
|
667
|
+
return
|
|
668
|
+
|
|
616
669
|
collection_id = self.collection_id
|
|
617
670
|
payload: Dict[str, Any] = {
|
|
618
671
|
"collection_id": collection_id,
|
|
@@ -640,6 +693,9 @@ class DocentTracer:
|
|
|
640
693
|
transcript_group_id: Optional transcript group ID
|
|
641
694
|
metadata: Optional metadata to send
|
|
642
695
|
"""
|
|
696
|
+
if self._disabled:
|
|
697
|
+
return
|
|
698
|
+
|
|
643
699
|
collection_id = self.collection_id
|
|
644
700
|
payload: Dict[str, Any] = {
|
|
645
701
|
"collection_id": collection_id,
|
|
@@ -705,6 +761,13 @@ class DocentTracer:
|
|
|
705
761
|
Yields:
|
|
706
762
|
The transcript ID
|
|
707
763
|
"""
|
|
764
|
+
if self._disabled:
|
|
765
|
+
# Return dummy ID when tracing is disabled
|
|
766
|
+
if transcript_id is None:
|
|
767
|
+
transcript_id = str(uuid.uuid4())
|
|
768
|
+
yield transcript_id
|
|
769
|
+
return
|
|
770
|
+
|
|
708
771
|
if not self._initialized:
|
|
709
772
|
raise RuntimeError(
|
|
710
773
|
"Tracer is not initialized. Call initialize_tracing() before using transcript context."
|
|
@@ -760,6 +823,13 @@ class DocentTracer:
|
|
|
760
823
|
Yields:
|
|
761
824
|
The transcript ID
|
|
762
825
|
"""
|
|
826
|
+
if self._disabled:
|
|
827
|
+
# Return dummy ID when tracing is disabled
|
|
828
|
+
if transcript_id is None:
|
|
829
|
+
transcript_id = str(uuid.uuid4())
|
|
830
|
+
yield transcript_id
|
|
831
|
+
return
|
|
832
|
+
|
|
763
833
|
if not self._initialized:
|
|
764
834
|
raise RuntimeError(
|
|
765
835
|
"Tracer is not initialized. Call initialize_tracing() before using transcript context."
|
|
@@ -811,10 +881,23 @@ class DocentTracer:
|
|
|
811
881
|
parent_transcript_group_id: Optional parent transcript group ID
|
|
812
882
|
metadata: Optional metadata to send
|
|
813
883
|
"""
|
|
884
|
+
if self._disabled:
|
|
885
|
+
return
|
|
886
|
+
|
|
814
887
|
collection_id = self.collection_id
|
|
888
|
+
|
|
889
|
+
# Get agent_run_id from current context
|
|
890
|
+
agent_run_id = self.get_current_agent_run_id()
|
|
891
|
+
if not agent_run_id:
|
|
892
|
+
logger.error(
|
|
893
|
+
f"Cannot send transcript group metadata for {transcript_group_id} - no agent_run_id in context"
|
|
894
|
+
)
|
|
895
|
+
return
|
|
896
|
+
|
|
815
897
|
payload: Dict[str, Any] = {
|
|
816
898
|
"collection_id": collection_id,
|
|
817
899
|
"transcript_group_id": transcript_group_id,
|
|
900
|
+
"agent_run_id": agent_run_id,
|
|
818
901
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
819
902
|
}
|
|
820
903
|
|
|
@@ -851,6 +934,13 @@ class DocentTracer:
|
|
|
851
934
|
Yields:
|
|
852
935
|
The transcript group ID
|
|
853
936
|
"""
|
|
937
|
+
if self._disabled:
|
|
938
|
+
# Return dummy ID when tracing is disabled
|
|
939
|
+
if transcript_group_id is None:
|
|
940
|
+
transcript_group_id = str(uuid.uuid4())
|
|
941
|
+
yield transcript_group_id
|
|
942
|
+
return
|
|
943
|
+
|
|
854
944
|
if not self._initialized:
|
|
855
945
|
raise RuntimeError(
|
|
856
946
|
"Tracer is not initialized. Call initialize_tracing() before using transcript group context."
|
|
@@ -908,6 +998,13 @@ class DocentTracer:
|
|
|
908
998
|
Yields:
|
|
909
999
|
The transcript group ID
|
|
910
1000
|
"""
|
|
1001
|
+
if self._disabled:
|
|
1002
|
+
# Return dummy ID when tracing is disabled
|
|
1003
|
+
if transcript_group_id is None:
|
|
1004
|
+
transcript_group_id = str(uuid.uuid4())
|
|
1005
|
+
yield transcript_group_id
|
|
1006
|
+
return
|
|
1007
|
+
|
|
911
1008
|
if not self._initialized:
|
|
912
1009
|
raise RuntimeError(
|
|
913
1010
|
"Tracer is not initialized. Call initialize_tracing() before using transcript group context."
|
|
@@ -944,6 +1041,9 @@ class DocentTracer:
|
|
|
944
1041
|
self._transcript_group_id_var.reset(transcript_group_id_token)
|
|
945
1042
|
|
|
946
1043
|
def _send_trace_done(self) -> None:
|
|
1044
|
+
if self._disabled:
|
|
1045
|
+
return
|
|
1046
|
+
|
|
947
1047
|
collection_id = self.collection_id
|
|
948
1048
|
payload: Dict[str, Any] = {
|
|
949
1049
|
"collection_id": collection_id,
|
|
@@ -1019,6 +1119,22 @@ def initialize_tracing(
|
|
|
1019
1119
|
return _global_tracer
|
|
1020
1120
|
|
|
1021
1121
|
|
|
1122
|
+
def _get_package_name(dist: Distribution) -> str | None:
|
|
1123
|
+
try:
|
|
1124
|
+
return dist.name.lower()
|
|
1125
|
+
except (KeyError, AttributeError):
|
|
1126
|
+
return None
|
|
1127
|
+
|
|
1128
|
+
|
|
1129
|
+
# Lower-cased names of every distribution whose name could be read, collected once
# at import time.
installed_packages = set()
for dist in distributions():
    name = _get_package_name(dist)
    if name is not None:
        installed_packages.add(name)
|
|
1132
|
+
|
|
1133
|
+
|
|
1134
|
+
def is_package_installed(package_name: str) -> bool:
    """Return True when ``package_name`` (compared lower-cased) is in ``installed_packages``."""
    normalized = package_name.lower()
    return normalized in installed_packages
|
|
1136
|
+
|
|
1137
|
+
|
|
1022
1138
|
def get_tracer() -> DocentTracer:
|
|
1023
1139
|
"""Get the global Docent tracer."""
|
|
1024
1140
|
if _global_tracer is None:
|
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
from typing import Any
|
|
2
|
-
|
|
3
|
-
from inspect_ai.log import EvalLog
|
|
4
|
-
from inspect_ai.scorer import CORRECT, INCORRECT, NOANSWER, PARTIAL, Score
|
|
5
|
-
|
|
6
|
-
from docent.data_models import AgentRun, Transcript
|
|
7
|
-
from docent.data_models.chat import parse_chat_message
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def _normalize_inspect_score(score: Score) -> Any:
|
|
11
|
-
"""
|
|
12
|
-
Normalize an inspect score to a float. This implements the same logic as inspect_ai.scorer._metric.value_to_float, but fails more conspicuously.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
score: The inspect score to normalize.
|
|
16
|
-
|
|
17
|
-
Returns:
|
|
18
|
-
The normalized score as a float, or None if the score is not a valid value.
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
def _leaf_normalize(value: int | float | bool | str | None) -> float | str | None:
|
|
22
|
-
if value is None:
|
|
23
|
-
return None
|
|
24
|
-
if isinstance(value, int | float | bool):
|
|
25
|
-
return float(value)
|
|
26
|
-
if value == CORRECT:
|
|
27
|
-
return 1.0
|
|
28
|
-
if value == PARTIAL:
|
|
29
|
-
return 0.5
|
|
30
|
-
if value in [INCORRECT, NOANSWER]:
|
|
31
|
-
return 0
|
|
32
|
-
value = str(value).lower()
|
|
33
|
-
if value in ["yes", "true"]:
|
|
34
|
-
return 1.0
|
|
35
|
-
if value in ["no", "false"]:
|
|
36
|
-
return 0.0
|
|
37
|
-
if value.replace(".", "").isnumeric():
|
|
38
|
-
return float(value)
|
|
39
|
-
return value
|
|
40
|
-
|
|
41
|
-
if isinstance(score.value, int | float | bool | str):
|
|
42
|
-
return _leaf_normalize(score.value)
|
|
43
|
-
if isinstance(score.value, list):
|
|
44
|
-
return [_leaf_normalize(v) for v in score.value]
|
|
45
|
-
assert isinstance(score.value, dict), "Inspect score must be leaf value, list, or dict"
|
|
46
|
-
return {k: _leaf_normalize(v) for k, v in score.value.items()}
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def load_inspect_log(log: EvalLog) -> list[AgentRun]:
|
|
50
|
-
if log.samples is None:
|
|
51
|
-
return []
|
|
52
|
-
|
|
53
|
-
# TODO(vincent): fix this
|
|
54
|
-
agent_runs: list[AgentRun] = []
|
|
55
|
-
|
|
56
|
-
for s in log.samples:
|
|
57
|
-
sample_id = s.id
|
|
58
|
-
epoch_id = s.epoch
|
|
59
|
-
|
|
60
|
-
if s.scores is None:
|
|
61
|
-
sample_scores = {}
|
|
62
|
-
else:
|
|
63
|
-
sample_scores = {k: _normalize_inspect_score(v) for k, v in s.scores.items()}
|
|
64
|
-
|
|
65
|
-
metadata = {
|
|
66
|
-
"task_id": log.eval.task,
|
|
67
|
-
"sample_id": str(sample_id),
|
|
68
|
-
"epoch_id": epoch_id,
|
|
69
|
-
"model": log.eval.model,
|
|
70
|
-
"additional_metadata": s.metadata,
|
|
71
|
-
"scores": sample_scores,
|
|
72
|
-
# Scores could have answers, explanations, and other metadata besides the values we extract
|
|
73
|
-
"scoring_metadata": s.scores,
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
agent_runs.append(
|
|
77
|
-
AgentRun(
|
|
78
|
-
transcripts={
|
|
79
|
-
"main": Transcript(
|
|
80
|
-
messages=[parse_chat_message(m.model_dump()) for m in s.messages],
|
|
81
|
-
metadata={},
|
|
82
|
-
)
|
|
83
|
-
},
|
|
84
|
-
metadata=metadata,
|
|
85
|
-
)
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
return agent_runs
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|