docent-python 0.1.18a0__tar.gz → 0.1.19a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docent-python might be problematic.
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/PKG-INFO +1 -1
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/__init__.py +2 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/agent_run.py +5 -5
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/chat/__init__.py +6 -1
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/citation.py +103 -22
- docent_python-0.1.19a0/docent/data_models/judge.py +16 -0
- docent_python-0.1.19a0/docent/data_models/metadata_util.py +16 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/remove_invalid_citation_ranges.py +23 -10
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/transcript.py +18 -16
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/sdk/agent_run_writer.py +18 -5
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/sdk/client.py +104 -20
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/trace.py +54 -49
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/pyproject.toml +1 -1
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/uv.lock +1 -1
- docent_python-0.1.18a0/docent/data_models/metadata.py +0 -229
- docent_python-0.1.18a0/docent/data_models/yaml_util.py +0 -12
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/.gitignore +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/LICENSE.md +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/README.md +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/__init__.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/_log_util/__init__.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/_log_util/logger.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/_tiktoken_util.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/chat/content.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/chat/message.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/chat/tool.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/regex.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/data_models/shared_types.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/loaders/load_inspect.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/py.typed +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/samples/__init__.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/samples/load.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/samples/log.eval +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/samples/tb_airline.json +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/sdk/__init__.py +0 -0
- {docent_python-0.1.18a0 → docent_python-0.1.19a0}/docent/trace_temp.py +0 -0
docent/data_models/__init__.py

@@ -1,11 +1,13 @@
 from docent.data_models.agent_run import AgentRun
 from docent.data_models.citation import Citation
+from docent.data_models.judge import JudgeRunLabel
 from docent.data_models.regex import RegexSnippet
 from docent.data_models.transcript import Transcript, TranscriptGroup

 __all__ = [
     "AgentRun",
     "Citation",
+    "JudgeRunLabel",
     "RegexSnippet",
     "Transcript",
     "TranscriptGroup",
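With this hunk, `JudgeRunLabel` joins the package's public data-model exports. A quick, hedged import check, assuming docent-python 0.1.19a0 is installed:

```python
# Assumes docent-python 0.1.19a0 is installed; JudgeRunLabel is re-exported
# from docent.data_models alongside the existing models.
from docent.data_models import AgentRun, Citation, JudgeRunLabel

print(sorted(JudgeRunLabel.model_fields))  # ['agent_run_id', 'id', 'label', 'rubric_id']
```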
docent/data_models/agent_run.py

@@ -17,8 +17,8 @@ from pydantic_core import to_jsonable_python

 from docent._log_util import get_logger
 from docent.data_models._tiktoken_util import get_token_count, group_messages_into_ranges
+from docent.data_models.metadata_util import dump_metadata
 from docent.data_models.transcript import Transcript, TranscriptGroup
-from docent.data_models.yaml_util import yaml_dump_metadata

 logger = get_logger(__name__)

@@ -446,10 +446,10 @@ class AgentRun(BaseModel):
         text = _recurse("__global_root")

         # Append agent run metadata below the full content
-
-        if
+        metadata_text = dump_metadata(self.metadata)
+        if metadata_text is not None:
             if indent > 0:
-
-            text += f"\n<|agent run metadata|>\n{
+                metadata_text = textwrap.indent(metadata_text, " " * indent)
+            text += f"\n<|agent run metadata|>\n{metadata_text}\n</|agent run metadata|>"

         return text
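The YAML-based metadata dump is replaced by the JSON-based `dump_metadata` helper, and the rendered block is indented with `textwrap.indent`. A standalone sketch of that rendering path, with `json.dumps` standing in for docent's helper and the tag format taken from the hunk above:

```python
# Standalone sketch of the new metadata rendering; json.dumps stands in for
# docent's dump_metadata helper, and the tag format matches the diff above.
import json
import textwrap

def append_run_metadata(text: str, metadata: dict, indent: int = 0) -> str:
    metadata_text = json.dumps(metadata, indent=2).strip() if metadata else None
    if metadata_text is not None:
        if indent > 0:
            metadata_text = textwrap.indent(metadata_text, " " * indent)
        text += f"\n<|agent run metadata|>\n{metadata_text}\n</|agent run metadata|>"
    return text

print(append_run_metadata("...rendered transcripts...", {"task": "demo"}, indent=2))
```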
docent/data_models/chat/__init__.py

@@ -7,7 +7,12 @@ from docent.data_models.chat.message import (
     UserMessage,
     parse_chat_message,
 )
-from docent.data_models.chat.tool import
+from docent.data_models.chat.tool import (
+    ToolCall,
+    ToolCallContent,
+    ToolInfo,
+    ToolParams,
+)

 __all__ = [
     "ChatMessage",
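The single-line tool import is split into a parenthesized multi-line import. A minimal, hedged import check (the subpackage's full `__all__` is not shown in this diff, so only direct-name imports are relied on here):

```python
# Hedged sketch: these names are imported by docent/data_models/chat/__init__.py,
# so importing them from the subpackage should work once the package is installed.
from docent.data_models.chat import ToolCall, ToolCallContent, ToolInfo, ToolParams

print(ToolCall, ToolCallContent, ToolInfo, ToolParams)
```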
docent/data_models/citation.py

@@ -1,15 +1,27 @@
 import re
+from dataclasses import dataclass

 from pydantic import BaseModel


+@dataclass
+class ParsedCitation:
+    """Represents a parsed citation before conversion to full Citation object."""
+
+    transcript_idx: int | None
+    block_idx: int | None
+    metadata_key: str | None = None
+    start_pattern: str | None = None
+
+
 class Citation(BaseModel):
     start_idx: int
     end_idx: int
     agent_run_idx: int | None = None
     transcript_idx: int | None = None
-    block_idx: int
+    block_idx: int | None = None
     action_unit_idx: int | None = None
+    metadata_key: str | None = None
     start_pattern: str | None = None


@@ -17,6 +29,9 @@ RANGE_BEGIN = "<RANGE>"
 RANGE_END = "</RANGE>"

 _SINGLE_RE = re.compile(r"T(\d+)B(\d+)")
+_METADATA_RE = re.compile(r"^M\.([^:]+)$")  # [M.key]
+_TRANSCRIPT_METADATA_RE = re.compile(r"^T(\d+)M\.([^:]+)$")  # [T0M.key]
+_MESSAGE_METADATA_RE = re.compile(r"^T(\d+)B(\d+)M\.([^:]+)$")  # [T0B1M.key]
 _RANGE_CONTENT_RE = re.compile(r":\s*" + re.escape(RANGE_BEGIN) + r".*?" + re.escape(RANGE_END))


@@ -70,41 +85,93 @@ def scan_brackets(text: str) -> list[tuple[int, int, str]]:
     return matches


-def parse_single_citation(part: str) ->
+def parse_single_citation(part: str) -> ParsedCitation | None:
     """
     Parse a single citation token inside a bracket and return its components.

-    Returns
+    Returns ParsedCitation or None if invalid.
+    For metadata citations, transcript_idx may be None (for agent run metadata).
+    Supports optional text range for all valid citation kinds.
     """
     token = part.strip()
     if not token:
         return None

+    # Extract optional range part
+    start_pattern: str | None = None
+    citation_part = token
     if ":" in token:
-
-
-
+        left, right = token.split(":", 1)
+        citation_part = left.strip()
+        start_pattern = _extract_range_pattern(right)
+
+    # Try matches in order of specificity
+    # 1) Message metadata [T0B0M.key]
+    m = _MESSAGE_METADATA_RE.match(citation_part)
+    if m:
+        transcript_idx = int(m.group(1))
+        block_idx = int(m.group(2))
+        metadata_key = m.group(3)
+        # Disallow nested keys like status.code per instruction
+        if "." in metadata_key:
             return None
-
-
-
-
-
-
-
+        return ParsedCitation(
+            transcript_idx=transcript_idx,
+            block_idx=block_idx,
+            metadata_key=metadata_key,
+            start_pattern=start_pattern,
+        )
+
+    # 2) Transcript metadata [T0M.key]
+    m = _TRANSCRIPT_METADATA_RE.match(citation_part)
+    if m:
+        transcript_idx = int(m.group(1))
+        metadata_key = m.group(2)
+        if "." in metadata_key:
             return None
-
-
-
+        return ParsedCitation(
+            transcript_idx=transcript_idx,
+            block_idx=None,
+            metadata_key=metadata_key,
+            start_pattern=start_pattern,
+        )
+
+    # 3) Agent run metadata [M.key]
+    m = _METADATA_RE.match(citation_part)
+    if m:
+        metadata_key = m.group(1)
+        if "." in metadata_key:
+            return None
+        return ParsedCitation(
+            transcript_idx=None,
+            block_idx=None,
+            metadata_key=metadata_key,
+            start_pattern=start_pattern,
+        )
+
+    # 4) Regular transcript block [T0B0]
+    m = _SINGLE_RE.match(citation_part)
+    if m:
+        transcript_idx = int(m.group(1))
+        block_idx = int(m.group(2))
+        return ParsedCitation(
+            transcript_idx=transcript_idx, block_idx=block_idx, start_pattern=start_pattern
+        )
+
+    return None


 def parse_citations(text: str) -> tuple[str, list[Citation]]:
     """
-    Parse citations from text in the format described by
+    Parse citations from text in the format described by TEXT_RANGE_CITE_INSTRUCTION.

     Supported formats:
     - Single block: [T<key>B<idx>]
     - Text range with start pattern: [T<key>B<idx>:<RANGE>start_pattern</RANGE>]
+    - Agent run metadata: [M.key]
+    - Transcript metadata: [T<key>M.key]
+    - Message metadata: [T<key>B<idx>M.key]
+    - Message metadata with text range: [T<key>B<idx>M.key:<RANGE>start_pattern</RANGE>]

     Args:
         text: The text to parse citations from

@@ -127,8 +194,21 @@ def parse_citations(text: str) -> tuple[str, list[Citation]]:
         # Parse a single citation token inside the bracket
         parsed = parse_single_citation(bracket_content)
         if parsed:
-
-
+            # Create appropriate replacement text based on citation type
+            if parsed.metadata_key:
+                if parsed.transcript_idx is None:
+                    # Agent run metadata [M.key]
+                    replacement = "run metadata"
+                elif parsed.block_idx is None:
+                    # Transcript metadata [T0M.key]
+                    replacement = f"T{parsed.transcript_idx}"
+                else:
+                    # Message metadata [T0B1M.key]
+                    replacement = f"T{parsed.transcript_idx}B{parsed.block_idx}"
+            else:
+                # Regular transcript block [T0B1]
+                replacement = f"T{parsed.transcript_idx}B{parsed.block_idx}"
+
             # Current absolute start position for this replacement in the cleaned text
             start_idx = len(cleaned_text)
             end_idx = start_idx + len(replacement)

@@ -137,10 +217,11 @@ def parse_citations(text: str) -> tuple[str, list[Citation]]:
                     start_idx=start_idx,
                     end_idx=end_idx,
                     agent_run_idx=None,
-                    transcript_idx=transcript_idx,
-                    block_idx=block_idx,
+                    transcript_idx=parsed.transcript_idx,
+                    block_idx=parsed.block_idx,
                     action_unit_idx=None,
-
+                    metadata_key=parsed.metadata_key,
+                    start_pattern=parsed.start_pattern,
                 )
             )
             cleaned_text += replacement
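The three new regexes distinguish run-, transcript-, and message-level metadata citations from plain block citations. A standalone sketch of how those patterns classify tokens (regexes copied from the hunk above; `ParsedCitation` itself lives in docent/data_models/citation.py):

```python
# Standalone sketch of the new metadata citation patterns; tokens are illustrative.
import re

_METADATA_RE = re.compile(r"^M\.([^:]+)$")                      # [M.key]
_TRANSCRIPT_METADATA_RE = re.compile(r"^T(\d+)M\.([^:]+)$")     # [T0M.key]
_MESSAGE_METADATA_RE = re.compile(r"^T(\d+)B(\d+)M\.([^:]+)$")  # [T0B1M.key]

for token in ["M.task_description", "T0M.start_time", "T0B1M.status"]:
    kind = (
        "message metadata" if _MESSAGE_METADATA_RE.match(token)
        else "transcript metadata" if _TRANSCRIPT_METADATA_RE.match(token)
        else "run metadata" if _METADATA_RE.match(token)
        else "unknown"
    )
    print(token, "->", kind)
```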
docent/data_models/judge.py (new file)

@@ -0,0 +1,16 @@
+"""Judge-related data models shared across Docent components."""
+
+from typing import Any
+from uuid import uuid4
+
+from pydantic import BaseModel, Field
+
+
+class JudgeRunLabel(BaseModel):
+    id: str = Field(default_factory=lambda: str(uuid4()))
+    agent_run_id: str
+    rubric_id: str
+    label: dict[str, Any]
+
+
+__all__ = ["JudgeRunLabel"]
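A hedged usage sketch of the new model, with placeholder IDs and an illustrative label payload (the real payload must follow the rubric's output schema):

```python
# Fields as defined in the new file above; the IDs here are placeholders.
from docent.data_models.judge import JudgeRunLabel

label = JudgeRunLabel(
    agent_run_id="run-123",
    rubric_id="rubric-456",
    label={"correct": True, "notes": "matches the rubric schema"},
)
print(label.id)  # auto-generated uuid4 string
```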
docent/data_models/metadata_util.py (new file)

@@ -0,0 +1,16 @@
+import json
+from typing import Any
+
+from pydantic_core import to_jsonable_python
+
+
+def dump_metadata(metadata: dict[str, Any]) -> str | None:
+    """
+    Dump metadata to a JSON string.
+    We used to use YAML to save tokens, but JSON makes it easier to find cited ranges on the frontend because the frontend uses JSON.
+    """
+    if not metadata:
+        return None
+    metadata_obj = to_jsonable_python(metadata)
+    text = json.dumps(metadata_obj, indent=2)
+    return text.strip()
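A quick sketch of the helper's contract as defined above: empty metadata yields `None`, otherwise an indented JSON string (example values are made up):

```python
# Assumes docent-python 0.1.19a0 is installed.
from docent.data_models.metadata_util import dump_metadata

assert dump_metadata({}) is None
print(dump_metadata({"model": "gpt-4o", "epoch": 1}))
# {
#   "model": "gpt-4o",
#   "epoch": 1
# }
```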
docent/data_models/remove_invalid_citation_ranges.py

@@ -1,3 +1,4 @@
+import json
 import re

 from docent.data_models.agent_run import AgentRun

@@ -52,7 +53,7 @@ def find_citation_matches_in_text(text: str, start_pattern: str) -> list[tuple[i

 def get_transcript_text_for_citation(agent_run: AgentRun, citation: Citation) -> str | None:
     """
-    Get the text content of a specific transcript block from an AgentRun,
+    Get the text content of a specific transcript block (or transcript/run metadata) from an AgentRun,
     using the same formatting as shown to LLMs via format_chat_message.

     Args:

@@ -62,19 +63,28 @@ def get_transcript_text_for_citation(agent_run: AgentRun, citation: Citation) ->
     Returns:
         Text content of the specified block (including tool calls), or None if not found
     """
-    if citation.transcript_idx is None:
-        return None
-
     try:
-        if citation.transcript_idx
+        if citation.transcript_idx is None:
+            # At the run level, can only cite metadata
+            if citation.metadata_key is not None:
+                return json.dumps(agent_run.metadata.get(citation.metadata_key))
             return None
+
         transcript_id = agent_run.get_transcript_ids_ordered()[citation.transcript_idx]
         transcript = agent_run.transcript_dict[transcript_id]

-        if citation.block_idx
+        if citation.block_idx is None:
+            # At the transcript level, can only cite metadata
+            if citation.metadata_key is not None:
+                return json.dumps(transcript.metadata.get(citation.metadata_key))
             return None
+
         message = transcript.messages[citation.block_idx]

+        # At the message level, can cite metadata or content
+        if citation.metadata_key is not None:
+            return json.dumps(message.metadata.get(citation.metadata_key))
+
         # Use the same formatting function that generates content for LLMs
         # This ensures consistent formatting between citation validation and LLM serialization
         return format_chat_message(

@@ -99,6 +109,9 @@ def validate_citation_text_range(agent_run: AgentRun, citation: Citation) -> boo
     if not citation.start_pattern:
         # Nothing to validate
         return True
+    if citation.metadata_key is not None:
+        # We don't need to remove invalid metadata citation ranges
+        return True

     text = get_transcript_text_for_citation(agent_run, citation)
     if text is None:

@@ -130,16 +143,16 @@ def remove_invalid_citation_ranges(text: str, agent_run: AgentRun) -> str:
         # Parse this bracket content to get citation info
         parsed = parse_single_citation(bracket_content)
         if parsed:
-            transcript_idx, block_idx, start_pattern = parsed
             # The citation spans from start to end in the original text
             citation = Citation(
                 start_idx=start,
                 end_idx=end,
                 agent_run_idx=None,
-                transcript_idx=transcript_idx,
-                block_idx=block_idx,
+                transcript_idx=parsed.transcript_idx,
+                block_idx=parsed.block_idx,
                 action_unit_idx=None,
-
+                metadata_key=parsed.metadata_key,
+                start_pattern=parsed.start_pattern,
             )
             citations.append(citation)

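For metadata citations, the rewritten lookup returns the cited value serialized with `json.dumps`, and range validation is skipped. A small standalone sketch of that lookup behavior (the dictionary and keys are made up for illustration):

```python
# Mirrors the json.dumps(metadata.get(key)) lookups added above.
import json

run_metadata = {"task_description": "Book a flight", "benchmark": "tau-bench"}

def cited_metadata_text(metadata: dict, key: str) -> str:
    # Missing keys serialize to the JSON literal null, i.e. json.dumps(None).
    return json.dumps(metadata.get(key))

print(cited_metadata_text(run_metadata, "task_description"))  # "Book a flight"
print(cited_metadata_text(run_metadata, "missing_key"))       # null
```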
docent/data_models/transcript.py

@@ -15,7 +15,7 @@ from docent.data_models._tiktoken_util import (
 )
 from docent.data_models.chat import AssistantMessage, ChatMessage, ContentReasoning
 from docent.data_models.citation import RANGE_BEGIN, RANGE_END
-from docent.data_models.
+from docent.data_models.metadata_util import dump_metadata

 # Template for formatting individual transcript blocks
 TRANSCRIPT_BLOCK_TEMPLATE = """

@@ -29,6 +29,12 @@ TEXT_RANGE_CITE_INSTRUCTION = f"""Anytime you quote the transcript, or refer to

 A citation may include a specific range of text within a block. Use {RANGE_BEGIN} and {RANGE_END} to mark the specific range of text. Add it after the block ID separated by a colon. For example, to cite the part of transcript 0, block 1, where the agent says "I understand the task", write [T0B1:{RANGE_BEGIN}I understand the task{RANGE_END}]. Citations must follow this exact format. The markers {RANGE_BEGIN} and {RANGE_END} must be used ONLY inside the brackets of a citation.

+- You may cite a top-level key in the agent run metadata like this: [M.task_description].
+- You may cite a top-level key in transcript metadata. For example, for transcript 0: [T0M.start_time].
+- You may cite a top-level key in message metadata for a block. For example, for transcript 0, block 1: [T0B1M.status].
+- You may not cite nested keys. For example, [T0B1M.status.code] is invalid.
+- Within a top-level metadata key you may cite a range of text that appears in the value. For example, [T0B1M.status:{RANGE_BEGIN}"running":false{RANGE_END}].
+
 Important notes:
 - You must include the full content of the text range {RANGE_BEGIN} and {RANGE_END}, EXACTLY as it appears in the transcript, word-for-word, including any markers or punctuation that appear in the middle of the text.
 - Citations must be as specific as possible. This means you should usually cite a specific text range within a block.

@@ -73,9 +79,9 @@ def format_chat_message(
         cur_content += f"\n<tool call>\n{tool_call.function}({args})\n</tool call>"

     if message.metadata:
-
-        if
-        cur_content += f"\n<|message metadata|>\n{
+        metadata_text = dump_metadata(message.metadata)
+        if metadata_text is not None:
+            cur_content += f"\n<|message metadata|>\n{metadata_text}\n</|message metadata|>"

     return TRANSCRIPT_BLOCK_TEMPLATE.format(
         index_label=index_label, role=message.role, content=cur_content

@@ -127,13 +133,11 @@ class TranscriptGroup(BaseModel):
             str: XML-like wrapped text including the group's metadata.
         """
         # Prepare YAML metadata
-
-        if
+        metadata_text = dump_metadata(self.metadata)
+        if metadata_text is not None:
             if indent > 0:
-
-            inner =
-                f"{children_text}\n<|{self.name} metadata|>\n{yaml_text}\n</|{self.name} metadata|>"
-            )
+                metadata_text = textwrap.indent(metadata_text, " " * indent)
+            inner = f"{children_text}\n<|{self.name} metadata|>\n{metadata_text}\n</|{self.name} metadata|>"
         else:
             inner = children_text

@@ -447,13 +451,11 @@ class Transcript(BaseModel):
         content_str = f"<|T{transcript_idx} blocks|>\n{blocks_str}\n</|T{transcript_idx} blocks|>"

         # Gather metadata and add to content
-
-        if
+        metadata_text = dump_metadata(self.metadata)
+        if metadata_text is not None:
             if indent > 0:
-
-            content_str +=
-                f"\n<|T{transcript_idx} metadata|>\n{yaml_text}\n</|T{transcript_idx} metadata|>"
-            )
+                metadata_text = textwrap.indent(metadata_text, " " * indent)
+            content_str += f"\n<|T{transcript_idx} metadata|>\n{metadata_text}\n</|T{transcript_idx} metadata|>"

         # Format content and return
         if indent > 0:
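The new instruction text above defines the metadata citation syntax. The strings below are illustrative tokens that the updated parser accepts or rejects, with the RANGE markers spelled out literally:

```python
# Illustrative citation tokens for the syntax documented in TEXT_RANGE_CITE_INSTRUCTION.
RANGE_BEGIN, RANGE_END = "<RANGE>", "</RANGE>"

valid_citations = [
    "[M.task_description]",  # top-level key in agent run metadata
    "[T0M.start_time]",      # top-level key in transcript 0's metadata
    "[T0B1M.status]",        # top-level key in message metadata of transcript 0, block 1
    f"[T0B1M.status:{RANGE_BEGIN}\"running\":false{RANGE_END}]",  # range within the value
]
invalid_citations = [
    "[T0B1M.status.code]",   # nested keys are rejected
]
print(valid_citations, invalid_citations)
```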
docent/sdk/agent_run_writer.py

@@ -4,11 +4,12 @@ import queue
 import signal
 import threading
 import time
-from typing import Any, Callable, Coroutine, Optional
+from typing import Any, AsyncGenerator, Callable, Coroutine, Optional

 import anyio
 import backoff
 import httpx
+import orjson
 from backoff.types import Details

 from docent._log_util.logger import get_logger

@@ -38,6 +39,15 @@ def _print_backoff_message(e: Details):
     )


+async def _generate_payload_chunks(runs: list[AgentRun]) -> AsyncGenerator[bytes, None]:
+    yield b'{"agent_runs": ['
+    for i, ar in enumerate(runs):
+        if i > 0:
+            yield b","
+        yield orjson.dumps(ar.model_dump(mode="json"))
+    yield b"]}"
+
+
 class AgentRunWriter:
     """Background thread for logging agent runs.

@@ -175,7 +185,7 @@ class AgentRunWriter:
             logger.info("Cancelling pending tasks...")
             self._cancel_event.set()
             n_pending = self._queue.qsize()
-            logger.info(f"Cancelled ~{n_pending} pending
+            logger.info(f"Cancelled ~{n_pending} pending runs")

             # Give a brief moment to exit
             logger.info("Waiting for thread to exit...")

@@ -194,8 +204,11 @@ class AgentRunWriter:
             on_backoff=_print_backoff_message,
         )
         async def _post_batch(batch: list[AgentRun]) -> None:
-
-
+            resp = await client.post(
+                self._endpoint,
+                content=_generate_payload_chunks(batch),
+                timeout=self._request_timeout,
+            )
             resp.raise_for_status()

         return _post_batch

@@ -246,7 +259,7 @@ def init(
     web_url: str = "https://docent.transluce.org",
     api_key: str | None = None,
     # Writer arguments
-    num_workers: int =
+    num_workers: int = 4,
     queue_maxsize: int = 20_000,
     request_timeout: float = 30.0,
     flush_interval: float = 1.0,
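The writer now streams the batch body instead of building one large JSON string: an async generator yields byte chunks that httpx sends as the request content. A hedged, self-contained sketch of the same pattern outside the SDK (the endpoint URL is a placeholder, and here each run is already a plain dict rather than an AgentRun model):

```python
# Sketch of streaming a chunked JSON payload with orjson + httpx; URL is a placeholder.
import asyncio
import httpx
import orjson

async def generate_payload_chunks(runs):
    yield b'{"agent_runs": ['
    for i, ar in enumerate(runs):
        if i > 0:
            yield b","
        yield orjson.dumps(ar)  # `ar` is a plain dict in this sketch
    yield b"]}"

async def post_runs(runs):
    async with httpx.AsyncClient() as client:
        resp = await client.post(
            "https://example.invalid/agent_runs",   # placeholder endpoint
            content=generate_payload_chunks(runs),  # async byte iterator
            timeout=30.0,
        )
        resp.raise_for_status()

# asyncio.run(post_runs([{"name": "demo", "transcripts": []}]))
```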
docent/sdk/client.py

@@ -8,6 +8,7 @@ from tqdm import tqdm

 from docent._log_util.logger import get_logger
 from docent.data_models.agent_run import AgentRun
+from docent.data_models.judge import JudgeRunLabel
 from docent.loaders import load_inspect

 logger = get_logger(__name__)

@@ -48,13 +49,18 @@ class Docent:

         self._login(api_key)

+    def _handle_response_errors(self, response: requests.Response):
+        """Handle API response and raise informative errors.
+        TODO: make this more informative."""
+        response.raise_for_status()
+
     def _login(self, api_key: str):
         """Login with email/password to establish session."""
         self._session.headers.update({"Authorization": f"Bearer {api_key}"})

         url = f"{self._server_url}/api-keys/test"
         response = self._session.get(url)
-
+        self._handle_response_errors(response)

         logger.info("Logged in with API key")
         return

@@ -90,7 +96,7 @@ class Docent:
         }

         response = self._session.post(url, json=payload)
-
+        self._handle_response_errors(response)

         response_data = response.json()
         collection_id = response_data.get("collection_id")

@@ -134,13 +140,13 @@ class Docent:
             payload = {"agent_runs": [ar.model_dump(mode="json") for ar in batch]}

             response = self._session.post(url, json=payload)
-
+            self._handle_response_errors(response)

             pbar.update(len(batch))

         url = f"{self._server_url}/{collection_id}/compute_embeddings"
         response = self._session.post(url)
-
+        self._handle_response_errors(response)

         logger.info(f"Successfully added {total_runs} agent runs to Collection '{collection_id}'")
         return {"status": "success", "total_runs_added": total_runs}

@@ -156,7 +162,7 @@ class Docent:
         """
         url = f"{self._server_url}/collections"
         response = self._session.get(url)
-
+        self._handle_response_errors(response)
         return response.json()

     def list_rubrics(self, collection_id: str) -> list[dict[str, Any]]:

@@ -173,7 +179,7 @@ class Docent:
         """
         url = f"{self._server_url}/rubric/{collection_id}/rubrics"
         response = self._session.get(url)
-
+        self._handle_response_errors(response)
         return response.json()

     def get_rubric_run_state(self, collection_id: str, rubric_id: str) -> dict[str, Any]:

@@ -191,7 +197,7 @@ class Docent:
         """
         url = f"{self._server_url}/rubric/{collection_id}/{rubric_id}/rubric_run_state"
         response = self._session.get(url)
-
+        self._handle_response_errors(response)
         return response.json()

     def get_clustering_state(self, collection_id: str, rubric_id: str) -> dict[str, Any]:

@@ -209,7 +215,7 @@ class Docent:
         """
         url = f"{self._server_url}/rubric/{collection_id}/{rubric_id}/clustering_job"
         response = self._session.get(url)
-
+        self._handle_response_errors(response)
         return response.json()

     def get_cluster_centroids(self, collection_id: str, rubric_id: str) -> list[dict[str, Any]]:

@@ -244,6 +250,90 @@ class Docent:
         clustering_state = self.get_clustering_state(collection_id, rubric_id)
         return clustering_state.get("assignments", {})

+    def add_label(
+        self,
+        collection_id: str,
+        rubric_id: str,
+        label: JudgeRunLabel,
+    ) -> dict[str, Any]:
+        """Attach a manual label to an agent run for a rubric.
+
+        Args:
+            collection_id: ID of the Collection that owns the rubric.
+            rubric_id: ID of the rubric the label applies to.
+            label: A `JudgeRunLabel` that must comply with the rubric's output schema.
+
+        Returns:
+            dict: API response containing a status message.
+
+        Raises:
+            ValueError: If the label does not target the rubric specified in the path.
+            requests.exceptions.HTTPError: If the API request fails or validation errors occur.
+        """
+        if label.rubric_id != rubric_id:
+            raise ValueError("Label rubric_id must match the rubric_id argument")
+
+        url = f"{self._server_url}/rubric/{collection_id}/rubric/{rubric_id}/label"
+        payload = {"label": label.model_dump(mode="json")}
+        response = self._session.post(url, json=payload)
+        self._handle_response_errors(response)
+        return response.json()
+
+    def add_labels(
+        self,
+        collection_id: str,
+        rubric_id: str,
+        labels: list[JudgeRunLabel],
+    ) -> dict[str, Any]:
+        """Attach multiple manual labels to a rubric.
+
+        Args:
+            collection_id: ID of the Collection that owns the rubric.
+            rubric_id: ID of the rubric the labels apply to.
+            labels: List of `JudgeRunLabel` objects.
+
+        Returns:
+            dict: API response containing status information.
+
+        Raises:
+            ValueError: If no labels are provided.
+            ValueError: If any label targets a different rubric.
+            requests.exceptions.HTTPError: If the API request fails.
+        """
+        if not labels:
+            raise ValueError("labels must contain at least one entry")
+
+        rubric_ids = {label.rubric_id for label in labels}
+        if rubric_ids != {rubric_id}:
+            raise ValueError(
+                "All labels must specify the same rubric_id that is provided to add_labels"
+            )
+
+        payload = {"labels": [l.model_dump(mode="json") for l in labels]}
+
+        url = f"{self._server_url}/rubric/{collection_id}/rubric/{rubric_id}/labels"
+        response = self._session.post(url, json=payload)
+        self._handle_response_errors(response)
+        return response.json()
+
+    def get_labels(self, collection_id: str, rubric_id: str) -> list[dict[str, Any]]:
+        """Retrieve all manual labels for a rubric.
+
+        Args:
+            collection_id: ID of the Collection that owns the rubric.
+            rubric_id: ID of the rubric to fetch labels for.
+
+        Returns:
+            list: List of label dictionaries. Each includes agent_run_id and label content.
+
+        Raises:
+            requests.exceptions.HTTPError: If the API request fails.
+        """
+        url = f"{self._server_url}/rubric/{collection_id}/rubric/{rubric_id}/labels"
+        response = self._session.get(url)
+        self._handle_response_errors(response)
+        return response.json()
+
     def get_agent_run(self, collection_id: str, agent_run_id: str) -> AgentRun | None:
         """Get a specific agent run by its ID.

@@ -259,7 +349,7 @@ class Docent:
         """
         url = f"{self._server_url}/{collection_id}/agent_run"
         response = self._session.get(url, params={"agent_run_id": agent_run_id})
-
+        self._handle_response_errors(response)
         if response.json() is None:
             return None
         else:

@@ -281,7 +371,7 @@ class Docent:
         """
         url = f"{self._server_url}/{collection_id}/make_public"
         response = self._session.post(url)
-
+        self._handle_response_errors(response)

         logger.info(f"Successfully made Collection '{collection_id}' public")
         return response.json()

@@ -303,13 +393,7 @@ class Docent:
         payload = {"email": email}
         response = self._session.post(url, json=payload)

-
-            response.raise_for_status()
-        except requests.exceptions.HTTPError:
-            if response.status_code == 404:
-                raise ValueError(f"The user you are trying to share with ({email}) does not exist.")
-            else:
-                raise  # Re-raise the original exception
+        self._handle_response_errors(response)

         logger.info(f"Successfully shared Collection '{collection_id}' with {email}")
         return response.json()

@@ -328,7 +412,7 @@ class Docent:
         """
         url = f"{self._server_url}/{collection_id}/agent_run_ids"
         response = self._session.get(url)
-
+        self._handle_response_errors(response)
         return response.json()

     def recursively_ingest_inspect_logs(self, collection_id: str, fpath: str):

@@ -393,7 +477,7 @@ class Docent:
                 payload = {"agent_runs": [ar.model_dump(mode="json") for ar in batch_list]}

                 response = self._session.post(url, json=payload)
-
+                self._handle_response_errors(response)

                 runs_from_file += len(batch_list)
                 file_pbar.update(len(batch_list))

@@ -406,7 +490,7 @@ class Docent:
            logger.info("Computing embeddings for added runs...")
            url = f"{self._server_url}/{collection_id}/compute_embeddings"
            response = self._session.post(url)
-
+            self._handle_response_errors(response)

         logger.info(
             f"Successfully ingested {total_runs_added} total agent runs from {len(eval_files)} files"
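The SDK now exposes the label endpoints directly on the client. A hedged usage sketch (all IDs are placeholders, and passing the API key as an `api_key` constructor argument is an assumption based on the `self._login(api_key)` call shown above):

```python
# Hedged sketch of the new label methods; IDs are placeholders.
from docent.sdk.client import Docent
from docent.data_models.judge import JudgeRunLabel

client = Docent(api_key="dk-...")  # assumption: api_key constructor argument
label = JudgeRunLabel(
    agent_run_id="run-123",
    rubric_id="rubric-456",
    label={"correct": True},
)
client.add_label("collection-789", "rubric-456", label)
print(client.get_labels("collection-789", "rubric-456"))
```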
docent/trace.py

@@ -21,7 +21,7 @@ from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExport
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as HTTPExporter
 from opentelemetry.instrumentation.threading import ThreadingInstrumentor
 from opentelemetry.sdk.resources import Resource
-from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor, TracerProvider
+from opentelemetry.sdk.trace import ReadableSpan, SpanLimits, SpanProcessor, TracerProvider
 from opentelemetry.sdk.trace.export import (
     BatchSpanProcessor,
     ConsoleSpanExporter,

@@ -29,20 +29,13 @@ from opentelemetry.sdk.trace.export import (
 )
 from opentelemetry.trace import Span

-# Configure logging
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.ERROR)

 # Default configuration
 DEFAULT_ENDPOINT = "https://api.docent.transluce.org/rest/telemetry"
 DEFAULT_COLLECTION_NAME = "default-collection-name"


-def _is_tracing_disabled() -> bool:
-    """Check if tracing is disabled via environment variable."""
-    return os.environ.get("DOCENT_DISABLE_TRACING", "").lower() == "true"
-
-
 class Instruments(Enum):
     """Enumeration of available instrument types."""

@@ -52,16 +45,10 @@ class Instruments(Enum):
     LANGCHAIN = "langchain"


-def _is_notebook() -> bool:
-    """Check if we're running in a Jupyter notebook."""
-    try:
-        return "ipykernel" in sys.modules
-    except Exception:
-        return False
-
-
 class DocentTracer:
-    """
+    """
+    Manages Docent tracing setup and provides tracing utilities.
+    """

     def __init__(
         self,

@@ -77,22 +64,6 @@ class DocentTracer:
         instruments: Optional[Set[Instruments]] = None,
         block_instruments: Optional[Set[Instruments]] = None,
     ):
-        """
-        Initialize Docent tracing manager.
-
-        Args:
-            collection_name: Name of the collection for resource attributes
-            collection_id: Optional collection ID (auto-generated if not provided)
-            agent_run_id: Optional agent_run_id to use for code outside of an agent run context (auto-generated if not provided)
-            endpoint: OTLP endpoint URL(s) - can be a single string or list of strings for multiple endpoints
-            headers: Optional headers for authentication
-            api_key: Optional API key for bearer token authentication (takes precedence over env var)
-            enable_console_export: Whether to export to console
-            enable_otlp_export: Whether to export to OTLP endpoint
-            disable_batch: Whether to disable batch processing (use SimpleSpanProcessor)
-            instruments: Set of instruments to enable (None = all instruments)
-            block_instruments: Set of instruments to explicitly disable
-        """
         self._initialized: bool = False
         # Check if tracing is disabled via environment variable
         if _is_tracing_disabled():

@@ -163,8 +134,12 @@ class DocentTracer:
         """
         Get the current agent run ID from context.

+        Retrieves the agent run ID that was set in the current execution context.
+        If no agent run context is active, returns the default agent run ID.
+
         Returns:
-            The current agent run ID if available,
+            The current agent run ID if available, or the default agent run ID
+            if no context is active.
         """
         try:
             return self._agent_run_id_var.get()

@@ -249,12 +224,23 @@ class DocentTracer:
             return

         try:
+
+            # Check for OTEL_SPAN_ATTRIBUTE_COUNT_LIMIT environment variable
+            default_attribute_limit = 1024
+            env_value = os.environ.get("OTEL_SPAN_ATTRIBUTE_COUNT_LIMIT", "0")
+            env_limit = int(env_value) if env_value.isdigit() else 0
+            attribute_limit = max(env_limit, default_attribute_limit)
+
+            span_limits = SpanLimits(
+                max_attributes=attribute_limit,
+            )
+
             # Create our own isolated tracer provider
             self._tracer_provider = TracerProvider(
-                resource=Resource.create({"service.name": self.collection_name})
+                resource=Resource.create({"service.name": self.collection_name}),
+                span_limits=span_limits,
             )

-            # Add custom span processor for agent_run_id and transcript_id
             class ContextSpanProcessor(SpanProcessor):
                 def __init__(self, manager: "DocentTracer"):
                     self.manager: "DocentTracer" = manager

@@ -312,11 +298,7 @@ class DocentTracer:
                     )

                 def on_end(self, span: ReadableSpan) -> None:
-
-                    span_attrs = span.attributes or {}
-                    logger.debug(
-                        f"Completed span: name='{span.name}', collection_id={span_attrs.get('collection_id')}, agent_run_id={span_attrs.get('agent_run_id')}, transcript_id={span_attrs.get('transcript_id')}, duration_ns={span.end_time - span.start_time if span.end_time and span.start_time else 'unknown'}"
-                    )
+                    pass

                 def shutdown(self) -> None:
                     pass

@@ -422,7 +404,17 @@ class DocentTracer:
             raise

     def cleanup(self):
-        """
+        """
+        Clean up Docent tracing resources.
+
+        Flushes all pending spans to exporters and shuts down the tracer provider.
+        This method is automatically called during application shutdown via atexit
+        handlers, but can also be called manually for explicit cleanup.
+
+        The cleanup process:
+        1. Flushes all span processors to ensure data is exported
+        2. Shuts down the tracer provider and releases resources
+        """
         if self._disabled:
             return

@@ -473,7 +465,7 @@ class DocentTracer:
         if disabled and self._initialized:
             self.cleanup()

-    def
+    def is_initialized(self) -> bool:
         """Verify if the manager is properly initialized."""
         return self._initialized

@@ -1063,8 +1055,9 @@ def initialize_tracing(
         collection_id: Optional collection ID (auto-generated if not provided)
         endpoint: OTLP endpoint URL(s) for span export - can be a single string or list of strings for multiple endpoints
         headers: Optional headers for authentication
-        api_key: Optional API key for bearer token authentication (takes precedence
-
+        api_key: Optional API key for bearer token authentication (takes precedence
+            over DOCENT_API_KEY environment variable)
+        enable_console_export: Whether to export spans to console for debugging
         enable_otlp_export: Whether to export spans to OTLP endpoint
         disable_batch: Whether to disable batch processing (use SimpleSpanProcessor)
         instruments: Set of instruments to enable (None = all instruments).

@@ -1074,7 +1067,6 @@ def initialize_tracing(
         The initialized Docent tracer

     Example:
-        # Basic setup
         initialize_tracing("my-collection")
     """

@@ -1137,17 +1129,17 @@ def close_tracing() -> None:
 def flush_tracing() -> None:
     """Force flush all spans to exporters."""
     if _global_tracer:
-        logger.debug("Flushing
+        logger.debug("Flushing Docent tracer")
         _global_tracer.flush()
     else:
         logger.debug("No global tracer available to flush")


-def
+def is_initialized() -> bool:
     """Verify if the global Docent tracer is properly initialized."""
     if _global_tracer is None:
         return False
-    return _global_tracer.
+    return _global_tracer.is_initialized()


 def is_disabled() -> bool:

@@ -1764,3 +1756,16 @@ def transcript_group_context(
     return TranscriptGroupContext(
         name, transcript_group_id, description, metadata, parent_transcript_group_id
     )
+
+
+def _is_tracing_disabled() -> bool:
+    """Check if tracing is disabled via environment variable."""
+    return os.environ.get("DOCENT_DISABLE_TRACING", "").lower() == "true"
+
+
+def _is_notebook() -> bool:
+    """Check if we're running in a Jupyter notebook."""
+    try:
+        return "ipykernel" in sys.modules
+    except Exception:
+        return False
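The tracer provider now gets an explicit span attribute limit, taking the larger of `OTEL_SPAN_ATTRIBUTE_COUNT_LIMIT` and a default of 1024. A standalone sketch of that logic (it mirrors the hunk above and needs only the opentelemetry-sdk package; the service name is illustrative):

```python
# Mirrors the new attribute-limit handling shown in the diff above.
import os
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import SpanLimits, TracerProvider

default_attribute_limit = 1024
env_value = os.environ.get("OTEL_SPAN_ATTRIBUTE_COUNT_LIMIT", "0")
env_limit = int(env_value) if env_value.isdigit() else 0
attribute_limit = max(env_limit, default_attribute_limit)

provider = TracerProvider(
    resource=Resource.create({"service.name": "my-collection"}),  # illustrative name
    span_limits=SpanLimits(max_attributes=attribute_limit),
)
print(attribute_limit)
```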
docent/data_models/metadata.py (deleted)

@@ -1,229 +0,0 @@
-# import traceback
-# from typing import Any, Optional
-
-# from pydantic import (
-#     BaseModel,
-#     ConfigDict,
-#     Field,
-#     PrivateAttr,
-#     SerializerFunctionWrapHandler,
-#     model_serializer,
-#     model_validator,
-# )
-
-# from docent._log_util import get_logger
-
-# logger = get_logger(__name__)
-
-# SINGLETONS = (int, float, str, bool)
-
-
-# class BaseMetadata(BaseModel):
-#     """Provides common functionality for accessing and validating metadata fields.
-#     All metadata classes should inherit from this class.
-
-#     Serialization Behavior:
-#     - Field descriptions are highly recommended and stored in serialized versions of the object.
-#     - When a subclass of BaseMetadata is uploaded to a server, all extra fields and their descriptions are retained.
-#     - To recover the original structure with proper typing upon download, use:
-#       `CustomMetadataClass.model_validate(obj.model_dump())`.
-
-#     Attributes:
-#         model_config: Pydantic configuration that allows extra fields.
-#         allow_fields_without_descriptions: Boolean indicating whether to allow fields without descriptions.
-#     """
-
-#     model_config = ConfigDict(extra="allow")
-#     allow_fields_without_descriptions: bool = True
-
-#     # Private attribute to store field descriptions
-#     _field_descriptions: dict[str, str | None] | None = PrivateAttr(default=None)
-#     _internal_basemetadata_fields: set[str] = PrivateAttr(
-#         default={
-#             "allow_fields_without_descriptions",
-#             "model_config",
-#             "_field_descriptions",
-#         }
-#     )
-
-#     @model_validator(mode="after")
-#     def _validate_field_types_and_descriptions(self):
-#         """Validates that all fields have descriptions and proper types.
-
-#         Returns:
-#             Self: The validated model instance.
-
-#         Raises:
-#             ValueError: If any field is missing a description or has an invalid type.
-#         """
-#         # Validate each field in the model
-#         for field_name, field_info in self.__class__.model_fields.items():
-#             if field_name in self._internal_basemetadata_fields:
-#                 continue
-
-#             # Check that field has a description
-#             if field_info.description is None:
-#                 if not self.allow_fields_without_descriptions:
-#                     raise ValueError(
-#                         f"Field `{field_name}` needs a description in the definition of `{self.__class__.__name__}`, like `{field_name}: T = Field(description=..., default=...)`. "
-#                         "To allow un-described fields, set `allow_fields_without_descriptions = True` on the instance or in your metadata class definition."
-#                     )
-
-#         # Validate that the metadata is JSON serializable
-#         try:
-#             self.model_dump_json()
-#         except Exception as e:
-#             raise ValueError(
-#                 f"Metadata is not JSON serializable: {e}. Traceback: {traceback.format_exc()}"
-#             )
-
-#         return self
-
-#     def model_post_init(self, __context: Any) -> None:
-#         """Initializes field descriptions from extra data after model initialization.
-
-#         Args:
-#             __context: The context provided by Pydantic's post-initialization hook.
-#         """
-#         fd = self.model_extra.pop("_field_descriptions", None) if self.model_extra else None
-#         if fd is not None:
-#             self._field_descriptions = fd
-
-#     @model_serializer(mode="wrap")
-#     def _serialize_model(self, handler: SerializerFunctionWrapHandler):
-#         # Call the default serializer
-#         data = handler(self)
-
-#         # Dump the field descriptions
-#         if self._field_descriptions is None:
-#             self._field_descriptions = self._compute_field_descriptions()
-#         data["_field_descriptions"] = self._field_descriptions
-
-#         return data
-
-#     def model_dump(
-#         self, *args: Any, strip_internal_fields: bool = False, **kwargs: Any
-#     ) -> dict[str, Any]:
-#         data = super().model_dump(*args, **kwargs)
-
-#         # Remove internal fields if requested
-#         if strip_internal_fields:
-#             for field in self._internal_basemetadata_fields:
-#                 if field in data:
-#                     data.pop(field)
-
-#         return data
-
-#     def get(self, key: str, default_value: Any = None) -> Any:
-#         """Gets a value from the metadata by key.
-
-#         Args:
-#             key: The key to look up in the metadata.
-#             default_value: Value to return if the key is not found. Defaults to None.
-
-#         Returns:
-#             Any: The value associated with the key, or the default value if not found.
-#         """
-#         # Check if the field exists in the model's fields
-#         if key in self.__class__.model_fields or (
-#             self.model_extra is not None and key in self.model_extra
-#         ):
-#             # Field exists, return its value (even if None)
-#             return getattr(self, key)
-
-#         logger.warning(f"Field '{key}' not found in {self.__class__.__name__}")
-#         return default_value
-
-#     def get_field_description(self, field_name: str) -> str | None:
-#         """Gets the description of a field defined in the model schema.
-
-#         Args:
-#             field_name: The name of the field.
-
-#         Returns:
-#             str or None: The description string if the field is defined in the model schema
-#             and has a description, otherwise None.
-#         """
-#         if self._field_descriptions is None:
-#             self._field_descriptions = self._compute_field_descriptions()
-
-#         if field_name in self._field_descriptions:
-#             return self._field_descriptions[field_name]
-
-#         logger.warning(
-#             f"Field description for '{field_name}' not found in {self.__class__.__name__}"
-#         )
-#         return None
-
-#     def get_all_field_descriptions(self) -> dict[str, str | None]:
-#         """Gets descriptions for all fields defined in the model schema.
-
-#         Returns:
-#             dict: A dictionary mapping field names to their descriptions.
-#             Only includes fields that have descriptions defined in the schema.
-#         """
-#         if self._field_descriptions is None:
-#             self._field_descriptions = self._compute_field_descriptions()
-#         return self._field_descriptions
-
-#     def _compute_field_descriptions(self) -> dict[str, str | None]:
-#         """Computes descriptions for all fields in the model.
-
-#         Returns:
-#             dict: A dictionary mapping field names to their descriptions.
-#         """
-#         field_descriptions: dict[str, Optional[str]] = {}
-#         for field_name, field_info in self.__class__.model_fields.items():
-#             if field_name not in self._internal_basemetadata_fields:
-#                 field_descriptions[field_name] = field_info.description
-#         return field_descriptions
-
-
-# class BaseAgentRunMetadata(BaseMetadata):
-#     """Extends BaseMetadata with fields specific to agent evaluation runs.
-
-#     Attributes:
-#         scores: Dictionary of evaluation metrics.
-#     """
-
-#     scores: dict[str, int | float | bool | None] = Field(
-#         description="A dict of score_key -> score_value. Use one key for each metric you're tracking."
-#     )
-
-
-# class InspectAgentRunMetadata(BaseAgentRunMetadata):
-#     """Extends BaseAgentRunMetadata with fields specific to Inspect runs.
-
-#     Attributes:
-#         task_id: The ID of the 'benchmark' or 'set of evals' that the transcript belongs to
-#         sample_id: The specific task inside of the `task_id` benchmark that the transcript was run on
-#         epoch_id: Each `sample_id` should be run multiple times due to stochasticity; `epoch_id` is the integer index of a specific run.
-#         model: The model that was used to generate the transcript
-#         scoring_metadata: Additional metadata about the scoring process
-#         additional_metadata: Additional metadata about the transcript
-#     """
-
-#     task_id: str = Field(
-#         description="The ID of the 'benchmark' or 'set of evals' that the transcript belongs to"
-#     )
-
-#     # Identification of this particular run
-#     sample_id: str = Field(
-#         description="The specific task inside of the `task_id` benchmark that the transcript was run on"
-#     )
-#     epoch_id: int = Field(
-#         description="Each `sample_id` should be run multiple times due to stochasticity; `epoch_id` is the integer index of a specific run."
-#     )
-
-#     # Parameters for the run
-#     model: str = Field(description="The model that was used to generate the transcript")
-
-#     # Scoring
-#     scoring_metadata: dict[str, Any] | None = Field(
-#         description="Additional metadata about the scoring process"
-#     )
-
-#     # Inspect metadata
-#     additional_metadata: dict[str, Any] | None = Field(
-#         description="Additional metadata about the transcript"
-#     )
docent/data_models/yaml_util.py (deleted)

@@ -1,12 +0,0 @@
-from typing import Any
-
-import yaml
-from pydantic_core import to_jsonable_python
-
-
-def yaml_dump_metadata(metadata: dict[str, Any]) -> str | None:
-    if not metadata:
-        return None
-    metadata_obj = to_jsonable_python(metadata)
-    yaml_text = yaml.dump(metadata_obj, width=float("inf"))
-    return yaml_text.strip()
The remaining files listed above with +0 -0 are unchanged between the two versions.