docent-python 0.1.49a0__tar.gz → 0.1.51a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/PKG-INFO +1 -1
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/chat/content.py +6 -0
- docent_python-0.1.51a0/docent/data_models/feedback.py +369 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/transcript.py +1 -1
- docent_python-0.1.51a0/docent/judges/util/voting.py +351 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/mcp/server.py +1 -1
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/sdk/client.py +527 -79
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/trace.py +43 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/pyproject.toml +1 -1
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/uv.lock +64 -67
- docent_python-0.1.49a0/docent/judges/util/voting.py +0 -140
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/.gitignore +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/LICENSE.md +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/README.md +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/data_models/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/data_models/exceptions.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/data_models/llm_output.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/llm_cache.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/llm_svc.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/model_registry.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/providers/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/providers/anthropic.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/providers/common.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/providers/google.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/providers/openai.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/providers/openrouter.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/providers/preference_types.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_llm_util/providers/provider_registry.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_log_util/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/_log_util/logger.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/_tiktoken_util.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/agent_run.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/chat/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/chat/message.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/chat/response_format.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/chat/tool.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/citation.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/formatted_objects.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/judge.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/metadata_util.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/regex.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/data_models/util.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/analysis.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/impl.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/runner.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/stats.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/types.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/util/forgiving_json.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/util/meta_schema.json +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/util/meta_schema.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/util/parse_output.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/judges/util/template_formatter.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/loaders/load_inspect.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/mcp/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/mcp/__main__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/py.typed +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/samples/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/samples/load.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/samples/log.eval +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/samples/tb_airline.json +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/sdk/__init__.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/sdk/agent_run_writer.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/sdk/llm_context.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/sdk/llm_request.py +0 -0
- {docent_python-0.1.49a0 → docent_python-0.1.51a0}/docent/trace_temp.py +0 -0
docent/data_models/chat/content.py

```diff
@@ -39,15 +39,21 @@ class ContentReasoning(BaseContent):
     Attributes:
         type: Fixed as "reasoning" to identify this content type.
         reasoning: The actual reasoning text.
+        summary: Optional human-readable reasoning summary.
         signature: Optional signature associated with the reasoning.
        redacted: Flag indicating if the reasoning has been redacted.
     """

     type: Literal["reasoning"] = "reasoning"  # type: ignore
     reasoning: str
+    summary: str | None = None
     signature: str | None = None
     redacted: bool = False

+    @property
+    def display_reasoning(self) -> str:
+        return self.summary if self.redacted and self.summary else self.reasoning
+

 # Content type discriminated union
 Content = Annotated[ContentText | ContentReasoning, Discriminator("type")]
```
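The new `summary` field and `display_reasoning` property let renderers fall back to a human-readable summary when the underlying reasoning is redacted. A minimal sketch of the behavior (constructor values are illustrative; the import path mirrors the changed file):

```python
from docent.data_models.chat.content import ContentReasoning

# Redacted reasoning with a summary: display_reasoning returns the summary.
redacted = ContentReasoning(
    reasoning="[REDACTED]",
    summary="The model compared two candidate plans before answering.",
    redacted=True,
)
assert redacted.display_reasoning == "The model compared two candidate plans before answering."

# Non-redacted reasoning: the full text is returned even when a summary exists.
visible = ContentReasoning(reasoning="Step 1: ...", summary="Short summary.")
assert visible.display_reasoning == "Step 1: ..."
```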
docent/data_models/feedback.py (new file)

```diff
@@ -0,0 +1,369 @@
+"""Data structures for run-centric feedback elicitation and user context inference."""
+
+import json
+from collections.abc import Iterator
+from datetime import datetime
+from typing import Any, Literal
+
+from pydantic import BaseModel, Field, model_validator
+
+from docent.data_models.citation import InlineCitation
+from docent.judges.util.voting import OutputDistribution
+
+
+def _stable_json(value: Any) -> str:
+    return json.dumps(value, sort_keys=True)
+
+
+def _indent_lines(lines: list[str], indent: int) -> list[str]:
+    prefix = " " * max(0, indent)
+    return [f"{prefix}{line}" if line else "" for line in lines]
+
+
+def _tag_block(tag: str, body_lines: list[str], indent: int) -> list[str]:
+    lines = [f"<{tag}>"]
+    if body_lines:
+        lines.extend(_indent_lines(body_lines, indent))
+    else:
+        lines.extend(_indent_lines(["N/A"], indent))
+    lines.append(f"</{tag}>")
+    return lines
+
+
+def _text_or_na(text: str | None) -> str:
+    if text is None:
+        return "N/A"
+    stripped = text.strip()
+    return stripped if stripped else "N/A"
+
+
+def _text_lines_or_na(text: str | None) -> list[str]:
+    return _text_or_na(text).splitlines()
+
+
+def _render_citations_block(citations: list[InlineCitation], indent: int) -> list[str]:
+    citation_payload = [citation.model_dump(mode="json") for citation in citations]
+    citation_text = _stable_json(citation_payload) if citation_payload else "N/A"
+    return _tag_block("Citations", [citation_text], indent)
+
+
+def _render_user_distribution_block(
+    user_distribution: OutputDistribution | None,
+    indent: int,
+) -> list[str]:
+    distribution_text = (
+        _stable_json(user_distribution.model_dump(mode="json"))
+        if user_distribution is not None
+        else "N/A"
+    )
+    return _tag_block("Estimated user distribution p_u", [distribution_text], indent)
+
+
+def _render_user_distribution_reasoning_block(
+    reasoning: str | None,
+    reasoning_citations: list[InlineCitation] | None,
+    indent: int,
+) -> list[str]:
+    body_lines = _text_lines_or_na(reasoning)
+    body_lines.extend(_render_citations_block(reasoning_citations or [], indent))
+    return _tag_block("p_u reasoning", body_lines, indent)
+
+
+class LabelingRequestFocusItem(BaseModel):
+    """Specific rubric-related question the human labeler should inspect."""
+
+    question: str
+    citations: list[InlineCitation] = Field(default_factory=list[InlineCitation])
+    sample_answers: list[str] = Field(default_factory=list[str])
+
+    def to_str(self, indent: int = 0) -> str:
+        """Render focus item in a deterministic LLM-facing format."""
+        lines: list[str] = []
+
+        # Render the question and its citations as one nested block.
+        question_lines = _text_lines_or_na(self.question)
+        question_lines.extend(_render_citations_block(self.citations, indent))
+        lines.extend(_tag_block("Question", question_lines, indent))
+
+        sample_answers_lines = (
+            [
+                f"Answer {sample_idx}: {sample_answer}"
+                for sample_idx, sample_answer in enumerate(self.sample_answers, start=1)
+            ]
+            if self.sample_answers
+            else ["N/A"]
+        )
+        lines.extend(_tag_block("Sample Answers", sample_answers_lines, indent))
+        return "\n".join(lines)
+
+
+class QAPair(BaseModel):
+    """A single review-focus answer captured for one run."""
+
+    # What the user was shown
+    focus_item: LabelingRequestFocusItem
+
+    # Whether the user selected a sample answer or not
+    selected_sample_index: int | None = None
+    is_custom_response: bool = False
+
+    # What the user responded
+    answer: str
+    explanation: str | None = None
+
+    # The user could have skipped this question and provided nothing
+    status: Literal["answered", "skipped"]
+    timestamp: datetime = Field(default_factory=datetime.now)
+
+    def to_str(self, indent: int = 0) -> str:
+        """Render QA pair in a deterministic LLM-facing format."""
+        lines = self.focus_item.to_str(indent=indent).splitlines()
+        lines.append(f"User answer: {_text_or_na(self.answer)}")
+        lines.append(f"User explanation: {_text_or_na(self.explanation)}")
+        return "\n".join(lines)
+
+
+class LabelingRequest(BaseModel):
+    """Structured labeling request shown to the user."""
+
+    title: str
+    review_context: str
+    review_context_citations: list[InlineCitation] = Field(default_factory=list[InlineCitation])
+    review_focus: list[LabelingRequestFocusItem] = Field(
+        default_factory=list[LabelingRequestFocusItem]
+    )
+    user_distribution: OutputDistribution | None = None
+    user_distribution_reasoning: str | None = None
+
+    def to_str(self, indent: int = 0) -> str:
+        """Render labeling request in a deterministic LLM-facing format."""
+        body_lines: list[str] = [f"Title: {_text_or_na(self.title)}"]
+
+        review_context_lines = _text_lines_or_na(self.review_context)
+        review_context_lines.extend(_render_citations_block(self.review_context_citations, indent))
+        body_lines.extend(_tag_block("Review Context", review_context_lines, indent))
+
+        review_focus_lines: list[str] = []
+        if self.review_focus:
+            for focus_idx, focus_item in enumerate(self.review_focus, start=1):
+                focus_lines = focus_item.to_str(indent=indent).splitlines()
+                review_focus_lines.extend(_tag_block(f"Focus {focus_idx}", focus_lines, indent))
+        else:
+            review_focus_lines.append("N/A")
+        body_lines.extend(_tag_block("Review Focus", review_focus_lines, indent))
+
+        body_lines.extend(_render_user_distribution_block(self.user_distribution, indent))
+        body_lines.extend(
+            _render_user_distribution_reasoning_block(
+                self.user_distribution_reasoning,
+                reasoning_citations=None,
+                indent=indent,
+            )
+        )
+
+        lines = _tag_block("Labeling Request", body_lines, indent)
+        return "\n".join(lines)
+
+
+class LabeledRun(BaseModel):
+    """A human label for one agent run."""
+
+    agent_run_id: str
+    timestamp: datetime = Field(default_factory=datetime.now)
+
+    # What the user responded
+    label_value: dict[str, Any]
+    explanation: str | None = None
+
+    def to_str(
+        self,
+        labeling_request: LabelingRequest | None = None,
+        indent: int = 0,
+    ) -> str:
+        """Render user label in a deterministic LLM-facing format."""
+        body_lines = [
+            f"User label: {_stable_json(self.label_value)}",
+            f"User explanation: {_text_or_na(self.explanation)}",
+        ]
+        if labeling_request is None:
+            return "\n".join(_tag_block("Label", body_lines, indent))
+
+        body_lines.extend(
+            _render_user_distribution_block(labeling_request.user_distribution, indent)
+        )
+        body_lines.extend(
+            _render_user_distribution_reasoning_block(
+                labeling_request.user_distribution_reasoning,
+                reasoning_citations=None,
+                indent=indent,
+            )
+        )
+        return "\n".join(_tag_block("Label", body_lines, indent))
+
+
+class AgentRunFeedbackContext(BaseModel):
+    """All feedback collected for a single agent run."""
+
+    agent_run_id: str
+    round: int
+    created_at: datetime = Field(default_factory=datetime.now)
+    last_updated: datetime = Field(default_factory=datetime.now)
+
+    # What the user was shown
+    labeling_request: LabelingRequest
+
+    # What the user responded
+    qa_pairs: list[QAPair] = Field(default_factory=list[QAPair])
+    label: LabeledRun | None = None
+
+    @model_validator(mode="after")
+    def validate_nested_agent_run_ids(self) -> "AgentRunFeedbackContext":
+        """Ensure nested run IDs are consistent with the top-level run ID."""
+        if self.label is not None and self.label.agent_run_id != self.agent_run_id:
+            raise ValueError("label.agent_run_id must match agent_run_id")
+        return self
+
+    def to_str(self, indent: int = 0) -> str:
+        """Render full feedback entry in a deterministic LLM-facing format."""
+        lines = self.labeling_request.to_str(indent=indent).splitlines()
+
+        qa_lines: list[str] = []
+        if not self.qa_pairs:
+            qa_lines.append("N/A")
+        else:
+            for qa_idx, qa_pair in enumerate(self.qa_pairs, start=1):
+                qa_entry_lines = qa_pair.to_str(indent=indent).splitlines()
+                qa_lines.extend(_tag_block(f"QA {qa_idx}", qa_entry_lines, indent))
+        lines.extend(_tag_block("Question Answer Pairs", qa_lines, indent))
+
+        if self.label is None:
+            label_body_lines = [
+                "User label: N/A",
+                "User explanation: N/A",
+            ]
+            label_body_lines.extend(
+                _render_user_distribution_block(self.labeling_request.user_distribution, indent)
+            )
+            label_body_lines.extend(
+                _render_user_distribution_reasoning_block(
+                    self.labeling_request.user_distribution_reasoning,
+                    reasoning_citations=None,
+                    indent=indent,
+                )
+            )
+            lines.extend(_tag_block("Label", label_body_lines, indent))
+        else:
+            lines.extend(
+                self.label.to_str(
+                    labeling_request=self.labeling_request,
+                    indent=indent,
+                ).splitlines()
+            )
+        return "\n".join(lines)
+
+
+class FeedbackContext(BaseModel):
+    """Feedback context returned by the feedback REST API."""
+
+    feedback_context_id: str
+    feedback_session_id: str
+    agent_run_id: str
+    labeling_request: LabelingRequest
+    created_at: datetime
+    updated_at: datetime
+
+
+class FeedbackContextsResponse(BaseModel):
+    """Round-scoped feedback contexts returned by the feedback REST API."""
+
+    current_round: int
+    contexts: list[FeedbackContext] = Field(default_factory=list[FeedbackContext])
+
+
+class UserData(BaseModel):
+    """User Data (U) for user-context inference and downstream evaluation."""
+
+    initial_rubric: str
+    agent_run_feedbacks: list[AgentRunFeedbackContext] = Field(
+        default_factory=lambda: list[AgentRunFeedbackContext]()
+    )
+    created_at: datetime = Field(default_factory=datetime.now)
+    last_updated: datetime = Field(default_factory=datetime.now)
+
+    def upsert_run_feedback(self, agent_run_feedback: AgentRunFeedbackContext) -> None:
+        """Insert or replace feedback for an agent run ID, updating timestamps."""
+        now = datetime.now()
+        upserted_feedback = agent_run_feedback.model_copy(deep=True)
+        upserted_feedback.last_updated = now
+
+        for idx, existing in enumerate(self.agent_run_feedbacks):
+            if existing.agent_run_id != upserted_feedback.agent_run_id:
+                continue
+            upserted_feedback.created_at = existing.created_at
+            self.agent_run_feedbacks[idx] = upserted_feedback
+            self.last_updated = now
+            return
+
+        self.agent_run_feedbacks.append(upserted_feedback)
+        self.last_updated = now
+
+    def validate_against_agreement_keys(self, agreement_keys: set[str]) -> None:
+        """Validate stored labels and p_u outcomes against rubric agreement keys."""
+        for feedback in self.agent_run_feedbacks:
+            run_id = feedback.agent_run_id
+
+            label = feedback.label
+            if label is not None:
+                invalid_label_keys = sorted(set(label.label_value.keys()) - agreement_keys)
+                if invalid_label_keys:
+                    raise ValueError(
+                        "Run "
+                        f"{run_id} has label_value keys outside rubric agreement keys: "
+                        + ", ".join(invalid_label_keys)
+                    )
+
+            user_distribution = feedback.labeling_request.user_distribution
+            if user_distribution is None:
+                continue
+
+            for outcome_idx, outcome in enumerate(user_distribution.outcomes, start=1):
+                invalid_output_keys = sorted(set(outcome.output.keys()) - agreement_keys)
+                if invalid_output_keys:
+                    raise ValueError(
+                        "Run "
+                        f"{run_id} has user_distribution outcome #{outcome_idx} keys outside "
+                        "rubric agreement keys: " + ", ".join(invalid_output_keys)
+                    )
+                for key, value in outcome.output.items():
+                    if isinstance(value, (str, bool, int, float)):
+                        continue
+                    raise ValueError(
+                        "Run "
+                        f"{run_id} has user_distribution outcome #{outcome_idx} non-scalar "
+                        f"value for key '{key}': {type(value).__name__}"
+                    )
+
+    def iter_answered_qa_entries(self) -> Iterator[tuple[AgentRunFeedbackContext, QAPair]]:
+        """Iterate answered QA pairs with their parent run feedback."""
+        for feedback in self.agent_run_feedbacks:
+            for qa_pair in feedback.qa_pairs:
+                if qa_pair.status == "answered":
+                    yield feedback, qa_pair
+
+    def iter_skipped_qa_entries(self) -> Iterator[tuple[AgentRunFeedbackContext, QAPair]]:
+        """Iterate skipped QA pairs with their parent run feedback."""
+        for feedback in self.agent_run_feedbacks:
+            for qa_pair in feedback.qa_pairs:
+                if qa_pair.status == "skipped":
+                    yield feedback, qa_pair
+
+    def iter_labeled_entries(self) -> Iterator[tuple[AgentRunFeedbackContext, LabeledRun]]:
+        """Iterate labeled run entries with their parent run feedback."""
+        for feedback in self.agent_run_feedbacks:
+            if feedback.label is None:
+                continue
+            yield feedback, feedback.label
+
+
+# Backward-compatible alias used by older callers/scripts.
+AgentRunFeedback = AgentRunFeedbackContext
```
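Taken together, these models support a simple round-trip: build a labeling request, attach the user's answers, and upsert the result into `UserData` for later iteration or deterministic rendering. A hedged usage sketch (run IDs, rubric text, and question wording are hypothetical; the classes and methods follow the diff above):

```python
from docent.data_models.feedback import (
    AgentRunFeedbackContext,
    LabelingRequest,
    LabelingRequestFocusItem,
    QAPair,
    UserData,
)

# A single rubric-related question shown to the labeler.
focus = LabelingRequestFocusItem(
    question="Did the agent verify the booking before confirming?",
    sample_answers=["Yes", "No"],
)

# All feedback for one agent run: the request shown plus the answer captured.
feedback = AgentRunFeedbackContext(
    agent_run_id="run-123",
    round=1,
    labeling_request=LabelingRequest(
        title="Booking verification",
        review_context="Focus on the final tool calls.",
        review_focus=[focus],
    ),
    qa_pairs=[QAPair(focus_item=focus, answer="Yes", status="answered")],
)

user_data = UserData(initial_rubric="The agent must verify bookings before confirming.")
user_data.upsert_run_feedback(feedback)  # inserts; a second call with the same run ID replaces

for ctx, qa in user_data.iter_answered_qa_entries():
    print(ctx.agent_run_id, qa.answer)

print(feedback.to_str())  # tagged, deterministic text suitable for LLM prompts
```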
docent/data_models/transcript.py

```diff
@@ -179,7 +179,7 @@ def format_chat_message(
     if isinstance(message, AssistantMessage) and message.content:
         for content in message.content:
            if isinstance(content, ContentReasoning):
-                cur_content = f"<reasoning>\n{content.reasoning}\n</reasoning>\n"
+                cur_content = f"<reasoning>\n{content.display_reasoning}\n</reasoning>\n"

     # Main content text
     cur_content += message.text
```