docent-python 0.1.51a0__tar.gz → 0.1.53a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/PKG-INFO +1 -1
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/feedback.py +32 -8
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/sdk/client.py +69 -10
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/pyproject.toml +1 -1
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/.gitignore +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/LICENSE.md +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/README.md +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/data_models/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/data_models/exceptions.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/data_models/llm_output.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/llm_cache.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/llm_svc.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/model_registry.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/anthropic.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/common.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/google.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/openai.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/openrouter.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/preference_types.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/provider_registry.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_log_util/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_log_util/logger.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/_tiktoken_util.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/agent_run.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/chat/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/chat/content.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/chat/message.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/chat/response_format.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/chat/tool.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/citation.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/formatted_objects.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/judge.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/metadata_util.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/regex.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/transcript.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/util.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/analysis.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/impl.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/runner.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/stats.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/types.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/util/forgiving_json.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/util/meta_schema.json +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/util/meta_schema.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/util/parse_output.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/util/template_formatter.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/judges/util/voting.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/loaders/load_inspect.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/mcp/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/mcp/__main__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/mcp/server.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/py.typed +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/samples/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/samples/load.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/samples/log.eval +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/samples/tb_airline.json +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/sdk/__init__.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/sdk/agent_run_writer.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/sdk/llm_context.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/sdk/llm_request.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/trace.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/trace_temp.py +0 -0
- {docent_python-0.1.51a0 → docent_python-0.1.53a0}/uv.lock +0 -0
|
@@ -101,11 +101,7 @@ class QAPair(BaseModel):
|
|
|
101
101
|
"""A single review-focus answer captured for one run."""
|
|
102
102
|
|
|
103
103
|
# What the user was shown
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
# Whether the user selected a sample answer or not
|
|
107
|
-
selected_sample_index: int | None = None
|
|
108
|
-
is_custom_response: bool = False
|
|
104
|
+
focus_index: int
|
|
109
105
|
|
|
110
106
|
# What the user responded
|
|
111
107
|
answer: str
|
|
@@ -115,9 +111,15 @@ class QAPair(BaseModel):
|
|
|
115
111
|
status: Literal["answered", "skipped"]
|
|
116
112
|
timestamp: datetime = Field(default_factory=datetime.now)
|
|
117
113
|
|
|
118
|
-
def to_str(self, indent: int = 0) -> str:
|
|
114
|
+
def to_str(self, labeling_request: "LabelingRequest", indent: int = 0) -> str:
|
|
119
115
|
"""Render QA pair in a deterministic LLM-facing format."""
|
|
120
|
-
|
|
116
|
+
if self.focus_index < 0 or self.focus_index >= len(labeling_request.review_focus):
|
|
117
|
+
raise ValueError(
|
|
118
|
+
f"focus_index={self.focus_index} is out of bounds for review_focus length "
|
|
119
|
+
f"{len(labeling_request.review_focus)}"
|
|
120
|
+
)
|
|
121
|
+
focus_item = labeling_request.review_focus[self.focus_index]
|
|
122
|
+
lines = focus_item.to_str(indent=indent).splitlines()
|
|
121
123
|
lines.append(f"User answer: {_text_or_na(self.answer)}")
|
|
122
124
|
lines.append(f"User explanation: {_text_or_na(self.explanation)}")
|
|
123
125
|
return "\n".join(lines)
|
|
@@ -204,6 +206,7 @@ class LabeledRun(BaseModel):
|
|
|
204
206
|
class AgentRunFeedbackContext(BaseModel):
|
|
205
207
|
"""All feedback collected for a single agent run."""
|
|
206
208
|
|
|
209
|
+
feedback_context_id: str | None = None
|
|
207
210
|
agent_run_id: str
|
|
208
211
|
round: int
|
|
209
212
|
created_at: datetime = Field(default_factory=datetime.now)
|
|
@@ -232,7 +235,10 @@ class AgentRunFeedbackContext(BaseModel):
|
|
|
232
235
|
qa_lines.append("N/A")
|
|
233
236
|
else:
|
|
234
237
|
for qa_idx, qa_pair in enumerate(self.qa_pairs, start=1):
|
|
235
|
-
qa_entry_lines = qa_pair.to_str(
|
|
238
|
+
qa_entry_lines = qa_pair.to_str(
|
|
239
|
+
labeling_request=self.labeling_request,
|
|
240
|
+
indent=indent,
|
|
241
|
+
).splitlines()
|
|
236
242
|
qa_lines.extend(_tag_block(f"QA {qa_idx}", qa_entry_lines, indent))
|
|
237
243
|
lines.extend(_tag_block("Question Answer Pairs", qa_lines, indent))
|
|
238
244
|
|
|
@@ -280,6 +286,24 @@ class FeedbackContextsResponse(BaseModel):
|
|
|
280
286
|
contexts: list[FeedbackContext] = Field(default_factory=list[FeedbackContext])
|
|
281
287
|
|
|
282
288
|
|
|
289
|
+
FeedbackJobStatus = Literal["pending", "running", "cancelling", "canceled", "completed"]
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
class StartFeedbackContextsJobResponse(BaseModel):
|
|
293
|
+
"""Response for enqueueing or reusing a feedback contexts job."""
|
|
294
|
+
|
|
295
|
+
job_id: str
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
class FeedbackContextsJobStateResponse(BaseModel):
|
|
299
|
+
"""Current feedback contexts job status and round-scoped contexts."""
|
|
300
|
+
|
|
301
|
+
job_id: str | None
|
|
302
|
+
job_status: FeedbackJobStatus | None
|
|
303
|
+
current_round: int
|
|
304
|
+
contexts: list[FeedbackContext] = Field(default_factory=list[FeedbackContext])
|
|
305
|
+
|
|
306
|
+
|
|
283
307
|
class UserData(BaseModel):
|
|
284
308
|
"""User Data (U) for user-context inference and downstream evaluation."""
|
|
285
309
|
|
|
@@ -21,7 +21,11 @@ from tqdm import tqdm
|
|
|
21
21
|
from docent._llm_util.providers.preference_types import ModelOption
|
|
22
22
|
from docent._log_util.logger import LoggerAdapter, get_logger
|
|
23
23
|
from docent.data_models.agent_run import AgentRun
|
|
24
|
-
from docent.data_models.feedback import
|
|
24
|
+
from docent.data_models.feedback import (
|
|
25
|
+
AgentRunFeedbackContext,
|
|
26
|
+
FeedbackContextsJobStateResponse,
|
|
27
|
+
StartFeedbackContextsJobResponse,
|
|
28
|
+
)
|
|
25
29
|
from docent.data_models.judge import Label
|
|
26
30
|
from docent.judges.util.meta_schema import validate_judge_result_schema
|
|
27
31
|
from docent.loaders import load_inspect
|
|
@@ -878,6 +882,44 @@ class Docent:
|
|
|
878
882
|
llm_svc = BaseLLMService() # reads API keys from environment
|
|
879
883
|
return build_judge(rubric, llm_svc)
|
|
880
884
|
|
|
885
|
+
def start_rubric_eval_job(
|
|
886
|
+
self,
|
|
887
|
+
collection_id: str,
|
|
888
|
+
rubric_id: str,
|
|
889
|
+
max_agent_runs: int | None = None,
|
|
890
|
+
n_rollouts_per_input: int = 1,
|
|
891
|
+
max_parallel: int | None = None,
|
|
892
|
+
) -> str:
|
|
893
|
+
"""Start or reuse a rubric evaluation job.
|
|
894
|
+
|
|
895
|
+
Args:
|
|
896
|
+
collection_id: ID of the Collection.
|
|
897
|
+
rubric_id: The ID of the rubric to evaluate.
|
|
898
|
+
max_agent_runs: Optional limit on the number of agent runs to evaluate.
|
|
899
|
+
n_rollouts_per_input: Number of judge rollouts to generate per agent run.
|
|
900
|
+
max_parallel: Optional backend concurrency override for the evaluation job.
|
|
901
|
+
|
|
902
|
+
Returns:
|
|
903
|
+
str: The ID of the created or reused job.
|
|
904
|
+
|
|
905
|
+
Raises:
|
|
906
|
+
requests.exceptions.HTTPError: If the API request fails.
|
|
907
|
+
ValueError: If the response does not contain a job ID.
|
|
908
|
+
"""
|
|
909
|
+
url = f"{self._api_url}/rubric/{collection_id}/{rubric_id}/evaluate"
|
|
910
|
+
payload = {
|
|
911
|
+
"max_agent_runs": max_agent_runs,
|
|
912
|
+
"n_rollouts_per_input": n_rollouts_per_input,
|
|
913
|
+
"max_parallel": max_parallel,
|
|
914
|
+
}
|
|
915
|
+
response = self._session.post(url, json=payload)
|
|
916
|
+
self._handle_response_errors(response)
|
|
917
|
+
|
|
918
|
+
job_id = response.json().get("job_id")
|
|
919
|
+
if job_id is None:
|
|
920
|
+
raise ValueError("Failed to start rubric eval job: 'job_id' missing in response.")
|
|
921
|
+
return job_id
|
|
922
|
+
|
|
881
923
|
def get_rubric_run_state(
|
|
882
924
|
self,
|
|
883
925
|
collection_id: str,
|
|
@@ -886,7 +928,7 @@ class Docent:
|
|
|
886
928
|
filter_dict: dict[str, Any] | None = None,
|
|
887
929
|
include_failures: bool = False,
|
|
888
930
|
) -> dict[str, Any]:
|
|
889
|
-
"""Get rubric
|
|
931
|
+
"""Get rubric evaluation results and progress for a collection/rubric.
|
|
890
932
|
|
|
891
933
|
Args:
|
|
892
934
|
collection_id: ID of the Collection.
|
|
@@ -900,6 +942,10 @@ class Docent:
|
|
|
900
942
|
|
|
901
943
|
Raises:
|
|
902
944
|
requests.exceptions.HTTPError: If the API request fails.
|
|
945
|
+
|
|
946
|
+
Note:
|
|
947
|
+
This method does not start evaluation. Use `start_rubric_eval_job()` to
|
|
948
|
+
enqueue or reuse a rubric evaluation job.
|
|
903
949
|
"""
|
|
904
950
|
url = f"{self._api_url}/rubric/{collection_id}/{rubric_id}/rubric_run_state"
|
|
905
951
|
body = {
|
|
@@ -979,7 +1025,7 @@ class Docent:
|
|
|
979
1025
|
self._handle_response_errors(response)
|
|
980
1026
|
return response.json()["feedback_session_id"]
|
|
981
1027
|
|
|
982
|
-
def
|
|
1028
|
+
def start_feedback_contexts_job(
|
|
983
1029
|
self,
|
|
984
1030
|
collection_id: str,
|
|
985
1031
|
feedback_session_id: str,
|
|
@@ -989,8 +1035,8 @@ class Docent:
|
|
|
989
1035
|
candidate_pool_limit: int = 1_000,
|
|
990
1036
|
where_clause: str | None = None,
|
|
991
1037
|
increment_round: bool = False,
|
|
992
|
-
) ->
|
|
993
|
-
"""
|
|
1038
|
+
) -> StartFeedbackContextsJobResponse:
|
|
1039
|
+
"""Start or reuse a background job to compute feedback contexts for a session."""
|
|
994
1040
|
payload = {
|
|
995
1041
|
"feedback_session_id": feedback_session_id,
|
|
996
1042
|
"num_samples": num_samples,
|
|
@@ -1000,10 +1046,24 @@ class Docent:
|
|
|
1000
1046
|
"where_clause": where_clause,
|
|
1001
1047
|
"increment_round": increment_round,
|
|
1002
1048
|
}
|
|
1003
|
-
url = f"{self._api_url}/feedback/{collection_id}/contexts"
|
|
1049
|
+
url = f"{self._api_url}/feedback/{collection_id}/contexts/start"
|
|
1050
|
+
response = self._session.post(url, json=payload)
|
|
1051
|
+
self._handle_response_errors(response)
|
|
1052
|
+
return StartFeedbackContextsJobResponse.model_validate(response.json())
|
|
1053
|
+
|
|
1054
|
+
def get_feedback_contexts(
|
|
1055
|
+
self,
|
|
1056
|
+
collection_id: str,
|
|
1057
|
+
feedback_session_id: str,
|
|
1058
|
+
) -> FeedbackContextsJobStateResponse:
|
|
1059
|
+
"""Get feedback contexts state for a session, including job status and current round data."""
|
|
1060
|
+
payload = {
|
|
1061
|
+
"feedback_session_id": feedback_session_id,
|
|
1062
|
+
}
|
|
1063
|
+
url = f"{self._api_url}/feedback/{collection_id}/contexts/state"
|
|
1004
1064
|
response = self._session.post(url, json=payload)
|
|
1005
1065
|
self._handle_response_errors(response)
|
|
1006
|
-
return
|
|
1066
|
+
return FeedbackContextsJobStateResponse.model_validate(response.json())
|
|
1007
1067
|
|
|
1008
1068
|
def get_agent_run_feedback_contexts_by_session(
|
|
1009
1069
|
self,
|
|
@@ -1012,9 +1072,8 @@ class Docent:
|
|
|
1012
1072
|
) -> list[AgentRunFeedbackContext]:
|
|
1013
1073
|
"""Get all persisted AgentRun feedback contexts for a feedback session.
|
|
1014
1074
|
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
including QA pairs and label (if present).
|
|
1075
|
+
Returns fully hydrated AgentRun feedback context objects from the database,
|
|
1076
|
+
including QA pairs and labels (if present), across all rounds in the session.
|
|
1018
1077
|
"""
|
|
1019
1078
|
url = f"{self._api_url}/feedback/{collection_id}/session/{feedback_session_id}/contexts"
|
|
1020
1079
|
response = self._session.get(url)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/data_models/exceptions.py
RENAMED
|
File without changes
|
{docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/data_models/llm_output.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/preference_types.py
RENAMED
|
File without changes
|
{docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/_llm_util/providers/provider_registry.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docent_python-0.1.51a0 → docent_python-0.1.53a0}/docent/data_models/chat/response_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|