docent-python 0.1.63a0__tar.gz → 0.1.64a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/PKG-INFO +1 -1
  2. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/impl.py +11 -5
  3. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/types.py +30 -9
  4. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/util/parse_output.py +15 -5
  5. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/util/template_formatter.py +1 -0
  6. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_dql.py +11 -4
  7. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_readings.py +256 -76
  8. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/llm_context.py +1 -1
  9. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/pyproject.toml +1 -1
  10. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/.gitignore +0 -0
  11. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/LICENSE.md +0 -0
  12. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/README.md +0 -0
  13. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/__init__.py +0 -0
  14. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/__init__.py +0 -0
  15. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/data_models/__init__.py +0 -0
  16. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/data_models/exceptions.py +0 -0
  17. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/data_models/llm_output.py +0 -0
  18. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/llm_svc.py +0 -0
  19. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/model_registry.py +0 -0
  20. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/providers/__init__.py +0 -0
  21. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/providers/anthropic.py +0 -0
  22. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/providers/common.py +0 -0
  23. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/providers/google.py +0 -0
  24. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/providers/openai.py +0 -0
  25. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/providers/openrouter.py +0 -0
  26. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/providers/preference_types.py +0 -0
  27. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_llm_util/providers/provider_registry.py +0 -0
  28. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_log_util/__init__.py +0 -0
  29. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/_log_util/logger.py +0 -0
  30. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/__init__.py +0 -0
  31. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/_tiktoken_util.py +0 -0
  32. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/agent_run.py +0 -0
  33. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/chat/__init__.py +0 -0
  34. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/chat/content.py +0 -0
  35. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/chat/message.py +0 -0
  36. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/chat/response_format.py +0 -0
  37. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/chat/tool.py +0 -0
  38. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/citation.py +0 -0
  39. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/context_config.py +0 -0
  40. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/feedback.py +0 -0
  41. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/formatted_objects.py +0 -0
  42. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/judge.py +0 -0
  43. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/metadata_util.py +0 -0
  44. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/reading.py +0 -0
  45. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/regex.py +0 -0
  46. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/report.py +0 -0
  47. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/transcript.py +0 -0
  48. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/data_models/util.py +0 -0
  49. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/__init__.py +0 -0
  50. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/analysis.py +0 -0
  51. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/runner.py +0 -0
  52. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/stats.py +0 -0
  53. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/util/forgiving_json.py +0 -0
  54. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/util/meta_schema.json +0 -0
  55. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/util/meta_schema.py +0 -0
  56. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/judges/util/voting.py +0 -0
  57. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/loaders/load_inspect.py +0 -0
  58. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/mcp/__init__.py +0 -0
  59. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/mcp/__main__.py +0 -0
  60. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/mcp/server.py +0 -0
  61. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/py.typed +0 -0
  62. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/samples/__init__.py +0 -0
  63. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/samples/load.py +0 -0
  64. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/samples/log.eval +0 -0
  65. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/samples/tb_airline.json +0 -0
  66. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/__init__.py +0 -0
  67. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_agent_runs.py +0 -0
  68. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_base.py +0 -0
  69. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_client_util.py +0 -0
  70. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_collections.py +0 -0
  71. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_feedback.py +0 -0
  72. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_labels.py +0 -0
  73. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_reports.py +0 -0
  74. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_results.py +0 -0
  75. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_rubrics.py +0 -0
  76. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/_sharing.py +0 -0
  77. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/agent_run_writer.py +0 -0
  78. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/client.py +0 -0
  79. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/integrations/__init__.py +0 -0
  80. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/integrations/harbor.py +0 -0
  81. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/integrations/inspect.py +0 -0
  82. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/integrations/nemogym.py +0 -0
  83. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/integrations/util.py +0 -0
  84. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/llm_request.py +0 -0
  85. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/reading.py +0 -0
  86. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/sdk/util.py +0 -0
  87. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/docent/trace.py +0 -0
  88. {docent_python-0.1.63a0 → docent_python-0.1.64a0}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docent-python
3
- Version: 0.1.63a0
3
+ Version: 0.1.64a0
4
4
  Summary: Docent SDK
5
5
  Project-URL: Homepage, https://github.com/TransluceAI/docent
6
6
  Project-URL: Issues, https://github.com/TransluceAI/docent/issues
@@ -145,7 +145,9 @@ class BaseJudge(ABC):
145
145
  Raises:
146
146
  ValidationFailedException: With the specific reason validation failed
147
147
  """
148
- return parse_and_validate_output_str(output_str, self.cfg.output_schema)
148
+ return parse_and_validate_output_str(
149
+ output_str, self.cfg.output_schema, output_format="json"
150
+ )
149
151
 
150
152
  def _parse_xml_key_output(self, output_str: str, agent_run: AgentRun) -> dict[str, Any]:
151
153
  """Parse output by extracting content from XML tags.
@@ -165,17 +167,21 @@ class BaseJudge(ABC):
165
167
  # Try to validate any match; take the first
166
168
  for response_text in response_matches:
167
169
  try:
168
- return parse_and_validate_output_str(response_text, self.cfg.output_schema)
170
+ return parse_and_validate_output_str(
171
+ response_text, self.cfg.output_schema, self.cfg.output_format
172
+ )
169
173
  except ValidationFailedException as e:
170
174
  last_error = ValidationFailedException(
171
175
  str(e),
172
176
  failed_output=output_str,
173
177
  )
174
178
 
175
- # Try to validate the entire output as JSON
179
+ # Try to validate the entire output
176
180
  # But only if the output _didn't_ contain a matching XML tag
177
181
  if not response_matches:
178
- return parse_and_validate_output_str(output_str, self.cfg.output_schema)
182
+ return parse_and_validate_output_str(
183
+ output_str, self.cfg.output_schema, self.cfg.output_format
184
+ )
179
185
 
180
186
  raise last_error or ValidationFailedException(
181
187
  f"No valid output found in <{xml_key}> tags",
@@ -427,7 +433,7 @@ class MultiReflectionJudge(BaseJudge):
427
433
  f"We have sampled a judge {len(first_stage_results)} times to get {len(first_stage_results)} independent answers to the same rubric evaluation:\n"
428
434
  f"{first_stage_results_text}\n\n"
429
435
  f"Please reflect on these answers. Consider all the information and evidence presented. "
430
- f"Return a final answer in the same JSON format as before."
436
+ f"Return a final answer in the same {self.cfg.output_format.upper()} format as before."
431
437
  )
432
438
  base_messages = self.cfg.materialize_messages(agent_run)
433
439
  reflection_prompt: list[ChatMessage] = list(base_messages) + [
@@ -40,9 +40,9 @@ Agent run:
40
40
 
41
41
  Your goal is to judge the agent run according to the criteria given in the rubric. Start by faithfully following the decision procedure in extremely careful detail, step by step.
42
42
 
43
- When you are finished, output your final adjudication, surrounded by <response>...</response> tags. The response must be a valid JSON string which can be parsed with python `json.loads` without any additional processing. Double quotes (`"`) in the middle of a string in the JSON object must be escaped with a backslash.
43
+ When you are finished, output your final adjudication, surrounded by <response>...</response> tags. {output_format_instructions}
44
44
 
45
- The JSON object you produce must adhere to the following schema:
45
+ The object you produce must adhere to the following schema:
46
46
  {output_schema}
47
47
  """.strip()
48
48
 
@@ -61,9 +61,9 @@ Agent run:
61
61
 
62
62
  Your goal is to judge the agent run according to the criteria given in the rubric. Start by faithfully following the decision procedure in extremely careful detail, step by step. You must execute **one step in the decision procedure per assistant message turn**. After each turn, output a complete and detailed recount of all actions you took, and everything you discovered. Then call the `step_finished` tool.
63
63
 
64
- When you are finished going through the decision procedure, output your final adjudication, surrounded by <response>...</response> tags. The response must be a valid JSON string which can be parsed with python `json.loads` without any additional processing. Double quotes (`"`) in the middle of a string in the JSON object must be escaped with a backslash.
64
+ When you are finished going through the decision procedure, output your final adjudication, surrounded by <response>...</response> tags. {output_format_instructions}
65
65
 
66
- The JSON object you produce must adhere to the following schema:
66
+ The object you produce must adhere to the following schema:
67
67
  {output_schema}
68
68
  """.strip()
69
69
 
@@ -82,12 +82,22 @@ Agent run:
82
82
 
83
83
  Your goal is to judge the agent run according to the criteria given in the rubric. Start by faithfully following the decision procedure in extremely careful detail, step by step. You must *fully externalize* your reasoning work by outputting details in the assistant message, surrounded by <reasoning>...</reasoning> tags. The reasoning section can be as messy as you need. You should use *high* reasoning effort.
84
84
 
85
- When you are finished, output your final adjudication in the assistant message, surrounded by <response>...</response> tags. The response must be a valid JSON string which can be parsed with python `json.loads` without any additional processing. Double quotes (`"`) in the middle of a string in the JSON object must be escaped with a backslash.
85
+ When you are finished, output your final adjudication in the assistant message, surrounded by <response>...</response> tags. {output_format_instructions}
86
86
 
87
- The JSON object you produce must adhere to the following schema:
87
+ The object you produce must adhere to the following schema:
88
88
  {output_schema}
89
89
  """.strip()
90
90
 
91
+ OUTPUT_FORMAT_INSTRUCTIONS = {
92
+ "json": (
93
+ "The response must be a valid JSON string which can be parsed with python"
94
+ " `json.loads` without any additional processing."
95
+ ' Double quotes (`"`) in the middle of a string in the JSON object'
96
+ " must be escaped with a backslash."
97
+ ),
98
+ "yaml": ("The response must be valid YAML that can be parsed with python `yaml.safe_load`."),
99
+ }
100
+
91
101
  # Other judge defaults
92
102
  DEFAULT_JUDGE_OUTPUT_SCHEMA = {
93
103
  "type": "object",
@@ -162,6 +172,7 @@ class Rubric(BaseModel):
162
172
  # Output parsing
163
173
  output_parsing_mode: OutputParsingMode = OutputParsingMode.XML_KEY
164
174
  response_xml_key: str = "response" # Only used when mode is XML_KEY
175
+ output_format: Literal["json", "yaml"] = "yaml"
165
176
 
166
177
  def materialize_messages(self, agent_run: AgentRun) -> list[ChatMessage]:
167
178
  """Construct the message list for rubric evaluation.
@@ -177,6 +188,9 @@ class Rubric(BaseModel):
177
188
  citation_instructions = (
178
189
  JUDGE_CITATION_INSTRUCTIONS if _schema_requests_citations(self.output_schema) else ""
179
190
  )
191
+ format_instructions = OUTPUT_FORMAT_INSTRUCTIONS.get(
192
+ self.output_format, OUTPUT_FORMAT_INSTRUCTIONS["yaml"]
193
+ )
180
194
  formatter = AgentRunTemplateFormatter(
181
195
  agent_run=agent_run,
182
196
  rubric_text=self.rubric_text,
@@ -186,8 +200,12 @@ class Rubric(BaseModel):
186
200
  # Format each template message
187
201
  messages: list[ChatMessage] = []
188
202
  for i, template in enumerate(self.prompt_templates):
189
- # No need to strip citation instructions here, as this is a new codepath
190
- content = formatter.format_template(template.content)
203
+ # Resolve output format instructions before template formatting,
204
+ # since it's not a user-facing template variable
205
+ template_content = template.content.replace(
206
+ "{output_format_instructions}", format_instructions
207
+ )
208
+ content = formatter.format_template(template_content)
191
209
 
192
210
  # Auto-append citation instructions to the last message
193
211
  if i == len(self.prompt_templates) - 1 and citation_instructions:
@@ -209,7 +227,10 @@ class Rubric(BaseModel):
209
227
  ) -> list[PromptTemplateMessage]:
210
228
  if not prompt_templates:
211
229
  raise ValueError("prompt_templates must include at least one template message.")
212
- AgentRunTemplateFormatter.validate_template_variables([t.content for t in prompt_templates])
230
+ AgentRunTemplateFormatter.validate_template_variables(
231
+ [t.content for t in prompt_templates],
232
+ allowed_unknown={"output_format_instructions"},
233
+ )
213
234
  return prompt_templates
214
235
 
215
236
  @field_validator("output_schema")
@@ -1,17 +1,23 @@
1
- from typing import Any, cast
1
+ from typing import Any, Literal, cast
2
2
 
3
3
  import jsonschema
4
+ import yaml
4
5
 
5
6
  from docent._llm_util.data_models.exceptions import ValidationFailedException
6
7
  from docent.judges.util.forgiving_json import forgiving_json_loads
7
8
 
8
9
 
9
- def parse_and_validate_output_str(output_str: str, output_schema: dict[str, Any]) -> dict[str, Any]:
10
- """Parse and validate LLM output against a JSON schema with forgiving parsing.
10
+ def parse_and_validate_output_str(
11
+ output_str: str,
12
+ output_schema: dict[str, Any],
13
+ output_format: Literal["json", "yaml"] = "json",
14
+ ) -> dict[str, Any]:
15
+ """Parse and validate LLM output against a JSON schema.
11
16
 
12
17
  Args:
13
18
  output_str: The LLM output string to parse
14
19
  output_schema: The JSON schema to validate against
20
+ output_format: The format to parse as ("json" or "yaml")
15
21
 
16
22
  Returns:
17
23
  Validated output dict
@@ -20,10 +26,13 @@ def parse_and_validate_output_str(output_str: str, output_schema: dict[str, Any]
20
26
  ValidationFailedException: If parsing or validation fails
21
27
  """
22
28
  try:
23
- output = forgiving_json_loads(output_str)
29
+ if output_format == "yaml":
30
+ output = yaml.safe_load(output_str)
31
+ else:
32
+ output = forgiving_json_loads(output_str)
24
33
  except Exception as e:
25
34
  raise ValidationFailedException(
26
- f"Failed to parse JSON: {e}",
35
+ f"Failed to parse {output_format.upper()}: {e}",
27
36
  failed_output=output_str,
28
37
  )
29
38
 
@@ -34,6 +43,7 @@ def parse_and_validate_output_str(output_str: str, output_schema: dict[str, Any]
34
43
  )
35
44
 
36
45
  output_dict = cast(dict[str, Any], output)
46
+
37
47
  try:
38
48
  jsonschema.validate(output_dict, output_schema)
39
49
  except jsonschema.ValidationError as e:
@@ -53,6 +53,7 @@ class AgentRunTemplateFormatter:
53
53
  - {agent_run} - Full agent run text representation
54
54
  - {rubric} - The rubric text
55
55
  - {output_schema} - JSON-formatted output schema
56
+ - {output_format_instructions} - Format-specific instructions (JSON or YAML)
56
57
 
57
58
  Example:
58
59
  formatter = AgentRunTemplateFormatter(
@@ -166,17 +166,24 @@ class DocentDqlMixin(DocentBase):
166
166
 
167
167
  Args:
168
168
  collection_id: ID of the Collection.
169
- apply_base_filter: Whether to apply the collection view's base filter.
169
+ apply_base_filter: Deprecated. View base filters are no longer applied implicitly.
170
170
 
171
171
  Returns:
172
172
  list[str]: Agent run IDs for the collection.
173
173
 
174
174
  Raises:
175
+ ValueError: If apply_base_filter is True.
175
176
  requests.exceptions.HTTPError: If the API request fails.
176
177
  """
177
- url = f"{self._api_url}/{collection_id}/agent_run_ids"
178
- params = {"apply_base_filter": "true"} if apply_base_filter else None
179
- response = self._session.get(url, params=params)
178
+ if apply_base_filter:
179
+ raise ValueError(
180
+ "apply_base_filter=True is no longer sufficient because view base filters are no "
181
+ "longer part of agent-run browsing. Pass an explicit filter to the query endpoint "
182
+ "instead."
183
+ )
184
+
185
+ url = f"{self._api_url}/{collection_id}/agent_run_ids/query"
186
+ response = self._session.post(url, json={})
180
187
  self._handle_response_errors(response)
181
188
  return response.json()
182
189
 
@@ -20,6 +20,7 @@ from docent.data_models.reading import (
20
20
  DqlOnlyStepSubmission,
21
21
  EndStepGroupSubmission,
22
22
  PlanJobCancelledEvent,
23
+ PlanJobFailedEvent,
23
24
  PlanSnapshotEvent,
24
25
  PlanStepCompletedEvent,
25
26
  PlanStepFailedEvent,
@@ -35,6 +36,11 @@ from docent.data_models.reading import (
35
36
  StepGroupSubmission,
36
37
  )
37
38
 
39
+
40
+ class _ReadingPlanFailure(RuntimeError):
41
+ """Fatal reading-plan failure that should stop SDK scripts."""
42
+
43
+
38
44
  _plan_stream_event_adapter: TypeAdapter[PlanStreamEvent] = TypeAdapter(PlanStreamEvent)
39
45
  from docent.sdk._base import DocentBase
40
46
  from docent.sdk.llm_context import ContextItemRef, Prompt
@@ -254,11 +260,39 @@ class DocentReadingsMixin(DocentBase):
254
260
  )
255
261
  try:
256
262
  self.flush(open_in_browser=True)
263
+ except _ReadingPlanFailure as exc:
264
+ self._logger.error("%s", exc)
265
+ raise SystemExit(1) from exc
257
266
  except Exception:
258
267
  self._logger.exception("Auto-flush failed")
259
268
 
260
269
  atexit.register(_atexit_flush)
261
270
 
271
+ def _register_notebook_hook(self) -> None:
272
+ if self._notebook_hook_registered:
273
+ return
274
+ try:
275
+ ip = get_ipython() # type: ignore[name-defined]
276
+ except NameError:
277
+ return
278
+ if ip is None:
279
+ return
280
+ self._is_notebook = True
281
+ self._notebook_hook_registered = True
282
+
283
+ def _cell_post_run(*args: Any, **kwargs: Any) -> None:
284
+ if self._pending and self._auto_flush:
285
+ open_browser = self._plan_id is None
286
+ try:
287
+ cast(Any, self).flush(open_in_browser=open_browser)
288
+ except _ReadingPlanFailure as exc:
289
+ self._logger.error("%s", exc)
290
+ raise
291
+ except Exception:
292
+ self._logger.exception("Notebook auto-flush failed")
293
+
294
+ ip.events.register("post_run_cell", _cell_post_run) # type: ignore[reportUnknownMemberType]
295
+
262
296
  def _detect_source_script(self) -> str | None:
263
297
  """Best-effort detection of the calling script or notebook name."""
264
298
  if self._is_notebook:
@@ -912,6 +946,42 @@ class DocentReadingsMixin(DocentBase):
912
946
  return f"{name} [${alias}]"
913
947
  return f"{default} [${alias}]"
914
948
 
949
+ def _reading_plan_url(self, collection_id: str, plan_id: str) -> str:
950
+ return f"{self._frontend_url}/dashboard/{collection_id}/reading-plan/{plan_id}"
951
+
952
+ def _plan_failure(
953
+ self,
954
+ *,
955
+ collection_id: str,
956
+ plan_id: str,
957
+ message: str,
958
+ alias: str | None = None,
959
+ name: str | None = None,
960
+ default_label: str = "Reading",
961
+ ) -> _ReadingPlanFailure:
962
+ plan_url = self._reading_plan_url(collection_id, plan_id)
963
+ if alias is None:
964
+ return _ReadingPlanFailure(
965
+ f"Reading plan {plan_id} failed: {message}. Open plan: {plan_url}"
966
+ )
967
+ label = self._format_step_label(name, alias, default_label)
968
+ return _ReadingPlanFailure(
969
+ f"{label} failed in reading plan {plan_id}: {message}. Open plan: {plan_url}"
970
+ )
971
+
972
+ @staticmethod
973
+ def _event_error_message(error: Any) -> str:
974
+ if error is None:
975
+ return "unknown error"
976
+ if hasattr(error, "message"):
977
+ return str(error.message)
978
+ if isinstance(error, dict):
979
+ error_data = cast(dict[str, Any], error)
980
+ message = error_data.get("message")
981
+ if message is not None:
982
+ return str(message)
983
+ return str(cast(object, error))
984
+
915
985
  def _preview_and_wait(
916
986
  self,
917
987
  *,
@@ -926,9 +996,21 @@ class DocentReadingsMixin(DocentBase):
926
996
 
927
997
  for es in submit_response.entry_statuses:
928
998
  if es.entry_type == "dql_only":
929
- if es.status == "cached" and es.dql_preview is not None:
930
- self._log_dql_preview(pending_names.get(es.alias), es.alias, es.dql_preview)
931
- elif es.status != "cached":
999
+ if es.status == "cached":
1000
+ if es.dql_preview is not None:
1001
+ self._log_dql_preview(pending_names.get(es.alias), es.alias, es.dql_preview)
1002
+ else:
1003
+ unsettled_dql_aliases[es.alias] = es
1004
+ elif es.status == "failed":
1005
+ raise self._plan_failure(
1006
+ collection_id=collection_id,
1007
+ plan_id=submit_response.plan_id,
1008
+ alias=es.alias,
1009
+ name=pending_names.get(es.alias),
1010
+ default_label="DQL",
1011
+ message="DQL step failed during submission",
1012
+ )
1013
+ else:
932
1014
  unsettled_dql_aliases[es.alias] = es
933
1015
 
934
1016
  elif es.entry_type == "reading":
@@ -939,27 +1021,34 @@ class DocentReadingsMixin(DocentBase):
939
1021
  es.result_count,
940
1022
  es.result_preview,
941
1023
  )
1024
+ elif es.status == "failed":
1025
+ raise self._plan_failure(
1026
+ collection_id=collection_id,
1027
+ plan_id=submit_response.plan_id,
1028
+ alias=es.alias,
1029
+ name=pending_names.get(es.alias),
1030
+ message="reading step failed during submission",
1031
+ )
942
1032
  else:
943
1033
  unsettled_reading_aliases.add(es.alias)
944
1034
 
945
- if not unsettled_reading_aliases:
946
- return
947
-
948
1035
  plan_id = submit_response.plan_id
949
1036
  assert plan_id is not None
950
1037
 
951
- completed_aliases: set[str] = {
952
- es.alias
953
- for es in submit_response.entry_statuses
954
- if es.status == "cached" and es.entry_type == "reading"
955
- }
1038
+ if not unsettled_reading_aliases:
1039
+ self._validate_unsettled_dql_aliases(
1040
+ collection_id=collection_id,
1041
+ plan_id=plan_id,
1042
+ unsettled_dql_aliases=unsettled_dql_aliases,
1043
+ pending_names=pending_names,
1044
+ )
1045
+ return
956
1046
 
957
1047
  self._block_on_plan_stream(
958
1048
  collection_id=collection_id,
959
1049
  plan_id=plan_id,
960
1050
  unsettled_reading_aliases=unsettled_reading_aliases,
961
1051
  unsettled_dql_aliases=unsettled_dql_aliases,
962
- completed_aliases=completed_aliases,
963
1052
  reading_handles=reading_handles,
964
1053
  pending_names=pending_names,
965
1054
  )
@@ -971,27 +1060,28 @@ class DocentReadingsMixin(DocentBase):
971
1060
  plan_id: str,
972
1061
  unsettled_reading_aliases: set[str],
973
1062
  unsettled_dql_aliases: dict[str, PlanStepSubmissionStatus],
974
- completed_aliases: set[str],
975
1063
  reading_handles: dict[str, Reading],
976
1064
  pending_names: dict[str, str | None],
977
1065
  ) -> None:
978
- """Connect to the plan SSE stream and block until all reading steps settle."""
1066
+ """Connect to the plan SSE stream and block until all submitted steps settle."""
979
1067
  stream_url = f"{self._api_url}/reading/{collection_id}/reading-plan/{plan_id}/stream"
980
1068
  response = self._session.get(stream_url, stream=True)
981
1069
  self._handle_response_errors(response)
982
1070
 
983
- alias_to_reading_id: dict[str, str] = {}
984
1071
  pending = set(unsettled_reading_aliases)
985
1072
  deadline = time.monotonic() + self._FLUSH_TIMEOUT_SECONDS
986
1073
 
987
1074
  for line in response.iter_lines(decode_unicode=True):
988
1075
  if time.monotonic() > deadline:
989
- self._logger.warning(
990
- "Timed out waiting for reading plan steps after %ds. Remaining steps: %s",
991
- self._FLUSH_TIMEOUT_SECONDS,
992
- ", ".join(sorted(pending)),
1076
+ remaining = sorted(pending | set(unsettled_dql_aliases))
1077
+ raise self._plan_failure(
1078
+ collection_id=collection_id,
1079
+ plan_id=plan_id,
1080
+ message=(
1081
+ f"timed out after {self._FLUSH_TIMEOUT_SECONDS}s waiting for "
1082
+ f"steps to settle: {', '.join(remaining)}"
1083
+ ),
993
1084
  )
994
- break
995
1085
 
996
1086
  if not line or not line.startswith("data: "):
997
1087
  continue
@@ -1006,11 +1096,8 @@ class DocentReadingsMixin(DocentBase):
1006
1096
 
1007
1097
  if isinstance(event, PlanSnapshotEvent):
1008
1098
  for step in event.steps:
1009
- if step.reading_id is not None:
1010
- alias_to_reading_id[step.alias] = step.reading_id
1011
1099
  if step.alias in pending and step.derived_status in ("completed", "cached"):
1012
1100
  pending.discard(step.alias)
1013
- completed_aliases.add(step.alias)
1014
1101
  if step.reading_id is not None:
1015
1102
  self._log_step_completed_preview(
1016
1103
  collection_id,
@@ -1020,20 +1107,24 @@ class DocentReadingsMixin(DocentBase):
1020
1107
  reading_handles.get(step.alias),
1021
1108
  )
1022
1109
  elif step.alias in pending and step.derived_status == "failed":
1023
- self._logger.warning(
1024
- "Step %s failed",
1025
- self._format_step_label(
1026
- pending_names.get(step.alias), step.alias, "Reading"
1027
- ),
1110
+ raise self._plan_failure(
1111
+ collection_id=collection_id,
1112
+ plan_id=plan_id,
1113
+ alias=step.alias,
1114
+ name=pending_names.get(step.alias),
1115
+ message="step reached failed status",
1028
1116
  )
1029
- pending.discard(step.alias)
1030
- if not pending:
1117
+ self._try_preview_unresolved_dql(
1118
+ collection_id,
1119
+ plan_id,
1120
+ unsettled_dql_aliases,
1121
+ pending_names,
1122
+ )
1123
+ if not pending and not unsettled_dql_aliases:
1031
1124
  break
1032
1125
  continue
1033
1126
 
1034
1127
  if isinstance(event, PlanStepCompletedEvent):
1035
- alias_to_reading_id[event.step_alias] = event.reading_id
1036
- completed_aliases.add(event.step_alias)
1037
1128
  if event.step_alias in pending:
1038
1129
  pending.discard(event.step_alias)
1039
1130
  self._log_step_completed_preview(
@@ -1048,36 +1139,57 @@ class DocentReadingsMixin(DocentBase):
1048
1139
  collection_id,
1049
1140
  plan_id,
1050
1141
  unsettled_dql_aliases,
1051
- completed_aliases,
1052
- alias_to_reading_id,
1053
1142
  pending_names,
1054
1143
  )
1055
1144
 
1056
1145
  elif isinstance(event, PlanStepFailedEvent) and event.step_alias in pending:
1057
- label = self._format_step_label(
1058
- pending_names.get(event.step_alias), event.step_alias, "Reading"
1146
+ raise self._plan_failure(
1147
+ collection_id=collection_id,
1148
+ plan_id=plan_id,
1149
+ alias=event.step_alias,
1150
+ name=pending_names.get(event.step_alias),
1151
+ message=self._event_error_message(event.error),
1059
1152
  )
1060
- msg = event.error.message if event.error else "unknown error"
1061
- self._logger.warning("Step %s failed: %s", label, msg)
1062
- pending.discard(event.step_alias)
1063
1153
 
1064
1154
  elif isinstance(event, PlanJobCancelledEvent):
1065
- self._logger.warning(
1066
- "Reading plan job was cancelled. Remaining steps: %s",
1067
- ", ".join(sorted(pending)),
1155
+ remaining = sorted(pending | set(unsettled_dql_aliases))
1156
+ raise self._plan_failure(
1157
+ collection_id=collection_id,
1158
+ plan_id=plan_id,
1159
+ message=(
1160
+ "job was cancelled"
1161
+ + (f"; remaining steps: {', '.join(remaining)}" if remaining else "")
1162
+ ),
1068
1163
  )
1069
- break
1070
1164
 
1071
- if not pending:
1165
+ elif isinstance(event, PlanJobFailedEvent):
1166
+ raise self._plan_failure(
1167
+ collection_id=collection_id,
1168
+ plan_id=plan_id,
1169
+ message=f"job failed: {self._event_error_message(event.error)}",
1170
+ )
1171
+
1172
+ if not pending and not unsettled_dql_aliases:
1072
1173
  break
1073
1174
 
1175
+ if pending:
1176
+ raise self._plan_failure(
1177
+ collection_id=collection_id,
1178
+ plan_id=plan_id,
1179
+ message=f"stream ended before steps settled: {', '.join(sorted(pending))}",
1180
+ )
1181
+ self._validate_unsettled_dql_aliases(
1182
+ collection_id=collection_id,
1183
+ plan_id=plan_id,
1184
+ unsettled_dql_aliases=unsettled_dql_aliases,
1185
+ pending_names=pending_names,
1186
+ )
1187
+
1074
1188
  def _try_preview_unresolved_dql(
1075
1189
  self,
1076
1190
  collection_id: str,
1077
1191
  plan_id: str,
1078
1192
  unsettled_dql_aliases: dict[str, PlanStepSubmissionStatus],
1079
- completed_aliases: set[str],
1080
- alias_to_reading_id: dict[str, str],
1081
1193
  pending_names: dict[str, str | None],
1082
1194
  ) -> None:
1083
1195
  """Check if any unresolved DQL steps can now be previewed."""
@@ -1091,6 +1203,34 @@ class DocentReadingsMixin(DocentBase):
1091
1203
  for alias in newly_resolved:
1092
1204
  unsettled_dql_aliases.pop(alias, None)
1093
1205
 
1206
+ def _validate_unsettled_dql_aliases(
1207
+ self,
1208
+ *,
1209
+ collection_id: str,
1210
+ plan_id: str,
1211
+ unsettled_dql_aliases: dict[str, PlanStepSubmissionStatus],
1212
+ pending_names: dict[str, str | None],
1213
+ ) -> None:
1214
+ """Execute any remaining DQL-only steps or fail if they cannot resolve."""
1215
+ self._try_preview_unresolved_dql(
1216
+ collection_id,
1217
+ plan_id,
1218
+ unsettled_dql_aliases,
1219
+ pending_names,
1220
+ )
1221
+ if not unsettled_dql_aliases:
1222
+ return
1223
+
1224
+ aliases = ", ".join(
1225
+ self._format_step_label(pending_names.get(alias), alias, "DQL")
1226
+ for alias in sorted(unsettled_dql_aliases)
1227
+ )
1228
+ raise self._plan_failure(
1229
+ collection_id=collection_id,
1230
+ plan_id=plan_id,
1231
+ message=f"DQL step dependencies did not resolve: {aliases}",
1232
+ )
1233
+
1094
1234
  def _try_execute_dql_for_alias(
1095
1235
  self,
1096
1236
  collection_id: str,
@@ -1099,32 +1239,56 @@ class DocentReadingsMixin(DocentBase):
1099
1239
  name: str | None,
1100
1240
  ) -> dict[str, Any] | None:
1101
1241
  """Attempt to execute a DQL step that was previously unresolved."""
1102
- try:
1103
- plan_state = self._get_reading_plan_state(collection_id, plan_id)
1104
- for step in plan_state.get("steps", []):
1105
- if step.get("alias") == alias:
1106
- status = step.get("derived_status")
1107
- if status == "cached":
1108
- dql_query = step.get("dql_query")
1109
- if dql_query:
1110
- result = cast(Any, self).execute_dql(
1111
- collection_id,
1112
- dql_query,
1113
- reading_plan_id=plan_id,
1114
- )
1115
- from docent.data_models.reading import DqlPreview
1116
-
1117
- preview = DqlPreview(
1118
- columns=result.get("columns", []),
1119
- rows=result.get("rows", [])[:10],
1120
- truncated=result.get("truncated", False),
1121
- row_count=result.get("row_count", 0),
1122
- )
1123
- self._log_dql_preview(name, alias, preview)
1124
- return result
1125
- break
1126
- except Exception:
1127
- self._logger.debug("Could not preview DQL step %s", alias, exc_info=True)
1242
+ plan_state = self._get_reading_plan_state(collection_id, plan_id)
1243
+ for step in plan_state.get("steps", []):
1244
+ if step.get("alias") == alias:
1245
+ status = step.get("derived_status")
1246
+ if status == "cached":
1247
+ dql_query = step.get("dql_query")
1248
+ if not dql_query:
1249
+ raise self._plan_failure(
1250
+ collection_id=collection_id,
1251
+ plan_id=plan_id,
1252
+ alias=alias,
1253
+ name=name,
1254
+ default_label="DQL",
1255
+ message="DQL step has no query",
1256
+ )
1257
+ try:
1258
+ result = cast(Any, self).execute_dql(
1259
+ collection_id,
1260
+ dql_query,
1261
+ reading_plan_id=plan_id,
1262
+ )
1263
+ except Exception as exc:
1264
+ raise self._plan_failure(
1265
+ collection_id=collection_id,
1266
+ plan_id=plan_id,
1267
+ alias=alias,
1268
+ name=name,
1269
+ default_label="DQL",
1270
+ message=str(exc),
1271
+ ) from exc
1272
+ from docent.data_models.reading import DqlPreview
1273
+
1274
+ preview = DqlPreview(
1275
+ columns=result.get("columns", []),
1276
+ rows=result.get("rows", [])[:10],
1277
+ truncated=result.get("truncated", False),
1278
+ row_count=result.get("row_count", 0),
1279
+ )
1280
+ self._log_dql_preview(name, alias, preview)
1281
+ return result
1282
+ if status == "failed":
1283
+ raise self._plan_failure(
1284
+ collection_id=collection_id,
1285
+ plan_id=plan_id,
1286
+ alias=alias,
1287
+ name=name,
1288
+ default_label="DQL",
1289
+ message="DQL step reached failed status",
1290
+ )
1291
+ break
1128
1292
  return None
1129
1293
 
1130
1294
  def _log_step_completed_preview(
@@ -1273,8 +1437,14 @@ class DocentReadingsMixin(DocentBase):
1273
1437
  event_type = event.get("type", "")
1274
1438
  if event.get("step_alias") != alias:
1275
1439
  if event_type == "job_failed":
1276
- raise RuntimeError(
1277
- f"Reading plan job {job_id!r} failed before {alias!r} completed"
1440
+ raise self._plan_failure(
1441
+ collection_id=collection_id,
1442
+ plan_id=plan_id,
1443
+ alias=alias,
1444
+ message=(
1445
+ f"job {job_id!r} failed before step completed: "
1446
+ f"{self._event_error_message(event.get('error'))}"
1447
+ ),
1278
1448
  )
1279
1449
  if event_type == "job_completed":
1280
1450
  break
@@ -1282,7 +1452,12 @@ class DocentReadingsMixin(DocentBase):
1282
1452
  if event_type == "step_completed":
1283
1453
  return cast(str | None, event.get("reading_id"))
1284
1454
  if event_type == "step_failed":
1285
- raise RuntimeError(f"Reading plan step {alias!r} failed")
1455
+ raise self._plan_failure(
1456
+ collection_id=collection_id,
1457
+ plan_id=plan_id,
1458
+ alias=alias,
1459
+ message=self._event_error_message(event.get("error")),
1460
+ )
1286
1461
  return None
1287
1462
 
1288
1463
  def _wait_for_reading(self, reading: Reading) -> None:
@@ -1325,7 +1500,12 @@ class DocentReadingsMixin(DocentBase):
1325
1500
  reading_id = step_reading_id
1326
1501
  break
1327
1502
  if step_status == "failed":
1328
- raise RuntimeError(f"Reading plan step {alias!r} failed")
1503
+ raise self._plan_failure(
1504
+ collection_id=collection_id,
1505
+ plan_id=plan_id,
1506
+ alias=alias,
1507
+ message="step reached failed status",
1508
+ )
1329
1509
 
1330
1510
  active_job_id = plan_state.get("active_job_id")
1331
1511
  if isinstance(active_job_id, str) and active_job_id:
@@ -988,7 +988,7 @@ class LLMContext:
988
988
  if interactive:
989
989
  context_description = "You are a helpful assistant that specializes in analyzing transcripts of AI agent behavior."
990
990
  else:
991
- context_description = "You are a tasked with analyzing transcripts of AI agent behavior. You are not interacting with a user directly."
991
+ context_description = "You are tasked with analyzing transcripts of AI agent behavior. You are not interacting with a user directly."
992
992
 
993
993
  if not include_citations:
994
994
  return context_description
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "docent-python"
3
3
  description = "Docent SDK"
4
- version = "0.1.63-alpha"
4
+ version = "0.1.64-alpha"
5
5
  authors = [
6
6
  { name="Transluce", email="info@transluce.org" },
7
7
  ]