docent-python 0.1.44a0__tar.gz → 0.1.46a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/PKG-INFO +1 -1
  2. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/data_models/llm_output.py +1 -0
  3. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/llm_cache.py +16 -3
  4. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/llm_svc.py +7 -0
  5. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/providers/anthropic.py +53 -37
  6. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/providers/google.py +11 -0
  7. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/providers/openai.py +49 -10
  8. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/providers/openrouter.py +46 -2
  9. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/providers/provider_registry.py +5 -0
  10. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/chat/__init__.py +2 -0
  11. docent_python-0.1.46a0/docent/data_models/chat/response_format.py +47 -0
  12. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/__init__.py +4 -0
  13. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/impl.py +165 -118
  14. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/runner.py +9 -1
  15. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/types.py +122 -66
  16. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/util/meta_schema.json +5 -0
  17. docent_python-0.1.46a0/docent/judges/util/template_formatter.py +166 -0
  18. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/mcp/server.py +5 -5
  19. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/sdk/client.py +16 -2
  20. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/sdk/llm_context.py +1 -1
  21. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/pyproject.toml +2 -2
  22. docent_python-0.1.46a0/uv.lock +3277 -0
  23. docent_python-0.1.44a0/uv.lock +0 -2541
  24. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/.gitignore +0 -0
  25. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/LICENSE.md +0 -0
  26. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/README.md +0 -0
  27. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/__init__.py +0 -0
  28. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/__init__.py +0 -0
  29. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/data_models/__init__.py +0 -0
  30. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/data_models/exceptions.py +0 -0
  31. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/model_registry.py +0 -0
  32. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/providers/__init__.py +0 -0
  33. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/providers/common.py +0 -0
  34. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_llm_util/providers/preference_types.py +0 -0
  35. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_log_util/__init__.py +0 -0
  36. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/_log_util/logger.py +0 -0
  37. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/__init__.py +0 -0
  38. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/_tiktoken_util.py +0 -0
  39. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/agent_run.py +0 -0
  40. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/chat/content.py +0 -0
  41. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/chat/message.py +0 -0
  42. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/chat/tool.py +0 -0
  43. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/citation.py +0 -0
  44. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/formatted_objects.py +0 -0
  45. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/judge.py +0 -0
  46. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/metadata_util.py +0 -0
  47. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/regex.py +0 -0
  48. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/transcript.py +0 -0
  49. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/data_models/util.py +0 -0
  50. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/analysis.py +0 -0
  51. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/stats.py +0 -0
  52. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/util/forgiving_json.py +0 -0
  53. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/util/meta_schema.py +0 -0
  54. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/util/parse_output.py +0 -0
  55. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/judges/util/voting.py +0 -0
  56. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/loaders/load_inspect.py +0 -0
  57. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/mcp/__init__.py +0 -0
  58. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/mcp/__main__.py +0 -0
  59. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/py.typed +0 -0
  60. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/samples/__init__.py +0 -0
  61. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/samples/load.py +0 -0
  62. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/samples/log.eval +0 -0
  63. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/samples/tb_airline.json +0 -0
  64. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/sdk/__init__.py +0 -0
  65. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/sdk/agent_run_writer.py +0 -0
  66. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/sdk/llm_request.py +0 -0
  67. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/trace.py +0 -0
  68. {docent_python-0.1.44a0 → docent_python-0.1.46a0}/docent/trace_temp.py +0 -0

PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docent-python
- Version: 0.1.44a0
+ Version: 0.1.46a0
  Summary: Docent SDK
  Project-URL: Homepage, https://github.com/TransluceAI/docent
  Project-URL: Issues, https://github.com/TransluceAI/docent/issues

docent/_llm_util/data_models/llm_output.py

@@ -62,6 +62,7 @@ class LLMCompletion(BaseModel):
          tool_calls: List of tool calls made during the completion.
          finish_reason: Reason why the completion finished.
          top_logprobs: Probability distribution for top token choices.
+         reasoning_tokens: Extended thinking tokens (for reasoning models).
      """

      text: str | None = None

docent/_llm_util/llm_cache.py

@@ -9,6 +9,7 @@ from typing import Literal
  from docent._llm_util.data_models.llm_output import LLMOutput
  from docent._log_util import get_logger
  from docent.data_models.chat import ChatMessage, ToolInfo
+ from docent.data_models.chat.response_format import ResponseFormat

  logger = get_logger(__name__)

@@ -59,6 +60,7 @@ class LLMCache:
          temperature: float = 1.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
+         response_format: ResponseFormat | None = None,
      ) -> str:
          """Create a deterministic hash key from messages and model."""
          # Convert messages to a stable string representation
@@ -71,10 +73,15 @@ class LLMCache:
              json.dumps([tool.model_dump() for tool in tools], sort_keys=True) if tools else None
          )

-         # Combine all parameters into a single string
-         key_str = (
-             f"{message_str}:{model_name}:{tools_str}:{tool_choice}:{reasoning_effort}:{temperature}"
+         # Convert response_format to a stable string representation if present
+         response_format_str = (
+             json.dumps(response_format.model_dump(by_alias=True), sort_keys=True)
+             if response_format
+             else None
          )
+
+         # Combine all parameters into a single string
+         key_str = f"{message_str}:{model_name}:{tools_str}:{tool_choice}:{reasoning_effort}:{temperature}:{response_format_str}"
          if logprobs:
              key_str += f":{top_logprobs}"
          return hashlib.sha256(key_str.encode()).hexdigest()
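
The practical effect of the hunk above: two otherwise identical requests that differ only in `response_format` now hash to different cache keys, and serializing with `by_alias=True` plus `sort_keys=True` keeps the key deterministic. A minimal standalone sketch of that property (the key layout here is simplified, not the library's exact one):

```python
import hashlib
import json

from docent.data_models.chat.response_format import ResponseFormat


def sketch_key(message_str: str, model_name: str, rf: ResponseFormat | None) -> str:
    # Mirrors the diff: dump with the "schema" alias and sorted keys so the
    # same format object always serializes to the same string.
    rf_str = json.dumps(rf.model_dump(by_alias=True), sort_keys=True) if rf else None
    return hashlib.sha256(f"{message_str}:{model_name}:{rf_str}".encode()).hexdigest()


rf = ResponseFormat(name="rating", schema={"type": "object"})
assert sketch_key("hi", "gpt-4o", rf) != sketch_key("hi", "gpt-4o", None)
```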

@@ -90,6 +97,7 @@ class LLMCache:
          temperature: float = 1.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
+         response_format: ResponseFormat | None = None,
      ) -> LLMOutput | None:
          """Get cached completion for a conversation if it exists."""

@@ -102,6 +110,7 @@ class LLMCache:
              temperature=temperature,
              logprobs=logprobs,
              top_logprobs=top_logprobs,
+             response_format=response_format,
          )

          with self._get_connection() as conn:
@@ -125,6 +134,7 @@ class LLMCache:
          temperature: float = 1.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
+         response_format: ResponseFormat | None = None,
      ) -> None:
          """Cache a completion for a conversation."""

@@ -137,6 +147,7 @@ class LLMCache:
              temperature=temperature,
              logprobs=logprobs,
              top_logprobs=top_logprobs,
+             response_format=response_format,
          )

          with self._get_connection() as conn:
@@ -158,6 +169,7 @@ class LLMCache:
          temperature: float = 1.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
+         response_format: ResponseFormat | None = None,
      ) -> None:
          """Cache a completion for a conversation."""

@@ -172,6 +184,7 @@ class LLMCache:
              temperature=temperature,
              logprobs=logprobs,
              top_logprobs=top_logprobs,
+             response_format=response_format,
          )
          keys.append(key)


docent/_llm_util/llm_svc.py

@@ -37,6 +37,7 @@ from docent._llm_util.providers.provider_registry import (
  )
  from docent._log_util import get_logger
  from docent.data_models.chat import ChatMessage, ToolInfo, parse_chat_message
+ from docent.data_models.chat.response_format import ResponseFormat

  logger = get_logger(__name__)

@@ -90,6 +91,7 @@ async def _parallelize_calls(
      semaphore: Semaphore,
      # use_tqdm: bool,
      cache: LLMCache | None = None,
+     response_format: ResponseFormat | None = None,
  ):
      base_func = partial(
          single_output_getter,
@@ -103,6 +105,7 @@ async def _parallelize_calls(
          logprobs=logprobs,
          top_logprobs=top_logprobs,
          timeout=timeout,
+         response_format=response_format,
      )

      responses: list[LLMOutput | None] = [None for _ in inputs]
@@ -143,6 +146,7 @@ async def _parallelize_calls(
              temperature=temperature,
              logprobs=logprobs,
              top_logprobs=top_logprobs,
+             response_format=response_format,
          )
          if cache is not None
          else None
@@ -271,6 +275,7 @@ async def _parallelize_calls(
              temperature=temperature,
              logprobs=logprobs,
              top_logprobs=top_logprobs,
+             response_format=response_format,
          )
          return len(indices)
      else:
@@ -351,6 +356,7 @@ class BaseLLMService:
          validation_callback: AsyncLLMOutputStreamingCallback | None = None,
          completion_callback: AsyncLLMOutputStreamingCallback | None = None,
          use_cache: bool = False,
+         response_format: ResponseFormat | None = None,
          _api_key_overrides: dict[str, str] = dict(),
      ) -> list[LLMOutput]:
          """Request completions from a configured LLM provider."""
@@ -424,6 +430,7 @@
          timeout=timeout,
          semaphore=self._semaphore,
          cache=cache,
+         response_format=response_format,
      )
      assert len(outputs) == len(inputs), "Number of outputs must match number of messages"


docent/_llm_util/providers/anthropic.py

@@ -5,6 +5,7 @@ import backoff
  # all errors: https://docs.anthropic.com/en/api/errors
  from anthropic import (
      AsyncAnthropic,
+     AsyncStream,
      AuthenticationError,
      BadRequestError,
      NotFoundError,
@@ -12,7 +13,6 @@ from anthropic import (
      RateLimitError,
      UnprocessableEntityError,
  )
- from anthropic._types import NOT_GIVEN
  from anthropic.types import (
      InputJSONDelta,
      Message,
@@ -70,6 +70,7 @@ from docent.data_models.chat import (
      ToolInfo,
      ToolMessage,
  )
+ from docent.data_models.chat.response_format import ResponseFormat

  logger = get_logger(__name__)

@@ -217,34 +218,43 @@ async def get_anthropic_chat_completion_streaming_async(
      logprobs: bool = False,
      top_logprobs: int | None = None,
      timeout: float = 5.0,
+     response_format: ResponseFormat | None = None,
  ):
+     if response_format is not None:
+         raise NotImplementedError(
+             "Structured outputs (response_format) are not implemented for Anthropic yet."
+         )
      if logprobs or top_logprobs is not None:
          raise NotImplementedError(
              "We have not implemented logprobs or top_logprobs for Anthropic yet."
          )

      system, input_messages = parse_chat_messages(messages)
-     input_tools = parse_tools(tools) if tools else NOT_GIVEN

      try:
          async with async_timeout_ctx(timeout):
-             stream = await client.messages.create(
-                 model=model_name,
-                 messages=input_messages,
-                 thinking=(
-                     {
-                         "type": "enabled",
-                         "budget_tokens": reasoning_budget(max_new_tokens, reasoning_effort),
-                     }
-                     if reasoning_effort
-                     else NOT_GIVEN
-                 ),
-                 tools=input_tools,
-                 tool_choice=_parse_tool_choice(tool_choice) or NOT_GIVEN,
-                 max_tokens=max_new_tokens,
-                 temperature=temperature,
-                 system=system if system is not None else NOT_GIVEN,
-                 stream=True,
+             create_kwargs: dict[str, Any] = {
+                 "model": model_name,
+                 "messages": input_messages,
+                 "max_tokens": max_new_tokens,
+                 "temperature": temperature,
+                 "stream": True,
+             }
+             if reasoning_effort:
+                 create_kwargs["thinking"] = {
+                     "type": "enabled",
+                     "budget_tokens": reasoning_budget(max_new_tokens, reasoning_effort),
+                 }
+             if tools:
+                 create_kwargs["tools"] = parse_tools(tools)
+             if tool_choice_param := _parse_tool_choice(tool_choice):
+                 create_kwargs["tool_choice"] = tool_choice_param
+             if system is not None:
+                 create_kwargs["system"] = system
+
+             stream = cast(
+                 AsyncStream[RawMessageStreamEvent],
+                 await client.messages.create(**create_kwargs),
              )

          llm_output_partial = None
@@ -399,6 +409,7 @@ async def get_anthropic_chat_completion_async(
      logprobs: bool = False,
      top_logprobs: int | None = None,
      timeout: float = 5.0,
+     response_format: ResponseFormat | None = None,
  ) -> LLMOutput:
      """
      Note from kevin 1/29/2025:
@@ -409,33 +420,38 @@ async def get_anthropic_chat_completion_async(
      We should actually implement this at some point, but it does not work.
      """

+     if response_format is not None:
+         raise NotImplementedError(
+             "Structured outputs (response_format) are not implemented for Anthropic yet."
+         )
      if logprobs or top_logprobs is not None:
          raise NotImplementedError(
              "We have not implemented logprobs or top_logprobs for Anthropic yet."
          )

      system, input_messages = parse_chat_messages(messages)
-     input_tools = parse_tools(tools) if tools else NOT_GIVEN

      try:
          async with async_timeout_ctx(timeout):
-             raw_output = await client.messages.create(
-                 model=model_name,
-                 messages=input_messages,
-                 thinking=(
-                     {
-                         "type": "enabled",
-                         "budget_tokens": reasoning_budget(max_new_tokens, reasoning_effort),
-                     }
-                     if reasoning_effort
-                     else NOT_GIVEN
-                 ),
-                 tools=input_tools,
-                 tool_choice=_parse_tool_choice(tool_choice) or NOT_GIVEN,
-                 max_tokens=max_new_tokens,
-                 temperature=temperature,
-                 system=system if system is not None else NOT_GIVEN,
-             )
+             create_kwargs: dict[str, Any] = {
+                 "model": model_name,
+                 "messages": input_messages,
+                 "max_tokens": max_new_tokens,
+                 "temperature": temperature,
+             }
+             if reasoning_effort:
+                 create_kwargs["thinking"] = {
+                     "type": "enabled",
+                     "budget_tokens": reasoning_budget(max_new_tokens, reasoning_effort),
+                 }
+             if tools:
+                 create_kwargs["tools"] = parse_tools(tools)
+             if tool_choice_param := _parse_tool_choice(tool_choice):
+                 create_kwargs["tool_choice"] = tool_choice_param
+             if system is not None:
+                 create_kwargs["system"] = system
+
+             raw_output = cast(Message, await client.messages.create(**create_kwargs))

      output = parse_anthropic_completion(raw_output, model_name)
      if output.first and output.first.finish_reason == "length" and output.first.no_text:
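
Both Anthropic functions trade the SDK's `NOT_GIVEN` sentinel for a kwargs dict that only ever contains keys the caller actually set, which also gives the new `response_format` guard one obvious place to live. Here is the pattern in isolation, as a hedged sketch with no Anthropic dependency (`build_kwargs` and the model string are made up for illustration; in the diff the result feeds `client.messages.create`):

```python
from typing import Any


def build_kwargs(
    model: str,
    max_tokens: int,
    system: str | None = None,
    tools: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    # Always-present arguments first...
    kwargs: dict[str, Any] = {"model": model, "max_tokens": max_tokens}
    # ...then optional ones only when set, so absent keys are truly absent
    # instead of sentinel values the SDK has to strip back out.
    if system is not None:
        kwargs["system"] = system
    if tools:
        kwargs["tools"] = tools
    return kwargs


assert "system" not in build_kwargs("example-model", 1024)
assert "system" in build_kwargs("example-model", 1024, system="Be terse.")
```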

docent/_llm_util/providers/google.py

@@ -28,6 +28,7 @@ from docent._llm_util.providers.common import (
  )
  from docent._log_util import get_logger
  from docent.data_models.chat import ChatMessage, Content, ContentText, ToolCall, ToolInfo
+ from docent.data_models.chat.response_format import ResponseFormat


  def get_google_client_async(api_key: str | None = None) -> AsyncGoogle:
@@ -82,7 +83,12 @@ async def get_google_chat_completion_async(
      logprobs: bool = False,
      top_logprobs: int | None = None,
      timeout: float = 5.0,
+     response_format: ResponseFormat | None = None,
  ) -> LLMOutput:
+     if response_format is not None:
+         raise NotImplementedError(
+             "Structured outputs (response_format) are not implemented for Google yet."
+         )
      if logprobs or top_logprobs is not None:
          raise NotImplementedError(
              "We have not implemented logprobs or top_logprobs for Google yet."
@@ -145,7 +151,12 @@ async def get_google_chat_completion_streaming_async(
      logprobs: bool = False,
      top_logprobs: int | None = None,
      timeout: float = 5.0,
+     response_format: ResponseFormat | None = None,
  ) -> LLMOutput:
+     if response_format is not None:
+         raise NotImplementedError(
+             "Structured outputs (response_format) are not implemented for Google yet."
+         )
      if logprobs or top_logprobs is not None:
          raise NotImplementedError(
              "We have not implemented logprobs or top_logprobs for Google yet."

docent/_llm_util/providers/openai.py

@@ -40,6 +40,10 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
      Function as OpenAIFunctionParam,
  )
  from openai.types.shared_params.function_definition import FunctionDefinition
+ from openai.types.shared_params.response_format_json_schema import (
+     JSONSchema,
+     ResponseFormatJSONSchema,
+ )

  from docent._llm_util.data_models.exceptions import (
      CompletionTooLongException,
@@ -70,6 +74,7 @@ from docent.data_models.chat import (
      ToolInfo,
      ToolMessage,
  )
+ from docent.data_models.chat.response_format import ResponseFormat

  logger = get_logger(__name__)
  DEFAULT_TIKTOKEN_ENCODING = "cl100k_base"
@@ -194,6 +199,42 @@ def parse_tools(tools: list[ToolInfo]) -> list[ChatCompletionToolParam]:
      return result


+ def _build_response_format(
+     response_format: ResponseFormat | None,
+ ) -> ResponseFormatJSONSchema | None:
+     """Build OpenAI response_format dict from unified ResponseFormat.
+
+     Converts the unified ResponseFormat specification to OpenAI's
+     expected response_format structure for structured outputs.
+
+     Args:
+         response_format: The unified response format specification, or None.
+
+     Returns:
+         OpenAI response_format dict if provided, empty dict otherwise.
+
+     Raises:
+         ValueError: If response_format.type is not 'json_schema'.
+     """
+     if response_format is None:
+         return None
+
+     if response_format.type != "json_schema":
+         raise ValueError(
+             f"Unsupported response format type: {response_format.type}. "
+             "Only 'json_schema' is currently supported."
+         )
+
+     return ResponseFormatJSONSchema(
+         type="json_schema",
+         json_schema=JSONSchema(
+             name=response_format.name,
+             strict=response_format.strict,
+             schema=response_format.schema_,
+         ),
+     )
+
+
  @backoff.on_exception(
      backoff.expo,
      exception=(Exception,),
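
Concretely, `_build_response_format` returns OpenAI's `ResponseFormatJSONSchema` typed dict, which is a plain dict at runtime. A standalone sketch of the same field-for-field mapping, using the SDK types this diff imports at the top of the file (the `rf` values are illustrative):

```python
from openai.types.shared_params.response_format_json_schema import (
    JSONSchema,
    ResponseFormatJSONSchema,
)

from docent.data_models.chat.response_format import ResponseFormat

rf = ResponseFormat(
    name="rating",
    schema={"type": "object", "properties": {"score": {"type": "number"}}},
)

# The same mapping _build_response_format performs.
param = ResponseFormatJSONSchema(
    type="json_schema",
    json_schema=JSONSchema(name=rf.name, strict=rf.strict, schema=rf.schema_),
)
assert param["json_schema"]["name"] == "rating"  # TypedDicts are dicts at runtime
```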

@@ -215,16 +256,14 @@ async def get_openai_chat_completion_streaming_async(
      logprobs: bool = False,
      top_logprobs: int | None = None,
      timeout: float = 30.0,
+     response_format: ResponseFormat | None = None,
  ):
-     input_messages = parse_chat_messages(messages)
-     input_tools = parse_tools(tools) if tools else omit
-
      try:
          async with async_timeout_ctx(timeout):
              stream = await client.chat.completions.create(
                  model=model_name,
-                 messages=input_messages,
-                 tools=input_tools,
+                 messages=parse_chat_messages(messages),
+                 tools=parse_tools(tools) if tools else omit,
                  tool_choice=tool_choice or omit,
                  max_completion_tokens=max_new_tokens,
                  temperature=temperature,
@@ -233,6 +272,7 @@ async def get_openai_chat_completion_streaming_async(
                  top_logprobs=top_logprobs,
                  stream_options={"include_usage": True},
                  stream=True,
+                 response_format=_build_response_format(response_format) or omit,
              )

          llm_output_partial = None
@@ -406,22 +446,21 @@ async def get_openai_chat_completion_async(
      logprobs: bool = False,
      top_logprobs: int | None = None,
      timeout: float = 5.0,
+     response_format: ResponseFormat | None = None,
  ) -> LLMOutput:
-     input_messages = parse_chat_messages(messages)
-     input_tools = parse_tools(tools) if tools else omit
-
      try:
          async with async_timeout_ctx(timeout):  # type: ignore
              raw_output = await client.chat.completions.create(
                  model=model_name,
-                 messages=input_messages,
-                 tools=input_tools,
+                 messages=parse_chat_messages(messages),
+                 tools=parse_tools(tools) if tools else omit,
                  tool_choice=tool_choice or omit,
                  max_completion_tokens=max_new_tokens,
                  temperature=temperature,
                  reasoning_effort=reasoning_effort or omit,
                  logprobs=logprobs,
                  top_logprobs=top_logprobs,
+                 response_format=_build_response_format(response_format) or omit,
              )

          # If the completion is empty and was truncated (likely due to too much reasoning), raise an exception

docent/_llm_util/providers/openrouter.py

@@ -31,6 +31,7 @@ from docent.data_models.chat import (
      ToolInfo,
      ToolMessage,
  )
+ from docent.data_models.chat.response_format import ResponseFormat

  logger = get_logger(__name__)

@@ -59,6 +60,7 @@ class OpenRouterClient:
          max_tokens: int = 32,
          temperature: float = 1.0,
          timeout: float = 30.0,
+         response_format: dict[str, Any] | None = None,
      ) -> dict[str, Any]:
          """Make an async chat completion request."""
          url = f"{self.base_url}/chat/completions"
@@ -74,6 +76,8 @@ class OpenRouterClient:
              payload["tools"] = tools
          if tool_choice:
              payload["tool_choice"] = tool_choice
+         if response_format:
+             payload["response_format"] = response_format

          async with aiohttp.ClientSession() as session:
              async with session.post(
@@ -203,6 +207,37 @@ def parse_tools(tools: list[ToolInfo]) -> list[dict[str, Any]]:
      return result


+ def _build_response_format(response_format: ResponseFormat | None) -> dict[str, Any] | None:
+     """Convert ResponseFormat to OpenRouter's response_format parameter.
+
+     Args:
+         response_format: The unified response format specification.
+
+     Returns:
+         OpenRouter-formatted response_format dict, or None if not provided.
+
+     Raises:
+         ValueError: If response_format.type is not a supported format type.
+     """
+     if response_format is None:
+         return None
+
+     if response_format.type != "json_schema":
+         raise ValueError(
+             f"Unsupported response format type: {response_format.type}. "
+             "Only 'json_schema' is currently supported."
+         )
+
+     return {
+         "type": "json_schema",
+         "json_schema": {
+             "name": response_format.name,
+             "strict": response_format.strict,
+             "schema": response_format.schema_,
+         },
+     }
+
+
  def _parse_openrouter_tool_call(tc: dict[str, Any]) -> ToolCall:
      """Parse tool call from OpenRouter response."""
      if tc.get("type") != "function":
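
OpenRouter's wire format matches OpenAI's here, which is why the two `_build_response_format` helpers emit the same shape; `chat_completions_create` then attaches the dict to the POST body under `response_format`. A sketch of the resulting request payload, under the assumption that the payload keys follow the `chat_completions_create` hunk above (model id and schema are illustrative):

```python
import json

payload = {
    "model": "openai/gpt-4o-mini",  # any OpenRouter model id
    "messages": [{"role": "user", "content": "Rate this transcript from 0 to 1."}],
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "rating",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {"score": {"type": "number"}},
                "required": ["score"],
            },
        },
    },
}
print(json.dumps(payload, indent=2))
```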

@@ -232,7 +267,10 @@ def _parse_openrouter_tool_call(tc: dict[str, Any]) -> ToolCall:
      )


- def parse_openrouter_completion(response: dict[str, Any], model: str) -> LLMOutput:
+ def parse_openrouter_completion(
+     response: dict[str, Any],
+     model: str,
+ ) -> LLMOutput:
      """Parse OpenRouter completion response."""
      choices = response.get("choices", [])
      if not choices:
@@ -252,10 +290,11 @@ def parse_openrouter_completion(response: dict[str, Any], model: str) -> LLMOutp
      for choice in choices:
          message = choice.get("message", {})
          tool_calls_data = message.get("tool_calls")
+         content = message.get("content")

          completions.append(
              LLMCompletion(
-                 text=message.get("content"),
+                 text=content,
                  finish_reason=choice.get("finish_reason"),
                  tool_calls=(
                      [_parse_openrouter_tool_call(tc) for tc in tool_calls_data]
@@ -292,6 +331,7 @@ async def get_openrouter_chat_completion_async(
      logprobs: bool = False,
      top_logprobs: int | None = None,
      timeout: float = 30.0,
+     response_format: ResponseFormat | None = None,
  ) -> LLMOutput:
      """Get completion from OpenRouter."""
      if logprobs or top_logprobs is not None:
@@ -304,6 +344,7 @@

      input_messages = parse_chat_messages(messages)
      input_tools = parse_tools(tools) if tools else None
+     input_response_format = _build_response_format(response_format)

      response = await client.chat_completions_create(
          model=model_name,
@@ -313,6 +354,7 @@
          max_tokens=max_new_tokens,
          temperature=temperature,
          timeout=timeout,
+         response_format=input_response_format,
      )

      output = parse_openrouter_completion(response, model_name)
@@ -346,6 +388,7 @@ async def get_openrouter_chat_completion_streaming_async(
      logprobs: bool = False,
      top_logprobs: int | None = None,
      timeout: float = 30.0,
+     response_format: ResponseFormat | None = None,
  ) -> LLMOutput:
      """Get streaming completion from OpenRouter (falls back to non-streaming)."""
      logger.warning("Streaming not yet implemented for OpenRouter, using non-streaming.")
@@ -362,6 +405,7 @@
          logprobs=logprobs,
          top_logprobs=top_logprobs,
          timeout=timeout,
+         response_format=response_format,
      )



docent/_llm_util/providers/provider_registry.py

@@ -26,6 +26,7 @@ from docent._llm_util.providers.openrouter import (
      get_openrouter_chat_completion_streaming_async,
  )
  from docent.data_models.chat import ChatMessage, ToolInfo
+ from docent.data_models.chat.response_format import ResponseFormat


  class SingleOutputGetter(Protocol):
@@ -49,6 +50,7 @@ class SingleOutputGetter(Protocol):
          logprobs: bool,
          top_logprobs: int | None,
          timeout: float,
+         response_format: ResponseFormat | None,
      ) -> LLMOutput:
          """Get a single completion from an LLM.

@@ -64,6 +66,7 @@ class SingleOutputGetter(Protocol):
          logprobs: Whether to return log probabilities.
          top_logprobs: Number of most likely tokens to return probabilities for.
          timeout: Maximum time to wait for a response in seconds.
+         response_format: Optional structured output format specification.

      Returns:
          LLMOutput: The model's response.
@@ -93,6 +96,7 @@ class SingleStreamingOutputGetter(Protocol):
          logprobs: bool,
          top_logprobs: int | None,
          timeout: float,
+         response_format: ResponseFormat | None,
      ) -> LLMOutput:
          """Get a streaming completion from an LLM.

@@ -109,6 +113,7 @@ class SingleStreamingOutputGetter(Protocol):
          logprobs: Whether to return log probabilities.
          top_logprobs: Number of most likely tokens to return probabilities for.
          timeout: Maximum time to wait for a response in seconds.
+         response_format: Optional structured output format specification.

      Returns:
          LLMOutput: The complete model response after streaming finishes.

docent/data_models/chat/__init__.py

@@ -10,6 +10,7 @@ from docent.data_models.chat.message import (
      parse_chat_message,
      parse_docent_chat_message,
  )
+ from docent.data_models.chat.response_format import ResponseFormat
  from docent.data_models.chat.tool import (
      ToolCall,
      ToolCallContent,
@@ -28,6 +29,7 @@ __all__ = [
      "Content",
      "ContentReasoning",
      "ContentText",
+     "ResponseFormat",
      "ToolCall",
      "ToolCallContent",
      "ToolInfo",

docent/data_models/chat/response_format.py (new file)

@@ -0,0 +1,47 @@
+ """Response format specification for structured outputs."""
+
+ from __future__ import annotations
+
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, Field
+
+
+ class ResponseFormat(BaseModel):
+     """Unified response format specification for structured outputs.
+
+     Supports JSON Schema-based constrained decoding across LLM providers.
+     Each provider converts this to their specific format:
+     - OpenAI: response_format parameter
+     - Anthropic: output_format parameter (with beta header)
+     - OpenRouter: response_format parameter (same as OpenAI)
+
+     Attributes:
+         type: The format type. Currently only "json_schema" is supported.
+         name: A name for the schema (required by all providers).
+         schema_: The JSON Schema definition as a dict.
+         strict: Whether to enforce strict schema adherence (default True).
+
+     Example:
+         ```python
+         response_format = ResponseFormat(
+             name="analysis_result",
+             schema={
+                 "type": "object",
+                 "properties": {
+                     "score": {"type": "number"},
+                     "explanation": {"type": "string"},
+                 },
+                 "required": ["score", "explanation"],
+             },
+         )
+         ```
+     """
+
+     type: Literal["json_schema"] = "json_schema"
+     name: str
+     # Named `schema_` to avoid conflict with Pydantic's internal schema methods
+     schema_: dict[str, Any] = Field(alias="schema")
+     strict: bool = True
+
+     model_config = {"populate_by_name": True}
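
One subtlety worth spelling out: the field is `schema_` in Python but serializes as `schema`, and `populate_by_name` lets callers construct with either spelling. A short demonstration of standard Pydantic v2 alias behavior as it applies to this model:

```python
from docent.data_models.chat.response_format import ResponseFormat

spec = {"type": "object", "properties": {"ok": {"type": "boolean"}}}

# Construct via the alias (as in the docstring example) or via the field name.
rf_alias = ResponseFormat(name="check", schema=spec)
rf_field = ResponseFormat(name="check", schema_=spec)
assert rf_alias == rf_field

# by_alias=True restores the "schema" key that the providers (and the
# llm_cache key serialization) expect to see.
dumped = rf_alias.model_dump(by_alias=True)
assert "schema" in dumped and "schema_" not in dumped
```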