docent-python 0.1.41a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docent-python might be problematic. Click here for more details.

Files changed (59) hide show
  1. docent/__init__.py +4 -0
  2. docent/_llm_util/__init__.py +0 -0
  3. docent/_llm_util/data_models/__init__.py +0 -0
  4. docent/_llm_util/data_models/exceptions.py +48 -0
  5. docent/_llm_util/data_models/llm_output.py +331 -0
  6. docent/_llm_util/llm_cache.py +193 -0
  7. docent/_llm_util/llm_svc.py +472 -0
  8. docent/_llm_util/model_registry.py +134 -0
  9. docent/_llm_util/providers/__init__.py +0 -0
  10. docent/_llm_util/providers/anthropic.py +537 -0
  11. docent/_llm_util/providers/common.py +41 -0
  12. docent/_llm_util/providers/google.py +530 -0
  13. docent/_llm_util/providers/openai.py +745 -0
  14. docent/_llm_util/providers/openrouter.py +375 -0
  15. docent/_llm_util/providers/preference_types.py +104 -0
  16. docent/_llm_util/providers/provider_registry.py +164 -0
  17. docent/_log_util/__init__.py +3 -0
  18. docent/_log_util/logger.py +141 -0
  19. docent/data_models/__init__.py +14 -0
  20. docent/data_models/_tiktoken_util.py +91 -0
  21. docent/data_models/agent_run.py +473 -0
  22. docent/data_models/chat/__init__.py +37 -0
  23. docent/data_models/chat/content.py +56 -0
  24. docent/data_models/chat/message.py +191 -0
  25. docent/data_models/chat/tool.py +109 -0
  26. docent/data_models/citation.py +187 -0
  27. docent/data_models/formatted_objects.py +84 -0
  28. docent/data_models/judge.py +17 -0
  29. docent/data_models/metadata_util.py +16 -0
  30. docent/data_models/regex.py +56 -0
  31. docent/data_models/transcript.py +305 -0
  32. docent/data_models/util.py +170 -0
  33. docent/judges/__init__.py +23 -0
  34. docent/judges/analysis.py +77 -0
  35. docent/judges/impl.py +587 -0
  36. docent/judges/runner.py +129 -0
  37. docent/judges/stats.py +205 -0
  38. docent/judges/types.py +320 -0
  39. docent/judges/util/forgiving_json.py +108 -0
  40. docent/judges/util/meta_schema.json +86 -0
  41. docent/judges/util/meta_schema.py +29 -0
  42. docent/judges/util/parse_output.py +68 -0
  43. docent/judges/util/voting.py +139 -0
  44. docent/loaders/load_inspect.py +215 -0
  45. docent/py.typed +0 -0
  46. docent/samples/__init__.py +3 -0
  47. docent/samples/load.py +9 -0
  48. docent/samples/log.eval +0 -0
  49. docent/samples/tb_airline.json +1 -0
  50. docent/sdk/__init__.py +0 -0
  51. docent/sdk/agent_run_writer.py +317 -0
  52. docent/sdk/client.py +1186 -0
  53. docent/sdk/llm_context.py +432 -0
  54. docent/trace.py +2741 -0
  55. docent/trace_temp.py +1086 -0
  56. docent_python-0.1.41a0.dist-info/METADATA +33 -0
  57. docent_python-0.1.41a0.dist-info/RECORD +59 -0
  58. docent_python-0.1.41a0.dist-info/WHEEL +4 -0
  59. docent_python-0.1.41a0.dist-info/licenses/LICENSE.md +13 -0
@@ -0,0 +1,191 @@
1
+ from logging import getLogger
2
+ from typing import Annotated, Any, Literal
3
+
4
+ from pydantic import BaseModel, Discriminator
5
+
6
+ from docent.data_models.chat.content import Content
7
+ from docent.data_models.chat.tool import ToolCall
8
+ from docent.data_models.citation import InlineCitation
9
+
10
+ logger = getLogger(__name__)
11
+
12
+
13
class BaseChatMessage(BaseModel):
    """Base class shared by all chat message types.

    Attributes:
        id: Optional unique identifier for the message.
        content: The message content, either a plain string or a list of
            Content parts.
        role: The role of the message sender (system, user, assistant, tool).
        metadata: Additional structured metadata about the message.
    """

    id: str | None = None
    content: str | list[Content]
    role: Literal["system", "user", "assistant", "tool"]
    metadata: dict[str, Any] | None = None

    @property
    def text(self) -> str:
        """Return the textual content of the message.

        A plain-string `content` is returned as-is. A list of Content parts
        is reduced to just its text parts, joined with newlines.
        """
        if isinstance(self.content, str):
            return self.content
        return "\n".join(part.text for part in self.content if part.type == "text")
41
+
42
+
43
class SystemMessage(BaseChatMessage):
    """System message in a chat conversation.

    Attributes:
        role: Always set to "system".
    """

    # Narrowed literal acts as the discriminator value for the ChatMessage
    # union; type: ignore silences the override of the broader base literal.
    role: Literal["system"] = "system"  # type: ignore
51
+
52
+
53
class UserMessage(BaseChatMessage):
    """User message in a chat conversation.

    Attributes:
        role: Always set to "user".
        tool_call_id: Optional list of tool call IDs this message is responding to.
    """

    # Narrowed literal acts as the discriminator value for the ChatMessage union.
    role: Literal["user"] = "user"  # type: ignore
    # NOTE: a list here, unlike ToolMessage.tool_call_id which is a single str.
    tool_call_id: list[str] | None = None
63
+
64
+
65
class AssistantMessage(BaseChatMessage):
    """Assistant message in a chat conversation.

    Attributes:
        role: Always set to "assistant".
        model: Optional identifier for the model that generated this message.
        tool_calls: Optional list of tool calls made by the assistant.
    """

    # Narrowed literal acts as the discriminator value for the ChatMessage union.
    role: Literal["assistant"] = "assistant"  # type: ignore
    model: str | None = None
    tool_calls: list[ToolCall] | None = None
77
+
78
+
79
class DocentAssistantMessage(AssistantMessage):
    """Assistant message with additional chat-session-specific metadata.

    Extends AssistantMessage with fields that are only relevant in Docent
    chat contexts.

    Attributes:
        citations: Optional list of citations referenced in the message content.
        suggested_messages: Optional list of suggested followup messages.
    """

    citations: list[InlineCitation] | None = None
    suggested_messages: list[str] | None = None
91
+
92
+
93
class ToolMessage(BaseChatMessage):
    """Tool message in a chat conversation.

    Attributes:
        role: Always set to "tool".
        tool_call_id: Optional ID of the tool call this message is responding to.
        function: Optional name of the function that was called.
        error: Optional error information if the tool call failed.
    """

    # Narrowed literal acts as the discriminator value for the ChatMessage union.
    role: Literal["tool"] = "tool"  # type: ignore

    # NOTE: a single str here, unlike UserMessage.tool_call_id which is a list.
    tool_call_id: str | None = None
    function: str | None = None
    error: dict[str, Any] | None = None
108
+
109
+
110
# Discriminated unions over the message classes; pydantic dispatches on `role`.
ChatMessage = Annotated[
    SystemMessage | UserMessage | AssistantMessage | ToolMessage,
    Discriminator("role"),
]
"""Type alias for any chat message type, discriminated by the role field.

This is the base message union used in Transcript and AgentRun contexts.
For chat sessions, use DocentChatMessage instead.
"""

DocentChatMessage = Annotated[
    SystemMessage | UserMessage | DocentAssistantMessage | ToolMessage,
    Discriminator("role"),
]
"""Type alias for chat session messages with chat-specific assistant metadata
(DocentAssistantMessage replaces AssistantMessage)."""
125
+
126
+
127
def parse_chat_message(message_data: dict[str, Any] | ChatMessage) -> ChatMessage:
    """Parse a message dictionary or object into the appropriate ChatMessage subclass.

    This parses base messages without chat-specific fields. For chat sessions,
    use parse_docent_chat_message instead.

    Args:
        message_data: A dictionary or ChatMessage object representing a chat message.

    Returns:
        ChatMessage: An instance of a ChatMessage subclass selected by the role.

    Raises:
        ValueError: If the message role is unknown.
    """
    # Already-parsed objects pass straight through.
    if isinstance(message_data, (SystemMessage, UserMessage, AssistantMessage, ToolMessage)):
        return message_data

    role = message_data.get("role")
    model_by_role = {
        "system": SystemMessage,
        "user": UserMessage,
        "assistant": AssistantMessage,
        "tool": ToolMessage,
    }
    model_cls = model_by_role.get(role)
    if model_cls is None:
        raise ValueError(f"Unknown message role: {role}")
    return model_cls.model_validate(message_data)
156
+
157
+
158
def parse_docent_chat_message(
    message_data: dict[str, Any] | DocentChatMessage,
) -> DocentChatMessage:
    """Parse a message dictionary or object into the appropriate DocentChatMessage subclass.

    This handles chat session messages, where the assistant role maps to
    DocentAssistantMessage (which carries citations and suggested_messages).

    Args:
        message_data: A dictionary or DocentChatMessage object representing a
            chat session message.

    Returns:
        DocentChatMessage: An instance of a message subclass selected by the role.

    Raises:
        ValueError: If the message role is unknown.
    """
    # Already-parsed objects pass straight through (plain AssistantMessage
    # instances are accepted as well).
    if isinstance(
        message_data,
        (SystemMessage, UserMessage, DocentAssistantMessage, AssistantMessage, ToolMessage),
    ):
        return message_data

    role = message_data.get("role")
    model_by_role = {
        "system": SystemMessage,
        "user": UserMessage,
        "assistant": DocentAssistantMessage,
        "tool": ToolMessage,
    }
    model_cls = model_by_role.get(role)
    if model_cls is None:
        raise ValueError(f"Unknown message role: {role}")
    return model_cls.model_validate(message_data)
@@ -0,0 +1,109 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Literal
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
@dataclass
class ToolCall:
    """Tool call information.

    Attributes:
        id: Unique identifier for tool call.
        function: Name of the function called.
        arguments: Arguments passed to the function.
        type: Type of tool call. Can only be "function" or None.
        parse_error: Error which occurred parsing the tool call, if any.
        view: Custom view of the tool call input.
    """

    id: str
    function: str
    arguments: dict[str, Any]
    type: Literal["function"] | None = None
    parse_error: str | None = None
    # Forward reference to ToolCallContent (defined below) is valid because
    # this module uses `from __future__ import annotations`.
    view: ToolCallContent | None = None
28
+
29
+
30
class ToolCallContent(BaseModel):
    """Content to include in a tool call view.

    Attributes:
        title: Optional (plain text) title for the tool call content.
        format: Rendering format, either "text" or "markdown".
        content: The text or markdown content itself.
    """

    title: str | None = None
    format: Literal["text", "markdown"]
    content: str
42
+
43
+
44
class ToolParam(BaseModel):
    """A parameter for a tool function.

    Attributes:
        name: The name of the parameter.
        description: A description of what the parameter does.
        input_schema: JSON Schema describing the parameter's type and
            validation rules.
    """

    name: str
    description: str
    input_schema: dict[str, Any]
56
+
57
+
58
class ToolParams(BaseModel):
    """Description of a tool parameters object in JSON Schema format.

    Attributes:
        type: The type of the parameters object, always 'object'.
        properties: Dictionary mapping parameter names to their ToolParam
            definitions.
        required: List of required parameter names.
        additionalProperties: Whether additional properties are allowed beyond
            those specified. Always False.
    """

    type: Literal["object"] = "object"
    properties: dict[str, ToolParam] = Field(default_factory=dict)
    required: list[str] = Field(default_factory=list)
    # camelCase kept deliberately to match the JSON Schema keyword spelling.
    additionalProperties: bool = False
73
+
74
+
75
class ToolInfo(BaseModel):
    """Specification of a tool (JSON Schema compatible).

    If you are implementing a ModelAPI, most LLM libraries can
    be passed this object (dumped to a dict) directly as a function
    specification. For example, in the OpenAI provider:

    ```python
    ChatCompletionToolParam(
        type="function",
        function=tool.model_dump(exclude_none=True),
    )
    ```

    In some cases the field names don't match up exactly. In that case
    call `model_dump()` on the `parameters` field. For example, in the
    Anthropic provider:

    ```python
    ToolParam(
        name=tool.name,
        description=tool.description,
        input_schema=tool.parameters.model_dump(exclude_none=True),
    )
    ```

    Attributes:
        name: Name of tool.
        description: Short description of tool.
        parameters: JSON Schema of the tool parameters object.
    """

    name: str
    description: str
    # Defaults to an empty-but-valid schema (object with no properties).
    parameters: ToolParams = Field(default_factory=ToolParams)
@@ -0,0 +1,187 @@
1
+ from typing import Annotated, Literal, Union
2
+
3
+ from pydantic import BaseModel, Discriminator
4
+
5
+
6
class CitationTargetTextRange(BaseModel):
    """Optional text-range bounds for a citation target.

    Patterns delimit the cited text; in practice only start_pattern is
    populated by the parser in this module (see _extract_range_pattern).
    """

    start_pattern: str | None = None
    end_pattern: str | None = None
9
+
10
+
11
class ResolvedCitationItem(BaseModel):
    """Marker base class for resolved citation items (see subclasses below)."""

    pass
13
+
14
+
15
class CitationTarget(BaseModel):
    """A resolved citation target: an item plus an optional text range."""

    # Forward reference: ResolvedCitationItemUnion is defined later in this module.
    item: "ResolvedCitationItemUnion"
    text_range: CitationTargetTextRange | None = None
18
+
19
+
20
class ParsedCitation(BaseModel):
    """A citation parsed from raw text, before resolution.

    start_idx/end_idx are character offsets of the bracketed span in the
    source text; item_alias is the raw token (e.g. "T<key>B<idx>") still to
    be resolved to an actual item.
    """

    start_idx: int
    end_idx: int
    item_alias: str
    text_range: CitationTargetTextRange | None = None
25
+
26
+
27
class InlineCitation(BaseModel):
    """A resolved citation anchored to a character span of message text."""

    start_idx: int
    end_idx: int
    target: CitationTarget
31
+
32
+
33
class AgentRunMetadataItem(ResolvedCitationItem):
    """Citation target: a metadata key on an agent run."""

    # Discriminator value for ResolvedCitationItemUnion.
    item_type: Literal["agent_run_metadata"] = "agent_run_metadata"
    agent_run_id: str
    collection_id: str
    metadata_key: str
38
+
39
+
40
class TranscriptMetadataItem(ResolvedCitationItem):
    """Citation target: a metadata key on a transcript."""

    # Discriminator value for ResolvedCitationItemUnion.
    item_type: Literal["transcript_metadata"] = "transcript_metadata"
    agent_run_id: str
    collection_id: str
    transcript_id: str
    metadata_key: str
46
+
47
+
48
class TranscriptBlockMetadataItem(ResolvedCitationItem):
    """Citation target: a metadata key on a specific block of a transcript."""

    # Discriminator value for ResolvedCitationItemUnion.
    item_type: Literal["block_metadata"] = "block_metadata"
    agent_run_id: str
    collection_id: str
    transcript_id: str
    block_idx: int
    metadata_key: str
55
+
56
+
57
class TranscriptBlockContentItem(ResolvedCitationItem):
    """Citation target: the content of a specific block of a transcript."""

    # Discriminator value for ResolvedCitationItemUnion.
    item_type: Literal["block_content"] = "block_content"
    agent_run_id: str
    collection_id: str
    transcript_id: str
    block_idx: int
63
+
64
+
65
# Discriminated union of every resolved citation item type; pydantic
# dispatches on the `item_type` literal declared by each subclass.
ResolvedCitationItemUnion = Annotated[
    Union[
        AgentRunMetadataItem,
        TranscriptMetadataItem,
        TranscriptBlockMetadataItem,
        TranscriptBlockContentItem,
    ],
    Discriminator("item_type"),
]
74
+
75
# Markers that protect bracket characters inside a citation's range pattern.
RANGE_BEGIN = "<RANGE>"
RANGE_END = "</RANGE>"


def scan_brackets(text: str) -> list[tuple[int, int, str]]:
    """Scan text for bracketed segments, respecting RANGE markers and nesting.

    Bracket characters appearing between RANGE_BEGIN and RANGE_END are treated
    as literal text, not as delimiters. Unterminated brackets produce no match.

    Returns:
        A list of (start_index, end_index_exclusive, inner_content) tuples.
    """
    results: list[tuple[int, int, str]] = []
    n = len(text)
    pos = 0
    while pos < n:
        if text[pos] != "[":
            pos += 1
            continue

        # Found an opening bracket: walk forward tracking nesting depth.
        depth = 1
        cursor = pos + 1
        inside_range = False
        while cursor < n and depth > 0:
            if text.startswith(RANGE_BEGIN, cursor):
                inside_range = True
            elif text.startswith(RANGE_END, cursor):
                inside_range = False
            elif not inside_range:
                if text[cursor] == "[":
                    depth += 1
                elif text[cursor] == "]":
                    depth -= 1
            cursor += 1

        if depth == 0:
            # Strip the outer brackets from the captured content.
            results.append((pos, cursor, text[pos + 1 : cursor - 1]))
            pos = cursor
        else:
            # Unbalanced: resume scanning at the next character.
            pos += 1
    return results
114
+
115
+
116
def _extract_range_pattern(range_part: str) -> CitationTargetTextRange | None:
    """Extract a text range from the `:`-suffix of a citation token.

    Returns a CitationTargetTextRange whose start_pattern is the text between
    RANGE_BEGIN and RANGE_END (None when that span is empty), or None when
    either marker is missing. Only start_pattern is ever populated here;
    end_pattern parsing is not implemented.
    """
    if RANGE_BEGIN not in range_part or RANGE_END not in range_part:
        return None

    # Membership was verified above, so find() cannot return -1 here — the
    # previous implementation redundantly re-checked both indices for -1.
    begin = range_part.find(RANGE_BEGIN) + len(RANGE_BEGIN)
    end = range_part.find(RANGE_END)
    # NOTE(review): if RANGE_END precedes RANGE_BEGIN the slice is empty and
    # start_pattern becomes None — behavior preserved from the original.
    range_content = range_part[begin:end]
    return CitationTargetTextRange(start_pattern=range_content or None)
126
+
127
+
128
def parse_single_citation(part: str) -> tuple[str, CitationTargetTextRange | None] | None:
    """Parse a single citation token from inside a bracket.

    Returns a (item_alias, text_range) tuple, or None when the token is empty
    after stripping. The text range is only present when the token carries a
    `:`-separated range suffix containing RANGE markers.
    """
    token = part.strip()
    if not token:
        return None

    # No range suffix: the whole token is the alias.
    if ":" not in token:
        return token, None

    alias_part, _, range_part = token.partition(":")
    return alias_part.strip(), _extract_range_pattern(range_part)
149
+
150
+
151
def parse_citations(text: str) -> tuple[str, list[ParsedCitation]]:
    """Parse citations from text.

    Supported formats:
    - Single block: [T<key>B<idx>]
    - Text range with start pattern: [T<key>B<idx>:<RANGE>start_pattern</RANGE>]
    - Agent run metadata: [M.key]
    - Transcript metadata: [T<key>M.key]
    - Message metadata: [T<key>B<idx>M.key]
    - Message metadata with text range: [T<key>B<idx>M.key:<RANGE>start_pattern</RANGE>]

    Args:
        text: The text to parse citations from.

    Returns:
        A tuple of (text, citations). The text is currently returned
        unmodified (cleaning may be added later); each citation's start_idx
        and end_idx are character positions of its bracketed span.
    """
    citations: list[ParsedCitation] = []

    for start, end, inner in scan_brackets(text):
        parsed = parse_single_citation(inner)
        if parsed is None:
            # Empty/invalid token inside the brackets — skip it.
            continue
        alias, text_range = parsed
        citations.append(
            ParsedCitation(start_idx=start, end_idx=end, item_alias=alias, text_range=text_range)
        )

    # We're not cleaning the text right now but may do that later.
    return text, citations
@@ -0,0 +1,84 @@
1
+ from uuid import uuid4
2
+
3
+ from pydantic import Field, model_validator
4
+
5
+ from docent.data_models.agent_run import AgentRun
6
+ from docent.data_models.transcript import Transcript
7
+
8
+
9
class FormattedTranscript(Transcript):
    """A Transcript that preserves original message indices during edits.

    This class extends Transcript to support customization while maintaining
    accurate citations. Each message retains its original index from the
    source transcript, even if messages are added, removed, or reordered.

    Use this class when you need to customize which parts of a transcript are
    visible to an LLM while ensuring citations remain valid.
    """

    # Maps message id -> index of that message in the source transcript.
    id_to_original_index: dict[str, int]

    @classmethod
    def from_transcript(cls, transcript: Transcript) -> "FormattedTranscript":
        """Create a FormattedTranscript from a regular Transcript."""
        # Ensure all messages have IDs and build id_to_original_index.
        # NOTE(review): this assigns IDs on the source transcript's message
        # objects in place, mutating the input transcript.
        id_to_original_index: dict[str, int] = {}
        for idx, msg in enumerate(transcript.messages):
            if msg.id is None:
                msg.id = str(uuid4())
            id_to_original_index[msg.id] = idx

        return cls(
            id=transcript.id,
            name=transcript.name,
            description=transcript.description,
            transcript_group_id=transcript.transcript_group_id,
            created_at=transcript.created_at,
            messages=transcript.messages,
            metadata=transcript.metadata,
            id_to_original_index=id_to_original_index,
        )

    @model_validator(mode="after")
    def _validate_id_to_original_index(self) -> "FormattedTranscript":
        """Ensure id_to_original_index covers all messages."""
        # A message with id=None also fails this check, since None is never a
        # key of id_to_original_index.
        for msg in self.messages:
            if msg.id not in self.id_to_original_index:
                raise ValueError(
                    f"Message {msg.id} missing from id_to_original_index. "
                    "Use FormattedTranscript.from_transcript() to create a new instance."
                )
        return self

    def _enumerate_messages(self):
        """Yield (original index, message) for each message."""
        for message in self.messages:
            # Guaranteed non-None by the validator above.
            assert message.id is not None
            original_idx = self.id_to_original_index[message.id]
            yield (original_idx, message)
60
+
61
+
62
class FormattedAgentRun(AgentRun):
    """An AgentRun that allows customization while tracking original identifiers.

    This class extends AgentRun to support modifications to what an LLM sees
    while maintaining accurate citations back to the original agent run.

    Use this class when you need to customize which parts of an agent run are
    visible to an LLM (e.g., hiding metadata, truncating long outputs).
    """

    # Narrows the parent's transcript list to index-preserving transcripts;
    # type: ignore silences the covariant field override.
    transcripts: list[FormattedTranscript] = Field(default_factory=list)  # type: ignore[assignment]

    @classmethod
    def from_agent_run(cls, agent_run: AgentRun) -> "FormattedAgentRun":
        """Create a FormattedAgentRun from a regular AgentRun.

        Each transcript is wrapped via FormattedTranscript.from_transcript,
        which assigns missing message IDs (mutating the source in place).
        """
        return cls(
            id=agent_run.id,
            name=agent_run.name,
            description=agent_run.description,
            transcripts=[FormattedTranscript.from_transcript(t) for t in agent_run.transcripts],
            transcript_groups=agent_run.transcript_groups,
            metadata=agent_run.metadata,
        )
@@ -0,0 +1,17 @@
1
+ """Judge-related data models shared across Docent components."""
2
+
3
+ from typing import Any
4
+ from uuid import uuid4
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
class Label(BaseModel):
    """A judge/annotation label attached to an agent run.

    Attributes:
        id: Unique identifier, generated if not provided.
        label_set_id: ID of the label set this label belongs to.
        label_value: Arbitrary structured payload for the label.
        agent_run_id: ID of the agent run being labeled.
    """

    id: str = Field(default_factory=lambda: str(uuid4()))

    label_set_id: str
    label_value: dict[str, Any]
    agent_run_id: str


__all__ = ["Label"]
@@ -0,0 +1,16 @@
1
+ import json
2
+ from typing import Any
3
+
4
+ from pydantic_core import to_jsonable_python
5
+
6
+
7
def dump_metadata(metadata: dict[str, Any]) -> str | None:
    """Serialize metadata to a pretty-printed JSON string.

    Returns None for empty metadata. We used to use YAML to save tokens, but
    JSON makes it easier to find cited ranges on the frontend because the
    frontend uses JSON.
    """
    if not metadata:
        return None
    # to_jsonable_python converts pydantic/complex values into JSON-safe ones.
    return json.dumps(to_jsonable_python(metadata), indent=2).strip()
@@ -0,0 +1,56 @@
1
+ import re
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from docent._log_util import get_logger
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
class RegexSnippet(BaseModel):
    """A regex match plus surrounding context.

    match_start/match_end are offsets within `snippet`, not within the
    original searched text.
    """

    snippet: str
    match_start: int
    match_end: int
14
+
15
+
16
def get_regex_snippets(text: str, pattern: str, window_size: int = 50) -> list[RegexSnippet]:
    """Extract snippets from text that match a regex pattern, with surrounding context.

    Matching is case-insensitive with DOTALL. An invalid pattern is logged and
    yields an empty list rather than raising.

    Args:
        text: The text to search in.
        pattern: The regex pattern to match.
        window_size: The number of characters to include before and after the match.

    Returns:
        A list of RegexSnippet objects; match_start/match_end are positions
        within each (possibly truncated) snippet, not within `text`.
    """
    # Keep the try narrow: only compilation of a bad pattern raises re.error.
    # (The original wrapped the whole body and also checked `not matches` twice.)
    try:
        compiled = re.compile(pattern, re.IGNORECASE | re.DOTALL)
    except re.error as e:
        logger.error(f"Got regex error: {e}")
        return []

    matches = list(compiled.finditer(text))
    if not matches:
        # Callers are expected to pass patterns known to match this text.
        logger.warning(f"No regex matches found for {pattern}: this shouldn't happen!")
        return []

    snippets: list[RegexSnippet] = []
    for match in matches:
        start, end = match.span()

        # Calculate the context window around the match, clamped to the text.
        snippet_start = max(0, start - window_size)
        snippet_end = min(len(text), end + window_size)

        # Adjust the match indices to be relative to the snippet window.
        snippets.append(
            RegexSnippet(
                snippet=text[snippet_start:snippet_end],
                match_start=start - snippet_start,
                match_end=end - snippet_start,
            )
        )

    return snippets