PyPI - docent-python - Versions diffs - 0.1.35a0__tar.gz → 0.1.36a0__tar.gz - Mend

docent-python 0.1.35a0__tar.gz → 0.1.36a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65)

{docent_python-0.1.35a0 → docent_python-0.1.36a0}/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docent-python
-Version: 0.1.35a0
+Version: 0.1.36a0
 Summary: Docent SDK
 Project-URL: Homepage, https://github.com/TransluceAI/docent
 Project-URL: Issues, https://github.com/TransluceAI/docent/issues

{docent_python-0.1.35a0 → docent_python-0.1.36a0}/docent/data_models/__init__.py RENAMED

@@ -1,5 +1,5 @@
 from docent.data_models.agent_run import AgentRun
-from docent.data_models.citation import Citation
+from docent.data_models.citation import InlineCitation
 from docent.data_models.collection import Collection
 from docent.data_models.judge import Label
 from docent.data_models.regex import RegexSnippet
@@ -7,8 +7,8 @@ from docent.data_models.transcript import Transcript, TranscriptGroup
 __all__ = [
     "AgentRun",
-    "Citation",
     "Collection",
+    "InlineCitation",
     "Label",
     "RegexSnippet",
     "Transcript",

{docent_python-0.1.35a0 → docent_python-0.1.36a0}/docent/data_models/agent_run.py RENAMED

@@ -134,7 +134,7 @@ class AgentRun(BaseModel):
     # Converting to text #
     ######################
-    def _to_text_impl(self, token_limit: int = sys.maxsize, use_blocks: bool = False) -> list[str]:
+    def _to_text_impl(self, token_limit: int = sys.maxsize) -> list[str]:
         """
         Core implementation for converting agent run to text representation.
@@ -151,8 +151,6 @@ class AgentRun(BaseModel):
             transcript_content = t.to_str(
                 token_limit=sys.maxsize,
                 transcript_idx=i,
-                agent_run_idx=None,
-                use_action_units=not use_blocks,
             )[0]
             transcript_strs.append(f"<transcript>\n{transcript_content}\n</transcript>")
@@ -202,15 +200,16 @@ class AgentRun(BaseModel):
                     ), "Ranges without metadata should be a single message"
                     t = self.transcripts[msg_range.start]
                     if msg_range.num_tokens < token_limit - 50:
-                        transcript = f"<transcript>\n{t.to_str(token_limit=sys.maxsize, use_action_units=not use_blocks)[0]}\n</transcript>"
+                        transcript = (
+                            f"<transcript>\n{t.to_str(token_limit=sys.maxsize)[0]}\n</transcript>"
+                        )
                         result = (
                             f"Here is a partial agent run for analysis purposes only:\n{transcript}"
                         )
                         results.append(result)
                     else:
-                        transcript_fragments = t.to_str(
+                        transcript_fragments: list[str] = t.to_str(
                             token_limit=token_limit - 50,
-                            use_action_units=not use_blocks,
                         )
                         for fragment in transcript_fragments:
                             result = f"<transcript>\n{fragment}\n</transcript>"
@@ -220,26 +219,6 @@ class AgentRun(BaseModel):
                             results.append(result)
             return results
-    def to_text(self, token_limit: int = sys.maxsize) -> list[str]:
-        """
-        Represents an agent run as a list of strings, each of which is at most token_limit tokens
-        under the GPT-4 tokenization scheme.
-        We'll try to split up long AgentRuns along transcript boundaries and include metadata.
-        For very long transcripts, we'll have to split them up further and remove metadata.
-        """
-        return self._to_text_impl(token_limit=token_limit, use_blocks=False)
-    def to_text_blocks(self, token_limit: int = sys.maxsize) -> list[str]:
-        """
-        Represents an agent run as a list of strings using individual message blocks,
-        each of which is at most token_limit tokens under the GPT-4 tokenization scheme.
-        Unlike to_text() which uses action units, this method formats each message
-        as an individual block.
-        """
-        return self._to_text_impl(token_limit=token_limit, use_blocks=True)
     @property
     def text(self) -> str:
         """Concatenates all transcript texts with double newlines as separators.
@@ -247,16 +226,7 @@ class AgentRun(BaseModel):
         Returns:
             str: A string representation of all transcripts.
         """
-        return self._to_text_impl(token_limit=sys.maxsize, use_blocks=False)[0]
-    @property
-    def text_blocks(self) -> str:
-        """Concatenates all transcript texts using individual blocks format.
-        Returns:
-            str: A string representation of all transcripts using individual message blocks.
-        """
-        return self._to_text_impl(token_limit=sys.maxsize, use_blocks=True)[0]
+        return self._to_text_impl(token_limit=sys.maxsize)[0]
     ##############################
     # New text rendering methods #
@@ -414,10 +384,20 @@ class AgentRun(BaseModel):
         return c_tree, transcript_idx_map
-    def to_text_new(self, indent: int = 0, full_tree: bool = False):
+    def to_text_new(
+        self,
+        agent_run_alias: int | str = 0,
+        t_idx_map: dict[str, int] | None = None,
+        indent: int = 0,
+        full_tree: bool = False,
+    ):
+        if isinstance(agent_run_alias, int):
+            agent_run_alias = f"R{agent_run_alias}"
         c_tree = self.get_canonical_tree(full_tree=full_tree)
         t_ids_ordered = self.get_transcript_ids_ordered(full_tree=full_tree)
-        t_idx_map = {t_id: i for i, t_id in enumerate(t_ids_ordered)}
+        if t_idx_map is None:
+            t_idx_map = {t_id: i for i, t_id in enumerate(t_ids_ordered)}
         t_dict = self.transcript_dict
         tg_dict = self.transcript_group_dict
@@ -430,7 +410,7 @@ class AgentRun(BaseModel):
                     children_texts.append(_recurse(child_id))
                 else:
                     cur_text = t_dict[child_id].to_text_new(
-                        transcript_idx=t_idx_map[child_id],
+                        transcript_alias=t_idx_map[child_id],
                         indent=indent,
                     )
                     children_texts.append(cur_text)
@@ -451,6 +431,7 @@ class AgentRun(BaseModel):
         if metadata_text is not None:
             if indent > 0:
                 metadata_text = textwrap.indent(metadata_text, " " * indent)
-            text += f"\n<|agent run metadata|>\n{metadata_text}\n</|agent run metadata|>"
+            metadata_alias = f"{agent_run_alias}M"
+            text += f"\n<|agent run metadata {metadata_alias}|>\n{metadata_text}\n</|agent run metadata {metadata_alias}|>"
-        return text
+        return f"<|agent run {agent_run_alias}|>\n{text}\n</|agent run {agent_run_alias}|>\n"

{docent_python-0.1.35a0 → docent_python-0.1.36a0}/docent/data_models/chat/__init__.py RENAMED

@@ -2,10 +2,13 @@ from docent.data_models.chat.content import Content, ContentReasoning, ContentTe
 from docent.data_models.chat.message import (
     AssistantMessage,
     ChatMessage,
+    DocentAssistantMessage,
+    DocentChatMessage,
     SystemMessage,
     ToolMessage,
     UserMessage,
     parse_chat_message,
+    parse_docent_chat_message,
 )
 from docent.data_models.chat.tool import (
     ToolCall,
@@ -16,7 +19,9 @@ from docent.data_models.chat.tool import (
 __all__ = [
     "ChatMessage",
+    "DocentChatMessage",
     "AssistantMessage",
+    "DocentAssistantMessage",
     "SystemMessage",
     "ToolMessage",
     "UserMessage",
@@ -28,4 +33,5 @@ __all__ = [
     "ToolInfo",
     "ToolParams",
     "parse_chat_message",
+    "parse_docent_chat_message",
 ]

{docent_python-0.1.35a0 → docent_python-0.1.36a0}/docent/data_models/chat/message.py RENAMED

@@ -5,7 +5,7 @@ from pydantic import BaseModel, Discriminator, Field
 from docent.data_models.chat.content import Content
 from docent.data_models.chat.tool import ToolCall
-from docent.data_models.citation import Citation
+from docent.data_models.citation import InlineCitation
 logger = getLogger(__name__)
@@ -69,14 +69,24 @@ class AssistantMessage(BaseChatMessage):
         role: Always set to "assistant".
         model: Optional identifier for the model that generated this message.
         tool_calls: Optional list of tool calls made by the assistant.
-        citations: Optional list of citations referenced in the message content.
-        suggested_messages: Optional list of suggested followup messages.
     """
     role: Literal["assistant"] = "assistant"  # type: ignore
     model: str | None = None
     tool_calls: list[ToolCall] | None = None
-    citations: list[Citation] | None = None
+class DocentAssistantMessage(AssistantMessage):
+    """Assistant message in a chat session with additional chat-specific metadata.
+    This extends AssistantMessage with fields that are only relevant in Docent chat contexts
+    Attributes:
+        citations: Optional list of citations referenced in the message content.
+        suggested_messages: Optional list of suggested followup messages.
+    """
+    citations: list[InlineCitation] | None = None
     suggested_messages: list[str] | None = None
@@ -101,12 +111,25 @@ ChatMessage = Annotated[
     SystemMessage | UserMessage | AssistantMessage | ToolMessage,
     Discriminator("role"),
 ]
-"""Type alias for any chat message type, discriminated by the role field."""
+"""Type alias for any chat message type, discriminated by the role field.
+This is the base message union used in Transcript and AgentRun contexts.
+For chat sessions, use ChatSessionMessage instead.
+"""
+DocentChatMessage = Annotated[
+    SystemMessage | UserMessage | DocentAssistantMessage | ToolMessage,
+    Discriminator("role"),
+]
+"""Type alias for chat session messages with chat-specific assistant metadata."""
 def parse_chat_message(message_data: dict[str, Any] | ChatMessage) -> ChatMessage:
     """Parse a message dictionary or object into the appropriate ChatMessage subclass.
+    This parses base messages without chat-specific fields. For chat sessions,
+    use parse_chat_session_message instead.
     Args:
         message_data: A dictionary or ChatMessage object representing a chat message.
@@ -130,3 +153,39 @@ def parse_chat_message(message_data: dict[str, Any] | ChatMessage) -> ChatMessag
         return ToolMessage.model_validate(message_data)
     else:
         raise ValueError(f"Unknown message role: {role}")
+def parse_docent_chat_message(
+    message_data: dict[str, Any] | DocentChatMessage,
+) -> DocentChatMessage:
+    """Parse a message dictionary or object into the appropriate ChatSessionMessage subclass.
+    This handles chat session messages which may include ChatAssistantMessage with
+    citations and suggested_messages fields.
+    Args:
+        message_data: A dictionary or ChatSessionMessage object representing a chat session message.
+    Returns:
+        ChatSessionMessage: An instance of a ChatSessionMessage subclass based on the role.
+    Raises:
+        ValueError: If the message role is unknown.
+    """
+    if isinstance(
+        message_data,
+        (SystemMessage, UserMessage, DocentAssistantMessage, AssistantMessage, ToolMessage),
+    ):
+        return message_data
+    role = message_data.get("role")
+    if role == "system":
+        return SystemMessage.model_validate(message_data)
+    elif role == "user":
+        return UserMessage.model_validate(message_data)
+    elif role == "assistant":
+        return DocentAssistantMessage.model_validate(message_data)
+    elif role == "tool":
+        return ToolMessage.model_validate(message_data)
+    else:
+        raise ValueError(f"Unknown message role: {role}")

docent_python-0.1.36a0/docent/data_models/citation.py ADDED

@@ -0,0 +1,187 @@
+from typing import Annotated, Literal, Union
+from pydantic import BaseModel, Discriminator
+class CitationTargetTextRange(BaseModel):
+    start_pattern: str | None = None
+    end_pattern: str | None = None
+class ResolvedCitationItem(BaseModel):
+    pass
+class CitationTarget(BaseModel):
+    item: "ResolvedCitationItemUnion"
+    text_range: CitationTargetTextRange | None = None
+class ParsedCitation(BaseModel):
+    start_idx: int
+    end_idx: int
+    item_alias: str
+    text_range: CitationTargetTextRange | None = None
+class InlineCitation(BaseModel):
+    start_idx: int
+    end_idx: int
+    target: CitationTarget
+class AgentRunMetadataItem(ResolvedCitationItem):
+    item_type: Literal["agent_run_metadata"] = "agent_run_metadata"
+    agent_run_id: str
+    collection_id: str
+    metadata_key: str
+class TranscriptMetadataItem(ResolvedCitationItem):
+    item_type: Literal["transcript_metadata"] = "transcript_metadata"
+    agent_run_id: str
+    collection_id: str
+    transcript_id: str
+    metadata_key: str
+class TranscriptBlockMetadataItem(ResolvedCitationItem):
+    item_type: Literal["block_metadata"] = "block_metadata"
+    agent_run_id: str
+    collection_id: str
+    transcript_id: str
+    block_idx: int
+    metadata_key: str
+class TranscriptBlockContentItem(ResolvedCitationItem):
+    item_type: Literal["block_content"] = "block_content"
+    agent_run_id: str
+    collection_id: str
+    transcript_id: str
+    block_idx: int
+ResolvedCitationItemUnion = Annotated[
+    Union[
+        AgentRunMetadataItem,
+        TranscriptMetadataItem,
+        TranscriptBlockMetadataItem,
+        TranscriptBlockContentItem,
+    ],
+    Discriminator("item_type"),
+]
+RANGE_BEGIN = "<RANGE>"
+RANGE_END = "</RANGE>"
+def scan_brackets(text: str) -> list[tuple[int, int, str]]:
+    """Scan text for bracketed segments, respecting RANGE markers and nested brackets.
+    Returns a list of (start_index, end_index_exclusive, inner_content).
+    """
+    matches: list[tuple[int, int, str]] = []
+    i = 0
+    while i < len(text):
+        if text[i] == "[":
+            start = i
+            bracket_count = 1
+            j = i + 1
+            in_range = False
+            while j < len(text) and bracket_count > 0:
+                if text[j : j + len(RANGE_BEGIN)] == RANGE_BEGIN:
+                    in_range = True
+                elif text[j : j + len(RANGE_END)] == RANGE_END:
+                    in_range = False
+                elif text[j] == "[" and not in_range:
+                    bracket_count += 1
+                elif text[j] == "]" and not in_range:
+                    bracket_count -= 1
+                j += 1
+            if bracket_count == 0:
+                end_exclusive = j
+                bracket_content = text[start + 1 : end_exclusive - 1]
+                matches.append((start, end_exclusive, bracket_content))
+                i = j
+            else:
+                i += 1
+        else:
+            i += 1
+    return matches
+def _extract_range_pattern(range_part: str) -> CitationTargetTextRange | None:
+    if RANGE_BEGIN in range_part and RANGE_END in range_part:
+        range_begin_idx = range_part.find(RANGE_BEGIN)
+        range_end_idx = range_part.find(RANGE_END)
+        if range_begin_idx != -1 and range_end_idx != -1:
+            range_content = range_part[range_begin_idx + len(RANGE_BEGIN) : range_end_idx]
+            start_pattern = range_content if range_content else None
+            return CitationTargetTextRange(start_pattern=start_pattern)
+    return None
+def parse_single_citation(part: str) -> tuple[str, CitationTargetTextRange | None] | None:
+    """
+    Parse a single citation token inside a bracket and return its components.
+    Returns ParsedCitation or None if invalid.
+    For metadata citations, transcript_idx may be None (for agent run metadata).
+    Supports optional text range for all valid citation kinds.
+    """
+    token = part.strip()
+    if not token:
+        return None
+    # Extract optional range part
+    item_alias = token
+    text_range: CitationTargetTextRange | None = None
+    if ":" in token:
+        left, right = token.split(":", 1)
+        item_alias = left.strip()
+        text_range = _extract_range_pattern(right)
+    return item_alias, text_range
+def parse_citations(text: str) -> tuple[str, list[ParsedCitation]]:
+    """
+    Parse citations from text in the format described by TEXT_RANGE_CITE_INSTRUCTION.
+    Supported formats:
+    - Single block: [T<key>B<idx>]
+    - Text range with start pattern: [T<key>B<idx>:<RANGE>start_pattern</RANGE>]
+    - Agent run metadata: [M.key]
+    - Transcript metadata: [T<key>M.key]
+    - Message metadata: [T<key>B<idx>M.key]
+    - Message metadata with text range: [T<key>B<idx>M.key:<RANGE>start_pattern</RANGE>]
+    Args:
+        text: The text to parse citations from
+    Returns:
+        A tuple of (cleaned_text, citations) where cleaned_text has brackets and range markers removed
+        and citations have start_idx and end_idx representing character positions
+        in the cleaned text
+    """
+    citations: list[ParsedCitation] = []
+    bracket_matches = scan_brackets(text)
+    for start, end, bracket_content in bracket_matches:
+        # Parse a single citation token inside the bracket
+        parsed = parse_single_citation(bracket_content)
+        if not parsed:
+            continue
+        label, text_range = parsed
+        citations.append(
+            ParsedCitation(start_idx=start, end_idx=end, item_alias=label, text_range=text_range)
+        )
+    # We're not cleaning the text right now but may do that later
+    return text, citations

docent_python-0.1.36a0/docent/data_models/formatted_objects.py ADDED

@@ -0,0 +1,84 @@
+from uuid import uuid4
+from pydantic import Field, model_validator
+from docent.data_models.agent_run import AgentRun
+from docent.data_models.transcript import Transcript
+class FormattedTranscript(Transcript):
+    """A Transcript that preserves original message indices during edits.
+    This class extends Transcript to support customization while maintaining accurate
+    citations. Each message retains its original index from the source transcript,
+    even if messages are added, removed, or reordered.
+    Use this class when you need to customize which parts of a transcript are visible
+    to an LLM while ensuring citations remain valid.
+    """
+    id_to_original_index: dict[str, int]
+    @classmethod
+    def from_transcript(cls, transcript: Transcript) -> "FormattedTranscript":
+        """Create a FormattedTranscript from a regular Transcript."""
+        # Ensure all messages have IDs and build id_to_original_index
+        id_to_original_index: dict[str, int] = {}
+        for idx, msg in enumerate(transcript.messages):
+            if msg.id is None:
+                msg.id = str(uuid4())
+            id_to_original_index[msg.id] = idx
+        return cls(
+            id=transcript.id,
+            name=transcript.name,
+            description=transcript.description,
+            transcript_group_id=transcript.transcript_group_id,
+            created_at=transcript.created_at,
+            messages=transcript.messages,
+            metadata=transcript.metadata,
+            id_to_original_index=id_to_original_index,
+        )
+    @model_validator(mode="after")
+    def _validate_id_to_original_index(self) -> "FormattedTranscript":
+        """Ensure id_to_original_index covers all messages."""
+        for msg in self.messages:
+            if msg.id not in self.id_to_original_index:
+                raise ValueError(
+                    f"Message {msg.id} missing from id_to_original_index. "
+                    "Use FormattedTranscript.from_transcript() to create a new instance."
+                )
+        return self
+    def _enumerate_messages(self):
+        """Yield (original index, message) for each message."""
+        for message in self.messages:
+            assert message.id is not None
+            original_idx = self.id_to_original_index[message.id]
+            yield (original_idx, message)
+class FormattedAgentRun(AgentRun):
+    """An AgentRun that allows customization while tracking original identifiers.
+    This class extends AgentRun to support modifications to what an LLM sees
+    while maintaining accurate citations back to the original agent run.
+    Use this class when you need to customize which parts of an agent run are visible
+    to an LLM (e.g., hiding metadata, truncating long outputs).
+    """
+    transcripts: list[FormattedTranscript] = Field(default_factory=list)  # type: ignore[assignment]
+    @classmethod
+    def from_agent_run(cls, agent_run: AgentRun) -> "FormattedAgentRun":
+        """Create a FormattedAgentRun from a regular AgentRun."""
+        return cls(
+            id=agent_run.id,
+            name=agent_run.name,
+            description=agent_run.description,
+            transcripts=[FormattedTranscript.from_transcript(t) for t in agent_run.transcripts],
+            transcript_groups=agent_run.transcript_groups,
+            metadata=agent_run.metadata,
+        )

docent-python 0.1.35a0__tar.gz → 0.1.36a0__tar.gz

docent-python 0.1.35a0__tar.gz → 0.1.36a0__tar.gz