docent-python 0.1.9a0__py3-none-any.whl → 0.1.11a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

docent/data_models/agent_run.py CHANGED
@@ -90,19 +90,36 @@ class AgentRun(BaseModel):
             raise ValueError("AgentRun must have at least one transcript")
         return self
 
-    def to_text(self, token_limit: int = sys.maxsize) -> list[str]:
+    def _to_text_impl(self, token_limit: int = sys.maxsize, use_blocks: bool = False) -> list[str]:
         """
-        Represents an agent run as a list of strings, each of which is at most token_limit tokens
-        under the GPT-4 tokenization scheme.
+        Core implementation for converting agent run to text representation.
 
-        We'll try to split up long AgentRuns along transcript boundaries and include metadata.
-        For very long transcripts, we'll have to split them up further and remove metadata.
+        Args:
+            token_limit: Maximum tokens per returned string under the GPT-4 tokenization scheme
+            use_blocks: If True, use individual message blocks. If False, use action units.
+
+        Returns:
+            List of strings, each at most token_limit tokens
         """
+        # Generate transcript strings using appropriate method
+        transcript_strs: list[str] = []
+        for i, (t_key, t) in enumerate(self.transcripts.items()):
+            if use_blocks:
+                transcript_content = t.to_str_blocks_with_token_limit(
+                    token_limit=sys.maxsize,
+                    transcript_idx=i,
+                    agent_run_idx=None,
+                )[0]
+            else:
+                transcript_content = t.to_str_with_token_limit(
+                    token_limit=sys.maxsize,
+                    transcript_idx=i,
+                    agent_run_idx=None,
+                )[0]
+            transcript_strs.append(
+                f"<transcript {t_key}>\n{transcript_content}\n</transcript {t_key}>"
+            )
 
-        transcript_strs: list[str] = [
-            f"<transcript {t_key}>\n{t.to_str(agent_run_idx=None, transcript_idx=i)}\n</transcript {t_key}>"
-            for i, (t_key, t) in enumerate(self.transcripts.items())
-        ]
         transcripts_str = "\n\n".join(transcript_strs)
 
         # Gather metadata
@@ -128,7 +145,6 @@ class AgentRun(BaseModel):
             return [f"{transcripts_str}" f"{metadata_str}"]
 
         # Otherwise, split up the transcript and metadata into chunks
-        # TODO(vincent, mengk): does this code account for multiple transcripts correctly? a little confused.
         else:
             results: list[str] = []
             transcript_token_counts = [get_token_count(t) for t in transcript_strs]
@@ -150,13 +166,23 @@ class AgentRun(BaseModel):
                 ), "Ranges without metadata should be a single message"
                 t_id, t = list(self.transcripts.items())[msg_range.start]
                 if msg_range.num_tokens < token_limit - 50:
-                    transcript = f"<transcript {t_id}>\n{t.to_str()}\n</transcript {t_id}>"
+                    if use_blocks:
+                        transcript = f"<transcript {t_id}>\n{t.to_str_blocks_with_token_limit(token_limit=sys.maxsize)[0]}\n</transcript {t_id}>"
+                    else:
+                        transcript = f"<transcript {t_id}>\n{t.to_str_with_token_limit(token_limit=sys.maxsize)[0]}\n</transcript {t_id}>"
                     result = (
                         f"Here is a partial agent run for analysis purposes only:\n{transcript}"
                     )
                     results.append(result)
                 else:
-                    transcript_fragments = t.to_str_with_token_limit(token_limit - 50)
+                    if use_blocks:
+                        transcript_fragments = t.to_str_blocks_with_token_limit(
+                            token_limit=token_limit - 50,
+                        )
+                    else:
+                        transcript_fragments = t.to_str_with_token_limit(
+                            token_limit=token_limit - 50,
+                        )
                     for fragment in transcript_fragments:
                         result = f"<transcript {t_id}>\n{fragment}\n</transcript {t_id}>"
                         result = (
@@ -165,6 +191,26 @@ class AgentRun(BaseModel):
                 results.append(result)
             return results
 
+    def to_text(self, token_limit: int = sys.maxsize) -> list[str]:
+        """
+        Represents an agent run as a list of strings, each of which is at most token_limit tokens
+        under the GPT-4 tokenization scheme.
+
+        We'll try to split up long AgentRuns along transcript boundaries and include metadata.
+        For very long transcripts, we'll have to split them up further and remove metadata.
+        """
+        return self._to_text_impl(token_limit=token_limit, use_blocks=False)
+
+    def to_text_blocks(self, token_limit: int = sys.maxsize) -> list[str]:
+        """
+        Represents an agent run as a list of strings using individual message blocks,
+        each of which is at most token_limit tokens under the GPT-4 tokenization scheme.
+
+        Unlike to_text() which uses action units, this method formats each message
+        as an individual block.
+        """
+        return self._to_text_impl(token_limit=token_limit, use_blocks=True)
+
     @property
     def text(self) -> str:
         """Concatenates all transcript texts with double newlines as separators.
@@ -172,7 +218,16 @@ class AgentRun(BaseModel):
         Returns:
            str: A string representation of all transcripts.
        """
-        return self.to_text()[0]
+        return self._to_text_impl(token_limit=sys.maxsize, use_blocks=False)[0]
+
+    @property
+    def text_blocks(self) -> str:
+        """Concatenates all transcript texts using individual blocks format.
+
+        Returns:
+            str: A string representation of all transcripts using individual message blocks.
+        """
+        return self._to_text_impl(token_limit=sys.maxsize, use_blocks=True)[0]
 
     def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
         """Extends the parent model_dump method to include the text property.
docent/data_models/chat/message.py CHANGED
@@ -5,6 +5,7 @@ from pydantic import BaseModel, Discriminator
 
 from docent.data_models.chat.content import Content
 from docent.data_models.chat.tool import ToolCall
+from docent.data_models.citation import Citation
 
 logger = getLogger(__name__)
 
@@ -66,11 +67,15 @@ class AssistantMessage(BaseChatMessage):
         role: Always set to "assistant".
         model: Optional identifier for the model that generated this message.
         tool_calls: Optional list of tool calls made by the assistant.
+        citations: Optional list of citations referenced in the message content.
+        suggested_messages: Optional list of suggested followup messages.
     """
 
     role: Literal["assistant"] = "assistant"  # type: ignore
     model: str | None = None
    tool_calls: list[ToolCall] | None = None
+    citations: list[Citation] | None = None
+    suggested_messages: list[str] | None = None
 
 
 class ToolMessage(BaseChatMessage):
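
Note: a sketch of constructing the extended AssistantMessage. The values are hypothetical, and the `content` field is assumed to be inherited from BaseChatMessage, which this diff does not show.

    from docent.data_models.chat.message import AssistantMessage
    from docent.data_models.citation import Citation

    # start_idx/end_idx are illustrative character offsets into the content;
    # "T0B3" spans characters 27..31 of the string below.
    msg = AssistantMessage(
        content="The agent verified the fix T0B3 before merging.",
        citations=[Citation(start_idx=27, end_idx=31, transcript_idx=0, block_idx=3)],
        suggested_messages=["Show me transcript 0, block 3"],
    )
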
docent/data_models/citation.py CHANGED
@@ -1,223 +1,152 @@
 import re
-from typing import TypedDict
 
+from pydantic import BaseModel
 
-class Citation(TypedDict):
+
+class Citation(BaseModel):
     start_idx: int
     end_idx: int
-    agent_run_idx: int | None
-    transcript_idx: int | None
+    agent_run_idx: int | None = None
+    transcript_idx: int | None = None
     block_idx: int
-    action_unit_idx: int | None
+    action_unit_idx: int | None = None
+    start_pattern: str | None = None
 
 
-def parse_citations_single_run(text: str) -> list[Citation]:
-    """
-    Parse citations from text in the format described by SINGLE_BLOCK_CITE_INSTRUCTION.
+RANGE_BEGIN = "<RANGE>"
+RANGE_END = "</RANGE>"
 
-    Supported formats:
-    - Single block: [T<key>B<idx>]
-    - Multiple blocks: [T<key1>B<idx1>, T<key2>B<idx2>, ...]
-    - Dash-separated blocks: [T<key1>B<idx1>-T<key2>B<idx2>]
+_SINGLE_RE = re.compile(r"T(\d+)B(\d+)")
+_RANGE_CONTENT_RE = re.compile(r":\s*" + re.escape(RANGE_BEGIN) + r".*?" + re.escape(RANGE_END))
 
-    Args:
-        text: The text to parse citations from
 
-    Returns:
-        A list of Citation objects with start_idx and end_idx representing
-        the character positions in the text (excluding brackets)
-    """
-    citations: list[Citation] = []
+def _extract_range_pattern(range_part: str) -> str | None:
+    start_pattern: str | None = None
+
+    if RANGE_BEGIN in range_part and RANGE_END in range_part:
+        range_begin_idx = range_part.find(RANGE_BEGIN)
+        range_end_idx = range_part.find(RANGE_END)
+        if range_begin_idx != -1 and range_end_idx != -1:
+            range_content = range_part[range_begin_idx + len(RANGE_BEGIN) : range_end_idx]
+            start_pattern = range_content if range_content else None
+
+    return start_pattern
 
-    # Find all bracketed content first
-    bracket_pattern = r"\[(.*?)\]"
-    bracket_matches = re.finditer(bracket_pattern, text)
-
-    for bracket_match in bracket_matches:
-        bracket_content = bracket_match.group(1)
-        # Starting position of the bracket content (excluding '[')
-        content_start_pos = bracket_match.start() + 1
-
-        # Split by commas if present
-        parts = [part.strip() for part in bracket_content.split(",")]
-
-        for part in parts:
-            # Check if this part contains a dash (range citation)
-            if "-" in part:
-                # Split by dash and process each sub-part
-                dash_parts = [dash_part.strip() for dash_part in part.split("-")]
-                for dash_part in dash_parts:
-                    # Check for single block citation: T<key>B<idx>
-                    single_match = re.match(r"T(\d+)B(\d+)", dash_part)
-                    if single_match:
-                        transcript_idx = int(single_match.group(1))
-                        block_idx = int(single_match.group(2))
-
-                        # Find position within the original text
-                        citation_text = f"T{transcript_idx}B{block_idx}"
-                        part_pos_in_content = bracket_content.find(dash_part)
-                        ref_pos = content_start_pos + part_pos_in_content
-                        ref_end = ref_pos + len(citation_text)
-
-                        # Check if this citation overlaps with any existing citation
-                        if not any(
-                            citation["start_idx"] <= ref_pos < citation["end_idx"]
-                            or citation["start_idx"] < ref_end <= citation["end_idx"]
-                            for citation in citations
-                        ):
-                            citations.append(
-                                Citation(
-                                    start_idx=ref_pos,
-                                    end_idx=ref_end,
-                                    agent_run_idx=None,
-                                    transcript_idx=transcript_idx,
-                                    block_idx=block_idx,
-                                    action_unit_idx=None,
-                                )
-                            )
+
+def scan_brackets(text: str) -> list[tuple[int, int, str]]:
+    """Scan text for bracketed segments, respecting RANGE markers and nested brackets.
+
+    Returns a list of (start_index, end_index_exclusive, inner_content).
+    """
+    matches: list[tuple[int, int, str]] = []
+    i = 0
+    while i < len(text):
+        if text[i] == "[":
+            start = i
+            bracket_count = 1
+            j = i + 1
+            in_range = False
+
+            while j < len(text) and bracket_count > 0:
+                if text[j : j + len(RANGE_BEGIN)] == RANGE_BEGIN:
+                    in_range = True
+                elif text[j : j + len(RANGE_END)] == RANGE_END:
+                    in_range = False
+                elif text[j] == "[" and not in_range:
+                    bracket_count += 1
+                elif text[j] == "]" and not in_range:
+                    bracket_count -= 1
+                j += 1
+
+            if bracket_count == 0:
+                end_exclusive = j
+                bracket_content = text[start + 1 : end_exclusive - 1]
+                matches.append((start, end_exclusive, bracket_content))
+                i = j
             else:
-                # Check for single block citation: T<key>B<idx>
-                single_match = re.match(r"T(\d+)B(\d+)", part)
-                if single_match:
-                    transcript_idx = int(single_match.group(1))
-                    block_idx = int(single_match.group(2))
-
-                    # Find position within the original text
-                    citation_text = f"T{transcript_idx}B{block_idx}"
-                    part_pos_in_content = bracket_content.find(part)
-                    ref_pos = content_start_pos + part_pos_in_content
-                    ref_end = ref_pos + len(citation_text)
-
-                    # Check if this citation overlaps with any existing citation
-                    if not any(
-                        citation["start_idx"] <= ref_pos < citation["end_idx"]
-                        or citation["start_idx"] < ref_end <= citation["end_idx"]
-                        for citation in citations
-                    ):
-                        citations.append(
-                            Citation(
-                                start_idx=ref_pos,
-                                end_idx=ref_end,
-                                agent_run_idx=None,
-                                transcript_idx=transcript_idx,
-                                block_idx=block_idx,
-                                action_unit_idx=None,
-                            )
-                        )
-
-    return citations
-
-
-def parse_citations_multi_run(text: str) -> list[Citation]:
+                i += 1
+        else:
+            i += 1
+    return matches
+
+
+def parse_single_citation(part: str) -> tuple[int, int, str | None] | None:
     """
-    Parse citations from text in the format described by MULTI_BLOCK_CITE_INSTRUCTION.
+    Parse a single citation token inside a bracket and return its components.
+
+    Returns (transcript_idx, block_idx, start_pattern) or None if invalid.
+    """
+    token = part.strip()
+    if not token:
+        return None
+
+    if ":" in token:
+        citation_part, range_part = token.split(":", 1)
+        single_match = _SINGLE_RE.match(citation_part.strip())
+        if not single_match:
+            return None
+        transcript_idx = int(single_match.group(1))
+        block_idx = int(single_match.group(2))
+        start_pattern = _extract_range_pattern(range_part)
+        return transcript_idx, block_idx, start_pattern
+    else:
+        single_match = _SINGLE_RE.match(token)
+        if not single_match:
+            return None
+        transcript_idx = int(single_match.group(1))
+        block_idx = int(single_match.group(2))
+        return transcript_idx, block_idx, None
+
+
+def parse_citations(text: str) -> tuple[str, list[Citation]]:
+    """
+    Parse citations from text in the format described by BLOCK_RANGE_CITE_INSTRUCTION.
 
     Supported formats:
-    - Single block in transcript: [R<idx>T<key>B<idx>] or ([R<idx>T<key>B<idx>])
-    - Multiple blocks: [R<idx1>T<key1>B<idx1>][R<idx2>T<key2>B<idx2>]
-    - Comma-separated blocks: [R<idx1>T<key1>B<idx1>, R<idx2>T<key2>B<idx2>, ...]
-    - Dash-separated blocks: [R<idx1>T<key1>B<idx1>-R<idx2>T<key2>B<idx2>]
+    - Single block: [T<key>B<idx>]
+    - Text range with start pattern: [T<key>B<idx>:<RANGE>start_pattern</RANGE>]
 
     Args:
         text: The text to parse citations from
 
     Returns:
-        A list of Citation objects with start_idx and end_idx representing
-        the character positions in the text (excluding brackets)
+        A tuple of (cleaned_text, citations) where cleaned_text has brackets and range markers removed
+        and citations have start_idx and end_idx representing character positions
+        in the cleaned text
     """
     citations: list[Citation] = []
-
-    # Find all content within brackets - this handles nested brackets too
-    bracket_pattern = r"\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]"
-    # Also handle optional parentheses around the brackets
-    paren_bracket_pattern = r"\(\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\)"
-
-    # Single citation pattern
-    single_pattern = r"R(\d+)T(\d+)B(\d+)"
-
-    # Find all bracket matches
-    for pattern in [bracket_pattern, paren_bracket_pattern]:
-        matches = re.finditer(pattern, text)
-        for match in matches:
-            # Get the content inside brackets
-            if pattern == bracket_pattern:
-                content = match.group(1)
-                start_pos = match.start() + 1  # +1 to skip the opening bracket
-            else:
-                content = match.group(1)
-                start_pos = match.start() + 2  # +2 to skip the opening parenthesis and bracket
-
-            # Split by comma if present
-            items = [item.strip() for item in content.split(",")]
-
-            for item in items:
-                # Check if this item contains a dash (range citation)
-                if "-" in item:
-                    # Split by dash and process each sub-item
-                    dash_items = [dash_item.strip() for dash_item in item.split("-")]
-                    for dash_item in dash_items:
-                        # Check for single citation
-                        single_match = re.match(single_pattern, dash_item)
-                        if single_match:
-                            agent_run_idx = int(single_match.group(1))
-                            transcript_idx = int(single_match.group(2))
-                            block_idx = int(single_match.group(3))
-
-                            # Calculate position in the original text
-                            citation_text = f"R{agent_run_idx}T{transcript_idx}B{block_idx}"
-                            citation_start = text.find(citation_text, start_pos)
-                            citation_end = citation_start + len(citation_text)
-
-                            # Move start_pos for the next item if there are more items
-                            start_pos = citation_end
-
-                            # Avoid duplicate citations
-                            if not any(
-                                citation["start_idx"] == citation_start
-                                and citation["end_idx"] == citation_end
-                                for citation in citations
-                            ):
-                                citations.append(
-                                    Citation(
-                                        start_idx=citation_start,
-                                        end_idx=citation_end,
-                                        agent_run_idx=agent_run_idx,
-                                        transcript_idx=transcript_idx,
-                                        block_idx=block_idx,
-                                        action_unit_idx=None,
-                                    )
-                                )
-                else:
-                    # Check for single citation
-                    single_match = re.match(single_pattern, item)
-                    if single_match:
-                        agent_run_idx = int(single_match.group(1))
-                        transcript_idx = int(single_match.group(2))
-                        block_idx = int(single_match.group(3))
-
-                        # Calculate position in the original text
-                        citation_text = f"R{agent_run_idx}T{transcript_idx}B{block_idx}"
-                        citation_start = text.find(citation_text, start_pos)
-                        citation_end = citation_start + len(citation_text)
-
-                        # Move start_pos for the next item if there are more items
-                        start_pos = citation_end
-
-                        # Avoid duplicate citations
-                        if not any(
-                            citation["start_idx"] == citation_start
-                            and citation["end_idx"] == citation_end
-                            for citation in citations
-                        ):
-                            citations.append(
-                                Citation(
-                                    start_idx=citation_start,
-                                    end_idx=citation_end,
-                                    agent_run_idx=agent_run_idx,
-                                    transcript_idx=transcript_idx,
-                                    block_idx=block_idx,
-                                    action_unit_idx=None,
-                                )
-                            )
-
-    return citations
+    cleaned_text = ""
+
+    bracket_matches = scan_brackets(text)
+
+    last_end = 0
+    for start, end, bracket_content in bracket_matches:
+        # Append non-bracket text segment as-is
+        cleaned_text += text[last_end:start]
+
+        # Parse a single citation token inside the bracket
+        parsed = parse_single_citation(bracket_content)
+        if parsed:
+            transcript_idx, block_idx, start_pattern = parsed
+            replacement = f"T{transcript_idx}B{block_idx}"
+            # Current absolute start position for this replacement in the cleaned text
+            start_idx = len(cleaned_text)
+            end_idx = start_idx + len(replacement)
+            citations.append(
+                Citation(
+                    start_idx=start_idx,
+                    end_idx=end_idx,
+                    agent_run_idx=None,
+                    transcript_idx=transcript_idx,
+                    block_idx=block_idx,
+                    action_unit_idx=None,
+                    start_pattern=start_pattern,
+                )
+            )
+            cleaned_text += replacement
+        last_end = end
+
+    # Append any remaining tail after the last bracket
+    cleaned_text += text[last_end:]
+
+    return cleaned_text, citations
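
Note: a small worked example of the new parser, with behavior inferred from the diffed code above. The bracketed citation is replaced by its bare block ID in the cleaned text, and the range pattern is carried on the Citation.

    from docent.data_models.citation import parse_citations

    text = "The agent confirms the task [T0B1:<RANGE>I understand the task</RANGE>] before acting."
    cleaned, cites = parse_citations(text)

    # cleaned == "The agent confirms the task T0B1 before acting."
    # cites[0].transcript_idx == 0 and cites[0].block_idx == 1
    # cites[0].start_pattern == "I understand the task"
    # cleaned[cites[0].start_idx : cites[0].end_idx] == "T0B1"
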
docent/data_models/remove_invalid_citation_ranges.py ADDED
@@ -0,0 +1,166 @@
+import re
+
+from docent.data_models.agent_run import AgentRun
+from docent.data_models.citation import Citation, parse_single_citation, scan_brackets
+from docent.data_models.transcript import format_chat_message
+
+
+def build_whitespace_flexible_regex(pattern: str) -> re.Pattern[str]:
+    """Build regex that is flexible with whitespace matching."""
+    out = ""
+    i = 0
+    while i < len(pattern):
+        ch = pattern[i]
+        if ch.isspace():
+            # Skip all consecutive whitespace
+            while i < len(pattern) and pattern[i].isspace():
+                i += 1
+            out += r"\s+"
+            continue
+        out += re.escape(ch)
+        i += 1
+    return re.compile(out, re.DOTALL)
+
+
+def find_citation_matches_in_text(text: str, start_pattern: str) -> list[tuple[int, int]]:
+    """
+    Find all matches of a citation pattern in text.
+
+    Args:
+        text: The text to search in
+        start_pattern: The pattern to search for
+
+    Returns:
+        List of (start_index, end_index) tuples for matches
+    """
+    if not start_pattern:
+        return []
+
+    try:
+        regex = build_whitespace_flexible_regex(start_pattern)
+        matches: list[tuple[int, int]] = []
+
+        for match in regex.finditer(text):
+            if match.group().strip():  # Only count non-empty matches
+                matches.append((match.start(), match.end()))
+
+        return matches
+
+    except re.error:
+        return []
+
+
+def get_transcript_text_for_citation(agent_run: AgentRun, citation: Citation) -> str | None:
+    """
+    Get the text content of a specific transcript block from an AgentRun,
+    using the same formatting as shown to LLMs via format_chat_message.
+
+    Args:
+        agent_run: The agent run containing transcript data
+        citation: Citation with transcript_idx and block_idx
+
+    Returns:
+        Text content of the specified block (including tool calls), or None if not found
+    """
+    if citation.transcript_idx is None:
+        return None
+
+    try:
+        transcript_keys = list(agent_run.transcripts.keys())
+        if citation.transcript_idx >= len(transcript_keys):
+            return None
+
+        transcript_key = transcript_keys[citation.transcript_idx]
+
+        transcript = agent_run.transcripts[transcript_key]
+        if citation.block_idx >= len(transcript.messages):
+            return None
+
+        message = transcript.messages[citation.block_idx]
+
+        # Use the same formatting function that generates content for LLMs
+        # This ensures consistent formatting between citation validation and LLM serialization
+        return format_chat_message(
+            message, citation.block_idx, citation.transcript_idx, citation.agent_run_idx
+        )
+
+    except (KeyError, IndexError, AttributeError):
+        return None
+
+
+def validate_citation_text_range(agent_run: AgentRun, citation: Citation) -> bool:
+    """
+    Validate that a citation's text range exists in the referenced transcript.
+
+    Args:
+        agent_run: The agent run containing transcript data
+        citation: Citation to validate
+
+    Returns:
+        True if the citation's text range exists in the transcript, False otherwise
+    """
+    if not citation.start_pattern:
+        # Nothing to validate
+        return True
+
+    text = get_transcript_text_for_citation(agent_run, citation)
+    if text is None:
+        return False
+
+    matches = find_citation_matches_in_text(text, citation.start_pattern)
+
+    return len(matches) > 0
+
+
+def remove_invalid_citation_ranges(text: str, agent_run: AgentRun) -> str:
+    """
+    Remove invalid citation ranges from chat message/judge result. We do this as a separate step before normal citation parsing.
+    Normal citation parsing happens every time we load chat/results from db,
+    but invalid ranges should never make it to the db.
+
+    Args:
+        text: Original text containing citations
+        agent_run: Agent run with transcript data
+
+    Returns:
+        str: Text with invalid citation ranges replaced by plain block citations.
+    """
+    # Find all bracket positions in the original text
+    bracket_matches = scan_brackets(text)
+    citations: list[Citation] = []
+
+    for start, end, bracket_content in bracket_matches:
+        # Parse this bracket content to get citation info
+        parsed = parse_single_citation(bracket_content)
+        if parsed:
+            transcript_idx, block_idx, start_pattern = parsed
+            # The citation spans from start to end in the original text
+            citation = Citation(
+                start_idx=start,
+                end_idx=end,
+                agent_run_idx=None,
+                transcript_idx=transcript_idx,
+                block_idx=block_idx,
+                action_unit_idx=None,
+                start_pattern=start_pattern,
+            )
+            citations.append(citation)
+
+    # Filter to only citations with text ranges that need validation
+    citations_to_validate = [c for c in citations if c.start_pattern]
+
+    # Sort citations by start_idx in reverse order to avoid index shifting issues
+    sorted_citations = sorted(citations_to_validate, key=lambda c: c.start_idx, reverse=True)
+
+    invalid_citations: list[Citation] = [
+        c for c in sorted_citations if not validate_citation_text_range(agent_run, c)
+    ]
+
+    # Remove invalid text ranges from citations in the original text
+    modified_text = text
+    for citation in invalid_citations:
+        citation_without_range = f"[T{citation.transcript_idx}B{citation.block_idx}]"
+        before = modified_text[: citation.start_idx]
+        after = modified_text[citation.end_idx :]
+        modified_text = before + citation_without_range + after
+    return modified_text
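
Note: the whitespace-flexible matcher above collapses every whitespace run in the cited pattern to \s+, so a quote still validates when the transcript wraps lines differently. A verifiable example of that helper:

    from docent.data_models.remove_invalid_citation_ranges import (
        build_whitespace_flexible_regex,
    )

    regex = build_whitespace_flexible_regex("hello   world")
    # Runs of whitespace are collapsed into a single \s+ pattern
    assert regex.pattern == r"hello\s+world"
    # So a newline/tab in the transcript still matches the cited text
    assert regex.search("hello\n\tworld") is not None
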
docent/data_models/transcript.py CHANGED
@@ -12,6 +12,7 @@ from docent.data_models._tiktoken_util import (
     truncate_to_token_limit,
 )
 from docent.data_models.chat import AssistantMessage, ChatMessage, ContentReasoning
+from docent.data_models.citation import RANGE_BEGIN, RANGE_END
 
 # Template for formatting individual transcript blocks
 TRANSCRIPT_BLOCK_TEMPLATE = """
@@ -21,10 +22,20 @@ TRANSCRIPT_BLOCK_TEMPLATE = """
 """.strip()
 
 # Instructions for citing single transcript blocks
-SINGLE_RUN_CITE_INSTRUCTION = "Each transcript and each block has a unique index. Cite the relevant indices in brackets when relevant, like [T<idx>B<idx>]. Use multiple tags to cite multiple blocks, like [T<idx1>B<idx1>][T<idx2>B<idx2>]. Use an inner dash to cite a range of blocks, like [T<idx1>B<idx1>-T<idx2>B<idx2>]. Remember to cite specific blocks and NOT action units."
+TEXT_RANGE_CITE_INSTRUCTION = f"""Anytime you quote the transcript, or refer to something that happened in the transcript, or make any claim about the transcript, add an inline citation. Each transcript and each block has a unique index. Cite the relevant indices in brackets. For example, to cite the entirety of transcript 0, block 1, write [T0B1].
 
-# Instructions for citing multiple transcript blocks
-MULTI_RUN_CITE_INSTRUCTION = "Each run, each transcript, and each block has a unique index. Cite the relevant indices in brackets when relevant, like [R<idx>T<idx>B<idx>]. Use multiple tags to cite multiple blocks, like [R<idx1>T<idx1>B<idx1>][R<idx2>T<idx2>B<idx2>]. Use an inner dash to cite a range of blocks, like [R<idx1>T<idx1>B<idx1>-R<idx2>T<idx2>B<idx2>]. Remember to cite specific blocks and NOT action units."
+A citation may include a specific range of text within a block. Use {RANGE_BEGIN} and {RANGE_END} to mark the specific range of text. Add it after the block ID separated by a colon. For example, to cite the part of transcript 0, block 1, where the agent says "I understand the task", write [T0B1:{RANGE_BEGIN}I understand the task{RANGE_END}]. Citations must follow this exact format. The markers {RANGE_BEGIN} and {RANGE_END} must be used ONLY inside the brackets of a citation.
+
+Important notes:
+- You must include the full content of the text range between {RANGE_BEGIN} and {RANGE_END}, EXACTLY as it appears in the transcript, word-for-word, including any markers or punctuation that appear in the middle of the text.
+- Citations must be as specific as possible. This means you should usually cite a specific text range within a block.
+- A citation is not a quote. For brevity, text ranges will not be rendered inline. The user will have to click on the citation to see the full text range.
+- Citations are self-contained. Do NOT label them as citation or evidence. Just insert the citation by itself at the appropriate place in the text.
+- Citations must come immediately after the part of a claim that they support. This may be in the middle of a sentence.
+- Each pair of brackets must contain only one citation. To cite multiple blocks, use multiple pairs of brackets, like [T0B0] [T0B1].
+"""
+
+BLOCK_CITE_INSTRUCTION = f"""Each transcript and each block has a unique index. Cite the relevant indices in brackets when relevant, like [T<idx>B<idx>]. Use multiple tags to cite multiple blocks, like [T<idx1>B<idx1>][T<idx2>B<idx2>]. Remember to cite specific blocks and NOT action units."""
 
 
 def format_chat_message(
@@ -291,66 +302,105 @@ class Transcript(BaseModel):
         agent_run_idx: int | None = None,
         highlight_action_unit: int | None = None,
     ) -> str:
-        return self.to_str_with_token_limit(
+        return self._to_str_with_token_limit_impl(
             token_limit=sys.maxsize,
-            agent_run_idx=agent_run_idx,
             transcript_idx=transcript_idx,
+            agent_run_idx=agent_run_idx,
+            use_action_units=True,
             highlight_action_unit=highlight_action_unit,
         )[0]
 
-    def to_str_with_token_limit(
+    def _generate_formatted_blocks(
         self,
-        token_limit: int,
         transcript_idx: int = 0,
         agent_run_idx: int | None = None,
+        use_action_units: bool = True,
         highlight_action_unit: int | None = None,
     ) -> list[str]:
-        """Represents the transcript as a list of strings, each of which is at most token_limit tokens
-        under the GPT-4 tokenization scheme.
+        """Generate formatted blocks for transcript representation.
 
-        We'll try to split up long transcripts along message boundaries and include metadata.
-        For very long messages, we'll have to truncate them and remove metadata.
+        Args:
+            transcript_idx: Index of the transcript
+            agent_run_idx: Optional agent run index
+            use_action_units: If True, group messages into action units. If False, use individual blocks.
+            highlight_action_unit: Optional action unit to highlight (only used with action units)
 
         Returns:
-            list[str]: A list of strings, each of which is at most token_limit tokens
-            under the GPT-4 tokenization scheme.
+            list[str]: List of formatted blocks
         """
-        if highlight_action_unit is not None and not (
-            0 <= highlight_action_unit < len(self._units_of_action or [])
-        ):
-            raise ValueError(f"Invalid action unit index: {highlight_action_unit}")
-
-        # Format blocks by units of action
-        au_blocks: list[str] = []
-        for unit_idx, unit in enumerate(self._units_of_action or []):
-            unit_blocks: list[str] = []
-            for msg_idx in unit:
-                unit_blocks.append(
+        if use_action_units:
+            if highlight_action_unit is not None and not (
+                0 <= highlight_action_unit < len(self._units_of_action or [])
+            ):
+                raise ValueError(f"Invalid action unit index: {highlight_action_unit}")
+
+            blocks: list[str] = []
+            for unit_idx, unit in enumerate(self._units_of_action or []):
+                unit_blocks: list[str] = []
+                for msg_idx in unit:
+                    unit_blocks.append(
+                        format_chat_message(
+                            self.messages[msg_idx],
+                            msg_idx,
+                            transcript_idx,
+                            agent_run_idx,
+                        )
+                    )
+
+                unit_content = "\n".join(unit_blocks)
+
+                # Add highlighting if requested
+                if highlight_action_unit and unit_idx == highlight_action_unit:
+                    blocks_str_template = "<HIGHLIGHTED>\n{}\n</HIGHLIGHTED>"
+                else:
+                    blocks_str_template = "{}"
+                blocks.append(
+                    blocks_str_template.format(
+                        f"<action unit {unit_idx}>\n{unit_content}\n</action unit {unit_idx}>"
+                    )
+                )
+        else:
+            # Individual message blocks
+            blocks = []
+            for msg_idx, message in enumerate(self.messages):
+                blocks.append(
                     format_chat_message(
-                        self.messages[msg_idx],
+                        message,
                         msg_idx,
                         transcript_idx,
                         agent_run_idx,
                    )
                )
 
-            unit_content = "\n".join(unit_blocks)
+        return blocks
 
-            # Add highlighting if requested
-            if highlight_action_unit and unit_idx == highlight_action_unit:
-                blocks_str_template = "<HIGHLIGHTED>\n{}\n</HIGHLIGHTED>"
-            else:
-                blocks_str_template = "{}"
-            au_blocks.append(
-                blocks_str_template.format(
-                    f"<action unit {unit_idx}>\n{unit_content}\n</action unit {unit_idx}>"
-                )
-            )
-        blocks_str = "\n".join(au_blocks)
+    def _to_str_with_token_limit_impl(
+        self,
+        token_limit: int,
+        transcript_idx: int = 0,
+        agent_run_idx: int | None = None,
+        use_action_units: bool = True,
+        highlight_action_unit: int | None = None,
+    ) -> list[str]:
+        """Core implementation for string representation with token limits.
+
+        Args:
+            token_limit: Maximum tokens per returned string
+            transcript_idx: Index of the transcript
+            agent_run_idx: Optional agent run index
+            use_action_units: If True, group messages into action units. If False, use individual blocks.
+            highlight_action_unit: Optional action unit to highlight (only used with action units)
+
+        Returns:
+            list[str]: List of strings, each within token limit
+        """
+        blocks = self._generate_formatted_blocks(
+            transcript_idx, agent_run_idx, use_action_units, highlight_action_unit
+        )
+        blocks_str = "\n".join(blocks)
 
         # Gather metadata
         metadata_obj = fake_model_dump(self.metadata)
-
         yaml_width = float("inf")
         block_str = f"<blocks>\n{blocks_str}\n</blocks>\n"
         metadata_str = f"<metadata>\n{yaml.dump(metadata_obj, width=yaml_width)}\n</metadata>"
@@ -365,25 +415,75 @@ class Transcript(BaseModel):
             return [f"{block_str}" f"{metadata_str}"]
         else:
             results: list[str] = []
-            block_token_counts = [get_token_count(block) for block in au_blocks]
+            block_token_counts = [get_token_count(block) for block in blocks]
             ranges = group_messages_into_ranges(
                 block_token_counts, metadata_token_count, token_limit
            )
            for msg_range in ranges:
                if msg_range.include_metadata:
-                    cur_au_blocks = "\n".join(au_blocks[msg_range.start : msg_range.end])
-                    results.append(f"<blocks>\n{cur_au_blocks}\n</blocks>\n" f"{metadata_str}")
+                    cur_blocks = "\n".join(blocks[msg_range.start : msg_range.end])
+                    results.append(f"<blocks>\n{cur_blocks}\n</blocks>\n" f"{metadata_str}")
                else:
                    assert (
                        msg_range.end == msg_range.start + 1
                    ), "Ranges without metadata should be a single message"
-                    result = str(au_blocks[msg_range.start])
+                    result = str(blocks[msg_range.start])
                    if msg_range.num_tokens > token_limit - 10:
                        result = truncate_to_token_limit(result, token_limit - 10)
                    results.append(f"<blocks>\n{result}\n</blocks>\n")
 
            return results
 
+    def to_str_blocks(
+        self,
+        transcript_idx: int = 0,
+        agent_run_idx: int | None = None,
+    ) -> str:
+        """Represents the transcript as a string using individual message blocks.
+
+        Unlike to_str() which groups messages into action units, this method
+        formats each message as an individual block.
+
+        Returns:
+            str: A string representation with individual message blocks.
+        """
+        return self._to_str_with_token_limit_impl(
+            token_limit=sys.maxsize,
+            transcript_idx=transcript_idx,
+            agent_run_idx=agent_run_idx,
+            use_action_units=False,
+        )[0]
+
+    def to_str_with_token_limit(
+        self,
+        token_limit: int,
+        transcript_idx: int = 0,
+        agent_run_idx: int | None = None,
+        highlight_action_unit: int | None = None,
+    ) -> list[str]:
+        """Represents the transcript as a list of strings using action units with token limit handling."""
+        return self._to_str_with_token_limit_impl(
+            token_limit=token_limit,
+            transcript_idx=transcript_idx,
+            agent_run_idx=agent_run_idx,
+            use_action_units=True,
+            highlight_action_unit=highlight_action_unit,
+        )
+
+    def to_str_blocks_with_token_limit(
+        self,
+        token_limit: int,
+        transcript_idx: int = 0,
+        agent_run_idx: int | None = None,
+    ) -> list[str]:
+        """Represents the transcript as individual blocks with token limit handling."""
+        return self._to_str_with_token_limit_impl(
+            token_limit=token_limit,
+            transcript_idx=transcript_idx,
+            agent_run_idx=agent_run_idx,
+            use_action_units=False,
+        )
+
 
 class TranscriptWithoutMetadataValidator(Transcript):
     """
docent/sdk/client.py CHANGED
@@ -196,7 +196,7 @@ class Docent:
         response.raise_for_status()
         return response.json()
 
-    def list_searches(self, collection_id: str) -> list[dict[str, Any]]:
+    def list_rubrics(self, collection_id: str) -> list[dict[str, Any]]:
         """List all rubrics for a given collection.
 
         Args:
@@ -213,71 +213,73 @@ class Docent:
         response.raise_for_status()
         return response.json()
 
-    def get_search_results(
-        self, collection_id: str, rubric_id: str, rubric_version: int
-    ) -> list[dict[str, Any]]:
-        """Get rubric results for a given collection, rubric and version.
+    def get_rubric_run_state(self, collection_id: str, rubric_id: str) -> dict[str, Any]:
+        """Get rubric run state for a given collection and rubric.
 
         Args:
             collection_id: ID of the Collection.
-            rubric_id: The ID of the rubric to get results for.
-            rubric_version: The version of the rubric to get results for.
+            rubric_id: The ID of the rubric to get run state for.
 
         Returns:
-            list: List of dictionaries containing rubric result information.
+            dict: Dictionary containing rubric run state with results, job_id, and total_agent_runs.
 
         Raises:
             requests.exceptions.HTTPError: If the API request fails.
         """
-        url = f"{self._server_url}/rubric/{collection_id}/{rubric_id}/results"
-        response = self._session.get(url, params={"rubric_version": rubric_version})
+        url = f"{self._server_url}/rubric/{collection_id}/{rubric_id}/rubric_run_state"
+        response = self._session.get(url)
         response.raise_for_status()
         return response.json()
 
-    def list_search_clusters(
-        self, collection_id: str, rubric_id: str, rubric_version: int | None = None
-    ) -> list[dict[str, Any]]:
-        """List all centroids for a given collection and rubric.
+    def get_clustering_state(self, collection_id: str, rubric_id: str) -> dict[str, Any]:
+        """Get clustering state for a given collection and rubric.
 
         Args:
             collection_id: ID of the Collection.
-            rubric_id: The ID of the rubric to get centroids for.
-            rubric_version: Optional version of the rubric. If not provided, uses latest.
+            rubric_id: The ID of the rubric to get clustering state for.
 
         Returns:
-            list: List of dictionaries containing centroid information.
+            dict: Dictionary containing job_id, centroids, and assignments.
 
         Raises:
             requests.exceptions.HTTPError: If the API request fails.
         """
-        url = f"{self._server_url}/rubric/{collection_id}/{rubric_id}/centroids"
-        params: dict[str, int] = {}
-        if rubric_version is not None:
-            params["rubric_version"] = rubric_version
-        response = self._session.get(url, params=params)
+        url = f"{self._server_url}/rubric/{collection_id}/{rubric_id}/clustering_job"
+        response = self._session.get(url)
         response.raise_for_status()
         return response.json()
 
-    def get_cluster_matches(
-        self, collection_id: str, rubric_id: str, rubric_version: int
-    ) -> list[dict[str, Any]]:
+    def get_cluster_centroids(self, collection_id: str, rubric_id: str) -> list[dict[str, Any]]:
+        """Get centroids for a given collection and rubric.
+
+        Args:
+            collection_id: ID of the Collection.
+            rubric_id: The ID of the rubric to get centroids for.
+
+        Returns:
+            list: List of dictionaries containing centroid information.
+
+        Raises:
+            requests.exceptions.HTTPError: If the API request fails.
+        """
+        clustering_state = self.get_clustering_state(collection_id, rubric_id)
+        return clustering_state.get("centroids", [])
+
+    def get_cluster_assignments(self, collection_id: str, rubric_id: str) -> dict[str, list[str]]:
         """Get centroid assignments for a given rubric.
 
         Args:
             collection_id: ID of the Collection.
             rubric_id: The ID of the rubric to get assignments for.
-            rubric_version: The version of the rubric to get assignments for.
 
         Returns:
-            list: List of dictionaries containing centroid assignment information.
+            dict: Dictionary mapping centroid IDs to lists of judge result IDs.
 
         Raises:
             requests.exceptions.HTTPError: If the API request fails.
         """
-        url = f"{self._server_url}/rubric/{collection_id}/{rubric_id}/assignments"
-        response = self._session.get(url, params={"rubric_version": rubric_version})
-        response.raise_for_status()
-        return response.json()
+        clustering_state = self.get_clustering_state(collection_id, rubric_id)
+        return clustering_state.get("assignments", {})
 
     def get_agent_run(self, collection_id: str, agent_run_id: str) -> AgentRun | None:
         """Get a specific agent run by its ID.
@@ -348,3 +350,20 @@ class Docent:
 
         logger.info(f"Successfully shared Collection '{collection_id}' with {email}")
         return response.json()
+
+    def list_agent_run_ids(self, collection_id: str) -> list[str]:
+        """Get all agent run IDs for a collection.
+
+        Args:
+            collection_id: ID of the Collection.
+
+        Returns:
+            list[str]: List of agent run IDs.
+
+        Raises:
+            requests.exceptions.HTTPError: If the API request fails.
+        """
+        url = f"{self._server_url}/{collection_id}/agent_run_ids"
+        response = self._session.get(url)
+        response.raise_for_status()
+        return response.json()
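
Note: a sketch of walking the renamed SDK surface. This assumes a configured Docent client (constructor arguments are not shown in this diff) and assumes each rubric dict carries an "id" key; both are illustrative.

    from docent.sdk.client import Docent

    client = Docent()  # assumption: default configuration; real constructor args may differ
    collection_id = "my-collection"  # hypothetical ID

    for run_id in client.list_agent_run_ids(collection_id):
        run = client.get_agent_run(collection_id, run_id)

    for rubric in client.list_rubrics(collection_id):
        state = client.get_rubric_run_state(collection_id, rubric["id"])
        assignments = client.get_cluster_assignments(collection_id, rubric["id"])
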
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docent-python
-Version: 0.1.9a0
+Version: 0.1.11a0
 Summary: Docent SDK
 Project-URL: Homepage, https://github.com/TransluceAI/docent
 Project-URL: Issues, https://github.com/TransluceAI/docent/issues
@@ -6,15 +6,16 @@ docent/_log_util/__init__.py,sha256=3HXXrxrSm8PxwG4llotrCnSnp7GuroK1FNHsdg6f7aE,
 docent/_log_util/logger.py,sha256=kwM0yRW1IJd6-XTorjWn48B4l8qvD2ZM6VDjY5eskQI,4422
 docent/data_models/__init__.py,sha256=4JbTDVzRhS5VZgo8MALwd_YI17GaN7X9E3rOc4Xl7kw,327
 docent/data_models/_tiktoken_util.py,sha256=hC0EDDWItv5-0cONBnHWgZtQOflDU7ZNEhXPFo4DvPc,3057
-docent/data_models/agent_run.py,sha256=bDRToWUlY52PugoHWU1D9hasr5t_fnTmRLpkzWP1s_k,9811
-docent/data_models/citation.py,sha256=WsVQZcBT2EJD24ysyeVOC5Xfo165RI7P5_cOnJBgHj0,10015
+docent/data_models/agent_run.py,sha256=AhokdyEscrlrg0q5aKaOv26cYvkA6LvAoQsz_WBg_pM,12240
+docent/data_models/citation.py,sha256=zpF9WuvVEfktltw1M9P3hwpg5yywizFUKF5zROBR2cY,5062
 docent/data_models/metadata.py,sha256=r0SYC4i2x096dXMLfw_rAMtcJQCsoV6EOMPZuEngbGA,9062
 docent/data_models/regex.py,sha256=0ciIerkrNwb91bY5mTcyO5nDWH67xx2tZYObV52fmBo,1684
+docent/data_models/remove_invalid_citation_ranges.py,sha256=0cn4Xg_tgg45nZvc-sjtqLgr1rywBBrsLJ_WBKEF0pY,5673
 docent/data_models/shared_types.py,sha256=jjm-Dh5S6v7UKInW7SEqoziOsx6Z7Uu4e3VzgCbTWvc,225
-docent/data_models/transcript.py,sha256=0iF2ujcWhTss8WkkpNMeIKJyKOfMEsiMoAQMGwY4ing,15753
+docent/data_models/transcript.py,sha256=Gmy4lYdlvC5SXzpnerFJ83lIMPPiYUPgjOUbwg6aWJQ,20238
 docent/data_models/chat/__init__.py,sha256=GleyRzYqKRkwwSRm_tQJw5BudCbgu9WRSa71Fntz0L0,610
 docent/data_models/chat/content.py,sha256=Co-jO8frQa_DSP11wJuhPX0s-GpJk8yqtKqPeiAIZ_U,1672
-docent/data_models/chat/message.py,sha256=iAo38kbV6wYbFh8S23cxLy6HY4C_i3PzQ6RpSQG5dxM,3861
+docent/data_models/chat/message.py,sha256=xGt09keA6HRxw40xB_toNzEqA9ip7k53dnhXrEbKGO8,4157
 docent/data_models/chat/tool.py,sha256=x7NKINswPe0Kqvcx4ubjHzB-n0-i4DbFodvaBb2vitk,3042
 docent/loaders/load_inspect.py,sha256=_cK2Qd6gyLQuJVzOlsvEZz7TrqzNmH6ZsLTkSCWAPqQ,6628
 docent/samples/__init__.py,sha256=roDFnU6515l9Q8v17Es_SpWyY9jbm5d6X9lV01V0MZo,143
@@ -22,8 +23,8 @@ docent/samples/load.py,sha256=ZGE07r83GBNO4A0QBh5aQ18WAu3mTWA1vxUoHd90nrM,207
 docent/samples/log.eval,sha256=orrW__9WBfANq7NwKsPSq9oTsQRcG6KohG5tMr_X_XY,397708
 docent/samples/tb_airline.json,sha256=eR2jFFRtOw06xqbEglh6-dPewjifOk-cuxJq67Dtu5I,47028
 docent/sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docent/sdk/client.py,sha256=fLdniy8JzMLoZpaS9SP2pHban_ToavgtI8VeHZLMNZo,12773
-docent_python-0.1.9a0.dist-info/METADATA,sha256=fgAhTw2bXGNLlU2Y6XFq2rvg7lloXipHXWRXXHLq4gw,1037
-docent_python-0.1.9a0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-docent_python-0.1.9a0.dist-info/licenses/LICENSE.md,sha256=vOHzq3K4Ndu0UV9hPrtXvlD7pHOjyDQmGjHuLSIkRQY,1087
-docent_python-0.1.9a0.dist-info/RECORD,,
+docent/sdk/client.py,sha256=rvOFXvyAr9QxCijN0_CWENbm8y3YQvR1msfFSBDZvOw,13309
+docent_python-0.1.11a0.dist-info/METADATA,sha256=6VpTCCXzOgvSPC3ox6eeIZepRxdcY9gP4SOh5QF5hQ4,1038
+docent_python-0.1.11a0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+docent_python-0.1.11a0.dist-info/licenses/LICENSE.md,sha256=vOHzq3K4Ndu0UV9hPrtXvlD7pHOjyDQmGjHuLSIkRQY,1087
+docent_python-0.1.11a0.dist-info/RECORD,,