docent-python 0.1.10a0__tar.gz → 0.1.12a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docent-python might be problematic. Click here for more details.

Files changed (33)
  1. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/PKG-INFO +1 -1
  2. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/agent_run.py +68 -13
  3. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/chat/message.py +5 -0
  4. docent_python-0.1.12a0/docent/data_models/citation.py +152 -0
  5. docent_python-0.1.12a0/docent/data_models/remove_invalid_citation_ranges.py +166 -0
  6. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/transcript.py +142 -42
  7. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/sdk/client.py +17 -0
  8. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/trace.py +33 -52
  9. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/pyproject.toml +1 -1
  10. docent_python-0.1.10a0/docent/data_models/citation.py +0 -223
  11. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/.gitignore +0 -0
  12. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/LICENSE.md +0 -0
  13. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/README.md +0 -0
  14. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/__init__.py +0 -0
  15. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/_log_util/__init__.py +0 -0
  16. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/_log_util/logger.py +0 -0
  17. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/__init__.py +0 -0
  18. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/_tiktoken_util.py +0 -0
  19. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/chat/__init__.py +0 -0
  20. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/chat/content.py +0 -0
  21. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/chat/tool.py +0 -0
  22. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/metadata.py +0 -0
  23. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/regex.py +0 -0
  24. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/data_models/shared_types.py +0 -0
  25. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/loaders/load_inspect.py +0 -0
  26. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/py.typed +0 -0
  27. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/samples/__init__.py +0 -0
  28. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/samples/load.py +0 -0
  29. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/samples/log.eval +0 -0
  30. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/samples/tb_airline.json +0 -0
  31. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/sdk/__init__.py +0 -0
  32. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/docent/trace_temp.py +0 -0
  33. {docent_python-0.1.10a0 → docent_python-0.1.12a0}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docent-python
3
- Version: 0.1.10a0
3
+ Version: 0.1.12a0
4
4
  Summary: Docent SDK
5
5
  Project-URL: Homepage, https://github.com/TransluceAI/docent
6
6
  Project-URL: Issues, https://github.com/TransluceAI/docent/issues
@@ -90,19 +90,36 @@ class AgentRun(BaseModel):
90
90
  raise ValueError("AgentRun must have at least one transcript")
91
91
  return self
92
92
 
93
- def to_text(self, token_limit: int = sys.maxsize) -> list[str]:
93
+ def _to_text_impl(self, token_limit: int = sys.maxsize, use_blocks: bool = False) -> list[str]:
94
94
  """
95
- Represents an agent run as a list of strings, each of which is at most token_limit tokens
96
- under the GPT-4 tokenization scheme.
95
+ Core implementation for converting agent run to text representation.
97
96
 
98
- We'll try to split up long AgentRuns along transcript boundaries and include metadata.
99
- For very long transcripts, we'll have to split them up further and remove metadata.
97
+ Args:
98
+ token_limit: Maximum tokens per returned string under the GPT-4 tokenization scheme
99
+ use_blocks: If True, use individual message blocks. If False, use action units.
100
+
101
+ Returns:
102
+ List of strings, each at most token_limit tokens
100
103
  """
104
+ # Generate transcript strings using appropriate method
105
+ transcript_strs: list[str] = []
106
+ for i, (t_key, t) in enumerate(self.transcripts.items()):
107
+ if use_blocks:
108
+ transcript_content = t.to_str_blocks_with_token_limit(
109
+ token_limit=sys.maxsize,
110
+ transcript_idx=i,
111
+ agent_run_idx=None,
112
+ )[0]
113
+ else:
114
+ transcript_content = t.to_str_with_token_limit(
115
+ token_limit=sys.maxsize,
116
+ transcript_idx=i,
117
+ agent_run_idx=None,
118
+ )[0]
119
+ transcript_strs.append(
120
+ f"<transcript {t_key}>\n{transcript_content}\n</transcript {t_key}>"
121
+ )
101
122
 
102
- transcript_strs: list[str] = [
103
- f"<transcript {t_key}>\n{t.to_str(agent_run_idx=None, transcript_idx=i)}\n</transcript {t_key}>"
104
- for i, (t_key, t) in enumerate(self.transcripts.items())
105
- ]
106
123
  transcripts_str = "\n\n".join(transcript_strs)
107
124
 
108
125
  # Gather metadata
@@ -128,7 +145,6 @@ class AgentRun(BaseModel):
128
145
  return [f"{transcripts_str}" f"{metadata_str}"]
129
146
 
130
147
  # Otherwise, split up the transcript and metadata into chunks
131
- # TODO(vincent, mengk): does this code account for multiple transcripts correctly? a little confused.
132
148
  else:
133
149
  results: list[str] = []
134
150
  transcript_token_counts = [get_token_count(t) for t in transcript_strs]
@@ -150,13 +166,23 @@ class AgentRun(BaseModel):
150
166
  ), "Ranges without metadata should be a single message"
151
167
  t_id, t = list(self.transcripts.items())[msg_range.start]
152
168
  if msg_range.num_tokens < token_limit - 50:
153
- transcript = f"<transcript {t_id}>\n{t.to_str()}\n</transcript {t_id}>"
169
+ if use_blocks:
170
+ transcript = f"<transcript {t_id}>\n{t.to_str_blocks_with_token_limit(token_limit=sys.maxsize)[0]}\n</transcript {t_id}>"
171
+ else:
172
+ transcript = f"<transcript {t_id}>\n{t.to_str_with_token_limit(token_limit=sys.maxsize)[0]}\n</transcript {t_id}>"
154
173
  result = (
155
174
  f"Here is a partial agent run for analysis purposes only:\n{transcript}"
156
175
  )
157
176
  results.append(result)
158
177
  else:
159
- transcript_fragments = t.to_str_with_token_limit(token_limit - 50)
178
+ if use_blocks:
179
+ transcript_fragments = t.to_str_blocks_with_token_limit(
180
+ token_limit=token_limit - 50,
181
+ )
182
+ else:
183
+ transcript_fragments = t.to_str_with_token_limit(
184
+ token_limit=token_limit - 50,
185
+ )
160
186
  for fragment in transcript_fragments:
161
187
  result = f"<transcript {t_id}>\n{fragment}\n</transcript {t_id}>"
162
188
  result = (
@@ -165,6 +191,26 @@ class AgentRun(BaseModel):
165
191
  results.append(result)
166
192
  return results
167
193
 
194
+ def to_text(self, token_limit: int = sys.maxsize) -> list[str]:
195
+ """
196
+ Represents an agent run as a list of strings, each of which is at most token_limit tokens
197
+ under the GPT-4 tokenization scheme.
198
+
199
+ We'll try to split up long AgentRuns along transcript boundaries and include metadata.
200
+ For very long transcripts, we'll have to split them up further and remove metadata.
201
+ """
202
+ return self._to_text_impl(token_limit=token_limit, use_blocks=False)
203
+
204
+ def to_text_blocks(self, token_limit: int = sys.maxsize) -> list[str]:
205
+ """
206
+ Represents an agent run as a list of strings using individual message blocks,
207
+ each of which is at most token_limit tokens under the GPT-4 tokenization scheme.
208
+
209
+ Unlike to_text() which uses action units, this method formats each message
210
+ as an individual block.
211
+ """
212
+ return self._to_text_impl(token_limit=token_limit, use_blocks=True)
213
+
168
214
  @property
169
215
  def text(self) -> str:
170
216
  """Concatenates all transcript texts with double newlines as separators.
@@ -172,7 +218,16 @@ class AgentRun(BaseModel):
172
218
  Returns:
173
219
  str: A string representation of all transcripts.
174
220
  """
175
- return self.to_text()[0]
221
+ return self._to_text_impl(token_limit=sys.maxsize, use_blocks=False)[0]
222
+
223
+ @property
224
+ def text_blocks(self) -> str:
225
+ """Concatenates all transcript texts using individual blocks format.
226
+
227
+ Returns:
228
+ str: A string representation of all transcripts using individual message blocks.
229
+ """
230
+ return self._to_text_impl(token_limit=sys.maxsize, use_blocks=True)[0]
176
231
 
177
232
  def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
178
233
  """Extends the parent model_dump method to include the text property.
@@ -5,6 +5,7 @@ from pydantic import BaseModel, Discriminator
5
5
 
6
6
  from docent.data_models.chat.content import Content
7
7
  from docent.data_models.chat.tool import ToolCall
8
+ from docent.data_models.citation import Citation
8
9
 
9
10
  logger = getLogger(__name__)
10
11
 
@@ -66,11 +67,15 @@ class AssistantMessage(BaseChatMessage):
66
67
  role: Always set to "assistant".
67
68
  model: Optional identifier for the model that generated this message.
68
69
  tool_calls: Optional list of tool calls made by the assistant.
70
+ citations: Optional list of citations referenced in the message content.
71
+ suggested_messages: Optional list of suggested followup messages.
69
72
  """
70
73
 
71
74
  role: Literal["assistant"] = "assistant" # type: ignore
72
75
  model: str | None = None
73
76
  tool_calls: list[ToolCall] | None = None
77
+ citations: list[Citation] | None = None
78
+ suggested_messages: list[str] | None = None
74
79
 
75
80
 
76
81
  class ToolMessage(BaseChatMessage):
@@ -0,0 +1,152 @@
1
+ import re
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
+ class Citation(BaseModel):
7
+ start_idx: int
8
+ end_idx: int
9
+ agent_run_idx: int | None = None
10
+ transcript_idx: int | None = None
11
+ block_idx: int
12
+ action_unit_idx: int | None = None
13
+ start_pattern: str | None = None
14
+
15
+
16
+ RANGE_BEGIN = "<RANGE>"
17
+ RANGE_END = "</RANGE>"
18
+
19
+ _SINGLE_RE = re.compile(r"T(\d+)B(\d+)")
20
+ _RANGE_CONTENT_RE = re.compile(r":\s*" + re.escape(RANGE_BEGIN) + r".*?" + re.escape(RANGE_END))
21
+
22
+
23
+ def _extract_range_pattern(range_part: str) -> str | None:
24
+ start_pattern: str | None = None
25
+
26
+ if RANGE_BEGIN in range_part and RANGE_END in range_part:
27
+ range_begin_idx = range_part.find(RANGE_BEGIN)
28
+ range_end_idx = range_part.find(RANGE_END)
29
+ if range_begin_idx != -1 and range_end_idx != -1:
30
+ range_content = range_part[range_begin_idx + len(RANGE_BEGIN) : range_end_idx]
31
+ start_pattern = range_content if range_content else None
32
+
33
+ return start_pattern
34
+
35
+
36
+ def scan_brackets(text: str) -> list[tuple[int, int, str]]:
37
+ """Scan text for bracketed segments, respecting RANGE markers and nested brackets.
38
+
39
+ Returns a list of (start_index, end_index_exclusive, inner_content).
40
+ """
41
+ matches: list[tuple[int, int, str]] = []
42
+ i = 0
43
+ while i < len(text):
44
+ if text[i] == "[":
45
+ start = i
46
+ bracket_count = 1
47
+ j = i + 1
48
+ in_range = False
49
+
50
+ while j < len(text) and bracket_count > 0:
51
+ if text[j : j + len(RANGE_BEGIN)] == RANGE_BEGIN:
52
+ in_range = True
53
+ elif text[j : j + len(RANGE_END)] == RANGE_END:
54
+ in_range = False
55
+ elif text[j] == "[" and not in_range:
56
+ bracket_count += 1
57
+ elif text[j] == "]" and not in_range:
58
+ bracket_count -= 1
59
+ j += 1
60
+
61
+ if bracket_count == 0:
62
+ end_exclusive = j
63
+ bracket_content = text[start + 1 : end_exclusive - 1]
64
+ matches.append((start, end_exclusive, bracket_content))
65
+ i = j
66
+ else:
67
+ i += 1
68
+ else:
69
+ i += 1
70
+ return matches
71
+
72
+
73
+ def parse_single_citation(part: str) -> tuple[int, int, str | None] | None:
74
+ """
75
+ Parse a single citation token inside a bracket and return its components.
76
+
77
+ Returns (transcript_idx, block_idx, start_pattern) or None if invalid.
78
+ """
79
+ token = part.strip()
80
+ if not token:
81
+ return None
82
+
83
+ if ":" in token:
84
+ citation_part, range_part = token.split(":", 1)
85
+ single_match = _SINGLE_RE.match(citation_part.strip())
86
+ if not single_match:
87
+ return None
88
+ transcript_idx = int(single_match.group(1))
89
+ block_idx = int(single_match.group(2))
90
+ start_pattern = _extract_range_pattern(range_part)
91
+ return transcript_idx, block_idx, start_pattern
92
+ else:
93
+ single_match = _SINGLE_RE.match(token)
94
+ if not single_match:
95
+ return None
96
+ transcript_idx = int(single_match.group(1))
97
+ block_idx = int(single_match.group(2))
98
+ return transcript_idx, block_idx, None
99
+
100
+
101
+ def parse_citations(text: str) -> tuple[str, list[Citation]]:
102
+ """
103
+ Parse citations from text in the format described by BLOCK_RANGE_CITE_INSTRUCTION.
104
+
105
+ Supported formats:
106
+ - Single block: [T<key>B<idx>]
107
+ - Text range with start pattern: [T<key>B<idx>:<RANGE>start_pattern</RANGE>]
108
+
109
+ Args:
110
+ text: The text to parse citations from
111
+
112
+ Returns:
113
+ A tuple of (cleaned_text, citations) where cleaned_text has brackets and range markers removed
114
+ and citations have start_idx and end_idx representing character positions
115
+ in the cleaned text
116
+ """
117
+ citations: list[Citation] = []
118
+ cleaned_text = ""
119
+
120
+ bracket_matches = scan_brackets(text)
121
+
122
+ last_end = 0
123
+ for start, end, bracket_content in bracket_matches:
124
+ # Append non-bracket text segment as-is
125
+ cleaned_text += text[last_end:start]
126
+
127
+ # Parse a single citation token inside the bracket
128
+ parsed = parse_single_citation(bracket_content)
129
+ if parsed:
130
+ transcript_idx, block_idx, start_pattern = parsed
131
+ replacement = f"T{transcript_idx}B{block_idx}"
132
+ # Current absolute start position for this replacement in the cleaned text
133
+ start_idx = len(cleaned_text)
134
+ end_idx = start_idx + len(replacement)
135
+ citations.append(
136
+ Citation(
137
+ start_idx=start_idx,
138
+ end_idx=end_idx,
139
+ agent_run_idx=None,
140
+ transcript_idx=transcript_idx,
141
+ block_idx=block_idx,
142
+ action_unit_idx=None,
143
+ start_pattern=start_pattern,
144
+ )
145
+ )
146
+ cleaned_text += replacement
147
+ last_end = end
148
+
149
+ # Append any remaining tail after the last bracket
150
+ cleaned_text += text[last_end:]
151
+
152
+ return cleaned_text, citations
@@ -0,0 +1,166 @@
1
+ import re
2
+
3
+ from docent.data_models.agent_run import AgentRun
4
+ from docent.data_models.citation import Citation, parse_single_citation, scan_brackets
5
+ from docent.data_models.transcript import format_chat_message
6
+
7
+
8
+ def build_whitespace_flexible_regex(pattern: str) -> re.Pattern[str]:
9
+ """Build regex that is flexible with whitespace matching."""
10
+ out = ""
11
+ i = 0
12
+ while i < len(pattern):
13
+ ch = pattern[i]
14
+ if ch.isspace():
15
+ # Skip all consecutive whitespace
16
+ while i < len(pattern) and pattern[i].isspace():
17
+ i += 1
18
+ out += r"\s+"
19
+ continue
20
+ out += re.escape(ch)
21
+ i += 1
22
+ return re.compile(out, re.DOTALL)
23
+
24
+
25
+ def find_citation_matches_in_text(text: str, start_pattern: str) -> list[tuple[int, int]]:
26
+ """
27
+ Find all matches of a citation pattern in text.
28
+
29
+ Args:
30
+ text: The text to search in
31
+ start_pattern: The pattern to search for
32
+
33
+ Returns:
34
+ List of (start_index, end_index) tuples for matches
35
+ """
36
+ if not start_pattern:
37
+ return []
38
+
39
+ try:
40
+ regex = build_whitespace_flexible_regex(start_pattern)
41
+ matches: list[tuple[int, int]] = []
42
+
43
+ for match in regex.finditer(text):
44
+ if match.group().strip(): # Only count non-empty matches
45
+ matches.append((match.start(), match.end()))
46
+
47
+ return matches
48
+
49
+ except re.error:
50
+ return []
51
+
52
+
53
+ def get_transcript_text_for_citation(agent_run: AgentRun, citation: Citation) -> str | None:
54
+ """
55
+ Get the text content of a specific transcript block from an AgentRun,
56
+ using the same formatting as shown to LLMs via format_chat_message.
57
+
58
+ Args:
59
+ agent_run: The agent run containing transcript data
60
+ citation: Citation with transcript_idx and block_idx
61
+
62
+ Returns:
63
+ Text content of the specified block (including tool calls), or None if not found
64
+ """
65
+ if citation.transcript_idx is None:
66
+ return None
67
+
68
+ try:
69
+ transcript_keys = list(agent_run.transcripts.keys())
70
+ if citation.transcript_idx >= len(transcript_keys):
71
+ return None
72
+
73
+ transcript_key = transcript_keys[citation.transcript_idx]
74
+
75
+ transcript = agent_run.transcripts[transcript_key]
76
+ if citation.block_idx >= len(transcript.messages):
77
+ return None
78
+
79
+ message = transcript.messages[citation.block_idx]
80
+
81
+ # Use the same formatting function that generates content for LLMs
82
+ # This ensures consistent formatting between citation validation and LLM serialization
83
+ return format_chat_message(
84
+ message, citation.block_idx, citation.transcript_idx, citation.agent_run_idx
85
+ )
86
+
87
+ except (KeyError, IndexError, AttributeError):
88
+ return None
89
+
90
+
91
+ def validate_citation_text_range(agent_run: AgentRun, citation: Citation) -> bool:
92
+ """
93
+ Validate that a citation's text range exists in the referenced transcript.
94
+
95
+ Args:
96
+ agent_run: The agent run containing transcript data
97
+ citation: Citation to validate
98
+
99
+ Returns:
100
+ True if the citation's text range exists in the transcript, False otherwise
101
+ """
102
+ if not citation.start_pattern:
103
+ # Nothing to validate
104
+ return True
105
+
106
+ text = get_transcript_text_for_citation(agent_run, citation)
107
+ if text is None:
108
+ return False
109
+
110
+ matches = find_citation_matches_in_text(text, citation.start_pattern)
111
+
112
+ return len(matches) > 0
113
+
114
+
115
+ def remove_invalid_citation_ranges(text: str, agent_run: AgentRun) -> str:
116
+ """
117
+ Remove invalid citation ranges from chat message/judge result. We do this as a separate step before normal citation parsing.
118
+ Normal citation parsing happens every time we load chat/results from db,
119
+ but invalid ranges should never make it to the db.
120
+
121
+ Args:
122
+ text: Original text containing citations
123
+ agent_run: Agent run with transcript data
124
+
125
+ Returns:
126
+ Tuple of (cleaned_text, valid_citations)
127
+ """
128
+ # Find all bracket positions in the original text
129
+ bracket_matches = scan_brackets(text)
130
+ citations: list[Citation] = []
131
+
132
+ for start, end, bracket_content in bracket_matches:
133
+ # Parse this bracket content to get citation info
134
+ parsed = parse_single_citation(bracket_content)
135
+ if parsed:
136
+ transcript_idx, block_idx, start_pattern = parsed
137
+ # The citation spans from start to end in the original text
138
+ citation = Citation(
139
+ start_idx=start,
140
+ end_idx=end,
141
+ agent_run_idx=None,
142
+ transcript_idx=transcript_idx,
143
+ block_idx=block_idx,
144
+ action_unit_idx=None,
145
+ start_pattern=start_pattern,
146
+ )
147
+ citations.append(citation)
148
+
149
+ # Filter to only citations with text ranges that need validation
150
+ citations_to_validate = [c for c in citations if c.start_pattern]
151
+
152
+ # Sort citations by start_idx in reverse order to avoid index shifting issues
153
+ sorted_citations = sorted(citations_to_validate, key=lambda c: c.start_idx, reverse=True)
154
+
155
+ invalid_citations: list[Citation] = [
156
+ c for c in sorted_citations if not validate_citation_text_range(agent_run, c)
157
+ ]
158
+
159
+ # Remove invalid text ranges from citations in the original text
160
+ modified_text = text
161
+ for citation in invalid_citations:
162
+ citation_without_range = f"[T{citation.transcript_idx}B{citation.block_idx}]"
163
+ before = modified_text[: citation.start_idx]
164
+ after = modified_text[citation.end_idx :]
165
+ modified_text = before + citation_without_range + after
166
+ return modified_text
@@ -12,6 +12,7 @@ from docent.data_models._tiktoken_util import (
12
12
  truncate_to_token_limit,
13
13
  )
14
14
  from docent.data_models.chat import AssistantMessage, ChatMessage, ContentReasoning
15
+ from docent.data_models.citation import RANGE_BEGIN, RANGE_END
15
16
 
16
17
  # Template for formatting individual transcript blocks
17
18
  TRANSCRIPT_BLOCK_TEMPLATE = """
@@ -21,10 +22,20 @@ TRANSCRIPT_BLOCK_TEMPLATE = """
21
22
  """.strip()
22
23
 
23
24
  # Instructions for citing single transcript blocks
24
- SINGLE_RUN_CITE_INSTRUCTION = "Each transcript and each block has a unique index. Cite the relevant indices in brackets when relevant, like [T<idx>B<idx>]. Use multiple tags to cite multiple blocks, like [T<idx1>B<idx1>][T<idx2>B<idx2>]. Use an inner dash to cite a range of blocks, like [T<idx1>B<idx1>-T<idx2>B<idx2>]. Remember to cite specific blocks and NOT action units."
25
+ TEXT_RANGE_CITE_INSTRUCTION = f"""Anytime you quote the transcript, or refer to something that happened in the transcript, or make any claim about the transcript, add an inline citation. Each transcript and each block has a unique index. Cite the relevant indices in brackets. For example, to cite the entirety of transcript 0, block 1, write [T0B1].
25
26
 
26
- # Instructions for citing multiple transcript blocks
27
- MULTI_RUN_CITE_INSTRUCTION = "Each run, each transcript, and each block has a unique index. Cite the relevant indices in brackets when relevant, like [R<idx>T<idx>B<idx>]. Use multiple tags to cite multiple blocks, like [R<idx1>T<idx1>B<idx1>][R<idx2>T<idx2>B<idx2>]. Use an inner dash to cite a range of blocks, like [R<idx1>T<idx1>B<idx1>-R<idx2>T<idx2>B<idx2>]. Remember to cite specific blocks and NOT action units."
27
+ A citation may include a specific range of text within a block. Use {RANGE_BEGIN} and {RANGE_END} to mark the specific range of text. Add it after the block ID separated by a colon. For example, to cite the part of transcript 0, block 1, where the agent says "I understand the task", write [T0B1:{RANGE_BEGIN}I understand the task{RANGE_END}]. Citations must follow this exact format. The markers {RANGE_BEGIN} and {RANGE_END} must be used ONLY inside the brackets of a citation.
28
+
29
+ Important notes:
30
+ - You must include the full content of the text range {RANGE_BEGIN} and {RANGE_END}, EXACTLY as it appears in the transcript, word-for-word, including any markers or punctuation that appear in the middle of the text.
31
+ - Citations must be as specific as possible. This means you should usually cite a specific text range within a block.
32
+ - A citation is not a quote. For brevity, text ranges will not be rendered inline. The user will have to click on the citation to see the full text range.
33
+ - Citations are self-contained. Do NOT label them as citation or evidence. Just insert the citation by itself at the appropriate place in the text.
34
+ - Citations must come immediately after the part of a claim that they support. This may be in the middle of a sentence.
35
+ - Each pair of brackets must contain only one citation. To cite multiple blocks, use multiple pairs of brackets, like [T0B0] [T0B1].
36
+ """
37
+
38
+ BLOCK_CITE_INSTRUCTION = f"""Each transcript and each block has a unique index. Cite the relevant indices in brackets when relevant, like [T<idx>B<idx>]. Use multiple tags to cite multiple blocks, like [T<idx1>B<idx1>][T<idx2>B<idx2>]. Remember to cite specific blocks and NOT action units."""
28
39
 
29
40
 
30
41
  def format_chat_message(
@@ -291,66 +302,105 @@ class Transcript(BaseModel):
291
302
  agent_run_idx: int | None = None,
292
303
  highlight_action_unit: int | None = None,
293
304
  ) -> str:
294
- return self.to_str_with_token_limit(
305
+ return self._to_str_with_token_limit_impl(
295
306
  token_limit=sys.maxsize,
296
- agent_run_idx=agent_run_idx,
297
307
  transcript_idx=transcript_idx,
308
+ agent_run_idx=agent_run_idx,
309
+ use_action_units=True,
298
310
  highlight_action_unit=highlight_action_unit,
299
311
  )[0]
300
312
 
301
- def to_str_with_token_limit(
313
+ def _generate_formatted_blocks(
302
314
  self,
303
- token_limit: int,
304
315
  transcript_idx: int = 0,
305
316
  agent_run_idx: int | None = None,
317
+ use_action_units: bool = True,
306
318
  highlight_action_unit: int | None = None,
307
319
  ) -> list[str]:
308
- """Represents the transcript as a list of strings, each of which is at most token_limit tokens
309
- under the GPT-4 tokenization scheme.
320
+ """Generate formatted blocks for transcript representation.
310
321
 
311
- We'll try to split up long transcripts along message boundaries and include metadata.
312
- For very long messages, we'll have to truncate them and remove metadata.
322
+ Args:
323
+ transcript_idx: Index of the transcript
324
+ agent_run_idx: Optional agent run index
325
+ use_action_units: If True, group messages into action units. If False, use individual blocks.
326
+ highlight_action_unit: Optional action unit to highlight (only used with action units)
313
327
 
314
328
  Returns:
315
- list[str]: A list of strings, each of which is at most token_limit tokens
316
- under the GPT-4 tokenization scheme.
329
+ list[str]: List of formatted blocks
317
330
  """
318
- if highlight_action_unit is not None and not (
319
- 0 <= highlight_action_unit < len(self._units_of_action or [])
320
- ):
321
- raise ValueError(f"Invalid action unit index: {highlight_action_unit}")
322
-
323
- # Format blocks by units of action
324
- au_blocks: list[str] = []
325
- for unit_idx, unit in enumerate(self._units_of_action or []):
326
- unit_blocks: list[str] = []
327
- for msg_idx in unit:
328
- unit_blocks.append(
331
+ if use_action_units:
332
+ if highlight_action_unit is not None and not (
333
+ 0 <= highlight_action_unit < len(self._units_of_action or [])
334
+ ):
335
+ raise ValueError(f"Invalid action unit index: {highlight_action_unit}")
336
+
337
+ blocks: list[str] = []
338
+ for unit_idx, unit in enumerate(self._units_of_action or []):
339
+ unit_blocks: list[str] = []
340
+ for msg_idx in unit:
341
+ unit_blocks.append(
342
+ format_chat_message(
343
+ self.messages[msg_idx],
344
+ msg_idx,
345
+ transcript_idx,
346
+ agent_run_idx,
347
+ )
348
+ )
349
+
350
+ unit_content = "\n".join(unit_blocks)
351
+
352
+ # Add highlighting if requested
353
+ if highlight_action_unit and unit_idx == highlight_action_unit:
354
+ blocks_str_template = "<HIGHLIGHTED>\n{}\n</HIGHLIGHTED>"
355
+ else:
356
+ blocks_str_template = "{}"
357
+ blocks.append(
358
+ blocks_str_template.format(
359
+ f"<action unit {unit_idx}>\n{unit_content}\n</action unit {unit_idx}>"
360
+ )
361
+ )
362
+ else:
363
+ # Individual message blocks
364
+ blocks = []
365
+ for msg_idx, message in enumerate(self.messages):
366
+ blocks.append(
329
367
  format_chat_message(
330
- self.messages[msg_idx],
368
+ message,
331
369
  msg_idx,
332
370
  transcript_idx,
333
371
  agent_run_idx,
334
372
  )
335
373
  )
336
374
 
337
- unit_content = "\n".join(unit_blocks)
375
+ return blocks
338
376
 
339
- # Add highlighting if requested
340
- if highlight_action_unit and unit_idx == highlight_action_unit:
341
- blocks_str_template = "<HIGHLIGHTED>\n{}\n</HIGHLIGHTED>"
342
- else:
343
- blocks_str_template = "{}"
344
- au_blocks.append(
345
- blocks_str_template.format(
346
- f"<action unit {unit_idx}>\n{unit_content}\n</action unit {unit_idx}>"
347
- )
348
- )
349
- blocks_str = "\n".join(au_blocks)
377
+ def _to_str_with_token_limit_impl(
378
+ self,
379
+ token_limit: int,
380
+ transcript_idx: int = 0,
381
+ agent_run_idx: int | None = None,
382
+ use_action_units: bool = True,
383
+ highlight_action_unit: int | None = None,
384
+ ) -> list[str]:
385
+ """Core implementation for string representation with token limits.
386
+
387
+ Args:
388
+ token_limit: Maximum tokens per returned string
389
+ transcript_idx: Index of the transcript
390
+ agent_run_idx: Optional agent run index
391
+ use_action_units: If True, group messages into action units. If False, use individual blocks.
392
+ highlight_action_unit: Optional action unit to highlight (only used with action units)
393
+
394
+ Returns:
395
+ list[str]: List of strings, each within token limit
396
+ """
397
+ blocks = self._generate_formatted_blocks(
398
+ transcript_idx, agent_run_idx, use_action_units, highlight_action_unit
399
+ )
400
+ blocks_str = "\n".join(blocks)
350
401
 
351
402
  # Gather metadata
352
403
  metadata_obj = fake_model_dump(self.metadata)
353
-
354
404
  yaml_width = float("inf")
355
405
  block_str = f"<blocks>\n{blocks_str}\n</blocks>\n"
356
406
  metadata_str = f"<metadata>\n{yaml.dump(metadata_obj, width=yaml_width)}\n</metadata>"
@@ -365,25 +415,75 @@ class Transcript(BaseModel):
365
415
  return [f"{block_str}" f"{metadata_str}"]
366
416
  else:
367
417
  results: list[str] = []
368
- block_token_counts = [get_token_count(block) for block in au_blocks]
418
+ block_token_counts = [get_token_count(block) for block in blocks]
369
419
  ranges = group_messages_into_ranges(
370
420
  block_token_counts, metadata_token_count, token_limit
371
421
  )
372
422
  for msg_range in ranges:
373
423
  if msg_range.include_metadata:
374
- cur_au_blocks = "\n".join(au_blocks[msg_range.start : msg_range.end])
375
- results.append(f"<blocks>\n{cur_au_blocks}\n</blocks>\n" f"{metadata_str}")
424
+ cur_blocks = "\n".join(blocks[msg_range.start : msg_range.end])
425
+ results.append(f"<blocks>\n{cur_blocks}\n</blocks>\n" f"{metadata_str}")
376
426
  else:
377
427
  assert (
378
428
  msg_range.end == msg_range.start + 1
379
429
  ), "Ranges without metadata should be a single message"
380
- result = str(au_blocks[msg_range.start])
430
+ result = str(blocks[msg_range.start])
381
431
  if msg_range.num_tokens > token_limit - 10:
382
432
  result = truncate_to_token_limit(result, token_limit - 10)
383
433
  results.append(f"<blocks>\n{result}\n</blocks>\n")
384
434
 
385
435
  return results
386
436
 
437
+ def to_str_blocks(
438
+ self,
439
+ transcript_idx: int = 0,
440
+ agent_run_idx: int | None = None,
441
+ ) -> str:
442
+ """Represents the transcript as a string using individual message blocks.
443
+
444
+ Unlike to_str() which groups messages into action units, this method
445
+ formats each message as an individual block.
446
+
447
+ Returns:
448
+ str: A string representation with individual message blocks.
449
+ """
450
+ return self._to_str_with_token_limit_impl(
451
+ token_limit=sys.maxsize,
452
+ transcript_idx=transcript_idx,
453
+ agent_run_idx=agent_run_idx,
454
+ use_action_units=False,
455
+ )[0]
456
+
457
+ def to_str_with_token_limit(
458
+ self,
459
+ token_limit: int,
460
+ transcript_idx: int = 0,
461
+ agent_run_idx: int | None = None,
462
+ highlight_action_unit: int | None = None,
463
+ ) -> list[str]:
464
+ """Represents the transcript as a list of strings using action units with token limit handling."""
465
+ return self._to_str_with_token_limit_impl(
466
+ token_limit=token_limit,
467
+ transcript_idx=transcript_idx,
468
+ agent_run_idx=agent_run_idx,
469
+ use_action_units=True,
470
+ highlight_action_unit=highlight_action_unit,
471
+ )
472
+
473
+ def to_str_blocks_with_token_limit(
474
+ self,
475
+ token_limit: int,
476
+ transcript_idx: int = 0,
477
+ agent_run_idx: int | None = None,
478
+ ) -> list[str]:
479
+ """Represents the transcript as individual blocks with token limit handling."""
480
+ return self._to_str_with_token_limit_impl(
481
+ token_limit=token_limit,
482
+ transcript_idx=transcript_idx,
483
+ agent_run_idx=agent_run_idx,
484
+ use_action_units=False,
485
+ )
486
+
387
487
 
388
488
  class TranscriptWithoutMetadataValidator(Transcript):
389
489
  """
@@ -350,3 +350,20 @@ class Docent:
350
350
 
351
351
  logger.info(f"Successfully shared Collection '{collection_id}' with {email}")
352
352
  return response.json()
353
+
354
+ def list_agent_run_ids(self, collection_id: str) -> list[str]:
355
+ """Get all agent run IDs for a collection.
356
+
357
+ Args:
358
+ collection_id: ID of the Collection.
359
+
360
+ Returns:
361
+ list[str]: List of agent run IDs.
362
+
363
+ Raises:
364
+ requests.exceptions.HTTPError: If the API request fails.
365
+ """
366
+ url = f"{self._server_url}/{collection_id}/agent_run_ids"
367
+ response = self._session.get(url)
368
+ response.raise_for_status()
369
+ return response.json()
@@ -3,7 +3,6 @@ import contextvars
3
3
  import itertools
4
4
  import logging
5
5
  import os
6
- import signal
7
6
  import sys
8
7
  import threading
9
8
  import uuid
@@ -158,6 +157,7 @@ class DocentTracer:
158
157
  lambda: itertools.count(0)
159
158
  )
160
159
  self._transcript_counter_lock = threading.Lock()
160
+ self._flush_lock = threading.Lock()
161
161
 
162
162
  def get_current_agent_run_id(self) -> Optional[str]:
163
163
  """
@@ -179,14 +179,6 @@ class DocentTracer:
179
179
  # Register atexit handler
180
180
  atexit.register(self.cleanup)
181
181
 
182
- # Register signal handlers for graceful shutdown
183
- try:
184
- signal.signal(signal.SIGINT, self._signal_handler)
185
- signal.signal(signal.SIGTERM, self._signal_handler)
186
- except (ValueError, OSError):
187
- # Signal handlers might not work in all environments
188
- pass
189
-
190
182
  self._cleanup_registered = True
191
183
 
192
184
  def _next_span_order(self, transcript_id: str) -> int:
@@ -197,10 +189,6 @@ class DocentTracer:
197
189
  with self._transcript_counter_lock:
198
190
  return next(self._transcript_counters[transcript_id])
199
191
 
200
- def _signal_handler(self, signum: int, frame: Optional[object]):
201
- """Handle shutdown signals."""
202
- self.cleanup()
203
-
204
192
  def _init_spans_exporter(self, endpoint: str) -> Optional[Union[HTTPExporter, GRPCExporter]]:
205
193
  """Initialize the appropriate span exporter based on endpoint."""
206
194
  if not self.enable_otlp_export:
@@ -211,9 +199,11 @@ class DocentTracer:
211
199
  http_exporter: HTTPExporter = HTTPExporter(
212
200
  endpoint=f"{endpoint}/v1/traces", headers=self.headers
213
201
  )
202
+ logger.debug(f"Initialized HTTP exporter for endpoint: {endpoint}/v1/traces")
214
203
  return http_exporter
215
204
  else:
216
205
  grpc_exporter: GRPCExporter = GRPCExporter(endpoint=endpoint, headers=self.headers)
206
+ logger.debug(f"Initialized gRPC exporter for endpoint: {endpoint}")
217
207
  return grpc_exporter
218
208
  except Exception as e:
219
209
  logger.error(f"Failed to initialize span exporter for {endpoint}: {e}")
@@ -239,9 +229,11 @@ class DocentTracer:
239
229
  """Create appropriate span processor based on configuration."""
240
230
  if self.disable_batch or _is_notebook():
241
231
  simple_processor: SimpleSpanProcessor = SimpleSpanProcessor(exporter)
232
+ logger.debug("Created SimpleSpanProcessor for immediate export")
242
233
  return simple_processor
243
234
  else:
244
235
  batch_processor: BatchSpanProcessor = BatchSpanProcessor(exporter)
236
+ logger.debug("Created BatchSpanProcessor for batched export")
245
237
  return batch_processor
246
238
 
247
239
  def initialize(self):
@@ -310,8 +302,19 @@ class DocentTracer:
310
302
  # attributes not available, skip them
311
303
  pass
312
304
 
305
+ # Debug logging for span creation
306
+ span_name = getattr(span, "name", "unknown")
307
+ span_attrs = getattr(span, "attributes", {})
308
+ logger.debug(
309
+ f"Created span: name='{span_name}', collection_id={self.manager.collection_id}, agent_run_id={span_attrs.get('agent_run_id')}, transcript_id={span_attrs.get('transcript_id')}"
310
+ )
311
+
313
312
  def on_end(self, span: ReadableSpan) -> None:
314
- pass
313
+ # Debug logging for span completion
314
+ span_attrs = span.attributes or {}
315
+ logger.debug(
316
+ f"Completed span: name='{span.name}', collection_id={span_attrs.get('collection_id')}, agent_run_id={span_attrs.get('agent_run_id')}, transcript_id={span_attrs.get('transcript_id')}, duration_ns={span.end_time - span.start_time if span.end_time and span.start_time else 'unknown'}"
317
+ )
315
318
 
316
319
  def shutdown(self) -> None:
317
320
  pass
@@ -422,15 +425,8 @@ class DocentTracer:
422
425
  return
423
426
 
424
427
  try:
425
- # Notify backend that trace is done (no span creation)
426
- try:
427
- self._send_trace_done()
428
- except Exception as e:
429
- logger.warning(f"Failed to notify trace done: {e}")
430
-
431
- self._root_context = None # type: ignore
428
+ self.flush()
432
429
 
433
- # Shutdown our isolated tracer provider
434
430
  if self._tracer_provider:
435
431
  self._tracer_provider.shutdown()
436
432
  self._tracer_provider = None
@@ -456,9 +452,12 @@ class DocentTracer:
456
452
  return
457
453
 
458
454
  try:
459
- for processor in self._spans_processors:
455
+ logger.debug(f"Flushing {len(self._spans_processors)} span processors")
456
+ for i, processor in enumerate(self._spans_processors):
460
457
  if hasattr(processor, "force_flush"):
461
- processor.force_flush()
458
+ logger.debug(f"Flushing span processor {i}")
459
+ processor.force_flush(timeout_millis=50)
460
+ logger.debug("Span flush completed")
462
461
  except Exception as e:
463
462
  logger.error(f"Error during flush: {e}")
464
463
 
@@ -476,29 +475,6 @@ class DocentTracer:
476
475
  """Verify if the manager is properly initialized."""
477
476
  return self._initialized
478
477
 
479
- def __enter__(self) -> "DocentTracer":
480
- """Context manager entry."""
481
- self.initialize()
482
- return self
483
-
484
- def __exit__(self, exc_type: type[BaseException], exc_val: Any, exc_tb: Any) -> None:
485
- """Context manager exit."""
486
- self.close()
487
-
488
- @property
489
- def tracer(self) -> Optional[trace.Tracer]:
490
- """Get the tracer instance."""
491
- if not self._initialized:
492
- self.initialize()
493
- return self._tracer
494
-
495
- @property
496
- def root_context(self) -> Optional[Context]:
497
- """Get the root context."""
498
- if not self._initialized:
499
- self.initialize()
500
- return self._root_context
501
-
502
478
  @contextmanager
503
479
  def agent_run_context(
504
480
  self,
@@ -617,13 +593,15 @@ class DocentTracer:
617
593
  Get the API headers for HTTP requests.
618
594
 
619
595
  Returns:
620
- Dictionary of headers including Authorization
596
+ Dictionary of headers including Authorization if set
621
597
  """
598
+ headers = {"Content-Type": "application/json"}
622
599
 
623
- return {
624
- "Content-Type": "application/json",
625
- "Authorization": self.headers.get("Authorization", ""),
626
- }
600
+ authorization = self.headers.get("Authorization")
601
+ if authorization:
602
+ headers["Authorization"] = authorization
603
+
604
+ return headers
627
605
 
628
606
  def _post_json(self, path: str, data: Dict[str, Any]) -> None:
629
607
  if not self._api_endpoint_base:
@@ -1157,7 +1135,10 @@ def close_tracing() -> None:
1157
1135
  def flush_tracing() -> None:
1158
1136
  """Force flush all spans to exporters."""
1159
1137
  if _global_tracer:
1138
+ logger.debug("Flushing global tracer")
1160
1139
  _global_tracer.flush()
1140
+ else:
1141
+ logger.debug("No global tracer available to flush")
1161
1142
 
1162
1143
 
1163
1144
  def verify_initialized() -> bool:
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "docent-python"
3
3
  description = "Docent SDK"
4
- version = "0.1.10-alpha"
4
+ version = "0.1.12-alpha"
5
5
  authors = [
6
6
  { name="Transluce", email="info@transluce.org" },
7
7
  ]
@@ -1,223 +0,0 @@
1
- import re
2
- from typing import TypedDict
3
-
4
-
5
- class Citation(TypedDict):
6
- start_idx: int
7
- end_idx: int
8
- agent_run_idx: int | None
9
- transcript_idx: int | None
10
- block_idx: int
11
- action_unit_idx: int | None
12
-
13
-
14
- def parse_citations_single_run(text: str) -> list[Citation]:
15
- """
16
- Parse citations from text in the format described by SINGLE_BLOCK_CITE_INSTRUCTION.
17
-
18
- Supported formats:
19
- - Single block: [T<key>B<idx>]
20
- - Multiple blocks: [T<key1>B<idx1>, T<key2>B<idx2>, ...]
21
- - Dash-separated blocks: [T<key1>B<idx1>-T<key2>B<idx2>]
22
-
23
- Args:
24
- text: The text to parse citations from
25
-
26
- Returns:
27
- A list of Citation objects with start_idx and end_idx representing
28
- the character positions in the text (excluding brackets)
29
- """
30
- citations: list[Citation] = []
31
-
32
- # Find all bracketed content first
33
- bracket_pattern = r"\[(.*?)\]"
34
- bracket_matches = re.finditer(bracket_pattern, text)
35
-
36
- for bracket_match in bracket_matches:
37
- bracket_content = bracket_match.group(1)
38
- # Starting position of the bracket content (excluding '[')
39
- content_start_pos = bracket_match.start() + 1
40
-
41
- # Split by commas if present
42
- parts = [part.strip() for part in bracket_content.split(",")]
43
-
44
- for part in parts:
45
- # Check if this part contains a dash (range citation)
46
- if "-" in part:
47
- # Split by dash and process each sub-part
48
- dash_parts = [dash_part.strip() for dash_part in part.split("-")]
49
- for dash_part in dash_parts:
50
- # Check for single block citation: T<key>B<idx>
51
- single_match = re.match(r"T(\d+)B(\d+)", dash_part)
52
- if single_match:
53
- transcript_idx = int(single_match.group(1))
54
- block_idx = int(single_match.group(2))
55
-
56
- # Find position within the original text
57
- citation_text = f"T{transcript_idx}B{block_idx}"
58
- part_pos_in_content = bracket_content.find(dash_part)
59
- ref_pos = content_start_pos + part_pos_in_content
60
- ref_end = ref_pos + len(citation_text)
61
-
62
- # Check if this citation overlaps with any existing citation
63
- if not any(
64
- citation["start_idx"] <= ref_pos < citation["end_idx"]
65
- or citation["start_idx"] < ref_end <= citation["end_idx"]
66
- for citation in citations
67
- ):
68
- citations.append(
69
- Citation(
70
- start_idx=ref_pos,
71
- end_idx=ref_end,
72
- agent_run_idx=None,
73
- transcript_idx=transcript_idx,
74
- block_idx=block_idx,
75
- action_unit_idx=None,
76
- )
77
- )
78
- else:
79
- # Check for single block citation: T<key>B<idx>
80
- single_match = re.match(r"T(\d+)B(\d+)", part)
81
- if single_match:
82
- transcript_idx = int(single_match.group(1))
83
- block_idx = int(single_match.group(2))
84
-
85
- # Find position within the original text
86
- citation_text = f"T{transcript_idx}B{block_idx}"
87
- part_pos_in_content = bracket_content.find(part)
88
- ref_pos = content_start_pos + part_pos_in_content
89
- ref_end = ref_pos + len(citation_text)
90
-
91
- # Check if this citation overlaps with any existing citation
92
- if not any(
93
- citation["start_idx"] <= ref_pos < citation["end_idx"]
94
- or citation["start_idx"] < ref_end <= citation["end_idx"]
95
- for citation in citations
96
- ):
97
- citations.append(
98
- Citation(
99
- start_idx=ref_pos,
100
- end_idx=ref_end,
101
- agent_run_idx=None,
102
- transcript_idx=transcript_idx,
103
- block_idx=block_idx,
104
- action_unit_idx=None,
105
- )
106
- )
107
-
108
- return citations
109
-
110
-
111
- def parse_citations_multi_run(text: str) -> list[Citation]:
112
- """
113
- Parse citations from text in the format described by MULTI_BLOCK_CITE_INSTRUCTION.
114
-
115
- Supported formats:
116
- - Single block in transcript: [R<idx>T<key>B<idx>] or ([R<idx>T<key>B<idx>])
117
- - Multiple blocks: [R<idx1>T<key1>B<idx1>][R<idx2>T<key2>B<idx2>]
118
- - Comma-separated blocks: [R<idx1>T<key1>B<idx1>, R<idx2>T<key2>B<idx2>, ...]
119
- - Dash-separated blocks: [R<idx1>T<key1>B<idx1>-R<idx2>T<key2>B<idx2>]
120
-
121
- Args:
122
- text: The text to parse citations from
123
-
124
- Returns:
125
- A list of Citation objects with start_idx and end_idx representing
126
- the character positions in the text (excluding brackets)
127
- """
128
- citations: list[Citation] = []
129
-
130
- # Find all content within brackets - this handles nested brackets too
131
- bracket_pattern = r"\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]"
132
- # Also handle optional parentheses around the brackets
133
- paren_bracket_pattern = r"\(\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\)"
134
-
135
- # Single citation pattern
136
- single_pattern = r"R(\d+)T(\d+)B(\d+)"
137
-
138
- # Find all bracket matches
139
- for pattern in [bracket_pattern, paren_bracket_pattern]:
140
- matches = re.finditer(pattern, text)
141
- for match in matches:
142
- # Get the content inside brackets
143
- if pattern == bracket_pattern:
144
- content = match.group(1)
145
- start_pos = match.start() + 1 # +1 to skip the opening bracket
146
- else:
147
- content = match.group(1)
148
- start_pos = match.start() + 2 # +2 to skip the opening parenthesis and bracket
149
-
150
- # Split by comma if present
151
- items = [item.strip() for item in content.split(",")]
152
-
153
- for item in items:
154
- # Check if this item contains a dash (range citation)
155
- if "-" in item:
156
- # Split by dash and process each sub-item
157
- dash_items = [dash_item.strip() for dash_item in item.split("-")]
158
- for dash_item in dash_items:
159
- # Check for single citation
160
- single_match = re.match(single_pattern, dash_item)
161
- if single_match:
162
- agent_run_idx = int(single_match.group(1))
163
- transcript_idx = int(single_match.group(2))
164
- block_idx = int(single_match.group(3))
165
-
166
- # Calculate position in the original text
167
- citation_text = f"R{agent_run_idx}T{transcript_idx}B{block_idx}"
168
- citation_start = text.find(citation_text, start_pos)
169
- citation_end = citation_start + len(citation_text)
170
-
171
- # Move start_pos for the next item if there are more items
172
- start_pos = citation_end
173
-
174
- # Avoid duplicate citations
175
- if not any(
176
- citation["start_idx"] == citation_start
177
- and citation["end_idx"] == citation_end
178
- for citation in citations
179
- ):
180
- citations.append(
181
- Citation(
182
- start_idx=citation_start,
183
- end_idx=citation_end,
184
- agent_run_idx=agent_run_idx,
185
- transcript_idx=transcript_idx,
186
- block_idx=block_idx,
187
- action_unit_idx=None,
188
- )
189
- )
190
- else:
191
- # Check for single citation
192
- single_match = re.match(single_pattern, item)
193
- if single_match:
194
- agent_run_idx = int(single_match.group(1))
195
- transcript_idx = int(single_match.group(2))
196
- block_idx = int(single_match.group(3))
197
-
198
- # Calculate position in the original text
199
- citation_text = f"R{agent_run_idx}T{transcript_idx}B{block_idx}"
200
- citation_start = text.find(citation_text, start_pos)
201
- citation_end = citation_start + len(citation_text)
202
-
203
- # Move start_pos for the next item if there are more items
204
- start_pos = citation_end
205
-
206
- # Avoid duplicate citations
207
- if not any(
208
- citation["start_idx"] == citation_start
209
- and citation["end_idx"] == citation_end
210
- for citation in citations
211
- ):
212
- citations.append(
213
- Citation(
214
- start_idx=citation_start,
215
- end_idx=citation_end,
216
- agent_run_idx=agent_run_idx,
217
- transcript_idx=transcript_idx,
218
- block_idx=block_idx,
219
- action_unit_idx=None,
220
- )
221
- )
222
-
223
- return citations