docent-python 0.1.10a0__tar.gz → 0.1.11a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docent-python might be problematic. Click here for more details.
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/PKG-INFO +1 -1
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/agent_run.py +68 -13
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/chat/message.py +5 -0
- docent_python-0.1.11a0/docent/data_models/citation.py +152 -0
- docent_python-0.1.11a0/docent/data_models/remove_invalid_citation_ranges.py +166 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/transcript.py +142 -42
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/sdk/client.py +17 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/pyproject.toml +1 -1
- docent_python-0.1.10a0/docent/data_models/citation.py +0 -223
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/.gitignore +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/LICENSE.md +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/README.md +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/__init__.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/_log_util/__init__.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/_log_util/logger.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/__init__.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/_tiktoken_util.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/chat/__init__.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/chat/content.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/chat/tool.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/metadata.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/regex.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/data_models/shared_types.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/loaders/load_inspect.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/py.typed +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/samples/__init__.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/samples/load.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/samples/log.eval +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/samples/tb_airline.json +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/sdk/__init__.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/trace.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/docent/trace_temp.py +0 -0
- {docent_python-0.1.10a0 → docent_python-0.1.11a0}/uv.lock +0 -0
|
@@ -90,19 +90,36 @@ class AgentRun(BaseModel):
|
|
|
90
90
|
raise ValueError("AgentRun must have at least one transcript")
|
|
91
91
|
return self
|
|
92
92
|
|
|
93
|
-
def
|
|
93
|
+
def _to_text_impl(self, token_limit: int = sys.maxsize, use_blocks: bool = False) -> list[str]:
|
|
94
94
|
"""
|
|
95
|
-
|
|
96
|
-
under the GPT-4 tokenization scheme.
|
|
95
|
+
Core implementation for converting agent run to text representation.
|
|
97
96
|
|
|
98
|
-
|
|
99
|
-
|
|
97
|
+
Args:
|
|
98
|
+
token_limit: Maximum tokens per returned string under the GPT-4 tokenization scheme
|
|
99
|
+
use_blocks: If True, use individual message blocks. If False, use action units.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
List of strings, each at most token_limit tokens
|
|
100
103
|
"""
|
|
104
|
+
# Generate transcript strings using appropriate method
|
|
105
|
+
transcript_strs: list[str] = []
|
|
106
|
+
for i, (t_key, t) in enumerate(self.transcripts.items()):
|
|
107
|
+
if use_blocks:
|
|
108
|
+
transcript_content = t.to_str_blocks_with_token_limit(
|
|
109
|
+
token_limit=sys.maxsize,
|
|
110
|
+
transcript_idx=i,
|
|
111
|
+
agent_run_idx=None,
|
|
112
|
+
)[0]
|
|
113
|
+
else:
|
|
114
|
+
transcript_content = t.to_str_with_token_limit(
|
|
115
|
+
token_limit=sys.maxsize,
|
|
116
|
+
transcript_idx=i,
|
|
117
|
+
agent_run_idx=None,
|
|
118
|
+
)[0]
|
|
119
|
+
transcript_strs.append(
|
|
120
|
+
f"<transcript {t_key}>\n{transcript_content}\n</transcript {t_key}>"
|
|
121
|
+
)
|
|
101
122
|
|
|
102
|
-
transcript_strs: list[str] = [
|
|
103
|
-
f"<transcript {t_key}>\n{t.to_str(agent_run_idx=None, transcript_idx=i)}\n</transcript {t_key}>"
|
|
104
|
-
for i, (t_key, t) in enumerate(self.transcripts.items())
|
|
105
|
-
]
|
|
106
123
|
transcripts_str = "\n\n".join(transcript_strs)
|
|
107
124
|
|
|
108
125
|
# Gather metadata
|
|
@@ -128,7 +145,6 @@ class AgentRun(BaseModel):
|
|
|
128
145
|
return [f"{transcripts_str}" f"{metadata_str}"]
|
|
129
146
|
|
|
130
147
|
# Otherwise, split up the transcript and metadata into chunks
|
|
131
|
-
# TODO(vincent, mengk): does this code account for multiple transcripts correctly? a little confused.
|
|
132
148
|
else:
|
|
133
149
|
results: list[str] = []
|
|
134
150
|
transcript_token_counts = [get_token_count(t) for t in transcript_strs]
|
|
@@ -150,13 +166,23 @@ class AgentRun(BaseModel):
|
|
|
150
166
|
), "Ranges without metadata should be a single message"
|
|
151
167
|
t_id, t = list(self.transcripts.items())[msg_range.start]
|
|
152
168
|
if msg_range.num_tokens < token_limit - 50:
|
|
153
|
-
|
|
169
|
+
if use_blocks:
|
|
170
|
+
transcript = f"<transcript {t_id}>\n{t.to_str_blocks_with_token_limit(token_limit=sys.maxsize)[0]}\n</transcript {t_id}>"
|
|
171
|
+
else:
|
|
172
|
+
transcript = f"<transcript {t_id}>\n{t.to_str_with_token_limit(token_limit=sys.maxsize)[0]}\n</transcript {t_id}>"
|
|
154
173
|
result = (
|
|
155
174
|
f"Here is a partial agent run for analysis purposes only:\n{transcript}"
|
|
156
175
|
)
|
|
157
176
|
results.append(result)
|
|
158
177
|
else:
|
|
159
|
-
|
|
178
|
+
if use_blocks:
|
|
179
|
+
transcript_fragments = t.to_str_blocks_with_token_limit(
|
|
180
|
+
token_limit=token_limit - 50,
|
|
181
|
+
)
|
|
182
|
+
else:
|
|
183
|
+
transcript_fragments = t.to_str_with_token_limit(
|
|
184
|
+
token_limit=token_limit - 50,
|
|
185
|
+
)
|
|
160
186
|
for fragment in transcript_fragments:
|
|
161
187
|
result = f"<transcript {t_id}>\n{fragment}\n</transcript {t_id}>"
|
|
162
188
|
result = (
|
|
@@ -165,6 +191,26 @@ class AgentRun(BaseModel):
|
|
|
165
191
|
results.append(result)
|
|
166
192
|
return results
|
|
167
193
|
|
|
194
|
+
def to_text(self, token_limit: int = sys.maxsize) -> list[str]:
    """Render the agent run as text chunks built from action units.

    Thin wrapper over ``_to_text_impl`` with ``use_blocks=False``.

    Args:
        token_limit: Maximum tokens per returned string under the GPT-4
            tokenization scheme. Defaults to ``sys.maxsize``.

    Returns:
        The list of strings produced by ``_to_text_impl``.
    """
    chunks: list[str] = self._to_text_impl(use_blocks=False, token_limit=token_limit)
    return chunks
|
|
203
|
+
|
|
204
|
+
def to_text_blocks(self, token_limit: int = sys.maxsize) -> list[str]:
    """Render the agent run as text chunks built from individual message blocks.

    Thin wrapper over ``_to_text_impl`` with ``use_blocks=True``; the
    action-unit counterpart is ``to_text``.

    Args:
        token_limit: Maximum tokens per returned string under the GPT-4
            tokenization scheme. Defaults to ``sys.maxsize``.

    Returns:
        The list of strings produced by ``_to_text_impl``.
    """
    chunks: list[str] = self._to_text_impl(use_blocks=True, token_limit=token_limit)
    return chunks
|
|
213
|
+
|
|
168
214
|
@property
|
|
169
215
|
def text(self) -> str:
|
|
170
216
|
"""Concatenates all transcript texts with double newlines as separators.
|
|
@@ -172,7 +218,16 @@ class AgentRun(BaseModel):
|
|
|
172
218
|
Returns:
|
|
173
219
|
str: A string representation of all transcripts.
|
|
174
220
|
"""
|
|
175
|
-
return self.
|
|
221
|
+
return self._to_text_impl(token_limit=sys.maxsize, use_blocks=False)[0]
|
|
222
|
+
|
|
223
|
+
@property
def text_blocks(self) -> str:
    """Single-string rendering of the run using individual message blocks.

    Calls ``_to_text_impl`` with no effective token limit
    (``sys.maxsize``) and ``use_blocks=True``, then takes the first
    returned string.

    Returns:
        str: The first (presumably only) chunk of the block-based rendering.
    """
    rendered = self._to_text_impl(token_limit=sys.maxsize, use_blocks=True)
    return rendered[0]
|
|
176
231
|
|
|
177
232
|
def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
|
|
178
233
|
"""Extends the parent model_dump method to include the text property.
|
|
@@ -5,6 +5,7 @@ from pydantic import BaseModel, Discriminator
|
|
|
5
5
|
|
|
6
6
|
from docent.data_models.chat.content import Content
|
|
7
7
|
from docent.data_models.chat.tool import ToolCall
|
|
8
|
+
from docent.data_models.citation import Citation
|
|
8
9
|
|
|
9
10
|
logger = getLogger(__name__)
|
|
10
11
|
|
|
@@ -66,11 +67,15 @@ class AssistantMessage(BaseChatMessage):
|
|
|
66
67
|
role: Always set to "assistant".
|
|
67
68
|
model: Optional identifier for the model that generated this message.
|
|
68
69
|
tool_calls: Optional list of tool calls made by the assistant.
|
|
70
|
+
citations: Optional list of citations referenced in the message content.
|
|
71
|
+
suggested_messages: Optional list of suggested followup messages.
|
|
69
72
|
"""
|
|
70
73
|
|
|
71
74
|
role: Literal["assistant"] = "assistant" # type: ignore
|
|
72
75
|
model: str | None = None
|
|
73
76
|
tool_calls: list[ToolCall] | None = None
|
|
77
|
+
citations: list[Citation] | None = None
|
|
78
|
+
suggested_messages: list[str] | None = None
|
|
74
79
|
|
|
75
80
|
|
|
76
81
|
class ToolMessage(BaseChatMessage):
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Citation(BaseModel):
    """A reference to one transcript block, anchored to a character span of some text.

    Instances are built by the citation parsers from ``T<transcript>B<block>``
    tokens found inside square brackets.
    """

    # Half-open character span [start_idx, end_idx) of the citation in the
    # text it was located in (which text depends on the function that built it).
    start_idx: int
    end_idx: int
    # Index of the cited agent run; the parsers in this module always set None.
    agent_run_idx: int | None = None
    # Index of the cited transcript (the number after "T").
    transcript_idx: int | None = None
    # Index of the cited block (the number after "B").
    block_idx: int
    # Index of the cited action unit; the parsers in this module always set None.
    action_unit_idx: int | None = None
    # Text found between the range markers of the citation, or None when the
    # citation has no (non-empty) range.
    start_pattern: str | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Markers that delimit the quoted text range inside a citation,
# e.g. [T0B1:<RANGE>quoted text</RANGE>].
RANGE_BEGIN = "<RANGE>"
RANGE_END = "</RANGE>"

# "T<digits>B<digits>": group 1 is the transcript index, group 2 the block index.
_SINGLE_RE = re.compile(r"T(\d+)B(\d+)")
# A ":" followed by optional whitespace and a RANGE_BEGIN ... RANGE_END span
# (non-greedy; "." does not cross newlines because DOTALL is not set).
_RANGE_CONTENT_RE = re.compile(r":\s*" + re.escape(RANGE_BEGIN) + r".*?" + re.escape(RANGE_END))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _extract_range_pattern(range_part: str) -> str | None:
    """Return the text enclosed by the range markers in ``range_part``.

    The span runs from just after the first ``RANGE_BEGIN`` to the first
    ``RANGE_END`` anywhere in the string.

    Args:
        range_part: The portion of a citation token after the colon.

    Returns:
        The enclosed text, or None when either marker is missing or the
        enclosed span is empty.
    """
    begin_at = range_part.find(RANGE_BEGIN)
    end_at = range_part.find(RANGE_END)
    if begin_at == -1 or end_at == -1:
        return None

    enclosed = range_part[begin_at + len(RANGE_BEGIN) : end_at]
    # An empty span (including an end marker that precedes the begin marker)
    # is reported as "no pattern".
    return enclosed or None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def scan_brackets(text: str) -> list[tuple[int, int, str]]:
    """Locate top-level ``[...]`` segments in ``text``.

    Nested brackets are balanced, and any ``[`` or ``]`` that appears between
    a ``RANGE_BEGIN`` and a ``RANGE_END`` marker is ignored for balancing.
    An opening bracket that is never balanced is skipped and scanning resumes
    at the following character.

    Args:
        text: The text to scan.

    Returns:
        A list of ``(start_index, end_index_exclusive, inner_content)``
        tuples, in order of appearance, where ``inner_content`` excludes the
        outer pair of brackets.
    """
    found: list[tuple[int, int, str]] = []
    length = len(text)
    pos = 0
    while pos < length:
        if text[pos] != "[":
            pos += 1
            continue

        depth = 1
        inside_range = False
        cursor = pos + 1
        while cursor < length and depth > 0:
            # Marker checks come first so marker text never counts as a bracket.
            if text.startswith(RANGE_BEGIN, cursor):
                inside_range = True
            elif text.startswith(RANGE_END, cursor):
                inside_range = False
            elif not inside_range:
                ch = text[cursor]
                if ch == "[":
                    depth += 1
                elif ch == "]":
                    depth -= 1
            cursor += 1

        if depth:
            # Unbalanced opener: step past it and keep scanning.
            pos += 1
        else:
            found.append((pos, cursor, text[pos + 1 : cursor - 1]))
            pos = cursor
    return found
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def parse_single_citation(part: str) -> tuple[int, int, str | None] | None:
    """Parse the content of one bracket pair as a citation token.

    Accepted shapes are ``T<n>B<m>`` and ``T<n>B<m>:<range part>``. The
    ``T<n>B<m>`` portion is matched as a prefix, so characters following the
    block index (before any colon) are ignored.

    Args:
        part: Text found between a pair of square brackets.

    Returns:
        ``(transcript_idx, block_idx, start_pattern)``, or None when the text
        is empty or does not begin with a ``T<n>B<m>`` token. ``start_pattern``
        is None when there is no colon or no usable range after it.
    """
    token = part.strip()
    if not token:
        return None

    # Everything after the first colon (if any) is the range part.
    head, colon, tail = token.partition(":")
    matched = _SINGLE_RE.match(head.strip())
    if matched is None:
        return None

    start_pattern = _extract_range_pattern(tail) if colon else None
    return int(matched.group(1)), int(matched.group(2)), start_pattern
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def parse_citations(text: str) -> tuple[str, list[Citation]]:
    """Parse bracketed citations out of ``text``.

    Supported formats:
    - Single block: [T<key>B<idx>]
    - Text range with start pattern: [T<key>B<idx>:<RANGE>start_pattern</RANGE>]

    Each recognised citation is replaced in the output by the bare token
    ``T<key>B<idx>`` (brackets and range markers removed). A bracketed segment
    that is not a valid citation is copied through unchanged, so ordinary
    bracketed text such as ``[note]`` or ``items[0]`` is not lost.

    Args:
        text: The text to parse citations from.

    Returns:
        A tuple ``(cleaned_text, citations)``. Each citation's ``start_idx``
        and ``end_idx`` are character positions of its replacement token in
        ``cleaned_text`` (end exclusive).
    """
    citations: list[Citation] = []
    # Collect output pieces and join once at the end; track the running length
    # so citation offsets can be computed without materialising the string.
    pieces: list[str] = []
    cleaned_len = 0

    last_end = 0
    for start, end, bracket_content in scan_brackets(text):
        # Text between the previous bracket and this one is kept as-is.
        prefix = text[last_end:start]
        pieces.append(prefix)
        cleaned_len += len(prefix)

        parsed = parse_single_citation(bracket_content)
        if parsed is None:
            # Not a citation: keep the original bracketed text verbatim.
            segment = text[start:end]
        else:
            transcript_idx, block_idx, start_pattern = parsed
            segment = f"T{transcript_idx}B{block_idx}"
            citations.append(
                Citation(
                    start_idx=cleaned_len,
                    end_idx=cleaned_len + len(segment),
                    agent_run_idx=None,
                    transcript_idx=transcript_idx,
                    block_idx=block_idx,
                    action_unit_idx=None,
                    start_pattern=start_pattern,
                )
            )

        pieces.append(segment)
        cleaned_len += len(segment)
        last_end = end

    # Remaining tail after the last bracket.
    pieces.append(text[last_end:])

    return "".join(pieces), citations
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from docent.data_models.agent_run import AgentRun
|
|
4
|
+
from docent.data_models.citation import Citation, parse_single_citation, scan_brackets
|
|
5
|
+
from docent.data_models.transcript import format_chat_message
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_whitespace_flexible_regex(pattern: str) -> re.Pattern[str]:
    """Compile ``pattern`` as a literal whose whitespace matches flexibly.

    Every run of one or more whitespace characters in ``pattern`` becomes
    ``\\s+``; every other character is escaped so it matches literally.

    Args:
        pattern: The literal text to turn into a regex.

    Returns:
        The compiled pattern (compiled with ``re.DOTALL``).
    """
    pieces: list[str] = []
    in_whitespace_run = False
    for ch in pattern:
        if ch.isspace():
            # Emit a single \s+ for the whole run of whitespace.
            if not in_whitespace_run:
                pieces.append(r"\s+")
            in_whitespace_run = True
        else:
            pieces.append(re.escape(ch))
            in_whitespace_run = False
    return re.compile("".join(pieces), re.DOTALL)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def find_citation_matches_in_text(text: str, start_pattern: str) -> list[tuple[int, int]]:
    """Locate every occurrence of ``start_pattern`` in ``text``.

    Matching is literal except that whitespace runs in the pattern match any
    run of whitespace in the text (see ``build_whitespace_flexible_regex``).

    Args:
        text: The text to search in.
        start_pattern: The pattern to search for.

    Returns:
        ``(start_index, end_index)`` pairs for each match whose matched text
        is not whitespace-only. Empty when ``start_pattern`` is empty or the
        regex cannot be compiled.
    """
    if not start_pattern:
        return []

    try:
        regex = build_whitespace_flexible_regex(start_pattern)
        return [
            (found.start(), found.end())
            for found in regex.finditer(text)
            if found.group().strip()  # skip empty / whitespace-only matches
        ]
    except re.error:
        return []
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_transcript_text_for_citation(agent_run: AgentRun, citation: Citation) -> str | None:
    """Return the formatted text of the transcript block a citation points at.

    The transcript is chosen by position: ``citation.transcript_idx`` indexes
    the key order of ``agent_run.transcripts``. The block is then rendered
    with ``format_chat_message`` so the text matches what is serialized for
    LLMs.

    Args:
        agent_run: The agent run containing transcript data.
        citation: Citation with ``transcript_idx`` and ``block_idx``.

    Returns:
        The formatted block text, or None when ``transcript_idx`` is None,
        either index is past the end, or a KeyError / IndexError /
        AttributeError occurs during lookup or formatting.
    """
    t_idx = citation.transcript_idx
    if t_idx is None:
        return None

    try:
        keys = list(agent_run.transcripts.keys())
        if t_idx >= len(keys):
            return None
        target = agent_run.transcripts[keys[t_idx]]

        b_idx = citation.block_idx
        if b_idx >= len(target.messages):
            return None

        # Same formatter as the LLM-facing serialization, so validation sees
        # exactly the text the model saw.
        return format_chat_message(target.messages[b_idx], b_idx, t_idx, citation.agent_run_idx)
    except (KeyError, IndexError, AttributeError):
        return None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def validate_citation_text_range(agent_run: AgentRun, citation: Citation) -> bool:
    """Check that a citation's quoted range occurs in the block it cites.

    Args:
        agent_run: The agent run containing transcript data.
        citation: Citation to validate.

    Returns:
        True when the citation has no ``start_pattern`` (nothing to check) or
        the pattern is found in the cited block's formatted text; False when
        the block cannot be resolved or the pattern does not occur in it.
    """
    pattern = citation.start_pattern
    if not pattern:
        # No text range on this citation, so there is nothing to validate.
        return True

    block_text = get_transcript_text_for_citation(agent_run, citation)
    if block_text is None:
        return False

    return bool(find_citation_matches_in_text(block_text, pattern))
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def remove_invalid_citation_ranges(text: str, agent_run: AgentRun) -> str:
    """Strip text ranges that do not occur in the cited block.

    Intended as a separate step before normal citation parsing: normal parsing
    happens every time chat/results are loaded from the db, but invalid ranges
    should never make it to the db.

    A citation whose range fails validation is rewritten in place as the bare
    ``[T<transcript>B<block>]`` form; everything else is left untouched.

    Args:
        text: Original text containing citations.
        agent_run: Agent run with transcript data.

    Returns:
        str: The text with invalid citation ranges removed.
    """
    # Collect only citations that carry a text range; spans refer to the
    # original text (bracket start inclusive, bracket end exclusive).
    ranged: list[Citation] = []
    for start, end, bracket_content in scan_brackets(text):
        parsed = parse_single_citation(bracket_content)
        if not parsed:
            continue
        transcript_idx, block_idx, start_pattern = parsed
        if not start_pattern:
            continue
        ranged.append(
            Citation(
                start_idx=start,
                end_idx=end,
                agent_run_idx=None,
                transcript_idx=transcript_idx,
                block_idx=block_idx,
                action_unit_idx=None,
                start_pattern=start_pattern,
            )
        )

    # Rewrite from the end of the text backwards so earlier spans stay valid.
    ranged.sort(key=lambda c: c.start_idx, reverse=True)

    result = text
    for citation in ranged:
        if validate_citation_text_range(agent_run, citation):
            continue
        bare = f"[T{citation.transcript_idx}B{citation.block_idx}]"
        result = result[: citation.start_idx] + bare + result[citation.end_idx :]
    return result
|
|
@@ -12,6 +12,7 @@ from docent.data_models._tiktoken_util import (
|
|
|
12
12
|
truncate_to_token_limit,
|
|
13
13
|
)
|
|
14
14
|
from docent.data_models.chat import AssistantMessage, ChatMessage, ContentReasoning
|
|
15
|
+
from docent.data_models.citation import RANGE_BEGIN, RANGE_END
|
|
15
16
|
|
|
16
17
|
# Template for formatting individual transcript blocks
|
|
17
18
|
TRANSCRIPT_BLOCK_TEMPLATE = """
|
|
@@ -21,10 +22,20 @@ TRANSCRIPT_BLOCK_TEMPLATE = """
|
|
|
21
22
|
""".strip()
|
|
22
23
|
|
|
23
24
|
# Instructions for citing single transcript blocks
|
|
24
|
-
|
|
25
|
+
TEXT_RANGE_CITE_INSTRUCTION = f"""Anytime you quote the transcript, or refer to something that happened in the transcript, or make any claim about the transcript, add an inline citation. Each transcript and each block has a unique index. Cite the relevant indices in brackets. For example, to cite the entirety of transcript 0, block 1, write [T0B1].
|
|
25
26
|
|
|
26
|
-
|
|
27
|
-
|
|
27
|
+
A citation may include a specific range of text within a block. Use {RANGE_BEGIN} and {RANGE_END} to mark the specific range of text. Add it after the block ID separated by a colon. For example, to cite the part of transcript 0, block 1, where the agent says "I understand the task", write [T0B1:{RANGE_BEGIN}I understand the task{RANGE_END}]. Citations must follow this exact format. The markers {RANGE_BEGIN} and {RANGE_END} must be used ONLY inside the brackets of a citation.
|
|
28
|
+
|
|
29
|
+
Important notes:
|
|
30
|
+
- You must include the full content of the text range {RANGE_BEGIN} and {RANGE_END}, EXACTLY as it appears in the transcript, word-for-word, including any markers or punctuation that appear in the middle of the text.
|
|
31
|
+
- Citations must be as specific as possible. This means you should usually cite a specific text range within a block.
|
|
32
|
+
- A citation is not a quote. For brevity, text ranges will not be rendered inline. The user will have to click on the citation to see the full text range.
|
|
33
|
+
- Citations are self-contained. Do NOT label them as citation or evidence. Just insert the citation by itself at the appropriate place in the text.
|
|
34
|
+
- Citations must come immediately after the part of a claim that they support. This may be in the middle of a sentence.
|
|
35
|
+
- Each pair of brackets must contain only one citation. To cite multiple blocks, use multiple pairs of brackets, like [T0B0] [T0B1].
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
BLOCK_CITE_INSTRUCTION = f"""Each transcript and each block has a unique index. Cite the relevant indices in brackets when relevant, like [T<idx>B<idx>]. Use multiple tags to cite multiple blocks, like [T<idx1>B<idx1>][T<idx2>B<idx2>]. Remember to cite specific blocks and NOT action units."""
|
|
28
39
|
|
|
29
40
|
|
|
30
41
|
def format_chat_message(
|
|
@@ -291,66 +302,105 @@ class Transcript(BaseModel):
|
|
|
291
302
|
agent_run_idx: int | None = None,
|
|
292
303
|
highlight_action_unit: int | None = None,
|
|
293
304
|
) -> str:
|
|
294
|
-
return self.
|
|
305
|
+
return self._to_str_with_token_limit_impl(
|
|
295
306
|
token_limit=sys.maxsize,
|
|
296
|
-
agent_run_idx=agent_run_idx,
|
|
297
307
|
transcript_idx=transcript_idx,
|
|
308
|
+
agent_run_idx=agent_run_idx,
|
|
309
|
+
use_action_units=True,
|
|
298
310
|
highlight_action_unit=highlight_action_unit,
|
|
299
311
|
)[0]
|
|
300
312
|
|
|
301
|
-
def
|
|
313
|
+
def _generate_formatted_blocks(
    self,
    transcript_idx: int = 0,
    agent_run_idx: int | None = None,
    use_action_units: bool = True,
    highlight_action_unit: int | None = None,
) -> list[str]:
    """Generate formatted blocks for transcript representation.

    Args:
        transcript_idx: Index of the transcript
        agent_run_idx: Optional agent run index
        use_action_units: If True, group messages into action units. If False, use individual blocks.
        highlight_action_unit: Optional action unit to highlight (only used with action units)

    Returns:
        list[str]: One string per action unit (wrapped in
        ``<action unit N>`` tags, and additionally in ``<HIGHLIGHTED>`` tags
        for the highlighted unit), or one string per message when
        ``use_action_units`` is False.

    Raises:
        ValueError: If ``use_action_units`` is True and
            ``highlight_action_unit`` is not a valid action unit index.
    """
    if not use_action_units:
        # Individual message blocks
        return [
            format_chat_message(message, msg_idx, transcript_idx, agent_run_idx)
            for msg_idx, message in enumerate(self.messages)
        ]

    units = self._units_of_action or []
    if highlight_action_unit is not None and not (0 <= highlight_action_unit < len(units)):
        raise ValueError(f"Invalid action unit index: {highlight_action_unit}")

    blocks: list[str] = []
    for unit_idx, unit in enumerate(units):
        unit_content = "\n".join(
            format_chat_message(self.messages[msg_idx], msg_idx, transcript_idx, agent_run_idx)
            for msg_idx in unit
        )
        block = f"<action unit {unit_idx}>\n{unit_content}\n</action unit {unit_idx}>"

        # Compare against the index directly: a truthiness test on
        # highlight_action_unit would skip unit 0, which is a valid index.
        if unit_idx == highlight_action_unit:
            block = f"<HIGHLIGHTED>\n{block}\n</HIGHLIGHTED>"
        blocks.append(block)

    return blocks
|
|
338
376
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
377
|
+
def _to_str_with_token_limit_impl(
|
|
378
|
+
self,
|
|
379
|
+
token_limit: int,
|
|
380
|
+
transcript_idx: int = 0,
|
|
381
|
+
agent_run_idx: int | None = None,
|
|
382
|
+
use_action_units: bool = True,
|
|
383
|
+
highlight_action_unit: int | None = None,
|
|
384
|
+
) -> list[str]:
|
|
385
|
+
"""Core implementation for string representation with token limits.
|
|
386
|
+
|
|
387
|
+
Args:
|
|
388
|
+
token_limit: Maximum tokens per returned string
|
|
389
|
+
transcript_idx: Index of the transcript
|
|
390
|
+
agent_run_idx: Optional agent run index
|
|
391
|
+
use_action_units: If True, group messages into action units. If False, use individual blocks.
|
|
392
|
+
highlight_action_unit: Optional action unit to highlight (only used with action units)
|
|
393
|
+
|
|
394
|
+
Returns:
|
|
395
|
+
list[str]: List of strings, each within token limit
|
|
396
|
+
"""
|
|
397
|
+
blocks = self._generate_formatted_blocks(
|
|
398
|
+
transcript_idx, agent_run_idx, use_action_units, highlight_action_unit
|
|
399
|
+
)
|
|
400
|
+
blocks_str = "\n".join(blocks)
|
|
350
401
|
|
|
351
402
|
# Gather metadata
|
|
352
403
|
metadata_obj = fake_model_dump(self.metadata)
|
|
353
|
-
|
|
354
404
|
yaml_width = float("inf")
|
|
355
405
|
block_str = f"<blocks>\n{blocks_str}\n</blocks>\n"
|
|
356
406
|
metadata_str = f"<metadata>\n{yaml.dump(metadata_obj, width=yaml_width)}\n</metadata>"
|
|
@@ -365,25 +415,75 @@ class Transcript(BaseModel):
|
|
|
365
415
|
return [f"{block_str}" f"{metadata_str}"]
|
|
366
416
|
else:
|
|
367
417
|
results: list[str] = []
|
|
368
|
-
block_token_counts = [get_token_count(block) for block in
|
|
418
|
+
block_token_counts = [get_token_count(block) for block in blocks]
|
|
369
419
|
ranges = group_messages_into_ranges(
|
|
370
420
|
block_token_counts, metadata_token_count, token_limit
|
|
371
421
|
)
|
|
372
422
|
for msg_range in ranges:
|
|
373
423
|
if msg_range.include_metadata:
|
|
374
|
-
|
|
375
|
-
results.append(f"<blocks>\n{
|
|
424
|
+
cur_blocks = "\n".join(blocks[msg_range.start : msg_range.end])
|
|
425
|
+
results.append(f"<blocks>\n{cur_blocks}\n</blocks>\n" f"{metadata_str}")
|
|
376
426
|
else:
|
|
377
427
|
assert (
|
|
378
428
|
msg_range.end == msg_range.start + 1
|
|
379
429
|
), "Ranges without metadata should be a single message"
|
|
380
|
-
result = str(
|
|
430
|
+
result = str(blocks[msg_range.start])
|
|
381
431
|
if msg_range.num_tokens > token_limit - 10:
|
|
382
432
|
result = truncate_to_token_limit(result, token_limit - 10)
|
|
383
433
|
results.append(f"<blocks>\n{result}\n</blocks>\n")
|
|
384
434
|
|
|
385
435
|
return results
|
|
386
436
|
|
|
437
|
+
def to_str_blocks(
|
|
438
|
+
self,
|
|
439
|
+
transcript_idx: int = 0,
|
|
440
|
+
agent_run_idx: int | None = None,
|
|
441
|
+
) -> str:
|
|
442
|
+
"""Represents the transcript as a string using individual message blocks.
|
|
443
|
+
|
|
444
|
+
Unlike to_str() which groups messages into action units, this method
|
|
445
|
+
formats each message as an individual block.
|
|
446
|
+
|
|
447
|
+
Returns:
|
|
448
|
+
str: A string representation with individual message blocks.
|
|
449
|
+
"""
|
|
450
|
+
return self._to_str_with_token_limit_impl(
|
|
451
|
+
token_limit=sys.maxsize,
|
|
452
|
+
transcript_idx=transcript_idx,
|
|
453
|
+
agent_run_idx=agent_run_idx,
|
|
454
|
+
use_action_units=False,
|
|
455
|
+
)[0]
|
|
456
|
+
|
|
457
|
+
def to_str_with_token_limit(
    self,
    token_limit: int,
    transcript_idx: int = 0,
    agent_run_idx: int | None = None,
    highlight_action_unit: int | None = None,
) -> list[str]:
    """Render the transcript as action-unit strings that respect a token limit.

    Thin wrapper: every argument is forwarded to the shared implementation
    with ``use_action_units=True``.

    Args:
        token_limit: Token limit handed to the shared implementation.
        transcript_idx: Forwarded unchanged to the shared implementation.
        agent_run_idx: Forwarded unchanged to the shared implementation.
        highlight_action_unit: Forwarded unchanged to the shared implementation.

    Returns:
        list[str]: Whatever list of strings the shared implementation produces.
    """
    chunks = self._to_str_with_token_limit_impl(
        token_limit=token_limit,
        transcript_idx=transcript_idx,
        agent_run_idx=agent_run_idx,
        use_action_units=True,
        highlight_action_unit=highlight_action_unit,
    )
    return chunks
|
|
472
|
+
|
|
473
|
+
def to_str_blocks_with_token_limit(
    self,
    token_limit: int,
    transcript_idx: int = 0,
    agent_run_idx: int | None = None,
) -> list[str]:
    """Render the transcript as individual-block strings that respect a token limit.

    Thin wrapper: every argument is forwarded to the shared implementation
    with ``use_action_units=False``.

    Args:
        token_limit: Token limit handed to the shared implementation.
        transcript_idx: Forwarded unchanged to the shared implementation.
        agent_run_idx: Forwarded unchanged to the shared implementation.

    Returns:
        list[str]: Whatever list of strings the shared implementation produces.
    """
    chunks = self._to_str_with_token_limit_impl(
        token_limit=token_limit,
        transcript_idx=transcript_idx,
        agent_run_idx=agent_run_idx,
        use_action_units=False,
    )
    return chunks
|
|
486
|
+
|
|
387
487
|
|
|
388
488
|
class TranscriptWithoutMetadataValidator(Transcript):
|
|
389
489
|
"""
|
|
@@ -350,3 +350,20 @@ class Docent:
|
|
|
350
350
|
|
|
351
351
|
logger.info(f"Successfully shared Collection '{collection_id}' with {email}")
|
|
352
352
|
return response.json()
|
|
353
|
+
|
|
354
|
+
def list_agent_run_ids(self, collection_id: str) -> list[str]:
    """Get all agent run IDs for a collection.

    Issues a GET request to ``{server_url}/{collection_id}/agent_run_ids``
    on the client's session and returns the decoded JSON body.

    Args:
        collection_id: ID of the Collection.

    Returns:
        list[str]: The decoded JSON response body, i.e. the agent run IDs
            (an already-parsed value, not a JSON string).

    Raises:
        requests.exceptions.HTTPError: If the API request fails.
    """
    url = f"{self._server_url}/{collection_id}/agent_run_ids"
    response = self._session.get(url)
    # Turn 4xx/5xx answers into an exception before decoding the body.
    response.raise_for_status()
    return response.json()
|
|
@@ -1,223 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from typing import TypedDict
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class Citation(TypedDict):
|
|
6
|
-
start_idx: int
|
|
7
|
-
end_idx: int
|
|
8
|
-
agent_run_idx: int | None
|
|
9
|
-
transcript_idx: int | None
|
|
10
|
-
block_idx: int
|
|
11
|
-
action_unit_idx: int | None
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def parse_citations_single_run(text: str) -> list[Citation]:
|
|
15
|
-
"""
|
|
16
|
-
Parse citations from text in the format described by SINGLE_BLOCK_CITE_INSTRUCTION.
|
|
17
|
-
|
|
18
|
-
Supported formats:
|
|
19
|
-
- Single block: [T<key>B<idx>]
|
|
20
|
-
- Multiple blocks: [T<key1>B<idx1>, T<key2>B<idx2>, ...]
|
|
21
|
-
- Dash-separated blocks: [T<key1>B<idx1>-T<key2>B<idx2>]
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
text: The text to parse citations from
|
|
25
|
-
|
|
26
|
-
Returns:
|
|
27
|
-
A list of Citation objects with start_idx and end_idx representing
|
|
28
|
-
the character positions in the text (excluding brackets)
|
|
29
|
-
"""
|
|
30
|
-
citations: list[Citation] = []
|
|
31
|
-
|
|
32
|
-
# Find all bracketed content first
|
|
33
|
-
bracket_pattern = r"\[(.*?)\]"
|
|
34
|
-
bracket_matches = re.finditer(bracket_pattern, text)
|
|
35
|
-
|
|
36
|
-
for bracket_match in bracket_matches:
|
|
37
|
-
bracket_content = bracket_match.group(1)
|
|
38
|
-
# Starting position of the bracket content (excluding '[')
|
|
39
|
-
content_start_pos = bracket_match.start() + 1
|
|
40
|
-
|
|
41
|
-
# Split by commas if present
|
|
42
|
-
parts = [part.strip() for part in bracket_content.split(",")]
|
|
43
|
-
|
|
44
|
-
for part in parts:
|
|
45
|
-
# Check if this part contains a dash (range citation)
|
|
46
|
-
if "-" in part:
|
|
47
|
-
# Split by dash and process each sub-part
|
|
48
|
-
dash_parts = [dash_part.strip() for dash_part in part.split("-")]
|
|
49
|
-
for dash_part in dash_parts:
|
|
50
|
-
# Check for single block citation: T<key>B<idx>
|
|
51
|
-
single_match = re.match(r"T(\d+)B(\d+)", dash_part)
|
|
52
|
-
if single_match:
|
|
53
|
-
transcript_idx = int(single_match.group(1))
|
|
54
|
-
block_idx = int(single_match.group(2))
|
|
55
|
-
|
|
56
|
-
# Find position within the original text
|
|
57
|
-
citation_text = f"T{transcript_idx}B{block_idx}"
|
|
58
|
-
part_pos_in_content = bracket_content.find(dash_part)
|
|
59
|
-
ref_pos = content_start_pos + part_pos_in_content
|
|
60
|
-
ref_end = ref_pos + len(citation_text)
|
|
61
|
-
|
|
62
|
-
# Check if this citation overlaps with any existing citation
|
|
63
|
-
if not any(
|
|
64
|
-
citation["start_idx"] <= ref_pos < citation["end_idx"]
|
|
65
|
-
or citation["start_idx"] < ref_end <= citation["end_idx"]
|
|
66
|
-
for citation in citations
|
|
67
|
-
):
|
|
68
|
-
citations.append(
|
|
69
|
-
Citation(
|
|
70
|
-
start_idx=ref_pos,
|
|
71
|
-
end_idx=ref_end,
|
|
72
|
-
agent_run_idx=None,
|
|
73
|
-
transcript_idx=transcript_idx,
|
|
74
|
-
block_idx=block_idx,
|
|
75
|
-
action_unit_idx=None,
|
|
76
|
-
)
|
|
77
|
-
)
|
|
78
|
-
else:
|
|
79
|
-
# Check for single block citation: T<key>B<idx>
|
|
80
|
-
single_match = re.match(r"T(\d+)B(\d+)", part)
|
|
81
|
-
if single_match:
|
|
82
|
-
transcript_idx = int(single_match.group(1))
|
|
83
|
-
block_idx = int(single_match.group(2))
|
|
84
|
-
|
|
85
|
-
# Find position within the original text
|
|
86
|
-
citation_text = f"T{transcript_idx}B{block_idx}"
|
|
87
|
-
part_pos_in_content = bracket_content.find(part)
|
|
88
|
-
ref_pos = content_start_pos + part_pos_in_content
|
|
89
|
-
ref_end = ref_pos + len(citation_text)
|
|
90
|
-
|
|
91
|
-
# Check if this citation overlaps with any existing citation
|
|
92
|
-
if not any(
|
|
93
|
-
citation["start_idx"] <= ref_pos < citation["end_idx"]
|
|
94
|
-
or citation["start_idx"] < ref_end <= citation["end_idx"]
|
|
95
|
-
for citation in citations
|
|
96
|
-
):
|
|
97
|
-
citations.append(
|
|
98
|
-
Citation(
|
|
99
|
-
start_idx=ref_pos,
|
|
100
|
-
end_idx=ref_end,
|
|
101
|
-
agent_run_idx=None,
|
|
102
|
-
transcript_idx=transcript_idx,
|
|
103
|
-
block_idx=block_idx,
|
|
104
|
-
action_unit_idx=None,
|
|
105
|
-
)
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
return citations
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def parse_citations_multi_run(text: str) -> list[Citation]:
|
|
112
|
-
"""
|
|
113
|
-
Parse citations from text in the format described by MULTI_BLOCK_CITE_INSTRUCTION.
|
|
114
|
-
|
|
115
|
-
Supported formats:
|
|
116
|
-
- Single block in transcript: [R<idx>T<key>B<idx>] or ([R<idx>T<key>B<idx>])
|
|
117
|
-
- Multiple blocks: [R<idx1>T<key1>B<idx1>][R<idx2>T<key2>B<idx2>]
|
|
118
|
-
- Comma-separated blocks: [R<idx1>T<key1>B<idx1>, R<idx2>T<key2>B<idx2>, ...]
|
|
119
|
-
- Dash-separated blocks: [R<idx1>T<key1>B<idx1>-R<idx2>T<key2>B<idx2>]
|
|
120
|
-
|
|
121
|
-
Args:
|
|
122
|
-
text: The text to parse citations from
|
|
123
|
-
|
|
124
|
-
Returns:
|
|
125
|
-
A list of Citation objects with start_idx and end_idx representing
|
|
126
|
-
the character positions in the text (excluding brackets)
|
|
127
|
-
"""
|
|
128
|
-
citations: list[Citation] = []
|
|
129
|
-
|
|
130
|
-
# Find all content within brackets - this handles nested brackets too
|
|
131
|
-
bracket_pattern = r"\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]"
|
|
132
|
-
# Also handle optional parentheses around the brackets
|
|
133
|
-
paren_bracket_pattern = r"\(\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\)"
|
|
134
|
-
|
|
135
|
-
# Single citation pattern
|
|
136
|
-
single_pattern = r"R(\d+)T(\d+)B(\d+)"
|
|
137
|
-
|
|
138
|
-
# Find all bracket matches
|
|
139
|
-
for pattern in [bracket_pattern, paren_bracket_pattern]:
|
|
140
|
-
matches = re.finditer(pattern, text)
|
|
141
|
-
for match in matches:
|
|
142
|
-
# Get the content inside brackets
|
|
143
|
-
if pattern == bracket_pattern:
|
|
144
|
-
content = match.group(1)
|
|
145
|
-
start_pos = match.start() + 1 # +1 to skip the opening bracket
|
|
146
|
-
else:
|
|
147
|
-
content = match.group(1)
|
|
148
|
-
start_pos = match.start() + 2 # +2 to skip the opening parenthesis and bracket
|
|
149
|
-
|
|
150
|
-
# Split by comma if present
|
|
151
|
-
items = [item.strip() for item in content.split(",")]
|
|
152
|
-
|
|
153
|
-
for item in items:
|
|
154
|
-
# Check if this item contains a dash (range citation)
|
|
155
|
-
if "-" in item:
|
|
156
|
-
# Split by dash and process each sub-item
|
|
157
|
-
dash_items = [dash_item.strip() for dash_item in item.split("-")]
|
|
158
|
-
for dash_item in dash_items:
|
|
159
|
-
# Check for single citation
|
|
160
|
-
single_match = re.match(single_pattern, dash_item)
|
|
161
|
-
if single_match:
|
|
162
|
-
agent_run_idx = int(single_match.group(1))
|
|
163
|
-
transcript_idx = int(single_match.group(2))
|
|
164
|
-
block_idx = int(single_match.group(3))
|
|
165
|
-
|
|
166
|
-
# Calculate position in the original text
|
|
167
|
-
citation_text = f"R{agent_run_idx}T{transcript_idx}B{block_idx}"
|
|
168
|
-
citation_start = text.find(citation_text, start_pos)
|
|
169
|
-
citation_end = citation_start + len(citation_text)
|
|
170
|
-
|
|
171
|
-
# Move start_pos for the next item if there are more items
|
|
172
|
-
start_pos = citation_end
|
|
173
|
-
|
|
174
|
-
# Avoid duplicate citations
|
|
175
|
-
if not any(
|
|
176
|
-
citation["start_idx"] == citation_start
|
|
177
|
-
and citation["end_idx"] == citation_end
|
|
178
|
-
for citation in citations
|
|
179
|
-
):
|
|
180
|
-
citations.append(
|
|
181
|
-
Citation(
|
|
182
|
-
start_idx=citation_start,
|
|
183
|
-
end_idx=citation_end,
|
|
184
|
-
agent_run_idx=agent_run_idx,
|
|
185
|
-
transcript_idx=transcript_idx,
|
|
186
|
-
block_idx=block_idx,
|
|
187
|
-
action_unit_idx=None,
|
|
188
|
-
)
|
|
189
|
-
)
|
|
190
|
-
else:
|
|
191
|
-
# Check for single citation
|
|
192
|
-
single_match = re.match(single_pattern, item)
|
|
193
|
-
if single_match:
|
|
194
|
-
agent_run_idx = int(single_match.group(1))
|
|
195
|
-
transcript_idx = int(single_match.group(2))
|
|
196
|
-
block_idx = int(single_match.group(3))
|
|
197
|
-
|
|
198
|
-
# Calculate position in the original text
|
|
199
|
-
citation_text = f"R{agent_run_idx}T{transcript_idx}B{block_idx}"
|
|
200
|
-
citation_start = text.find(citation_text, start_pos)
|
|
201
|
-
citation_end = citation_start + len(citation_text)
|
|
202
|
-
|
|
203
|
-
# Move start_pos for the next item if there are more items
|
|
204
|
-
start_pos = citation_end
|
|
205
|
-
|
|
206
|
-
# Avoid duplicate citations
|
|
207
|
-
if not any(
|
|
208
|
-
citation["start_idx"] == citation_start
|
|
209
|
-
and citation["end_idx"] == citation_end
|
|
210
|
-
for citation in citations
|
|
211
|
-
):
|
|
212
|
-
citations.append(
|
|
213
|
-
Citation(
|
|
214
|
-
start_idx=citation_start,
|
|
215
|
-
end_idx=citation_end,
|
|
216
|
-
agent_run_idx=agent_run_idx,
|
|
217
|
-
transcript_idx=transcript_idx,
|
|
218
|
-
block_idx=block_idx,
|
|
219
|
-
action_unit_idx=None,
|
|
220
|
-
)
|
|
221
|
-
)
|
|
222
|
-
|
|
223
|
-
return citations
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|